/[escript]/trunk/paso/src/GMRES.c
ViewVC logotype

Annotation of /trunk/paso/src/GMRES.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2358 - (hide annotations)
Wed Apr 1 22:25:24 2009 UTC (10 years, 6 months ago) by gross
File MIME type: text/plain
File size: 27270 byte(s)
mpi is not really ready yet.
1 ksteube 1312
2     /*******************************************************
3 ksteube 1811 *
4     * Copyright (c) 2003-2008 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13 dhawcroft 631
14 ksteube 1811
15 jgs 150 /*
16     * Purpose
17     * =======
18     *
19     * GMRES solves the linear system A*x=b using the
20     * truncated and restered GMRES method with preconditioning.
21     *
22     * Convergence test: norm( b - A*x )< TOL.
23     *
24     *
25     *
26     * Arguments
27     * =========
28     *
29 jgs 154 * r (input/output) double array, dimension n.
30 jgs 150 * On entry, residual of inital guess X
31     *
32 jgs 154 * x (input/output) double array, dimension n.
33 jgs 150 * On input, the initial guess.
34     *
35     * iter (input/output) int
36     * On input, the maximum num_iterations to be performed.
37     * On output, actual number of num_iterations performed.
38     *
39     * tolerance (input/output) DOUBLE PRECISION
40     * On input, the allowable convergence measure for
41     * norm( b - A*x )
42     * On output, the final value of this measure.
43     *
44 jgs 154 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
45 jgs 150 *
46     * restart (input) If restart>0, iteration is resterted a after restart steps.
47     *
48     * INFO (output) int
49     *
50     * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
51 jgs 154 * =SOLVER_MAXNUM_ITER_REACHED
52 jgs 150 * =SOLVER_INPUT_ERROR Illegal parameter:
53 jgs 154 * =SOLVER_BREAKDOWN: bad luck!
54     * =SOLVER_MEMORY_ERROR : no memory available
55 jgs 150 *
56     * ==============================================================
57     */
58    
59 gross 700 #include "Common.h"
60     #include "SystemMatrix.h"
61 jgs 150 #include "Solver.h"
62     #ifdef _OPENMP
63     #include <omp.h>
64     #endif
65    
66     err_t Paso_Solver_GMRES(
67     Paso_SystemMatrix * A,
68     double * r,
69     double * x,
70     dim_t *iter,
71 gross 584 double * tolerance,dim_t Length_of_recursion,dim_t restart,
72     Paso_Performance* pp) {
73 jgs 150
74 gross 1759 /* Local variables */
75 jgs 150
76 gross 1759 #ifdef _OPENMP
77     const int num_threads=omp_get_max_threads();
78     #else
79     const int num_threads=1;
80     #endif
81     double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
82 gross 1028 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
83 gross 1759 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
84 jfenwick 1986 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual=0,diff,L2_R,Norm_of_residual_global=0;
85 jgs 154 double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
86 jfenwick 1986 dim_t maxit,Num_iter_global=0,num_iter_restart=0,num_iter;
87 gross 1759 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
88 jgs 150 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
89 jgs 154 err_t Status=SOLVER_NO_ERROR;
90 jgs 150
91 gross 1028
92 jgs 150 /* adapt original routine parameters */
93 ksteube 1312 n = Paso_SystemMatrix_getTotalNumRows(A);
94 gross 1028 Length_of_mem=MAX(Length_of_recursion,0)+1;
95 jgs 150
96     /* Test the input parameters. */
97 jgs 154 if (restart>0) restart=MAX(Length_of_recursion,restart);
98     if (n < 0 || Length_of_recursion<=0) {
99 jgs 150 return SOLVER_INPUT_ERROR;
100     }
101    
102     /* allocate memory: */
103 gross 1028 X_PRES=TMPMEMALLOC(Length_of_mem,double*);
104     R_PRES=TMPMEMALLOC(Length_of_mem,double*);
105     P_PRES=TMPMEMALLOC(Length_of_mem,double*);
106 gross 1759 loc_dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
107     dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
108 gross 1028 P_PRES_dot_AP=TMPMEMALLOC(Length_of_mem,double);
109     R_PRES_dot_P_PRES=TMPMEMALLOC(Length_of_mem,double);
110     BREAKF=TMPMEMALLOC(Length_of_mem,double);
111     ALPHA=TMPMEMALLOC(Length_of_mem,double);
112 jgs 154 AP=TMPMEMALLOC(n,double);
113 gross 1028 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
114 gross 1759 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
115 gross 1028 Status=SOLVER_MEMORY_ERROR;
116     } else {
117     for (i=0;i<Length_of_mem;i++) {
118     X_PRES[i]=TMPMEMALLOC(n,double);
119     R_PRES[i]=TMPMEMALLOC(n,double);
120     P_PRES[i]=TMPMEMALLOC(n,double);
121     if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
122     }
123 jgs 150 }
124 jgs 154 if ( Status ==SOLVER_NO_ERROR ) {
125 jgs 150
126     /* now PRES starts : */
127     maxit=*iter;
128     tol=*tolerance;
129    
130 gross 1556 /* initialization */
131 jgs 150
132     restartFlag=TRUE;
133     num_iter=0;
134 gross 1759
135     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
136     for (th=0;th<num_threads;++th) {
137     local_n=n/num_threads;
138     rest=n-local_n*num_threads;
139     n_start=local_n*th+MIN(th,rest);
140     n_end=local_n*(th+1)+MIN(th+1,rest);
141     memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
142     for(i=0;i<Length_of_mem;++i) {
143     memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
144     memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
145     memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146     }
147 jgs 150 }
148    
149 jgs 154 while (! (convergeFlag || maxIterFlag || breakFlag)) {
150 jgs 150 if (restartFlag) {
151 jfenwick 1974 BREAKF[0]=PASO_ONE;
152 gross 1759 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
153     for (th=0;th<num_threads;++th) {
154     local_n=n/num_threads;
155     rest=n-local_n*num_threads;
156     n_start=local_n*th+MIN(th,rest);
157     n_end=local_n*(th+1)+MIN(th+1,rest);
158     memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
159     memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
160 jgs 154 }
161 jgs 150 num_iter_restart=0;
162     restartFlag=FALSE;
163     }
164     ++num_iter;
165     ++num_iter_restart;
166 jgs 154 /* order is the dimension of the space on which the residual is minimized: */
167     order=MIN(num_iter_restart,Length_of_recursion);
168 jgs 150 /***
169 jgs 154 *** calculate new search direction P from R_PRES
170 jgs 150 ***/
171 jgs 154 Paso_Solver_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
172 jgs 150 /***
173     *** apply A to P to get AP
174     ***/
175 jfenwick 1974 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, &P_PRES[0][0],PASO_ZERO, &AP[0]);
176 jgs 150 /***
177     ***** calculation of the norm of R and the scalar products of
178     *** the residuals and A*P:
179     ***/
180 gross 1759 memset(loc_dots,0,sizeof(double)*(order+1));
181     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
182     for (th=0;th<num_threads;++th) {
183     local_n=n/num_threads;
184     rest=n-local_n*num_threads;
185     n_start=local_n*th+MIN(th,rest);
186     n_end=local_n*(th+1)+MIN(th+1,rest);
187     if (order==0) {
188 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
189 gross 1759 #pragma ivdep
190     for (z=n_start; z < n_end; ++z) {
191     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
192     }
193     #pragma omp critical
194     {
195     loc_dots[0]+=R_PRES_dot_P;
196     }
197     } else if (order==1) {
198 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
199     P_PRES_dot_AP0=PASO_ZERO;
200 gross 1759 #pragma ivdep
201     for (z=n_start; z < n_end; ++z) {
202     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
203     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
204     }
205     #pragma omp critical
206     {
207     loc_dots[0]+=R_PRES_dot_P;
208     loc_dots[1]+=P_PRES_dot_AP0;
209     }
210     } else if (order==2) {
211 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
212     P_PRES_dot_AP0=PASO_ZERO;
213     P_PRES_dot_AP1=PASO_ZERO;
214 gross 1759 #pragma ivdep
215     for (z=n_start; z < n_end; ++z) {
216     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
217     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
218     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
219     }
220     #pragma omp critical
221     {
222     loc_dots[0]+=R_PRES_dot_P;
223     loc_dots[1]+=P_PRES_dot_AP0;
224     loc_dots[2]+=P_PRES_dot_AP1;
225     }
226     } else if (order==3) {
227 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
228     P_PRES_dot_AP0=PASO_ZERO;
229     P_PRES_dot_AP1=PASO_ZERO;
230     P_PRES_dot_AP2=PASO_ZERO;
231 gross 1759 #pragma ivdep
232     for (z=n_start; z < n_end; ++z) {
233     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
234     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
235     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
236     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
237     }
238     #pragma omp critical
239     {
240     loc_dots[0]+=R_PRES_dot_P;
241     loc_dots[1]+=P_PRES_dot_AP0;
242     loc_dots[2]+=P_PRES_dot_AP1;
243     loc_dots[3]+=P_PRES_dot_AP2;
244     }
245     } else if (order==4) {
246 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
247     P_PRES_dot_AP0=PASO_ZERO;
248     P_PRES_dot_AP1=PASO_ZERO;
249     P_PRES_dot_AP2=PASO_ZERO;
250     P_PRES_dot_AP3=PASO_ZERO;
251 gross 1759 #pragma ivdep
252     for (z=n_start; z < n_end; ++z) {
253     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
254     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
255     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
256     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
257     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
258     }
259     #pragma omp critical
260     {
261     loc_dots[0]+=R_PRES_dot_P;
262     loc_dots[1]+=P_PRES_dot_AP0;
263     loc_dots[2]+=P_PRES_dot_AP1;
264     loc_dots[3]+=P_PRES_dot_AP2;
265     loc_dots[4]+=P_PRES_dot_AP3;
266     }
267     } else if (order==5) {
268 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
269     P_PRES_dot_AP0=PASO_ZERO;
270     P_PRES_dot_AP1=PASO_ZERO;
271     P_PRES_dot_AP2=PASO_ZERO;
272     P_PRES_dot_AP3=PASO_ZERO;
273     P_PRES_dot_AP4=PASO_ZERO;
274 gross 1759 #pragma ivdep
275     for (z=n_start; z < n_end; ++z) {
276     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
277     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
278     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
279     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
280     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
281     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
282     }
283     #pragma omp critical
284     {
285     loc_dots[0]+=R_PRES_dot_P;
286     loc_dots[1]+=P_PRES_dot_AP0;
287     loc_dots[2]+=P_PRES_dot_AP1;
288     loc_dots[3]+=P_PRES_dot_AP2;
289     loc_dots[4]+=P_PRES_dot_AP3;
290     loc_dots[5]+=P_PRES_dot_AP4;
291     }
292     } else if (order==6) {
293 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
294     P_PRES_dot_AP0=PASO_ZERO;
295     P_PRES_dot_AP1=PASO_ZERO;
296     P_PRES_dot_AP2=PASO_ZERO;
297     P_PRES_dot_AP3=PASO_ZERO;
298     P_PRES_dot_AP4=PASO_ZERO;
299     P_PRES_dot_AP5=PASO_ZERO;
300 gross 1759 #pragma ivdep
301     for (z=n_start; z < n_end; ++z) {
302     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
303     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
304     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
305     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
306     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
307     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
308     P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
309     }
310     #pragma omp critical
311     {
312     loc_dots[0]+=R_PRES_dot_P;
313     loc_dots[1]+=P_PRES_dot_AP0;
314     loc_dots[2]+=P_PRES_dot_AP1;
315     loc_dots[3]+=P_PRES_dot_AP2;
316     loc_dots[4]+=P_PRES_dot_AP3;
317     loc_dots[5]+=P_PRES_dot_AP4;
318     loc_dots[6]+=P_PRES_dot_AP5;
319     }
320     } else {
321 jfenwick 1974 R_PRES_dot_P=PASO_ZERO;
322     P_PRES_dot_AP0=PASO_ZERO;
323     P_PRES_dot_AP1=PASO_ZERO;
324     P_PRES_dot_AP2=PASO_ZERO;
325     P_PRES_dot_AP3=PASO_ZERO;
326     P_PRES_dot_AP4=PASO_ZERO;
327     P_PRES_dot_AP5=PASO_ZERO;
328     P_PRES_dot_AP6=PASO_ZERO;
329 gross 1759 #pragma ivdep
330     for (z=n_start; z < n_end; ++z) {
331     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
332     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
333     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
334     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
335     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
336     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
337     P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
338     P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
339     }
340     #pragma omp critical
341     {
342     loc_dots[0]+=R_PRES_dot_P;
343     loc_dots[1]+=P_PRES_dot_AP0;
344     loc_dots[2]+=P_PRES_dot_AP1;
345     loc_dots[3]+=P_PRES_dot_AP2;
346     loc_dots[4]+=P_PRES_dot_AP3;
347     loc_dots[5]+=P_PRES_dot_AP4;
348     loc_dots[6]+=P_PRES_dot_AP5;
349     loc_dots[7]+=P_PRES_dot_AP6;
350     }
351     for (i=7;i<order;++i) {
352 jfenwick 1974 P_PRES_dot_AP0=PASO_ZERO;
353 gross 1759 #pragma ivdep
354     for (z=n_start; z < n_end; ++z) {
355     P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
356     }
357     #pragma omp critical
358     {
359     loc_dots[i+1]+=P_PRES_dot_AP0;
360     }
361     }
362     }
363 jgs 150 }
364 gross 1759 #ifdef PASO_MPI
365     MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
366     R_PRES_dot_P_PRES[0]=dots[0];
367     memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
368     #else
369     R_PRES_dot_P_PRES[0]=loc_dots[0];
370     memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
371     #endif
372     R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
373 jgs 154 /*** if sum_BREAKF is equal to zero a breakdown occurs.
374     *** iteration procedure can be continued but R_PRES is not the
375     *** residual of X_PRES approximation.
376     ***/
377     sum_BREAKF=0.;
378 gross 1759 #pragma ivdep
379 jgs 154 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
380 jfenwick 1974 breakFlag=!((ABS(R_PRES_dot_AP0) > PASO_ZERO) && (sum_BREAKF >PASO_ZERO));
381 jgs 154 if (!breakFlag) {
382     breakFlag=FALSE;
383     /***
384     ***** X_PRES and R_PRES are moved to memory:
385     ***/
386 gross 1556 Factor=0.;
387 gross 1759 #pragma ivdep
388 gross 1556 for (i=0;i<order;++i) {
389 jgs 154 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
390     Factor+=BREAKF[i]*ALPHA[i];
391 gross 1556 }
392 jgs 154
393 gross 1556 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
394     save_R_PRES=R_PRES[Length_of_mem-1];
395     save_XPRES=X_PRES[Length_of_mem-1];
396     save_P_PRES=P_PRES[Length_of_mem-1];
397 gross 1759 #pragma ivdep
398 gross 1556 for (i=Length_of_mem-1;i>0;--i) {
399 jgs 154 BREAKF[i]=BREAKF[i-1];
400     R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
401     R_PRES[i]=R_PRES[i-1];
402     X_PRES[i]=X_PRES[i-1];
403     P_PRES[i]=P_PRES[i-1];
404 gross 1556 }
405     R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
406     R_PRES[0]=save_R_PRES;
407     X_PRES[0]=save_XPRES;
408     P_PRES[0]=save_P_PRES;
409 jgs 154
410 jfenwick 1974 if (ABS(Factor)<=PASO_ZERO) {
411 jgs 154 Factor=1.;
412 jfenwick 1974 BREAKF[0]=PASO_ZERO;
413 gross 1556 } else {
414 jgs 154 Factor=1./Factor;
415 jfenwick 1974 BREAKF[0]=PASO_ONE;
416 jgs 150 }
417 gross 1556 for (i=0;i<order;++i) ALPHA[i]*=Factor;
418 jgs 150 /*
419 jgs 154 ***** update of solution X_PRES and its residual R_PRES:
420 jgs 150 ***
421     *** P is used to accumulate X and AP to accumulate R. X and R
422     *** are still needed until they are put into the X and R memory
423 jgs 154 *** R_PRES and X_PRES
424 jgs 150 ***
425     **/
426 jgs 154 breakf0=BREAKF[0];
427 gross 1759 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
428     for (th=0;th<num_threads;++th) {
429     local_n=n/num_threads;
430     rest=n-local_n*num_threads;
431     n_start=local_n*th+MIN(th,rest);
432     n_end=local_n*(th+1)+MIN(th+1,rest);
433     if (order==0) {
434     #pragma ivdep
435     for (z=n_start; z < n_end; ++z) {
436     R_PRES[0][z]= Factor* AP[z];
437     X_PRES[0][z]=-Factor*P_PRES[1][z];
438     }
439     } else if (order==1) {
440     #pragma ivdep
441     for (z=n_start; z < n_end; ++z) {
442     R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
443     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
444     }
445     } else if (order==2) {
446     #pragma ivdep
447     for (z=n_start; z < n_end; ++z) {
448     R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
449     +ALPHA[1]*R_PRES[2][z];
450     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
451     +ALPHA[1]*X_PRES[2][z];
452     }
453     } else if (order==3) {
454     #pragma ivdep
455     for (z=n_start; z < n_end; ++z) {
456     R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
457     +ALPHA[1]*R_PRES[2][z]
458     +ALPHA[2]*R_PRES[3][z];
459     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
460     +ALPHA[1]*X_PRES[2][z]
461     +ALPHA[2]*X_PRES[3][z];
462     }
463     } else if (order==4) {
464     #pragma ivdep
465     for (z=n_start; z < n_end; ++z) {
466     R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
467     +ALPHA[1]*R_PRES[2][z]
468     +ALPHA[2]*R_PRES[3][z]
469     +ALPHA[3]*R_PRES[4][z];
470     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
471     +ALPHA[1]*X_PRES[2][z]
472     +ALPHA[2]*X_PRES[3][z]
473     +ALPHA[3]*X_PRES[4][z];
474     }
475     } else if (order==5) {
476     #pragma ivdep
477     for (z=n_start; z < n_end; ++z) {
478     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
479     +ALPHA[1]*R_PRES[2][z]
480     +ALPHA[2]*R_PRES[3][z]
481     +ALPHA[3]*R_PRES[4][z]
482     +ALPHA[4]*R_PRES[5][z];
483     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
484     +ALPHA[1]*X_PRES[2][z]
485     +ALPHA[2]*X_PRES[3][z]
486     +ALPHA[3]*X_PRES[4][z]
487     +ALPHA[4]*X_PRES[5][z];
488     }
489     } else if (order==6) {
490     #pragma ivdep
491     for (z=n_start; z < n_end; ++z) {
492     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
493     +ALPHA[1]*R_PRES[2][z]
494     +ALPHA[2]*R_PRES[3][z]
495     +ALPHA[3]*R_PRES[4][z]
496     +ALPHA[4]*R_PRES[5][z]
497     +ALPHA[5]*R_PRES[6][z];
498     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
499     +ALPHA[1]*X_PRES[2][z]
500     +ALPHA[2]*X_PRES[3][z]
501     +ALPHA[3]*X_PRES[4][z]
502     +ALPHA[4]*X_PRES[5][z]
503     +ALPHA[5]*X_PRES[6][z];
504     }
505     } else {
506     #pragma ivdep
507     for (z=n_start; z < n_end; ++z) {
508     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
509     +ALPHA[1]*R_PRES[2][z]
510     +ALPHA[2]*R_PRES[3][z]
511     +ALPHA[3]*R_PRES[4][z]
512     +ALPHA[4]*R_PRES[5][z]
513     +ALPHA[5]*R_PRES[6][z]
514     +ALPHA[6]*R_PRES[7][z];
515     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
516     +ALPHA[1]*X_PRES[2][z]
517     +ALPHA[2]*X_PRES[3][z]
518     +ALPHA[3]*X_PRES[4][z]
519     +ALPHA[4]*X_PRES[5][z]
520     +ALPHA[5]*X_PRES[6][z]
521     +ALPHA[6]*X_PRES[7][z];
522     }
523     for (i=7;i<order;++i) {
524     #pragma ivdep
525     for (z=n_start; z < n_end; ++z) {
526     R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
527     X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
528     }
529     }
530     }
531 jgs 150 }
532 jgs 154 if (breakf0>0.) {
533 jgs 150 /***
534 jgs 154 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
535 jgs 150 ***/
536 gross 1759 memset(loc_dots,0,sizeof(double)*3);
537     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
538     for (th=0;th<num_threads;++th) {
539     local_n=n/num_threads;
540     rest=n-local_n*num_threads;
541     n_start=local_n*th+MIN(th,rest);
542     n_end=local_n*(th+1)+MIN(th+1,rest);
543 jfenwick 1974 SC1=PASO_ZERO;
544     SC2=PASO_ZERO;
545 gross 1759 #pragma ivdep
546     for (z=n_start; z < n_end; ++z) {
547     diff=R_PRES[0][z]-r[z];
548     SC1+=diff*diff;
549     SC2+=diff*r[z];
550     }
551     #pragma omp critical
552     {
553     loc_dots[0]+=SC1;
554     loc_dots[1]+=SC2;
555     }
556 jgs 150 }
557 gross 1759 #ifdef PASO_MPI
558     MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
559     SC1=dots[0];
560     SC2=dots[1];
561     #else
562     SC1=loc_dots[0];
563     SC2=loc_dots[1];
564     #endif
565 jfenwick 1974 gamma=(SC1<=PASO_ZERO) ? PASO_ZERO : -SC2/SC1;
566 gross 1759 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
567     for (th=0;th<num_threads;++th) {
568     local_n=n/num_threads;
569     rest=n-local_n*num_threads;
570     n_start=local_n*th+MIN(th,rest);
571     n_end=local_n*(th+1)+MIN(th+1,rest);
572 jfenwick 1974 L2_R=PASO_ZERO;
573 gross 1759 #pragma ivdep
574     for (z=n_start; z < n_end; ++z) {
575     x[z]+=gamma*(X_PRES[0][z]-x[z]);
576     r[z]+=gamma*(R_PRES[0][z]-r[z]);
577     L2_R+=r[z]*r[z];
578     }
579     #pragma omp critical
580     {
581     loc_dots[2]+=L2_R;
582     }
583 jgs 150 }
584 gross 1759 #ifdef PASO_MPI
585     MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
586     L2_R=dots[2];
587     #else
588     L2_R=loc_dots[2];
589     #endif
590 jgs 150 norm_of_residual=sqrt(L2_R);
591     convergeFlag = (norm_of_residual <= tol);
592 jgs 154 if (restart>0) restartFlag=(num_iter_restart >= restart);
593     } else {
594     convergeFlag=FALSE;
595     restartFlag=FALSE;
596 jgs 150 }
597 jgs 154 maxIterFlag = (num_iter >= maxit);
598     }
599     }
600     /* end of iteration */
601 gross 1556 Norm_of_residual_global=norm_of_residual;
602     Num_iter_global=num_iter;
603     if (maxIterFlag) {
604 jgs 154 Status = SOLVER_MAXITER_REACHED;
605 jgs 150 } else if (breakFlag) {
606 jgs 154 Status = SOLVER_BREAKDOWN;
607 gross 1556 }
608     }
609 jgs 154 for (i=0;i<Length_of_recursion;i++) {
610     TMPMEMFREE(X_PRES[i]);
611     TMPMEMFREE(R_PRES[i]);
612     TMPMEMFREE(P_PRES[i]);
613 jgs 150 }
614 gross 1028 TMPMEMFREE(AP);
615     TMPMEMFREE(X_PRES);
616     TMPMEMFREE(R_PRES);
617     TMPMEMFREE(P_PRES);
618     TMPMEMFREE(P_PRES_dot_AP);
619     TMPMEMFREE(R_PRES_dot_P_PRES);
620     TMPMEMFREE(BREAKF);
621     TMPMEMFREE(ALPHA);
622 gross 1759 TMPMEMFREE(dots);
623     TMPMEMFREE(loc_dots);
624 jgs 154 *iter=Num_iter_global;
625     *tolerance=Norm_of_residual_global;
626     return Status;
627 gross 2108 }
628    

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26