/[escript]/trunk/paso/src/GMRES.c
ViewVC logotype

Annotation of /trunk/paso/src/GMRES.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1759 - (hide annotations)
Mon Sep 8 02:31:22 2008 UTC (11 years, 5 months ago) by gross
File MIME type: text/plain
File size: 27046 byte(s)
MPI version plus some openmp optimization
1 ksteube 1312
2 jgs 150 /* $Id$ */
3    
4 ksteube 1312 /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15 dhawcroft 631
16 jgs 150 /*
17     * Purpose
18     * =======
19     *
20     * GMRES solves the linear system A*x=b using the
21     * truncated and restered GMRES method with preconditioning.
22     *
23     * Convergence test: norm( b - A*x )< TOL.
24     *
25     *
26     *
27     * Arguments
28     * =========
29     *
30 jgs 154 * r (input/output) double array, dimension n.
31 jgs 150 * On entry, residual of inital guess X
32     *
33 jgs 154 * x (input/output) double array, dimension n.
34 jgs 150 * On input, the initial guess.
35     *
36     * iter (input/output) int
37     * On input, the maximum num_iterations to be performed.
38     * On output, actual number of num_iterations performed.
39     *
40     * tolerance (input/output) DOUBLE PRECISION
41     * On input, the allowable convergence measure for
42     * norm( b - A*x )
43     * On output, the final value of this measure.
44     *
45 jgs 154 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
46 jgs 150 *
47     * restart (input) If restart>0, iteration is resterted a after restart steps.
48     *
49     * INFO (output) int
50     *
51     * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
52 jgs 154 * =SOLVER_MAXNUM_ITER_REACHED
53 jgs 150 * =SOLVER_INPUT_ERROR Illegal parameter:
54 jgs 154 * =SOLVER_BREAKDOWN: bad luck!
55     * =SOLVER_MEMORY_ERROR : no memory available
56 jgs 150 *
57     * ==============================================================
58     */
59    
60 gross 700 #include "Common.h"
61     #include "SystemMatrix.h"
62 jgs 150 #include "Solver.h"
63     #ifdef _OPENMP
64     #include <omp.h>
65     #endif
66    
67     err_t Paso_Solver_GMRES(
68     Paso_SystemMatrix * A,
69     double * r,
70     double * x,
71     dim_t *iter,
72 gross 584 double * tolerance,dim_t Length_of_recursion,dim_t restart,
73     Paso_Performance* pp) {
74 jgs 150
75 gross 1759 /* Local variables */
76 jgs 150
77 gross 1759 #ifdef _OPENMP
78     const int num_threads=omp_get_max_threads();
79     #else
80     const int num_threads=1;
81     #endif
82     double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
83 gross 1028 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
84 gross 1759 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
85 jgs 154 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual,diff,L2_R,Norm_of_residual_global;
86     double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
87     dim_t maxit,Num_iter_global,num_iter_restart,num_iter;
88 gross 1759 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
89 jgs 150 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
90 jgs 154 err_t Status=SOLVER_NO_ERROR;
91 jgs 150
92 gross 1028
93 jgs 150 /* adapt original routine parameters */
94 ksteube 1312 n = Paso_SystemMatrix_getTotalNumRows(A);
95 gross 1028 Length_of_mem=MAX(Length_of_recursion,0)+1;
96 jgs 150
97     /* Test the input parameters. */
98 jgs 154 if (restart>0) restart=MAX(Length_of_recursion,restart);
99     if (n < 0 || Length_of_recursion<=0) {
100 jgs 150 return SOLVER_INPUT_ERROR;
101     }
102    
103     /* allocate memory: */
104 gross 1028 X_PRES=TMPMEMALLOC(Length_of_mem,double*);
105     R_PRES=TMPMEMALLOC(Length_of_mem,double*);
106     P_PRES=TMPMEMALLOC(Length_of_mem,double*);
107 gross 1759 loc_dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
108     dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
109 gross 1028 P_PRES_dot_AP=TMPMEMALLOC(Length_of_mem,double);
110     R_PRES_dot_P_PRES=TMPMEMALLOC(Length_of_mem,double);
111     BREAKF=TMPMEMALLOC(Length_of_mem,double);
112     ALPHA=TMPMEMALLOC(Length_of_mem,double);
113 jgs 154 AP=TMPMEMALLOC(n,double);
114 gross 1028 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
115 gross 1759 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
116 gross 1028 Status=SOLVER_MEMORY_ERROR;
117     } else {
118     for (i=0;i<Length_of_mem;i++) {
119     X_PRES[i]=TMPMEMALLOC(n,double);
120     R_PRES[i]=TMPMEMALLOC(n,double);
121     P_PRES[i]=TMPMEMALLOC(n,double);
122     if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
123     }
124 jgs 150 }
125 jgs 154 if ( Status ==SOLVER_NO_ERROR ) {
126 jgs 150
127     /* now PRES starts : */
128     maxit=*iter;
129     tol=*tolerance;
130    
131 gross 1556 /* initialization */
132 jgs 150
133     restartFlag=TRUE;
134     num_iter=0;
135 gross 1759
136     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
137     for (th=0;th<num_threads;++th) {
138     local_n=n/num_threads;
139     rest=n-local_n*num_threads;
140     n_start=local_n*th+MIN(th,rest);
141     n_end=local_n*(th+1)+MIN(th+1,rest);
142     memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
143     for(i=0;i<Length_of_mem;++i) {
144     memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
145     memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146     memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
147     }
148 jgs 150 }
149    
150 jgs 154 while (! (convergeFlag || maxIterFlag || breakFlag)) {
151 jgs 150 if (restartFlag) {
152 jgs 154 BREAKF[0]=ONE;
153 gross 1759 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
154     for (th=0;th<num_threads;++th) {
155     local_n=n/num_threads;
156     rest=n-local_n*num_threads;
157     n_start=local_n*th+MIN(th,rest);
158     n_end=local_n*(th+1)+MIN(th+1,rest);
159     memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
160     memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
161 jgs 154 }
162 jgs 150 num_iter_restart=0;
163     restartFlag=FALSE;
164     }
165     ++num_iter;
166     ++num_iter_restart;
167 jgs 154 /* order is the dimension of the space on which the residual is minimized: */
168     order=MIN(num_iter_restart,Length_of_recursion);
169 jgs 150 /***
170 jgs 154 *** calculate new search direction P from R_PRES
171 jgs 150 ***/
172 jgs 154 Paso_Solver_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
173 jgs 150 /***
174     *** apply A to P to get AP
175     ***/
176 gross 415 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, &P_PRES[0][0],ZERO, &AP[0]);
177 jgs 150 /***
178     ***** calculation of the norm of R and the scalar products of
179     *** the residuals and A*P:
180     ***/
181 gross 1759 memset(loc_dots,0,sizeof(double)*(order+1));
182     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
183     for (th=0;th<num_threads;++th) {
184     local_n=n/num_threads;
185     rest=n-local_n*num_threads;
186     n_start=local_n*th+MIN(th,rest);
187     n_end=local_n*(th+1)+MIN(th+1,rest);
188     if (order==0) {
189     R_PRES_dot_P=ZERO;
190     #pragma ivdep
191     for (z=n_start; z < n_end; ++z) {
192     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
193     }
194     #pragma omp critical
195     {
196     loc_dots[0]+=R_PRES_dot_P;
197     }
198     } else if (order==1) {
199     R_PRES_dot_P=ZERO;
200     P_PRES_dot_AP0=ZERO;
201     #pragma ivdep
202     for (z=n_start; z < n_end; ++z) {
203     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
204     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
205     }
206     #pragma omp critical
207     {
208     loc_dots[0]+=R_PRES_dot_P;
209     loc_dots[1]+=P_PRES_dot_AP0;
210     }
211     } else if (order==2) {
212     R_PRES_dot_P=ZERO;
213     P_PRES_dot_AP0=ZERO;
214     P_PRES_dot_AP1=ZERO;
215     #pragma ivdep
216     for (z=n_start; z < n_end; ++z) {
217     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
218     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
219     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
220     }
221     #pragma omp critical
222     {
223     loc_dots[0]+=R_PRES_dot_P;
224     loc_dots[1]+=P_PRES_dot_AP0;
225     loc_dots[2]+=P_PRES_dot_AP1;
226     }
227     } else if (order==3) {
228     R_PRES_dot_P=ZERO;
229     P_PRES_dot_AP0=ZERO;
230     P_PRES_dot_AP1=ZERO;
231     P_PRES_dot_AP2=ZERO;
232     #pragma ivdep
233     for (z=n_start; z < n_end; ++z) {
234     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
235     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
236     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
237     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
238     }
239     #pragma omp critical
240     {
241     loc_dots[0]+=R_PRES_dot_P;
242     loc_dots[1]+=P_PRES_dot_AP0;
243     loc_dots[2]+=P_PRES_dot_AP1;
244     loc_dots[3]+=P_PRES_dot_AP2;
245     }
246     } else if (order==4) {
247     R_PRES_dot_P=ZERO;
248     P_PRES_dot_AP0=ZERO;
249     P_PRES_dot_AP1=ZERO;
250     P_PRES_dot_AP2=ZERO;
251     P_PRES_dot_AP3=ZERO;
252     #pragma ivdep
253     for (z=n_start; z < n_end; ++z) {
254     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
255     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
256     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
257     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
258     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
259     }
260     #pragma omp critical
261     {
262     loc_dots[0]+=R_PRES_dot_P;
263     loc_dots[1]+=P_PRES_dot_AP0;
264     loc_dots[2]+=P_PRES_dot_AP1;
265     loc_dots[3]+=P_PRES_dot_AP2;
266     loc_dots[4]+=P_PRES_dot_AP3;
267     }
268     } else if (order==5) {
269     R_PRES_dot_P=ZERO;
270     P_PRES_dot_AP0=ZERO;
271     P_PRES_dot_AP1=ZERO;
272     P_PRES_dot_AP2=ZERO;
273     P_PRES_dot_AP3=ZERO;
274     P_PRES_dot_AP4=ZERO;
275     #pragma ivdep
276     for (z=n_start; z < n_end; ++z) {
277     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
278     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
279     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
280     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
281     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
282     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
283     }
284     #pragma omp critical
285     {
286     loc_dots[0]+=R_PRES_dot_P;
287     loc_dots[1]+=P_PRES_dot_AP0;
288     loc_dots[2]+=P_PRES_dot_AP1;
289     loc_dots[3]+=P_PRES_dot_AP2;
290     loc_dots[4]+=P_PRES_dot_AP3;
291     loc_dots[5]+=P_PRES_dot_AP4;
292     }
293     } else if (order==6) {
294     R_PRES_dot_P=ZERO;
295     P_PRES_dot_AP0=ZERO;
296     P_PRES_dot_AP1=ZERO;
297     P_PRES_dot_AP2=ZERO;
298     P_PRES_dot_AP3=ZERO;
299     P_PRES_dot_AP4=ZERO;
300     P_PRES_dot_AP5=ZERO;
301     #pragma ivdep
302     for (z=n_start; z < n_end; ++z) {
303     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
304     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
305     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
306     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
307     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
308     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
309     P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
310     }
311     #pragma omp critical
312     {
313     loc_dots[0]+=R_PRES_dot_P;
314     loc_dots[1]+=P_PRES_dot_AP0;
315     loc_dots[2]+=P_PRES_dot_AP1;
316     loc_dots[3]+=P_PRES_dot_AP2;
317     loc_dots[4]+=P_PRES_dot_AP3;
318     loc_dots[5]+=P_PRES_dot_AP4;
319     loc_dots[6]+=P_PRES_dot_AP5;
320     }
321     } else {
322     R_PRES_dot_P=ZERO;
323     P_PRES_dot_AP0=ZERO;
324     P_PRES_dot_AP1=ZERO;
325     P_PRES_dot_AP2=ZERO;
326     P_PRES_dot_AP3=ZERO;
327     P_PRES_dot_AP4=ZERO;
328     P_PRES_dot_AP5=ZERO;
329     P_PRES_dot_AP6=ZERO;
330     #pragma ivdep
331     for (z=n_start; z < n_end; ++z) {
332     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
333     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
334     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
335     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
336     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
337     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
338     P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
339     P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
340     }
341     #pragma omp critical
342     {
343     loc_dots[0]+=R_PRES_dot_P;
344     loc_dots[1]+=P_PRES_dot_AP0;
345     loc_dots[2]+=P_PRES_dot_AP1;
346     loc_dots[3]+=P_PRES_dot_AP2;
347     loc_dots[4]+=P_PRES_dot_AP3;
348     loc_dots[5]+=P_PRES_dot_AP4;
349     loc_dots[6]+=P_PRES_dot_AP5;
350     loc_dots[7]+=P_PRES_dot_AP6;
351     }
352     for (i=7;i<order;++i) {
353     P_PRES_dot_AP0=ZERO;
354     #pragma ivdep
355     for (z=n_start; z < n_end; ++z) {
356     P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
357     }
358     #pragma omp critical
359     {
360     loc_dots[i+1]+=P_PRES_dot_AP0;
361     }
362     }
363     }
364 jgs 150 }
365 gross 1759 #ifdef PASO_MPI
366     MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
367     R_PRES_dot_P_PRES[0]=dots[0];
368     memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
369     #else
370     R_PRES_dot_P_PRES[0]=loc_dots[0];
371     memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
372     #endif
373     R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
374 jgs 154 /*** if sum_BREAKF is equal to zero a breakdown occurs.
375     *** iteration procedure can be continued but R_PRES is not the
376     *** residual of X_PRES approximation.
377     ***/
378     sum_BREAKF=0.;
379 gross 1759 #pragma ivdep
380 jgs 154 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
381 gross 1759 breakFlag=!((ABS(R_PRES_dot_AP0) > ZERO) && (sum_BREAKF >ZERO));
382 jgs 154 if (!breakFlag) {
383     breakFlag=FALSE;
384     /***
385     ***** X_PRES and R_PRES are moved to memory:
386     ***/
387 gross 1556 Factor=0.;
388 gross 1759 #pragma ivdep
389 gross 1556 for (i=0;i<order;++i) {
390 jgs 154 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
391     Factor+=BREAKF[i]*ALPHA[i];
392 gross 1556 }
393 jgs 154
394 gross 1556 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
395     save_R_PRES=R_PRES[Length_of_mem-1];
396     save_XPRES=X_PRES[Length_of_mem-1];
397     save_P_PRES=P_PRES[Length_of_mem-1];
398 gross 1759 #pragma ivdep
399 gross 1556 for (i=Length_of_mem-1;i>0;--i) {
400 jgs 154 BREAKF[i]=BREAKF[i-1];
401     R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
402     R_PRES[i]=R_PRES[i-1];
403     X_PRES[i]=X_PRES[i-1];
404     P_PRES[i]=P_PRES[i-1];
405 gross 1556 }
406     R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
407     R_PRES[0]=save_R_PRES;
408     X_PRES[0]=save_XPRES;
409     P_PRES[0]=save_P_PRES;
410 jgs 154
411 gross 1556 if (ABS(Factor)<=ZERO) {
412 jgs 154 Factor=1.;
413     BREAKF[0]=ZERO;
414 gross 1556 } else {
415 jgs 154 Factor=1./Factor;
416     BREAKF[0]=ONE;
417 jgs 150 }
418 gross 1556 for (i=0;i<order;++i) ALPHA[i]*=Factor;
419 jgs 150 /*
420 jgs 154 ***** update of solution X_PRES and its residual R_PRES:
421 jgs 150 ***
422     *** P is used to accumulate X and AP to accumulate R. X and R
423     *** are still needed until they are put into the X and R memory
424 jgs 154 *** R_PRES and X_PRES
425 jgs 150 ***
426     **/
427 jgs 154 breakf0=BREAKF[0];
428 gross 1759 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
429     for (th=0;th<num_threads;++th) {
430     local_n=n/num_threads;
431     rest=n-local_n*num_threads;
432     n_start=local_n*th+MIN(th,rest);
433     n_end=local_n*(th+1)+MIN(th+1,rest);
434     if (order==0) {
435     #pragma ivdep
436     for (z=n_start; z < n_end; ++z) {
437     R_PRES[0][z]= Factor* AP[z];
438     X_PRES[0][z]=-Factor*P_PRES[1][z];
439     }
440     } else if (order==1) {
441     #pragma ivdep
442     for (z=n_start; z < n_end; ++z) {
443     R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
444     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
445     }
446     } else if (order==2) {
447     #pragma ivdep
448     for (z=n_start; z < n_end; ++z) {
449     R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
450     +ALPHA[1]*R_PRES[2][z];
451     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
452     +ALPHA[1]*X_PRES[2][z];
453     }
454     } else if (order==3) {
455     #pragma ivdep
456     for (z=n_start; z < n_end; ++z) {
457     R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
458     +ALPHA[1]*R_PRES[2][z]
459     +ALPHA[2]*R_PRES[3][z];
460     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
461     +ALPHA[1]*X_PRES[2][z]
462     +ALPHA[2]*X_PRES[3][z];
463     }
464     } else if (order==4) {
465     #pragma ivdep
466     for (z=n_start; z < n_end; ++z) {
467     R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
468     +ALPHA[1]*R_PRES[2][z]
469     +ALPHA[2]*R_PRES[3][z]
470     +ALPHA[3]*R_PRES[4][z];
471     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
472     +ALPHA[1]*X_PRES[2][z]
473     +ALPHA[2]*X_PRES[3][z]
474     +ALPHA[3]*X_PRES[4][z];
475     }
476     } else if (order==5) {
477     #pragma ivdep
478     for (z=n_start; z < n_end; ++z) {
479     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
480     +ALPHA[1]*R_PRES[2][z]
481     +ALPHA[2]*R_PRES[3][z]
482     +ALPHA[3]*R_PRES[4][z]
483     +ALPHA[4]*R_PRES[5][z];
484     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
485     +ALPHA[1]*X_PRES[2][z]
486     +ALPHA[2]*X_PRES[3][z]
487     +ALPHA[3]*X_PRES[4][z]
488     +ALPHA[4]*X_PRES[5][z];
489     }
490     } else if (order==6) {
491     #pragma ivdep
492     for (z=n_start; z < n_end; ++z) {
493     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
494     +ALPHA[1]*R_PRES[2][z]
495     +ALPHA[2]*R_PRES[3][z]
496     +ALPHA[3]*R_PRES[4][z]
497     +ALPHA[4]*R_PRES[5][z]
498     +ALPHA[5]*R_PRES[6][z];
499     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
500     +ALPHA[1]*X_PRES[2][z]
501     +ALPHA[2]*X_PRES[3][z]
502     +ALPHA[3]*X_PRES[4][z]
503     +ALPHA[4]*X_PRES[5][z]
504     +ALPHA[5]*X_PRES[6][z];
505     }
506     } else {
507     #pragma ivdep
508     for (z=n_start; z < n_end; ++z) {
509     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
510     +ALPHA[1]*R_PRES[2][z]
511     +ALPHA[2]*R_PRES[3][z]
512     +ALPHA[3]*R_PRES[4][z]
513     +ALPHA[4]*R_PRES[5][z]
514     +ALPHA[5]*R_PRES[6][z]
515     +ALPHA[6]*R_PRES[7][z];
516     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
517     +ALPHA[1]*X_PRES[2][z]
518     +ALPHA[2]*X_PRES[3][z]
519     +ALPHA[3]*X_PRES[4][z]
520     +ALPHA[4]*X_PRES[5][z]
521     +ALPHA[5]*X_PRES[6][z]
522     +ALPHA[6]*X_PRES[7][z];
523     }
524     for (i=7;i<order;++i) {
525     #pragma ivdep
526     for (z=n_start; z < n_end; ++z) {
527     R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
528     X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
529     }
530     }
531     }
532 jgs 150 }
533 jgs 154 if (breakf0>0.) {
534 jgs 150 /***
535 jgs 154 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
536 jgs 150 ***/
537 gross 1759 memset(loc_dots,0,sizeof(double)*3);
538     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
539     for (th=0;th<num_threads;++th) {
540     local_n=n/num_threads;
541     rest=n-local_n*num_threads;
542     n_start=local_n*th+MIN(th,rest);
543     n_end=local_n*(th+1)+MIN(th+1,rest);
544     SC1=ZERO;
545     SC2=ZERO;
546     #pragma ivdep
547     for (z=n_start; z < n_end; ++z) {
548     diff=R_PRES[0][z]-r[z];
549     SC1+=diff*diff;
550     SC2+=diff*r[z];
551     }
552     #pragma omp critical
553     {
554     loc_dots[0]+=SC1;
555     loc_dots[1]+=SC2;
556     }
557 jgs 150 }
558 gross 1759 #ifdef PASO_MPI
559     MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
560     SC1=dots[0];
561     SC2=dots[1];
562     #else
563     SC1=loc_dots[0];
564     SC2=loc_dots[1];
565     #endif
566 jgs 154 gamma=(SC1<=ZERO) ? ZERO : -SC2/SC1;
567 gross 1759 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
568     for (th=0;th<num_threads;++th) {
569     local_n=n/num_threads;
570     rest=n-local_n*num_threads;
571     n_start=local_n*th+MIN(th,rest);
572     n_end=local_n*(th+1)+MIN(th+1,rest);
573     L2_R=ZERO;
574     #pragma ivdep
575     for (z=n_start; z < n_end; ++z) {
576     x[z]+=gamma*(X_PRES[0][z]-x[z]);
577     r[z]+=gamma*(R_PRES[0][z]-r[z]);
578     L2_R+=r[z]*r[z];
579     }
580     #pragma omp critical
581     {
582     loc_dots[2]+=L2_R;
583     }
584 jgs 150 }
585 gross 1759 #ifdef PASO_MPI
586     MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
587     L2_R=dots[2];
588     #else
589     L2_R=loc_dots[2];
590     #endif
591 jgs 150 norm_of_residual=sqrt(L2_R);
592     convergeFlag = (norm_of_residual <= tol);
593 jgs 154 if (restart>0) restartFlag=(num_iter_restart >= restart);
594     } else {
595     convergeFlag=FALSE;
596     restartFlag=FALSE;
597 jgs 150 }
598 jgs 154 maxIterFlag = (num_iter >= maxit);
599     }
600     }
601     /* end of iteration */
602 gross 1556 Norm_of_residual_global=norm_of_residual;
603     Num_iter_global=num_iter;
604     if (maxIterFlag) {
605 jgs 154 Status = SOLVER_MAXITER_REACHED;
606 jgs 150 } else if (breakFlag) {
607 jgs 154 Status = SOLVER_BREAKDOWN;
608 gross 1556 }
609     }
610 jgs 154 for (i=0;i<Length_of_recursion;i++) {
611     TMPMEMFREE(X_PRES[i]);
612     TMPMEMFREE(R_PRES[i]);
613     TMPMEMFREE(P_PRES[i]);
614 jgs 150 }
615 gross 1028 TMPMEMFREE(AP);
616     TMPMEMFREE(X_PRES);
617     TMPMEMFREE(R_PRES);
618     TMPMEMFREE(P_PRES);
619     TMPMEMFREE(P_PRES_dot_AP);
620     TMPMEMFREE(R_PRES_dot_P_PRES);
621     TMPMEMFREE(BREAKF);
622     TMPMEMFREE(ALPHA);
623 gross 1759 TMPMEMFREE(dots);
624     TMPMEMFREE(loc_dots);
625 jgs 154 *iter=Num_iter_global;
626     *tolerance=Norm_of_residual_global;
627     return Status;
628 jgs 150 }

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26