/[escript]/trunk/paso/src/GMRES.c
ViewVC logotype

Annotation of /trunk/paso/src/GMRES.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1767 - (hide annotations)
Mon Sep 8 02:53:50 2008 UTC (11 years, 5 months ago) by gross
File MIME type: text/plain
File size: 27047 byte(s)
print statements removed
1 ksteube 1312
2 jgs 150 /* $Id$ */
3    
4 ksteube 1312 /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15 dhawcroft 631
16 jgs 150 /*
17     * Purpose
18     * =======
19     *
20     * GMRES solves the linear system A*x=b using the
21     * truncated and restered GMRES method with preconditioning.
22     *
23     * Convergence test: norm( b - A*x )< TOL.
24     *
25     *
26     *
27     * Arguments
28     * =========
29     *
30 jgs 154 * r (input/output) double array, dimension n.
31 jgs 150 * On entry, residual of inital guess X
32     *
33 jgs 154 * x (input/output) double array, dimension n.
34 jgs 150 * On input, the initial guess.
35     *
36     * iter (input/output) int
37     * On input, the maximum num_iterations to be performed.
38     * On output, actual number of num_iterations performed.
39     *
40     * tolerance (input/output) DOUBLE PRECISION
41     * On input, the allowable convergence measure for
42     * norm( b - A*x )
43     * On output, the final value of this measure.
44     *
45 jgs 154 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
46 jgs 150 *
47     * restart (input) If restart>0, iteration is resterted a after restart steps.
48     *
49     * INFO (output) int
50     *
51     * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
52 jgs 154 * =SOLVER_MAXNUM_ITER_REACHED
53 jgs 150 * =SOLVER_INPUT_ERROR Illegal parameter:
54 jgs 154 * =SOLVER_BREAKDOWN: bad luck!
55     * =SOLVER_MEMORY_ERROR : no memory available
56 jgs 150 *
57     * ==============================================================
58     */
59    
60 gross 700 #include "Common.h"
61     #include "SystemMatrix.h"
62 jgs 150 #include "Solver.h"
63     #ifdef _OPENMP
64     #include <omp.h>
65     #endif
66    
67     err_t Paso_Solver_GMRES(
68     Paso_SystemMatrix * A,
69     double * r,
70     double * x,
71     dim_t *iter,
72 gross 584 double * tolerance,dim_t Length_of_recursion,dim_t restart,
73     Paso_Performance* pp) {
74 jgs 150
75 gross 1759 /* Local variables */
76 jgs 150
77 gross 1759 #ifdef _OPENMP
78     const int num_threads=omp_get_max_threads();
79     #else
80     const int num_threads=1;
81     #endif
82     double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
83 gross 1028 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
84 gross 1759 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
85 jgs 154 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual,diff,L2_R,Norm_of_residual_global;
86     double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
87     dim_t maxit,Num_iter_global,num_iter_restart,num_iter;
88 gross 1759 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
89 jgs 150 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
90 jgs 154 err_t Status=SOLVER_NO_ERROR;
91 jgs 150
92 gross 1028
93 jgs 150 /* adapt original routine parameters */
94 ksteube 1312 n = Paso_SystemMatrix_getTotalNumRows(A);
95 gross 1028 Length_of_mem=MAX(Length_of_recursion,0)+1;
96 jgs 150
97     /* Test the input parameters. */
98 jgs 154 if (restart>0) restart=MAX(Length_of_recursion,restart);
99     if (n < 0 || Length_of_recursion<=0) {
100 jgs 150 return SOLVER_INPUT_ERROR;
101     }
102    
103     /* allocate memory: */
104 gross 1028 X_PRES=TMPMEMALLOC(Length_of_mem,double*);
105     R_PRES=TMPMEMALLOC(Length_of_mem,double*);
106     P_PRES=TMPMEMALLOC(Length_of_mem,double*);
107 gross 1759 loc_dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
108     dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
109 gross 1028 P_PRES_dot_AP=TMPMEMALLOC(Length_of_mem,double);
110     R_PRES_dot_P_PRES=TMPMEMALLOC(Length_of_mem,double);
111     BREAKF=TMPMEMALLOC(Length_of_mem,double);
112     ALPHA=TMPMEMALLOC(Length_of_mem,double);
113 jgs 154 AP=TMPMEMALLOC(n,double);
114 gross 1028 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
115 gross 1759 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
116 gross 1028 Status=SOLVER_MEMORY_ERROR;
117     } else {
118     for (i=0;i<Length_of_mem;i++) {
119     X_PRES[i]=TMPMEMALLOC(n,double);
120     R_PRES[i]=TMPMEMALLOC(n,double);
121     P_PRES[i]=TMPMEMALLOC(n,double);
122     if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
123     }
124 jgs 150 }
125 jgs 154 if ( Status ==SOLVER_NO_ERROR ) {
126 jgs 150
127     /* now PRES starts : */
128     maxit=*iter;
129     tol=*tolerance;
130    
131 gross 1556 /* initialization */
132 jgs 150
133     restartFlag=TRUE;
134     num_iter=0;
135 gross 1759
136     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
137     for (th=0;th<num_threads;++th) {
138     local_n=n/num_threads;
139     rest=n-local_n*num_threads;
140     n_start=local_n*th+MIN(th,rest);
141     n_end=local_n*(th+1)+MIN(th+1,rest);
142     memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
143     for(i=0;i<Length_of_mem;++i) {
144     memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
145     memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146     memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
147     }
148 jgs 150 }
149    
150 jgs 154 while (! (convergeFlag || maxIterFlag || breakFlag)) {
151 jgs 150 if (restartFlag) {
152 jgs 154 BREAKF[0]=ONE;
153 gross 1759 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
154     for (th=0;th<num_threads;++th) {
155     local_n=n/num_threads;
156     rest=n-local_n*num_threads;
157     n_start=local_n*th+MIN(th,rest);
158     n_end=local_n*(th+1)+MIN(th+1,rest);
159     memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
160     memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
161 jgs 154 }
162 jgs 150 num_iter_restart=0;
163     restartFlag=FALSE;
164     }
165     ++num_iter;
166     ++num_iter_restart;
167 jgs 154 /* order is the dimension of the space on which the residual is minimized: */
168     order=MIN(num_iter_restart,Length_of_recursion);
169 jgs 150 /***
170 jgs 154 *** calculate new search direction P from R_PRES
171 jgs 150 ***/
172 jgs 154 Paso_Solver_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
173 jgs 150 /***
174     *** apply A to P to get AP
175     ***/
176 gross 415 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, &P_PRES[0][0],ZERO, &AP[0]);
177 jgs 150 /***
178     ***** calculation of the norm of R and the scalar products of
179     *** the residuals and A*P:
180     ***/
181 gross 1759 memset(loc_dots,0,sizeof(double)*(order+1));
182     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
183     for (th=0;th<num_threads;++th) {
184     local_n=n/num_threads;
185     rest=n-local_n*num_threads;
186     n_start=local_n*th+MIN(th,rest);
187     n_end=local_n*(th+1)+MIN(th+1,rest);
188     if (order==0) {
189     R_PRES_dot_P=ZERO;
190     #pragma ivdep
191     for (z=n_start; z < n_end; ++z) {
192     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
193     }
194     #pragma omp critical
195     {
196     loc_dots[0]+=R_PRES_dot_P;
197     }
198     } else if (order==1) {
199     R_PRES_dot_P=ZERO;
200     P_PRES_dot_AP0=ZERO;
201     #pragma ivdep
202     for (z=n_start; z < n_end; ++z) {
203     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
204     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
205     }
206     #pragma omp critical
207     {
208     loc_dots[0]+=R_PRES_dot_P;
209     loc_dots[1]+=P_PRES_dot_AP0;
210     }
211     } else if (order==2) {
212     R_PRES_dot_P=ZERO;
213     P_PRES_dot_AP0=ZERO;
214     P_PRES_dot_AP1=ZERO;
215     #pragma ivdep
216     for (z=n_start; z < n_end; ++z) {
217     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
218     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
219     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
220     }
221     #pragma omp critical
222     {
223     loc_dots[0]+=R_PRES_dot_P;
224     loc_dots[1]+=P_PRES_dot_AP0;
225     loc_dots[2]+=P_PRES_dot_AP1;
226     }
227     } else if (order==3) {
228     R_PRES_dot_P=ZERO;
229     P_PRES_dot_AP0=ZERO;
230     P_PRES_dot_AP1=ZERO;
231     P_PRES_dot_AP2=ZERO;
232     #pragma ivdep
233     for (z=n_start; z < n_end; ++z) {
234     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
235     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
236     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
237     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
238     }
239     #pragma omp critical
240     {
241     loc_dots[0]+=R_PRES_dot_P;
242     loc_dots[1]+=P_PRES_dot_AP0;
243     loc_dots[2]+=P_PRES_dot_AP1;
244     loc_dots[3]+=P_PRES_dot_AP2;
245     }
246     } else if (order==4) {
247     R_PRES_dot_P=ZERO;
248     P_PRES_dot_AP0=ZERO;
249     P_PRES_dot_AP1=ZERO;
250     P_PRES_dot_AP2=ZERO;
251     P_PRES_dot_AP3=ZERO;
252     #pragma ivdep
253     for (z=n_start; z < n_end; ++z) {
254     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
255     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
256     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
257     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
258     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
259     }
260     #pragma omp critical
261     {
262     loc_dots[0]+=R_PRES_dot_P;
263     loc_dots[1]+=P_PRES_dot_AP0;
264     loc_dots[2]+=P_PRES_dot_AP1;
265     loc_dots[3]+=P_PRES_dot_AP2;
266     loc_dots[4]+=P_PRES_dot_AP3;
267     }
268     } else if (order==5) {
269     R_PRES_dot_P=ZERO;
270     P_PRES_dot_AP0=ZERO;
271     P_PRES_dot_AP1=ZERO;
272     P_PRES_dot_AP2=ZERO;
273     P_PRES_dot_AP3=ZERO;
274     P_PRES_dot_AP4=ZERO;
275     #pragma ivdep
276     for (z=n_start; z < n_end; ++z) {
277     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
278     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
279     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
280     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
281     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
282     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
283     }
284     #pragma omp critical
285     {
286     loc_dots[0]+=R_PRES_dot_P;
287     loc_dots[1]+=P_PRES_dot_AP0;
288     loc_dots[2]+=P_PRES_dot_AP1;
289     loc_dots[3]+=P_PRES_dot_AP2;
290     loc_dots[4]+=P_PRES_dot_AP3;
291     loc_dots[5]+=P_PRES_dot_AP4;
292     }
293     } else if (order==6) {
294     R_PRES_dot_P=ZERO;
295     P_PRES_dot_AP0=ZERO;
296     P_PRES_dot_AP1=ZERO;
297     P_PRES_dot_AP2=ZERO;
298     P_PRES_dot_AP3=ZERO;
299     P_PRES_dot_AP4=ZERO;
300     P_PRES_dot_AP5=ZERO;
301     #pragma ivdep
302     for (z=n_start; z < n_end; ++z) {
303     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
304     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
305     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
306     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
307     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
308     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
309     P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
310     }
311     #pragma omp critical
312     {
313     loc_dots[0]+=R_PRES_dot_P;
314     loc_dots[1]+=P_PRES_dot_AP0;
315     loc_dots[2]+=P_PRES_dot_AP1;
316     loc_dots[3]+=P_PRES_dot_AP2;
317     loc_dots[4]+=P_PRES_dot_AP3;
318     loc_dots[5]+=P_PRES_dot_AP4;
319     loc_dots[6]+=P_PRES_dot_AP5;
320     }
321     } else {
322     R_PRES_dot_P=ZERO;
323     P_PRES_dot_AP0=ZERO;
324     P_PRES_dot_AP1=ZERO;
325     P_PRES_dot_AP2=ZERO;
326     P_PRES_dot_AP3=ZERO;
327     P_PRES_dot_AP4=ZERO;
328     P_PRES_dot_AP5=ZERO;
329     P_PRES_dot_AP6=ZERO;
330     #pragma ivdep
331     for (z=n_start; z < n_end; ++z) {
332     R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
333     P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
334     P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
335     P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
336     P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
337     P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
338     P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
339     P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
340     }
341     #pragma omp critical
342     {
343     loc_dots[0]+=R_PRES_dot_P;
344     loc_dots[1]+=P_PRES_dot_AP0;
345     loc_dots[2]+=P_PRES_dot_AP1;
346     loc_dots[3]+=P_PRES_dot_AP2;
347     loc_dots[4]+=P_PRES_dot_AP3;
348     loc_dots[5]+=P_PRES_dot_AP4;
349     loc_dots[6]+=P_PRES_dot_AP5;
350     loc_dots[7]+=P_PRES_dot_AP6;
351     }
352     for (i=7;i<order;++i) {
353     P_PRES_dot_AP0=ZERO;
354     #pragma ivdep
355     for (z=n_start; z < n_end; ++z) {
356     P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
357     }
358     #pragma omp critical
359     {
360     loc_dots[i+1]+=P_PRES_dot_AP0;
361     }
362     }
363     }
364 jgs 150 }
365 gross 1759 #ifdef PASO_MPI
366     MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
367     R_PRES_dot_P_PRES[0]=dots[0];
368     memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
369     #else
370     R_PRES_dot_P_PRES[0]=loc_dots[0];
371     memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
372     #endif
373     R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
374 gross 1767
375 jgs 154 /*** if sum_BREAKF is equal to zero a breakdown occurs.
376     *** iteration procedure can be continued but R_PRES is not the
377     *** residual of X_PRES approximation.
378     ***/
379     sum_BREAKF=0.;
380 gross 1759 #pragma ivdep
381 jgs 154 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
382 gross 1759 breakFlag=!((ABS(R_PRES_dot_AP0) > ZERO) && (sum_BREAKF >ZERO));
383 jgs 154 if (!breakFlag) {
384     breakFlag=FALSE;
385     /***
386     ***** X_PRES and R_PRES are moved to memory:
387     ***/
388 gross 1556 Factor=0.;
389 gross 1759 #pragma ivdep
390 gross 1556 for (i=0;i<order;++i) {
391 jgs 154 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
392     Factor+=BREAKF[i]*ALPHA[i];
393 gross 1556 }
394 jgs 154
395 gross 1556 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
396     save_R_PRES=R_PRES[Length_of_mem-1];
397     save_XPRES=X_PRES[Length_of_mem-1];
398     save_P_PRES=P_PRES[Length_of_mem-1];
399 gross 1759 #pragma ivdep
400 gross 1556 for (i=Length_of_mem-1;i>0;--i) {
401 jgs 154 BREAKF[i]=BREAKF[i-1];
402     R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
403     R_PRES[i]=R_PRES[i-1];
404     X_PRES[i]=X_PRES[i-1];
405     P_PRES[i]=P_PRES[i-1];
406 gross 1556 }
407     R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
408     R_PRES[0]=save_R_PRES;
409     X_PRES[0]=save_XPRES;
410     P_PRES[0]=save_P_PRES;
411 jgs 154
412 gross 1556 if (ABS(Factor)<=ZERO) {
413 jgs 154 Factor=1.;
414     BREAKF[0]=ZERO;
415 gross 1556 } else {
416 jgs 154 Factor=1./Factor;
417     BREAKF[0]=ONE;
418 jgs 150 }
419 gross 1556 for (i=0;i<order;++i) ALPHA[i]*=Factor;
420 jgs 150 /*
421 jgs 154 ***** update of solution X_PRES and its residual R_PRES:
422 jgs 150 ***
423     *** P is used to accumulate X and AP to accumulate R. X and R
424     *** are still needed until they are put into the X and R memory
425 jgs 154 *** R_PRES and X_PRES
426 jgs 150 ***
427     **/
428 jgs 154 breakf0=BREAKF[0];
429 gross 1759 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
430     for (th=0;th<num_threads;++th) {
431     local_n=n/num_threads;
432     rest=n-local_n*num_threads;
433     n_start=local_n*th+MIN(th,rest);
434     n_end=local_n*(th+1)+MIN(th+1,rest);
435     if (order==0) {
436     #pragma ivdep
437     for (z=n_start; z < n_end; ++z) {
438     R_PRES[0][z]= Factor* AP[z];
439     X_PRES[0][z]=-Factor*P_PRES[1][z];
440     }
441     } else if (order==1) {
442     #pragma ivdep
443     for (z=n_start; z < n_end; ++z) {
444     R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
445     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
446     }
447     } else if (order==2) {
448     #pragma ivdep
449     for (z=n_start; z < n_end; ++z) {
450     R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
451     +ALPHA[1]*R_PRES[2][z];
452     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
453     +ALPHA[1]*X_PRES[2][z];
454     }
455     } else if (order==3) {
456     #pragma ivdep
457     for (z=n_start; z < n_end; ++z) {
458     R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
459     +ALPHA[1]*R_PRES[2][z]
460     +ALPHA[2]*R_PRES[3][z];
461     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
462     +ALPHA[1]*X_PRES[2][z]
463     +ALPHA[2]*X_PRES[3][z];
464     }
465     } else if (order==4) {
466     #pragma ivdep
467     for (z=n_start; z < n_end; ++z) {
468     R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
469     +ALPHA[1]*R_PRES[2][z]
470     +ALPHA[2]*R_PRES[3][z]
471     +ALPHA[3]*R_PRES[4][z];
472     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
473     +ALPHA[1]*X_PRES[2][z]
474     +ALPHA[2]*X_PRES[3][z]
475     +ALPHA[3]*X_PRES[4][z];
476     }
477     } else if (order==5) {
478     #pragma ivdep
479     for (z=n_start; z < n_end; ++z) {
480     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
481     +ALPHA[1]*R_PRES[2][z]
482     +ALPHA[2]*R_PRES[3][z]
483     +ALPHA[3]*R_PRES[4][z]
484     +ALPHA[4]*R_PRES[5][z];
485     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
486     +ALPHA[1]*X_PRES[2][z]
487     +ALPHA[2]*X_PRES[3][z]
488     +ALPHA[3]*X_PRES[4][z]
489     +ALPHA[4]*X_PRES[5][z];
490     }
491     } else if (order==6) {
492     #pragma ivdep
493     for (z=n_start; z < n_end; ++z) {
494     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
495     +ALPHA[1]*R_PRES[2][z]
496     +ALPHA[2]*R_PRES[3][z]
497     +ALPHA[3]*R_PRES[4][z]
498     +ALPHA[4]*R_PRES[5][z]
499     +ALPHA[5]*R_PRES[6][z];
500     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
501     +ALPHA[1]*X_PRES[2][z]
502     +ALPHA[2]*X_PRES[3][z]
503     +ALPHA[3]*X_PRES[4][z]
504     +ALPHA[4]*X_PRES[5][z]
505     +ALPHA[5]*X_PRES[6][z];
506     }
507     } else {
508     #pragma ivdep
509     for (z=n_start; z < n_end; ++z) {
510     R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
511     +ALPHA[1]*R_PRES[2][z]
512     +ALPHA[2]*R_PRES[3][z]
513     +ALPHA[3]*R_PRES[4][z]
514     +ALPHA[4]*R_PRES[5][z]
515     +ALPHA[5]*R_PRES[6][z]
516     +ALPHA[6]*R_PRES[7][z];
517     X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
518     +ALPHA[1]*X_PRES[2][z]
519     +ALPHA[2]*X_PRES[3][z]
520     +ALPHA[3]*X_PRES[4][z]
521     +ALPHA[4]*X_PRES[5][z]
522     +ALPHA[5]*X_PRES[6][z]
523     +ALPHA[6]*X_PRES[7][z];
524     }
525     for (i=7;i<order;++i) {
526     #pragma ivdep
527     for (z=n_start; z < n_end; ++z) {
528     R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
529     X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
530     }
531     }
532     }
533 jgs 150 }
534 jgs 154 if (breakf0>0.) {
535 jgs 150 /***
536 jgs 154 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
537 jgs 150 ***/
538 gross 1759 memset(loc_dots,0,sizeof(double)*3);
539     #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
540     for (th=0;th<num_threads;++th) {
541     local_n=n/num_threads;
542     rest=n-local_n*num_threads;
543     n_start=local_n*th+MIN(th,rest);
544     n_end=local_n*(th+1)+MIN(th+1,rest);
545     SC1=ZERO;
546     SC2=ZERO;
547     #pragma ivdep
548     for (z=n_start; z < n_end; ++z) {
549     diff=R_PRES[0][z]-r[z];
550     SC1+=diff*diff;
551     SC2+=diff*r[z];
552     }
553     #pragma omp critical
554     {
555     loc_dots[0]+=SC1;
556     loc_dots[1]+=SC2;
557     }
558 jgs 150 }
559 gross 1759 #ifdef PASO_MPI
560     MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
561     SC1=dots[0];
562     SC2=dots[1];
563     #else
564     SC1=loc_dots[0];
565     SC2=loc_dots[1];
566     #endif
567 jgs 154 gamma=(SC1<=ZERO) ? ZERO : -SC2/SC1;
568 gross 1759 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
569     for (th=0;th<num_threads;++th) {
570     local_n=n/num_threads;
571     rest=n-local_n*num_threads;
572     n_start=local_n*th+MIN(th,rest);
573     n_end=local_n*(th+1)+MIN(th+1,rest);
574     L2_R=ZERO;
575     #pragma ivdep
576     for (z=n_start; z < n_end; ++z) {
577     x[z]+=gamma*(X_PRES[0][z]-x[z]);
578     r[z]+=gamma*(R_PRES[0][z]-r[z]);
579     L2_R+=r[z]*r[z];
580     }
581     #pragma omp critical
582     {
583     loc_dots[2]+=L2_R;
584     }
585 jgs 150 }
586 gross 1759 #ifdef PASO_MPI
587     MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
588     L2_R=dots[2];
589     #else
590     L2_R=loc_dots[2];
591     #endif
592 jgs 150 norm_of_residual=sqrt(L2_R);
593     convergeFlag = (norm_of_residual <= tol);
594 jgs 154 if (restart>0) restartFlag=(num_iter_restart >= restart);
595     } else {
596     convergeFlag=FALSE;
597     restartFlag=FALSE;
598 jgs 150 }
599 jgs 154 maxIterFlag = (num_iter >= maxit);
600     }
601     }
602     /* end of iteration */
603 gross 1556 Norm_of_residual_global=norm_of_residual;
604     Num_iter_global=num_iter;
605     if (maxIterFlag) {
606 jgs 154 Status = SOLVER_MAXITER_REACHED;
607 jgs 150 } else if (breakFlag) {
608 jgs 154 Status = SOLVER_BREAKDOWN;
609 gross 1556 }
610     }
611 jgs 154 for (i=0;i<Length_of_recursion;i++) {
612     TMPMEMFREE(X_PRES[i]);
613     TMPMEMFREE(R_PRES[i]);
614     TMPMEMFREE(P_PRES[i]);
615 jgs 150 }
616 gross 1028 TMPMEMFREE(AP);
617     TMPMEMFREE(X_PRES);
618     TMPMEMFREE(R_PRES);
619     TMPMEMFREE(P_PRES);
620     TMPMEMFREE(P_PRES_dot_AP);
621     TMPMEMFREE(R_PRES_dot_P_PRES);
622     TMPMEMFREE(BREAKF);
623     TMPMEMFREE(ALPHA);
624 gross 1759 TMPMEMFREE(dots);
625     TMPMEMFREE(loc_dots);
626 jgs 154 *iter=Num_iter_global;
627     *tolerance=Norm_of_residual_global;
628     return Status;
629 jgs 150 }

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26