/[escript]/trunk/paso/src/GMRES.c
ViewVC logotype

Contents of /trunk/paso/src/GMRES.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1767 - (show annotations)
Mon Sep 8 02:53:50 2008 UTC (11 years, 11 months ago) by gross
File MIME type: text/plain
File size: 27047 byte(s)
print statements removed
1
2 /* $Id$ */
3
4 /*******************************************************
5 *
6 * Copyright 2003-2007 by ACceSS MNRF
7 * Copyright 2007 by University of Queensland
8 *
9 * http://esscc.uq.edu.au
10 * Primary Business: Queensland, Australia
11 * Licensed under the Open Software License version 3.0
12 * http://www.opensource.org/licenses/osl-3.0.php
13 *
14 *******************************************************/
15
16 /*
17 * Purpose
18 * =======
19 *
20 * GMRES solves the linear system A*x=b using the
21 * truncated and restered GMRES method with preconditioning.
22 *
23 * Convergence test: norm( b - A*x )< TOL.
24 *
25 *
26 *
27 * Arguments
28 * =========
29 *
30 * r (input/output) double array, dimension n.
31 * On entry, residual of inital guess X
32 *
33 * x (input/output) double array, dimension n.
34 * On input, the initial guess.
35 *
36 * iter (input/output) int
37 * On input, the maximum num_iterations to be performed.
38 * On output, actual number of num_iterations performed.
39 *
40 * tolerance (input/output) DOUBLE PRECISION
41 * On input, the allowable convergence measure for
42 * norm( b - A*x )
43 * On output, the final value of this measure.
44 *
45 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
46 *
47 * restart (input) If restart>0, iteration is resterted a after restart steps.
48 *
49 * INFO (output) int
50 *
51 * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
52 * =SOLVER_MAXNUM_ITER_REACHED
53 * =SOLVER_INPUT_ERROR Illegal parameter:
54 * =SOLVER_BREAKDOWN: bad luck!
55 * =SOLVER_MEMORY_ERROR : no memory available
56 *
57 * ==============================================================
58 */
59
60 #include "Common.h"
61 #include "SystemMatrix.h"
62 #include "Solver.h"
63 #ifdef _OPENMP
64 #include <omp.h>
65 #endif
66
67 err_t Paso_Solver_GMRES(
68 Paso_SystemMatrix * A,
69 double * r,
70 double * x,
71 dim_t *iter,
72 double * tolerance,dim_t Length_of_recursion,dim_t restart,
73 Paso_Performance* pp) {
74
75 /* Local variables */
76
77 #ifdef _OPENMP
78 const int num_threads=omp_get_max_threads();
79 #else
80 const int num_threads=1;
81 #endif
82 double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
83 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
84 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
85 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual,diff,L2_R,Norm_of_residual_global;
86 double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
87 dim_t maxit,Num_iter_global,num_iter_restart,num_iter;
88 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
89 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
90 err_t Status=SOLVER_NO_ERROR;
91
92
93 /* adapt original routine parameters */
94 n = Paso_SystemMatrix_getTotalNumRows(A);
95 Length_of_mem=MAX(Length_of_recursion,0)+1;
96
97 /* Test the input parameters. */
98 if (restart>0) restart=MAX(Length_of_recursion,restart);
99 if (n < 0 || Length_of_recursion<=0) {
100 return SOLVER_INPUT_ERROR;
101 }
102
103 /* allocate memory: */
104 X_PRES=TMPMEMALLOC(Length_of_mem,double*);
105 R_PRES=TMPMEMALLOC(Length_of_mem,double*);
106 P_PRES=TMPMEMALLOC(Length_of_mem,double*);
107 loc_dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
108 dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
109 P_PRES_dot_AP=TMPMEMALLOC(Length_of_mem,double);
110 R_PRES_dot_P_PRES=TMPMEMALLOC(Length_of_mem,double);
111 BREAKF=TMPMEMALLOC(Length_of_mem,double);
112 ALPHA=TMPMEMALLOC(Length_of_mem,double);
113 AP=TMPMEMALLOC(n,double);
114 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
115 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
116 Status=SOLVER_MEMORY_ERROR;
117 } else {
118 for (i=0;i<Length_of_mem;i++) {
119 X_PRES[i]=TMPMEMALLOC(n,double);
120 R_PRES[i]=TMPMEMALLOC(n,double);
121 P_PRES[i]=TMPMEMALLOC(n,double);
122 if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
123 }
124 }
125 if ( Status ==SOLVER_NO_ERROR ) {
126
127 /* now PRES starts : */
128 maxit=*iter;
129 tol=*tolerance;
130
131 /* initialization */
132
133 restartFlag=TRUE;
134 num_iter=0;
135
136 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
137 for (th=0;th<num_threads;++th) {
138 local_n=n/num_threads;
139 rest=n-local_n*num_threads;
140 n_start=local_n*th+MIN(th,rest);
141 n_end=local_n*(th+1)+MIN(th+1,rest);
142 memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
143 for(i=0;i<Length_of_mem;++i) {
144 memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
145 memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146 memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
147 }
148 }
149
150 while (! (convergeFlag || maxIterFlag || breakFlag)) {
151 if (restartFlag) {
152 BREAKF[0]=ONE;
153 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
154 for (th=0;th<num_threads;++th) {
155 local_n=n/num_threads;
156 rest=n-local_n*num_threads;
157 n_start=local_n*th+MIN(th,rest);
158 n_end=local_n*(th+1)+MIN(th+1,rest);
159 memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
160 memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
161 }
162 num_iter_restart=0;
163 restartFlag=FALSE;
164 }
165 ++num_iter;
166 ++num_iter_restart;
167 /* order is the dimension of the space on which the residual is minimized: */
168 order=MIN(num_iter_restart,Length_of_recursion);
169 /***
170 *** calculate new search direction P from R_PRES
171 ***/
172 Paso_Solver_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
173 /***
174 *** apply A to P to get AP
175 ***/
176 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, &P_PRES[0][0],ZERO, &AP[0]);
177 /***
178 ***** calculation of the norm of R and the scalar products of
179 *** the residuals and A*P:
180 ***/
181 memset(loc_dots,0,sizeof(double)*(order+1));
182 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
183 for (th=0;th<num_threads;++th) {
184 local_n=n/num_threads;
185 rest=n-local_n*num_threads;
186 n_start=local_n*th+MIN(th,rest);
187 n_end=local_n*(th+1)+MIN(th+1,rest);
188 if (order==0) {
189 R_PRES_dot_P=ZERO;
190 #pragma ivdep
191 for (z=n_start; z < n_end; ++z) {
192 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
193 }
194 #pragma omp critical
195 {
196 loc_dots[0]+=R_PRES_dot_P;
197 }
198 } else if (order==1) {
199 R_PRES_dot_P=ZERO;
200 P_PRES_dot_AP0=ZERO;
201 #pragma ivdep
202 for (z=n_start; z < n_end; ++z) {
203 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
204 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
205 }
206 #pragma omp critical
207 {
208 loc_dots[0]+=R_PRES_dot_P;
209 loc_dots[1]+=P_PRES_dot_AP0;
210 }
211 } else if (order==2) {
212 R_PRES_dot_P=ZERO;
213 P_PRES_dot_AP0=ZERO;
214 P_PRES_dot_AP1=ZERO;
215 #pragma ivdep
216 for (z=n_start; z < n_end; ++z) {
217 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
218 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
219 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
220 }
221 #pragma omp critical
222 {
223 loc_dots[0]+=R_PRES_dot_P;
224 loc_dots[1]+=P_PRES_dot_AP0;
225 loc_dots[2]+=P_PRES_dot_AP1;
226 }
227 } else if (order==3) {
228 R_PRES_dot_P=ZERO;
229 P_PRES_dot_AP0=ZERO;
230 P_PRES_dot_AP1=ZERO;
231 P_PRES_dot_AP2=ZERO;
232 #pragma ivdep
233 for (z=n_start; z < n_end; ++z) {
234 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
235 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
236 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
237 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
238 }
239 #pragma omp critical
240 {
241 loc_dots[0]+=R_PRES_dot_P;
242 loc_dots[1]+=P_PRES_dot_AP0;
243 loc_dots[2]+=P_PRES_dot_AP1;
244 loc_dots[3]+=P_PRES_dot_AP2;
245 }
246 } else if (order==4) {
247 R_PRES_dot_P=ZERO;
248 P_PRES_dot_AP0=ZERO;
249 P_PRES_dot_AP1=ZERO;
250 P_PRES_dot_AP2=ZERO;
251 P_PRES_dot_AP3=ZERO;
252 #pragma ivdep
253 for (z=n_start; z < n_end; ++z) {
254 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
255 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
256 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
257 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
258 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
259 }
260 #pragma omp critical
261 {
262 loc_dots[0]+=R_PRES_dot_P;
263 loc_dots[1]+=P_PRES_dot_AP0;
264 loc_dots[2]+=P_PRES_dot_AP1;
265 loc_dots[3]+=P_PRES_dot_AP2;
266 loc_dots[4]+=P_PRES_dot_AP3;
267 }
268 } else if (order==5) {
269 R_PRES_dot_P=ZERO;
270 P_PRES_dot_AP0=ZERO;
271 P_PRES_dot_AP1=ZERO;
272 P_PRES_dot_AP2=ZERO;
273 P_PRES_dot_AP3=ZERO;
274 P_PRES_dot_AP4=ZERO;
275 #pragma ivdep
276 for (z=n_start; z < n_end; ++z) {
277 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
278 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
279 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
280 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
281 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
282 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
283 }
284 #pragma omp critical
285 {
286 loc_dots[0]+=R_PRES_dot_P;
287 loc_dots[1]+=P_PRES_dot_AP0;
288 loc_dots[2]+=P_PRES_dot_AP1;
289 loc_dots[3]+=P_PRES_dot_AP2;
290 loc_dots[4]+=P_PRES_dot_AP3;
291 loc_dots[5]+=P_PRES_dot_AP4;
292 }
293 } else if (order==6) {
294 R_PRES_dot_P=ZERO;
295 P_PRES_dot_AP0=ZERO;
296 P_PRES_dot_AP1=ZERO;
297 P_PRES_dot_AP2=ZERO;
298 P_PRES_dot_AP3=ZERO;
299 P_PRES_dot_AP4=ZERO;
300 P_PRES_dot_AP5=ZERO;
301 #pragma ivdep
302 for (z=n_start; z < n_end; ++z) {
303 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
304 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
305 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
306 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
307 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
308 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
309 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
310 }
311 #pragma omp critical
312 {
313 loc_dots[0]+=R_PRES_dot_P;
314 loc_dots[1]+=P_PRES_dot_AP0;
315 loc_dots[2]+=P_PRES_dot_AP1;
316 loc_dots[3]+=P_PRES_dot_AP2;
317 loc_dots[4]+=P_PRES_dot_AP3;
318 loc_dots[5]+=P_PRES_dot_AP4;
319 loc_dots[6]+=P_PRES_dot_AP5;
320 }
321 } else {
322 R_PRES_dot_P=ZERO;
323 P_PRES_dot_AP0=ZERO;
324 P_PRES_dot_AP1=ZERO;
325 P_PRES_dot_AP2=ZERO;
326 P_PRES_dot_AP3=ZERO;
327 P_PRES_dot_AP4=ZERO;
328 P_PRES_dot_AP5=ZERO;
329 P_PRES_dot_AP6=ZERO;
330 #pragma ivdep
331 for (z=n_start; z < n_end; ++z) {
332 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
333 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
334 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
335 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
336 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
337 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
338 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
339 P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
340 }
341 #pragma omp critical
342 {
343 loc_dots[0]+=R_PRES_dot_P;
344 loc_dots[1]+=P_PRES_dot_AP0;
345 loc_dots[2]+=P_PRES_dot_AP1;
346 loc_dots[3]+=P_PRES_dot_AP2;
347 loc_dots[4]+=P_PRES_dot_AP3;
348 loc_dots[5]+=P_PRES_dot_AP4;
349 loc_dots[6]+=P_PRES_dot_AP5;
350 loc_dots[7]+=P_PRES_dot_AP6;
351 }
352 for (i=7;i<order;++i) {
353 P_PRES_dot_AP0=ZERO;
354 #pragma ivdep
355 for (z=n_start; z < n_end; ++z) {
356 P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
357 }
358 #pragma omp critical
359 {
360 loc_dots[i+1]+=P_PRES_dot_AP0;
361 }
362 }
363 }
364 }
365 #ifdef PASO_MPI
366 MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
367 R_PRES_dot_P_PRES[0]=dots[0];
368 memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
369 #else
370 R_PRES_dot_P_PRES[0]=loc_dots[0];
371 memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
372 #endif
373 R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
374
375 /*** if sum_BREAKF is equal to zero a breakdown occurs.
376 *** iteration procedure can be continued but R_PRES is not the
377 *** residual of X_PRES approximation.
378 ***/
379 sum_BREAKF=0.;
380 #pragma ivdep
381 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
382 breakFlag=!((ABS(R_PRES_dot_AP0) > ZERO) && (sum_BREAKF >ZERO));
383 if (!breakFlag) {
384 breakFlag=FALSE;
385 /***
386 ***** X_PRES and R_PRES are moved to memory:
387 ***/
388 Factor=0.;
389 #pragma ivdep
390 for (i=0;i<order;++i) {
391 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
392 Factor+=BREAKF[i]*ALPHA[i];
393 }
394
395 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
396 save_R_PRES=R_PRES[Length_of_mem-1];
397 save_XPRES=X_PRES[Length_of_mem-1];
398 save_P_PRES=P_PRES[Length_of_mem-1];
399 #pragma ivdep
400 for (i=Length_of_mem-1;i>0;--i) {
401 BREAKF[i]=BREAKF[i-1];
402 R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
403 R_PRES[i]=R_PRES[i-1];
404 X_PRES[i]=X_PRES[i-1];
405 P_PRES[i]=P_PRES[i-1];
406 }
407 R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
408 R_PRES[0]=save_R_PRES;
409 X_PRES[0]=save_XPRES;
410 P_PRES[0]=save_P_PRES;
411
412 if (ABS(Factor)<=ZERO) {
413 Factor=1.;
414 BREAKF[0]=ZERO;
415 } else {
416 Factor=1./Factor;
417 BREAKF[0]=ONE;
418 }
419 for (i=0;i<order;++i) ALPHA[i]*=Factor;
420 /*
421 ***** update of solution X_PRES and its residual R_PRES:
422 ***
423 *** P is used to accumulate X and AP to accumulate R. X and R
424 *** are still needed until they are put into the X and R memory
425 *** R_PRES and X_PRES
426 ***
427 **/
428 breakf0=BREAKF[0];
429 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
430 for (th=0;th<num_threads;++th) {
431 local_n=n/num_threads;
432 rest=n-local_n*num_threads;
433 n_start=local_n*th+MIN(th,rest);
434 n_end=local_n*(th+1)+MIN(th+1,rest);
435 if (order==0) {
436 #pragma ivdep
437 for (z=n_start; z < n_end; ++z) {
438 R_PRES[0][z]= Factor* AP[z];
439 X_PRES[0][z]=-Factor*P_PRES[1][z];
440 }
441 } else if (order==1) {
442 #pragma ivdep
443 for (z=n_start; z < n_end; ++z) {
444 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
445 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
446 }
447 } else if (order==2) {
448 #pragma ivdep
449 for (z=n_start; z < n_end; ++z) {
450 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
451 +ALPHA[1]*R_PRES[2][z];
452 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
453 +ALPHA[1]*X_PRES[2][z];
454 }
455 } else if (order==3) {
456 #pragma ivdep
457 for (z=n_start; z < n_end; ++z) {
458 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
459 +ALPHA[1]*R_PRES[2][z]
460 +ALPHA[2]*R_PRES[3][z];
461 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
462 +ALPHA[1]*X_PRES[2][z]
463 +ALPHA[2]*X_PRES[3][z];
464 }
465 } else if (order==4) {
466 #pragma ivdep
467 for (z=n_start; z < n_end; ++z) {
468 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
469 +ALPHA[1]*R_PRES[2][z]
470 +ALPHA[2]*R_PRES[3][z]
471 +ALPHA[3]*R_PRES[4][z];
472 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
473 +ALPHA[1]*X_PRES[2][z]
474 +ALPHA[2]*X_PRES[3][z]
475 +ALPHA[3]*X_PRES[4][z];
476 }
477 } else if (order==5) {
478 #pragma ivdep
479 for (z=n_start; z < n_end; ++z) {
480 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
481 +ALPHA[1]*R_PRES[2][z]
482 +ALPHA[2]*R_PRES[3][z]
483 +ALPHA[3]*R_PRES[4][z]
484 +ALPHA[4]*R_PRES[5][z];
485 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
486 +ALPHA[1]*X_PRES[2][z]
487 +ALPHA[2]*X_PRES[3][z]
488 +ALPHA[3]*X_PRES[4][z]
489 +ALPHA[4]*X_PRES[5][z];
490 }
491 } else if (order==6) {
492 #pragma ivdep
493 for (z=n_start; z < n_end; ++z) {
494 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
495 +ALPHA[1]*R_PRES[2][z]
496 +ALPHA[2]*R_PRES[3][z]
497 +ALPHA[3]*R_PRES[4][z]
498 +ALPHA[4]*R_PRES[5][z]
499 +ALPHA[5]*R_PRES[6][z];
500 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
501 +ALPHA[1]*X_PRES[2][z]
502 +ALPHA[2]*X_PRES[3][z]
503 +ALPHA[3]*X_PRES[4][z]
504 +ALPHA[4]*X_PRES[5][z]
505 +ALPHA[5]*X_PRES[6][z];
506 }
507 } else {
508 #pragma ivdep
509 for (z=n_start; z < n_end; ++z) {
510 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
511 +ALPHA[1]*R_PRES[2][z]
512 +ALPHA[2]*R_PRES[3][z]
513 +ALPHA[3]*R_PRES[4][z]
514 +ALPHA[4]*R_PRES[5][z]
515 +ALPHA[5]*R_PRES[6][z]
516 +ALPHA[6]*R_PRES[7][z];
517 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
518 +ALPHA[1]*X_PRES[2][z]
519 +ALPHA[2]*X_PRES[3][z]
520 +ALPHA[3]*X_PRES[4][z]
521 +ALPHA[4]*X_PRES[5][z]
522 +ALPHA[5]*X_PRES[6][z]
523 +ALPHA[6]*X_PRES[7][z];
524 }
525 for (i=7;i<order;++i) {
526 #pragma ivdep
527 for (z=n_start; z < n_end; ++z) {
528 R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
529 X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
530 }
531 }
532 }
533 }
534 if (breakf0>0.) {
535 /***
536 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
537 ***/
538 memset(loc_dots,0,sizeof(double)*3);
539 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
540 for (th=0;th<num_threads;++th) {
541 local_n=n/num_threads;
542 rest=n-local_n*num_threads;
543 n_start=local_n*th+MIN(th,rest);
544 n_end=local_n*(th+1)+MIN(th+1,rest);
545 SC1=ZERO;
546 SC2=ZERO;
547 #pragma ivdep
548 for (z=n_start; z < n_end; ++z) {
549 diff=R_PRES[0][z]-r[z];
550 SC1+=diff*diff;
551 SC2+=diff*r[z];
552 }
553 #pragma omp critical
554 {
555 loc_dots[0]+=SC1;
556 loc_dots[1]+=SC2;
557 }
558 }
559 #ifdef PASO_MPI
560 MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
561 SC1=dots[0];
562 SC2=dots[1];
563 #else
564 SC1=loc_dots[0];
565 SC2=loc_dots[1];
566 #endif
567 gamma=(SC1<=ZERO) ? ZERO : -SC2/SC1;
568 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
569 for (th=0;th<num_threads;++th) {
570 local_n=n/num_threads;
571 rest=n-local_n*num_threads;
572 n_start=local_n*th+MIN(th,rest);
573 n_end=local_n*(th+1)+MIN(th+1,rest);
574 L2_R=ZERO;
575 #pragma ivdep
576 for (z=n_start; z < n_end; ++z) {
577 x[z]+=gamma*(X_PRES[0][z]-x[z]);
578 r[z]+=gamma*(R_PRES[0][z]-r[z]);
579 L2_R+=r[z]*r[z];
580 }
581 #pragma omp critical
582 {
583 loc_dots[2]+=L2_R;
584 }
585 }
586 #ifdef PASO_MPI
587 MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
588 L2_R=dots[2];
589 #else
590 L2_R=loc_dots[2];
591 #endif
592 norm_of_residual=sqrt(L2_R);
593 convergeFlag = (norm_of_residual <= tol);
594 if (restart>0) restartFlag=(num_iter_restart >= restart);
595 } else {
596 convergeFlag=FALSE;
597 restartFlag=FALSE;
598 }
599 maxIterFlag = (num_iter >= maxit);
600 }
601 }
602 /* end of iteration */
603 Norm_of_residual_global=norm_of_residual;
604 Num_iter_global=num_iter;
605 if (maxIterFlag) {
606 Status = SOLVER_MAXITER_REACHED;
607 } else if (breakFlag) {
608 Status = SOLVER_BREAKDOWN;
609 }
610 }
611 for (i=0;i<Length_of_recursion;i++) {
612 TMPMEMFREE(X_PRES[i]);
613 TMPMEMFREE(R_PRES[i]);
614 TMPMEMFREE(P_PRES[i]);
615 }
616 TMPMEMFREE(AP);
617 TMPMEMFREE(X_PRES);
618 TMPMEMFREE(R_PRES);
619 TMPMEMFREE(P_PRES);
620 TMPMEMFREE(P_PRES_dot_AP);
621 TMPMEMFREE(R_PRES_dot_P_PRES);
622 TMPMEMFREE(BREAKF);
623 TMPMEMFREE(ALPHA);
624 TMPMEMFREE(dots);
625 TMPMEMFREE(loc_dots);
626 *iter=Num_iter_global;
627 *tolerance=Norm_of_residual_global;
628 return Status;
629 }

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26