/[escript]/trunk/paso/src/GMRES.c
ViewVC logotype

Contents of /trunk/paso/src/GMRES.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3642 - (show annotations)
Thu Oct 27 03:41:51 2011 UTC (7 years, 5 months ago) by caltinay
File MIME type: text/plain
File size: 27271 byte(s)
Assorted spelling/comment fixes in paso.

1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /*
16 * Purpose
17 * =======
18 *
19 * GMRES solves the linear system A*x=b using the
20 * truncated and restarted GMRES method with preconditioning.
21 *
22 * Convergence test: norm( b - A*x )< TOL.
23 *
24 *
25 *
26 * Arguments
27 * =========
28 *
29 * r (input/output) double array, dimension n.
30 * On entry, residual of initial guess X
31 *
32 * x (input/output) double array, dimension n.
33 * On input, the initial guess.
34 *
35 * iter (input/output) int
36 * On input, the maximum num_iterations to be performed.
37 * On output, actual number of num_iterations performed.
38 *
39 * tolerance (input/output) DOUBLE PRECISION
40 * On input, the allowable convergence measure for
41 * norm( b - A*x )
42 * On output, the final value of this measure.
43 *
44 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
45 *
46 * restart (input) If restart>0, iteration is restarted a after restart steps.
47 *
48 * INFO (output) int
49 *
50 * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
51 * =SOLVER_MAXNUM_ITER_REACHED
52 * =SOLVER_INPUT_ERROR Illegal parameter:
53 * =SOLVER_BREAKDOWN: bad luck!
54 * =SOLVER_MEMORY_ERROR : no memory available
55 *
56 * ==============================================================
57 */
58
59 #include "Common.h"
60 #include "SystemMatrix.h"
61 #include "Solver.h"
62 #ifdef _OPENMP
63 #include <omp.h>
64 #endif
65
66 err_t Paso_Solver_GMRES(
67 Paso_SystemMatrix * A,
68 double * r,
69 double * x,
70 dim_t *iter,
71 double * tolerance,dim_t Length_of_recursion,dim_t restart,
72 Paso_Performance* pp) {
73
74 /* Local variables */
75
76 #ifdef _OPENMP
77 const int num_threads=omp_get_max_threads();
78 #else
79 const int num_threads=1;
80 #endif
81 double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
82 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
83 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
84 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual=0,diff,L2_R,Norm_of_residual_global=0;
85 double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
86 dim_t maxit,Num_iter_global=0,num_iter_restart=0,num_iter;
87 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
88 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
89 err_t Status=SOLVER_NO_ERROR;
90
91
92 /* adapt original routine parameters */
93 n = Paso_SystemMatrix_getTotalNumRows(A);
94 Length_of_mem=MAX(Length_of_recursion,0)+1;
95
96 /* Test the input parameters. */
97 if (restart>0) restart=MAX(Length_of_recursion,restart);
98 if (n < 0 || Length_of_recursion<=0) {
99 return SOLVER_INPUT_ERROR;
100 }
101
102 /* allocate memory: */
103 X_PRES=TMPMEMALLOC(Length_of_mem,double*);
104 R_PRES=TMPMEMALLOC(Length_of_mem,double*);
105 P_PRES=TMPMEMALLOC(Length_of_mem,double*);
106 loc_dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
107 dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
108 P_PRES_dot_AP=TMPMEMALLOC(Length_of_mem,double);
109 R_PRES_dot_P_PRES=TMPMEMALLOC(Length_of_mem,double);
110 BREAKF=TMPMEMALLOC(Length_of_mem,double);
111 ALPHA=TMPMEMALLOC(Length_of_mem,double);
112 AP=TMPMEMALLOC(n,double);
113 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
114 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
115 Status=SOLVER_MEMORY_ERROR;
116 } else {
117 for (i=0;i<Length_of_mem;i++) {
118 X_PRES[i]=TMPMEMALLOC(n,double);
119 R_PRES[i]=TMPMEMALLOC(n,double);
120 P_PRES[i]=TMPMEMALLOC(n,double);
121 if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
122 }
123 }
124 if ( Status ==SOLVER_NO_ERROR ) {
125
126 /* now PRES starts : */
127 maxit=*iter;
128 tol=*tolerance;
129
130 /* initialization */
131
132 restartFlag=TRUE;
133 num_iter=0;
134
135 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
136 for (th=0;th<num_threads;++th) {
137 local_n=n/num_threads;
138 rest=n-local_n*num_threads;
139 n_start=local_n*th+MIN(th,rest);
140 n_end=local_n*(th+1)+MIN(th+1,rest);
141 memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
142 for(i=0;i<Length_of_mem;++i) {
143 memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
144 memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
145 memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146 }
147 }
148
149 while (! (convergeFlag || maxIterFlag || breakFlag)) {
150 if (restartFlag) {
151 BREAKF[0]=PASO_ONE;
152 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
153 for (th=0;th<num_threads;++th) {
154 local_n=n/num_threads;
155 rest=n-local_n*num_threads;
156 n_start=local_n*th+MIN(th,rest);
157 n_end=local_n*(th+1)+MIN(th+1,rest);
158 memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
159 memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
160 }
161 num_iter_restart=0;
162 restartFlag=FALSE;
163 }
164 ++num_iter;
165 ++num_iter_restart;
166 /* order is the dimension of the space on which the residual is minimized: */
167 order=MIN(num_iter_restart,Length_of_recursion);
168 /***
169 *** calculate new search direction P from R_PRES
170 ***/
171 Paso_SystemMatrix_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
172 /***
173 *** apply A to P to get AP
174 ***/
175 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, &P_PRES[0][0],PASO_ZERO, &AP[0]);
176 /***
177 ***** calculation of the norm of R and the scalar products of
178 *** the residuals and A*P:
179 ***/
180 memset(loc_dots,0,sizeof(double)*(order+1));
181 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
182 for (th=0;th<num_threads;++th) {
183 local_n=n/num_threads;
184 rest=n-local_n*num_threads;
185 n_start=local_n*th+MIN(th,rest);
186 n_end=local_n*(th+1)+MIN(th+1,rest);
187 if (order==0) {
188 R_PRES_dot_P=PASO_ZERO;
189 #pragma ivdep
190 for (z=n_start; z < n_end; ++z) {
191 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
192 }
193 #pragma omp critical
194 {
195 loc_dots[0]+=R_PRES_dot_P;
196 }
197 } else if (order==1) {
198 R_PRES_dot_P=PASO_ZERO;
199 P_PRES_dot_AP0=PASO_ZERO;
200 #pragma ivdep
201 for (z=n_start; z < n_end; ++z) {
202 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
203 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
204 }
205 #pragma omp critical
206 {
207 loc_dots[0]+=R_PRES_dot_P;
208 loc_dots[1]+=P_PRES_dot_AP0;
209 }
210 } else if (order==2) {
211 R_PRES_dot_P=PASO_ZERO;
212 P_PRES_dot_AP0=PASO_ZERO;
213 P_PRES_dot_AP1=PASO_ZERO;
214 #pragma ivdep
215 for (z=n_start; z < n_end; ++z) {
216 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
217 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
218 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
219 }
220 #pragma omp critical
221 {
222 loc_dots[0]+=R_PRES_dot_P;
223 loc_dots[1]+=P_PRES_dot_AP0;
224 loc_dots[2]+=P_PRES_dot_AP1;
225 }
226 } else if (order==3) {
227 R_PRES_dot_P=PASO_ZERO;
228 P_PRES_dot_AP0=PASO_ZERO;
229 P_PRES_dot_AP1=PASO_ZERO;
230 P_PRES_dot_AP2=PASO_ZERO;
231 #pragma ivdep
232 for (z=n_start; z < n_end; ++z) {
233 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
234 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
235 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
236 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
237 }
238 #pragma omp critical
239 {
240 loc_dots[0]+=R_PRES_dot_P;
241 loc_dots[1]+=P_PRES_dot_AP0;
242 loc_dots[2]+=P_PRES_dot_AP1;
243 loc_dots[3]+=P_PRES_dot_AP2;
244 }
245 } else if (order==4) {
246 R_PRES_dot_P=PASO_ZERO;
247 P_PRES_dot_AP0=PASO_ZERO;
248 P_PRES_dot_AP1=PASO_ZERO;
249 P_PRES_dot_AP2=PASO_ZERO;
250 P_PRES_dot_AP3=PASO_ZERO;
251 #pragma ivdep
252 for (z=n_start; z < n_end; ++z) {
253 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
254 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
255 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
256 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
257 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
258 }
259 #pragma omp critical
260 {
261 loc_dots[0]+=R_PRES_dot_P;
262 loc_dots[1]+=P_PRES_dot_AP0;
263 loc_dots[2]+=P_PRES_dot_AP1;
264 loc_dots[3]+=P_PRES_dot_AP2;
265 loc_dots[4]+=P_PRES_dot_AP3;
266 }
267 } else if (order==5) {
268 R_PRES_dot_P=PASO_ZERO;
269 P_PRES_dot_AP0=PASO_ZERO;
270 P_PRES_dot_AP1=PASO_ZERO;
271 P_PRES_dot_AP2=PASO_ZERO;
272 P_PRES_dot_AP3=PASO_ZERO;
273 P_PRES_dot_AP4=PASO_ZERO;
274 #pragma ivdep
275 for (z=n_start; z < n_end; ++z) {
276 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
277 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
278 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
279 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
280 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
281 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
282 }
283 #pragma omp critical
284 {
285 loc_dots[0]+=R_PRES_dot_P;
286 loc_dots[1]+=P_PRES_dot_AP0;
287 loc_dots[2]+=P_PRES_dot_AP1;
288 loc_dots[3]+=P_PRES_dot_AP2;
289 loc_dots[4]+=P_PRES_dot_AP3;
290 loc_dots[5]+=P_PRES_dot_AP4;
291 }
292 } else if (order==6) {
293 R_PRES_dot_P=PASO_ZERO;
294 P_PRES_dot_AP0=PASO_ZERO;
295 P_PRES_dot_AP1=PASO_ZERO;
296 P_PRES_dot_AP2=PASO_ZERO;
297 P_PRES_dot_AP3=PASO_ZERO;
298 P_PRES_dot_AP4=PASO_ZERO;
299 P_PRES_dot_AP5=PASO_ZERO;
300 #pragma ivdep
301 for (z=n_start; z < n_end; ++z) {
302 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
303 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
304 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
305 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
306 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
307 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
308 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
309 }
310 #pragma omp critical
311 {
312 loc_dots[0]+=R_PRES_dot_P;
313 loc_dots[1]+=P_PRES_dot_AP0;
314 loc_dots[2]+=P_PRES_dot_AP1;
315 loc_dots[3]+=P_PRES_dot_AP2;
316 loc_dots[4]+=P_PRES_dot_AP3;
317 loc_dots[5]+=P_PRES_dot_AP4;
318 loc_dots[6]+=P_PRES_dot_AP5;
319 }
320 } else {
321 R_PRES_dot_P=PASO_ZERO;
322 P_PRES_dot_AP0=PASO_ZERO;
323 P_PRES_dot_AP1=PASO_ZERO;
324 P_PRES_dot_AP2=PASO_ZERO;
325 P_PRES_dot_AP3=PASO_ZERO;
326 P_PRES_dot_AP4=PASO_ZERO;
327 P_PRES_dot_AP5=PASO_ZERO;
328 P_PRES_dot_AP6=PASO_ZERO;
329 #pragma ivdep
330 for (z=n_start; z < n_end; ++z) {
331 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
332 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
333 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
334 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
335 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
336 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
337 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
338 P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
339 }
340 #pragma omp critical
341 {
342 loc_dots[0]+=R_PRES_dot_P;
343 loc_dots[1]+=P_PRES_dot_AP0;
344 loc_dots[2]+=P_PRES_dot_AP1;
345 loc_dots[3]+=P_PRES_dot_AP2;
346 loc_dots[4]+=P_PRES_dot_AP3;
347 loc_dots[5]+=P_PRES_dot_AP4;
348 loc_dots[6]+=P_PRES_dot_AP5;
349 loc_dots[7]+=P_PRES_dot_AP6;
350 }
351 for (i=7;i<order;++i) {
352 P_PRES_dot_AP0=PASO_ZERO;
353 #pragma ivdep
354 for (z=n_start; z < n_end; ++z) {
355 P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
356 }
357 #pragma omp critical
358 {
359 loc_dots[i+1]+=P_PRES_dot_AP0;
360 }
361 }
362 }
363 }
364 #ifdef ESYS_MPI
365 MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
366 R_PRES_dot_P_PRES[0]=dots[0];
367 memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
368 #else
369 R_PRES_dot_P_PRES[0]=loc_dots[0];
370 memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
371 #endif
372 R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
373 /*** If sum_BREAKF is equal to zero a breakdown occurs.
374 *** Iteration procedure can be continued but R_PRES is not the
375 *** residual of X_PRES approximation.
376 ***/
377 sum_BREAKF=0.;
378 #pragma ivdep
379 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
380 breakFlag=!((ABS(R_PRES_dot_AP0) > PASO_ZERO) && (sum_BREAKF >PASO_ZERO));
381 if (!breakFlag) {
382 breakFlag=FALSE;
383 /***
384 ***** X_PRES and R_PRES are moved to memory:
385 ***/
386 Factor=0.;
387 #pragma ivdep
388 for (i=0;i<order;++i) {
389 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
390 Factor+=BREAKF[i]*ALPHA[i];
391 }
392
393 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
394 save_R_PRES=R_PRES[Length_of_mem-1];
395 save_XPRES=X_PRES[Length_of_mem-1];
396 save_P_PRES=P_PRES[Length_of_mem-1];
397 #pragma ivdep
398 for (i=Length_of_mem-1;i>0;--i) {
399 BREAKF[i]=BREAKF[i-1];
400 R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
401 R_PRES[i]=R_PRES[i-1];
402 X_PRES[i]=X_PRES[i-1];
403 P_PRES[i]=P_PRES[i-1];
404 }
405 R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
406 R_PRES[0]=save_R_PRES;
407 X_PRES[0]=save_XPRES;
408 P_PRES[0]=save_P_PRES;
409
410 if (ABS(Factor)<=PASO_ZERO) {
411 Factor=1.;
412 BREAKF[0]=PASO_ZERO;
413 } else {
414 Factor=1./Factor;
415 BREAKF[0]=PASO_ONE;
416 }
417 for (i=0;i<order;++i) ALPHA[i]*=Factor;
418 /*
419 ***** Update of solution X_PRES and its residual R_PRES:
420 ***
421 *** P is used to accumulate X and AP to accumulate R. X and R
422 *** are still needed until they are put into the X and R memory
423 *** R_PRES and X_PRES
424 ***
425 **/
426 breakf0=BREAKF[0];
427 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
428 for (th=0;th<num_threads;++th) {
429 local_n=n/num_threads;
430 rest=n-local_n*num_threads;
431 n_start=local_n*th+MIN(th,rest);
432 n_end=local_n*(th+1)+MIN(th+1,rest);
433 if (order==0) {
434 #pragma ivdep
435 for (z=n_start; z < n_end; ++z) {
436 R_PRES[0][z]= Factor* AP[z];
437 X_PRES[0][z]=-Factor*P_PRES[1][z];
438 }
439 } else if (order==1) {
440 #pragma ivdep
441 for (z=n_start; z < n_end; ++z) {
442 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
443 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
444 }
445 } else if (order==2) {
446 #pragma ivdep
447 for (z=n_start; z < n_end; ++z) {
448 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
449 +ALPHA[1]*R_PRES[2][z];
450 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
451 +ALPHA[1]*X_PRES[2][z];
452 }
453 } else if (order==3) {
454 #pragma ivdep
455 for (z=n_start; z < n_end; ++z) {
456 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
457 +ALPHA[1]*R_PRES[2][z]
458 +ALPHA[2]*R_PRES[3][z];
459 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
460 +ALPHA[1]*X_PRES[2][z]
461 +ALPHA[2]*X_PRES[3][z];
462 }
463 } else if (order==4) {
464 #pragma ivdep
465 for (z=n_start; z < n_end; ++z) {
466 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
467 +ALPHA[1]*R_PRES[2][z]
468 +ALPHA[2]*R_PRES[3][z]
469 +ALPHA[3]*R_PRES[4][z];
470 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
471 +ALPHA[1]*X_PRES[2][z]
472 +ALPHA[2]*X_PRES[3][z]
473 +ALPHA[3]*X_PRES[4][z];
474 }
475 } else if (order==5) {
476 #pragma ivdep
477 for (z=n_start; z < n_end; ++z) {
478 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
479 +ALPHA[1]*R_PRES[2][z]
480 +ALPHA[2]*R_PRES[3][z]
481 +ALPHA[3]*R_PRES[4][z]
482 +ALPHA[4]*R_PRES[5][z];
483 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
484 +ALPHA[1]*X_PRES[2][z]
485 +ALPHA[2]*X_PRES[3][z]
486 +ALPHA[3]*X_PRES[4][z]
487 +ALPHA[4]*X_PRES[5][z];
488 }
489 } else if (order==6) {
490 #pragma ivdep
491 for (z=n_start; z < n_end; ++z) {
492 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
493 +ALPHA[1]*R_PRES[2][z]
494 +ALPHA[2]*R_PRES[3][z]
495 +ALPHA[3]*R_PRES[4][z]
496 +ALPHA[4]*R_PRES[5][z]
497 +ALPHA[5]*R_PRES[6][z];
498 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
499 +ALPHA[1]*X_PRES[2][z]
500 +ALPHA[2]*X_PRES[3][z]
501 +ALPHA[3]*X_PRES[4][z]
502 +ALPHA[4]*X_PRES[5][z]
503 +ALPHA[5]*X_PRES[6][z];
504 }
505 } else {
506 #pragma ivdep
507 for (z=n_start; z < n_end; ++z) {
508 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
509 +ALPHA[1]*R_PRES[2][z]
510 +ALPHA[2]*R_PRES[3][z]
511 +ALPHA[3]*R_PRES[4][z]
512 +ALPHA[4]*R_PRES[5][z]
513 +ALPHA[5]*R_PRES[6][z]
514 +ALPHA[6]*R_PRES[7][z];
515 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
516 +ALPHA[1]*X_PRES[2][z]
517 +ALPHA[2]*X_PRES[3][z]
518 +ALPHA[3]*X_PRES[4][z]
519 +ALPHA[4]*X_PRES[5][z]
520 +ALPHA[5]*X_PRES[6][z]
521 +ALPHA[6]*X_PRES[7][z];
522 }
523 for (i=7;i<order;++i) {
524 #pragma ivdep
525 for (z=n_start; z < n_end; ++z) {
526 R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
527 X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
528 }
529 }
530 }
531 }
532 if (breakf0>0.) {
533 /***
534 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
535 ***/
536 memset(loc_dots,0,sizeof(double)*3);
537 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
538 for (th=0;th<num_threads;++th) {
539 local_n=n/num_threads;
540 rest=n-local_n*num_threads;
541 n_start=local_n*th+MIN(th,rest);
542 n_end=local_n*(th+1)+MIN(th+1,rest);
543 SC1=PASO_ZERO;
544 SC2=PASO_ZERO;
545 #pragma ivdep
546 for (z=n_start; z < n_end; ++z) {
547 diff=R_PRES[0][z]-r[z];
548 SC1+=diff*diff;
549 SC2+=diff*r[z];
550 }
551 #pragma omp critical
552 {
553 loc_dots[0]+=SC1;
554 loc_dots[1]+=SC2;
555 }
556 }
557 #ifdef ESYS_MPI
558 MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
559 SC1=dots[0];
560 SC2=dots[1];
561 #else
562 SC1=loc_dots[0];
563 SC2=loc_dots[1];
564 #endif
565 gamma=(SC1<=PASO_ZERO) ? PASO_ZERO : -SC2/SC1;
566 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
567 for (th=0;th<num_threads;++th) {
568 local_n=n/num_threads;
569 rest=n-local_n*num_threads;
570 n_start=local_n*th+MIN(th,rest);
571 n_end=local_n*(th+1)+MIN(th+1,rest);
572 L2_R=PASO_ZERO;
573 #pragma ivdep
574 for (z=n_start; z < n_end; ++z) {
575 x[z]+=gamma*(X_PRES[0][z]-x[z]);
576 r[z]+=gamma*(R_PRES[0][z]-r[z]);
577 L2_R+=r[z]*r[z];
578 }
579 #pragma omp critical
580 {
581 loc_dots[2]+=L2_R;
582 }
583 }
584 #ifdef ESYS_MPI
585 MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
586 L2_R=dots[2];
587 #else
588 L2_R=loc_dots[2];
589 #endif
590 norm_of_residual=sqrt(L2_R);
591 convergeFlag = (norm_of_residual <= tol);
592 if (restart>0) restartFlag=(num_iter_restart >= restart);
593 } else {
594 convergeFlag=FALSE;
595 restartFlag=FALSE;
596 }
597 maxIterFlag = (num_iter >= maxit);
598 }
599 }
600 /* end of iteration */
601 Norm_of_residual_global=norm_of_residual;
602 Num_iter_global=num_iter;
603 if (maxIterFlag) {
604 Status = SOLVER_MAXITER_REACHED;
605 } else if (breakFlag) {
606 Status = SOLVER_BREAKDOWN;
607 }
608 }
609 for (i=0;i<Length_of_recursion;i++) {
610 TMPMEMFREE(X_PRES[i]);
611 TMPMEMFREE(R_PRES[i]);
612 TMPMEMFREE(P_PRES[i]);
613 }
614 TMPMEMFREE(AP);
615 TMPMEMFREE(X_PRES);
616 TMPMEMFREE(R_PRES);
617 TMPMEMFREE(P_PRES);
618 TMPMEMFREE(P_PRES_dot_AP);
619 TMPMEMFREE(R_PRES_dot_P_PRES);
620 TMPMEMFREE(BREAKF);
621 TMPMEMFREE(ALPHA);
622 TMPMEMFREE(dots);
623 TMPMEMFREE(loc_dots);
624 *iter=Num_iter_global;
625 *tolerance=Norm_of_residual_global;
626 return Status;
627 }
628

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26