/[escript]/trunk/paso/src/GMRES.c
ViewVC logotype

Contents of /trunk/paso/src/GMRES.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2107 - (show annotations)
Fri Nov 28 04:39:07 2008 UTC (10 years, 10 months ago) by artak
File MIME type: text/plain
File size: 27269 byte(s)
Current version of AMG uses ILU for relaxation, however it is not stable when schur matrix becames more denser. For example it is not stable for -e paramenter is more 400 in 2D for convection problem 
1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2008 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /*
16 * Purpose
17 * =======
18 *
19 * GMRES solves the linear system A*x=b using the
20 * truncated and restered GMRES method with preconditioning.
21 *
22 * Convergence test: norm( b - A*x )< TOL.
23 *
24 *
25 *
26 * Arguments
27 * =========
28 *
29 * r (input/output) double array, dimension n.
30 * On entry, residual of inital guess X
31 *
32 * x (input/output) double array, dimension n.
33 * On input, the initial guess.
34 *
35 * iter (input/output) int
36 * On input, the maximum num_iterations to be performed.
37 * On output, actual number of num_iterations performed.
38 *
39 * tolerance (input/output) DOUBLE PRECISION
40 * On input, the allowable convergence measure for
41 * norm( b - A*x )
42 * On output, the final value of this measure.
43 *
44 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
45 *
46 * restart (input) If restart>0, iteration is resterted a after restart steps.
47 *
48 * INFO (output) int
49 *
50 * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
51 * =SOLVER_MAXNUM_ITER_REACHED
52 * =SOLVER_INPUT_ERROR Illegal parameter:
53 * =SOLVER_BREAKDOWN: bad luck!
54 * =SOLVER_MEMORY_ERROR : no memory available
55 *
56 * ==============================================================
57 */
58
59 #include "Common.h"
60 #include "SystemMatrix.h"
61 #include "Solver.h"
62 #ifdef _OPENMP
63 #include <omp.h>
64 #endif
65
66 err_t Paso_Solver_GMRES(
67 Paso_SystemMatrix * A,
68 double * r,
69 double * x,
70 dim_t *iter,
71 double * tolerance,dim_t Length_of_recursion,dim_t restart,
72 Paso_Performance* pp) {
73
74 /* Local variables */
75
76 #ifdef _OPENMP
77 const int num_threads=omp_get_max_threads();
78 #else
79 const int num_threads=1;
80 #endif
81 double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
82 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
83 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
84 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual=0,diff,L2_R,Norm_of_residual_global=0;
85 double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
86 dim_t maxit,Num_iter_global=0,num_iter_restart=0,num_iter;
87 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
88 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
89 err_t Status=SOLVER_NO_ERROR;
90
91
92 /* adapt original routine parameters */
93 n = Paso_SystemMatrix_getTotalNumRows(A);
94 Length_of_mem=MAX(Length_of_recursion,0)+1;
95
96 /* Test the input parameters. */
97 if (restart>0) restart=MAX(Length_of_recursion,restart);
98 if (n < 0 || Length_of_recursion<=0) {
99 return SOLVER_INPUT_ERROR;
100 }
101
102 /* allocate memory: */
103 X_PRES=TMPMEMALLOC(Length_of_mem,double*);
104 R_PRES=TMPMEMALLOC(Length_of_mem,double*);
105 P_PRES=TMPMEMALLOC(Length_of_mem,double*);
106 loc_dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
107 dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
108 P_PRES_dot_AP=TMPMEMALLOC(Length_of_mem,double);
109 R_PRES_dot_P_PRES=TMPMEMALLOC(Length_of_mem,double);
110 BREAKF=TMPMEMALLOC(Length_of_mem,double);
111 ALPHA=TMPMEMALLOC(Length_of_mem,double);
112 AP=TMPMEMALLOC(n,double);
113 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
114 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
115 Status=SOLVER_MEMORY_ERROR;
116 } else {
117 for (i=0;i<Length_of_mem;i++) {
118 X_PRES[i]=TMPMEMALLOC(n,double);
119 R_PRES[i]=TMPMEMALLOC(n,double);
120 P_PRES[i]=TMPMEMALLOC(n,double);
121 if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
122 }
123 }
124 if ( Status ==SOLVER_NO_ERROR ) {
125
126 /* now PRES starts : */
127 maxit=*iter;
128 tol=*tolerance;
129
130 /* initialization */
131
132 restartFlag=TRUE;
133 num_iter=0;
134
135 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
136 for (th=0;th<num_threads;++th) {
137 local_n=n/num_threads;
138 rest=n-local_n*num_threads;
139 n_start=local_n*th+MIN(th,rest);
140 n_end=local_n*(th+1)+MIN(th+1,rest);
141 memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
142 for(i=0;i<Length_of_mem;++i) {
143 memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
144 memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
145 memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146 }
147 }
148
149 while (! (convergeFlag || maxIterFlag || breakFlag)) {
150 if (restartFlag) {
151 BREAKF[0]=PASO_ONE;
152 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
153 for (th=0;th<num_threads;++th) {
154 local_n=n/num_threads;
155 rest=n-local_n*num_threads;
156 n_start=local_n*th+MIN(th,rest);
157 n_end=local_n*(th+1)+MIN(th+1,rest);
158 memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
159 memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
160 }
161 num_iter_restart=0;
162 restartFlag=FALSE;
163 }
164 ++num_iter;
165 ++num_iter_restart;
166 /* order is the dimension of the space on which the residual is minimized: */
167 order=MIN(num_iter_restart,Length_of_recursion);
168 /***
169 *** calculate new search direction P from R_PRES
170 ***/
171 Paso_Solver_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
172 /***
173 *** apply A to P to get AP
174 ***/
175 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, &P_PRES[0][0],PASO_ZERO, &AP[0]);
176 /***
177 ***** calculation of the norm of R and the scalar products of
178 *** the residuals and A*P:
179 ***/
180 memset(loc_dots,0,sizeof(double)*(order+1));
181 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
182 for (th=0;th<num_threads;++th) {
183 local_n=n/num_threads;
184 rest=n-local_n*num_threads;
185 n_start=local_n*th+MIN(th,rest);
186 n_end=local_n*(th+1)+MIN(th+1,rest);
187 if (order==0) {
188 R_PRES_dot_P=PASO_ZERO;
189 #pragma ivdep
190 for (z=n_start; z < n_end; ++z) {
191 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
192 }
193 #pragma omp critical
194 {
195 loc_dots[0]+=R_PRES_dot_P;
196 }
197 } else if (order==1) {
198 R_PRES_dot_P=PASO_ZERO;
199 P_PRES_dot_AP0=PASO_ZERO;
200 #pragma ivdep
201 for (z=n_start; z < n_end; ++z) {
202 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
203 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
204 }
205 #pragma omp critical
206 {
207 loc_dots[0]+=R_PRES_dot_P;
208 loc_dots[1]+=P_PRES_dot_AP0;
209 }
210 } else if (order==2) {
211 R_PRES_dot_P=PASO_ZERO;
212 P_PRES_dot_AP0=PASO_ZERO;
213 P_PRES_dot_AP1=PASO_ZERO;
214 #pragma ivdep
215 for (z=n_start; z < n_end; ++z) {
216 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
217 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
218 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
219 }
220 #pragma omp critical
221 {
222 loc_dots[0]+=R_PRES_dot_P;
223 loc_dots[1]+=P_PRES_dot_AP0;
224 loc_dots[2]+=P_PRES_dot_AP1;
225 }
226 } else if (order==3) {
227 R_PRES_dot_P=PASO_ZERO;
228 P_PRES_dot_AP0=PASO_ZERO;
229 P_PRES_dot_AP1=PASO_ZERO;
230 P_PRES_dot_AP2=PASO_ZERO;
231 #pragma ivdep
232 for (z=n_start; z < n_end; ++z) {
233 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
234 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
235 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
236 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
237 }
238 #pragma omp critical
239 {
240 loc_dots[0]+=R_PRES_dot_P;
241 loc_dots[1]+=P_PRES_dot_AP0;
242 loc_dots[2]+=P_PRES_dot_AP1;
243 loc_dots[3]+=P_PRES_dot_AP2;
244 }
245 } else if (order==4) {
246 R_PRES_dot_P=PASO_ZERO;
247 P_PRES_dot_AP0=PASO_ZERO;
248 P_PRES_dot_AP1=PASO_ZERO;
249 P_PRES_dot_AP2=PASO_ZERO;
250 P_PRES_dot_AP3=PASO_ZERO;
251 #pragma ivdep
252 for (z=n_start; z < n_end; ++z) {
253 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
254 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
255 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
256 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
257 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
258 }
259 #pragma omp critical
260 {
261 loc_dots[0]+=R_PRES_dot_P;
262 loc_dots[1]+=P_PRES_dot_AP0;
263 loc_dots[2]+=P_PRES_dot_AP1;
264 loc_dots[3]+=P_PRES_dot_AP2;
265 loc_dots[4]+=P_PRES_dot_AP3;
266 }
267 } else if (order==5) {
268 R_PRES_dot_P=PASO_ZERO;
269 P_PRES_dot_AP0=PASO_ZERO;
270 P_PRES_dot_AP1=PASO_ZERO;
271 P_PRES_dot_AP2=PASO_ZERO;
272 P_PRES_dot_AP3=PASO_ZERO;
273 P_PRES_dot_AP4=PASO_ZERO;
274 #pragma ivdep
275 for (z=n_start; z < n_end; ++z) {
276 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
277 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
278 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
279 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
280 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
281 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
282 }
283 #pragma omp critical
284 {
285 loc_dots[0]+=R_PRES_dot_P;
286 loc_dots[1]+=P_PRES_dot_AP0;
287 loc_dots[2]+=P_PRES_dot_AP1;
288 loc_dots[3]+=P_PRES_dot_AP2;
289 loc_dots[4]+=P_PRES_dot_AP3;
290 loc_dots[5]+=P_PRES_dot_AP4;
291 }
292 } else if (order==6) {
293 R_PRES_dot_P=PASO_ZERO;
294 P_PRES_dot_AP0=PASO_ZERO;
295 P_PRES_dot_AP1=PASO_ZERO;
296 P_PRES_dot_AP2=PASO_ZERO;
297 P_PRES_dot_AP3=PASO_ZERO;
298 P_PRES_dot_AP4=PASO_ZERO;
299 P_PRES_dot_AP5=PASO_ZERO;
300 #pragma ivdep
301 for (z=n_start; z < n_end; ++z) {
302 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
303 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
304 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
305 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
306 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
307 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
308 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
309 }
310 #pragma omp critical
311 {
312 loc_dots[0]+=R_PRES_dot_P;
313 loc_dots[1]+=P_PRES_dot_AP0;
314 loc_dots[2]+=P_PRES_dot_AP1;
315 loc_dots[3]+=P_PRES_dot_AP2;
316 loc_dots[4]+=P_PRES_dot_AP3;
317 loc_dots[5]+=P_PRES_dot_AP4;
318 loc_dots[6]+=P_PRES_dot_AP5;
319 }
320 } else {
321 R_PRES_dot_P=PASO_ZERO;
322 P_PRES_dot_AP0=PASO_ZERO;
323 P_PRES_dot_AP1=PASO_ZERO;
324 P_PRES_dot_AP2=PASO_ZERO;
325 P_PRES_dot_AP3=PASO_ZERO;
326 P_PRES_dot_AP4=PASO_ZERO;
327 P_PRES_dot_AP5=PASO_ZERO;
328 P_PRES_dot_AP6=PASO_ZERO;
329 #pragma ivdep
330 for (z=n_start; z < n_end; ++z) {
331 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
332 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
333 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
334 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
335 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
336 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
337 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
338 P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
339 }
340 #pragma omp critical
341 {
342 loc_dots[0]+=R_PRES_dot_P;
343 loc_dots[1]+=P_PRES_dot_AP0;
344 loc_dots[2]+=P_PRES_dot_AP1;
345 loc_dots[3]+=P_PRES_dot_AP2;
346 loc_dots[4]+=P_PRES_dot_AP3;
347 loc_dots[5]+=P_PRES_dot_AP4;
348 loc_dots[6]+=P_PRES_dot_AP5;
349 loc_dots[7]+=P_PRES_dot_AP6;
350 }
351 for (i=7;i<order;++i) {
352 P_PRES_dot_AP0=PASO_ZERO;
353 #pragma ivdep
354 for (z=n_start; z < n_end; ++z) {
355 P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
356 }
357 #pragma omp critical
358 {
359 loc_dots[i+1]+=P_PRES_dot_AP0;
360 }
361 }
362 }
363 }
364 #ifdef PASO_MPI
365 MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
366 R_PRES_dot_P_PRES[0]=dots[0];
367 memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
368 #else
369 R_PRES_dot_P_PRES[0]=loc_dots[0];
370 memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
371 #endif
372 R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
373
374 /*** if sum_BREAKF is equal to zero a breakdown occurs.
375 *** iteration procedure can be continued but R_PRES is not the
376 *** residual of X_PRES approximation.
377 ***/
378 sum_BREAKF=0.;
379 #pragma ivdep
380 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
381 breakFlag=!((ABS(R_PRES_dot_AP0) > PASO_ZERO) && (sum_BREAKF >PASO_ZERO));
382 if (!breakFlag) {
383 breakFlag=FALSE;
384 /***
385 ***** X_PRES and R_PRES are moved to memory:
386 ***/
387 Factor=0.;
388 #pragma ivdep
389 for (i=0;i<order;++i) {
390 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
391 Factor+=BREAKF[i]*ALPHA[i];
392 }
393
394 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
395 save_R_PRES=R_PRES[Length_of_mem-1];
396 save_XPRES=X_PRES[Length_of_mem-1];
397 save_P_PRES=P_PRES[Length_of_mem-1];
398 #pragma ivdep
399 for (i=Length_of_mem-1;i>0;--i) {
400 BREAKF[i]=BREAKF[i-1];
401 R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
402 R_PRES[i]=R_PRES[i-1];
403 X_PRES[i]=X_PRES[i-1];
404 P_PRES[i]=P_PRES[i-1];
405 }
406 R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
407 R_PRES[0]=save_R_PRES;
408 X_PRES[0]=save_XPRES;
409 P_PRES[0]=save_P_PRES;
410
411 if (ABS(Factor)<=PASO_ZERO) {
412 Factor=1.;
413 BREAKF[0]=PASO_ZERO;
414 } else {
415 Factor=1./Factor;
416 BREAKF[0]=PASO_ONE;
417 }
418 for (i=0;i<order;++i) ALPHA[i]*=Factor;
419 /*
420 ***** update of solution X_PRES and its residual R_PRES:
421 ***
422 *** P is used to accumulate X and AP to accumulate R. X and R
423 *** are still needed until they are put into the X and R memory
424 *** R_PRES and X_PRES
425 ***
426 **/
427 breakf0=BREAKF[0];
428 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
429 for (th=0;th<num_threads;++th) {
430 local_n=n/num_threads;
431 rest=n-local_n*num_threads;
432 n_start=local_n*th+MIN(th,rest);
433 n_end=local_n*(th+1)+MIN(th+1,rest);
434 if (order==0) {
435 #pragma ivdep
436 for (z=n_start; z < n_end; ++z) {
437 R_PRES[0][z]= Factor* AP[z];
438 X_PRES[0][z]=-Factor*P_PRES[1][z];
439 }
440 } else if (order==1) {
441 #pragma ivdep
442 for (z=n_start; z < n_end; ++z) {
443 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
444 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
445 }
446 } else if (order==2) {
447 #pragma ivdep
448 for (z=n_start; z < n_end; ++z) {
449 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
450 +ALPHA[1]*R_PRES[2][z];
451 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
452 +ALPHA[1]*X_PRES[2][z];
453 }
454 } else if (order==3) {
455 #pragma ivdep
456 for (z=n_start; z < n_end; ++z) {
457 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
458 +ALPHA[1]*R_PRES[2][z]
459 +ALPHA[2]*R_PRES[3][z];
460 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
461 +ALPHA[1]*X_PRES[2][z]
462 +ALPHA[2]*X_PRES[3][z];
463 }
464 } else if (order==4) {
465 #pragma ivdep
466 for (z=n_start; z < n_end; ++z) {
467 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
468 +ALPHA[1]*R_PRES[2][z]
469 +ALPHA[2]*R_PRES[3][z]
470 +ALPHA[3]*R_PRES[4][z];
471 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
472 +ALPHA[1]*X_PRES[2][z]
473 +ALPHA[2]*X_PRES[3][z]
474 +ALPHA[3]*X_PRES[4][z];
475 }
476 } else if (order==5) {
477 #pragma ivdep
478 for (z=n_start; z < n_end; ++z) {
479 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
480 +ALPHA[1]*R_PRES[2][z]
481 +ALPHA[2]*R_PRES[3][z]
482 +ALPHA[3]*R_PRES[4][z]
483 +ALPHA[4]*R_PRES[5][z];
484 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
485 +ALPHA[1]*X_PRES[2][z]
486 +ALPHA[2]*X_PRES[3][z]
487 +ALPHA[3]*X_PRES[4][z]
488 +ALPHA[4]*X_PRES[5][z];
489 }
490 } else if (order==6) {
491 #pragma ivdep
492 for (z=n_start; z < n_end; ++z) {
493 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
494 +ALPHA[1]*R_PRES[2][z]
495 +ALPHA[2]*R_PRES[3][z]
496 +ALPHA[3]*R_PRES[4][z]
497 +ALPHA[4]*R_PRES[5][z]
498 +ALPHA[5]*R_PRES[6][z];
499 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
500 +ALPHA[1]*X_PRES[2][z]
501 +ALPHA[2]*X_PRES[3][z]
502 +ALPHA[3]*X_PRES[4][z]
503 +ALPHA[4]*X_PRES[5][z]
504 +ALPHA[5]*X_PRES[6][z];
505 }
506 } else {
507 #pragma ivdep
508 for (z=n_start; z < n_end; ++z) {
509 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
510 +ALPHA[1]*R_PRES[2][z]
511 +ALPHA[2]*R_PRES[3][z]
512 +ALPHA[3]*R_PRES[4][z]
513 +ALPHA[4]*R_PRES[5][z]
514 +ALPHA[5]*R_PRES[6][z]
515 +ALPHA[6]*R_PRES[7][z];
516 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
517 +ALPHA[1]*X_PRES[2][z]
518 +ALPHA[2]*X_PRES[3][z]
519 +ALPHA[3]*X_PRES[4][z]
520 +ALPHA[4]*X_PRES[5][z]
521 +ALPHA[5]*X_PRES[6][z]
522 +ALPHA[6]*X_PRES[7][z];
523 }
524 for (i=7;i<order;++i) {
525 #pragma ivdep
526 for (z=n_start; z < n_end; ++z) {
527 R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
528 X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
529 }
530 }
531 }
532 }
533 if (breakf0>0.) {
534 /***
535 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
536 ***/
537 memset(loc_dots,0,sizeof(double)*3);
538 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
539 for (th=0;th<num_threads;++th) {
540 local_n=n/num_threads;
541 rest=n-local_n*num_threads;
542 n_start=local_n*th+MIN(th,rest);
543 n_end=local_n*(th+1)+MIN(th+1,rest);
544 SC1=PASO_ZERO;
545 SC2=PASO_ZERO;
546 #pragma ivdep
547 for (z=n_start; z < n_end; ++z) {
548 diff=R_PRES[0][z]-r[z];
549 SC1+=diff*diff;
550 SC2+=diff*r[z];
551 }
552 #pragma omp critical
553 {
554 loc_dots[0]+=SC1;
555 loc_dots[1]+=SC2;
556 }
557 }
558 #ifdef PASO_MPI
559 MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
560 SC1=dots[0];
561 SC2=dots[1];
562 #else
563 SC1=loc_dots[0];
564 SC2=loc_dots[1];
565 #endif
566 gamma=(SC1<=PASO_ZERO) ? PASO_ZERO : -SC2/SC1;
567 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
568 for (th=0;th<num_threads;++th) {
569 local_n=n/num_threads;
570 rest=n-local_n*num_threads;
571 n_start=local_n*th+MIN(th,rest);
572 n_end=local_n*(th+1)+MIN(th+1,rest);
573 L2_R=PASO_ZERO;
574 #pragma ivdep
575 for (z=n_start; z < n_end; ++z) {
576 x[z]+=gamma*(X_PRES[0][z]-x[z]);
577 r[z]+=gamma*(R_PRES[0][z]-r[z]);
578 L2_R+=r[z]*r[z];
579 }
580 #pragma omp critical
581 {
582 loc_dots[2]+=L2_R;
583 }
584 }
585 #ifdef PASO_MPI
586 MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
587 L2_R=dots[2];
588 #else
589 L2_R=loc_dots[2];
590 #endif
591 norm_of_residual=sqrt(L2_R);
592 convergeFlag = (norm_of_residual <= tol);
593 if (restart>0) restartFlag=(num_iter_restart >= restart);
594 } else {
595 convergeFlag=FALSE;
596 restartFlag=FALSE;
597 }
598 maxIterFlag = (num_iter >= maxit);
599 }
600 }
601 /* end of iteration */
602 Norm_of_residual_global=norm_of_residual;
603 Num_iter_global=num_iter;
604 if (maxIterFlag) {
605 Status = SOLVER_MAXITER_REACHED;
606 } else if (breakFlag) {
607 Status = SOLVER_BREAKDOWN;
608 }
609 }
610 for (i=0;i<Length_of_recursion;i++) {
611 TMPMEMFREE(X_PRES[i]);
612 TMPMEMFREE(R_PRES[i]);
613 TMPMEMFREE(P_PRES[i]);
614 }
615 TMPMEMFREE(AP);
616 TMPMEMFREE(X_PRES);
617 TMPMEMFREE(R_PRES);
618 TMPMEMFREE(P_PRES);
619 TMPMEMFREE(P_PRES_dot_AP);
620 TMPMEMFREE(R_PRES_dot_P_PRES);
621 TMPMEMFREE(BREAKF);
622 TMPMEMFREE(ALPHA);
623 TMPMEMFREE(dots);
624 TMPMEMFREE(loc_dots);
625 *iter=Num_iter_global;
626 *tolerance=Norm_of_residual_global;
627 return Status;
628 }

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26