/[escript]/trunk/paso/src/GMRES.c
ViewVC logotype

Contents of /trunk/paso/src/GMRES.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1759 - (show annotations)
Mon Sep 8 02:31:22 2008 UTC (10 years, 7 months ago) by gross
File MIME type: text/plain
File size: 27046 byte(s)
MPI version plus some openmp optimization
1
2 /* $Id$ */
3
4 /*******************************************************
5 *
6 * Copyright 2003-2007 by ACceSS MNRF
7 * Copyright 2007 by University of Queensland
8 *
9 * http://esscc.uq.edu.au
10 * Primary Business: Queensland, Australia
11 * Licensed under the Open Software License version 3.0
12 * http://www.opensource.org/licenses/osl-3.0.php
13 *
14 *******************************************************/
15
16 /*
17 * Purpose
18 * =======
19 *
20 * GMRES solves the linear system A*x=b using the
21 * truncated and restered GMRES method with preconditioning.
22 *
23 * Convergence test: norm( b - A*x )< TOL.
24 *
25 *
26 *
27 * Arguments
28 * =========
29 *
30 * r (input/output) double array, dimension n.
31 * On entry, residual of inital guess X
32 *
33 * x (input/output) double array, dimension n.
34 * On input, the initial guess.
35 *
36 * iter (input/output) int
37 * On input, the maximum num_iterations to be performed.
38 * On output, actual number of num_iterations performed.
39 *
40 * tolerance (input/output) DOUBLE PRECISION
41 * On input, the allowable convergence measure for
42 * norm( b - A*x )
43 * On output, the final value of this measure.
44 *
45 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
46 *
47 * restart (input) If restart>0, iteration is resterted a after restart steps.
48 *
49 * INFO (output) int
50 *
51 * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
52 * =SOLVER_MAXNUM_ITER_REACHED
53 * =SOLVER_INPUT_ERROR Illegal parameter:
54 * =SOLVER_BREAKDOWN: bad luck!
55 * =SOLVER_MEMORY_ERROR : no memory available
56 *
57 * ==============================================================
58 */
59
60 #include "Common.h"
61 #include "SystemMatrix.h"
62 #include "Solver.h"
63 #ifdef _OPENMP
64 #include <omp.h>
65 #endif
66
67 err_t Paso_Solver_GMRES(
68 Paso_SystemMatrix * A,
69 double * r,
70 double * x,
71 dim_t *iter,
72 double * tolerance,dim_t Length_of_recursion,dim_t restart,
73 Paso_Performance* pp) {
74
75 /* Local variables */
76
77 #ifdef _OPENMP
78 const int num_threads=omp_get_max_threads();
79 #else
80 const int num_threads=1;
81 #endif
82 double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
83 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
84 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
85 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual,diff,L2_R,Norm_of_residual_global;
86 double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
87 dim_t maxit,Num_iter_global,num_iter_restart,num_iter;
88 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
89 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
90 err_t Status=SOLVER_NO_ERROR;
91
92
93 /* adapt original routine parameters */
94 n = Paso_SystemMatrix_getTotalNumRows(A);
95 Length_of_mem=MAX(Length_of_recursion,0)+1;
96
97 /* Test the input parameters. */
98 if (restart>0) restart=MAX(Length_of_recursion,restart);
99 if (n < 0 || Length_of_recursion<=0) {
100 return SOLVER_INPUT_ERROR;
101 }
102
103 /* allocate memory: */
104 X_PRES=TMPMEMALLOC(Length_of_mem,double*);
105 R_PRES=TMPMEMALLOC(Length_of_mem,double*);
106 P_PRES=TMPMEMALLOC(Length_of_mem,double*);
107 loc_dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
108 dots=TMPMEMALLOC(MAX(Length_of_mem+1,3),double);
109 P_PRES_dot_AP=TMPMEMALLOC(Length_of_mem,double);
110 R_PRES_dot_P_PRES=TMPMEMALLOC(Length_of_mem,double);
111 BREAKF=TMPMEMALLOC(Length_of_mem,double);
112 ALPHA=TMPMEMALLOC(Length_of_mem,double);
113 AP=TMPMEMALLOC(n,double);
114 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
115 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
116 Status=SOLVER_MEMORY_ERROR;
117 } else {
118 for (i=0;i<Length_of_mem;i++) {
119 X_PRES[i]=TMPMEMALLOC(n,double);
120 R_PRES[i]=TMPMEMALLOC(n,double);
121 P_PRES[i]=TMPMEMALLOC(n,double);
122 if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
123 }
124 }
125 if ( Status ==SOLVER_NO_ERROR ) {
126
127 /* now PRES starts : */
128 maxit=*iter;
129 tol=*tolerance;
130
131 /* initialization */
132
133 restartFlag=TRUE;
134 num_iter=0;
135
136 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
137 for (th=0;th<num_threads;++th) {
138 local_n=n/num_threads;
139 rest=n-local_n*num_threads;
140 n_start=local_n*th+MIN(th,rest);
141 n_end=local_n*(th+1)+MIN(th+1,rest);
142 memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
143 for(i=0;i<Length_of_mem;++i) {
144 memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
145 memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146 memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
147 }
148 }
149
150 while (! (convergeFlag || maxIterFlag || breakFlag)) {
151 if (restartFlag) {
152 BREAKF[0]=ONE;
153 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
154 for (th=0;th<num_threads;++th) {
155 local_n=n/num_threads;
156 rest=n-local_n*num_threads;
157 n_start=local_n*th+MIN(th,rest);
158 n_end=local_n*(th+1)+MIN(th+1,rest);
159 memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
160 memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
161 }
162 num_iter_restart=0;
163 restartFlag=FALSE;
164 }
165 ++num_iter;
166 ++num_iter_restart;
167 /* order is the dimension of the space on which the residual is minimized: */
168 order=MIN(num_iter_restart,Length_of_recursion);
169 /***
170 *** calculate new search direction P from R_PRES
171 ***/
172 Paso_Solver_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
173 /***
174 *** apply A to P to get AP
175 ***/
176 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, &P_PRES[0][0],ZERO, &AP[0]);
177 /***
178 ***** calculation of the norm of R and the scalar products of
179 *** the residuals and A*P:
180 ***/
181 memset(loc_dots,0,sizeof(double)*(order+1));
182 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
183 for (th=0;th<num_threads;++th) {
184 local_n=n/num_threads;
185 rest=n-local_n*num_threads;
186 n_start=local_n*th+MIN(th,rest);
187 n_end=local_n*(th+1)+MIN(th+1,rest);
188 if (order==0) {
189 R_PRES_dot_P=ZERO;
190 #pragma ivdep
191 for (z=n_start; z < n_end; ++z) {
192 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
193 }
194 #pragma omp critical
195 {
196 loc_dots[0]+=R_PRES_dot_P;
197 }
198 } else if (order==1) {
199 R_PRES_dot_P=ZERO;
200 P_PRES_dot_AP0=ZERO;
201 #pragma ivdep
202 for (z=n_start; z < n_end; ++z) {
203 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
204 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
205 }
206 #pragma omp critical
207 {
208 loc_dots[0]+=R_PRES_dot_P;
209 loc_dots[1]+=P_PRES_dot_AP0;
210 }
211 } else if (order==2) {
212 R_PRES_dot_P=ZERO;
213 P_PRES_dot_AP0=ZERO;
214 P_PRES_dot_AP1=ZERO;
215 #pragma ivdep
216 for (z=n_start; z < n_end; ++z) {
217 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
218 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
219 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
220 }
221 #pragma omp critical
222 {
223 loc_dots[0]+=R_PRES_dot_P;
224 loc_dots[1]+=P_PRES_dot_AP0;
225 loc_dots[2]+=P_PRES_dot_AP1;
226 }
227 } else if (order==3) {
228 R_PRES_dot_P=ZERO;
229 P_PRES_dot_AP0=ZERO;
230 P_PRES_dot_AP1=ZERO;
231 P_PRES_dot_AP2=ZERO;
232 #pragma ivdep
233 for (z=n_start; z < n_end; ++z) {
234 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
235 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
236 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
237 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
238 }
239 #pragma omp critical
240 {
241 loc_dots[0]+=R_PRES_dot_P;
242 loc_dots[1]+=P_PRES_dot_AP0;
243 loc_dots[2]+=P_PRES_dot_AP1;
244 loc_dots[3]+=P_PRES_dot_AP2;
245 }
246 } else if (order==4) {
247 R_PRES_dot_P=ZERO;
248 P_PRES_dot_AP0=ZERO;
249 P_PRES_dot_AP1=ZERO;
250 P_PRES_dot_AP2=ZERO;
251 P_PRES_dot_AP3=ZERO;
252 #pragma ivdep
253 for (z=n_start; z < n_end; ++z) {
254 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
255 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
256 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
257 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
258 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
259 }
260 #pragma omp critical
261 {
262 loc_dots[0]+=R_PRES_dot_P;
263 loc_dots[1]+=P_PRES_dot_AP0;
264 loc_dots[2]+=P_PRES_dot_AP1;
265 loc_dots[3]+=P_PRES_dot_AP2;
266 loc_dots[4]+=P_PRES_dot_AP3;
267 }
268 } else if (order==5) {
269 R_PRES_dot_P=ZERO;
270 P_PRES_dot_AP0=ZERO;
271 P_PRES_dot_AP1=ZERO;
272 P_PRES_dot_AP2=ZERO;
273 P_PRES_dot_AP3=ZERO;
274 P_PRES_dot_AP4=ZERO;
275 #pragma ivdep
276 for (z=n_start; z < n_end; ++z) {
277 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
278 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
279 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
280 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
281 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
282 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
283 }
284 #pragma omp critical
285 {
286 loc_dots[0]+=R_PRES_dot_P;
287 loc_dots[1]+=P_PRES_dot_AP0;
288 loc_dots[2]+=P_PRES_dot_AP1;
289 loc_dots[3]+=P_PRES_dot_AP2;
290 loc_dots[4]+=P_PRES_dot_AP3;
291 loc_dots[5]+=P_PRES_dot_AP4;
292 }
293 } else if (order==6) {
294 R_PRES_dot_P=ZERO;
295 P_PRES_dot_AP0=ZERO;
296 P_PRES_dot_AP1=ZERO;
297 P_PRES_dot_AP2=ZERO;
298 P_PRES_dot_AP3=ZERO;
299 P_PRES_dot_AP4=ZERO;
300 P_PRES_dot_AP5=ZERO;
301 #pragma ivdep
302 for (z=n_start; z < n_end; ++z) {
303 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
304 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
305 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
306 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
307 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
308 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
309 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
310 }
311 #pragma omp critical
312 {
313 loc_dots[0]+=R_PRES_dot_P;
314 loc_dots[1]+=P_PRES_dot_AP0;
315 loc_dots[2]+=P_PRES_dot_AP1;
316 loc_dots[3]+=P_PRES_dot_AP2;
317 loc_dots[4]+=P_PRES_dot_AP3;
318 loc_dots[5]+=P_PRES_dot_AP4;
319 loc_dots[6]+=P_PRES_dot_AP5;
320 }
321 } else {
322 R_PRES_dot_P=ZERO;
323 P_PRES_dot_AP0=ZERO;
324 P_PRES_dot_AP1=ZERO;
325 P_PRES_dot_AP2=ZERO;
326 P_PRES_dot_AP3=ZERO;
327 P_PRES_dot_AP4=ZERO;
328 P_PRES_dot_AP5=ZERO;
329 P_PRES_dot_AP6=ZERO;
330 #pragma ivdep
331 for (z=n_start; z < n_end; ++z) {
332 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
333 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
334 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
335 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
336 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
337 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
338 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
339 P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
340 }
341 #pragma omp critical
342 {
343 loc_dots[0]+=R_PRES_dot_P;
344 loc_dots[1]+=P_PRES_dot_AP0;
345 loc_dots[2]+=P_PRES_dot_AP1;
346 loc_dots[3]+=P_PRES_dot_AP2;
347 loc_dots[4]+=P_PRES_dot_AP3;
348 loc_dots[5]+=P_PRES_dot_AP4;
349 loc_dots[6]+=P_PRES_dot_AP5;
350 loc_dots[7]+=P_PRES_dot_AP6;
351 }
352 for (i=7;i<order;++i) {
353 P_PRES_dot_AP0=ZERO;
354 #pragma ivdep
355 for (z=n_start; z < n_end; ++z) {
356 P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
357 }
358 #pragma omp critical
359 {
360 loc_dots[i+1]+=P_PRES_dot_AP0;
361 }
362 }
363 }
364 }
365 #ifdef PASO_MPI
366 MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
367 R_PRES_dot_P_PRES[0]=dots[0];
368 memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
369 #else
370 R_PRES_dot_P_PRES[0]=loc_dots[0];
371 memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
372 #endif
373 R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
374 /*** if sum_BREAKF is equal to zero a breakdown occurs.
375 *** iteration procedure can be continued but R_PRES is not the
376 *** residual of X_PRES approximation.
377 ***/
378 sum_BREAKF=0.;
379 #pragma ivdep
380 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
381 breakFlag=!((ABS(R_PRES_dot_AP0) > ZERO) && (sum_BREAKF >ZERO));
382 if (!breakFlag) {
383 breakFlag=FALSE;
384 /***
385 ***** X_PRES and R_PRES are moved to memory:
386 ***/
387 Factor=0.;
388 #pragma ivdep
389 for (i=0;i<order;++i) {
390 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
391 Factor+=BREAKF[i]*ALPHA[i];
392 }
393
394 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
395 save_R_PRES=R_PRES[Length_of_mem-1];
396 save_XPRES=X_PRES[Length_of_mem-1];
397 save_P_PRES=P_PRES[Length_of_mem-1];
398 #pragma ivdep
399 for (i=Length_of_mem-1;i>0;--i) {
400 BREAKF[i]=BREAKF[i-1];
401 R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
402 R_PRES[i]=R_PRES[i-1];
403 X_PRES[i]=X_PRES[i-1];
404 P_PRES[i]=P_PRES[i-1];
405 }
406 R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
407 R_PRES[0]=save_R_PRES;
408 X_PRES[0]=save_XPRES;
409 P_PRES[0]=save_P_PRES;
410
411 if (ABS(Factor)<=ZERO) {
412 Factor=1.;
413 BREAKF[0]=ZERO;
414 } else {
415 Factor=1./Factor;
416 BREAKF[0]=ONE;
417 }
418 for (i=0;i<order;++i) ALPHA[i]*=Factor;
419 /*
420 ***** update of solution X_PRES and its residual R_PRES:
421 ***
422 *** P is used to accumulate X and AP to accumulate R. X and R
423 *** are still needed until they are put into the X and R memory
424 *** R_PRES and X_PRES
425 ***
426 **/
427 breakf0=BREAKF[0];
428 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
429 for (th=0;th<num_threads;++th) {
430 local_n=n/num_threads;
431 rest=n-local_n*num_threads;
432 n_start=local_n*th+MIN(th,rest);
433 n_end=local_n*(th+1)+MIN(th+1,rest);
434 if (order==0) {
435 #pragma ivdep
436 for (z=n_start; z < n_end; ++z) {
437 R_PRES[0][z]= Factor* AP[z];
438 X_PRES[0][z]=-Factor*P_PRES[1][z];
439 }
440 } else if (order==1) {
441 #pragma ivdep
442 for (z=n_start; z < n_end; ++z) {
443 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
444 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
445 }
446 } else if (order==2) {
447 #pragma ivdep
448 for (z=n_start; z < n_end; ++z) {
449 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
450 +ALPHA[1]*R_PRES[2][z];
451 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
452 +ALPHA[1]*X_PRES[2][z];
453 }
454 } else if (order==3) {
455 #pragma ivdep
456 for (z=n_start; z < n_end; ++z) {
457 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
458 +ALPHA[1]*R_PRES[2][z]
459 +ALPHA[2]*R_PRES[3][z];
460 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
461 +ALPHA[1]*X_PRES[2][z]
462 +ALPHA[2]*X_PRES[3][z];
463 }
464 } else if (order==4) {
465 #pragma ivdep
466 for (z=n_start; z < n_end; ++z) {
467 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
468 +ALPHA[1]*R_PRES[2][z]
469 +ALPHA[2]*R_PRES[3][z]
470 +ALPHA[3]*R_PRES[4][z];
471 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
472 +ALPHA[1]*X_PRES[2][z]
473 +ALPHA[2]*X_PRES[3][z]
474 +ALPHA[3]*X_PRES[4][z];
475 }
476 } else if (order==5) {
477 #pragma ivdep
478 for (z=n_start; z < n_end; ++z) {
479 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
480 +ALPHA[1]*R_PRES[2][z]
481 +ALPHA[2]*R_PRES[3][z]
482 +ALPHA[3]*R_PRES[4][z]
483 +ALPHA[4]*R_PRES[5][z];
484 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
485 +ALPHA[1]*X_PRES[2][z]
486 +ALPHA[2]*X_PRES[3][z]
487 +ALPHA[3]*X_PRES[4][z]
488 +ALPHA[4]*X_PRES[5][z];
489 }
490 } else if (order==6) {
491 #pragma ivdep
492 for (z=n_start; z < n_end; ++z) {
493 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
494 +ALPHA[1]*R_PRES[2][z]
495 +ALPHA[2]*R_PRES[3][z]
496 +ALPHA[3]*R_PRES[4][z]
497 +ALPHA[4]*R_PRES[5][z]
498 +ALPHA[5]*R_PRES[6][z];
499 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
500 +ALPHA[1]*X_PRES[2][z]
501 +ALPHA[2]*X_PRES[3][z]
502 +ALPHA[3]*X_PRES[4][z]
503 +ALPHA[4]*X_PRES[5][z]
504 +ALPHA[5]*X_PRES[6][z];
505 }
506 } else {
507 #pragma ivdep
508 for (z=n_start; z < n_end; ++z) {
509 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
510 +ALPHA[1]*R_PRES[2][z]
511 +ALPHA[2]*R_PRES[3][z]
512 +ALPHA[3]*R_PRES[4][z]
513 +ALPHA[4]*R_PRES[5][z]
514 +ALPHA[5]*R_PRES[6][z]
515 +ALPHA[6]*R_PRES[7][z];
516 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
517 +ALPHA[1]*X_PRES[2][z]
518 +ALPHA[2]*X_PRES[3][z]
519 +ALPHA[3]*X_PRES[4][z]
520 +ALPHA[4]*X_PRES[5][z]
521 +ALPHA[5]*X_PRES[6][z]
522 +ALPHA[6]*X_PRES[7][z];
523 }
524 for (i=7;i<order;++i) {
525 #pragma ivdep
526 for (z=n_start; z < n_end; ++z) {
527 R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
528 X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
529 }
530 }
531 }
532 }
533 if (breakf0>0.) {
534 /***
535 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
536 ***/
537 memset(loc_dots,0,sizeof(double)*3);
538 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
539 for (th=0;th<num_threads;++th) {
540 local_n=n/num_threads;
541 rest=n-local_n*num_threads;
542 n_start=local_n*th+MIN(th,rest);
543 n_end=local_n*(th+1)+MIN(th+1,rest);
544 SC1=ZERO;
545 SC2=ZERO;
546 #pragma ivdep
547 for (z=n_start; z < n_end; ++z) {
548 diff=R_PRES[0][z]-r[z];
549 SC1+=diff*diff;
550 SC2+=diff*r[z];
551 }
552 #pragma omp critical
553 {
554 loc_dots[0]+=SC1;
555 loc_dots[1]+=SC2;
556 }
557 }
558 #ifdef PASO_MPI
559 MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
560 SC1=dots[0];
561 SC2=dots[1];
562 #else
563 SC1=loc_dots[0];
564 SC2=loc_dots[1];
565 #endif
566 gamma=(SC1<=ZERO) ? ZERO : -SC2/SC1;
567 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
568 for (th=0;th<num_threads;++th) {
569 local_n=n/num_threads;
570 rest=n-local_n*num_threads;
571 n_start=local_n*th+MIN(th,rest);
572 n_end=local_n*(th+1)+MIN(th+1,rest);
573 L2_R=ZERO;
574 #pragma ivdep
575 for (z=n_start; z < n_end; ++z) {
576 x[z]+=gamma*(X_PRES[0][z]-x[z]);
577 r[z]+=gamma*(R_PRES[0][z]-r[z]);
578 L2_R+=r[z]*r[z];
579 }
580 #pragma omp critical
581 {
582 loc_dots[2]+=L2_R;
583 }
584 }
585 #ifdef PASO_MPI
586 MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
587 L2_R=dots[2];
588 #else
589 L2_R=loc_dots[2];
590 #endif
591 norm_of_residual=sqrt(L2_R);
592 convergeFlag = (norm_of_residual <= tol);
593 if (restart>0) restartFlag=(num_iter_restart >= restart);
594 } else {
595 convergeFlag=FALSE;
596 restartFlag=FALSE;
597 }
598 maxIterFlag = (num_iter >= maxit);
599 }
600 }
601 /* end of iteration */
602 Norm_of_residual_global=norm_of_residual;
603 Num_iter_global=num_iter;
604 if (maxIterFlag) {
605 Status = SOLVER_MAXITER_REACHED;
606 } else if (breakFlag) {
607 Status = SOLVER_BREAKDOWN;
608 }
609 }
610 for (i=0;i<Length_of_recursion;i++) {
611 TMPMEMFREE(X_PRES[i]);
612 TMPMEMFREE(R_PRES[i]);
613 TMPMEMFREE(P_PRES[i]);
614 }
615 TMPMEMFREE(AP);
616 TMPMEMFREE(X_PRES);
617 TMPMEMFREE(R_PRES);
618 TMPMEMFREE(P_PRES);
619 TMPMEMFREE(P_PRES_dot_AP);
620 TMPMEMFREE(R_PRES_dot_P_PRES);
621 TMPMEMFREE(BREAKF);
622 TMPMEMFREE(ALPHA);
623 TMPMEMFREE(dots);
624 TMPMEMFREE(loc_dots);
625 *iter=Num_iter_global;
626 *tolerance=Norm_of_residual_global;
627 return Status;
628 }

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26