/[escript]/branches/doubleplusgood/paso/src/GMRES.cpp
ViewVC logotype

Contents of /branches/doubleplusgood/paso/src/GMRES.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4291 - (show annotations)
Thu Mar 7 08:57:58 2013 UTC (6 years, 1 month ago) by jfenwick
File size: 27247 byte(s)


1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2003-2013 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development since 2012 by School of Earth Sciences
13 *
14 *****************************************************************************/
15
16
17 /*
18 * Purpose
19 * =======
20 *
21 * GMRES solves the linear system A*x=b using the
22 * truncated and restarted GMRES method with preconditioning.
23 *
24 * Convergence test: norm( b - A*x )< TOL.
25 *
26 *
27 *
28 * Arguments
29 * =========
30 *
31 * r (input/output) double array, dimension n.
32 * On entry, residual of initial guess X
33 *
34 * x (input/output) double array, dimension n.
35 * On input, the initial guess.
36 *
37 * iter (input/output) int
38 * On input, the maximum num_iterations to be performed.
39 * On output, actual number of num_iterations performed.
40 *
41 * tolerance (input/output) DOUBLE PRECISION
42 * On input, the allowable convergence measure for
43 * norm( b - A*x )
44 * On output, the final value of this measure.
45 *
46 * Length_of_recursion (input) gives the number of residual to be kept in orthogonalization process
47 *
48 * restart (input) If restart>0, iteration is restarted a after restart steps.
49 *
50 * INFO (output) int
51 *
52 * =SOLVER_NO_ERROR: Successful exit. num_iterated approximate solution returned.
53 * =SOLVER_MAXNUM_ITER_REACHED
54 * =SOLVER_INPUT_ERROR Illegal parameter:
55 * =SOLVER_BREAKDOWN: bad luck!
56 * =SOLVER_MEMORY_ERROR : no memory available
57 *
58 * ==============================================================
59 */
60
61 #include "Common.h"
62 #include "SystemMatrix.h"
63 #include "Solver.h"
64 #ifdef _OPENMP
65 #include <omp.h>
66 #endif
67
68 err_t Paso_Solver_GMRES(
69 Paso_SystemMatrix * A,
70 double * r,
71 double * x,
72 dim_t *iter,
73 double * tolerance,dim_t Length_of_recursion,dim_t restart,
74 Paso_Performance* pp) {
75
76 /* Local variables */
77
78 #ifdef _OPENMP
79 const int num_threads=omp_get_max_threads();
80 #else
81 const int num_threads=1;
82 #endif
83 double *AP,**X_PRES,**R_PRES,**P_PRES, *dots, *loc_dots;
84 double *P_PRES_dot_AP,*R_PRES_dot_P_PRES,*BREAKF,*ALPHA;
85 double R_PRES_dot_AP0,P_PRES_dot_AP0,P_PRES_dot_AP1,P_PRES_dot_AP2,P_PRES_dot_AP3,P_PRES_dot_AP4,P_PRES_dot_AP5,P_PRES_dot_AP6,R_PRES_dot_P,breakf0;
86 double tol,Factor,sum_BREAKF,gamma,SC1,SC2,norm_of_residual=0,diff,L2_R,Norm_of_residual_global=0;
87 double *save_XPRES, *save_P_PRES, *save_R_PRES,save_R_PRES_dot_P_PRES;
88 dim_t maxit,Num_iter_global=0,num_iter_restart=0,num_iter;
89 dim_t i,z,order,n, Length_of_mem, th, local_n , rest, n_start ,n_end;
90 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE,restartFlag=FALSE;
91 err_t Status=SOLVER_NO_ERROR;
92
93
94 /* adapt original routine parameters */
95 n = Paso_SystemMatrix_getTotalNumRows(A);
96 Length_of_mem=MAX(Length_of_recursion,0)+1;
97
98 /* Test the input parameters. */
99 if (restart>0) restart=MAX(Length_of_recursion,restart);
100 if (n < 0 || Length_of_recursion<=0) {
101 return SOLVER_INPUT_ERROR;
102 }
103
104 /* allocate memory: */
105 X_PRES=new double*[Length_of_mem];
106 R_PRES=new double*[Length_of_mem];
107 P_PRES=new double*[Length_of_mem];
108 loc_dots=new double[MAX(Length_of_mem+1,3)];
109 dots=new double[MAX(Length_of_mem+1,3)];
110 P_PRES_dot_AP=new double[Length_of_mem];
111 R_PRES_dot_P_PRES=new double[Length_of_mem];
112 BREAKF=new double[Length_of_mem];
113 ALPHA=new double[Length_of_mem];
114 AP=new double[n];
115 if (AP==NULL || X_PRES ==NULL || R_PRES == NULL || P_PRES == NULL ||
116 P_PRES_dot_AP == NULL || R_PRES_dot_P_PRES ==NULL || BREAKF == NULL || ALPHA == NULL || dots==NULL || loc_dots==NULL) {
117 Status=SOLVER_MEMORY_ERROR;
118 } else {
119 for (i=0;i<Length_of_mem;i++) {
120 X_PRES[i]=new double[n];
121 R_PRES[i]=new double[n];
122 P_PRES[i]=new double[n];
123 if (X_PRES[i]==NULL || R_PRES[i]==NULL || P_PRES[i]==NULL) Status=SOLVER_MEMORY_ERROR;
124 }
125 }
126 if ( Status ==SOLVER_NO_ERROR ) {
127
128 /* now PRES starts : */
129 maxit=*iter;
130 tol=*tolerance;
131
132 /* initialization */
133
134 restartFlag=TRUE;
135 num_iter=0;
136
137 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
138 for (th=0;th<num_threads;++th) {
139 local_n=n/num_threads;
140 rest=n-local_n*num_threads;
141 n_start=local_n*th+MIN(th,rest);
142 n_end=local_n*(th+1)+MIN(th+1,rest);
143 memset(&AP[n_start],0,sizeof(double)*(n_end-n_start));
144 for(i=0;i<Length_of_mem;++i) {
145 memset(&P_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
146 memset(&R_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
147 memset(&X_PRES[i][n_start],0,sizeof(double)*(n_end-n_start));
148 }
149 }
150
151 while (! (convergeFlag || maxIterFlag || breakFlag)) {
152 if (restartFlag) {
153 BREAKF[0]=PASO_ONE;
154 #pragma omp parallel for private(th,z, local_n, rest, n_start, n_end)
155 for (th=0;th<num_threads;++th) {
156 local_n=n/num_threads;
157 rest=n-local_n*num_threads;
158 n_start=local_n*th+MIN(th,rest);
159 n_end=local_n*(th+1)+MIN(th+1,rest);
160 memcpy(&R_PRES[0][n_start],&r[n_start],sizeof(double)*(n_end-n_start));
161 memcpy(&X_PRES[0][n_start],&x[n_start],sizeof(double)*(n_end-n_start));
162 }
163 num_iter_restart=0;
164 restartFlag=FALSE;
165 }
166 ++num_iter;
167 ++num_iter_restart;
168 /* order is the dimension of the space on which the residual is minimized: */
169 order=MIN(num_iter_restart,Length_of_recursion);
170 /***
171 *** calculate new search direction P from R_PRES
172 ***/
173 Paso_SystemMatrix_solvePreconditioner(A,&P_PRES[0][0], &R_PRES[0][0]);
174 /***
175 *** apply A to P to get AP
176 ***/
177 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, &P_PRES[0][0],PASO_ZERO, &AP[0]);
178 /***
179 ***** calculation of the norm of R and the scalar products of
180 *** the residuals and A*P:
181 ***/
182 memset(loc_dots,0,sizeof(double)*(order+1));
183 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end, R_PRES_dot_P, P_PRES_dot_AP0, P_PRES_dot_AP1, P_PRES_dot_AP2, P_PRES_dot_AP3, P_PRES_dot_AP4, P_PRES_dot_AP5, P_PRES_dot_AP6)
184 for (th=0;th<num_threads;++th) {
185 local_n=n/num_threads;
186 rest=n-local_n*num_threads;
187 n_start=local_n*th+MIN(th,rest);
188 n_end=local_n*(th+1)+MIN(th+1,rest);
189 if (order==0) {
190 R_PRES_dot_P=PASO_ZERO;
191 #pragma ivdep
192 for (z=n_start; z < n_end; ++z) {
193 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
194 }
195 #pragma omp critical
196 {
197 loc_dots[0]+=R_PRES_dot_P;
198 }
199 } else if (order==1) {
200 R_PRES_dot_P=PASO_ZERO;
201 P_PRES_dot_AP0=PASO_ZERO;
202 #pragma ivdep
203 for (z=n_start; z < n_end; ++z) {
204 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
205 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
206 }
207 #pragma omp critical
208 {
209 loc_dots[0]+=R_PRES_dot_P;
210 loc_dots[1]+=P_PRES_dot_AP0;
211 }
212 } else if (order==2) {
213 R_PRES_dot_P=PASO_ZERO;
214 P_PRES_dot_AP0=PASO_ZERO;
215 P_PRES_dot_AP1=PASO_ZERO;
216 #pragma ivdep
217 for (z=n_start; z < n_end; ++z) {
218 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
219 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
220 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
221 }
222 #pragma omp critical
223 {
224 loc_dots[0]+=R_PRES_dot_P;
225 loc_dots[1]+=P_PRES_dot_AP0;
226 loc_dots[2]+=P_PRES_dot_AP1;
227 }
228 } else if (order==3) {
229 R_PRES_dot_P=PASO_ZERO;
230 P_PRES_dot_AP0=PASO_ZERO;
231 P_PRES_dot_AP1=PASO_ZERO;
232 P_PRES_dot_AP2=PASO_ZERO;
233 #pragma ivdep
234 for (z=n_start; z < n_end; ++z) {
235 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
236 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
237 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
238 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
239 }
240 #pragma omp critical
241 {
242 loc_dots[0]+=R_PRES_dot_P;
243 loc_dots[1]+=P_PRES_dot_AP0;
244 loc_dots[2]+=P_PRES_dot_AP1;
245 loc_dots[3]+=P_PRES_dot_AP2;
246 }
247 } else if (order==4) {
248 R_PRES_dot_P=PASO_ZERO;
249 P_PRES_dot_AP0=PASO_ZERO;
250 P_PRES_dot_AP1=PASO_ZERO;
251 P_PRES_dot_AP2=PASO_ZERO;
252 P_PRES_dot_AP3=PASO_ZERO;
253 #pragma ivdep
254 for (z=n_start; z < n_end; ++z) {
255 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
256 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
257 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
258 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
259 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
260 }
261 #pragma omp critical
262 {
263 loc_dots[0]+=R_PRES_dot_P;
264 loc_dots[1]+=P_PRES_dot_AP0;
265 loc_dots[2]+=P_PRES_dot_AP1;
266 loc_dots[3]+=P_PRES_dot_AP2;
267 loc_dots[4]+=P_PRES_dot_AP3;
268 }
269 } else if (order==5) {
270 R_PRES_dot_P=PASO_ZERO;
271 P_PRES_dot_AP0=PASO_ZERO;
272 P_PRES_dot_AP1=PASO_ZERO;
273 P_PRES_dot_AP2=PASO_ZERO;
274 P_PRES_dot_AP3=PASO_ZERO;
275 P_PRES_dot_AP4=PASO_ZERO;
276 #pragma ivdep
277 for (z=n_start; z < n_end; ++z) {
278 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
279 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
280 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
281 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
282 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
283 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
284 }
285 #pragma omp critical
286 {
287 loc_dots[0]+=R_PRES_dot_P;
288 loc_dots[1]+=P_PRES_dot_AP0;
289 loc_dots[2]+=P_PRES_dot_AP1;
290 loc_dots[3]+=P_PRES_dot_AP2;
291 loc_dots[4]+=P_PRES_dot_AP3;
292 loc_dots[5]+=P_PRES_dot_AP4;
293 }
294 } else if (order==6) {
295 R_PRES_dot_P=PASO_ZERO;
296 P_PRES_dot_AP0=PASO_ZERO;
297 P_PRES_dot_AP1=PASO_ZERO;
298 P_PRES_dot_AP2=PASO_ZERO;
299 P_PRES_dot_AP3=PASO_ZERO;
300 P_PRES_dot_AP4=PASO_ZERO;
301 P_PRES_dot_AP5=PASO_ZERO;
302 #pragma ivdep
303 for (z=n_start; z < n_end; ++z) {
304 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
305 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
306 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
307 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
308 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
309 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
310 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
311 }
312 #pragma omp critical
313 {
314 loc_dots[0]+=R_PRES_dot_P;
315 loc_dots[1]+=P_PRES_dot_AP0;
316 loc_dots[2]+=P_PRES_dot_AP1;
317 loc_dots[3]+=P_PRES_dot_AP2;
318 loc_dots[4]+=P_PRES_dot_AP3;
319 loc_dots[5]+=P_PRES_dot_AP4;
320 loc_dots[6]+=P_PRES_dot_AP5;
321 }
322 } else {
323 R_PRES_dot_P=PASO_ZERO;
324 P_PRES_dot_AP0=PASO_ZERO;
325 P_PRES_dot_AP1=PASO_ZERO;
326 P_PRES_dot_AP2=PASO_ZERO;
327 P_PRES_dot_AP3=PASO_ZERO;
328 P_PRES_dot_AP4=PASO_ZERO;
329 P_PRES_dot_AP5=PASO_ZERO;
330 P_PRES_dot_AP6=PASO_ZERO;
331 #pragma ivdep
332 for (z=n_start; z < n_end; ++z) {
333 R_PRES_dot_P+=R_PRES[0][z]*P_PRES[0][z];
334 P_PRES_dot_AP0+=P_PRES[0][z]*AP[z];
335 P_PRES_dot_AP1+=P_PRES[1][z]*AP[z];
336 P_PRES_dot_AP2+=P_PRES[2][z]*AP[z];
337 P_PRES_dot_AP3+=P_PRES[3][z]*AP[z];
338 P_PRES_dot_AP4+=P_PRES[4][z]*AP[z];
339 P_PRES_dot_AP5+=P_PRES[5][z]*AP[z];
340 P_PRES_dot_AP6+=P_PRES[6][z]*AP[z];
341 }
342 #pragma omp critical
343 {
344 loc_dots[0]+=R_PRES_dot_P;
345 loc_dots[1]+=P_PRES_dot_AP0;
346 loc_dots[2]+=P_PRES_dot_AP1;
347 loc_dots[3]+=P_PRES_dot_AP2;
348 loc_dots[4]+=P_PRES_dot_AP3;
349 loc_dots[5]+=P_PRES_dot_AP4;
350 loc_dots[6]+=P_PRES_dot_AP5;
351 loc_dots[7]+=P_PRES_dot_AP6;
352 }
353 for (i=7;i<order;++i) {
354 P_PRES_dot_AP0=PASO_ZERO;
355 #pragma ivdep
356 for (z=n_start; z < n_end; ++z) {
357 P_PRES_dot_AP0+=P_PRES[i][z]*AP[z];
358 }
359 #pragma omp critical
360 {
361 loc_dots[i+1]+=P_PRES_dot_AP0;
362 }
363 }
364 }
365 }
366 #ifdef ESYS_MPI
367 MPI_Allreduce(loc_dots, dots, order+1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
368 R_PRES_dot_P_PRES[0]=dots[0];
369 memcpy(P_PRES_dot_AP,&dots[1],sizeof(double)*order);
370 #else
371 R_PRES_dot_P_PRES[0]=loc_dots[0];
372 memcpy(P_PRES_dot_AP,&loc_dots[1],sizeof(double)*order);
373 #endif
374 R_PRES_dot_AP0=R_PRES_dot_P_PRES[0];
375 /*** If sum_BREAKF is equal to zero a breakdown occurs.
376 *** Iteration procedure can be continued but R_PRES is not the
377 *** residual of X_PRES approximation.
378 ***/
379 sum_BREAKF=0.;
380 #pragma ivdep
381 for (i=0;i<order;++i) sum_BREAKF +=BREAKF[i];
382 breakFlag=!((ABS(R_PRES_dot_AP0) > PASO_ZERO) && (sum_BREAKF >PASO_ZERO));
383 if (!breakFlag) {
384 breakFlag=FALSE;
385 /***
386 ***** X_PRES and R_PRES are moved to memory:
387 ***/
388 Factor=0.;
389 #pragma ivdep
390 for (i=0;i<order;++i) {
391 ALPHA[i]=-P_PRES_dot_AP[i]/R_PRES_dot_P_PRES[i];
392 Factor+=BREAKF[i]*ALPHA[i];
393 }
394
395 save_R_PRES_dot_P_PRES=R_PRES_dot_P_PRES[Length_of_mem-1];
396 save_R_PRES=R_PRES[Length_of_mem-1];
397 save_XPRES=X_PRES[Length_of_mem-1];
398 save_P_PRES=P_PRES[Length_of_mem-1];
399 #pragma ivdep
400 for (i=Length_of_mem-1;i>0;--i) {
401 BREAKF[i]=BREAKF[i-1];
402 R_PRES_dot_P_PRES[i]=R_PRES_dot_P_PRES[i-1];
403 R_PRES[i]=R_PRES[i-1];
404 X_PRES[i]=X_PRES[i-1];
405 P_PRES[i]=P_PRES[i-1];
406 }
407 R_PRES_dot_P_PRES[0]=save_R_PRES_dot_P_PRES;
408 R_PRES[0]=save_R_PRES;
409 X_PRES[0]=save_XPRES;
410 P_PRES[0]=save_P_PRES;
411
412 if (ABS(Factor)<=PASO_ZERO) {
413 Factor=1.;
414 BREAKF[0]=PASO_ZERO;
415 } else {
416 Factor=1./Factor;
417 BREAKF[0]=PASO_ONE;
418 }
419 for (i=0;i<order;++i) ALPHA[i]*=Factor;
420 /*
421 ***** Update of solution X_PRES and its residual R_PRES:
422 ***
423 *** P is used to accumulate X and AP to accumulate R. X and R
424 *** are still needed until they are put into the X and R memory
425 *** R_PRES and X_PRES
426 ***
427 **/
428 breakf0=BREAKF[0];
429 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end)
430 for (th=0;th<num_threads;++th) {
431 local_n=n/num_threads;
432 rest=n-local_n*num_threads;
433 n_start=local_n*th+MIN(th,rest);
434 n_end=local_n*(th+1)+MIN(th+1,rest);
435 if (order==0) {
436 #pragma ivdep
437 for (z=n_start; z < n_end; ++z) {
438 R_PRES[0][z]= Factor* AP[z];
439 X_PRES[0][z]=-Factor*P_PRES[1][z];
440 }
441 } else if (order==1) {
442 #pragma ivdep
443 for (z=n_start; z < n_end; ++z) {
444 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z];
445 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z];
446 }
447 } else if (order==2) {
448 #pragma ivdep
449 for (z=n_start; z < n_end; ++z) {
450 R_PRES[0][z]= Factor* AP[z]+ALPHA[0]*R_PRES[1][z]
451 +ALPHA[1]*R_PRES[2][z];
452 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
453 +ALPHA[1]*X_PRES[2][z];
454 }
455 } else if (order==3) {
456 #pragma ivdep
457 for (z=n_start; z < n_end; ++z) {
458 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
459 +ALPHA[1]*R_PRES[2][z]
460 +ALPHA[2]*R_PRES[3][z];
461 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
462 +ALPHA[1]*X_PRES[2][z]
463 +ALPHA[2]*X_PRES[3][z];
464 }
465 } else if (order==4) {
466 #pragma ivdep
467 for (z=n_start; z < n_end; ++z) {
468 R_PRES[0][z]= Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
469 +ALPHA[1]*R_PRES[2][z]
470 +ALPHA[2]*R_PRES[3][z]
471 +ALPHA[3]*R_PRES[4][z];
472 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
473 +ALPHA[1]*X_PRES[2][z]
474 +ALPHA[2]*X_PRES[3][z]
475 +ALPHA[3]*X_PRES[4][z];
476 }
477 } else if (order==5) {
478 #pragma ivdep
479 for (z=n_start; z < n_end; ++z) {
480 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
481 +ALPHA[1]*R_PRES[2][z]
482 +ALPHA[2]*R_PRES[3][z]
483 +ALPHA[3]*R_PRES[4][z]
484 +ALPHA[4]*R_PRES[5][z];
485 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
486 +ALPHA[1]*X_PRES[2][z]
487 +ALPHA[2]*X_PRES[3][z]
488 +ALPHA[3]*X_PRES[4][z]
489 +ALPHA[4]*X_PRES[5][z];
490 }
491 } else if (order==6) {
492 #pragma ivdep
493 for (z=n_start; z < n_end; ++z) {
494 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
495 +ALPHA[1]*R_PRES[2][z]
496 +ALPHA[2]*R_PRES[3][z]
497 +ALPHA[3]*R_PRES[4][z]
498 +ALPHA[4]*R_PRES[5][z]
499 +ALPHA[5]*R_PRES[6][z];
500 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
501 +ALPHA[1]*X_PRES[2][z]
502 +ALPHA[2]*X_PRES[3][z]
503 +ALPHA[3]*X_PRES[4][z]
504 +ALPHA[4]*X_PRES[5][z]
505 +ALPHA[5]*X_PRES[6][z];
506 }
507 } else {
508 #pragma ivdep
509 for (z=n_start; z < n_end; ++z) {
510 R_PRES[0][z]=Factor*AP[z]+ALPHA[0]*R_PRES[1][z]
511 +ALPHA[1]*R_PRES[2][z]
512 +ALPHA[2]*R_PRES[3][z]
513 +ALPHA[3]*R_PRES[4][z]
514 +ALPHA[4]*R_PRES[5][z]
515 +ALPHA[5]*R_PRES[6][z]
516 +ALPHA[6]*R_PRES[7][z];
517 X_PRES[0][z]=-Factor*P_PRES[1][z]+ALPHA[0]*X_PRES[1][z]
518 +ALPHA[1]*X_PRES[2][z]
519 +ALPHA[2]*X_PRES[3][z]
520 +ALPHA[3]*X_PRES[4][z]
521 +ALPHA[4]*X_PRES[5][z]
522 +ALPHA[5]*X_PRES[6][z]
523 +ALPHA[6]*X_PRES[7][z];
524 }
525 for (i=7;i<order;++i) {
526 #pragma ivdep
527 for (z=n_start; z < n_end; ++z) {
528 R_PRES[0][z]+=ALPHA[i]*R_PRES[i+1][z];
529 X_PRES[0][z]+=ALPHA[i]*X_PRES[i+1][z];
530 }
531 }
532 }
533 }
534 if (breakf0>0.) {
535 /***
536 ***** calculate gamma from min_(gamma){|R+gamma*(R_PRES-R)|_2}:
537 ***/
538 memset(loc_dots,0,sizeof(double)*3);
539 #pragma omp parallel for private(th,z,i,local_n, rest, n_start, n_end,diff,SC1,SC2)
540 for (th=0;th<num_threads;++th) {
541 local_n=n/num_threads;
542 rest=n-local_n*num_threads;
543 n_start=local_n*th+MIN(th,rest);
544 n_end=local_n*(th+1)+MIN(th+1,rest);
545 SC1=PASO_ZERO;
546 SC2=PASO_ZERO;
547 #pragma ivdep
548 for (z=n_start; z < n_end; ++z) {
549 diff=R_PRES[0][z]-r[z];
550 SC1+=diff*diff;
551 SC2+=diff*r[z];
552 }
553 #pragma omp critical
554 {
555 loc_dots[0]+=SC1;
556 loc_dots[1]+=SC2;
557 }
558 }
559 #ifdef ESYS_MPI
560 MPI_Allreduce(loc_dots, dots, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
561 SC1=dots[0];
562 SC2=dots[1];
563 #else
564 SC1=loc_dots[0];
565 SC2=loc_dots[1];
566 #endif
567 gamma=(SC1<=PASO_ZERO) ? PASO_ZERO : -SC2/SC1;
568 #pragma omp parallel for private(th,z,local_n, rest, n_start, n_end,diff,L2_R)
569 for (th=0;th<num_threads;++th) {
570 local_n=n/num_threads;
571 rest=n-local_n*num_threads;
572 n_start=local_n*th+MIN(th,rest);
573 n_end=local_n*(th+1)+MIN(th+1,rest);
574 L2_R=PASO_ZERO;
575 #pragma ivdep
576 for (z=n_start; z < n_end; ++z) {
577 x[z]+=gamma*(X_PRES[0][z]-x[z]);
578 r[z]+=gamma*(R_PRES[0][z]-r[z]);
579 L2_R+=r[z]*r[z];
580 }
581 #pragma omp critical
582 {
583 loc_dots[2]+=L2_R;
584 }
585 }
586 #ifdef ESYS_MPI
587 MPI_Allreduce(&loc_dots[2], &dots[2], 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
588 L2_R=dots[2];
589 #else
590 L2_R=loc_dots[2];
591 #endif
592 norm_of_residual=sqrt(L2_R);
593 convergeFlag = (norm_of_residual <= tol);
594 if (restart>0) restartFlag=(num_iter_restart >= restart);
595 } else {
596 convergeFlag=FALSE;
597 restartFlag=FALSE;
598 }
599 maxIterFlag = (num_iter >= maxit);
600 }
601 }
602 /* end of iteration */
603 Norm_of_residual_global=norm_of_residual;
604 Num_iter_global=num_iter;
605 if (maxIterFlag) {
606 Status = SOLVER_MAXITER_REACHED;
607 } else if (breakFlag) {
608 Status = SOLVER_BREAKDOWN;
609 }
610 }
611 for (i=0;i<Length_of_recursion;i++) {
612 delete[] X_PRES[i];
613 delete[] R_PRES[i];
614 delete[] P_PRES[i];
615 }
616 delete[] AP;
617 delete[] X_PRES;
618 delete[] R_PRES;
619 delete[] P_PRES;
620 delete[] P_PRES_dot_AP;
621 delete[] R_PRES_dot_P_PRES;
622 delete[] BREAKF;
623 delete[] ALPHA;
624 delete[] dots;
625 delete[] loc_dots;
626 *iter=Num_iter_global;
627 *tolerance=Norm_of_residual_global;
628 return Status;
629 }
630

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26