/[escript]/trunk/paso/src/PCG.c

Revision 2484
Mon Jun 22 04:22:19 2009 UTC by gross
File MIME type: text/plain
File size: 13657 byte(s)
numarray removed from docu; Locator revised.

/*******************************************************
 *
 * Copyright (c) 2003-2008 by University of Queensland
 * Earth Systems Science Computational Center (ESSCC)
 * http://www.uq.edu.au/esscc
 *
 * Primary Business: Queensland, Australia
 * Licensed under the Open Software License version 3.0
 * http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/


/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
 *
 * Purpose
 * =======
 *
 * PCG solves the linear system A*x = b using the
 * preconditioned conjugate gradient method combined with residual smoothing.
 * A has to be symmetric.
 *
 * Convergence test: norm( b - A*x ) < TOL
 * (in the code this is the norm of the smoothed residual rs).
 *
 * Arguments
 * =========
 *
 * r (input/output) DOUBLE PRECISION array, dimension N.
 * On entry, the residual b - A*x of the initial guess x;
 * it is overwritten during the iteration.
 *
 * x (input/output) DOUBLE PRECISION array, dimension N.
 * On input, the initial guess.
 *
 * ITER (input/output) INT
 * On input, the maximum number of iterations to be performed.
 * On output, the actual number of iterations performed.
 *
 * TOLERANCE (input/output) DOUBLE PRECISION
 * On input, the convergence tolerance TOL.
 * On output, the residual norm that was reached.
 *
 * INFO (return value) INT
 *
 * = SOLVER_NO_ERROR: successful exit; the iterated approximate solution is returned.
 * = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
 * = SOLVER_INPUT_ERROR: illegal parameter (e.g. a negative dimension).
 * = SOLVER_BREAKDOWN: the iteration broke down because a scalar
 *                     (tau or delta) became too small.
 * = SOLVER_MEMORY_ERROR: a work array could not be allocated.
 *
 * ==============================================================
 */
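
/*
 * Illustrative usage sketch (added for clarity; b, x and pp are hypothetical
 * objects assumed to be set up elsewhere, together with the Paso_SystemMatrix A).
 * The initial residual is formed with the matrix-vector kernel used later in
 * this file, assuming it computes out = alpha*A*in + beta*out, consistent with
 * the v=A*p call inside the iteration.
 */
#if 0
{
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   dim_t iter = 1000;          /* in: iteration limit, out: iterations used        */
   double resid = 1.e-8;       /* in: tolerance TOL, out: residual norm reached    */
   double* r = TMPMEMALLOC(n, double);
   dim_t i;
   err_t err;

   /* r = b - A*x for the initial guess x */
   for (i = 0; i < n; ++i) r[i] = b[i];
   Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-PASO_ONE, A, x, PASO_ONE, r);

   err = Paso_Solver_PCG(A, r, x, &iter, &resid, pp);
   if (err == SOLVER_NO_ERROR) {
      /* x holds the iterated solution; iter and resid report cost and accuracy */
   }
   TMPMEMFREE(r);
}
#endif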

/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
          Paso_SystemMatrix * A,
          double * r,
          double * x,
          dim_t *iter,
          double * tolerance,
          Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0,maxit,num_iter_global, len,rest, np, ipp;
   #ifdef USE_DYNAMIC_SCHEDULING
   dim_t chunk_size=-1;
   #endif
   register double ss,ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old,tau,beta,delta,gamma_1,gamma_2,alpha,sum_1,sum_2,sum_3,sum_4,sum_5,tol;
   #ifdef PASO_MPI
   double loc_sum[2], sum[2];
   #endif
   double norm_of_residual=0,norm_of_residual_global;
   register double d;

   /* Should not be any executable code before this ifdef */

   #ifdef USE_DYNAMIC_SCHEDULING

   /* Watch out for these declarations (see above) */
   char* chksz_chr;
   dim_t n_chunks;

   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
   np=omp_get_max_threads();
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
   #else
   /* omp_get_max_threads() is only available when OpenMP is enabled;
      otherwise fall back to a single chunk */
   #ifdef _OPENMP
   np=omp_get_max_threads();
   #else
   np=1;
   #endif
   len=n/np;
   rest=n-len*np;
   #endif
   /*                                                                 */
   /*-----------------------------------------------------------------*/
   /*                                                                 */
   /*   Start of Calculation :                                        */
   /*   ---------------------                                         */
   /*                                                                 */
   /*                                                                 */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
         #ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
         #else
         #pragma omp for schedule(static)
         for (ipp=0; ipp <np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
         #endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
         #ifdef USE_DYNAMIC_SCHEDULING
         }
         #else
         }
         #endif
      }
      num_iter=0;

      /* PGH */
      /* without this we get a use of an uninitialised var below */
      tau = 0;

      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);

         /* PGH */
         /* The next lines were commented out before I got here */
         /* v=prec(r) */
         /* tau=v*r; */
         /* leading to the use of an uninitialised var below */

         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
         #ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
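         /* sum_1 is the inner product tau = r.v = r.(M^{-1}r) with the
            preconditioner M; together with beta = tau/tau_old below this is the
            standard PCG update of the search direction p */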
         tau_old=tau;
         tau=sum_1;
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp,beta)
         {
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, p,PASO_ZERO,v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp,ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
         #ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         delta=sum_2;
         alpha=tau/delta;

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
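            /* the loop below advances the raw CG residual, r <- r - alpha*A*p,
               and accumulates sum_3 = ||r-rs||^2 and sum_4 = (r-rs).rs, which
               determine the smoothing weights gamma_1, gamma_2 further down */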
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp,d, ss, ss1)
            {
               ss=0;
               ss1=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp <np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
            #endif
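            /* gamma_1 = -sum_4/sum_3 minimizes ||rs + gamma*(r-rs)||^2 over gamma,
               so the smoothed residual rs (and, consistently, the smoothed iterate x)
               is the minimal-norm blend of the previous smoothed values and the new
               raw CG values; the convergence test below uses ||rs|| */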
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1,gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= PASO_ZERO) ? 0 : -sum_4/sum_3) ;
               gamma_2= PASO_ONE-gamma_1;
               ss=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp <np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            #endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter > maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}

Properties

Name           Value
svn:eol-style  native
svn:keywords   Author Date Id Revision
