/[escript]/trunk/paso/src/PCG.c

Revision 1811 - Thu Sep 25 23:11:13 2008 UTC by ksteube
File MIME type: text/plain
File size: 13591 byte(s)
Copyright updated in all files


/*******************************************************
 *
 * Copyright (c) 2003-2008 by University of Queensland
 * Earth Systems Science Computational Center (ESSCC)
 * http://www.uq.edu.au/esscc
 *
 * Primary Business: Queensland, Australia
 * Licensed under the Open Software License version 3.0
 * http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/


/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
 * Purpose
 * =======
 *
 * PCG solves the linear system A*x = b using the
 * preconditioned conjugate gradient method plus a smoother.
 * A has to be symmetric.
 *
 * Convergence test: norm( b - A*x ) < TOL.
 *
 * Arguments
 * =========
 *
 * r (input) DOUBLE PRECISION array, dimension N.
 *     On entry, the residual b - A*x of the initial guess x.
 *
 * x (input/output) DOUBLE PRECISION array, dimension N.
 *     On input, the initial guess.
 *     On output, the iterated approximate solution.
 *
 * ITER (input/output) INT
 *     On input, the maximum number of iterations to be performed.
 *     On output, the actual number of iterations performed.
 *
 * TOLERANCE (input/output) DOUBLE PRECISION
 *     On input, the convergence tolerance for norm( b - A*x ).
 *     On output, the norm of the residual of the returned solution.
 *
 * Return value (err_t)
 *
 *     = SOLVER_NO_ERROR:        successful exit, iterated approximate solution returned.
 *     = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
 *     = SOLVER_INPUT_ERROR:     illegal parameter (e.g. n < 0).
 *     = SOLVER_BREAKDOWN:       tau or delta became smaller than the scalar tolerance;
 *                               the iteration cannot proceed.
 *     = SOLVER_MEMORY_ERROR:    memory allocation failed.
 *
 * ==============================================================
 */
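
/* Illustrative usage sketch (added for documentation only, not part of the
   library). It assumes the caller already holds the assembled system matrix A,
   a right-hand side b, an initial guess x and a Paso_Performance handle pp,
   and that Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(alpha,A,x,beta,y)
   computes y = alpha*A*x + beta*y:

       dim_t  iter  = 1000;     maximum iterations in, iterations used out
       double resid = 1.e-8;    tolerance in, achieved residual norm out
       dim_t  n = Paso_SystemMatrix_getTotalNumRows(A);
       dim_t  i;
       err_t  status;
       double *r = TMPMEMALLOC(n, double);
       for (i = 0; i < n; i++) r[i] = b[i];
       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1., A, x, 1., r);    r = b - A*x
       status = Paso_Solver_PCG(A, r, x, &iter, &resid, pp);
       if (status == SOLVER_NO_ERROR) {
           x now holds the approximate solution with norm(b - A*x) <= resid
       }
       TMPMEMFREE(r);
*/
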
/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
    Paso_SystemMatrix * A,
    double * r,
    double * x,
    dim_t *iter,
    double * tolerance,
    Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0, maxit, num_iter_global, chunk_size=-1, len, rest, np, ipp;
   register double ss, ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old, tau, beta, delta, gamma_1, gamma_2, alpha, sum_1, sum_2, sum_3, sum_4, sum_5, tol;
   #ifdef PASO_MPI
   double loc_sum[2], sum[2];
   #endif
   double norm_of_residual, norm_of_residual_global;
   register double d;

   /* There should not be any executable code before this ifdef */

   #ifdef USE_DYNAMIC_SCHEDULING

   /* Watch out for these declarations (see the comment above) */
   char* chksz_chr;
   dim_t n_chunks;

   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d", &chunk_size);
   np=omp_get_max_threads();
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
   #else
   np=omp_get_max_threads();
   len=n/np;
   rest=n-len*np;
   #endif
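
   /* Illustrative note (added): how the rows are partitioned.
      Static case: n=10, np=4 gives len=2, rest=2, and thread ipp works on
      rows [ len*ipp+MIN(ipp,rest), len*(ipp+1)+MIN(ipp+1,rest) ), i.e.
      thread 0 -> [0,3), thread 1 -> [3,6), thread 2 -> [6,8), thread 3 -> [8,10);
      the first `rest` threads get one extra row.
      Dynamic case: the chunk size is read from the environment variable
      PASO_CHUNK_SIZE_PCG, clipped to [1, n/np], and n_chunks = ceil(n/chunk_size)
      chunks are handed out with schedule(dynamic,1). */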
   /*                                                                 */
   /*-----------------------------------------------------------------*/
   /*                                                                 */
   /*   Start of Calculation :                                        */
   /*   ---------------------                                         */
   /*                                                                 */
   /*                                                                 */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
         #ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
         #else
         #pragma omp for schedule(static)
         for (ipp=0; ipp < np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
         #endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
         #ifdef USE_DYNAMIC_SCHEDULING
         }
         #else
         }
         #endif
      }
      num_iter=0;

      /* PGH */
      /* without this we get a use of an uninitialised variable below */
      tau = 0;

      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);

         /* PGH */
         /* The next lines were commented out before I got here */
         /* v=prec(r) */
         /* tau=v*r; */
         /* leading to the use of an uninitialised variable below */

         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
         #ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
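         /* Added note: the omp critical section above has already accumulated the
            per-thread partial sums into sum_1, so under PASO_MPI each rank
            contributes a single local value to the MPI_Allreduce. */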
         tau_old=tau;
         tau=sum_1;
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp, beta)
         {
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, p, ZERO, v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
         #ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         delta=sum_2;
         alpha=tau/delta;
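         /* Added note: this is the standard PCG step length,
            alpha = (r, prec(r)) / (p, A*p) = tau/delta. */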

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
            {
               ss=0;
               ss1=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
            #endif
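            /* Added note: gamma_1 = -sum_4/sum_3 minimises || rs + gamma_1*(r - rs) ||_2,
               so the updates below form the minimal-residual combination of the old
               smoothed iterate (rs, x) and the new CG iterate (r, x2). */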
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ;
               gamma_2= ONE-gamma_1;
               ss=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            #endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter == maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision
