/[escript]/branches/doubleplusgood/paso/src/PCG.cpp

Revision 3981 - Fri Sep 21 02:47:54 2012 UTC by jfenwick
Original path: trunk/paso/src/PCG.c
Log message: First pass of updating copyright notices

/*****************************************************************************
*
* Copyright (c) 2003-2012 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development since 2012 by School of Earth Sciences
*
*****************************************************************************/


/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef ESYS_MPI
#include <mpi.h>
#endif

/*
*
* Purpose
* =======
*
* PCG solves the linear system A*x = b using the
* preconditioned conjugate gradient method plus a smoother.
* A has to be symmetric.
*
* Convergence test: norm( b - A*x ) < TOL.
*
* Arguments
* =========
*
* r (input) DOUBLE PRECISION array, dimension N.
* On entry, residual of the initial guess x.
* On exit, r is overwritten (it is used as workspace during the iteration).
*
* x (input/output) DOUBLE PRECISION array, dimension N.
* On input, the initial guess.
* On output, the iterated approximate solution.
*
* ITER (input/output) INT
* On input, the maximum number of iterations to be performed.
* On output, the actual number of iterations performed.
*
* TOLERANCE (input/output) DOUBLE PRECISION
* On input, the convergence tolerance TOL.
* On output, the norm of the (smoothed) residual at termination.
*
* INFO (output) INT
*
* = SOLVER_NO_ERROR: successful exit, iterated approximate solution returned.
* = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached
*   before the residual dropped below TOL.
* = SOLVER_INPUT_ERROR: illegal input parameter (e.g. a negative dimension).
* = SOLVER_BREAKDOWN: the iteration broke down because a scalar quantity
*   (TAU or DELTA) became too small in magnitude.
* = SOLVER_MEMORY_ERROR: the work arrays could not be allocated.
*
* ==============================================================
*/

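/*
 * Usage sketch (illustrative only, excluded from compilation): how a caller
 * might drive Paso_Solver_PCG. It assumes the caller already has an assembled
 * system matrix A, a right-hand side b, an initial guess x and a
 * Paso_Performance handle pp; the helper name example_pcg_solve and the
 * chosen iteration limit and tolerance are made up for this sketch, only the
 * call pattern mirrors this file.
 */
#if 0
static err_t example_pcg_solve(Paso_SystemMatrix* A, const double* b,
                               double* x, Paso_Performance* pp)
{
    const dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
    dim_t iter = 1000;          /* in: iteration limit, out: iterations used */
    double tolerance = 1.e-8;   /* in: convergence tolerance, out: residual norm */
    double* r = TMPMEMALLOC(n, double);
    dim_t i;
    err_t status;
    if (r == NULL) return SOLVER_MEMORY_ERROR;
    /* initial residual r = b - A*x, using the same MVM kernel as the solver:
       MatrixVector computes r = alpha*A*x + beta*r */
    for (i = 0; i < n; i++) r[i] = b[i];
    Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-PASO_ONE, A, x, PASO_ONE, r);
    status = Paso_Solver_PCG(A, r, x, &iter, &tolerance, pp);
    TMPMEMFREE(r);
    return status;
}
#endif
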
/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

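/*
 * The solver below repeatedly forms global dot products with the same
 * pattern: each OpenMP thread accumulates a private partial sum over its
 * share of the rows, the partial sums are combined in a critical section,
 * and (when ESYS_MPI is defined) the per-process result is summed across
 * ranks with MPI_Allreduce. The sketch below is illustrative only and is
 * excluded from compilation; it uses a plain worksharing loop instead of the
 * hand-rolled chunking used in the solver, and the helper name
 * example_global_dot is made up for this sketch.
 */
#if 0
static double example_global_dot(Paso_SystemMatrix* A,
                                 const double* u, const double* w, dim_t n)
{
    double result = 0;
    dim_t i;
    #pragma omp parallel
    {
        double ss = 0;                  /* thread-private partial sum */
        #pragma omp for
        for (i = 0; i < n; i++) ss += u[i] * w[i];
        #pragma omp critical
        {
            result += ss;               /* combine the partial sums */
        }
    }
    #ifdef ESYS_MPI
    {
        double loc_sum = result;        /* sum over all MPI ranks */
        MPI_Allreduce(&loc_sum, &result, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
    }
    #endif
    return result;
}
#endif
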
err_t Paso_Solver_PCG(
          Paso_SystemMatrix * A,
          double * r,
          double * x,
          dim_t *iter,
          double * tolerance,
          Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0, maxit, num_iter_global, len, rest, np, ipp;
   #ifdef USE_DYNAMIC_SCHEDULING
   dim_t chunk_size=-1;
   #endif
   register double ss, ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old, tau, beta, delta, gamma_1, gamma_2, alpha;
   double sum_1, sum_2, sum_3, sum_4, sum_5, tol;
   #ifdef ESYS_MPI
   double loc_sum[2], sum[2];
   #endif
   double norm_of_residual=0, norm_of_residual_global;
   register double d;

   /* There should not be any executable code before this ifdef */

   #ifdef USE_DYNAMIC_SCHEDULING

   /* Watch out for these declarations (see above) */
   char* chksz_chr;
   dim_t n_chunks;

   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d", &chunk_size);
   np=omp_get_max_threads();
   chunk_size=MIN(MAX(1,chunk_size), n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
   #else
   np=omp_get_max_threads();
   len=n/np;
   rest=n-len*np;
   #endif
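   /*
    * Row ranges per thread: in the static branch above, thread ipp works on
    * [istart,iend) with istart=len*ipp+MIN(ipp,rest) and
    * iend=len*(ipp+1)+MIN(ipp+1,rest), i.e. the first "rest" threads get
    * len+1 rows and the remaining threads get len rows. Illustrative values
    * only: n=10, np=4 gives len=2, rest=2 and the ranges
    * [0,3), [3,6), [6,8), [8,10).
    */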

   /*-----------------------------------------------------------------*/
   /*   Start of Calculation :                                         */
   /*-----------------------------------------------------------------*/
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
         #ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp<n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
         #else
         #pragma omp for schedule(static)
         for (ipp=0; ipp<np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
         #endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
         #ifdef USE_DYNAMIC_SCHEDULING
         }
         #else
         }
         #endif
      }
      num_iter=0;

      /* PGH */
      /* without this we get a use of an uninitialised var below */
      tau = 0;

      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);

         /* PGH */
         /* The next lines were commented out before I got here */
         /* v=prec(r) */
         /* tau=v*r; */
         /* leading to the use of an uninitialised var below */

         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_SystemMatrix_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp<n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp<np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
         #ifdef ESYS_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         tau_old=tau;
         tau=sum_1;
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp, beta)
         {
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp<n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp<np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, p, PASO_ZERO, v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp<n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp<np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
         #ifdef ESYS_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         delta=sum_2;
         alpha=tau/delta;
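         /*
          * In standard PCG terms: tau = r.(M^-1 r) is the preconditioned
          * residual inner product (v = M^-1 r from the preconditioner call
          * above), delta = p.(A p), and alpha = tau/delta is the step length
          * along the search direction p; beta = tau/tau_old above is the
          * usual direction-update coefficient.
          */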

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
            {
               ss=0;
               ss1=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp<n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp<np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
            #ifdef ESYS_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
            #endif
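            /*
             * Smoothing step: r is now the updated (true) residual and rs the
             * smoothed residual. With sum_3 = ||r-rs||^2 and sum_4 = (r-rs).rs,
             * gamma_1 = -sum_4/sum_3 minimises ||gamma_2*rs + gamma_1*r|| for
             * gamma_2 = 1-gamma_1, i.e. a least-squares combination of the old
             * smoothed residual and the new residual; x and x2 are combined
             * with the same weights below.
             */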
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
            {
               gamma_1=( (ABS(sum_3)<=PASO_ZERO) ? 0 : -sum_4/sum_3 );
               gamma_2=PASO_ONE-gamma_1;
               ss=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp<n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp<np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
            #ifdef ESYS_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            #endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter > maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* free the work arrays on all exit paths, including allocation failure */
   TMPMEMFREE(rs);
   TMPMEMFREE(x2);
   TMPMEMFREE(v);
   TMPMEMFREE(p);
   /* End of PCG */
   return status;
}