/[escript]/trunk/paso/src/PCG.c


Revision 1579
Mon Jun 2 08:48:36 2008 UTC by ksteube
File MIME type: text/plain
File size: 13206 byte(s)
Fix typo in PCG.c
Improvement of build options for Savanna, ac.apac.edu, shake71

/* $Id$ */

/*******************************************************
 *
 *  Copyright 2003-2007 by ACceSS MNRF
 *  Copyright 2007 by University of Queensland
 *
 *  http://esscc.uq.edu.au
 *  Primary Business: Queensland, Australia
 *  Licensed under the Open Software License version 3.0
 *  http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/

/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
 * Purpose
 * =======
 *
 * PCG solves the linear system A*x = b using the preconditioned
 * conjugate gradient method combined with residual smoothing.
 * A has to be symmetric.
 *
 * Convergence test: norm(b - A*x) < TOL.
 *
 * Arguments
 * =========
 *
 * r (input) DOUBLE PRECISION array, dimension N.
 *   On entry, the residual of the initial guess x.
 *
 * x (input/output) DOUBLE PRECISION array, dimension N.
 *   On input, the initial guess.
 *
 * ITER (input/output) INT
 *   On input, the maximum number of iterations to be performed.
 *   On output, the actual number of iterations performed.
 *
 * TOLERANCE (input/output) DOUBLE PRECISION
 *   On input, the convergence tolerance on the residual norm.
 *   On output, the residual norm that was achieved.
 *
 * Return value (err_t):
 *   = SOLVER_NO_ERROR: successful exit, iterated approximate solution returned.
 *   = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
 *   = SOLVER_INPUT_ERROR: illegal parameter.
 *   = SOLVER_BREAKDOWN: the iteration broke down (tau or delta became too small).
 *   = SOLVER_MEMORY_ERROR: the required workspace could not be allocated.
 *
 * ==============================================================
 */

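/*
 * Reader's sketch of one iteration of the smoothed PCG loop implemented below
 * (summarised from the code, not part of the original source; "." denotes the
 * dot product):
 *
 *   v       = prec(r)                         apply the preconditioner
 *   tau     = v.r,   beta = tau/tau_old
 *   p       = v + beta*p                      (p = v in the first iteration)
 *   v       = A*p,   delta = p.v,   alpha = tau/delta
 *   r       = r - alpha*v
 *   x2      = x2 + alpha*p                    plain CG iterate
 *   gamma_1 = -((r-rs).rs)/((r-rs).(r-rs)),   gamma_2 = 1 - gamma_1
 *   rs      = gamma_2*rs + gamma_1*r          smoothed residual
 *   x       = gamma_2*x  + gamma_1*x2         smoothed (returned) iterate
 *
 * The convergence test uses the 2-norm of the smoothed residual rs.
 */
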
/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
          Paso_SystemMatrix * A,
          double * r,
          double * x,
          dim_t *iter,
          double * tolerance,
          Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0, maxit, num_iter_global, chunk_size=-1, len, rest, n_chunks, np, ipp;
   register double ss, ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old, tau, beta, delta, gamma_1, gamma_2, alpha, sum_1, sum_2, sum_3, sum_4, sum_5, tol;
   double norm_of_residual, norm_of_residual_global, loc_sum[2], sum[2];
   register double r_tmp, d, rs_tmp, x2_tmp, x_tmp;
   char* chksz_chr;
   #ifdef _OPENMP
   np=omp_get_max_threads();
   #else
   np=1;   /* single thread when OpenMP is not available */
   #endif

   #ifdef USE_DYNAMIC_SCHEDULING
   /* chunk size for the dynamically scheduled loops; can be set via the
      environment variable PASO_CHUNK_SIZE_PCG and is clipped to [1, n/np] */
   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
   #else
   /* static partitioning: each thread gets len rows, the first rest threads get one extra row */
   len=n/np;
   rest=n-len*np;
   #endif
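   /*
    * Worked example of the static partition used below (reader's note, not part
    * of the original source): with n=10 rows and np=4 threads, len=2 and rest=2,
    * so thread ipp owns rows [len*ipp+MIN(ipp,rest), len*(ipp+1)+MIN(ipp+1,rest)),
    * i.e. [0,3), [3,6), [6,8), [8,10). Under USE_DYNAMIC_SCHEDULING the rows are
    * instead split into n_chunks blocks of chunk_size rows that OpenMP hands out
    * dynamically.
    */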
   /*                                                                 */
   /*-----------------------------------------------------------------*/
   /*                                                                 */
   /*   Start of Calculation :                                        */
   /*   ---------------------                                         */
   /*                                                                 */
   /*                                                                 */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
         #ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
             istart=chunk_size*ipp;
             iend=MIN(istart+chunk_size,n);
         #else
         #pragma omp for schedule(static)
         for (ipp=0; ipp < np; ++ipp) {
             istart=len*ipp+MIN(ipp,rest);
             iend=len*(ipp+1)+MIN(ipp+1,rest);
         #endif
             #pragma ivdep
             for (i0=istart;i0<iend;i0++) {
                rs[i0]=r[i0];
                x2[i0]=x[i0];
                p[i0]=0;
                v[i0]=0;
             }
         #ifdef USE_DYNAMIC_SCHEDULING
         }
         #else
         }
         #endif
      }
      num_iter=0;
      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);
         /* v=prec(r) */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);
         /* tau=v*r */
         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
                istart=chunk_size*ipp;
                iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
                istart=len*ipp+MIN(ipp,rest);
                iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
                #pragma ivdep
                for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
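         /*
          * Reader's note (not part of the original source): the dot products in
          * this routine use a hand-rolled reduction -- each thread accumulates
          * into its private ss (and ss1) and adds the result to the shared sum
          * inside an omp critical section -- instead of an OpenMP reduction
          * clause, so the same loop body works for both the static partition and
          * the dynamically scheduled chunk loop. The same pattern is repeated
          * for sum_2 ... sum_5 below.
          */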
         #ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         tau_old=tau;
         tau=sum_1;
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp, beta)
         {
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
                istart=chunk_size*ipp;
                iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
                istart=len*ipp+MIN(ipp,rest);
                iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
                if (num_iter==1) {
                   #pragma ivdep
                   for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
                } else {
                   beta=tau/tau_old;
                   #pragma ivdep
                   for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
                }
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, p, ZERO, v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
                istart=chunk_size*ipp;
                iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
                istart=len*ipp+MIN(ipp,rest);
                iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
                #pragma ivdep
                for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
         #ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         delta=sum_2;
         alpha=tau/delta;

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
            {
               ss=0;
               ss1=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                   istart=chunk_size*ipp;
                   iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                   istart=len*ipp+MIN(ipp,rest);
                   iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                   #pragma ivdep
                   for (i0=istart;i0<iend;i0++) {
                      r[i0]-=alpha*v[i0];
                      d=r[i0]-rs[i0];
                      ss+=d*d;
                      ss1+=d*rs[i0];
                   }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
            #endif
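            /*
             * Reader's note (not part of the original source): with d = r - rs,
             * sum_3 = d.d and sum_4 = d.rs, so gamma_1 = -sum_4/sum_3 is the value
             * that minimizes || rs + gamma_1*d ||_2. The smoothed residual
             * rs = gamma_2*rs + gamma_1*r therefore has the smallest 2-norm of all
             * points on the line between the old rs and the current CG residual r,
             * and x is combined with x2 using the same weights.
             */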
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ;
               gamma_2= ONE-gamma_1;
               ss=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                   istart=chunk_size*ipp;
                   iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                   istart=len*ipp+MIN(ipp,rest);
                   iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                   #pragma ivdep
                   for (i0=istart;i0<iend;i0++) {
                      rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                      x2[i0]+=alpha*p[i0];
                      x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                      ss+=rs[i0]*rs[i0];
                   }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            #endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter == maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}
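
/*
 * Hypothetical usage sketch (reader's note, not part of the original source).
 * It assumes a fully assembled Paso_SystemMatrix A with a preconditioner set up
 * and a performance monitor pp; the values for iter and tol are made up:
 *
 *   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
 *   double *x = TMPMEMALLOC(n,double);   // initial guess, e.g. all zeros
 *   double *r = TMPMEMALLOC(n,double);   // residual b - A*x of that guess
 *   dim_t iter = 1000;                   // in: maximum iterations, out: iterations used
 *   double tol = 1.e-8;                  // in: tolerance, out: achieved residual norm
 *   err_t status = Paso_Solver_PCG(A, r, x, &iter, &tol, pp);
 *   // on SOLVER_NO_ERROR, x holds the approximate solution, iter the number of
 *   // iterations used and tol the residual norm that was achieved
 */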

Properties

Name            Value
svn:eol-style   native
svn:keywords    Author Date Id Revision
