/[escript]/trunk/paso/src/PCG.c


Revision 1974 - Thu Nov 6 02:40:10 2008 UTC by jfenwick
File MIME type: text/plain
File size: 13611 byte(s)
Changing ZERO and ONE in Solver.h to PASO_ZERO and PASO_ONE.
Changed three static vars to #defines
/*******************************************************
 *
 * Copyright (c) 2003-2008 by University of Queensland
 * Earth Systems Science Computational Center (ESSCC)
 * http://www.uq.edu.au/esscc
 *
 * Primary Business: Queensland, Australia
 * Licensed under the Open Software License version 3.0
 * http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/


/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
 *
 * Purpose
 * =======
 *
 * PCG solves the linear system A*x = b using the
 * preconditioned conjugate gradient method plus a residual smoother.
 * A has to be symmetric.
 *
 * Convergence test: norm( b - A*x ) < TOL.
 *
 * Arguments
 * =========
 *
 * r (input/output) DOUBLE PRECISION array, dimension N.
 *   On entry, the residual b - A*x of the initial guess x.
 *   On exit, the residual of the last (unsmoothed) CG iterate.
 *
 * x (input/output) DOUBLE PRECISION array, dimension N.
 *   On input, the initial guess.
 *   On output, the (smoothed) approximate solution.
 *
 * ITER (input/output) INT
 *   On input, the maximum number of iterations to be performed.
 *   On output, the actual number of iterations performed.
 *
 * TOLERANCE (input/output) DOUBLE PRECISION
 *   On input, the convergence tolerance TOL.
 *   On output, the norm of the (smoothed) residual that was achieved.
 *
 * Return value (err_t)
 *
 * = SOLVER_NO_ERROR: successful exit, iterated approximate solution returned.
 * = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
 * = SOLVER_INPUT_ERROR: illegal parameter (e.g. a negative dimension).
 * = SOLVER_BREAKDOWN: the scalars delta = <p,A*p> or tau = <v,r> became numerically zero.
 * = SOLVER_MEMORY_ERROR: allocation of the work arrays failed.
 *
 * ==============================================================
 */
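
/*
 * Sketch of one iteration of the scheme implemented below, using the variable
 * names of this file (M denotes the preconditioner applied by
 * Paso_Solver_solvePreconditioner):
 *
 *    v     = M^{-1} r                              (preconditioner solve)
 *    tau   = <v, r>
 *    beta  = tau / tau_old                         (p = v in the first iteration)
 *    p     = v + beta * p
 *    v     = A p
 *    delta = <p, v>,  alpha = tau / delta
 *    r     = r - alpha * v,  x2 = x2 + alpha * p   (plain CG update)
 *    rs    = gamma_2 * rs + gamma_1 * r            (residual smoothing)
 *    x     = gamma_2 * x  + gamma_1 * x2
 *
 * where gamma_1 is chosen to minimise ||rs||_2 and gamma_2 = 1 - gamma_1.
 * Convergence is tested on the smoothed residual norm ||rs||_2.
 */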

/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
    Paso_SystemMatrix * A,
    double * r,
    double * x,
    dim_t *iter,
    double * tolerance,
    Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0, maxit, num_iter_global, chunk_size=-1, len, rest, np, ipp;
   register double ss, ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old, tau, beta, delta, gamma_1, gamma_2, alpha, sum_1, sum_2, sum_3, sum_4, sum_5, tol;
#ifdef PASO_MPI
   double loc_sum[2], sum[2];
#endif
   double norm_of_residual=0., norm_of_residual_global;
   register double d;

   /* Should not be any executable code before this ifdef */

#ifdef USE_DYNAMIC_SCHEDULING

   /* Watch out for these declarations (see above) */
   char* chksz_chr;
   dim_t n_chunks;

   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d", &chunk_size);
   np=omp_get_max_threads();
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
#else
#ifdef _OPENMP
   np=omp_get_max_threads();
#else
   /* OpenMP is not available: use a single static block */
   np=1;
#endif
   len=n/np;
   rest=n-len*np;
#endif
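   /* Work distribution: with dynamic scheduling the rows are split into chunks of
      chunk_size (read from the environment variable PASO_CHUNK_SIZE_PCG and clipped
      to [1, n/np]) that are handed out to the threads on demand; otherwise thread
      ipp statically owns the contiguous row block
         [ len*ipp + MIN(ipp,rest) , len*(ipp+1) + MIN(ipp+1,rest) )
      so that the first `rest` threads receive one extra row each. */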
   /* */
   /*-----------------------------------------------------------------*/
   /* */
   /*   Start of Calculation :                                         */
   /*   ---------------------                                          */
   /* */
   /* */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
#ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
#else
         #pragma omp for schedule(static)
         for (ipp=0; ipp < np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
#ifdef USE_DYNAMIC_SCHEDULING
         }
#else
         }
#endif
      }
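      /* From here on x2 and r carry the plain CG iterate and its residual, while
         x and rs carry the smoothed iterate and its residual; the convergence
         test below is applied to rs. */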
      num_iter=0;

      /* PGH */
      /* without this we get a use of an uninitialised var below */
      tau = 0;

      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);

         /* PGH */
         /* The next lines were commented out before I got here */
         /* v=prec(r) */
         /* tau=v*r; */
         /* leading to the use of an uninitialised var below */

         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
#ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            the OMP master of each process participates in an MPI reduction to get the global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
         tau_old=tau;
         tau=sum_1;
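         /* tau = <v, r> with v = M^{-1}*r from the preconditioner solve; the ratio
            beta = tau/tau_old used below is the standard preconditioned CG update
            of the search direction p. */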
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp, beta)
         {
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, p, PASO_ZERO, v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
#ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
         delta=sum_2;
         alpha=tau/delta;
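         /* delta = <p, A*p>, so alpha = tau/delta is the usual CG step length;
            a delta of (numerically) zero indicates breakdown and is caught by
            the test below. */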

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
            {
               ss=0;
               ss1=0;
#ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
#else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
#ifdef USE_DYNAMIC_SCHEDULING
               }
#else
               }
#endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
#ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
#endif
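            /* Smoothing step: with d = r - rs, sum_3 = ||d||^2 and sum_4 = <d, rs>,
               the choice gamma_1 = -sum_4/sum_3 minimises ||rs + gamma_1*d||_2,
               i.e. the 2-norm of the smoothed residual updated below;
               gamma_2 = 1 - gamma_1 keeps x consistent with rs. */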
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= PASO_ZERO) ? 0 : -sum_4/sum_3);
               gamma_2= PASO_ONE-gamma_1;
               ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
#else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
#ifdef USE_DYNAMIC_SCHEDULING
               }
#else
               }
#endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
#ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter == maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
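            /* The iteration stops when the smoothed residual norm ||rs|| meets tol,
               when maxit iterations have been performed, or when tau (like delta
               above) has collapsed numerically, which is reported as breakdown. */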
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}
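
/*
 * Calling sketch (an illustration only, with assumptions): given an assembled,
 * symmetric Paso_SystemMatrix* A, a right-hand side b, an initial guess x and a
 * Paso_Performance* pp, a caller would first form the residual r = b - A*x and
 * then hand it to Paso_Solver_PCG. The matrix-vector call below is assumed to
 * follow the y = alpha*A*x + beta*y convention used inside this file.
 *
 *    dim_t   i, n = Paso_SystemMatrix_getTotalNumRows(A);
 *    dim_t   iter = 1000;      maximum iterations in, iterations used out
 *    double  tol  = 1.e-8;     tolerance in, achieved residual norm out
 *    double* r    = TMPMEMALLOC(n, double);
 *    for (i = 0; i < n; ++i) r[i] = b[i];
 *    Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-PASO_ONE, A, x, PASO_ONE, r);
 *    if (Paso_Solver_PCG(A, r, x, &iter, &tol, pp) == SOLVER_NO_ERROR) {
 *       ... x now holds the smoothed approximate solution ...
 *    }
 *    TMPMEMFREE(r);
 */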

