Contents of /trunk/paso/src/PCG.c

Revision 2484 - Mon Jun 22 04:22:19 2009 UTC by gross
File MIME type: text/plain
File size: 13657 byte(s)
Log message: numarray removed from docu; Locator revised.

/*******************************************************
 *
 * Copyright (c) 2003-2008 by University of Queensland
 * Earth Systems Science Computational Center (ESSCC)
 * http://www.uq.edu.au/esscc
 *
 * Primary Business: Queensland, Australia
 * Licensed under the Open Software License version 3.0
 * http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/


/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
*
* Purpose
* =======
*
* PCG solves the linear system A*x = b using the
* preconditioned conjugate gradient method combined with a residual smoother.
* A has to be symmetric.
*
* Convergence test: norm( b - A*x ) < TOL, where TOL is the input tolerance.
*
* Arguments
* =========
*
* r (input/output) DOUBLE PRECISION array, dimension N.
* On entry, the residual of the initial guess x.
*
* x (input/output) DOUBLE PRECISION array, dimension N.
* On input, the initial guess. On output, the approximate solution.
*
* ITER (input/output) INT
* On input, the maximum number of iterations to be performed.
* On output, the actual number of iterations performed.
*
* TOLERANCE (input/output) DOUBLE PRECISION
* On input, the convergence tolerance TOL.
* On output, the norm of the (smoothed) residual.
*
* INFO (output) INT
*
* = SOLVER_NO_ERROR: Successful exit. Iterated approximate solution returned.
* = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
* = SOLVER_INPUT_ERROR: illegal parameter.
* = SOLVER_BREAKDOWN: a scalar (DELTA or TAU) became too small to continue.
* = SOLVER_MEMORY_ERROR: allocation of the work arrays failed.
*
* ==============================================================
*/
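
/*
 * Iteration carried out below (as read off the code; r and x2 hold the plain CG
 * residual and iterate, rs and x the smoothed ones used for the convergence test):
 *
 *    v       = prec(r)                             (apply preconditioner)
 *    tau     = <v, r>
 *    beta    = tau / tau_old       (first iteration: p = v)
 *    p       = v + beta * p
 *    v       = A * p
 *    delta   = <p, v>,   alpha = tau / delta
 *    r       = r - alpha * v,    x2 = x2 + alpha * p
 *    gamma_1 = -<r - rs, rs> / <r - rs, r - rs>,   gamma_2 = 1 - gamma_1
 *    rs      = gamma_2 * rs + gamma_1 * r
 *    x       = gamma_2 * x  + gamma_1 * x2
 *
 * and the convergence test is ||rs||_2 <= TOL.
 */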

/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
    Paso_SystemMatrix * A,
    double * r,
    double * x,
    dim_t *iter,
    double * tolerance,
    Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0,maxit,num_iter_global, len,rest, np, ipp;
   #ifdef USE_DYNAMIC_SCHEDULING
   dim_t chunk_size=-1;
   #endif
   register double ss,ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old,tau,beta,delta,gamma_1,gamma_2,alpha,sum_1,sum_2,sum_3,sum_4,sum_5,tol;
   #ifdef PASO_MPI
   double loc_sum[2], sum[2];
   #endif
   double norm_of_residual=0,norm_of_residual_global;
   register double d;

   /* Should not be any executable code before this ifdef */

   #ifdef USE_DYNAMIC_SCHEDULING

   /* Watch out for these declarations (see above) */
   char* chksz_chr;
   dim_t n_chunks;

   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
   np=omp_get_max_threads();
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
   #else
   #ifdef _OPENMP
   np=omp_get_max_threads();
   #else
   np=1; /* fall back to a single thread when not compiled with OpenMP */
   #endif
   len=n/np;
   rest=n-len*np;
   #endif
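
   /* In the loops below the rows are processed in blocks [istart,iend): with dynamic
      scheduling, block ipp is the chunk of chunk_size rows starting at chunk_size*ipp
      (chunk size read from the PASO_CHUNK_SIZE_PCG environment variable above); with
      static scheduling, thread ipp gets len=n/np rows, and the remaining rest rows are
      spread over the first rest threads via istart=len*ipp+MIN(ipp,rest). */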
   /*                                                                 */
   /*-----------------------------------------------------------------*/
   /*                                                                 */
   /*   Start of Calculation :                                        */
   /*   ---------------------                                         */
   /*                                                                 */
   /*                                                                 */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
         #ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
         #else
         #pragma omp for schedule(static)
         for (ipp=0; ipp < np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
         #endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
         #ifdef USE_DYNAMIC_SCHEDULING
         }
         #else
         }
         #endif
      }
      num_iter=0;

      /* PGH */
      /* without this we get a use of an uninitialised var below */
      tau = 0;

      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);

         /* PGH */
         /* The next lines were commented out before I got here */
         /* v=prec(r) */
         /* tau=v*r; */
         /* leading to the use of an uninitialised var below */

         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);
         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
         #ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         tau_old=tau;
         tau=sum_1;
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp, beta)
         {
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, p, PASO_ZERO, v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
         #ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         delta=sum_2;
         alpha=tau/delta;

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
            {
               ss=0;
               ss1=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
            #endif
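            /* Minimal-residual smoothing: with d = r - rs, the choice
               gamma_1 = -<d,rs>/<d,d> minimizes ||rs + gamma_1*d||_2 = ||gamma_2*rs + gamma_1*r||_2,
               so the smoothed residual rs (and, consistently, x) computed below is the
               least-norm combination of the previous smoothed quantities and the new CG update. */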
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= PASO_ZERO) ? 0 : -sum_4/sum_3 );
               gamma_2= PASO_ONE-gamma_1;
               ss=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            #endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter > maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}
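
For orientation, here is a minimal calling sketch (not part of PCG.c). It assumes the caller already has an assembled Paso_SystemMatrix A with a preconditioner set up on it, and a right-hand side b and initial guess x of length Paso_SystemMatrix_getTotalNumRows(A). It also assumes that Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(alpha, A, in, beta, out) computes out = alpha*A*in + beta*out, which matches its use for v = A*p above; the driver name solve_with_pcg is illustrative only.

/* Hypothetical driver (sketch only): solve A*x = b via Paso_Solver_PCG. */
static err_t solve_with_pcg(Paso_SystemMatrix* A, const double* b, double* x,
                            Paso_Performance* pp) {
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   dim_t iter = 1000;                   /* in: iteration limit, out: iterations used */
   double tol = 1.e-8;                  /* in: tolerance, out: achieved residual norm */
   double* r = TMPMEMALLOC(n, double);  /* residual work array */
   dim_t i;
   err_t status;

   if (r == NULL) return SOLVER_MEMORY_ERROR;

   /* initial residual r = b - A*x (assumes out = alpha*A*in + beta*out semantics) */
   for (i = 0; i < n; ++i) r[i] = b[i];
   Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-PASO_ONE, A, x, PASO_ONE, r);

   status = Paso_Solver_PCG(A, r, x, &iter, &tol, pp);   /* x is updated in place */

   TMPMEMFREE(r);
   return status;
}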
