/[escript]/branches/doubleplusgood/paso/src/PCG.cpp
Revision 4280
Wed Mar 6 06:45:32 2013 UTC by jfenwick
File size: 13741 byte(s)
some memory management replacement

/*****************************************************************************
*
* Copyright (c) 2003-2013 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development since 2012 by School of Earth Sciences
*
*****************************************************************************/


/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef ESYS_MPI
#include <mpi.h>
#endif

/*
*
* Purpose
* =======
*
* PCG solves the linear system A*x = b using the
* preconditioned conjugate gradient method plus a residual smoother.
* A has to be symmetric.
*
* Convergence test: norm( b - A*x ) < TOL.
*
* Arguments
* =========
*
* r (input/output) DOUBLE PRECISION array, dimension N.
* On entry, the residual of the initial guess x; overwritten during the iteration.
*
* x (input/output) DOUBLE PRECISION array, dimension N.
* On entry, the initial guess; on exit, the iterated approximate solution.
*
* ITER (input/output) INT
* On entry, the maximum number of iterations to be performed.
* On exit, the actual number of iterations performed.
*
* TOLERANCE (input/output) DOUBLE PRECISION
* On entry, the convergence tolerance. On exit, the norm of the (smoothed) residual.
*
* Return value (err_t):
*
* = SOLVER_NO_ERROR: successful exit, iterated approximate solution returned.
* = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
* = SOLVER_INPUT_ERROR: illegal parameter.
* = SOLVER_BREAKDOWN: the scalars delta or tau became too small to continue.
* = SOLVER_MEMORY_ERROR: the work arrays could not be allocated.
*
* ==============================================================
*/
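/*
* Usage sketch (illustration only, not part of the library): the system
* matrix A, the vectors r and x, and the performance monitor pp are assumed
* to have been set up by the calling driver.
*
*    dim_t iter = 1000;   // in: maximum number of iterations, out: iterations used
*    double tol = 1.e-8;  // in: convergence tolerance, out: achieved residual norm
*    // r must hold the residual b - A*x of the initial guess x before the call
*    err_t err = Paso_Solver_PCG(A, r, x, &iter, &tol, pp);
*    if (err != SOLVER_NO_ERROR) {
*        // handle SOLVER_MAXITER_REACHED, SOLVER_BREAKDOWN, ...
*    }
*/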

/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
          Paso_SystemMatrix * A,
          double * r,
          double * x,
          dim_t *iter,
          double * tolerance,
          Paso_Performance* pp) {

    /* Local variables */
    dim_t num_iter=0, maxit, num_iter_global, len, rest, np, ipp;
#ifdef USE_DYNAMIC_SCHEDULING
    dim_t chunk_size=-1;
#endif
    register double ss, ss1;
    dim_t i0, istart, iend;
    bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
    err_t status = SOLVER_NO_ERROR;
    dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
    double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
    double tau_old, tau, beta, delta, gamma_1, gamma_2, alpha, sum_1, sum_2, sum_3, sum_4, sum_5, tol;
#ifdef ESYS_MPI
    double loc_sum[2], sum[2];
#endif
    double norm_of_residual=0, norm_of_residual_global;
    register double d;

    /* There should not be any executable code before this ifdef */

#ifdef USE_DYNAMIC_SCHEDULING

    /* Watch out for these declarations (see above) */
    char* chksz_chr;
    dim_t n_chunks;

    chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
    if (chksz_chr!=NULL) sscanf(chksz_chr, "%d", &chunk_size);
    np=omp_get_max_threads();
    chunk_size=MIN(MAX(1,chunk_size), n/np);
    n_chunks=n/chunk_size;
    if (n_chunks*chunk_size<n) n_chunks+=1;
#else
    np=omp_get_max_threads();
    len=n/np;
    rest=n-len*np;
#endif
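
    /* Note on the work distribution: without dynamic scheduling each of the
       np threads handles one contiguous block of rows, the first 'rest'
       threads getting one extra row; for example, n=10 and np=3 give len=3,
       rest=1 and the ranges [0,4), [4,7), [7,10). With dynamic scheduling
       the rows are cut into n_chunks pieces of at most chunk_size rows each,
       handed out to threads on demand. */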
    /* */
    /*-----------------------------------------------------------------*/
    /* */
    /*   Start of Calculation :                                         */
    /*   ---------------------                                          */
    /* */
    /* */
    rs=new double[n];
    p=new double[n];
    v=new double[n];
    x2=new double[n];

    /* Test the input parameters. */

    if (n < 0) {
        status = SOLVER_INPUT_ERROR;
    } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
        status = SOLVER_MEMORY_ERROR;
    } else {
        maxit = *iter;
        tol = *resid;
        Performance_startMonitor(pp,PERFORMANCE_SOLVER);
        /* initialize data */
#pragma omp parallel private(i0, istart, iend, ipp)
        {
#ifdef USE_DYNAMIC_SCHEDULING
#pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
                istart=chunk_size*ipp;
                iend=MIN(istart+chunk_size,n);
#else
#pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
                istart=len*ipp+MIN(ipp,rest);
                iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
#pragma ivdep
                for (i0=istart;i0<iend;i0++) {
                    rs[i0]=r[i0];
                    x2[i0]=x[i0];
                    p[i0]=0;
                    v[i0]=0;
                }
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
        }
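
        /* at this point rs and x2 hold copies of the initial residual and
           initial guess; during the iteration r and x2 carry the plain CG
           residual and iterate while rs and x carry their smoothed
           counterparts */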
        num_iter=0;

        /* PGH */
        /* without this we get a use of an uninitialised var below */
        tau = 0;

        /* start of iteration */
        while (!(convergeFlag || maxIterFlag || breakFlag)) {
            ++(num_iter);

            /* PGH */
            /* The next lines were commented out before I got here */
            /* v=prec(r) */
            /* tau=v*r; */
            /* leading to the use of an uninitialised var below */

            Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
            Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
            Paso_SystemMatrix_solvePreconditioner(A,v,r);
            Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
            Performance_startMonitor(pp,PERFORMANCE_SOLVER);

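            /* tau = <v, r>: inner product of the preconditioned residual v
               with the residual r, accumulated per thread in ss and combined
               into sum_1 (and across MPI ranks below) */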
            sum_1 = 0;
#pragma omp parallel private(i0, istart, iend, ipp, ss)
            {
                ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
#pragma omp for schedule(dynamic, 1)
                for (ipp=0; ipp < n_chunks; ++ipp) {
                    istart=chunk_size*ipp;
                    iend=MIN(istart+chunk_size,n);
#else
#pragma omp for schedule(static)
                for (ipp=0; ipp < np; ++ipp) {
                    istart=len*ipp+MIN(ipp,rest);
                    iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
#pragma ivdep
                    for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
#ifdef USE_DYNAMIC_SCHEDULING
                }
#else
                }
#endif
#pragma omp critical
                {
                    sum_1+=ss;
                }
            }
#ifdef ESYS_MPI
            /* In case we have many MPI processes, each of which may have several OMP threads:
               OMP master participates in an MPI reduction to get global sum_1 */
            loc_sum[0] = sum_1;
            MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
            tau_old=tau;
            tau=sum_1;
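            /* beta = tau/tau_old is the usual PCG update for the search
               direction; on the first iteration p is simply set to v */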
            /* p=v+beta*p */
#pragma omp parallel private(i0, istart, iend, ipp, beta)
            {
#ifdef USE_DYNAMIC_SCHEDULING
#pragma omp for schedule(dynamic, 1)
                for (ipp=0; ipp < n_chunks; ++ipp) {
                    istart=chunk_size*ipp;
                    iend=MIN(istart+chunk_size,n);
#else
#pragma omp for schedule(static)
                for (ipp=0; ipp < np; ++ipp) {
                    istart=len*ipp+MIN(ipp,rest);
                    iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
                    if (num_iter==1) {
#pragma ivdep
                        for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
                    } else {
                        beta=tau/tau_old;
#pragma ivdep
                        for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
                    }
#ifdef USE_DYNAMIC_SCHEDULING
                }
#else
                }
#endif
            }
            /* v=A*p */
            Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
            Performance_startMonitor(pp,PERFORMANCE_MVM);
            Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(PASO_ONE, A, p, PASO_ZERO, v);
            Performance_stopMonitor(pp,PERFORMANCE_MVM);
            Performance_startMonitor(pp,PERFORMANCE_SOLVER);

            /* delta=p*v */
            sum_2 = 0;
#pragma omp parallel private(i0, istart, iend, ipp, ss)
            {
                ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
#pragma omp for schedule(dynamic, 1)
                for (ipp=0; ipp < n_chunks; ++ipp) {
                    istart=chunk_size*ipp;
                    iend=MIN(istart+chunk_size,n);
#else
#pragma omp for schedule(static)
                for (ipp=0; ipp < np; ++ipp) {
                    istart=len*ipp+MIN(ipp,rest);
                    iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
#pragma ivdep
                    for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
#ifdef USE_DYNAMIC_SCHEDULING
                }
#else
                }
#endif
#pragma omp critical
                {
                    sum_2+=ss;
                }
            }
#ifdef ESYS_MPI
            loc_sum[0] = sum_2;
            MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
            delta=sum_2;
            alpha=tau/delta;
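            /* alpha = <r, M^-1 r> / <p, A p> is the CG step length; it is only
               used when delta passes the breakdown test below */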

            if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
                /* smoother */
                sum_3 = 0;
                sum_4 = 0;
#pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
                {
                    ss=0;
                    ss1=0;
#ifdef USE_DYNAMIC_SCHEDULING
#pragma omp for schedule(dynamic, 1)
                    for (ipp=0; ipp < n_chunks; ++ipp) {
                        istart=chunk_size*ipp;
                        iend=MIN(istart+chunk_size,n);
#else
#pragma omp for schedule(static)
                    for (ipp=0; ipp < np; ++ipp) {
                        istart=len*ipp+MIN(ipp,rest);
                        iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
#pragma ivdep
                        for (i0=istart;i0<iend;i0++) {
                            r[i0]-=alpha*v[i0];
                            d=r[i0]-rs[i0];
                            ss+=d*d;
                            ss1+=d*rs[i0];
                        }
#ifdef USE_DYNAMIC_SCHEDULING
                    }
#else
                    }
#endif
#pragma omp critical
                    {
                        sum_3+=ss;
                        sum_4+=ss1;
                    }
                }
#ifdef ESYS_MPI
                loc_sum[0] = sum_3;
                loc_sum[1] = sum_4;
                MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
                sum_3=sum[0];
                sum_4=sum[1];
#endif
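                /* residual smoothing: with d = r - rs, gamma_1 = -<d,rs>/<d,d>
                   minimises || (1-gamma)*rs + gamma*r ||_2 over gamma; rs and x
                   are blended accordingly below, with gamma_2 = 1 - gamma_1 */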
                sum_5 = 0;
#pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
                {
                    gamma_1 = ( (ABS(sum_3) <= PASO_ZERO) ? 0 : -sum_4/sum_3 );
                    gamma_2 = PASO_ONE-gamma_1;
                    ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
#pragma omp for schedule(dynamic, 1)
                    for (ipp=0; ipp < n_chunks; ++ipp) {
                        istart=chunk_size*ipp;
                        iend=MIN(istart+chunk_size,n);
#else
#pragma omp for schedule(static)
                    for (ipp=0; ipp < np; ++ipp) {
                        istart=len*ipp+MIN(ipp,rest);
                        iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
#pragma ivdep
                        for (i0=istart;i0<iend;i0++) {
                            rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                            x2[i0]+=alpha*p[i0];
                            x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                            ss+=rs[i0]*rs[i0];
                        }
#ifdef USE_DYNAMIC_SCHEDULING
                    }
#else
                    }
#endif
#pragma omp critical
                    {
                        sum_5+=ss;
                    }
                }
#ifdef ESYS_MPI
                loc_sum[0] = sum_5;
                MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
                norm_of_residual=sqrt(sum_5);
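                /* termination tests: convergence once the smoothed residual
                   norm drops below tol, failure when maxit is exceeded, and
                   breakdown when tau becomes numerically zero */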
                convergeFlag = norm_of_residual <= tol;
                maxIterFlag = num_iter > maxit;
                breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
            }
        }
        /* end of iteration */
        num_iter_global=num_iter;
        norm_of_residual_global=norm_of_residual;
        if (maxIterFlag) {
            status = SOLVER_MAXITER_REACHED;
        } else if (breakFlag) {
            status = SOLVER_BREAKDOWN;
        }
        Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
        *iter=num_iter_global;
        *resid=norm_of_residual_global;
    }
    /* free the work arrays (reached on the error paths as well) */
    delete[] rs;
    delete[] x2;
    delete[] v;
    delete[] p;
    /* End of PCG */
    return status;
}
