/[escript]/trunk/paso/src/PCG.c
ViewVC logotype

Contents of /trunk/paso/src/PCG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1579 - (show annotations)
Mon Jun 2 08:48:36 2008 UTC (11 years, 4 months ago) by ksteube
File MIME type: text/plain
File size: 13206 byte(s)
Fix typo in PCG.c
Improvement of build options for Savanna, ac.apac.edu, shake71

1
2 /* $Id$ */
3
4 /*******************************************************
5 *
6 * Copyright 2003-2007 by ACceSS MNRF
7 * Copyright 2007 by University of Queensland
8 *
9 * http://esscc.uq.edu.au
10 * Primary Business: Queensland, Australia
11 * Licensed under the Open Software License version 3.0
12 * http://www.opensource.org/licenses/osl-3.0.php
13 *
14 *******************************************************/
15
16 /* PCG iterations */
17
18 #include "SystemMatrix.h"
19 #include "Paso.h"
20 #include "Solver.h"
21
22 #ifdef _OPENMP
23 #include <omp.h>
24 #endif
25
26 #ifdef PASO_MPI
27 #include <mpi.h>
28 #endif
29
30 /*
31 *
32 * Purpose
33 * =======
34 *
35 * PCG solves the linear system A*x = b using the
36 * preconditioned conjugate gradient method plus a smoother
37 * A has to be symmetric.
38 *
39 * Convergence test: norm( b - A*x )< TOL.
40 * For other measures, see the above reference.
41 *
42 * Arguments
43 * =========
44 *
45 * r (input) DOUBLE PRECISION array, dimension N.
46 * On entry, residual of inital guess x
47 *
48 * x (input/output) DOUBLE PRECISION array, dimension N.
49 * On input, the initial guess.
50 *
51 * ITER (input/output) INT
52 * On input, the maximum iterations to be performed.
53 * On output, actual number of iterations performed.
54 *
55 * INFO (output) INT
56 *
57 * = SOLVER_NO_ERROR: Successful exit. Iterated approximate solution returned.
58 * = SOLVEr_MAXITER_REACHED
59 * = SOLVER_INPUT_ERROR Illegal parameter:
60 * = SOLVEr_BREAKDOWN: If parameters rHO or OMEGA become smaller
61 * = SOLVER_MEMORY_ERROR : If parameters rHO or OMEGA become smaller
62 *
63 * ==============================================================
64 */
65
66 /* #define PASO_DYNAMIC_SCHEDULING_MVM */
67
68 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP
69 #define USE_DYNAMIC_SCHEDULING
70 #endif
71
72 err_t Paso_Solver_PCG(
73 Paso_SystemMatrix * A,
74 double * r,
75 double * x,
76 dim_t *iter,
77 double * tolerance,
78 Paso_Performance* pp) {
79
80 /* Local variables */
81 dim_t num_iter=0,maxit,num_iter_global, chunk_size=-1, len,rest, n_chunks, np, ipp;
82 register double ss,ss1;
83 dim_t i0, istart, iend;
84 bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
85 err_t status = SOLVER_NO_ERROR;
86 dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
87 double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL ;
88 double tau_old,tau,beta,delta,gamma_1,gamma_2,alpha,sum_1,sum_2,sum_3,sum_4,sum_5,tol;
89 double norm_of_residual,norm_of_residual_global, loc_sum[2], sum[2];
90 register double r_tmp,d,rs_tmp,x2_tmp,x_tmp;
91 char* chksz_chr;
92 np=omp_get_max_threads();
93
94 #ifdef USE_DYNAMIC_SCHEDULING
95 chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
96 if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
97 chunk_size=MIN(MAX(1,chunk_size),n/np);
98 n_chunks=n/chunk_size;
99 if (n_chunks*chunk_size<n) n_chunks+=1;
100 #else
101 len=n/np;
102 rest=n-len*np;
103 #endif
104 /* */
105 /*-----------------------------------------------------------------*/
106 /* */
107 /* Start of Calculation : */
108 /* --------------------- */
109 /* */
110 /* */
111 rs=TMPMEMALLOC(n,double);
112 p=TMPMEMALLOC(n,double);
113 v=TMPMEMALLOC(n,double);
114 x2=TMPMEMALLOC(n,double);
115
116 /* Test the input parameters. */
117
118 if (n < 0) {
119 status = SOLVER_INPUT_ERROR;
120 } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
121 status = SOLVER_MEMORY_ERROR;
122 } else {
123 maxit = *iter;
124 tol = *resid;
125 Performance_startMonitor(pp,PERFORMANCE_SOLVER);
126 /* initialize data */
127 #pragma omp parallel private(i0, istart, iend, ipp)
128 {
129 #ifdef USE_DYNAMIC_SCHEDULING
130 #pragma omp for schedule(dynamic, 1)
131 for (ipp=0; ipp < n_chunks; ++ipp) {
132 istart=chunk_size*ipp;
133 iend=MIN(istart+chunk_size,n);
134 #else
135 #pragma omp for schedule(static)
136 for (ipp=0; ipp <np; ++ipp) {
137 istart=len*ipp+MIN(ipp,rest);
138 iend=len*(ipp+1)+MIN(ipp+1,rest);
139 #endif
140 #pragma ivdep
141 for (i0=istart;i0<iend;i0++) {
142 rs[i0]=r[i0];
143 x2[i0]=x[i0];
144 p[i0]=0;
145 v[i0]=0;
146 }
147 #ifdef USE_DYNAMIC_SCHEDULING
148 }
149 #else
150 }
151 #endif
152 }
153 num_iter=0;
154 /* start of iteration */
155 while (!(convergeFlag || maxIterFlag || breakFlag)) {
156 ++(num_iter);
157 /* v=prec(r) */
158 Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
159 Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
160 Paso_Solver_solvePreconditioner(A,v,r);
161 Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
162 Performance_startMonitor(pp,PERFORMANCE_SOLVER);
163 /* tau=v*r */
164 sum_1 = 0;
165 #pragma omp parallel private(i0, istart, iend, ipp, ss)
166 {
167 ss=0;
168 #ifdef USE_DYNAMIC_SCHEDULING
169 #pragma omp for schedule(dynamic, 1)
170 for (ipp=0; ipp < n_chunks; ++ipp) {
171 istart=chunk_size*ipp;
172 iend=MIN(istart+chunk_size,n);
173 #else
174 #pragma omp for schedule(static)
175 for (ipp=0; ipp <np; ++ipp) {
176 istart=len*ipp+MIN(ipp,rest);
177 iend=len*(ipp+1)+MIN(ipp+1,rest);
178 #endif
179 #pragma ivdep
180 for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
181 #ifdef USE_DYNAMIC_SCHEDULING
182 }
183 #else
184 }
185 #endif
186 #pragma omp critical
187 {
188 sum_1+=ss;
189 }
190 }
191 #ifdef PASO_MPI
192 /* In case we have many MPI processes, each of which may have several OMP threads:
193 OMP master participates in an MPI reduction to get global sum_1 */
194 loc_sum[0] = sum_1;
195 MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
196 #endif
197 tau_old=tau;
198 tau=sum_1;
199 /* p=v+beta*p */
200 #pragma omp parallel private(i0, istart, iend, ipp,beta)
201 {
202 #ifdef USE_DYNAMIC_SCHEDULING
203 #pragma omp for schedule(dynamic, 1)
204 for (ipp=0; ipp < n_chunks; ++ipp) {
205 istart=chunk_size*ipp;
206 iend=MIN(istart+chunk_size,n);
207 #else
208 #pragma omp for schedule(static)
209 for (ipp=0; ipp <np; ++ipp) {
210 istart=len*ipp+MIN(ipp,rest);
211 iend=len*(ipp+1)+MIN(ipp+1,rest);
212 #endif
213 if (num_iter==1) {
214 #pragma ivdep
215 for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
216 } else {
217 beta=tau/tau_old;
218 #pragma ivdep
219 for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
220 }
221 #ifdef USE_DYNAMIC_SCHEDULING
222 }
223 #else
224 }
225 #endif
226 }
227 /* v=A*p */
228 Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
229 Performance_startMonitor(pp,PERFORMANCE_MVM);
230 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, p,ZERO,v);
231 Performance_stopMonitor(pp,PERFORMANCE_MVM);
232 Performance_startMonitor(pp,PERFORMANCE_SOLVER);
233
234 /* delta=p*v */
235 sum_2 = 0;
236 #pragma omp parallel private(i0, istart, iend, ipp,ss)
237 {
238 ss=0;
239 #ifdef USE_DYNAMIC_SCHEDULING
240 #pragma omp for schedule(dynamic, 1)
241 for (ipp=0; ipp < n_chunks; ++ipp) {
242 istart=chunk_size*ipp;
243 iend=MIN(istart+chunk_size,n);
244 #else
245 #pragma omp for schedule(static)
246 for (ipp=0; ipp <np; ++ipp) {
247 istart=len*ipp+MIN(ipp,rest);
248 iend=len*(ipp+1)+MIN(ipp+1,rest);
249 #endif
250 #pragma ivdep
251 for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
252 #ifdef USE_DYNAMIC_SCHEDULING
253 }
254 #else
255 }
256 #endif
257 #pragma omp critical
258 {
259 sum_2+=ss;
260 }
261 }
262 #ifdef PASO_MPI
263 loc_sum[0] = sum_2;
264 MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
265 #endif
266 delta=sum_2;
267 alpha=tau/delta;
268
269 if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
270 /* smoother */
271 sum_3 = 0;
272 sum_4 = 0;
273 #pragma omp parallel private(i0, istart, iend, ipp,d, ss, ss1)
274 {
275 ss=0;
276 ss1=0;
277 #ifdef USE_DYNAMIC_SCHEDULING
278 #pragma omp for schedule(dynamic, 1)
279 for (ipp=0; ipp < n_chunks; ++ipp) {
280 istart=chunk_size*ipp;
281 iend=MIN(istart+chunk_size,n);
282 #else
283 #pragma omp for schedule(static)
284 for (ipp=0; ipp <np; ++ipp) {
285 istart=len*ipp+MIN(ipp,rest);
286 iend=len*(ipp+1)+MIN(ipp+1,rest);
287 #endif
288 #pragma ivdep
289 for (i0=istart;i0<iend;i0++) {
290 r[i0]-=alpha*v[i0];
291 d=r[i0]-rs[i0];
292 ss+=d*d;
293 ss1+=d*rs[i0];
294 }
295 #ifdef USE_DYNAMIC_SCHEDULING
296 }
297 #else
298 }
299 #endif
300 #pragma omp critical
301 {
302 sum_3+=ss;
303 sum_4+=ss1;
304 }
305 }
306 #ifdef PASO_MPI
307 loc_sum[0] = sum_3;
308 loc_sum[1] = sum_4;
309 MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
310 sum_3=sum[0];
311 sum_4=sum[1];
312 #endif
313 sum_5 = 0;
314 #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1,gamma_2)
315 {
316 gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ;
317 gamma_2= ONE-gamma_1;
318 ss=0;
319 #ifdef USE_DYNAMIC_SCHEDULING
320 #pragma omp for schedule(dynamic, 1)
321 for (ipp=0; ipp < n_chunks; ++ipp) {
322 istart=chunk_size*ipp;
323 iend=MIN(istart+chunk_size,n);
324 #else
325 #pragma omp for schedule(static)
326 for (ipp=0; ipp <np; ++ipp) {
327 istart=len*ipp+MIN(ipp,rest);
328 iend=len*(ipp+1)+MIN(ipp+1,rest);
329 #endif
330 #pragma ivdep
331 for (i0=istart;i0<iend;i0++) {
332 rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
333 x2[i0]+=alpha*p[i0];
334 x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
335 ss+=rs[i0]*rs[i0];
336 }
337 #ifdef USE_DYNAMIC_SCHEDULING
338 }
339 #else
340 }
341 #endif
342 #pragma omp critical
343 {
344 sum_5+=ss;
345 }
346 }
347 #ifdef PASO_MPI
348 loc_sum[0] = sum_5;
349 MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
350 #endif
351 norm_of_residual=sqrt(sum_5);
352 convergeFlag = norm_of_residual <= tol;
353 maxIterFlag = num_iter == maxit;
354 breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
355 }
356 }
357 /* end of iteration */
358 num_iter_global=num_iter;
359 norm_of_residual_global=norm_of_residual;
360 if (maxIterFlag) {
361 status = SOLVER_MAXITER_REACHED;
362 } else if (breakFlag) {
363 status = SOLVER_BREAKDOWN;
364 }
365 Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
366 TMPMEMFREE(rs);
367 TMPMEMFREE(x2);
368 TMPMEMFREE(v);
369 TMPMEMFREE(p);
370 *iter=num_iter_global;
371 *resid=norm_of_residual_global;
372 }
373 /* End of PCG */
374 return status;
375 }

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26