/[escript]/branches/windows_from_1456_trunk_1580_merged_in/paso/src/PCG.c
Revision 1582 - Wed Jun 4 07:20:59 2008 UTC by phornby
Merge in trunk changes from 1574 to 1580.



/* $Id$ */

/*******************************************************
 *
 * Copyright 2003-2007 by ACceSS MNRF
 * Copyright 2007 by University of Queensland
 *
 * http://esscc.uq.edu.au
 * Primary Business: Queensland, Australia
 * Licensed under the Open Software License version 3.0
 * http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/

/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
*
*  Purpose
*  =======
*
*  PCG solves the linear system A*x = b using the
*  preconditioned conjugate gradient method plus a smoother.
*  A has to be symmetric.
*
*  Convergence test: norm( b - A*x ) < TOL.
*
*  Arguments
*  =========
*
*  r       (input) DOUBLE PRECISION array, dimension N.
*          On entry, the residual of the initial guess x.
*
*  x       (input/output) DOUBLE PRECISION array, dimension N.
*          On input, the initial guess.
*
*  ITER    (input/output) INT
*          On input, the maximum number of iterations to be performed.
*          On output, the actual number of iterations performed.
*
*  INFO    (output) INT
*
*          = SOLVER_NO_ERROR: Successful exit. Iterated approximate solution returned.
*          = SOLVER_MAXITER_REACHED: The maximum number of iterations was reached.
*          = SOLVER_INPUT_ERROR: Illegal parameter.
*          = SOLVER_BREAKDOWN: The scalars DELTA=p.A*p or TAU=v.r became smaller than TOLERANCE_FOR_SCALARS.
*          = SOLVER_MEMORY_ERROR: Failed to allocate the internal work arrays.
*
*  ==============================================================
*/
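
/*
 * Illustrative usage sketch (not part of the original file): it shows how a
 * caller might form the residual of the initial guess and then drive
 * Paso_Solver_PCG. The helper name example_pcg_solve, the literal scaling
 * factors -1./1. and the chosen iteration limit and tolerance are assumptions
 * for illustration only; the block is fenced with #if 0 so it is never compiled.
 */
#if 0
static err_t example_pcg_solve(Paso_SystemMatrix* A, const double* b,
                               double* x, Paso_Performance* pp) {
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   dim_t iter = 1000;       /* in: iteration limit, out: iterations actually used */
   double tol = 1.e-8;      /* in: requested tolerance, out: achieved residual norm */
   dim_t i;
   err_t status;
   double* r = TMPMEMALLOC(n,double);
   if (r==NULL) return SOLVER_MEMORY_ERROR;
   /* residual of the initial guess: r = b - A*x */
   for (i=0;i<n;i++) r[i]=b[i];
   Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1., A, x, 1., r);
   status = Paso_Solver_PCG(A, r, x, &iter, &tol, pp);
   TMPMEMFREE(r);
   return status;
}
#endif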

/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
    Paso_SystemMatrix * A,
    double * r,
    double * x,
    dim_t *iter,
    double * tolerance,
    Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0,maxit,num_iter_global, chunk_size=-1, len,rest, n_chunks, np, ipp;
   register double ss,ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL ;
   double tau_old,tau,beta,delta,gamma_1,gamma_2,alpha,sum_1,sum_2,sum_3,sum_4,sum_5,tol;
   #ifdef PASO_MPI
   double loc_sum[2], sum[2];
   #endif
   double norm_of_residual,norm_of_residual_global;
   register double d;
   char* chksz_chr;
   np=omp_get_max_threads();

   #ifdef USE_DYNAMIC_SCHEDULING
   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
   #else
   len=n/np;
   rest=n-len*np;
   #endif
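   /* Work partitioning (descriptive note): with USE_DYNAMIC_SCHEDULING the n rows
      are split into n_chunks chunks of chunk_size rows (chunk_size comes from the
      environment variable PASO_CHUNK_SIZE_PCG, clamped to the range [1,n/np]) and
      handed out to the threads dynamically; otherwise the rows are split into np
      contiguous blocks of nearly equal size (len or len+1 rows), so each thread of
      a schedule(static) loop over ipp revisits the same block in every sweep. */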
   /* */
   /*-----------------------------------------------------------------*/
   /* */
   /*   Start of Calculation :    */
   /*   ---------------------     */
   /* */
   /* */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /*     Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
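      /* Vector roles (descriptive note): r and x2 carry the plain CG residual and
         iterate, while rs and x carry their smoothed counterparts; the smoothing
         weights gamma_1/gamma_2 are recomputed in every iteration below. */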
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
         #ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
         #else
         #pragma omp for schedule(static)
         for (ipp=0; ipp <np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
         #endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
         #ifdef USE_DYNAMIC_SCHEDULING
         }
         #else
         }
         #endif
      }
      num_iter=0;
      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);
         /* v=prec(r) */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);
         /* tau=v*r */
         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
         #ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         tau_old=tau;
         tau=sum_1;
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp,beta)
         {
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, p,ZERO,v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp,ss)
         {
            ss=0;
            #ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
            #else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
            #endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
            #ifdef USE_DYNAMIC_SCHEDULING
            }
            #else
            }
            #endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
         #ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
         #endif
         delta=sum_2;
         alpha=tau/delta;

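         /* Descriptive note: alpha=tau/delta is the usual PCG step length
            <r,prec(r)>/<p,A*p>; the update below is skipped and the iteration
            reported as a breakdown when |delta| drops below TOLERANCE_FOR_SCALARS. */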
         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp,d, ss, ss1)
            {
               ss=0;
               ss1=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp <np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
            #endif
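            /* Descriptive note: with d=r-rs, sum_3=<d,d> and sum_4=<d,rs>, so
               gamma_1=-sum_4/sum_3 minimises ||rs+gamma_1*d|| = ||gamma_2*rs+gamma_1*r||;
               the same weights are applied to x (smoothed iterate) and x2 (plain CG iterate). */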
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1,gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ;
               gamma_2= ONE-gamma_1;
               ss=0;
               #ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
               #else
               #pragma omp for schedule(static)
               for (ipp=0; ipp <np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
               #endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
               #ifdef USE_DYNAMIC_SCHEDULING
               }
               #else
               }
               #endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
            #ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            #endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter == maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}
