
Contents of /trunk/paso/src/PCG.c



Revision 1628
Fri Jul 11 13:12:46 2008 UTC by phornby
File MIME type: text/plain
File size: 13626 byte(s)

Merge in /branches/windows_from_1456_trunk_1620_merged_in branch.

You will find a preserved pre-merge trunk in tags under tags/trunk_at_1625.
That will be useful for diffing & checking on my stupidity.

Here is a list of the conflicts and their resolution at this
point in time.


=================================================================================
(LLWS == looks like white space).

finley/src/Assemble_addToSystemMatrix.c - resolve to branch - unused var. may be wrong.....
finley/src/CPPAdapter/SystemMatrixAdapter.cpp - resolve to branch - LLWS
finley/src/CPPAdapter/MeshAdapter.cpp - resolve to branch - LLWS
paso/src/PCG.c - resolve to branch - unused var fixes.
paso/src/SolverFCT.c - resolve to branch - LLWS
paso/src/FGMRES.c - resolve to branch - LLWS
paso/src/Common.h - resolve to trunk version. It's omp.h's include... not sure it's needed,
but for the sake of safety.....
paso/src/Functions.c - resolve to branch version, indentation/tab removal and return error
on bad unimplemented Paso_FunctionCall.
paso/src/SolverFCT_solve.c - resolve to branch version, unused vars
paso/src/SparseMatrix_MatrixVector.c - resolve to branch version, unused vars.
escript/src/Utils.cpp - resolved to branch, needs WinSock2.h
escript/src/DataExpanded.cpp - resolved to branch version - LLWS
escript/src/DataFactory.cpp - resolve to branch version
=================================================================================

This currently passes tests on Linux (Debian), but has not been checked on Windows or Altix yet.

This check-in is to make a trunk I can check out on Windows and run tests on.

A known outstanding problem is the operator=() method of the exception classes,
which causes warning messages on the Intel compilers.

May the God of doughnuts have mercy on my soul.


/* $Id$ */

/*******************************************************
 *
 * Copyright 2003-2007 by ACceSS MNRF
 * Copyright 2007 by University of Queensland
 *
 * http://esscc.uq.edu.au
 * Primary Business: Queensland, Australia
 * Licensed under the Open Software License version 3.0
 * http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/

/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
 *
 * Purpose
 * =======
 *
 * PCG solves the linear system A*x = b using the
 * preconditioned conjugate gradient method plus a smoother.
 * A has to be symmetric.
 *
 * Convergence test: norm( b - A*x ) < TOL.
 *
 * Arguments
 * =========
 *
 * r (input) DOUBLE PRECISION array, dimension N.
 * On entry, residual of the initial guess x.
 *
 * x (input/output) DOUBLE PRECISION array, dimension N.
 * On input, the initial guess.
 *
 * ITER (input/output) INT
 * On input, the maximum number of iterations to be performed.
 * On output, the actual number of iterations performed.
 *
 * INFO (output) INT
 *
 * = SOLVER_NO_ERROR: successful exit. Iterated approximate solution returned.
 * = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
 * = SOLVER_INPUT_ERROR: illegal parameter.
 * = SOLVER_BREAKDOWN: tau or delta became smaller than TOLERANCE_FOR_SCALARS.
 * = SOLVER_MEMORY_ERROR: the work arrays could not be allocated.
 *
 * ==============================================================
 */
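/*
 * Call sketch (added for illustration, not part of the original source; A, r,
 * x and pp stand for objects the caller is assumed to have set up already, and
 * the numeric values are examples only):
 *
 *    dim_t iter = 1000;    on input the maximum number of iterations,
 *                          on output the number actually performed
 *    double tol = 1.e-8;   on input the convergence tolerance,
 *                          on output the achieved residual norm
 *    r must hold the residual b - A*x of the initial guess x on entry.
 *
 *    err_t err = Paso_Solver_PCG(A, r, x, &iter, &tol, pp);
 *
 * On SOLVER_NO_ERROR, x holds the iterated approximate solution.
 */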
/* #define PASO_DYNAMIC_SCHEDULING_MVM */

/* note: the OpenMP feature-test macro is _OPENMP (as used above) */
#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
          Paso_SystemMatrix * A,
          double * r,
          double * x,
          dim_t *iter,
          double * tolerance,
          Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0,maxit,num_iter_global, chunk_size=-1, len,rest, np, ipp;
   register double ss,ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old,tau,beta,delta,gamma_1,gamma_2,alpha,sum_1,sum_2,sum_3,sum_4,sum_5,tol;
#ifdef PASO_MPI
   double loc_sum[2], sum[2];
#endif
   double norm_of_residual,norm_of_residual_global;
   register double d;

   /* Should not be any executable code before this ifdef */

#ifdef USE_DYNAMIC_SCHEDULING

   /* Watch out for these declarations (see above) */
   char* chksz_chr;
   dim_t n_chunks;

   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
   np=omp_get_max_threads();
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
#else
   np=omp_get_max_threads();
   len=n/np;
   rest=n-len*np;
#endif
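   /*
    * Worked example of the static partition above (added for illustration,
    * not part of the original source): with n=10 rows and np=4 threads,
    * len=n/np=2 and rest=n-len*np=2, so thread ipp covers the index range
    * [len*ipp+MIN(ipp,rest), len*(ipp+1)+MIN(ipp+1,rest)):
    *    ipp=0 -> [0,3),  ipp=1 -> [3,6),  ipp=2 -> [6,8),  ipp=3 -> [8,10).
    * The remaining rest rows are spread over the first rest threads, one
    * extra row each. In the dynamic case, chunk_size is read from the
    * environment variable PASO_CHUNK_SIZE_PCG, clamped to [1, n/np], and
    * n_chunks = ceil(n/chunk_size) chunks are handed out with
    * schedule(dynamic,1).
    */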
   /* */
   /*-----------------------------------------------------------------*/
   /* */
   /* Start of Calculation : */
   /* --------------------- */
   /* */
   /* */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
#ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
#else
         #pragma omp for schedule(static)
         for (ipp=0; ipp <np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
#ifdef USE_DYNAMIC_SCHEDULING
         }
#else
         }
#endif
      }
      num_iter=0;

      /* PGH */
      /* without this we get a use of an uninitialised var below */
      tau = 0;

      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);

         /* PGH */
         /* The next lines were commented out before I got here */
         /* v=prec(r) */
         /* tau=v*r; */
         /* leading to the use of an uninitialised var below */

         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
#ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
         tau_old=tau;
         tau=sum_1;
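         /*
          * Note added for clarity (not part of the original source): v now
          * holds the preconditioned residual M^{-1}*r and tau = v.r. In the
          * standard PCG recurrence the new search direction is p = v + beta*p
          * with beta = tau/tau_old; on the first iteration p is simply set
          * to v, which is what the loop below does.
          */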
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp,beta)
         {
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
         }
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, p,ZERO,v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp,ss)
         {
            ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp <np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
#ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
         delta=sum_2;
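         /*
          * Note added for clarity (not part of the original source): at this
          * point v = A*p, so delta = p.A*p and alpha = tau/delta is the usual
          * PCG step length; in exact arithmetic it makes the updated residual
          * r - alpha*A*p orthogonal to the search direction p. A (nearly)
          * vanishing delta signals breakdown, which the test below guards
          * against.
          */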
         alpha=tau/delta;

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp,d, ss, ss1)
            {
               ss=0;
               ss1=0;
#ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
#else
               #pragma omp for schedule(static)
               for (ipp=0; ipp <np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
#ifdef USE_DYNAMIC_SCHEDULING
               }
#else
               }
#endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
#ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
#endif
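            /*
             * Note added for clarity (not part of the original source): with
             * d = r - rs, the loop above accumulates sum_3 = d.d and
             * sum_4 = d.rs, so gamma_1 = -sum_4/sum_3 minimises
             * ||rs + gamma*d|| over gamma. The updates below,
             * rs = gamma_2*rs + gamma_1*r and x = gamma_2*x + gamma_1*x2 with
             * gamma_2 = 1 - gamma_1, therefore form the minimal-residual
             * combination of the old smoothed iterate and the new CG iterate.
             */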
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1,gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3);
               gamma_2= ONE-gamma_1;
               ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
#else
               #pragma omp for schedule(static)
               for (ipp=0; ipp <np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
#ifdef USE_DYNAMIC_SCHEDULING
               }
#else
               }
#endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
#ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
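            /*
             * Note added for clarity (not part of the original source): sum_5
             * is the squared 2-norm of the smoothed residual rs, so the
             * convergence test below uses ||rs|| <= tol rather than the norm
             * of the raw residual r.
             */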
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter == maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}

Properties

Name           Value
svn:eol-style  native
svn:keywords   Author Date Id Revision
