
Annotation of /trunk/paso/src/PCG.c



Revision 1628
Fri Jul 11 13:12:46 2008 UTC by phornby
File MIME type: text/plain
File size: 13626 byte(s)

Merge in /branches/windows_from_1456_trunk_1620_merged_in branch.

You will find a preserved pre-merge trunk in tags under tags/trunk_at_1625.
That will be useful for diffing & checking on my stupidity.

Here is a list of the conflicts and their resolution at this
point in time.


=================================================================================
(LLWS == looks like white space).

finley/src/Assemble_addToSystemMatrix.c - resolve to branch - unused var. may be wrong.....
finley/src/CPPAdapter/SystemMatrixAdapter.cpp - resolve to branch - LLWS
finley/src/CPPAdapter/MeshAdapter.cpp - resolve to branch - LLWS
paso/src/PCG.c - resolve to branch - unused var fixes.
paso/src/SolverFCT.c - resolve to branch - LLWS
paso/src/FGMRES.c - resolve to branch - LLWS
paso/src/Common.h - resolve to trunk version. It's omp.h's include... not sure it's needed,
but for the sake of safety.....
paso/src/Functions.c - resolve to branch version, indentation/tab removal and return error
on bad unimplemented Paso_FunctionCall.
paso/src/SolverFCT_solve.c - resolve to branch version, unused vars
paso/src/SparseMatrix_MatrixVector.c - resolve to branch version, unused vars.
escript/src/Utils.cpp - resolved to branch, needs WinSock2.h
escript/src/DataExpanded.cpp - resolved to branch version - LLWS
escript/src/DataFactory.cpp - resolve to branch version
=================================================================================

This currently passes tests on Linux (Debian), but has not been checked on Windows or Altix yet.

This check-in is to make a trunk I can check out on Windows to run tests on.

A known outstanding problem is the operator=() method of the exception classes,
which causes warning messages on the Intel compilers.

May the God of doughnuts have mercy on my soul.


/* $Id$ */

/*******************************************************
 *
 *       Copyright 2003-2007 by ACceSS MNRF
 *       Copyright 2007 by University of Queensland
 *
 *                http://esscc.uq.edu.au
 *        Primary Business: Queensland, Australia
 *  Licensed under the Open Software License version 3.0
 *     http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/

/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif
/*
 * Purpose
 * =======
 *
 * PCG solves the linear system A*x = b using the
 * preconditioned conjugate gradient method plus a smoother.
 * A has to be symmetric.
 *
 * Convergence test: norm(b - A*x) < TOL.
 *
 * Arguments
 * =========
 *
 * r (input) DOUBLE PRECISION array, dimension N.
 *           On entry, the residual of the initial guess x.
 *
 * x (input/output) DOUBLE PRECISION array, dimension N.
 *           On input, the initial guess.
 *
 * ITER (input/output) INT
 *           On input, the maximum number of iterations to be performed.
 *           On output, the actual number of iterations performed.
 *
 * INFO (output) INT
 *           = SOLVER_NO_ERROR: successful exit, iterated approximate solution returned.
 *           = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
 *           = SOLVER_INPUT_ERROR: illegal parameter.
 *           = SOLVER_BREAKDOWN: a scalar quantity (tau or delta) became too small.
 *           = SOLVER_MEMORY_ERROR: a memory allocation failed.
 *
 * ==============================================================
 */
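/*
 * Illustrative call sequence (a sketch only -- the construction of the
 * matrix A, the vectors and the performance monitor pp depends on the
 * surrounding Paso code and is omitted here):
 *
 *    dim_t iter = 1000;
 *    double tol = 1.e-8;
 *    // r must hold the residual b - A*x of the initial guess x
 *    err_t err = Paso_Solver_PCG(A, r, x, &iter, &tol, pp);
 *    // on return, iter holds the iterations used and tol the residual norm
 */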
/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
          Paso_SystemMatrix * A,
          double * r,
          double * x,
          dim_t *iter,
          double * tolerance,
          Paso_Performance* pp) {

   /* Local variables */
   dim_t num_iter=0, maxit, num_iter_global, chunk_size=-1, len, rest, np, ipp;
   register double ss, ss1;
   dim_t i0, istart, iend;
   bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
   err_t status = SOLVER_NO_ERROR;
   dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
   double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
   double tau_old, tau, beta, delta, gamma_1, gamma_2, alpha, sum_1, sum_2, sum_3, sum_4, sum_5, tol;
#ifdef PASO_MPI
   double loc_sum[2], sum[2];
#endif
   double norm_of_residual, norm_of_residual_global;
   register double d;

   /* There should be no executable code before this ifdef */

#ifdef USE_DYNAMIC_SCHEDULING

   /* Watch out for these declarations (see above) */
   char* chksz_chr;
   dim_t n_chunks;

   chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
   if (chksz_chr!=NULL) sscanf(chksz_chr, "%d", &chunk_size);
   np=omp_get_max_threads();
   chunk_size=MIN(MAX(1,chunk_size),n/np);
   n_chunks=n/chunk_size;
   if (n_chunks*chunk_size<n) n_chunks+=1;
#else
   np=omp_get_max_threads();
   len=n/np;
   rest=n-len*np;
#endif
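   /*
    * The static partition used in the loops below spreads n entries over
    * np threads as evenly as possible: the first `rest` threads get len+1
    * entries each, the remaining threads get len. For example, n=10 and
    * np=4 give len=2, rest=2 and the ranges [0,3), [3,6), [6,8), [8,10)
    * from istart=len*ipp+MIN(ipp,rest), iend=len*(ipp+1)+MIN(ipp+1,rest).
    */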
   /* */
   /*-----------------------------------------------------------------*/
   /* */
   /*   Start of Calculation :                                        */
   /*   ---------------------                                         */
   /* */
   /* */
   rs=TMPMEMALLOC(n,double);
   p=TMPMEMALLOC(n,double);
   v=TMPMEMALLOC(n,double);
   x2=TMPMEMALLOC(n,double);

   /* Test the input parameters. */

   if (n < 0) {
      status = SOLVER_INPUT_ERROR;
   } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
      status = SOLVER_MEMORY_ERROR;
   } else {
      maxit = *iter;
      tol = *resid;
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);
      /* initialize data */
      #pragma omp parallel private(i0, istart, iend, ipp)
      {
#ifdef USE_DYNAMIC_SCHEDULING
         #pragma omp for schedule(dynamic, 1)
         for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
#else
         #pragma omp for schedule(static)
         for (ipp=0; ipp < np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
               rs[i0]=r[i0];
               x2[i0]=x[i0];
               p[i0]=0;
               v[i0]=0;
            }
#ifdef USE_DYNAMIC_SCHEDULING
         }
#else
         }
#endif
      }
      num_iter=0;

      /* PGH */
      /* without this we get a use of an uninitialised var below */
      tau = 0;

      /* start of iteration */
      while (!(convergeFlag || maxIterFlag || breakFlag)) {
         ++(num_iter);

         /* PGH */
         /* The next lines were commented out before I got here */
         /* v=prec(r) */
         /* tau=v*r; */
         /* leading to the use of an uninitialised var below */

         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Paso_Solver_solvePreconditioner(A,v,r);
         Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);
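         /* tau = <v,r>: each thread accumulates a private partial sum ss,
            adds it to the shared sum_1 exactly once inside a critical
            section, and (under MPI) the process-local result is then
            combined across ranks by MPI_Allreduce -- a two-level reduction. */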
         sum_1 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
            #pragma omp critical
            {
               sum_1+=ss;
            }
         }
#ifdef PASO_MPI
         /* In case we have many MPI processes, each of which may have several OMP threads:
            OMP master participates in an MPI reduction to get global sum_1 */
         loc_sum[0] = sum_1;
         MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
         tau_old=tau;
         tau=sum_1;
         /* p=v+beta*p */
         #pragma omp parallel private(i0, istart, iend, ipp, beta)
         {
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               if (num_iter==1) {
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
               } else {
                  beta=tau/tau_old;
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
               }
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
         }
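         /* The loop above is the standard preconditioned CG direction
            update: beta=tau/tau_old with tau=<v,r>; on the first
            iteration p is simply initialised with v. */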
         /* v=A*p */
         Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
         Performance_startMonitor(pp,PERFORMANCE_MVM);
         Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, p, ZERO, v);
         Performance_stopMonitor(pp,PERFORMANCE_MVM);
         Performance_startMonitor(pp,PERFORMANCE_SOLVER);

         /* delta=p*v */
         sum_2 = 0;
         #pragma omp parallel private(i0, istart, iend, ipp, ss)
         {
            ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
            #pragma omp for schedule(dynamic, 1)
            for (ipp=0; ipp < n_chunks; ++ipp) {
               istart=chunk_size*ipp;
               iend=MIN(istart+chunk_size,n);
#else
            #pragma omp for schedule(static)
            for (ipp=0; ipp < np; ++ipp) {
               istart=len*ipp+MIN(ipp,rest);
               iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
               #pragma ivdep
               for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
#ifdef USE_DYNAMIC_SCHEDULING
            }
#else
            }
#endif
            #pragma omp critical
            {
               sum_2+=ss;
            }
         }
#ifdef PASO_MPI
         loc_sum[0] = sum_2;
         MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
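         /* alpha=tau/delta=<v,r>/<p,A*p>, computed below, is the CG step
            length; a vanishing delta signals breakdown and skips the update */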
         delta=sum_2;
         alpha=tau/delta;

         if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
            /* smoother */
            sum_3 = 0;
            sum_4 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
            {
               ss=0;
               ss1=0;
#ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
#else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     r[i0]-=alpha*v[i0];
                     d=r[i0]-rs[i0];
                     ss+=d*d;
                     ss1+=d*rs[i0];
                  }
#ifdef USE_DYNAMIC_SCHEDULING
               }
#else
               }
#endif
               #pragma omp critical
               {
                  sum_3+=ss;
                  sum_4+=ss1;
               }
            }
#ifdef PASO_MPI
            loc_sum[0] = sum_3;
            loc_sum[1] = sum_4;
            MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
            sum_3=sum[0];
            sum_4=sum[1];
#endif
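            /* residual smoothing: with d=r-rs, sum_3=<d,d> and sum_4=<d,rs>,
               gamma_1=-sum_4/sum_3 minimises ||rs+gamma*d||, i.e. the new
               smoothed residual rs is the smallest one on the line through
               the old rs and the CG residual r; x is mixed the same way. */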
            sum_5 = 0;
            #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
            {
               gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ;
               gamma_2= ONE-gamma_1;
               ss=0;
#ifdef USE_DYNAMIC_SCHEDULING
               #pragma omp for schedule(dynamic, 1)
               for (ipp=0; ipp < n_chunks; ++ipp) {
                  istart=chunk_size*ipp;
                  iend=MIN(istart+chunk_size,n);
#else
               #pragma omp for schedule(static)
               for (ipp=0; ipp < np; ++ipp) {
                  istart=len*ipp+MIN(ipp,rest);
                  iend=len*(ipp+1)+MIN(ipp+1,rest);
#endif
                  #pragma ivdep
                  for (i0=istart;i0<iend;i0++) {
                     rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
                     x2[i0]+=alpha*p[i0];
                     x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
                     ss+=rs[i0]*rs[i0];
                  }
#ifdef USE_DYNAMIC_SCHEDULING
               }
#else
               }
#endif
               #pragma omp critical
               {
                  sum_5+=ss;
               }
            }
#ifdef PASO_MPI
            loc_sum[0] = sum_5;
            MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
#endif
            norm_of_residual=sqrt(sum_5);
            convergeFlag = norm_of_residual <= tol;
            maxIterFlag = num_iter == maxit;
            breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
         }
      }
      /* end of iteration */
      num_iter_global=num_iter;
      norm_of_residual_global=norm_of_residual;
      if (maxIterFlag) {
         status = SOLVER_MAXITER_REACHED;
      } else if (breakFlag) {
         status = SOLVER_BREAKDOWN;
      }
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      TMPMEMFREE(rs);
      TMPMEMFREE(x2);
      TMPMEMFREE(v);
      TMPMEMFREE(p);
      *iter=num_iter_global;
      *resid=norm_of_residual_global;
   }
   /* End of PCG */
   return status;
}
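For reference, the hand-rolled reduction used throughout the solver can be reproduced in a standalone program. The sketch below is illustrative only and not part of the repository; the file name and test values are made up. It computes a dot product with the same static partitioning and critical-section accumulation as the loops in PCG.c:

/* reduction_demo.c - illustrative only; mirrors the PCG.c pattern.
   Compile with e.g.: cc -fopenmp reduction_demo.c -o reduction_demo */
#include <stdio.h>
#include <stdlib.h>
#ifdef _OPENMP
#include <omp.h>
#endif

#define MIN(a,b) ((a) < (b) ? (a) : (b))

int main(void) {
    int n = 1000000, np, ipp, i0, istart, iend, len, rest;
    double sum_1 = 0, ss;
    double *v = malloc(n * sizeof(double));
    double *r = malloc(n * sizeof(double));
    if (v == NULL || r == NULL) return EXIT_FAILURE;
    for (i0 = 0; i0 < n; i0++) { v[i0] = 1.0; r[i0] = 2.0; }

#ifdef _OPENMP
    np = omp_get_max_threads();
#else
    np = 1;
#endif
    len = n / np;
    rest = n - len * np;

    /* Each thread accumulates a private partial sum ss over its range,
       then adds it to the shared total exactly once, under a critical
       section -- the same hand-rolled reduction used in PCG.c. */
    #pragma omp parallel private(ipp, i0, istart, iend, ss)
    {
        ss = 0;
        #pragma omp for schedule(static)
        for (ipp = 0; ipp < np; ++ipp) {
            istart = len * ipp + MIN(ipp, rest);
            iend = len * (ipp + 1) + MIN(ipp + 1, rest);
            for (i0 = istart; i0 < iend; i0++) ss += v[i0] * r[i0];
        }
        #pragma omp critical
        {
            sum_1 += ss;
        }
    }
    printf("dot product = %g (expected %g)\n", sum_1, 2.0 * n);
    free(v); free(r);
    return EXIT_SUCCESS;
}

An OpenMP reduction(+:sum_1) clause would give the same result with less code; the explicit form in PCG.c presumably keeps the work distribution under the solver's control so that all loops traverse identical index ranges.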

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision
