/* $Id$ */

/*******************************************************
 *
 * Copyright 2003-2007 by ACceSS MNRF
 * Copyright 2007 by University of Queensland
 *
 * http://esscc.uq.edu.au
 * Primary Business: Queensland, Australia
 * Licensed under the Open Software License version 3.0
 * http://www.opensource.org/licenses/osl-3.0.php
 *
 *******************************************************/

/* PCG iterations */

#include "SystemMatrix.h"
#include "Paso.h"
#include "Solver.h"

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef PASO_MPI
#include <mpi.h>
#endif

/*
 *  Purpose
 *  =======
 *
 *  PCG solves the linear system A*x = b using the preconditioned
 *  conjugate gradient method plus a smoother.
 *  A has to be symmetric.
 *
 *  Convergence test: norm( b - A*x ) < TOL.
 *
 *  Arguments
 *  =========
 *
 *  A         (input) the system matrix; it has to be symmetric.
 *
 *  r         (input/output) DOUBLE PRECISION array, dimension N.
 *            On entry, the residual b - A*x of the initial guess x;
 *            overwritten with the unsmoothed residual during the iteration.
 *
 *  x         (input/output) DOUBLE PRECISION array, dimension N.
 *            On input, the initial guess. On output, the improved
 *            (smoothed) approximate solution.
 *
 *  ITER      (input/output) INT
 *            On input, the maximum number of iterations to be performed.
 *            On output, the actual number of iterations performed.
 *
 *  TOLERANCE (input/output) DOUBLE PRECISION
 *            On input, the convergence tolerance TOL.
 *            On output, the norm of the final (smoothed) residual.
 *
 *  pp        (input/output) performance monitor used to time the solver,
 *            preconditioner and matrix-vector phases.
 *
 *  Return value (err_t):
 *
 *            = SOLVER_NO_ERROR:        successful exit; the iterated approximate
 *                                      solution is returned.
 *            = SOLVER_MAXITER_REACHED: the maximum number of iterations was reached.
 *            = SOLVER_INPUT_ERROR:     illegal parameter (N < 0).
 *            = SOLVER_BREAKDOWN:       tau or delta became numerically zero.
 *            = SOLVER_MEMORY_ERROR:    allocation of the work arrays failed.
 *
 *  ==============================================================
 */
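
/*
 *  A minimal call sketch (illustrative only; the names b, x0 and the way the
 *  initial residual is formed are assumptions of this sketch, not part of the
 *  Paso API shown in this file):
 *
 *      dim_t  iter = 1000;        maximum number of iterations
 *      double tol  = 1.e-8;       requested tolerance
 *      ... put the initial guess x0 into x and the residual b - A*x0 into r ...
 *      err_t status = Paso_Solver_PCG(A, r, x, &iter, &tol, pp);
 *      ... on return iter holds the iterations used and tol the achieved
 *          residual norm; check status against SOLVER_NO_ERROR ...
 */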

/* #define PASO_DYNAMIC_SCHEDULING_MVM */

#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
#define USE_DYNAMIC_SCHEDULING
#endif

err_t Paso_Solver_PCG(
           Paso_SystemMatrix * A,
           double * r,
           double * x,
           dim_t *iter,
           double * tolerance,
           Paso_Performance* pp) {

  /* Local variables */
  dim_t num_iter=0, maxit, num_iter_global, chunk_size=-1, len, rest, n_chunks, np, ipp;
  register double ss, ss1;
  dim_t i0, istart, iend;
  bool_t breakFlag=FALSE, maxIterFlag=FALSE, convergeFlag=FALSE;
  err_t status = SOLVER_NO_ERROR;
  dim_t n = Paso_SystemMatrix_getTotalNumRows(A);
  double *resid = tolerance, *rs=NULL, *p=NULL, *v=NULL, *x2=NULL;
  double tau_old, tau, beta, delta, gamma_1, gamma_2, alpha, sum_1, sum_2, sum_3, sum_4, sum_5, tol;
  #ifdef PASO_MPI
  double loc_sum[2], sum[2];
  #endif
  double norm_of_residual=0, norm_of_residual_global;
  register double d;
  char* chksz_chr;
  #ifdef _OPENMP
  np=omp_get_max_threads();
  #else
  np=1;
  #endif

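  /* Work distribution over the np OpenMP threads: with USE_DYNAMIC_SCHEDULING the
     rows are handed out in chunks of chunk_size (read from the environment variable
     PASO_CHUNK_SIZE_PCG) under a dynamic schedule; otherwise thread ipp processes a
     contiguous static block of len rows, with the first rest threads taking one
     extra row each. */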
  #ifdef USE_DYNAMIC_SCHEDULING
  chksz_chr=getenv("PASO_CHUNK_SIZE_PCG");
  if (chksz_chr!=NULL) sscanf(chksz_chr, "%d", &chunk_size);
  chunk_size=MIN(MAX(1,chunk_size),n/np);
  n_chunks=n/chunk_size;
  if (n_chunks*chunk_size<n) n_chunks+=1;
  #else
  len=n/np;
  rest=n-len*np;
  #endif

  /*-----------------------------------------------------------------*/
  /*   Start of Calculation                                           */
  /*-----------------------------------------------------------------*/

  rs=TMPMEMALLOC(n,double);
  p =TMPMEMALLOC(n,double);
  v =TMPMEMALLOC(n,double);
  x2=TMPMEMALLOC(n,double);
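
  /* Work arrays: rs holds the smoothed residual and x the smoothed solution,
     while r and x2 carry the plain CG residual and iterate; p is the search
     direction and v holds first the preconditioned residual, later A*p. */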

  /* Test the input parameters. */

  if (n < 0) {
    status = SOLVER_INPUT_ERROR;
  } else if (rs==NULL || p==NULL || v==NULL || x2==NULL) {
    status = SOLVER_MEMORY_ERROR;
  } else {
    maxit = *iter;
    tol = *resid;
    Performance_startMonitor(pp,PERFORMANCE_SOLVER);
    /* initialize data */
    #pragma omp parallel private(i0, istart, iend, ipp)
    {
      #ifdef USE_DYNAMIC_SCHEDULING
      #pragma omp for schedule(dynamic, 1)
      for (ipp=0; ipp < n_chunks; ++ipp) {
        istart=chunk_size*ipp;
        iend=MIN(istart+chunk_size,n);
      #else
      #pragma omp for schedule(static)
      for (ipp=0; ipp < np; ++ipp) {
        istart=len*ipp+MIN(ipp,rest);
        iend=len*(ipp+1)+MIN(ipp+1,rest);
      #endif
        #pragma ivdep
        for (i0=istart;i0<iend;i0++) {
          rs[i0]=r[i0];
          x2[i0]=x[i0];
          p[i0]=0;
          v[i0]=0;
        }
      #ifdef USE_DYNAMIC_SCHEDULING
      }
      #else
      }
      #endif
    }
    num_iter=0;

    /* PGH */
    /* without this we get a use of an uninitialised variable below */
    tau = 0;

    /* start of iteration */
    while (!(convergeFlag || maxIterFlag || breakFlag)) {
      ++(num_iter);

      /* PGH */
      /* The next lines were commented out before I got here */
      /* v=prec(r) */
      /* tau=v*r; */
      /* leading to the use of an uninitialised variable below */

      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER);
      Paso_Solver_solvePreconditioner(A,v,r);
      Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER);
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);

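      /* tau = <v,r>: inner product of the preconditioned residual v with r.
         Each thread accumulates a partial sum in ss, combined in the critical
         section below (and across MPI ranks where PASO_MPI is set). */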
      sum_1 = 0;
      #pragma omp parallel private(i0, istart, iend, ipp, ss)
      {
        ss=0;
        #ifdef USE_DYNAMIC_SCHEDULING
        #pragma omp for schedule(dynamic, 1)
        for (ipp=0; ipp < n_chunks; ++ipp) {
          istart=chunk_size*ipp;
          iend=MIN(istart+chunk_size,n);
        #else
        #pragma omp for schedule(static)
        for (ipp=0; ipp < np; ++ipp) {
          istart=len*ipp+MIN(ipp,rest);
          iend=len*(ipp+1)+MIN(ipp+1,rest);
        #endif
          #pragma ivdep
          for (i0=istart;i0<iend;i0++) ss+=v[i0]*r[i0];
        #ifdef USE_DYNAMIC_SCHEDULING
        }
        #else
        }
        #endif
        #pragma omp critical
        {
          sum_1+=ss;
        }
      }
      #ifdef PASO_MPI
      /* In case we have many MPI processes, each of which may have several OMP threads:
         OMP master participates in an MPI reduction to get global sum_1 */
      loc_sum[0] = sum_1;
      MPI_Allreduce(loc_sum, &sum_1, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
      #endif
      tau_old=tau;
      tau=sum_1;
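      /* Update the search direction with the usual PCG coefficient
         beta = tau/tau_old; in the first iteration p is simply set to v. */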
      /* p=v+beta*p */
      #pragma omp parallel private(i0, istart, iend, ipp, beta)
      {
        #ifdef USE_DYNAMIC_SCHEDULING
        #pragma omp for schedule(dynamic, 1)
        for (ipp=0; ipp < n_chunks; ++ipp) {
          istart=chunk_size*ipp;
          iend=MIN(istart+chunk_size,n);
        #else
        #pragma omp for schedule(static)
        for (ipp=0; ipp < np; ++ipp) {
          istart=len*ipp+MIN(ipp,rest);
          iend=len*(ipp+1)+MIN(ipp+1,rest);
        #endif
          if (num_iter==1) {
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) p[i0]=v[i0];
          } else {
            beta=tau/tau_old;
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) p[i0]=v[i0]+beta*p[i0];
          }
        #ifdef USE_DYNAMIC_SCHEDULING
        }
        #else
        }
        #endif
      }
      /* v=A*p */
      Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
      Performance_startMonitor(pp,PERFORMANCE_MVM);
      Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(ONE, A, p, ZERO, v);
      Performance_stopMonitor(pp,PERFORMANCE_MVM);
      Performance_startMonitor(pp,PERFORMANCE_SOLVER);

      /* delta=p*v */
      sum_2 = 0;
      #pragma omp parallel private(i0, istart, iend, ipp, ss)
      {
        ss=0;
        #ifdef USE_DYNAMIC_SCHEDULING
        #pragma omp for schedule(dynamic, 1)
        for (ipp=0; ipp < n_chunks; ++ipp) {
          istart=chunk_size*ipp;
          iend=MIN(istart+chunk_size,n);
        #else
        #pragma omp for schedule(static)
        for (ipp=0; ipp < np; ++ipp) {
          istart=len*ipp+MIN(ipp,rest);
          iend=len*(ipp+1)+MIN(ipp+1,rest);
        #endif
          #pragma ivdep
          for (i0=istart;i0<iend;i0++) ss+=v[i0]*p[i0];
        #ifdef USE_DYNAMIC_SCHEDULING
        }
        #else
        }
        #endif
        #pragma omp critical
        {
          sum_2+=ss;
        }
      }
      #ifdef PASO_MPI
      loc_sum[0] = sum_2;
      MPI_Allreduce(loc_sum, &sum_2, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
      #endif
      delta=sum_2;
      alpha=tau/delta;

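      /* If delta = <p,A*p> is numerically zero the step length alpha = tau/delta
         is meaningless: flag a breakdown and skip the update and smoothing step. */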
      if (! (breakFlag = (ABS(delta) <= TOLERANCE_FOR_SCALARS))) {
        /* smoother */
        sum_3 = 0;
        sum_4 = 0;
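        /* Plain CG residual update r -= alpha*A*p (v holds A*p here), while
           accumulating sum_3 = <d,d> and sum_4 = <d,rs> with d = r - rs for
           the smoothing step below. */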
        #pragma omp parallel private(i0, istart, iend, ipp, d, ss, ss1)
        {
          ss=0;
          ss1=0;
          #ifdef USE_DYNAMIC_SCHEDULING
          #pragma omp for schedule(dynamic, 1)
          for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
          #else
          #pragma omp for schedule(static)
          for (ipp=0; ipp < np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
          #endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
              r[i0]-=alpha*v[i0];
              d=r[i0]-rs[i0];
              ss+=d*d;
              ss1+=d*rs[i0];
            }
          #ifdef USE_DYNAMIC_SCHEDULING
          }
          #else
          }
          #endif
          #pragma omp critical
          {
            sum_3+=ss;
            sum_4+=ss1;
          }
        }
        #ifdef PASO_MPI
        loc_sum[0] = sum_3;
        loc_sum[1] = sum_4;
        MPI_Allreduce(loc_sum, sum, 2, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
        sum_3=sum[0];
        sum_4=sum[1];
        #endif
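        /* Residual smoothing: with d = r - rs, gamma_1 = -<d,rs>/<d,d> minimises
           ||rs + gamma*d||, so the new rs = gamma_2*rs + gamma_1*r is the point of
           smallest norm on the line through rs and r; x is combined with x2 in the
           same way, and sum_5 accumulates ||rs||^2 for the convergence test. */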
        sum_5 = 0;
        #pragma omp parallel private(i0, istart, iend, ipp, ss, gamma_1, gamma_2)
        {
          gamma_1= ( (ABS(sum_3)<= ZERO) ? 0 : -sum_4/sum_3) ;
          gamma_2= ONE-gamma_1;
          ss=0;
          #ifdef USE_DYNAMIC_SCHEDULING
          #pragma omp for schedule(dynamic, 1)
          for (ipp=0; ipp < n_chunks; ++ipp) {
            istart=chunk_size*ipp;
            iend=MIN(istart+chunk_size,n);
          #else
          #pragma omp for schedule(static)
          for (ipp=0; ipp < np; ++ipp) {
            istart=len*ipp+MIN(ipp,rest);
            iend=len*(ipp+1)+MIN(ipp+1,rest);
          #endif
            #pragma ivdep
            for (i0=istart;i0<iend;i0++) {
              rs[i0]=gamma_2*rs[i0]+gamma_1*r[i0];
              x2[i0]+=alpha*p[i0];
              x[i0]=gamma_2*x[i0]+gamma_1*x2[i0];
              ss+=rs[i0]*rs[i0];
            }
          #ifdef USE_DYNAMIC_SCHEDULING
          }
          #else
          }
          #endif
          #pragma omp critical
          {
            sum_5+=ss;
          }
        }
        #ifdef PASO_MPI
        loc_sum[0] = sum_5;
        MPI_Allreduce(loc_sum, &sum_5, 1, MPI_DOUBLE, MPI_SUM, A->mpi_info->comm);
        #endif
        norm_of_residual=sqrt(sum_5);
        convergeFlag = norm_of_residual <= tol;
        maxIterFlag = num_iter == maxit;
        breakFlag = (ABS(tau) <= TOLERANCE_FOR_SCALARS);
      }
    }
    /* end of iteration */
    num_iter_global=num_iter;
    norm_of_residual_global=norm_of_residual;
    if (maxIterFlag) {
      status = SOLVER_MAXITER_REACHED;
    } else if (breakFlag) {
      status = SOLVER_BREAKDOWN;
    }
    Performance_stopMonitor(pp,PERFORMANCE_SOLVER);
    TMPMEMFREE(rs);
    TMPMEMFREE(x2);
    TMPMEMFREE(v);
    TMPMEMFREE(p);
    *iter=num_iter_global;
    *resid=norm_of_residual_global;
  }
  /* End of PCG */
  return status;
}