1 |
|
2 |
/******************************************************* |
3 |
* |
4 |
* Copyright (c) 2003-2010 by University of Queensland |
5 |
* Earth Systems Science Computational Center (ESSCC) |
6 |
* http://www.uq.edu.au/esscc |
7 |
* |
8 |
* Primary Business: Queensland, Australia |
9 |
* Licensed under the Open Software License version 3.0 |
10 |
* http://www.opensource.org/licenses/osl-3.0.php |
11 |
* |
12 |
*******************************************************/ |
13 |
|
14 |
|
15 |
/**************************************************************/ |
16 |
|
17 |
/* Paso: Gauss-Seidel */ |
18 |
|
19 |
/**************************************************************/ |
20 |
|
21 |
/* Author: artak@uq.edu.au */ |
22 |
|
23 |
/**************************************************************/ |
24 |
|
25 |
#include "Paso.h" |
26 |
#include "Preconditioner.h" |
27 |
#include "PasoUtil.h" |
28 |
#include "BlockOps.h" |
29 |
|
30 |
#include <stdio.h> |
31 |
|
32 |
|
33 |
/**************************************************************/ |
34 |
|
35 |
/* free all memory used by Smoother */ |
36 |
|
37 |
void Paso_Preconditioner_Smoother_free(Paso_Preconditioner_Smoother * in) { |
38 |
if (in!=NULL) { |
39 |
Paso_Preconditioner_LocalSmoother_free(in->localSmoother); |
40 |
MEMFREE(in); |
41 |
} |
42 |
} |
43 |
void Paso_Preconditioner_LocalSmoother_free(Paso_Preconditioner_LocalSmoother * in) { |
44 |
if (in!=NULL) { |
45 |
MEMFREE(in->diag); |
46 |
MEMFREE(in->pivot); |
47 |
MEMFREE(in->buffer); |
48 |
MEMFREE(in); |
49 |
} |
50 |
} |
51 |
/**************************************************************/ |
52 |
|
53 |
/* constructs the symmetric Gauss-Seidel preconditioner |
54 |
|
55 |
*/ |
56 |
Paso_Preconditioner_Smoother* Paso_Preconditioner_Smoother_alloc(Paso_SystemMatrix * A_p, const bool_t jacobi, const bool_t is_local, const bool_t verbose) |
57 |
{ |
58 |
|
59 |
/* allocations: */ |
60 |
Paso_Preconditioner_Smoother* out=MEMALLOC(1,Paso_Preconditioner_Smoother); |
61 |
if (! Esys_checkPtr(out)) { |
62 |
out->localSmoother=Paso_Preconditioner_LocalSmoother_alloc(A_p->mainBlock,jacobi,verbose); |
63 |
out->is_local=is_local; |
64 |
} |
65 |
if (Esys_MPIInfo_noError(A_p->mpi_info)) { |
66 |
return out; |
67 |
} else { |
68 |
Paso_Preconditioner_Smoother_free(out); |
69 |
return NULL; |
70 |
} |
71 |
} |
72 |
Paso_Preconditioner_LocalSmoother* Paso_Preconditioner_LocalSmoother_alloc(Paso_SparseMatrix * A_p, const bool_t jacobi, bool_t verbose) |
73 |
{ |
74 |
|
75 |
dim_t n=A_p->numRows; |
76 |
dim_t n_block=A_p->row_block_size; |
77 |
dim_t block_size=A_p->block_size; |
78 |
|
79 |
double time0=Esys_timer(); |
80 |
/* allocations: */ |
81 |
Paso_Preconditioner_LocalSmoother* out=MEMALLOC(1,Paso_Preconditioner_LocalSmoother); |
82 |
if (! Esys_checkPtr(out)) { |
83 |
|
84 |
out->diag=MEMALLOC( ((size_t) n) * ((size_t) block_size),double); |
85 |
out->pivot=MEMALLOC( ((size_t) n) * ((size_t) n_block), index_t); |
86 |
out->buffer=MEMALLOC( ((size_t) n) * ((size_t) n_block), double); |
87 |
out->Jacobi=jacobi; |
88 |
|
89 |
if ( ! ( Esys_checkPtr(out->diag) || Esys_checkPtr(out->pivot) ) ) { |
90 |
Paso_SparseMatrix_invMain(A_p, out->diag, out->pivot ); |
91 |
} |
92 |
|
93 |
} |
94 |
time0=Esys_timer()-time0; |
95 |
|
96 |
if (Esys_noError()) { |
97 |
if (verbose) { |
98 |
if (jacobi) { |
99 |
printf("timing: Jacobi preparation: elemination : %e\n",time0); |
100 |
} else { |
101 |
printf("timing: Gauss-Seidel preparation: elemination : %e\n",time0); |
102 |
} |
103 |
} |
104 |
return out; |
105 |
} else { |
106 |
Paso_Preconditioner_LocalSmoother_free(out); |
107 |
return NULL; |
108 |
} |
109 |
} |
110 |
|
111 |
/* |
112 |
|
113 |
performs a few sweeps of the from |
114 |
|
115 |
S (x_{k} - x_{k-1}) = b - A x_{k-1} |
116 |
|
117 |
where x_{0}=0 and S provides some approximatioon of A. |
118 |
|
119 |
Under MPI S is build on using A_p->mainBlock only. |
120 |
if Smoother is local the defect b - A x_{k-1} is calculated using A_p->mainBlock only. |
121 |
|
122 |
*/ |
123 |
|
124 |
void Paso_Preconditioner_Smoother_solve(Paso_SystemMatrix* A_p, Paso_Preconditioner_Smoother * smoother, double * x, const double * b, |
125 |
const dim_t sweeps, const bool_t x_is_initial) |
126 |
{ |
127 |
const dim_t n= (A_p->mainBlock->numRows) * (A_p->mainBlock->row_block_size); |
128 |
|
129 |
double *b_new = smoother->localSmoother->buffer; |
130 |
dim_t nsweeps=sweeps; |
131 |
if (smoother->is_local) { |
132 |
Paso_Preconditioner_LocalSmoother_solve(A_p->mainBlock,smoother->localSmoother,x,b,sweeps,x_is_initial); |
133 |
} else { |
134 |
if (! x_is_initial) { |
135 |
Paso_Copy(n, x, b); |
136 |
Paso_Preconditioner_LocalSmoother_Sweep(A_p->mainBlock,smoother->localSmoother,x); |
137 |
nsweeps--; |
138 |
} |
139 |
while (nsweeps > 0 ) { |
140 |
Paso_Copy(n, b_new, b); |
141 |
Paso_SystemMatrix_MatrixVector((-1.), A_p, x, 1., b_new); /* b_new = b - A*x */ |
142 |
Paso_Preconditioner_LocalSmoother_Sweep(A_p->mainBlock,smoother->localSmoother,b_new); |
143 |
Paso_AXPY(n, x, 1., b_new); |
144 |
nsweeps--; |
145 |
} |
146 |
|
147 |
} |
148 |
} |
149 |
void Paso_Preconditioner_LocalSmoother_solve(Paso_SparseMatrix* A_p, Paso_Preconditioner_LocalSmoother * smoother, double * x, const double * b, |
150 |
const dim_t sweeps, const bool_t x_is_initial) |
151 |
{ |
152 |
const dim_t n= (A_p->numRows) * (A_p->row_block_size); |
153 |
double *b_new = smoother->buffer; |
154 |
dim_t nsweeps=sweeps; |
155 |
|
156 |
if (! x_is_initial) { |
157 |
Paso_Copy(n, x, b); |
158 |
Paso_Preconditioner_LocalSmoother_Sweep(A_p,smoother,x); |
159 |
nsweeps--; |
160 |
} |
161 |
|
162 |
while (nsweeps > 0 ) { |
163 |
Paso_Copy(n, b_new, b); |
164 |
|
165 |
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0((-1.), A_p, x, 1., b_new); /* b_new = b - A*x */ |
166 |
Paso_Preconditioner_LocalSmoother_Sweep(A_p,smoother,b_new); |
167 |
Paso_AXPY(n, x, 1., b_new); |
168 |
nsweeps--; |
169 |
} |
170 |
} |
171 |
|
172 |
void Paso_Preconditioner_LocalSmoother_Sweep(Paso_SparseMatrix* A, Paso_Preconditioner_LocalSmoother * smoother, double * x) |
173 |
{ |
174 |
const dim_t nt=omp_get_max_threads(); |
175 |
if (smoother->Jacobi) { |
176 |
Paso_BlockOps_allMV(A->row_block_size,A->numRows,smoother->diag,smoother->pivot,x); |
177 |
} else { |
178 |
if (nt < 2) { |
179 |
Paso_Preconditioner_LocalSmoother_Sweep_sequential(A,smoother,x); |
180 |
} else { |
181 |
Paso_Preconditioner_LocalSmoother_Sweep_colored(A,smoother,x); |
182 |
} |
183 |
} |
184 |
} |
185 |
|
186 |
/* inplace Gauss-Seidel sweep in seqential mode: */ |
187 |
|
188 |
void Paso_Preconditioner_LocalSmoother_Sweep_sequential(Paso_SparseMatrix* A_p, Paso_Preconditioner_LocalSmoother * smoother, double * x) |
189 |
{ |
190 |
const dim_t n=A_p->numRows; |
191 |
const dim_t n_block=A_p->row_block_size; |
192 |
const double *diag = smoother->diag; |
193 |
/* const index_t* pivot = smoother->pivot; |
194 |
const dim_t block_size=A_p->block_size; use for block size >3*/ |
195 |
|
196 |
register dim_t i,k; |
197 |
register index_t iptr_ik, mm; |
198 |
|
199 |
const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p); |
200 |
/* forward substitution */ |
201 |
|
202 |
if (n_block==1) { |
203 |
Paso_BlockOps_MV_1(&x[0], &diag[0], &x[0]); |
204 |
for (i = 1; i < n; ++i) { |
205 |
mm=ptr_main[i]; |
206 |
/* x_i=x_i-a_ik*x_k (with k<i) */ |
207 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) { |
208 |
k=A_p->pattern->index[iptr_ik]; |
209 |
Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]); |
210 |
} |
211 |
Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]); |
212 |
} |
213 |
} else if (n_block==2) { |
214 |
Paso_BlockOps_MV_2(&x[0], &diag[0], &x[0]); |
215 |
for (i = 1; i < n; ++i) { |
216 |
mm=ptr_main[i]; |
217 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) { |
218 |
k=A_p->pattern->index[iptr_ik]; |
219 |
Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]); |
220 |
} |
221 |
Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]); |
222 |
} |
223 |
} else if (n_block==3) { |
224 |
Paso_BlockOps_MV_3(&x[0], &diag[0], &x[0]); |
225 |
for (i = 1; i < n; ++i) { |
226 |
mm=ptr_main[i]; |
227 |
/* x_i=x_i-a_ik*x_k */ |
228 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) { |
229 |
k=A_p->pattern->index[iptr_ik]; |
230 |
Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]); |
231 |
} |
232 |
Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]); |
233 |
} |
234 |
} /* add block size >3 */ |
235 |
|
236 |
/* backward substitution */ |
237 |
|
238 |
if (n_block==1) { |
239 |
for (i = n-2; i > -1; --i) { |
240 |
mm=ptr_main[i]; |
241 |
Paso_BlockOps_MV_1(&x[i], &A_p->val[mm], &x[i]); |
242 |
for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) { |
243 |
k=A_p->pattern->index[iptr_ik]; |
244 |
Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]); |
245 |
} |
246 |
Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]); |
247 |
} |
248 |
|
249 |
} else if (n_block==2) { |
250 |
for (i = n-2; i > -1; --i) { |
251 |
mm=ptr_main[i]; |
252 |
Paso_BlockOps_MV_2(&x[2*i], &A_p->val[4*mm], &x[2*i]); |
253 |
for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) { |
254 |
k=A_p->pattern->index[iptr_ik]; |
255 |
Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]); |
256 |
} |
257 |
Paso_BlockOps_MV_2(&x[2*i], &diag[i*4], &x[2*i]); |
258 |
|
259 |
} |
260 |
} else if (n_block==3) { |
261 |
for (i = n-2; i > -1; --i) { |
262 |
mm=ptr_main[i]; |
263 |
Paso_BlockOps_MV_3(&x[3*i], &A_p->val[9*mm], &x[3*i]); |
264 |
|
265 |
for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) { |
266 |
k=A_p->pattern->index[iptr_ik]; |
267 |
Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]); |
268 |
} |
269 |
Paso_BlockOps_MV_3(&x[3*i], &diag[i*9], &x[3*i]); |
270 |
} |
271 |
|
272 |
} /* add block size >3 */ |
273 |
|
274 |
return; |
275 |
} |
276 |
|
277 |
void Paso_Preconditioner_LocalSmoother_Sweep_colored(Paso_SparseMatrix* A_p, Paso_Preconditioner_LocalSmoother * smoother, double * x) |
278 |
{ |
279 |
const dim_t n=A_p->numRows; |
280 |
const dim_t n_block=A_p->row_block_size; |
281 |
const double *diag = smoother->diag; |
282 |
index_t* pivot = smoother->pivot; |
283 |
const dim_t block_size=A_p->block_size; |
284 |
|
285 |
register dim_t i,k; |
286 |
register index_t color,iptr_ik, mm; |
287 |
|
288 |
const index_t* coloring = Paso_Pattern_borrowColoringPointer(A_p->pattern); |
289 |
const dim_t num_colors = Paso_Pattern_getNumColors(A_p->pattern); |
290 |
const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p); |
291 |
|
292 |
/* forward substitution */ |
293 |
|
294 |
|
295 |
/* color = 0 */ |
296 |
if (n_block==1) { |
297 |
#pragma omp parallel for schedule(static) private(i) |
298 |
for (i = 0; i < n; ++i) { |
299 |
if (coloring[i]== 0 ) Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]); |
300 |
} |
301 |
} else if (n_block==2) { |
302 |
#pragma omp parallel for schedule(static) private(i) |
303 |
for (i = 0; i < n; ++i) { |
304 |
if (coloring[i]== 0 ) Paso_BlockOps_MV_2(&x[2*i], &diag[i*4], &x[2*i]); |
305 |
} |
306 |
} else if (n_block==3) { |
307 |
#pragma omp parallel for schedule(static) private(i) |
308 |
for (i = 0; i < n; ++i) { |
309 |
if (coloring[i]==0) Paso_BlockOps_MV_3(&x[3*i], &diag[i*9], &x[3*i]); |
310 |
} |
311 |
} else { |
312 |
#pragma omp parallel for schedule(static) private(i) |
313 |
for (i = 0; i < n; ++i) { |
314 |
if (coloring[i]==0) Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]); |
315 |
} |
316 |
} |
317 |
|
318 |
for (color=1;color<num_colors;++color) { |
319 |
|
320 |
if (n_block==1) { |
321 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k) |
322 |
for (i = 0; i < n; ++i) { |
323 |
if (coloring[i]==color) { |
324 |
/* x_i=x_i-a_ik*x_k */ |
325 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
326 |
k=A_p->pattern->index[iptr_ik]; |
327 |
if (coloring[k]<color) Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]); |
328 |
} |
329 |
Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]); |
330 |
} |
331 |
} |
332 |
} else if (n_block==2) { |
333 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k) |
334 |
for (i = 0; i < n; ++i) { |
335 |
if (coloring[i]==color) { |
336 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
337 |
k=A_p->pattern->index[iptr_ik]; |
338 |
if (coloring[k]<color) Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]); |
339 |
} |
340 |
Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]); |
341 |
} |
342 |
} |
343 |
} else if (n_block==3) { |
344 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k) |
345 |
for (i = 0; i < n; ++i) { |
346 |
if (coloring[i]==color) { |
347 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
348 |
k=A_p->pattern->index[iptr_ik]; |
349 |
if (coloring[k]<color) Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]); |
350 |
} |
351 |
Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]); |
352 |
} |
353 |
} |
354 |
} else { |
355 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k) |
356 |
for (i = 0; i < n; ++i) { |
357 |
if (coloring[i] == color) { |
358 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
359 |
k=A_p->pattern->index[iptr_ik]; |
360 |
if (coloring[k]<color) Paso_BlockOps_SMV_N(n_block, &x[n_block*i], &A_p->val[block_size*iptr_ik], &x[n_block*k]); |
361 |
} |
362 |
Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]); |
363 |
} |
364 |
} |
365 |
} |
366 |
} /* end of coloring loop */ |
367 |
|
368 |
/* backward substitution */ |
369 |
for (color=(num_colors)-2 ;color>-1;--color) { /* Note: color=(num_colors)-1 is not required */ |
370 |
if (n_block==1) { |
371 |
#pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k) |
372 |
for (i = 0; i < n; ++i) { |
373 |
if (coloring[i]==color) { |
374 |
mm=ptr_main[i]; |
375 |
Paso_BlockOps_MV_1(&x[i], &A_p->val[mm], &x[i]); |
376 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
377 |
k=A_p->pattern->index[iptr_ik]; |
378 |
if (coloring[k]>color) Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]); |
379 |
} |
380 |
Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]); |
381 |
} |
382 |
} |
383 |
} else if (n_block==2) { |
384 |
#pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k) |
385 |
for (i = 0; i < n; ++i) { |
386 |
if (coloring[i]==color) { |
387 |
mm=ptr_main[i]; |
388 |
Paso_BlockOps_MV_2(&x[2*i], &A_p->val[4*mm], &x[2*i]); |
389 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
390 |
k=A_p->pattern->index[iptr_ik]; |
391 |
if (coloring[k]>color) Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]); |
392 |
} |
393 |
Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]); |
394 |
} |
395 |
} |
396 |
} else if (n_block==3) { |
397 |
#pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k) |
398 |
for (i = 0; i < n; ++i) { |
399 |
if (coloring[i]==color) { |
400 |
mm=ptr_main[i]; |
401 |
Paso_BlockOps_MV_3(&x[3*i], &A_p->val[9*mm], &x[3*i]); |
402 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
403 |
k=A_p->pattern->index[iptr_ik]; |
404 |
if (coloring[k]>color) Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]); |
405 |
} |
406 |
Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]); |
407 |
} |
408 |
} |
409 |
} else { |
410 |
#pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k) |
411 |
for (i = 0; i < n; ++i) { |
412 |
if (coloring[i]==color) { |
413 |
mm=ptr_main[i]; |
414 |
Paso_BlockOps_MV_N( n_block, &x[n_block*i], &A_p->val[block_size*mm], &x[n_block*i] ); |
415 |
for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) { |
416 |
k=A_p->pattern->index[iptr_ik]; |
417 |
if (coloring[k]>color) Paso_BlockOps_SMV_N(n_block, &x[n_block*i], &A_p->val[block_size*iptr_ik], &x[n_block*k]); |
418 |
} |
419 |
Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]); |
420 |
} |
421 |
} |
422 |
} |
423 |
} |
424 |
return; |
425 |
} |