/[escript]/trunk/paso/src/Smoother.c
ViewVC logotype

Contents of /trunk/paso/src/Smoother.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3098 - (show annotations)
Fri Aug 20 08:08:27 2010 UTC (8 years, 4 months ago) by gross
Original Path: trunk/paso/src/Solver_GS.c
File MIME type: text/plain
File size: 14344 byte(s)
fix to the GaussSeidel
1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: Gauss-Seidel */
18
19 /**************************************************************/
20
21 /* Author: artak@uq.edu.au */
22
23 /**************************************************************/
24
25 #include "Paso.h"
26 #include "Solver.h"
27 #include "PasoUtil.h"
28 #include "BlockOps.h"
29
30 #include <stdio.h>
31
32
33 /**************************************************************/
34
35 /* free all memory used by GS */
36
37 void Paso_Solver_GS_free(Paso_Solver_GS * in) {
38 if (in!=NULL) {
39 Paso_Solver_LocalGS_free(in->localGS);
40 MEMFREE(in);
41 }
42 }
43 void Paso_Solver_LocalGS_free(Paso_Solver_LocalGS * in) {
44 if (in!=NULL) {
45 MEMFREE(in->diag);
46 MEMFREE(in->pivot);
47 MEMFREE(in->buffer);
48 MEMFREE(in);
49 }
50 }
51 /**************************************************************/
52
53 /* constructs the symmetric Gauss-Seidel preconditioner
54
55 */
56 Paso_Solver_GS* Paso_Solver_getGS(Paso_SystemMatrix * A_p, dim_t sweeps, bool_t is_local, bool_t verbose)
57 {
58
59 /* allocations: */
60 Paso_Solver_GS* out=MEMALLOC(1,Paso_Solver_GS);
61 if (! Paso_checkPtr(out)) {
62 out->localGS=Paso_Solver_getLocalGS(A_p->mainBlock,sweeps,verbose);
63 out->is_local=is_local;
64 }
65 if (Paso_MPIInfo_noError(A_p->mpi_info)) {
66 return out;
67 } else {
68 Paso_Solver_GS_free(out);
69 return NULL;
70 }
71 }
72 Paso_Solver_LocalGS* Paso_Solver_getLocalGS(Paso_SparseMatrix * A_p, dim_t sweeps, bool_t verbose) {
73
74 dim_t n=A_p->numRows;
75 dim_t n_block=A_p->row_block_size;
76 dim_t block_size=A_p->block_size;
77
78 double time0=Paso_timer();
79 /* allocations: */
80 Paso_Solver_LocalGS* out=MEMALLOC(1,Paso_Solver_LocalGS);
81 if (! Paso_checkPtr(out)) {
82
83 out->diag=MEMALLOC( ((size_t) n) * ((size_t) block_size),double);
84 out->pivot=MEMALLOC( ((size_t) n) * ((size_t) n_block), index_t);
85 out->buffer=MEMALLOC( ((size_t) n) * ((size_t) n_block), double);
86 out->sweeps=sweeps;
87
88 if ( ! ( Paso_checkPtr(out->diag) || Paso_checkPtr(out->pivot) ) ) {
89 Paso_SparseMatrix_invMain(A_p, out->diag, out->pivot );
90 }
91
92 }
93 time0=Paso_timer()-time0;
94
95 if (Paso_noError()) {
96 if (verbose) printf("timing: Gauss-Seidel preparation: elemination : %e\n",time0);
97 return out;
98 } else {
99 Paso_Solver_LocalGS_free(out);
100 return NULL;
101 }
102 }
103
104 /*
105
106 performs a few sweeps of the from
107
108 S (x_{k} - x_{k-1}) = b - A x_{k-1}
109
110 where x_{0}=0 and S provides some approximatioon of A.
111
112 Under MPI S is build on using A_p->mainBlock only.
113 if GS is local the defect b - A x_{k-1} is calculated using A_p->mainBlock only.
114
115 */
116
117 void Paso_Solver_solveGS(Paso_SystemMatrix* A_p, Paso_Solver_GS * gs, double * x, const double * b)
118 {
119 register dim_t i;
120 const dim_t n= (A_p->mainBlock->numRows) * (A_p->mainBlock->row_block_size);
121
122 double *b_new = gs->localGS->buffer;
123 dim_t sweeps=gs->localGS->sweeps;
124
125 if (gs->is_local) {
126 Paso_Solver_solveLocalGS(A_p->mainBlock,gs->localGS,x,b);
127 } else {
128 #pragma omp parallel for private(i) schedule(static)
129 for (i=0;i<n;++i) x[i]=b[i];
130
131 Paso_Solver_localGSSweep(A_p->mainBlock,gs->localGS,x);
132 while (sweeps > 0 ) {
133 #pragma omp parallel for private(i) schedule(static)
134 for (i=0;i<n;++i) b_new[i]=b[i];
135
136 Paso_SystemMatrix_MatrixVector((-1.), A_p, x, 1., b_new); /* b_new = b - A*x */
137
138 Paso_Solver_localGSSweep(A_p->mainBlock,gs->localGS,b_new);
139
140 #pragma omp parallel for private(i) schedule(static)
141 for (i=0;i<n;++i) x[i]+=b_new[i];
142
143 sweeps--;
144 }
145
146 }
147 }
148 void Paso_Solver_solveLocalGS(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x, const double * b)
149 {
150 register dim_t i;
151 const dim_t n= (A_p->numRows) * (A_p->row_block_size);
152 double *b_new = gs->buffer;
153 dim_t sweeps=gs->sweeps;
154
155 #pragma omp parallel for private(i) schedule(static)
156 for (i=0;i<n;++i) x[i]=b[i];
157 Paso_Solver_localGSSweep(A_p,gs,x);
158
159 while (sweeps > 0 ) {
160 #pragma omp parallel for private(i) schedule(static)
161 for (i=0;i<n;++i) b_new[i]=b[i];
162
163 Paso_SparseMatrix_MatrixVector_CSC_OFFSET0((-1.), A_p, x, 1., b_new); /* b_new = b - A*x */
164
165 Paso_Solver_localGSSweep(A_p,gs,b_new);
166
167 #pragma omp parallel for private(i) schedule(static)
168 for (i=0;i<n;++i) x[i]+=b_new[i];
169
170 sweeps--;
171 }
172 }
173
174 void Paso_Solver_localGSSweep(Paso_SparseMatrix* A, Paso_Solver_LocalGS * gs, double * x)
175 {
176 #ifdef _OPENMP
177 const dim_t nt=omp_get_max_threads();
178 #else
179 const dim_t nt = 1;
180 #endif
181 if (nt < 2) {
182 Paso_Solver_localGSSweep_sequential(A,gs,x);
183 } else {
184 Paso_Solver_localGSSweep_colored(A,gs,x);
185 }
186 }
187
188 /* inplace Gauss-Seidel sweep in seqential mode: */
189
190 void Paso_Solver_localGSSweep_sequential(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x)
191 {
192 const dim_t n=A_p->numRows;
193 const dim_t n_block=A_p->row_block_size;
194 const double *diag = gs->diag;
195 /* const index_t* pivot = gs->pivot;
196 const dim_t block_size=A_p->block_size; use for block size >3*/
197
198 register dim_t i,k;
199 register index_t iptr_ik, mm;
200
201 const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
202
203 /* forward substitution */
204
205 if (n_block==1) {
206
207 Paso_BlockOps_MV_1(&x[0], &diag[0], &x[0]);
208
209 for (i = 1; i < n; ++i) {
210 mm=ptr_main[i];
211 /* x_i=x_i-a_ik*x_k (with k<i) */
212 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) {
213 k=A_p->pattern->index[iptr_ik];
214 /* printf("row %d : add %d\n",i,k); */
215 Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
216 }
217 Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
218 /* printf("row %d : multipy by \n",i); */
219 }
220 return;
221 } else if (n_block==2) {
222 Paso_BlockOps_MV_2(&x[0], &diag[0], &x[0]);
223 for (i = 1; i < n; ++i) {
224 mm=ptr_main[i];
225
226 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) {
227 k=A_p->pattern->index[iptr_ik];
228 Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
229 }
230 Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]);
231 }
232 } else if (n_block==3) {
233 Paso_BlockOps_MV_3(&x[0], &diag[0], &x[0]);
234 for (i = 1; i < n; ++i) {
235 mm=ptr_main[i];
236 /* x_i=x_i-a_ik*x_k */
237 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) {
238 k=A_p->pattern->index[iptr_ik];
239 Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
240 }
241 Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]);
242 }
243
244 } /* add block size >3 */
245
246
247
248 /* backward substitution */
249
250 if (n_block==1) {
251
252 for (i = n-2; i > -1; ++i) {
253 mm=ptr_main[i];
254 Paso_BlockOps_MV_1(&x[i], &A_p->val[mm], &x[i]);
255 for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) {
256 k=A_p->pattern->index[iptr_ik];
257 Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
258 }
259 Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
260 }
261
262 } else if (n_block==2) {
263 for (i = n-2; i > -1; ++i) {
264 mm=ptr_main[i];
265 Paso_BlockOps_MV_2(&x[2*i], &A_p->val[4*mm], &x[2*i]);
266 for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) {
267 k=A_p->pattern->index[iptr_ik];
268 Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
269 }
270 Paso_BlockOps_MV_2(&x[2*i], &diag[i*4], &x[2*i]);
271 }
272 } else if (n_block==3) {
273 for (i = n-2; i > -1; ++i) {
274 mm=ptr_main[i];
275 Paso_BlockOps_MV_3(&x[3*i], &A_p->val[9*mm], &x[3*i]);
276
277 for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) {
278 k=A_p->pattern->index[iptr_ik];
279 Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
280 }
281 Paso_BlockOps_MV_3(&x[3*i], &diag[i*9], &x[3*i]);
282 }
283
284 } /* add block size >3 */
285
286 return;
287 }
288
289 void Paso_Solver_localGSSweep_colored(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x)
290 {
291 const dim_t n=A_p->numRows;
292 const dim_t n_block=A_p->row_block_size;
293 const double *diag = gs->diag;
294 index_t* pivot = gs->pivot;
295 const dim_t block_size=A_p->block_size;
296
297 register dim_t i,k;
298 register index_t color,iptr_ik, mm;
299
300 const index_t* coloring = Paso_Pattern_borrowColoringPointer(A_p->pattern);
301 const dim_t num_colors = Paso_Pattern_getNumColors(A_p->pattern);
302 const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
303
304 /* forward substitution */
305
306 /* color = 0 */
307 if (n_block==1) {
308 #pragma omp parallel for schedule(static) private(i)
309 for (i = 0; i < n; ++i) {
310 if (coloring[i]== 0 ) Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
311 }
312 } else if (n_block==2) {
313 #pragma omp parallel for schedule(static) private(i)
314 for (i = 0; i < n; ++i) {
315 if (coloring[i]== 0 ) Paso_BlockOps_MV_2(&x[2*i], &diag[i*4], &x[2*i]);
316 }
317 } else if (n_block==3) {
318 #pragma omp parallel for schedule(static) private(i)
319 for (i = 0; i < n; ++i) {
320 if (coloring[i]==0) Paso_BlockOps_MV_3(&x[3*i], &diag[i*9], &x[3*i]);
321 }
322 } else {
323 #pragma omp parallel for schedule(static) private(i)
324 for (i = 0; i < n; ++i) {
325 if (coloring[i]==0) Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]);
326 }
327 }
328
329 for (color=1;color<num_colors;++color) {
330 if (n_block==1) {
331 #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
332 for (i = 0; i < n; ++i) {
333 if (coloring[i]==color) {
334 /* x_i=x_i-a_ik*x_k */
335 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
336 k=A_p->pattern->index[iptr_ik];
337 if (coloring[k]<color) Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
338 }
339 Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
340 }
341 }
342 } else if (n_block==2) {
343 #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
344 for (i = 0; i < n; ++i) {
345 if (coloring[i]==color) {
346 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
347 k=A_p->pattern->index[iptr_ik];
348 if (coloring[k]<color) Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
349 }
350 Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]);
351 }
352
353 }
354 } else if (n_block==3) {
355 #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
356 for (i = 0; i < n; ++i) {
357 if (coloring[i]==color) {
358 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
359 k=A_p->pattern->index[iptr_ik];
360 if (coloring[k]<color) Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
361 }
362 Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]);
363 }
364 }
365 } else {
366 #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
367 for (i = 0; i < n; ++i) {
368 if (coloring[i] == color) {
369 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
370 k=A_p->pattern->index[iptr_ik];
371 if (coloring[k]<color) Paso_BlockOps_SMV_N(n_block, &x[n_block*i], &A_p->val[block_size*iptr_ik], &x[n_block*k]);
372 }
373 Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]);
374 }
375 }
376 }
377 } /* end of coloring loop */
378
379
380 /* backward substitution */
381 for (color=(num_colors)-2 ;color>-1;--color) { /* Note: color=(num_colors)-1 is not required */
382 if (n_block==1) {
383 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
384 for (i = 0; i < n; ++i) {
385 if (coloring[i]==color) {
386 mm=ptr_main[i];
387 Paso_BlockOps_MV_1(&x[i], &A_p->val[mm], &x[i]);
388 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
389 k=A_p->pattern->index[iptr_ik];
390 if (coloring[k]>color) Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
391 }
392 Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
393 }
394 }
395 } else if (n_block==2) {
396 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
397 for (i = 0; i < n; ++i) {
398 if (coloring[i]==color) {
399 mm=ptr_main[i];
400 Paso_BlockOps_MV_2(&x[2*i], &A_p->val[4*mm], &x[2*i]);
401 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
402 k=A_p->pattern->index[iptr_ik];
403 if (coloring[k]>color) Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
404 }
405 Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]);
406 }
407 }
408 } else if (n_block==3) {
409 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
410 for (i = 0; i < n; ++i) {
411 if (coloring[i]==color) {
412 mm=ptr_main[i];
413 Paso_BlockOps_MV_3(&x[3*i], &A_p->val[9*mm], &x[3*i]);
414 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
415 k=A_p->pattern->index[iptr_ik];
416 if (coloring[k]>color) Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
417 }
418 Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]);
419 }
420 }
421 } else {
422 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
423 for (i = 0; i < n; ++i) {
424 if (coloring[i]==color) {
425 mm=ptr_main[i];
426 Paso_BlockOps_MV_N( n_block, &x[n_block*i], &A_p->val[block_size*mm], &x[n_block*i] );
427 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
428 k=A_p->pattern->index[iptr_ik];
429 if (coloring[k]>color) Paso_BlockOps_SMV_N(n_block, &x[n_block*i], &A_p->val[block_size*iptr_ik], &x[n_block*k]);
430 }
431 Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]);
432 }
433 }
434 }
435 }
436 return;
437 }
438
439
440

  ViewVC Help
Powered by ViewVC 1.1.26