/[escript]/trunk/paso/src/Smoother.c
ViewVC logotype

Annotation of /trunk/paso/src/Smoother.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3098 - (hide annotations)
Fri Aug 20 08:08:27 2010 UTC (8 years, 8 months ago) by gross
Original Path: trunk/paso/src/Solver_GS.c
File MIME type: text/plain
File size: 14344 byte(s)
fix to the GaussSeidel
1 jfenwick 1851
2     /*******************************************************
3     *
4 jfenwick 2881 * Copyright (c) 2003-2010 by University of Queensland
5 jfenwick 1851 * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13    
14    
15     /**************************************************************/
16    
17 gross 3097 /* Paso: Gauss-Seidel */
18 jfenwick 1851
19     /**************************************************************/
20    
21     /* Author: artak@uq.edu.au */
22    
23     /**************************************************************/
24    
25     #include "Paso.h"
26     #include "Solver.h"
27     #include "PasoUtil.h"
28 gross 3097 #include "BlockOps.h"
29 jfenwick 1851
30     #include <stdio.h>
31    
32 gross 3094
33 jfenwick 1851 /**************************************************************/
34    
35     /* free all memory used by GS */
36    
37     void Paso_Solver_GS_free(Paso_Solver_GS * in) {
38     if (in!=NULL) {
39 gross 3094 Paso_Solver_LocalGS_free(in->localGS);
40 jfenwick 1851 MEMFREE(in);
41     }
42     }
43 gross 3094 void Paso_Solver_LocalGS_free(Paso_Solver_LocalGS * in) {
44     if (in!=NULL) {
45     MEMFREE(in->diag);
46     MEMFREE(in->pivot);
47 gross 3097 MEMFREE(in->buffer);
48 gross 3094 MEMFREE(in);
49     }
50     }
51 jfenwick 1851 /**************************************************************/
52    
53 gross 3094 /* constructs the symmetric Gauss-Seidel preconditioner
54 jfenwick 1851
55     */
56 gross 3094 Paso_Solver_GS* Paso_Solver_getGS(Paso_SystemMatrix * A_p, dim_t sweeps, bool_t is_local, bool_t verbose)
57     {
58    
59 jfenwick 1851 /* allocations: */
60     Paso_Solver_GS* out=MEMALLOC(1,Paso_Solver_GS);
61 gross 3094 if (! Paso_checkPtr(out)) {
62     out->localGS=Paso_Solver_getLocalGS(A_p->mainBlock,sweeps,verbose);
63     out->is_local=is_local;
64 jfenwick 1851 }
65 gross 3094 if (Paso_MPIInfo_noError(A_p->mpi_info)) {
66 jfenwick 1851 return out;
67 gross 3094 } else {
68 jfenwick 1851 Paso_Solver_GS_free(out);
69     return NULL;
70     }
71     }
72 gross 3094 Paso_Solver_LocalGS* Paso_Solver_getLocalGS(Paso_SparseMatrix * A_p, dim_t sweeps, bool_t verbose) {
73    
74     dim_t n=A_p->numRows;
75     dim_t n_block=A_p->row_block_size;
76     dim_t block_size=A_p->block_size;
77    
78     double time0=Paso_timer();
79     /* allocations: */
80     Paso_Solver_LocalGS* out=MEMALLOC(1,Paso_Solver_LocalGS);
81     if (! Paso_checkPtr(out)) {
82    
83     out->diag=MEMALLOC( ((size_t) n) * ((size_t) block_size),double);
84     out->pivot=MEMALLOC( ((size_t) n) * ((size_t) n_block), index_t);
85 gross 3097 out->buffer=MEMALLOC( ((size_t) n) * ((size_t) n_block), double);
86 gross 3094 out->sweeps=sweeps;
87    
88     if ( ! ( Paso_checkPtr(out->diag) || Paso_checkPtr(out->pivot) ) ) {
89     Paso_SparseMatrix_invMain(A_p, out->diag, out->pivot );
90     }
91    
92     }
93     time0=Paso_timer()-time0;
94    
95     if (Paso_noError()) {
96     if (verbose) printf("timing: Gauss-Seidel preparation: elemination : %e\n",time0);
97     return out;
98     } else {
99     Paso_Solver_LocalGS_free(out);
100     return NULL;
101     }
102     }
103 jfenwick 1851
104 gross 3097 /*
105 jfenwick 1851
106 gross 3097 performs a few sweeps of the from
107 jfenwick 1851
108 gross 3097 S (x_{k} - x_{k-1}) = b - A x_{k-1}
109 jfenwick 1851
110 gross 3097 where x_{0}=0 and S provides some approximatioon of A.
111 jfenwick 1851
112 gross 3097 Under MPI S is build on using A_p->mainBlock only.
113     if GS is local the defect b - A x_{k-1} is calculated using A_p->mainBlock only.
114 gross 3094
115 jfenwick 1851 */
116    
117 gross 3094 void Paso_Solver_solveGS(Paso_SystemMatrix* A_p, Paso_Solver_GS * gs, double * x, const double * b)
118     {
119     register dim_t i;
120 gross 3097 const dim_t n= (A_p->mainBlock->numRows) * (A_p->mainBlock->row_block_size);
121    
122     double *b_new = gs->localGS->buffer;
123 gross 3094 dim_t sweeps=gs->localGS->sweeps;
124    
125 gross 3097 if (gs->is_local) {
126     Paso_Solver_solveLocalGS(A_p->mainBlock,gs->localGS,x,b);
127     } else {
128     #pragma omp parallel for private(i) schedule(static)
129     for (i=0;i<n;++i) x[i]=b[i];
130    
131     Paso_Solver_localGSSweep(A_p->mainBlock,gs->localGS,x);
132     while (sweeps > 0 ) {
133     #pragma omp parallel for private(i) schedule(static)
134     for (i=0;i<n;++i) b_new[i]=b[i];
135    
136     Paso_SystemMatrix_MatrixVector((-1.), A_p, x, 1., b_new); /* b_new = b - A*x */
137    
138     Paso_Solver_localGSSweep(A_p->mainBlock,gs->localGS,b_new);
139    
140     #pragma omp parallel for private(i) schedule(static)
141     for (i=0;i<n;++i) x[i]+=b_new[i];
142    
143     sweeps--;
144     }
145    
146 gross 3094 }
147 gross 3097 }
148     void Paso_Solver_solveLocalGS(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x, const double * b)
149     {
150     register dim_t i;
151     const dim_t n= (A_p->numRows) * (A_p->row_block_size);
152     double *b_new = gs->buffer;
153     dim_t sweeps=gs->sweeps;
154 gross 3094
155 gross 3097 #pragma omp parallel for private(i) schedule(static)
156     for (i=0;i<n;++i) x[i]=b[i];
157     Paso_Solver_localGSSweep(A_p,gs,x);
158 gross 3094
159 gross 3097 while (sweeps > 0 ) {
160 gross 3094 #pragma omp parallel for private(i) schedule(static)
161 gross 3097 for (i=0;i<n;++i) b_new[i]=b[i];
162 gross 3094
163 gross 3097 Paso_SparseMatrix_MatrixVector_CSC_OFFSET0((-1.), A_p, x, 1., b_new); /* b_new = b - A*x */
164    
165     Paso_Solver_localGSSweep(A_p,gs,b_new);
166    
167     #pragma omp parallel for private(i) schedule(static)
168     for (i=0;i<n;++i) x[i]+=b_new[i];
169    
170     sweeps--;
171 gross 3094 }
172     }
173 jfenwick 1851
174 gross 3097 void Paso_Solver_localGSSweep(Paso_SparseMatrix* A, Paso_Solver_LocalGS * gs, double * x)
175 gross 3094 {
176     #ifdef _OPENMP
177     const dim_t nt=omp_get_max_threads();
178     #else
179     const dim_t nt = 1;
180     #endif
181 gross 3098 if (nt < 2) {
182 gross 3097 Paso_Solver_localGSSweep_sequential(A,gs,x);
183     } else {
184     Paso_Solver_localGSSweep_colored(A,gs,x);
185 gross 3094 }
186 jfenwick 1851 }
187    
188 gross 3097 /* inplace Gauss-Seidel sweep in seqential mode: */
189 gross 3094
190 gross 3097 void Paso_Solver_localGSSweep_sequential(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x)
191     {
192     const dim_t n=A_p->numRows;
193     const dim_t n_block=A_p->row_block_size;
194     const double *diag = gs->diag;
195     /* const index_t* pivot = gs->pivot;
196     const dim_t block_size=A_p->block_size; use for block size >3*/
197 gross 3094
198 gross 3097 register dim_t i,k;
199     register index_t iptr_ik, mm;
200    
201     const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
202 gross 3098
203 gross 3097 /* forward substitution */
204    
205     if (n_block==1) {
206 gross 3098
207     Paso_BlockOps_MV_1(&x[0], &diag[0], &x[0]);
208    
209     for (i = 1; i < n; ++i) {
210 gross 3097 mm=ptr_main[i];
211     /* x_i=x_i-a_ik*x_k (with k<i) */
212     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) {
213 gross 3098 k=A_p->pattern->index[iptr_ik];
214     /* printf("row %d : add %d\n",i,k); */
215 gross 3097 Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
216     }
217 gross 3098 Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
218     /* printf("row %d : multipy by \n",i); */
219 gross 3097 }
220 gross 3098 return;
221 gross 3097 } else if (n_block==2) {
222 gross 3098 Paso_BlockOps_MV_2(&x[0], &diag[0], &x[0]);
223     for (i = 1; i < n; ++i) {
224 gross 3097 mm=ptr_main[i];
225    
226     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) {
227     k=A_p->pattern->index[iptr_ik];
228     Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
229     }
230 gross 3098 Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]);
231 gross 3097 }
232     } else if (n_block==3) {
233 gross 3098 Paso_BlockOps_MV_3(&x[0], &diag[0], &x[0]);
234     for (i = 1; i < n; ++i) {
235 gross 3097 mm=ptr_main[i];
236     /* x_i=x_i-a_ik*x_k */
237     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<mm; ++iptr_ik) {
238     k=A_p->pattern->index[iptr_ik];
239     Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
240     }
241 gross 3098 Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]);
242 gross 3097 }
243    
244     } /* add block size >3 */
245    
246    
247    
248     /* backward substitution */
249    
250     if (n_block==1) {
251 gross 3098
252 gross 3097 for (i = n-2; i > -1; ++i) {
253 gross 3098 mm=ptr_main[i];
254     Paso_BlockOps_MV_1(&x[i], &A_p->val[mm], &x[i]);
255     for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) {
256     k=A_p->pattern->index[iptr_ik];
257     Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
258     }
259     Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
260 gross 3097 }
261 gross 3098
262 gross 3097 } else if (n_block==2) {
263     for (i = n-2; i > -1; ++i) {
264 gross 3098 mm=ptr_main[i];
265     Paso_BlockOps_MV_2(&x[2*i], &A_p->val[4*mm], &x[2*i]);
266     for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) {
267     k=A_p->pattern->index[iptr_ik];
268     Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
269     }
270     Paso_BlockOps_MV_2(&x[2*i], &diag[i*4], &x[2*i]);
271 gross 3097 }
272     } else if (n_block==3) {
273     for (i = n-2; i > -1; ++i) {
274 gross 3098 mm=ptr_main[i];
275     Paso_BlockOps_MV_3(&x[3*i], &A_p->val[9*mm], &x[3*i]);
276 gross 3097
277 gross 3098 for (iptr_ik=mm+1; iptr_ik < A_p->pattern->ptr[i+1]; ++iptr_ik) {
278     k=A_p->pattern->index[iptr_ik];
279     Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
280     }
281     Paso_BlockOps_MV_3(&x[3*i], &diag[i*9], &x[3*i]);
282 gross 3097 }
283    
284     } /* add block size >3 */
285    
286     return;
287     }
288 gross 3094
289 gross 3097 void Paso_Solver_localGSSweep_colored(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x)
290 gross 3094 {
291     const dim_t n=A_p->numRows;
292     const dim_t n_block=A_p->row_block_size;
293     const double *diag = gs->diag;
294 gross 3097 index_t* pivot = gs->pivot;
295     const dim_t block_size=A_p->block_size;
296 gross 3094
297     register dim_t i,k;
298     register index_t color,iptr_ik, mm;
299    
300     const index_t* coloring = Paso_Pattern_borrowColoringPointer(A_p->pattern);
301     const dim_t num_colors = Paso_Pattern_getNumColors(A_p->pattern);
302     const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
303    
304     /* forward substitution */
305    
306     /* color = 0 */
307     if (n_block==1) {
308 gross 3097 #pragma omp parallel for schedule(static) private(i)
309 gross 3094 for (i = 0; i < n; ++i) {
310 gross 3097 if (coloring[i]== 0 ) Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
311     }
312 gross 3094 } else if (n_block==2) {
313 gross 3097 #pragma omp parallel for schedule(static) private(i)
314 gross 3094 for (i = 0; i < n; ++i) {
315 gross 3097 if (coloring[i]== 0 ) Paso_BlockOps_MV_2(&x[2*i], &diag[i*4], &x[2*i]);
316 gross 3094 }
317 gross 3097 } else if (n_block==3) {
318     #pragma omp parallel for schedule(static) private(i)
319 gross 3094 for (i = 0; i < n; ++i) {
320 gross 3097 if (coloring[i]==0) Paso_BlockOps_MV_3(&x[3*i], &diag[i*9], &x[3*i]);
321 gross 3094 }
322 gross 3097 } else {
323     #pragma omp parallel for schedule(static) private(i)
324     for (i = 0; i < n; ++i) {
325     if (coloring[i]==0) Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]);
326     }
327     }
328 gross 3094
329     for (color=1;color<num_colors;++color) {
330     if (n_block==1) {
331 gross 3097 #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
332 gross 3094 for (i = 0; i < n; ++i) {
333     if (coloring[i]==color) {
334     /* x_i=x_i-a_ik*x_k */
335     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
336     k=A_p->pattern->index[iptr_ik];
337 gross 3097 if (coloring[k]<color) Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
338 gross 3094 }
339 gross 3097 Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
340 gross 3094 }
341     }
342     } else if (n_block==2) {
343 gross 3097 #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
344 gross 3094 for (i = 0; i < n; ++i) {
345     if (coloring[i]==color) {
346     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
347     k=A_p->pattern->index[iptr_ik];
348 gross 3097 if (coloring[k]<color) Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
349 gross 3094 }
350 gross 3097 Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]);
351 gross 3094 }
352    
353     }
354     } else if (n_block==3) {
355 gross 3097 #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
356 gross 3094 for (i = 0; i < n; ++i) {
357     if (coloring[i]==color) {
358     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
359     k=A_p->pattern->index[iptr_ik];
360 gross 3097 if (coloring[k]<color) Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
361 gross 3094 }
362 gross 3097 Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]);
363 gross 3094 }
364     }
365 gross 3097 } else {
366     #pragma omp parallel for schedule(static) private(i,iptr_ik,k)
367     for (i = 0; i < n; ++i) {
368     if (coloring[i] == color) {
369     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
370     k=A_p->pattern->index[iptr_ik];
371     if (coloring[k]<color) Paso_BlockOps_SMV_N(n_block, &x[n_block*i], &A_p->val[block_size*iptr_ik], &x[n_block*k]);
372     }
373     Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]);
374     }
375     }
376     }
377 gross 3094 } /* end of coloring loop */
378    
379    
380     /* backward substitution */
381     for (color=(num_colors)-2 ;color>-1;--color) { /* Note: color=(num_colors)-1 is not required */
382     if (n_block==1) {
383 gross 3097 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
384 gross 3094 for (i = 0; i < n; ++i) {
385     if (coloring[i]==color) {
386     mm=ptr_main[i];
387 gross 3097 Paso_BlockOps_MV_1(&x[i], &A_p->val[mm], &x[i]);
388 gross 3094 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
389     k=A_p->pattern->index[iptr_ik];
390 gross 3097 if (coloring[k]>color) Paso_BlockOps_SMV_1(&x[i], &A_p->val[iptr_ik], &x[k]);
391 gross 3094 }
392 gross 3097 Paso_BlockOps_MV_1(&x[i], &diag[i], &x[i]);
393 gross 3094 }
394     }
395     } else if (n_block==2) {
396 gross 3097 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
397 gross 3094 for (i = 0; i < n; ++i) {
398     if (coloring[i]==color) {
399     mm=ptr_main[i];
400 gross 3097 Paso_BlockOps_MV_2(&x[2*i], &A_p->val[4*mm], &x[2*i]);
401 gross 3094 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
402     k=A_p->pattern->index[iptr_ik];
403 gross 3097 if (coloring[k]>color) Paso_BlockOps_SMV_2(&x[2*i], &A_p->val[4*iptr_ik], &x[2*k]);
404 gross 3094 }
405 gross 3097 Paso_BlockOps_MV_2(&x[2*i], &diag[4*i], &x[2*i]);
406 gross 3094 }
407     }
408     } else if (n_block==3) {
409 gross 3097 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
410 gross 3094 for (i = 0; i < n; ++i) {
411     if (coloring[i]==color) {
412     mm=ptr_main[i];
413 gross 3097 Paso_BlockOps_MV_3(&x[3*i], &A_p->val[9*mm], &x[3*i]);
414 gross 3094 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
415     k=A_p->pattern->index[iptr_ik];
416 gross 3097 if (coloring[k]>color) Paso_BlockOps_SMV_3(&x[3*i], &A_p->val[9*iptr_ik], &x[3*k]);
417 gross 3094 }
418 gross 3097 Paso_BlockOps_MV_3(&x[3*i], &diag[9*i], &x[3*i]);
419 gross 3094 }
420     }
421 gross 3097 } else {
422     #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k)
423     for (i = 0; i < n; ++i) {
424     if (coloring[i]==color) {
425     mm=ptr_main[i];
426     Paso_BlockOps_MV_N( n_block, &x[n_block*i], &A_p->val[block_size*mm], &x[n_block*i] );
427     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
428     k=A_p->pattern->index[iptr_ik];
429     if (coloring[k]>color) Paso_BlockOps_SMV_N(n_block, &x[n_block*i], &A_p->val[block_size*iptr_ik], &x[n_block*k]);
430     }
431     Paso_BlockOps_Solve_N(n_block, &x[n_block*i], &diag[block_size*i], &pivot[n_block*i], &x[n_block*i]);
432     }
433     }
434     }
435 gross 3094 }
436     return;
437     }
438    
439    
440    

  ViewVC Help
Powered by ViewVC 1.1.26