/[escript]/trunk/paso/src/ILU.c
ViewVC logotype

Contents of /trunk/paso/src/ILU.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3259 - (show annotations)
Mon Oct 11 01:48:14 2010 UTC (9 years ago) by jfenwick
File MIME type: text/plain
File size: 24550 byte(s)
Merging dudley and scons updates from branches

1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: ILU preconditioner with reordering */
18
19 /**************************************************************/
20
21 /* Copyrights by ACcESS Australia 2003,2004,2005 */
22 /* Author: Lutz Gross, l.gross@uq.edu.au */
23
24 /**************************************************************/
25
26 #include "Paso.h"
27 #include "Preconditioner.h"
28 #include "PasoUtil.h"
29
30 /**************************************************************/
31
32 /* free all memory used by ILU */
33
34 void Paso_Solver_ILU_free(Paso_Solver_ILU * in) {
35 if (in!=NULL) {
36 MEMFREE(in->factors);
37 MEMFREE(in);
38 }
39 }
40
41 /**************************************************************/
42
43 /* constructs the incomplete block factorization of
44
45 */
46 Paso_Solver_ILU* Paso_Solver_getILU(Paso_SparseMatrix * A,bool_t verbose) {
47 const dim_t n=A->numRows;
48 const dim_t n_block=A->row_block_size;
49 const index_t* colorOf = Paso_Pattern_borrowColoringPointer(A->pattern);
50 const dim_t num_colors = Paso_Pattern_getNumColors(A->pattern);
51 const index_t *ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A);
52 register double A11,A12,A13,A21,A22,A23,A31,A32,A33,D;
53 register double S11,S12,S13,S21,S22,S23,S31,S32,S33;
54 register index_t i,iptr_main,iptr_ik,k,iptr_kj,j,iptr_ij,color,color2, iptr;
55 double time0=0,time_fac=0;
56 /* allocations: */
57 Paso_Solver_ILU* out=MEMALLOC(1,Paso_Solver_ILU);
58 if (Esys_checkPtr(out)) return NULL;
59 out->factors=MEMALLOC(A->len,double);
60
61 if ( ! Esys_checkPtr(out->factors) ) {
62
63 time0=Esys_timer();
64
65 #pragma omp parallel for schedule(static) private(i,iptr,k)
66 for (i = 0; i < n; ++i) {
67 for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; iptr++) {
68 for (k=0;k<n_block*n_block;++k) out->factors[n_block*n_block*iptr+k]=A->val[n_block*n_block*iptr+k];
69 }
70 }
71 /* start factorization */
72 for (color=0;color<num_colors && Esys_noError();++color) {
73 if (n_block==1) {
74 #pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,j,iptr_ij,A11,iptr_main,D)
75 for (i = 0; i < n; ++i) {
76 if (colorOf[i]==color) {
77 for (color2=0;color2<color;++color2) {
78 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
79 k=A->pattern->index[iptr_ik];
80 if (colorOf[k]==color2) {
81 A11=out->factors[iptr_ik];
82 /* a_ij=a_ij-a_ik*a_kj */
83 for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) {
84 j=A->pattern->index[iptr_kj];
85 if (colorOf[j]>color2) {
86 S11=out->factors[iptr_kj];
87 for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) {
88 if (j==A->pattern->index[iptr_ij]) {
89 out->factors[iptr_ij]-=A11*S11;
90 break;
91 }
92 }
93 }
94 }
95 }
96 }
97 }
98 iptr_main=ptr_main[i];
99 D=out->factors[iptr_main];
100 if (ABS(D)>0.) {
101 D=1./D;
102 out->factors[iptr_main]=D;
103 /* a_ik=a_ii^{-1}*a_ik */
104 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
105 k=A->pattern->index[iptr_ik];
106 if (colorOf[k]>color) {
107 A11=out->factors[iptr_ik];
108 out->factors[iptr_ik]=A11*D;
109 }
110 }
111 } else {
112 Esys_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block.");
113 }
114 }
115 }
116 } else if (n_block==2) {
117 #pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,S21,S12,S22,j,iptr_ij,A11,A21,A12,A22,iptr_main,D)
118 for (i = 0; i < n; ++i) {
119 if (colorOf[i]==color) {
120 for (color2=0;color2<color;++color2) {
121 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
122 k=A->pattern->index[iptr_ik];
123 if (colorOf[k]==color2) {
124 A11=out->factors[iptr_ik*4 ];
125 A21=out->factors[iptr_ik*4+1];
126 A12=out->factors[iptr_ik*4+2];
127 A22=out->factors[iptr_ik*4+3];
128 /* a_ij=a_ij-a_ik*a_kj */
129 for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) {
130 j=A->pattern->index[iptr_kj];
131 if (colorOf[j]>color2) {
132 S11=out->factors[iptr_kj*4];
133 S21=out->factors[iptr_kj*4+1];
134 S12=out->factors[iptr_kj*4+2];
135 S22=out->factors[iptr_kj*4+3];
136 for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) {
137 if (j==A->pattern->index[iptr_ij]) {
138 out->factors[4*iptr_ij ]-=A11*S11+A12*S21;
139 out->factors[4*iptr_ij+1]-=A21*S11+A22*S21;
140 out->factors[4*iptr_ij+2]-=A11*S12+A12*S22;
141 out->factors[4*iptr_ij+3]-=A21*S12+A22*S22;
142 break;
143 }
144 }
145 }
146 }
147 }
148 }
149 }
150 iptr_main=ptr_main[i];
151 A11=out->factors[iptr_main*4];
152 A21=out->factors[iptr_main*4+1];
153 A12=out->factors[iptr_main*4+2];
154 A22=out->factors[iptr_main*4+3];
155 D = A11*A22-A12*A21;
156 if (ABS(D)>0.) {
157 D=1./D;
158 S11= A22*D;
159 S21=-A21*D;
160 S12=-A12*D;
161 S22= A11*D;
162 out->factors[iptr_main*4] = S11;
163 out->factors[iptr_main*4+1]= S21;
164 out->factors[iptr_main*4+2]= S12;
165 out->factors[iptr_main*4+3]= S22;
166 /* a_ik=a_ii^{-1}*a_ik */
167 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
168 k=A->pattern->index[iptr_ik];
169 if (colorOf[k]>color) {
170 A11=out->factors[iptr_ik*4 ];
171 A21=out->factors[iptr_ik*4+1];
172 A12=out->factors[iptr_ik*4+2];
173 A22=out->factors[iptr_ik*4+3];
174 out->factors[4*iptr_ik ]=S11*A11+S12*A21;
175 out->factors[4*iptr_ik+1]=S21*A11+S22*A21;
176 out->factors[4*iptr_ik+2]=S11*A12+S12*A22;
177 out->factors[4*iptr_ik+3]=S21*A12+S22*A22;
178 }
179 }
180 } else {
181 Esys_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block.");
182 }
183 }
184 }
185 } else if (n_block==3) {
186 #pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,S21,S31,S12,S22,S32,S13,S23,S33,j,iptr_ij,A11,A21,A31,A12,A22,A32,A13,A23,A33,iptr_main,D)
187 for (i = 0; i < n; ++i) {
188 if (colorOf[i]==color) {
189 for (color2=0;color2<color;++color2) {
190 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
191 k=A->pattern->index[iptr_ik];
192 if (colorOf[k]==color2) {
193 A11=out->factors[iptr_ik*9 ];
194 A21=out->factors[iptr_ik*9+1];
195 A31=out->factors[iptr_ik*9+2];
196 A12=out->factors[iptr_ik*9+3];
197 A22=out->factors[iptr_ik*9+4];
198 A32=out->factors[iptr_ik*9+5];
199 A13=out->factors[iptr_ik*9+6];
200 A23=out->factors[iptr_ik*9+7];
201 A33=out->factors[iptr_ik*9+8];
202 /* a_ij=a_ij-a_ik*a_kj */
203 for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) {
204 j=A->pattern->index[iptr_kj];
205 if (colorOf[j]>color2) {
206 S11=out->factors[iptr_kj*9 ];
207 S21=out->factors[iptr_kj*9+1];
208 S31=out->factors[iptr_kj*9+2];
209 S12=out->factors[iptr_kj*9+3];
210 S22=out->factors[iptr_kj*9+4];
211 S32=out->factors[iptr_kj*9+5];
212 S13=out->factors[iptr_kj*9+6];
213 S23=out->factors[iptr_kj*9+7];
214 S33=out->factors[iptr_kj*9+8];
215 for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) {
216 if (j==A->pattern->index[iptr_ij]) {
217 out->factors[iptr_ij*9 ]-=A11*S11+A12*S21+A13*S31;
218 out->factors[iptr_ij*9+1]-=A21*S11+A22*S21+A23*S31;
219 out->factors[iptr_ij*9+2]-=A31*S11+A32*S21+A33*S31;
220 out->factors[iptr_ij*9+3]-=A11*S12+A12*S22+A13*S32;
221 out->factors[iptr_ij*9+4]-=A21*S12+A22*S22+A23*S32;
222 out->factors[iptr_ij*9+5]-=A31*S12+A32*S22+A33*S32;
223 out->factors[iptr_ij*9+6]-=A11*S13+A12*S23+A13*S33;
224 out->factors[iptr_ij*9+7]-=A21*S13+A22*S23+A23*S33;
225 out->factors[iptr_ij*9+8]-=A31*S13+A32*S23+A33*S33;
226 break;
227 }
228 }
229 }
230 }
231 }
232 }
233 }
234 iptr_main=ptr_main[i];
235 A11=out->factors[iptr_main*9 ];
236 A21=out->factors[iptr_main*9+1];
237 A31=out->factors[iptr_main*9+2];
238 A12=out->factors[iptr_main*9+3];
239 A22=out->factors[iptr_main*9+4];
240 A32=out->factors[iptr_main*9+5];
241 A13=out->factors[iptr_main*9+6];
242 A23=out->factors[iptr_main*9+7];
243 A33=out->factors[iptr_main*9+8];
244 D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22);
245 if (ABS(D)>0.) {
246 D=1./D;
247 S11=(A22*A33-A23*A32)*D;
248 S21=(A31*A23-A21*A33)*D;
249 S31=(A21*A32-A31*A22)*D;
250 S12=(A13*A32-A12*A33)*D;
251 S22=(A11*A33-A31*A13)*D;
252 S32=(A12*A31-A11*A32)*D;
253 S13=(A12*A23-A13*A22)*D;
254 S23=(A13*A21-A11*A23)*D;
255 S33=(A11*A22-A12*A21)*D;
256
257 out->factors[iptr_main*9 ]=S11;
258 out->factors[iptr_main*9+1]=S21;
259 out->factors[iptr_main*9+2]=S31;
260 out->factors[iptr_main*9+3]=S12;
261 out->factors[iptr_main*9+4]=S22;
262 out->factors[iptr_main*9+5]=S32;
263 out->factors[iptr_main*9+6]=S13;
264 out->factors[iptr_main*9+7]=S23;
265 out->factors[iptr_main*9+8]=S33;
266
267 /* a_ik=a_ii^{-1}*a_ik */
268 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
269 k=A->pattern->index[iptr_ik];
270 if (colorOf[k]>color) {
271 A11=out->factors[iptr_ik*9 ];
272 A21=out->factors[iptr_ik*9+1];
273 A31=out->factors[iptr_ik*9+2];
274 A12=out->factors[iptr_ik*9+3];
275 A22=out->factors[iptr_ik*9+4];
276 A32=out->factors[iptr_ik*9+5];
277 A13=out->factors[iptr_ik*9+6];
278 A23=out->factors[iptr_ik*9+7];
279 A33=out->factors[iptr_ik*9+8];
280 out->factors[iptr_ik*9 ]=S11*A11+S12*A21+S13*A31;
281 out->factors[iptr_ik*9+1]=S21*A11+S22*A21+S23*A31;
282 out->factors[iptr_ik*9+2]=S31*A11+S32*A21+S33*A31;
283 out->factors[iptr_ik*9+3]=S11*A12+S12*A22+S13*A32;
284 out->factors[iptr_ik*9+4]=S21*A12+S22*A22+S23*A32;
285 out->factors[iptr_ik*9+5]=S31*A12+S32*A22+S33*A32;
286 out->factors[iptr_ik*9+6]=S11*A13+S12*A23+S13*A33;
287 out->factors[iptr_ik*9+7]=S21*A13+S22*A23+S23*A33;
288 out->factors[iptr_ik*9+8]=S31*A13+S32*A23+S33*A33;
289 }
290 }
291 } else {
292 Esys_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block.");
293 }
294 }
295 }
296 } else {
297 Esys_setError(VALUE_ERROR, "Paso_Solver_getILU: block size greater than 3 is not supported.");
298 }
299 #pragma omp barrier
300 }
301 time_fac=Esys_timer()-time0;
302 }
303 if (Esys_noError()) {
304 if (verbose) printf("timing: ILU: coloring/elemination : %e sec\n",time_fac);
305 return out;
306 } else {
307 Paso_Solver_ILU_free(out);
308 return NULL;
309 }
310 }
311
312 /**************************************************************/
313
314 /* apply ILU precondition b-> x
315
316 in fact it solves LUx=b in the form x= U^{-1} L^{-1}b
317
318 should be called within a parallel region
319 barrier synconization should be performed to make sure that the input vector available
320
321 */
322
323 void Paso_Solver_solveILU(Paso_SparseMatrix * A, Paso_Solver_ILU * ilu, double * x, const double * b) {
324 register dim_t i,k;
325 register index_t color,iptr_ik,iptr_main;
326 register double S1,S2,S3,R1,R2,R3;
327 const dim_t n=A->numRows;
328 const dim_t n_block=A->row_block_size;
329 const index_t* colorOf = Paso_Pattern_borrowColoringPointer(A->pattern);
330 const dim_t num_colors = Paso_Pattern_getNumColors(A->pattern);
331 const index_t *ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A);
332
333
334 /* copy x into b*/
335 #pragma omp parallel for private(i) schedule(static)
336 for (i=0;i<n*n_block;++i) x[i]=b[i];
337 /* forward substitution */
338 for (color=0;color<num_colors;++color) {
339 if (n_block==1) {
340 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1,iptr_main)
341 for (i = 0; i < n; ++i) {
342 if (colorOf[i]==color) {
343 /* x_i=x_i-a_ik*x_k */
344 S1=x[i];
345 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
346 k=A->pattern->index[iptr_ik];
347 if (colorOf[k]<color) {
348 R1=x[k];
349 S1-=ilu->factors[iptr_ik]*R1;
350 }
351 }
352 iptr_main=ptr_main[i];
353 x[i]=ilu->factors[iptr_main]*S1;
354 }
355 }
356 } else if (n_block==2) {
357 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,iptr_main,S1,S2,R1,R2)
358 for (i = 0; i < n; ++i) {
359 if (colorOf[i]==color) {
360 /* x_i=x_i-a_ik*x_k */
361 S1=x[2*i];
362 S2=x[2*i+1];
363 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
364 k=A->pattern->index[iptr_ik];
365 if (colorOf[k]<color) {
366 R1=x[2*k];
367 R2=x[2*k+1];
368 S1-=ilu->factors[4*iptr_ik ]*R1+ilu->factors[4*iptr_ik+2]*R2;
369 S2-=ilu->factors[4*iptr_ik+1]*R1+ilu->factors[4*iptr_ik+3]*R2;
370 }
371 }
372 iptr_main=ptr_main[i];
373 x[2*i ]=ilu->factors[4*iptr_main ]*S1+ilu->factors[4*iptr_main+2]*S2;
374 x[2*i+1]=ilu->factors[4*iptr_main+1]*S1+ilu->factors[4*iptr_main+3]*S2;
375 }
376
377 }
378 } else if (n_block==3) {
379 #pragma omp parallel for schedule(static) private(i,iptr_ik,iptr_main,k,S1,S2,S3,R1,R2,R3)
380 for (i = 0; i < n; ++i) {
381 if (colorOf[i]==color) {
382 /* x_i=x_i-a_ik*x_k */
383 S1=x[3*i];
384 S2=x[3*i+1];
385 S3=x[3*i+2];
386 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
387 k=A->pattern->index[iptr_ik];
388 if (colorOf[k]<color) {
389 R1=x[3*k];
390 R2=x[3*k+1];
391 R3=x[3*k+2];
392 S1-=ilu->factors[9*iptr_ik ]*R1+ilu->factors[9*iptr_ik+3]*R2+ilu->factors[9*iptr_ik+6]*R3;
393 S2-=ilu->factors[9*iptr_ik+1]*R1+ilu->factors[9*iptr_ik+4]*R2+ilu->factors[9*iptr_ik+7]*R3;
394 S3-=ilu->factors[9*iptr_ik+2]*R1+ilu->factors[9*iptr_ik+5]*R2+ilu->factors[9*iptr_ik+8]*R3;
395 }
396 }
397 iptr_main=ptr_main[i];
398 x[3*i ]=ilu->factors[9*iptr_main ]*S1+ilu->factors[9*iptr_main+3]*S2+ilu->factors[9*iptr_main+6]*S3;
399 x[3*i+1]=ilu->factors[9*iptr_main+1]*S1+ilu->factors[9*iptr_main+4]*S2+ilu->factors[9*iptr_main+7]*S3;
400 x[3*i+2]=ilu->factors[9*iptr_main+2]*S1+ilu->factors[9*iptr_main+5]*S2+ilu->factors[9*iptr_main+8]*S3;
401 }
402 }
403 }
404 }
405 /* backward substitution */
406 for (color=(num_colors)-1;color>-1;--color) {
407 if (n_block==1) {
408 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1)
409 for (i = 0; i < n; ++i) {
410 if (colorOf[i]==color) {
411 /* x_i=x_i-a_ik*x_k */
412 S1=x[i];
413 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
414 k=A->pattern->index[iptr_ik];
415 if (colorOf[k]>color) {
416 R1=x[k];
417 S1-=ilu->factors[iptr_ik]*R1;
418 }
419 }
420 x[i]=S1;
421 }
422 }
423 } else if (n_block==2) {
424 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2)
425 for (i = 0; i < n; ++i) {
426 if (colorOf[i]==color) {
427 /* x_i=x_i-a_ik*x_k */
428 S1=x[2*i];
429 S2=x[2*i+1];
430 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
431 k=A->pattern->index[iptr_ik];
432 if (colorOf[k]>color) {
433 R1=x[2*k];
434 R2=x[2*k+1];
435 S1-=ilu->factors[4*iptr_ik ]*R1+ilu->factors[4*iptr_ik+2]*R2;
436 S2-=ilu->factors[4*iptr_ik+1]*R1+ilu->factors[4*iptr_ik+3]*R2;
437 }
438 }
439 x[2*i]=S1;
440 x[2*i+1]=S2;
441 }
442 }
443 } else if (n_block==3) {
444 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3)
445 for (i = 0; i < n; ++i) {
446 if (colorOf[i]==color) {
447 /* x_i=x_i-a_ik*x_k */
448 S1=x[3*i ];
449 S2=x[3*i+1];
450 S3=x[3*i+2];
451 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
452 k=A->pattern->index[iptr_ik];
453 if (colorOf[k]>color) {
454 R1=x[3*k];
455 R2=x[3*k+1];
456 R3=x[3*k+2];
457 S1-=ilu->factors[9*iptr_ik ]*R1+ilu->factors[9*iptr_ik+3]*R2+ilu->factors[9*iptr_ik+6]*R3;
458 S2-=ilu->factors[9*iptr_ik+1]*R1+ilu->factors[9*iptr_ik+4]*R2+ilu->factors[9*iptr_ik+7]*R3;
459 S3-=ilu->factors[9*iptr_ik+2]*R1+ilu->factors[9*iptr_ik+5]*R2+ilu->factors[9*iptr_ik+8]*R3;
460 }
461 }
462 x[3*i]=S1;
463 x[3*i+1]=S2;
464 x[3*i+2]=S3;
465 }
466 }
467 }
468 #pragma omp barrier
469 }
470 return;
471 }
472

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26