/[escript]/branches/doubleplusgood/paso/src/ILU.cpp
ViewVC logotype

Contents of /branches/doubleplusgood/paso/src/ILU.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4261 - (show annotations)
Wed Feb 27 06:09:33 2013 UTC (6 years, 1 month ago) by jfenwick
File size: 24720 byte(s)
Initial all c++ build.
But ... there are now reinterpret_cast<>'s
1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2003-2013 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development since 2012 by School of Earth Sciences
13 *
14 *****************************************************************************/
15
16
17 /************************************************************************************/
18
19 /* Paso: ILU preconditioner with reordering */
20
21 /************************************************************************************/
22
23 /* Copyrights by ACcESS Australia 2003,2004,2005 */
24 /* Author: Lutz Gross, l.gross@uq.edu.au */
25
26 /************************************************************************************/
27
28 #include "Paso.h"
29 #include "Preconditioner.h"
30 #include "PasoUtil.h"
31
32 /************************************************************************************/
33
34 /* free all memory used by ILU */
35
36 void Paso_Solver_ILU_free(Paso_Solver_ILU * in) {
37 if (in!=NULL) {
38 MEMFREE(in->factors);
39 MEMFREE(in);
40 }
41 }
42
43 /************************************************************************************/
44
45 /* constructs the incomplete block factorization */
46
47 Paso_Solver_ILU* Paso_Solver_getILU(Paso_SparseMatrix * A,bool_t verbose) {
48 const dim_t n=A->numRows;
49 const dim_t n_block=A->row_block_size;
50 const index_t* colorOf = Paso_Pattern_borrowColoringPointer(A->pattern);
51 const dim_t num_colors = Paso_Pattern_getNumColors(A->pattern);
52 const index_t *ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A);
53 register double A11,A12,A13,A21,A22,A23,A31,A32,A33,D;
54 register double S11,S12,S13,S21,S22,S23,S31,S32,S33;
55 register index_t i,iptr_main,iptr_ik,k,iptr_kj,j,iptr_ij,color,color2, iptr;
56 double time0=0,time_fac=0;
57 /* allocations: */
58 Paso_Solver_ILU* out=MEMALLOC(1,Paso_Solver_ILU);
59 if (Esys_checkPtr(out)) return NULL;
60 out->factors=MEMALLOC(A->len,double);
61
62 if ( ! Esys_checkPtr(out->factors) ) {
63
64 time0=Esys_timer();
65
66 #pragma omp parallel for schedule(static) private(i,iptr,k)
67 for (i = 0; i < n; ++i) {
68 for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; iptr++) {
69 for (k=0;k<n_block*n_block;++k) out->factors[n_block*n_block*iptr+k]=A->val[n_block*n_block*iptr+k];
70 }
71 }
72 /* start factorization */
73 for (color=0;color<num_colors && Esys_noError();++color) {
74 if (n_block==1) {
75 #pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,j,iptr_ij,A11,iptr_main,D)
76 for (i = 0; i < n; ++i) {
77 if (colorOf[i]==color) {
78 for (color2=0;color2<color;++color2) {
79 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
80 k=A->pattern->index[iptr_ik];
81 if (colorOf[k]==color2) {
82 A11=out->factors[iptr_ik];
83 /* a_ij=a_ij-a_ik*a_kj */
84 for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) {
85 j=A->pattern->index[iptr_kj];
86 if (colorOf[j]>color2) {
87 S11=out->factors[iptr_kj];
88 for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) {
89 if (j==A->pattern->index[iptr_ij]) {
90 out->factors[iptr_ij]-=A11*S11;
91 break;
92 }
93 }
94 }
95 }
96 }
97 }
98 }
99 iptr_main=ptr_main[i];
100 D=out->factors[iptr_main];
101 if (ABS(D)>0.) {
102 D=1./D;
103 out->factors[iptr_main]=D;
104 /* a_ik=a_ii^{-1}*a_ik */
105 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
106 k=A->pattern->index[iptr_ik];
107 if (colorOf[k]>color) {
108 A11=out->factors[iptr_ik];
109 out->factors[iptr_ik]=A11*D;
110 }
111 }
112 } else {
113 Esys_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block.");
114 }
115 }
116 }
117 } else if (n_block==2) {
118 #pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,S21,S12,S22,j,iptr_ij,A11,A21,A12,A22,iptr_main,D)
119 for (i = 0; i < n; ++i) {
120 if (colorOf[i]==color) {
121 for (color2=0;color2<color;++color2) {
122 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
123 k=A->pattern->index[iptr_ik];
124 if (colorOf[k]==color2) {
125 A11=out->factors[iptr_ik*4 ];
126 A21=out->factors[iptr_ik*4+1];
127 A12=out->factors[iptr_ik*4+2];
128 A22=out->factors[iptr_ik*4+3];
129 /* a_ij=a_ij-a_ik*a_kj */
130 for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) {
131 j=A->pattern->index[iptr_kj];
132 if (colorOf[j]>color2) {
133 S11=out->factors[iptr_kj*4];
134 S21=out->factors[iptr_kj*4+1];
135 S12=out->factors[iptr_kj*4+2];
136 S22=out->factors[iptr_kj*4+3];
137 for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) {
138 if (j==A->pattern->index[iptr_ij]) {
139 out->factors[4*iptr_ij ]-=A11*S11+A12*S21;
140 out->factors[4*iptr_ij+1]-=A21*S11+A22*S21;
141 out->factors[4*iptr_ij+2]-=A11*S12+A12*S22;
142 out->factors[4*iptr_ij+3]-=A21*S12+A22*S22;
143 break;
144 }
145 }
146 }
147 }
148 }
149 }
150 }
151 iptr_main=ptr_main[i];
152 A11=out->factors[iptr_main*4];
153 A21=out->factors[iptr_main*4+1];
154 A12=out->factors[iptr_main*4+2];
155 A22=out->factors[iptr_main*4+3];
156 D = A11*A22-A12*A21;
157 if (ABS(D)>0.) {
158 D=1./D;
159 S11= A22*D;
160 S21=-A21*D;
161 S12=-A12*D;
162 S22= A11*D;
163 out->factors[iptr_main*4] = S11;
164 out->factors[iptr_main*4+1]= S21;
165 out->factors[iptr_main*4+2]= S12;
166 out->factors[iptr_main*4+3]= S22;
167 /* a_ik=a_ii^{-1}*a_ik */
168 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
169 k=A->pattern->index[iptr_ik];
170 if (colorOf[k]>color) {
171 A11=out->factors[iptr_ik*4 ];
172 A21=out->factors[iptr_ik*4+1];
173 A12=out->factors[iptr_ik*4+2];
174 A22=out->factors[iptr_ik*4+3];
175 out->factors[4*iptr_ik ]=S11*A11+S12*A21;
176 out->factors[4*iptr_ik+1]=S21*A11+S22*A21;
177 out->factors[4*iptr_ik+2]=S11*A12+S12*A22;
178 out->factors[4*iptr_ik+3]=S21*A12+S22*A22;
179 }
180 }
181 } else {
182 Esys_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block.");
183 }
184 }
185 }
186 } else if (n_block==3) {
187 #pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,S21,S31,S12,S22,S32,S13,S23,S33,j,iptr_ij,A11,A21,A31,A12,A22,A32,A13,A23,A33,iptr_main,D)
188 for (i = 0; i < n; ++i) {
189 if (colorOf[i]==color) {
190 for (color2=0;color2<color;++color2) {
191 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
192 k=A->pattern->index[iptr_ik];
193 if (colorOf[k]==color2) {
194 A11=out->factors[iptr_ik*9 ];
195 A21=out->factors[iptr_ik*9+1];
196 A31=out->factors[iptr_ik*9+2];
197 A12=out->factors[iptr_ik*9+3];
198 A22=out->factors[iptr_ik*9+4];
199 A32=out->factors[iptr_ik*9+5];
200 A13=out->factors[iptr_ik*9+6];
201 A23=out->factors[iptr_ik*9+7];
202 A33=out->factors[iptr_ik*9+8];
203 /* a_ij=a_ij-a_ik*a_kj */
204 for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) {
205 j=A->pattern->index[iptr_kj];
206 if (colorOf[j]>color2) {
207 S11=out->factors[iptr_kj*9 ];
208 S21=out->factors[iptr_kj*9+1];
209 S31=out->factors[iptr_kj*9+2];
210 S12=out->factors[iptr_kj*9+3];
211 S22=out->factors[iptr_kj*9+4];
212 S32=out->factors[iptr_kj*9+5];
213 S13=out->factors[iptr_kj*9+6];
214 S23=out->factors[iptr_kj*9+7];
215 S33=out->factors[iptr_kj*9+8];
216 for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) {
217 if (j==A->pattern->index[iptr_ij]) {
218 out->factors[iptr_ij*9 ]-=A11*S11+A12*S21+A13*S31;
219 out->factors[iptr_ij*9+1]-=A21*S11+A22*S21+A23*S31;
220 out->factors[iptr_ij*9+2]-=A31*S11+A32*S21+A33*S31;
221 out->factors[iptr_ij*9+3]-=A11*S12+A12*S22+A13*S32;
222 out->factors[iptr_ij*9+4]-=A21*S12+A22*S22+A23*S32;
223 out->factors[iptr_ij*9+5]-=A31*S12+A32*S22+A33*S32;
224 out->factors[iptr_ij*9+6]-=A11*S13+A12*S23+A13*S33;
225 out->factors[iptr_ij*9+7]-=A21*S13+A22*S23+A23*S33;
226 out->factors[iptr_ij*9+8]-=A31*S13+A32*S23+A33*S33;
227 break;
228 }
229 }
230 }
231 }
232 }
233 }
234 }
235 iptr_main=ptr_main[i];
236 A11=out->factors[iptr_main*9 ];
237 A21=out->factors[iptr_main*9+1];
238 A31=out->factors[iptr_main*9+2];
239 A12=out->factors[iptr_main*9+3];
240 A22=out->factors[iptr_main*9+4];
241 A32=out->factors[iptr_main*9+5];
242 A13=out->factors[iptr_main*9+6];
243 A23=out->factors[iptr_main*9+7];
244 A33=out->factors[iptr_main*9+8];
245 D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22);
246 if (ABS(D)>0.) {
247 D=1./D;
248 S11=(A22*A33-A23*A32)*D;
249 S21=(A31*A23-A21*A33)*D;
250 S31=(A21*A32-A31*A22)*D;
251 S12=(A13*A32-A12*A33)*D;
252 S22=(A11*A33-A31*A13)*D;
253 S32=(A12*A31-A11*A32)*D;
254 S13=(A12*A23-A13*A22)*D;
255 S23=(A13*A21-A11*A23)*D;
256 S33=(A11*A22-A12*A21)*D;
257
258 out->factors[iptr_main*9 ]=S11;
259 out->factors[iptr_main*9+1]=S21;
260 out->factors[iptr_main*9+2]=S31;
261 out->factors[iptr_main*9+3]=S12;
262 out->factors[iptr_main*9+4]=S22;
263 out->factors[iptr_main*9+5]=S32;
264 out->factors[iptr_main*9+6]=S13;
265 out->factors[iptr_main*9+7]=S23;
266 out->factors[iptr_main*9+8]=S33;
267
268 /* a_ik=a_ii^{-1}*a_ik */
269 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
270 k=A->pattern->index[iptr_ik];
271 if (colorOf[k]>color) {
272 A11=out->factors[iptr_ik*9 ];
273 A21=out->factors[iptr_ik*9+1];
274 A31=out->factors[iptr_ik*9+2];
275 A12=out->factors[iptr_ik*9+3];
276 A22=out->factors[iptr_ik*9+4];
277 A32=out->factors[iptr_ik*9+5];
278 A13=out->factors[iptr_ik*9+6];
279 A23=out->factors[iptr_ik*9+7];
280 A33=out->factors[iptr_ik*9+8];
281 out->factors[iptr_ik*9 ]=S11*A11+S12*A21+S13*A31;
282 out->factors[iptr_ik*9+1]=S21*A11+S22*A21+S23*A31;
283 out->factors[iptr_ik*9+2]=S31*A11+S32*A21+S33*A31;
284 out->factors[iptr_ik*9+3]=S11*A12+S12*A22+S13*A32;
285 out->factors[iptr_ik*9+4]=S21*A12+S22*A22+S23*A32;
286 out->factors[iptr_ik*9+5]=S31*A12+S32*A22+S33*A32;
287 out->factors[iptr_ik*9+6]=S11*A13+S12*A23+S13*A33;
288 out->factors[iptr_ik*9+7]=S21*A13+S22*A23+S23*A33;
289 out->factors[iptr_ik*9+8]=S31*A13+S32*A23+S33*A33;
290 }
291 }
292 } else {
293 Esys_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block.");
294 }
295 }
296 }
297 } else {
298 Esys_setError(VALUE_ERROR, "Paso_Solver_getILU: block size greater than 3 is not supported.");
299 }
300 #pragma omp barrier
301 }
302 time_fac=Esys_timer()-time0;
303 }
304 if (Esys_noError()) {
305 if (verbose) printf("timing: ILU: coloring/elimination: %e sec\n",time_fac);
306 return out;
307 } else {
308 Paso_Solver_ILU_free(out);
309 return NULL;
310 }
311 }
312
313 /************************************************************************************/
314
315 /* Applies ILU precondition b-> x
316
317 In fact it solves LUx=b in the form x= U^{-1} L^{-1}b
318
319 Should be called within a parallel region.
320 Barrier synchronization should be performed to make sure that the input
321 vector is available.
322 */
323
324 void Paso_Solver_solveILU(Paso_SparseMatrix * A, Paso_Solver_ILU * ilu, double * x, const double * b) {
325 register dim_t i,k;
326 register index_t color,iptr_ik,iptr_main;
327 register double S1,S2,S3,R1,R2,R3;
328 const dim_t n=A->numRows;
329 const dim_t n_block=A->row_block_size;
330 const index_t* colorOf = Paso_Pattern_borrowColoringPointer(A->pattern);
331 const dim_t num_colors = Paso_Pattern_getNumColors(A->pattern);
332 const index_t *ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A);
333
334
335 /* copy x into b */
336 #pragma omp parallel for private(i) schedule(static)
337 for (i=0;i<n*n_block;++i) x[i]=b[i];
338 /* forward substitution */
339 for (color=0;color<num_colors;++color) {
340 if (n_block==1) {
341 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1,iptr_main)
342 for (i = 0; i < n; ++i) {
343 if (colorOf[i]==color) {
344 /* x_i=x_i-a_ik*x_k */
345 S1=x[i];
346 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
347 k=A->pattern->index[iptr_ik];
348 if (colorOf[k]<color) {
349 R1=x[k];
350 S1-=ilu->factors[iptr_ik]*R1;
351 }
352 }
353 iptr_main=ptr_main[i];
354 x[i]=ilu->factors[iptr_main]*S1;
355 }
356 }
357 } else if (n_block==2) {
358 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,iptr_main,S1,S2,R1,R2)
359 for (i = 0; i < n; ++i) {
360 if (colorOf[i]==color) {
361 /* x_i=x_i-a_ik*x_k */
362 S1=x[2*i];
363 S2=x[2*i+1];
364 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
365 k=A->pattern->index[iptr_ik];
366 if (colorOf[k]<color) {
367 R1=x[2*k];
368 R2=x[2*k+1];
369 S1-=ilu->factors[4*iptr_ik ]*R1+ilu->factors[4*iptr_ik+2]*R2;
370 S2-=ilu->factors[4*iptr_ik+1]*R1+ilu->factors[4*iptr_ik+3]*R2;
371 }
372 }
373 iptr_main=ptr_main[i];
374 x[2*i ]=ilu->factors[4*iptr_main ]*S1+ilu->factors[4*iptr_main+2]*S2;
375 x[2*i+1]=ilu->factors[4*iptr_main+1]*S1+ilu->factors[4*iptr_main+3]*S2;
376 }
377
378 }
379 } else if (n_block==3) {
380 #pragma omp parallel for schedule(static) private(i,iptr_ik,iptr_main,k,S1,S2,S3,R1,R2,R3)
381 for (i = 0; i < n; ++i) {
382 if (colorOf[i]==color) {
383 /* x_i=x_i-a_ik*x_k */
384 S1=x[3*i];
385 S2=x[3*i+1];
386 S3=x[3*i+2];
387 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
388 k=A->pattern->index[iptr_ik];
389 if (colorOf[k]<color) {
390 R1=x[3*k];
391 R2=x[3*k+1];
392 R3=x[3*k+2];
393 S1-=ilu->factors[9*iptr_ik ]*R1+ilu->factors[9*iptr_ik+3]*R2+ilu->factors[9*iptr_ik+6]*R3;
394 S2-=ilu->factors[9*iptr_ik+1]*R1+ilu->factors[9*iptr_ik+4]*R2+ilu->factors[9*iptr_ik+7]*R3;
395 S3-=ilu->factors[9*iptr_ik+2]*R1+ilu->factors[9*iptr_ik+5]*R2+ilu->factors[9*iptr_ik+8]*R3;
396 }
397 }
398 iptr_main=ptr_main[i];
399 x[3*i ]=ilu->factors[9*iptr_main ]*S1+ilu->factors[9*iptr_main+3]*S2+ilu->factors[9*iptr_main+6]*S3;
400 x[3*i+1]=ilu->factors[9*iptr_main+1]*S1+ilu->factors[9*iptr_main+4]*S2+ilu->factors[9*iptr_main+7]*S3;
401 x[3*i+2]=ilu->factors[9*iptr_main+2]*S1+ilu->factors[9*iptr_main+5]*S2+ilu->factors[9*iptr_main+8]*S3;
402 }
403 }
404 }
405 }
406 /* backward substitution */
407 for (color=(num_colors)-1;color>-1;--color) {
408 if (n_block==1) {
409 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1)
410 for (i = 0; i < n; ++i) {
411 if (colorOf[i]==color) {
412 /* x_i=x_i-a_ik*x_k */
413 S1=x[i];
414 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
415 k=A->pattern->index[iptr_ik];
416 if (colorOf[k]>color) {
417 R1=x[k];
418 S1-=ilu->factors[iptr_ik]*R1;
419 }
420 }
421 x[i]=S1;
422 }
423 }
424 } else if (n_block==2) {
425 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2)
426 for (i = 0; i < n; ++i) {
427 if (colorOf[i]==color) {
428 /* x_i=x_i-a_ik*x_k */
429 S1=x[2*i];
430 S2=x[2*i+1];
431 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
432 k=A->pattern->index[iptr_ik];
433 if (colorOf[k]>color) {
434 R1=x[2*k];
435 R2=x[2*k+1];
436 S1-=ilu->factors[4*iptr_ik ]*R1+ilu->factors[4*iptr_ik+2]*R2;
437 S2-=ilu->factors[4*iptr_ik+1]*R1+ilu->factors[4*iptr_ik+3]*R2;
438 }
439 }
440 x[2*i]=S1;
441 x[2*i+1]=S2;
442 }
443 }
444 } else if (n_block==3) {
445 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3)
446 for (i = 0; i < n; ++i) {
447 if (colorOf[i]==color) {
448 /* x_i=x_i-a_ik*x_k */
449 S1=x[3*i ];
450 S2=x[3*i+1];
451 S3=x[3*i+2];
452 for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) {
453 k=A->pattern->index[iptr_ik];
454 if (colorOf[k]>color) {
455 R1=x[3*k];
456 R2=x[3*k+1];
457 R3=x[3*k+2];
458 S1-=ilu->factors[9*iptr_ik ]*R1+ilu->factors[9*iptr_ik+3]*R2+ilu->factors[9*iptr_ik+6]*R3;
459 S2-=ilu->factors[9*iptr_ik+1]*R1+ilu->factors[9*iptr_ik+4]*R2+ilu->factors[9*iptr_ik+7]*R3;
460 S3-=ilu->factors[9*iptr_ik+2]*R1+ilu->factors[9*iptr_ik+5]*R2+ilu->factors[9*iptr_ik+8]*R3;
461 }
462 }
463 x[3*i]=S1;
464 x[3*i+1]=S2;
465 x[3*i+2]=S3;
466 }
467 }
468 }
469 #pragma omp barrier
470 }
471 }
472

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26