1 |
/* $Id$ */ |
2 |
|
3 |
/**************************************************************/ |
4 |
|
5 |
/* Paso: ILU preconditioner with reordering */ |
6 |
|
7 |
/**************************************************************/ |
8 |
|
9 |
/* Copyrights by ACcESS Australia 2003,2004,2005 */ |
10 |
/* Author: gross@access.edu.au */ |
11 |
|
12 |
/**************************************************************/ |
13 |
|
14 |
#include "Paso.h" |
15 |
#include "Solver.h" |
16 |
#include "PasoUtil.h" |
17 |
|
18 |
/**************************************************************/ |
19 |
|
20 |
/* free all memory used by ILU */ |
21 |
|
22 |
void Paso_Solver_ILU_free(Paso_Solver_ILU * in) { |
23 |
if (in!=NULL) { |
24 |
MEMFREE(in->colorOf); |
25 |
MEMFREE(in->factors); |
26 |
MEMFREE(in->main_iptr); |
27 |
Paso_SystemMatrixPattern_dealloc(in->pattern); |
28 |
MEMFREE(in); |
29 |
} |
30 |
} |
31 |
|
32 |
/**************************************************************/ |
33 |
|
34 |
/* constructs the incomplete block factorization of |
35 |
|
36 |
*/ |
37 |
Paso_Solver_ILU* Paso_Solver_getILU(Paso_SystemMatrix * A,bool_t verbose) { |
38 |
dim_t n=A->num_rows; |
39 |
dim_t n_block=A->row_block_size; |
40 |
index_t num_colors=0; |
41 |
register double A11,A12,A13,A21,A22,A23,A31,A32,A33,D; |
42 |
register double mainA11,mainA12,mainA13,mainA21,mainA22,mainA23,mainA31,mainA32,mainA33; |
43 |
register double S11,S12,S13,S21,S22,S23,S31,S32,S33; |
44 |
register index_t i,iptr_main,iptr,iptr_ik,k,iptr_kj,j,iptr_ij,color,color2; |
45 |
double time0,time_color,time_fac; |
46 |
/* allocations: */ |
47 |
Paso_Solver_ILU* out=MEMALLOC(1,Paso_Solver_ILU); |
48 |
if (Paso_checkPtr(out)) return NULL; |
49 |
index_t* mis_marker=TMPMEMALLOC(n,index_t); |
50 |
out->colorOf=MEMALLOC(n,index_t); |
51 |
out->factors=MEMALLOC(A->len,double); |
52 |
out->main_iptr=MEMALLOC(n,index_t); |
53 |
out->pattern=Paso_SystemMatrixPattern_reference(A->pattern); |
54 |
out->n_block=n_block; |
55 |
out->n=n; |
56 |
time0=Paso_timer(); |
57 |
if ( !(Paso_checkPtr(mis_marker) || |
58 |
Paso_checkPtr(out->colorOf) || Paso_checkPtr(out->main_iptr) || Paso_checkPtr(out->factors)) ) { |
59 |
/* get coloring */ |
60 |
index_t num_colors=0; |
61 |
#pragma omp parallel for private(i) schedule(static) |
62 |
for (i = 0; i < n; ++i) out->colorOf[i]=-1; |
63 |
while (Paso_Util_isAny(n,out->colorOf,-1) && Paso_noError()) { |
64 |
#pragma omp parallel for private(i) schedule(static) |
65 |
for (i = 0; i < n; ++i) mis_marker[i]=out->colorOf[i]; |
66 |
Paso_SystemMatrixPattern_mis(A->pattern,mis_marker); |
67 |
|
68 |
#pragma omp parallel for private(i) schedule(static) |
69 |
for (i = 0; i < n; ++i) if (mis_marker[i]) out->colorOf[i]=num_colors; |
70 |
++num_colors; |
71 |
} |
72 |
out->num_colors=num_colors; |
73 |
time_color=Paso_timer()-time0; |
74 |
time0=Paso_timer(); |
75 |
/* find main diagonal and copy matrix values */ |
76 |
#pragma omp parallel for schedule(static) private(i,iptr,iptr_main,k) |
77 |
for (i = 0; i < n; ++i) { |
78 |
for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; ++iptr) { |
79 |
iptr_main=A->pattern->ptr[0]-1; |
80 |
for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; iptr++) { |
81 |
if (A->pattern->index[iptr]==i) iptr_main=iptr; |
82 |
for (k=0;k<n_block*n_block;++k) out->factors[n_block*n_block*iptr+k]=A->val[n_block*n_block*iptr+k]; |
83 |
} |
84 |
out->main_iptr[i]=iptr_main; |
85 |
if (iptr_main==A->pattern->ptr[0]-1) |
86 |
Paso_setError(VALUE_ERROR, "Paso_Solver_getILU: no main diagonal"); |
87 |
} |
88 |
} |
89 |
/* start factorization */ |
90 |
|
91 |
#pragma omp barrier |
92 |
for (color=0;color<out->num_colors && Paso_noError();++color) { |
93 |
if (n_block==1) { |
94 |
#pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,j,iptr_ij,A11,iptr_main,D) |
95 |
for (i = 0; i < n; ++i) { |
96 |
if (out->colorOf[i]==color) { |
97 |
for (color2=0;color2<color;++color2) { |
98 |
for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) { |
99 |
k=A->pattern->index[iptr_ik]; |
100 |
if (out->colorOf[k]==color2) { |
101 |
A11=out->factors[iptr_ik]; |
102 |
/* a_ij=a_ij-a_ik*a_kj */ |
103 |
for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) { |
104 |
j=A->pattern->index[iptr_kj]; |
105 |
if (out->colorOf[j]>color2) { |
106 |
S11=out->factors[iptr_kj]; |
107 |
for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) { |
108 |
if (j==A->pattern->index[iptr_ij]) { |
109 |
out->factors[iptr_ij]-=A11*S11; |
110 |
break; |
111 |
} |
112 |
} |
113 |
} |
114 |
} |
115 |
} |
116 |
} |
117 |
} |
118 |
iptr_main=out->main_iptr[i]; |
119 |
D=out->factors[iptr_main]; |
120 |
if (ABS(D)>0.) { |
121 |
D=1./D; |
122 |
out->factors[iptr_main]=D; |
123 |
/* a_ik=a_ii^{-1}*a_ik */ |
124 |
for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) { |
125 |
k=A->pattern->index[iptr_ik]; |
126 |
if (out->colorOf[k]>color) { |
127 |
A11=out->factors[iptr_ik]; |
128 |
out->factors[iptr_ik]=A11*D; |
129 |
} |
130 |
} |
131 |
} else { |
132 |
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block."); |
133 |
} |
134 |
} |
135 |
} |
136 |
} else if (n_block==2) { |
137 |
#pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,S21,S12,S22,j,iptr_ij,A11,A21,A12,A22,iptr_main,D) |
138 |
for (i = 0; i < n; ++i) { |
139 |
if (out->colorOf[i]==color) { |
140 |
for (color2=0;color2<color;++color2) { |
141 |
for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) { |
142 |
k=A->pattern->index[iptr_ik]; |
143 |
if (out->colorOf[k]==color2) { |
144 |
A11=out->factors[iptr_ik*4 ]; |
145 |
A21=out->factors[iptr_ik*4+1]; |
146 |
A12=out->factors[iptr_ik*4+2]; |
147 |
A22=out->factors[iptr_ik*4+3]; |
148 |
/* a_ij=a_ij-a_ik*a_kj */ |
149 |
for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) { |
150 |
j=A->pattern->index[iptr_kj]; |
151 |
if (out->colorOf[j]>color2) { |
152 |
S11=out->factors[iptr_kj*4]; |
153 |
S21=out->factors[iptr_kj*4+1]; |
154 |
S12=out->factors[iptr_kj*4+2]; |
155 |
S22=out->factors[iptr_kj*4+3]; |
156 |
for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) { |
157 |
if (j==A->pattern->index[iptr_ij]) { |
158 |
out->factors[4*iptr_ij ]-=A11*S11+A12*S21; |
159 |
out->factors[4*iptr_ij+1]-=A21*S11+A22*S21; |
160 |
out->factors[4*iptr_ij+2]-=A11*S12+A12*S22; |
161 |
out->factors[4*iptr_ij+3]-=A21*S12+A22*S22; |
162 |
break; |
163 |
} |
164 |
} |
165 |
} |
166 |
} |
167 |
} |
168 |
} |
169 |
} |
170 |
iptr_main=out->main_iptr[i]; |
171 |
A11=out->factors[iptr_main*4]; |
172 |
A21=out->factors[iptr_main*4+1]; |
173 |
A12=out->factors[iptr_main*4+2]; |
174 |
A22=out->factors[iptr_main*4+3]; |
175 |
D = A11*A22-A12*A21; |
176 |
if (ABS(D)>0.) { |
177 |
D=1./D; |
178 |
S11= A22*D; |
179 |
S21=-A21*D; |
180 |
S12=-A12*D; |
181 |
S22= A11*D; |
182 |
out->factors[iptr_main*4] = S11; |
183 |
out->factors[iptr_main*4+1]= S21; |
184 |
out->factors[iptr_main*4+2]= S12; |
185 |
out->factors[iptr_main*4+3]= S22; |
186 |
/* a_ik=a_ii^{-1}*a_ik */ |
187 |
for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) { |
188 |
k=A->pattern->index[iptr_ik]; |
189 |
if (out->colorOf[k]>color) { |
190 |
A11=out->factors[iptr_ik*4 ]; |
191 |
A21=out->factors[iptr_ik*4+1]; |
192 |
A12=out->factors[iptr_ik*4+2]; |
193 |
A22=out->factors[iptr_ik*4+3]; |
194 |
out->factors[4*iptr_ik ]=S11*A11+S12*A21; |
195 |
out->factors[4*iptr_ik+1]=S21*A11+S22*A21; |
196 |
out->factors[4*iptr_ik+2]=S11*A12+S12*A22; |
197 |
out->factors[4*iptr_ik+3]=S21*A12+S22*A22; |
198 |
} |
199 |
} |
200 |
} else { |
201 |
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block."); |
202 |
} |
203 |
} |
204 |
} |
205 |
} else if (n_block==3) { |
206 |
#pragma omp parallel for schedule(static) private(i,color2,iptr_ik,k,iptr_kj,S11,S21,S31,S12,S22,S32,S13,S23,S33,j,iptr_ij,A11,A21,A31,A12,A22,A32,A13,A23,A33,iptr_main,D) |
207 |
for (i = 0; i < n; ++i) { |
208 |
if (out->colorOf[i]==color) { |
209 |
for (color2=0;color2<color;++color2) { |
210 |
for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) { |
211 |
k=A->pattern->index[iptr_ik]; |
212 |
if (out->colorOf[k]==color2) { |
213 |
A11=out->factors[iptr_ik*9 ]; |
214 |
A21=out->factors[iptr_ik*9+1]; |
215 |
A31=out->factors[iptr_ik*9+2]; |
216 |
A12=out->factors[iptr_ik*9+3]; |
217 |
A22=out->factors[iptr_ik*9+4]; |
218 |
A32=out->factors[iptr_ik*9+5]; |
219 |
A13=out->factors[iptr_ik*9+6]; |
220 |
A23=out->factors[iptr_ik*9+7]; |
221 |
A33=out->factors[iptr_ik*9+8]; |
222 |
/* a_ij=a_ij-a_ik*a_kj */ |
223 |
for (iptr_kj=A->pattern->ptr[k];iptr_kj<A->pattern->ptr[k+1]; iptr_kj++) { |
224 |
j=A->pattern->index[iptr_kj]; |
225 |
if (out->colorOf[j]>color2) { |
226 |
S11=out->factors[iptr_kj*9 ]; |
227 |
S21=out->factors[iptr_kj*9+1]; |
228 |
S31=out->factors[iptr_kj*9+2]; |
229 |
S12=out->factors[iptr_kj*9+3]; |
230 |
S22=out->factors[iptr_kj*9+4]; |
231 |
S32=out->factors[iptr_kj*9+5]; |
232 |
S13=out->factors[iptr_kj*9+6]; |
233 |
S23=out->factors[iptr_kj*9+7]; |
234 |
S33=out->factors[iptr_kj*9+8]; |
235 |
for (iptr_ij=A->pattern->ptr[i];iptr_ij<A->pattern->ptr[i+1]; iptr_ij++) { |
236 |
if (j==A->pattern->index[iptr_ij]) { |
237 |
out->factors[iptr_ij*9 ]-=A11*S11+A12*S21+A13*S31; |
238 |
out->factors[iptr_ij*9+1]-=A21*S11+A22*S21+A23*S31; |
239 |
out->factors[iptr_ij*9+2]-=A31*S11+A32*S21+A33*S31; |
240 |
out->factors[iptr_ij*9+3]-=A11*S12+A12*S22+A13*S32; |
241 |
out->factors[iptr_ij*9+4]-=A21*S12+A22*S22+A23*S32; |
242 |
out->factors[iptr_ij*9+5]-=A31*S12+A32*S22+A33*S32; |
243 |
out->factors[iptr_ij*9+6]-=A11*S13+A12*S23+A13*S33; |
244 |
out->factors[iptr_ij*9+7]-=A21*S13+A22*S23+A23*S33; |
245 |
out->factors[iptr_ij*9+8]-=A31*S13+A32*S23+A33*S33; |
246 |
break; |
247 |
} |
248 |
} |
249 |
} |
250 |
} |
251 |
} |
252 |
} |
253 |
} |
254 |
iptr_main=out->main_iptr[i]; |
255 |
A11=out->factors[iptr_main*9 ]; |
256 |
A21=out->factors[iptr_main*9+1]; |
257 |
A31=out->factors[iptr_main*9+2]; |
258 |
A12=out->factors[iptr_main*9+3]; |
259 |
A22=out->factors[iptr_main*9+4]; |
260 |
A32=out->factors[iptr_main*9+5]; |
261 |
A13=out->factors[iptr_main*9+6]; |
262 |
A23=out->factors[iptr_main*9+7]; |
263 |
A33=out->factors[iptr_main*9+8]; |
264 |
D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22); |
265 |
if (ABS(D)>0.) { |
266 |
D=1./D; |
267 |
S11=(A22*A33-A23*A32)*D; |
268 |
S21=(A31*A23-A21*A33)*D; |
269 |
S31=(A21*A32-A31*A22)*D; |
270 |
S12=(A13*A32-A12*A33)*D; |
271 |
S22=(A11*A33-A31*A13)*D; |
272 |
S32=(A12*A31-A11*A32)*D; |
273 |
S13=(A12*A23-A13*A22)*D; |
274 |
S23=(A13*A21-A11*A23)*D; |
275 |
S33=(A11*A22-A12*A21)*D; |
276 |
|
277 |
out->factors[iptr_main*9 ]=S11; |
278 |
out->factors[iptr_main*9+1]=S21; |
279 |
out->factors[iptr_main*9+2]=S31; |
280 |
out->factors[iptr_main*9+3]=S12; |
281 |
out->factors[iptr_main*9+4]=S22; |
282 |
out->factors[iptr_main*9+5]=S32; |
283 |
out->factors[iptr_main*9+6]=S13; |
284 |
out->factors[iptr_main*9+7]=S23; |
285 |
out->factors[iptr_main*9+8]=S33; |
286 |
|
287 |
/* a_ik=a_ii^{-1}*a_ik */ |
288 |
for (iptr_ik=A->pattern->ptr[i];iptr_ik<A->pattern->ptr[i+1]; ++iptr_ik) { |
289 |
k=A->pattern->index[iptr_ik]; |
290 |
if (out->colorOf[k]>color) { |
291 |
A11=out->factors[iptr_ik*9 ]; |
292 |
A21=out->factors[iptr_ik*9+1]; |
293 |
A31=out->factors[iptr_ik*9+2]; |
294 |
A12=out->factors[iptr_ik*9+3]; |
295 |
A22=out->factors[iptr_ik*9+4]; |
296 |
A32=out->factors[iptr_ik*9+5]; |
297 |
A13=out->factors[iptr_ik*9+6]; |
298 |
A23=out->factors[iptr_ik*9+7]; |
299 |
A33=out->factors[iptr_ik*9+8]; |
300 |
out->factors[iptr_ik*9 ]=S11*A11+S12*A21+S13*A31; |
301 |
out->factors[iptr_ik*9+1]=S21*A11+S22*A21+S23*A31; |
302 |
out->factors[iptr_ik*9+2]=S31*A11+S32*A21+S33*A31; |
303 |
out->factors[iptr_ik*9+3]=S11*A12+S12*A22+S13*A32; |
304 |
out->factors[iptr_ik*9+4]=S21*A12+S22*A22+S23*A32; |
305 |
out->factors[iptr_ik*9+5]=S31*A12+S32*A22+S33*A32; |
306 |
out->factors[iptr_ik*9+6]=S11*A13+S12*A23+S13*A33; |
307 |
out->factors[iptr_ik*9+7]=S21*A13+S22*A23+S23*A33; |
308 |
out->factors[iptr_ik*9+8]=S31*A13+S32*A23+S33*A33; |
309 |
} |
310 |
} |
311 |
} else { |
312 |
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getILU: non-regular main diagonal block."); |
313 |
} |
314 |
} |
315 |
} |
316 |
} else { |
317 |
Paso_setError(VALUE_ERROR, "Paso_Solver_getILU: block size greater than 3 is not supported."); |
318 |
} |
319 |
#pragma omp barrier |
320 |
} |
321 |
time_fac=Paso_timer()-time0; |
322 |
} |
323 |
|
324 |
TMPMEMFREE(mis_marker); |
325 |
if (Paso_noError()) { |
326 |
if (verbose) { |
327 |
printf("ILU: %d color used \n",out->num_colors); |
328 |
printf("timing: ILU: coloring/elemination : %e/%e\n",time_color,time_fac); |
329 |
} |
330 |
return out; |
331 |
} else { |
332 |
Paso_Solver_ILU_free(out); |
333 |
return NULL; |
334 |
} |
335 |
} |
336 |
|
337 |
/**************************************************************/ |
338 |
|
339 |
/* apply ILU precondition b-> x |
340 |
|
341 |
in fact it solves LUx=b in the form x= U^{-1} L^{-1}b |
342 |
|
343 |
should be called within a parallel region |
344 |
barrier synconization should be performed to make sure that the input vector available |
345 |
|
346 |
*/ |
347 |
|
348 |
void Paso_Solver_solveILU(Paso_Solver_ILU * ilu, double * x, double * b) { |
349 |
register dim_t i,k; |
350 |
register index_t color,iptr_ik,iptr_main; |
351 |
register double S1,S2,S3,R1,R2,R3; |
352 |
dim_t n_block=ilu->n_block; |
353 |
dim_t n=ilu->n; |
354 |
|
355 |
|
356 |
/* copy x into b*/ |
357 |
#pragma omp for private(i) schedule(static) |
358 |
for (i=0;i<n*n_block;++i) x[i]=b[i]; |
359 |
/* forward substitution */ |
360 |
for (color=0;color<ilu->num_colors;++color) { |
361 |
if (n_block==1) { |
362 |
#pragma omp for schedule(static) private(i,iptr_ik,k,S1,R1,iptr_main) |
363 |
for (i = 0; i < n; ++i) { |
364 |
if (ilu->colorOf[i]==color) { |
365 |
/* x_i=x_i-a_ik*x_k */ |
366 |
S1=x[i]; |
367 |
for (iptr_ik=ilu->pattern->ptr[i];iptr_ik<ilu->pattern->ptr[i+1]; ++iptr_ik) { |
368 |
k=ilu->pattern->index[iptr_ik]; |
369 |
if (ilu->colorOf[k]<color) { |
370 |
R1=x[k]; |
371 |
S1-=ilu->factors[iptr_ik]*R1; |
372 |
} |
373 |
} |
374 |
iptr_main=ilu->main_iptr[i]; |
375 |
x[i]=ilu->factors[iptr_main]*S1; |
376 |
} |
377 |
} |
378 |
} else if (n_block==2) { |
379 |
#pragma omp for schedule(static) private(i,iptr_ik,k,iptr_main,S1,S2,R1,R2) |
380 |
for (i = 0; i < n; ++i) { |
381 |
if (ilu->colorOf[i]==color) { |
382 |
/* x_i=x_i-a_ik*x_k */ |
383 |
S1=x[2*i]; |
384 |
S2=x[2*i+1]; |
385 |
for (iptr_ik=ilu->pattern->ptr[i];iptr_ik<ilu->pattern->ptr[i+1]; ++iptr_ik) { |
386 |
k=ilu->pattern->index[iptr_ik]; |
387 |
if (ilu->colorOf[k]<color) { |
388 |
R1=x[2*k]; |
389 |
R2=x[2*k+1]; |
390 |
S1-=ilu->factors[4*iptr_ik ]*R1+ilu->factors[4*iptr_ik+2]*R2; |
391 |
S2-=ilu->factors[4*iptr_ik+1]*R1+ilu->factors[4*iptr_ik+3]*R2; |
392 |
} |
393 |
} |
394 |
iptr_main=ilu->main_iptr[i]; |
395 |
x[2*i ]=ilu->factors[4*iptr_main ]*S1+ilu->factors[4*iptr_main+2]*S2; |
396 |
x[2*i+1]=ilu->factors[4*iptr_main+1]*S1+ilu->factors[4*iptr_main+3]*S2; |
397 |
} |
398 |
|
399 |
} |
400 |
} else if (n_block==3) { |
401 |
#pragma omp for schedule(static) private(i,iptr_ik,iptr_main,k,S1,S2,S3,R1,R2,R3) |
402 |
for (i = 0; i < n; ++i) { |
403 |
if (ilu->colorOf[i]==color) { |
404 |
/* x_i=x_i-a_ik*x_k */ |
405 |
S1=x[3*i]; |
406 |
S2=x[3*i+1]; |
407 |
S3=x[3*i+2]; |
408 |
for (iptr_ik=ilu->pattern->ptr[i];iptr_ik<ilu->pattern->ptr[i+1]; ++iptr_ik) { |
409 |
k=ilu->pattern->index[iptr_ik]; |
410 |
if (ilu->colorOf[k]<color) { |
411 |
R1=x[3*k]; |
412 |
R2=x[3*k+1]; |
413 |
R3=x[3*k+2]; |
414 |
S1-=ilu->factors[9*iptr_ik ]*R1+ilu->factors[9*iptr_ik+3]*R2+ilu->factors[9*iptr_ik+6]*R3; |
415 |
S2-=ilu->factors[9*iptr_ik+1]*R1+ilu->factors[9*iptr_ik+4]*R2+ilu->factors[9*iptr_ik+7]*R3; |
416 |
S3-=ilu->factors[9*iptr_ik+2]*R1+ilu->factors[9*iptr_ik+5]*R2+ilu->factors[9*iptr_ik+8]*R3; |
417 |
} |
418 |
} |
419 |
iptr_main=ilu->main_iptr[i]; |
420 |
x[3*i ]=ilu->factors[9*iptr_main ]*S1+ilu->factors[9*iptr_main+3]*S2+ilu->factors[9*iptr_main+6]*S3; |
421 |
x[3*i+1]=ilu->factors[9*iptr_main+1]*S1+ilu->factors[9*iptr_main+4]*S2+ilu->factors[9*iptr_main+7]*S3; |
422 |
x[3*i+2]=ilu->factors[9*iptr_main+2]*S1+ilu->factors[9*iptr_main+5]*S2+ilu->factors[9*iptr_main+8]*S3; |
423 |
} |
424 |
} |
425 |
} |
426 |
#pragma omp barrier |
427 |
} |
428 |
/* backward substitution */ |
429 |
for (color=(ilu->num_colors)-1;color>-1;--color) { |
430 |
if (n_block==1) { |
431 |
#pragma omp for schedule(static) private(i,iptr_ik,k,S1,R1) |
432 |
for (i = 0; i < n; ++i) { |
433 |
if (ilu->colorOf[i]==color) { |
434 |
/* x_i=x_i-a_ik*x_k */ |
435 |
S1=x[i]; |
436 |
for (iptr_ik=ilu->pattern->ptr[i];iptr_ik<ilu->pattern->ptr[i+1]; ++iptr_ik) { |
437 |
k=ilu->pattern->index[iptr_ik]; |
438 |
if (ilu->colorOf[k]>color) { |
439 |
R1=x[k]; |
440 |
S1-=ilu->factors[iptr_ik]*R1; |
441 |
} |
442 |
} |
443 |
x[i]=S1; |
444 |
} |
445 |
} |
446 |
} else if (n_block==2) { |
447 |
#pragma omp for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2) |
448 |
for (i = 0; i < n; ++i) { |
449 |
if (ilu->colorOf[i]==color) { |
450 |
/* x_i=x_i-a_ik*x_k */ |
451 |
S1=x[2*i]; |
452 |
S2=x[2*i+1]; |
453 |
for (iptr_ik=ilu->pattern->ptr[i];iptr_ik<ilu->pattern->ptr[i+1]; ++iptr_ik) { |
454 |
k=ilu->pattern->index[iptr_ik]; |
455 |
if (ilu->colorOf[k]>color) { |
456 |
R1=x[2*k]; |
457 |
R2=x[2*k+1]; |
458 |
S1-=ilu->factors[4*iptr_ik ]*R1+ilu->factors[4*iptr_ik+2]*R2; |
459 |
S2-=ilu->factors[4*iptr_ik+1]*R1+ilu->factors[4*iptr_ik+3]*R2; |
460 |
} |
461 |
} |
462 |
x[2*i]=S1; |
463 |
x[2*i+1]=S2; |
464 |
} |
465 |
} |
466 |
} else if (n_block==3) { |
467 |
#pragma omp for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3) |
468 |
for (i = 0; i < n; ++i) { |
469 |
if (ilu->colorOf[i]==color) { |
470 |
/* x_i=x_i-a_ik*x_k */ |
471 |
S1=x[3*i ]; |
472 |
S2=x[3*i+1]; |
473 |
S3=x[3*i+2]; |
474 |
for (iptr_ik=ilu->pattern->ptr[i];iptr_ik<ilu->pattern->ptr[i+1]; ++iptr_ik) { |
475 |
k=ilu->pattern->index[iptr_ik]; |
476 |
if (ilu->colorOf[k]>color) { |
477 |
R1=x[3*k]; |
478 |
R2=x[3*k+1]; |
479 |
R3=x[3*k+2]; |
480 |
S1-=ilu->factors[9*iptr_ik ]*R1+ilu->factors[9*iptr_ik+3]*R2+ilu->factors[9*iptr_ik+6]*R3; |
481 |
S2-=ilu->factors[9*iptr_ik+1]*R1+ilu->factors[9*iptr_ik+4]*R2+ilu->factors[9*iptr_ik+7]*R3; |
482 |
S3-=ilu->factors[9*iptr_ik+2]*R1+ilu->factors[9*iptr_ik+5]*R2+ilu->factors[9*iptr_ik+8]*R3; |
483 |
} |
484 |
} |
485 |
x[3*i]=S1; |
486 |
x[3*i+1]=S2; |
487 |
x[3*i+2]=S3; |
488 |
} |
489 |
} |
490 |
} |
491 |
#pragma omp barrier |
492 |
} |
493 |
return; |
494 |
} |
495 |
/* |
496 |
* $Log$ |
497 |
* Revision 1.2 2005/09/15 03:44:40 jgs |
498 |
* Merge of development branch dev-02 back to main trunk on 2005-09-15 |
499 |
* |
500 |
* Revision 1.1.2.1 2005/09/05 06:29:50 gross |
501 |
* These files have been extracted from finley to define a stand alone libray for iterative |
502 |
* linear solvers on the ALTIX. main entry through Paso_solve. this version compiles but |
503 |
* has not been tested yet. |
504 |
* |
505 |
* |
506 |
*/ |