1 |
jfenwick |
1851 |
|
2 |
|
|
/******************************************************* |
3 |
|
|
* |
4 |
|
|
* Copyright (c) 2003-2008 by University of Queensland |
5 |
|
|
* Earth Systems Science Computational Center (ESSCC) |
6 |
|
|
* http://www.uq.edu.au/esscc |
7 |
|
|
* |
8 |
|
|
* Primary Business: Queensland, Australia |
9 |
|
|
* Licensed under the Open Software License version 3.0 |
10 |
|
|
* http://www.opensource.org/licenses/osl-3.0.php |
11 |
|
|
* |
12 |
|
|
*******************************************************/ |
13 |
|
|
|
14 |
|
|
|
15 |
|
|
/**************************************************************/ |
16 |
|
|
|
17 |
|
|
/* Paso: GS preconditioner with reordering */ |
18 |
|
|
|
19 |
|
|
/**************************************************************/ |
20 |
|
|
|
21 |
|
|
/* Copyrights by ACcESS Australia 2003,2004,2005,2006,2007,2008 */ |
22 |
|
|
/* Author: artak@uq.edu.au */ |
23 |
|
|
|
24 |
|
|
/**************************************************************/ |
25 |
|
|
|
26 |
|
|
#include "Paso.h" |
27 |
|
|
#include "Solver.h" |
28 |
|
|
#include "PasoUtil.h" |
29 |
|
|
|
30 |
|
|
#include <stdio.h> |
31 |
|
|
|
32 |
|
|
/**************************************************************/ |
33 |
|
|
|
34 |
|
|
/* free all memory used by GS */ |
35 |
|
|
|
36 |
|
|
void Paso_Solver_GS_free(Paso_Solver_GS * in) { |
37 |
|
|
if (in!=NULL) { |
38 |
|
|
MEMFREE(in->colorOf); |
39 |
|
|
Paso_SparseMatrix_free(in->factors); |
40 |
|
|
MEMFREE(in->diag); |
41 |
|
|
MEMFREE(in->main_iptr); |
42 |
|
|
Paso_Pattern_free(in->pattern); |
43 |
|
|
MEMFREE(in); |
44 |
|
|
} |
45 |
|
|
} |
46 |
|
|
|
47 |
|
|
/**************************************************************/ |
48 |
|
|
|
49 |
|
|
/* constructs the incomplete block factorization of |
50 |
|
|
|
51 |
|
|
*/ |
52 |
|
|
Paso_Solver_GS* Paso_Solver_getGS(Paso_SparseMatrix * A,bool_t verbose) { |
53 |
|
|
dim_t n=A->numRows; |
54 |
|
|
dim_t n_block=A->row_block_size; |
55 |
|
|
dim_t block_size=A->block_size; |
56 |
|
|
index_t num_colors=0, *mis_marker=NULL; |
57 |
|
|
register index_t i,iptr_main,iPtr,iptr_ik,k,iptr_kj,j,iptr_ij,color,color2; |
58 |
|
|
double time0,time_color,time_fac; |
59 |
|
|
/* allocations: */ |
60 |
|
|
Paso_Solver_GS* out=MEMALLOC(1,Paso_Solver_GS); |
61 |
|
|
if (Paso_checkPtr(out)) return NULL; |
62 |
|
|
out->colorOf=MEMALLOC(n,index_t); |
63 |
|
|
out->diag=MEMALLOC( ((size_t) n) * ((size_t) block_size),double); |
64 |
|
|
/*out->diag=MEMALLOC(A->len,double);*/ |
65 |
|
|
out->main_iptr=MEMALLOC(n,index_t); |
66 |
|
|
out->pattern=Paso_Pattern_getReference(A->pattern); |
67 |
|
|
out->factors=Paso_SparseMatrix_getReference(A); |
68 |
|
|
out->n_block=n_block; |
69 |
|
|
out->n=n; |
70 |
|
|
|
71 |
|
|
if ( !(Paso_checkPtr(out->colorOf) || Paso_checkPtr(out->main_iptr) || Paso_checkPtr(out->factors)) ) { |
72 |
|
|
time0=Paso_timer(); |
73 |
|
|
Paso_Pattern_color(A->pattern,&out->num_colors,out->colorOf); |
74 |
|
|
time_color=Paso_timer()-time0; |
75 |
|
|
|
76 |
|
|
if (Paso_noError()) { |
77 |
|
|
time0=Paso_timer(); |
78 |
|
|
|
79 |
|
|
if (! (Paso_checkPtr(out->diag))) { |
80 |
|
|
if (n_block==1) { |
81 |
|
|
#pragma omp parallel for private(i, iPtr) schedule(static) |
82 |
|
|
for (i = 0; i < A->pattern->numOutput; i++) { |
83 |
|
|
out->diag[i]=1.; |
84 |
|
|
/* find main diagonal */ |
85 |
|
|
for (iPtr = A->pattern->ptr[i]; iPtr < A->pattern->ptr[i + 1]; iPtr++) { |
86 |
|
|
if (A->pattern->index[iPtr]==i) { |
87 |
|
|
iptr_main=iPtr; |
88 |
|
|
out->diag[i]=A->val[iPtr]; |
89 |
|
|
break; |
90 |
|
|
} |
91 |
|
|
} |
92 |
|
|
out->main_iptr[i]=iptr_main; |
93 |
|
|
} |
94 |
|
|
} else if (n_block==2) { |
95 |
|
|
#pragma omp parallel for private(i, iPtr) schedule(static) |
96 |
|
|
for (i = 0; i < A->pattern->numOutput; i++) { |
97 |
|
|
out->diag[i*4+0]= 1.; |
98 |
|
|
out->diag[i*4+1]= 0.; |
99 |
|
|
out->diag[i*4+2]= 0.; |
100 |
|
|
out->diag[i*4+3]= 1.; |
101 |
|
|
/* find main diagonal */ |
102 |
|
|
for (iPtr = A->pattern->ptr[i]; iPtr < A->pattern->ptr[i + 1]; iPtr++) { |
103 |
|
|
if (A->pattern->index[iPtr]==i) { |
104 |
|
|
iptr_main=iPtr; |
105 |
|
|
out->diag[i*4]= A->val[iPtr*4]; |
106 |
|
|
out->diag[i*4+1]=A->val[iPtr*4+1]; |
107 |
|
|
out->diag[i*4+2]=A->val[iPtr*4+2]; |
108 |
|
|
out->diag[i*4+3]= A->val[iPtr*4+3]; |
109 |
|
|
break; |
110 |
|
|
} |
111 |
|
|
} |
112 |
|
|
out->main_iptr[i]=iptr_main; |
113 |
|
|
} |
114 |
|
|
} else if (n_block==3) { |
115 |
|
|
#pragma omp parallel for private(i, iPtr) schedule(static) |
116 |
|
|
for (i = 0; i < A->pattern->numOutput; i++) { |
117 |
|
|
out->diag[i*9 ]=1.; |
118 |
|
|
out->diag[i*9+1]=0.; |
119 |
|
|
out->diag[i*9+2]=0.; |
120 |
|
|
out->diag[i*9+3]=0.; |
121 |
|
|
out->diag[i*9+4]=1.; |
122 |
|
|
out->diag[i*9+5]=0.; |
123 |
|
|
out->diag[i*9+6]=0.; |
124 |
|
|
out->diag[i*9+7]=0.; |
125 |
|
|
out->diag[i*9+8]=1.; |
126 |
|
|
/* find main diagonal */ |
127 |
|
|
for (iPtr = A->pattern->ptr[i]; iPtr < A->pattern->ptr[i + 1]; iPtr++) { |
128 |
|
|
if (A->pattern->index[iPtr]==i) { |
129 |
|
|
iptr_main=iPtr; |
130 |
|
|
out->diag[i*9 ]=A->val[iPtr*9 ]; |
131 |
|
|
out->diag[i*9+1]=A->val[iPtr*9+1]; |
132 |
|
|
out->diag[i*9+2]=A->val[iPtr*9+2]; |
133 |
|
|
out->diag[i*9+3]=A->val[iPtr*9+3]; |
134 |
|
|
out->diag[i*9+4]=A->val[iPtr*9+4]; |
135 |
|
|
out->diag[i*9+5]=A->val[iPtr*9+5]; |
136 |
|
|
out->diag[i*9+6]=A->val[iPtr*9+6]; |
137 |
|
|
out->diag[i*9+7]=A->val[iPtr*9+7]; |
138 |
|
|
out->diag[i*9+8]=A->val[iPtr*9+8]; |
139 |
|
|
break; |
140 |
|
|
} |
141 |
|
|
} |
142 |
|
|
out->main_iptr[i]=iptr_main; |
143 |
|
|
} |
144 |
|
|
} |
145 |
|
|
} |
146 |
|
|
|
147 |
|
|
time_fac=Paso_timer()-time0; |
148 |
|
|
} |
149 |
|
|
} |
150 |
|
|
if (Paso_noError()) { |
151 |
|
|
if (verbose) { |
152 |
|
|
printf("GS: %d color used \n",out->num_colors); |
153 |
|
|
printf("timing: GS: coloring/elemination : %e/%e\n",time_color,time_fac); |
154 |
|
|
} |
155 |
|
|
return out; |
156 |
|
|
} else { |
157 |
|
|
Paso_Solver_GS_free(out); |
158 |
|
|
return NULL; |
159 |
|
|
} |
160 |
|
|
} |
161 |
|
|
|
162 |
|
|
/**************************************************************/ |
163 |
|
|
|
164 |
|
|
/* apply GS precondition b-> x |
165 |
|
|
|
166 |
|
|
in fact it solves LD^{-1}Ux=b in the form x= U^{-1} D L^{-1}b |
167 |
|
|
|
168 |
|
|
should be called within a parallel region |
169 |
|
|
barrier synconization should be performed to make sure that the input vector available |
170 |
|
|
|
171 |
|
|
*/ |
172 |
|
|
|
173 |
|
|
void Paso_Solver_solveGS(Paso_Solver_GS * gs, double * x, double * b) { |
174 |
|
|
register dim_t i,k; |
175 |
|
|
register index_t color,iptr_ik,iptr_main; |
176 |
|
|
register double A11,A12,A21,A22,A13,A23,A33,A32,A31,S1,S2,S3,R1,R2,R3,D,S21,S22,S12,S11,S13,S23,S33,S32,S31; |
177 |
|
|
dim_t n_block=gs->n_block; |
178 |
|
|
dim_t n=gs->n; |
179 |
|
|
index_t* pivot=NULL; |
180 |
|
|
|
181 |
|
|
/* copy x into b*/ |
182 |
|
|
#pragma omp parallel for private(i) schedule(static) |
183 |
|
|
for (i=0;i<n*n_block;++i) x[i]=b[i]; |
184 |
|
|
/* forward substitution */ |
185 |
|
|
for (color=0;color<gs->num_colors;++color) { |
186 |
|
|
if (n_block==1) { |
187 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1,iptr_main) |
188 |
|
|
for (i = 0; i < n; ++i) { |
189 |
|
|
if (gs->colorOf[i]==color) { |
190 |
|
|
/* x_i=x_i-a_ik*x_k */ |
191 |
|
|
S1=x[i]; |
192 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
193 |
|
|
k=gs->pattern->index[iptr_ik]; |
194 |
|
|
if (gs->colorOf[k]<color) { |
195 |
|
|
R1=x[k]; |
196 |
|
|
S1-=gs->factors->val[iptr_ik]*R1; |
197 |
|
|
} |
198 |
|
|
} |
199 |
|
|
iptr_main=gs->main_iptr[i]; |
200 |
|
|
x[i]=(1/gs->factors->val[iptr_main])*S1; |
201 |
|
|
} |
202 |
|
|
} |
203 |
|
|
} else if (n_block==2) { |
204 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,iptr_main,S1,S2,R1,R2) |
205 |
|
|
for (i = 0; i < n; ++i) { |
206 |
|
|
if (gs->colorOf[i]==color) { |
207 |
|
|
/* x_i=x_i-a_ik*x_k */ |
208 |
|
|
S1=x[2*i]; |
209 |
|
|
S2=x[2*i+1]; |
210 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
211 |
|
|
k=gs->pattern->index[iptr_ik]; |
212 |
|
|
if (gs->colorOf[k]<color) { |
213 |
|
|
R1=x[2*k]; |
214 |
|
|
R2=x[2*k+1]; |
215 |
|
|
S1-=gs->factors->val[4*iptr_ik ]*R1+gs->factors->val[4*iptr_ik+2]*R2; |
216 |
|
|
S2-=gs->factors->val[4*iptr_ik+1]*R1+gs->factors->val[4*iptr_ik+3]*R2; |
217 |
|
|
} |
218 |
|
|
} |
219 |
|
|
iptr_main=gs->main_iptr[i]; |
220 |
|
|
A11=gs->factors->val[iptr_main*4]; |
221 |
|
|
A21=gs->factors->val[iptr_main*4+1]; |
222 |
|
|
A12=gs->factors->val[iptr_main*4+2]; |
223 |
|
|
A22=gs->factors->val[iptr_main*4+3]; |
224 |
|
|
D = A11*A22-A12*A21; |
225 |
|
|
if (ABS(D)>0.) { |
226 |
|
|
D=1./D; |
227 |
|
|
S11= A22*D; |
228 |
|
|
S21=-A21*D; |
229 |
|
|
S12=-A12*D; |
230 |
|
|
S22= A11*D; |
231 |
|
|
x[2*i ]=S11*S1+S12*S2; |
232 |
|
|
x[2*i+1]=S21*S1+S22*S2; |
233 |
|
|
} else { |
234 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
235 |
|
|
} |
236 |
|
|
} |
237 |
|
|
|
238 |
|
|
} |
239 |
|
|
} else if (n_block==3) { |
240 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,iptr_main,k,S1,S2,S3,R1,R2,R3) |
241 |
|
|
for (i = 0; i < n; ++i) { |
242 |
|
|
if (gs->colorOf[i]==color) { |
243 |
|
|
/* x_i=x_i-a_ik*x_k */ |
244 |
|
|
S1=x[3*i]; |
245 |
|
|
S2=x[3*i+1]; |
246 |
|
|
S3=x[3*i+2]; |
247 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
248 |
|
|
k=gs->pattern->index[iptr_ik]; |
249 |
|
|
if (gs->colorOf[k]<color) { |
250 |
|
|
R1=x[3*k]; |
251 |
|
|
R2=x[3*k+1]; |
252 |
|
|
R3=x[3*k+2]; |
253 |
|
|
S1-=gs->factors->val[9*iptr_ik ]*R1+gs->factors->val[9*iptr_ik+3]*R2+gs->factors->val[9*iptr_ik+6]*R3; |
254 |
|
|
S2-=gs->factors->val[9*iptr_ik+1]*R1+gs->factors->val[9*iptr_ik+4]*R2+gs->factors->val[9*iptr_ik+7]*R3; |
255 |
|
|
S3-=gs->factors->val[9*iptr_ik+2]*R1+gs->factors->val[9*iptr_ik+5]*R2+gs->factors->val[9*iptr_ik+8]*R3; |
256 |
|
|
} |
257 |
|
|
} |
258 |
|
|
iptr_main=gs->main_iptr[i]; |
259 |
|
|
A11=gs->factors->val[iptr_main*9 ]; |
260 |
|
|
A21=gs->factors->val[iptr_main*9+1]; |
261 |
|
|
A31=gs->factors->val[iptr_main*9+2]; |
262 |
|
|
A12=gs->factors->val[iptr_main*9+3]; |
263 |
|
|
A22=gs->factors->val[iptr_main*9+4]; |
264 |
|
|
A32=gs->factors->val[iptr_main*9+5]; |
265 |
|
|
A13=gs->factors->val[iptr_main*9+6]; |
266 |
|
|
A23=gs->factors->val[iptr_main*9+7]; |
267 |
|
|
A33=gs->factors->val[iptr_main*9+8]; |
268 |
|
|
D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22); |
269 |
|
|
if (ABS(D)>0.) { |
270 |
|
|
D=1./D; |
271 |
|
|
S11=(A22*A33-A23*A32)*D; |
272 |
|
|
S21=(A31*A23-A21*A33)*D; |
273 |
|
|
S31=(A21*A32-A31*A22)*D; |
274 |
|
|
S12=(A13*A32-A12*A33)*D; |
275 |
|
|
S22=(A11*A33-A31*A13)*D; |
276 |
|
|
S32=(A12*A31-A11*A32)*D; |
277 |
|
|
S13=(A12*A23-A13*A22)*D; |
278 |
|
|
S23=(A13*A21-A11*A23)*D; |
279 |
|
|
S33=(A11*A22-A12*A21)*D; |
280 |
|
|
/* a_ik=a_ii^{-1}*a_ik */ |
281 |
|
|
x[3*i ]=S11*S1+S12*S2+S13*S3; |
282 |
|
|
x[3*i+1]=S21*S1+S22*S2+S23*S3; |
283 |
|
|
x[3*i+2]=S31*S1+S32*S2+S33*S3; |
284 |
|
|
} else { |
285 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
286 |
|
|
} |
287 |
|
|
} |
288 |
|
|
} |
289 |
|
|
} |
290 |
|
|
} |
291 |
|
|
|
292 |
|
|
/* Multipling with diag(A) */ |
293 |
|
|
Paso_Solver_applyBlockDiagonalMatrix(gs->n_block,gs->n,gs->diag,pivot,x,x); |
294 |
|
|
|
295 |
|
|
/* backward substitution */ |
296 |
|
|
for (color=(gs->num_colors)-1;color>-1;--color) { |
297 |
|
|
if (n_block==1) { |
298 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1) |
299 |
|
|
for (i = 0; i < n; ++i) { |
300 |
|
|
if (gs->colorOf[i]==color) { |
301 |
|
|
/* x_i=x_i-a_ik*x_k */ |
302 |
|
|
S1=x[i]; |
303 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
304 |
|
|
k=gs->pattern->index[iptr_ik]; |
305 |
|
|
if (gs->colorOf[k]>color) { |
306 |
|
|
R1=x[k]; |
307 |
|
|
S1-=gs->factors->val[iptr_ik]*R1; |
308 |
|
|
} |
309 |
|
|
} |
310 |
|
|
/*x[i]=S1;*/ |
311 |
|
|
iptr_main=gs->main_iptr[i]; |
312 |
|
|
x[i]=(1/gs->factors->val[iptr_main])*S1; |
313 |
|
|
} |
314 |
|
|
} |
315 |
|
|
} else if (n_block==2) { |
316 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2) |
317 |
|
|
for (i = 0; i < n; ++i) { |
318 |
|
|
if (gs->colorOf[i]==color) { |
319 |
|
|
/* x_i=x_i-a_ik*x_k */ |
320 |
|
|
S1=x[2*i]; |
321 |
|
|
S2=x[2*i+1]; |
322 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
323 |
|
|
k=gs->pattern->index[iptr_ik]; |
324 |
|
|
if (gs->colorOf[k]>color) { |
325 |
|
|
R1=x[2*k]; |
326 |
|
|
R2=x[2*k+1]; |
327 |
|
|
S1-=gs->factors->val[4*iptr_ik ]*R1+gs->factors->val[4*iptr_ik+2]*R2; |
328 |
|
|
S2-=gs->factors->val[4*iptr_ik+1]*R1+gs->factors->val[4*iptr_ik+3]*R2; |
329 |
|
|
} |
330 |
|
|
} |
331 |
|
|
/*x[2*i]=S1; |
332 |
|
|
x[2*i+1]=S2;*/ |
333 |
|
|
iptr_main=gs->main_iptr[i]; |
334 |
|
|
A11=gs->factors->val[iptr_main*4]; |
335 |
|
|
A21=gs->factors->val[iptr_main*4+1]; |
336 |
|
|
A12=gs->factors->val[iptr_main*4+2]; |
337 |
|
|
A22=gs->factors->val[iptr_main*4+3]; |
338 |
|
|
D = A11*A22-A12*A21; |
339 |
|
|
if (ABS(D)>0.) { |
340 |
|
|
D=1./D; |
341 |
|
|
S11= A22*D; |
342 |
|
|
S21=-A21*D; |
343 |
|
|
S12=-A12*D; |
344 |
|
|
S22= A11*D; |
345 |
|
|
x[2*i ]=S11*S1+S12*S2; |
346 |
|
|
x[2*i+1]=S21*S1+S22*S2; |
347 |
|
|
} else { |
348 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
349 |
|
|
} |
350 |
|
|
|
351 |
|
|
} |
352 |
|
|
} |
353 |
|
|
} else if (n_block==3) { |
354 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3) |
355 |
|
|
for (i = 0; i < n; ++i) { |
356 |
|
|
if (gs->colorOf[i]==color) { |
357 |
|
|
/* x_i=x_i-a_ik*x_k */ |
358 |
|
|
S1=x[3*i ]; |
359 |
|
|
S2=x[3*i+1]; |
360 |
|
|
S3=x[3*i+2]; |
361 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
362 |
|
|
k=gs->pattern->index[iptr_ik]; |
363 |
|
|
if (gs->colorOf[k]>color) { |
364 |
|
|
R1=x[3*k]; |
365 |
|
|
R2=x[3*k+1]; |
366 |
|
|
R3=x[3*k+2]; |
367 |
|
|
S1-=gs->factors->val[9*iptr_ik ]*R1+gs->factors->val[9*iptr_ik+3]*R2+gs->factors->val[9*iptr_ik+6]*R3; |
368 |
|
|
S2-=gs->factors->val[9*iptr_ik+1]*R1+gs->factors->val[9*iptr_ik+4]*R2+gs->factors->val[9*iptr_ik+7]*R3; |
369 |
|
|
S3-=gs->factors->val[9*iptr_ik+2]*R1+gs->factors->val[9*iptr_ik+5]*R2+gs->factors->val[9*iptr_ik+8]*R3; |
370 |
|
|
} |
371 |
|
|
} |
372 |
|
|
/* x[3*i]=S1; |
373 |
|
|
x[3*i+1]=S2; |
374 |
|
|
x[3*i+2]=S3; |
375 |
|
|
*/ iptr_main=gs->main_iptr[i]; |
376 |
|
|
A11=gs->factors->val[iptr_main*9 ]; |
377 |
|
|
A21=gs->factors->val[iptr_main*9+1]; |
378 |
|
|
A31=gs->factors->val[iptr_main*9+2]; |
379 |
|
|
A12=gs->factors->val[iptr_main*9+3]; |
380 |
|
|
A22=gs->factors->val[iptr_main*9+4]; |
381 |
|
|
A32=gs->factors->val[iptr_main*9+5]; |
382 |
|
|
A13=gs->factors->val[iptr_main*9+6]; |
383 |
|
|
A23=gs->factors->val[iptr_main*9+7]; |
384 |
|
|
A33=gs->factors->val[iptr_main*9+8]; |
385 |
|
|
D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22); |
386 |
|
|
if (ABS(D)>0.) { |
387 |
|
|
D=1./D; |
388 |
|
|
S11=(A22*A33-A23*A32)*D; |
389 |
|
|
S21=(A31*A23-A21*A33)*D; |
390 |
|
|
S31=(A21*A32-A31*A22)*D; |
391 |
|
|
S12=(A13*A32-A12*A33)*D; |
392 |
|
|
S22=(A11*A33-A31*A13)*D; |
393 |
|
|
S32=(A12*A31-A11*A32)*D; |
394 |
|
|
S13=(A12*A23-A13*A22)*D; |
395 |
|
|
S23=(A13*A21-A11*A23)*D; |
396 |
|
|
S33=(A11*A22-A12*A21)*D; |
397 |
|
|
x[3*i ]=S11*S1+S12*S2+S13*S3; |
398 |
|
|
x[3*i+1]=S21*S1+S22*S2+S23*S3; |
399 |
|
|
x[3*i+2]=S31*S1+S32*S2+S33*S3; |
400 |
|
|
} else { |
401 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
402 |
|
|
} |
403 |
|
|
} |
404 |
|
|
} |
405 |
|
|
} |
406 |
|
|
} |
407 |
|
|
|
408 |
|
|
return; |
409 |
|
|
} |
410 |
|
|
|
411 |
artak |
1902 |
/**************************************************************/ |
412 |
|
|
|
413 |
|
|
/* apply GS precondition b-> x |
414 |
|
|
|
415 |
|
|
in fact it solves LD^{-1}Ux=b in the form x= U^{-1} D L^{-1}b |
416 |
|
|
|
417 |
|
|
should be called within a parallel region |
418 |
|
|
barrier synconization should be performed to make sure that the input vector available |
419 |
|
|
|
420 |
|
|
*/ |
421 |
|
|
|
422 |
|
|
void Paso_Solver_solveGS1(Paso_Solver_GS * gs, double * x, double * b) { |
423 |
|
|
register dim_t i,k; |
424 |
|
|
register index_t color,iptr_ik,iptr_main; |
425 |
|
|
register double A11,A12,A21,A22,A13,A23,A33,A32,A31,S1,S2,S3,R1,R2,R3,D,S21,S22,S12,S11,S13,S23,S33,S32,S31; |
426 |
|
|
dim_t n_block=gs->n_block; |
427 |
|
|
dim_t n=gs->n; |
428 |
|
|
index_t* pivot=NULL; |
429 |
|
|
|
430 |
|
|
/* copy x into b*/ |
431 |
|
|
#pragma omp parallel for private(i) schedule(static) |
432 |
|
|
/*for (i=0;i<n*n_block;++i) x[i]=b[i];*/ |
433 |
|
|
|
434 |
|
|
/* forward substitution */ |
435 |
|
|
for (color=0;color<gs->num_colors;++color) { |
436 |
|
|
if (n_block==1) { |
437 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1,iptr_main) |
438 |
|
|
for (i = 0; i < n; ++i) { |
439 |
|
|
if (gs->colorOf[i]==color) { |
440 |
|
|
/* x_i=x_i-a_ik*x_k */ |
441 |
|
|
S1=b[i]; |
442 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
443 |
|
|
k=gs->pattern->index[iptr_ik]; |
444 |
|
|
if (gs->colorOf[k]<color) { |
445 |
|
|
R1=x[k]; |
446 |
|
|
S1-=gs->factors->val[iptr_ik]*R1; |
447 |
|
|
} |
448 |
|
|
} |
449 |
|
|
iptr_main=gs->main_iptr[i]; |
450 |
|
|
x[i]=(1/gs->factors->val[iptr_main])*S1; |
451 |
|
|
} |
452 |
|
|
} |
453 |
|
|
} else if (n_block==2) { |
454 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,iptr_main,S1,S2,R1,R2) |
455 |
|
|
for (i = 0; i < n; ++i) { |
456 |
|
|
if (gs->colorOf[i]==color) { |
457 |
|
|
/* x_i=x_i-a_ik*x_k */ |
458 |
|
|
S1=b[2*i]; |
459 |
|
|
S2=b[2*i+1]; |
460 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
461 |
|
|
k=gs->pattern->index[iptr_ik]; |
462 |
|
|
if (gs->colorOf[k]<color) { |
463 |
|
|
R1=x[2*k]; |
464 |
|
|
R2=x[2*k+1]; |
465 |
|
|
S1-=gs->factors->val[4*iptr_ik ]*R1+gs->factors->val[4*iptr_ik+2]*R2; |
466 |
|
|
S2-=gs->factors->val[4*iptr_ik+1]*R1+gs->factors->val[4*iptr_ik+3]*R2; |
467 |
|
|
} |
468 |
|
|
} |
469 |
|
|
iptr_main=gs->main_iptr[i]; |
470 |
|
|
A11=gs->factors->val[iptr_main*4]; |
471 |
|
|
A21=gs->factors->val[iptr_main*4+1]; |
472 |
|
|
A12=gs->factors->val[iptr_main*4+2]; |
473 |
|
|
A22=gs->factors->val[iptr_main*4+3]; |
474 |
|
|
D = A11*A22-A12*A21; |
475 |
|
|
if (ABS(D)>0.) { |
476 |
|
|
D=1./D; |
477 |
|
|
S11= A22*D; |
478 |
|
|
S21=-A21*D; |
479 |
|
|
S12=-A12*D; |
480 |
|
|
S22= A11*D; |
481 |
|
|
x[2*i ]=S11*S1+S12*S2; |
482 |
|
|
x[2*i+1]=S21*S1+S22*S2; |
483 |
|
|
} else { |
484 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
485 |
|
|
} |
486 |
|
|
} |
487 |
|
|
|
488 |
|
|
} |
489 |
|
|
} else if (n_block==3) { |
490 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,iptr_main,k,S1,S2,S3,R1,R2,R3) |
491 |
|
|
for (i = 0; i < n; ++i) { |
492 |
|
|
if (gs->colorOf[i]==color) { |
493 |
|
|
/* x_i=x_i-a_ik*x_k */ |
494 |
|
|
S1=b[3*i]; |
495 |
|
|
S2=b[3*i+1]; |
496 |
|
|
S3=b[3*i+2]; |
497 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
498 |
|
|
k=gs->pattern->index[iptr_ik]; |
499 |
|
|
if (gs->colorOf[k]<color) { |
500 |
|
|
R1=x[3*k]; |
501 |
|
|
R2=x[3*k+1]; |
502 |
|
|
R3=x[3*k+2]; |
503 |
|
|
S1-=gs->factors->val[9*iptr_ik ]*R1+gs->factors->val[9*iptr_ik+3]*R2+gs->factors->val[9*iptr_ik+6]*R3; |
504 |
|
|
S2-=gs->factors->val[9*iptr_ik+1]*R1+gs->factors->val[9*iptr_ik+4]*R2+gs->factors->val[9*iptr_ik+7]*R3; |
505 |
|
|
S3-=gs->factors->val[9*iptr_ik+2]*R1+gs->factors->val[9*iptr_ik+5]*R2+gs->factors->val[9*iptr_ik+8]*R3; |
506 |
|
|
} |
507 |
|
|
} |
508 |
|
|
iptr_main=gs->main_iptr[i]; |
509 |
|
|
A11=gs->factors->val[iptr_main*9 ]; |
510 |
|
|
A21=gs->factors->val[iptr_main*9+1]; |
511 |
|
|
A31=gs->factors->val[iptr_main*9+2]; |
512 |
|
|
A12=gs->factors->val[iptr_main*9+3]; |
513 |
|
|
A22=gs->factors->val[iptr_main*9+4]; |
514 |
|
|
A32=gs->factors->val[iptr_main*9+5]; |
515 |
|
|
A13=gs->factors->val[iptr_main*9+6]; |
516 |
|
|
A23=gs->factors->val[iptr_main*9+7]; |
517 |
|
|
A33=gs->factors->val[iptr_main*9+8]; |
518 |
|
|
D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22); |
519 |
|
|
if (ABS(D)>0.) { |
520 |
|
|
D=1./D; |
521 |
|
|
S11=(A22*A33-A23*A32)*D; |
522 |
|
|
S21=(A31*A23-A21*A33)*D; |
523 |
|
|
S31=(A21*A32-A31*A22)*D; |
524 |
|
|
S12=(A13*A32-A12*A33)*D; |
525 |
|
|
S22=(A11*A33-A31*A13)*D; |
526 |
|
|
S32=(A12*A31-A11*A32)*D; |
527 |
|
|
S13=(A12*A23-A13*A22)*D; |
528 |
|
|
S23=(A13*A21-A11*A23)*D; |
529 |
|
|
S33=(A11*A22-A12*A21)*D; |
530 |
|
|
/* a_ik=a_ii^{-1}*a_ik */ |
531 |
|
|
x[3*i ]=S11*S1+S12*S2+S13*S3; |
532 |
|
|
x[3*i+1]=S21*S1+S22*S2+S23*S3; |
533 |
|
|
x[3*i+2]=S31*S1+S32*S2+S33*S3; |
534 |
|
|
} else { |
535 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
536 |
|
|
} |
537 |
|
|
} |
538 |
|
|
} |
539 |
|
|
} |
540 |
|
|
} |
541 |
|
|
|
542 |
|
|
/* Multipling with diag(A) */ |
543 |
|
|
Paso_Solver_applyBlockDiagonalMatrix(gs->n_block,gs->n,gs->diag,pivot,x,x); |
544 |
|
|
|
545 |
|
|
/* backward substitution */ |
546 |
|
|
for (color=(gs->num_colors)-1;color>-1;--color) { |
547 |
|
|
if (n_block==1) { |
548 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1) |
549 |
|
|
for (i = 0; i < n; ++i) { |
550 |
|
|
if (gs->colorOf[i]==color) { |
551 |
|
|
/* x_i=x_i-a_ik*x_k */ |
552 |
|
|
S1=x[i]; |
553 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
554 |
|
|
k=gs->pattern->index[iptr_ik]; |
555 |
|
|
if (gs->colorOf[k]>color) { |
556 |
|
|
R1=x[k]; |
557 |
|
|
S1-=gs->factors->val[iptr_ik]*R1; |
558 |
|
|
} |
559 |
|
|
} |
560 |
|
|
/*x[i]=S1;*/ |
561 |
|
|
iptr_main=gs->main_iptr[i]; |
562 |
|
|
x[i]=(1/gs->factors->val[iptr_main])*S1; |
563 |
|
|
} |
564 |
|
|
} |
565 |
|
|
} else if (n_block==2) { |
566 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2) |
567 |
|
|
for (i = 0; i < n; ++i) { |
568 |
|
|
if (gs->colorOf[i]==color) { |
569 |
|
|
/* x_i=x_i-a_ik*x_k */ |
570 |
|
|
S1=x[2*i]; |
571 |
|
|
S2=x[2*i+1]; |
572 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
573 |
|
|
k=gs->pattern->index[iptr_ik]; |
574 |
|
|
if (gs->colorOf[k]>color) { |
575 |
|
|
R1=x[2*k]; |
576 |
|
|
R2=x[2*k+1]; |
577 |
|
|
S1-=gs->factors->val[4*iptr_ik ]*R1+gs->factors->val[4*iptr_ik+2]*R2; |
578 |
|
|
S2-=gs->factors->val[4*iptr_ik+1]*R1+gs->factors->val[4*iptr_ik+3]*R2; |
579 |
|
|
} |
580 |
|
|
} |
581 |
|
|
/*x[2*i]=S1; |
582 |
|
|
x[2*i+1]=S2;*/ |
583 |
|
|
iptr_main=gs->main_iptr[i]; |
584 |
|
|
A11=gs->factors->val[iptr_main*4]; |
585 |
|
|
A21=gs->factors->val[iptr_main*4+1]; |
586 |
|
|
A12=gs->factors->val[iptr_main*4+2]; |
587 |
|
|
A22=gs->factors->val[iptr_main*4+3]; |
588 |
|
|
D = A11*A22-A12*A21; |
589 |
|
|
if (ABS(D)>0.) { |
590 |
|
|
D=1./D; |
591 |
|
|
S11= A22*D; |
592 |
|
|
S21=-A21*D; |
593 |
|
|
S12=-A12*D; |
594 |
|
|
S22= A11*D; |
595 |
|
|
x[2*i ]=S11*S1+S12*S2; |
596 |
|
|
x[2*i+1]=S21*S1+S22*S2; |
597 |
|
|
} else { |
598 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
599 |
|
|
} |
600 |
|
|
|
601 |
|
|
} |
602 |
|
|
} |
603 |
|
|
} else if (n_block==3) { |
604 |
|
|
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3) |
605 |
|
|
for (i = 0; i < n; ++i) { |
606 |
|
|
if (gs->colorOf[i]==color) { |
607 |
|
|
/* x_i=x_i-a_ik*x_k */ |
608 |
|
|
S1=x[3*i ]; |
609 |
|
|
S2=x[3*i+1]; |
610 |
|
|
S3=x[3*i+2]; |
611 |
|
|
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
612 |
|
|
k=gs->pattern->index[iptr_ik]; |
613 |
|
|
if (gs->colorOf[k]>color) { |
614 |
|
|
R1=x[3*k]; |
615 |
|
|
R2=x[3*k+1]; |
616 |
|
|
R3=x[3*k+2]; |
617 |
|
|
S1-=gs->factors->val[9*iptr_ik ]*R1+gs->factors->val[9*iptr_ik+3]*R2+gs->factors->val[9*iptr_ik+6]*R3; |
618 |
|
|
S2-=gs->factors->val[9*iptr_ik+1]*R1+gs->factors->val[9*iptr_ik+4]*R2+gs->factors->val[9*iptr_ik+7]*R3; |
619 |
|
|
S3-=gs->factors->val[9*iptr_ik+2]*R1+gs->factors->val[9*iptr_ik+5]*R2+gs->factors->val[9*iptr_ik+8]*R3; |
620 |
|
|
} |
621 |
|
|
} |
622 |
|
|
/* x[3*i]=S1; |
623 |
|
|
x[3*i+1]=S2; |
624 |
|
|
x[3*i+2]=S3; |
625 |
|
|
*/ iptr_main=gs->main_iptr[i]; |
626 |
|
|
A11=gs->factors->val[iptr_main*9 ]; |
627 |
|
|
A21=gs->factors->val[iptr_main*9+1]; |
628 |
|
|
A31=gs->factors->val[iptr_main*9+2]; |
629 |
|
|
A12=gs->factors->val[iptr_main*9+3]; |
630 |
|
|
A22=gs->factors->val[iptr_main*9+4]; |
631 |
|
|
A32=gs->factors->val[iptr_main*9+5]; |
632 |
|
|
A13=gs->factors->val[iptr_main*9+6]; |
633 |
|
|
A23=gs->factors->val[iptr_main*9+7]; |
634 |
|
|
A33=gs->factors->val[iptr_main*9+8]; |
635 |
|
|
D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22); |
636 |
|
|
if (ABS(D)>0.) { |
637 |
|
|
D=1./D; |
638 |
|
|
S11=(A22*A33-A23*A32)*D; |
639 |
|
|
S21=(A31*A23-A21*A33)*D; |
640 |
|
|
S31=(A21*A32-A31*A22)*D; |
641 |
|
|
S12=(A13*A32-A12*A33)*D; |
642 |
|
|
S22=(A11*A33-A31*A13)*D; |
643 |
|
|
S32=(A12*A31-A11*A32)*D; |
644 |
|
|
S13=(A12*A23-A13*A22)*D; |
645 |
|
|
S23=(A13*A21-A11*A23)*D; |
646 |
|
|
S33=(A11*A22-A12*A21)*D; |
647 |
|
|
x[3*i ]=S11*S1+S12*S2+S13*S3; |
648 |
|
|
x[3*i+1]=S21*S1+S22*S2+S23*S3; |
649 |
|
|
x[3*i+2]=S31*S1+S32*S2+S33*S3; |
650 |
|
|
} else { |
651 |
|
|
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
652 |
|
|
} |
653 |
|
|
} |
654 |
|
|
} |
655 |
|
|
} |
656 |
|
|
} |
657 |
|
|
|
658 |
|
|
return; |
659 |
|
|
} |
660 |
|
|
|