1 |
|
2 |
/******************************************************* |
3 |
* |
4 |
* Copyright (c) 2003-2008 by University of Queensland |
5 |
* Earth Systems Science Computational Center (ESSCC) |
6 |
* http://www.uq.edu.au/esscc |
7 |
* |
8 |
* Primary Business: Queensland, Australia |
9 |
* Licensed under the Open Software License version 3.0 |
10 |
* http://www.opensource.org/licenses/osl-3.0.php |
11 |
* |
12 |
*******************************************************/ |
13 |
|
14 |
|
15 |
/**************************************************************/ |
16 |
|
17 |
/* Paso: GS preconditioner with reordering */ |
18 |
|
19 |
/**************************************************************/ |
20 |
|
21 |
/* Copyrights by ACcESS Australia 2003,2004,2005,2006,2007,2008 */ |
22 |
/* Author: artak@uq.edu.au */ |
23 |
|
24 |
/**************************************************************/ |
25 |
|
26 |
#include "Paso.h" |
27 |
#include "Solver.h" |
28 |
#include "PasoUtil.h" |
29 |
|
30 |
#include <stdio.h> |
31 |
|
32 |
/**************************************************************/ |
33 |
|
34 |
/* free all memory used by GS */ |
35 |
|
36 |
void Paso_Solver_GS_free(Paso_Solver_GS * in) { |
37 |
if (in!=NULL) { |
38 |
MEMFREE(in->colorOf); |
39 |
Paso_SparseMatrix_free(in->factors); |
40 |
MEMFREE(in->diag); |
41 |
MEMFREE(in->main_iptr); |
42 |
Paso_Pattern_free(in->pattern); |
43 |
MEMFREE(in); |
44 |
} |
45 |
} |
46 |
|
47 |
/**************************************************************/ |
48 |
|
49 |
/* constructs the incomplete block factorization of |
50 |
|
51 |
*/ |
52 |
Paso_Solver_GS* Paso_Solver_getGS(Paso_SparseMatrix * A,bool_t verbose) { |
53 |
dim_t n=A->numRows; |
54 |
dim_t n_block=A->row_block_size; |
55 |
dim_t block_size=A->block_size; |
56 |
index_t num_colors=0, *mis_marker=NULL; |
57 |
register index_t i,iptr_main,iPtr,iptr_ik,k,iptr_kj,j,iptr_ij,color,color2; |
58 |
double time0,time_color,time_fac; |
59 |
/* allocations: */ |
60 |
Paso_Solver_GS* out=MEMALLOC(1,Paso_Solver_GS); |
61 |
if (Paso_checkPtr(out)) return NULL; |
62 |
out->colorOf=MEMALLOC(n,index_t); |
63 |
out->diag=MEMALLOC( ((size_t) n) * ((size_t) block_size),double); |
64 |
/*out->diag=MEMALLOC(A->len,double);*/ |
65 |
out->main_iptr=MEMALLOC(n,index_t); |
66 |
out->pattern=Paso_Pattern_getReference(A->pattern); |
67 |
out->factors=Paso_SparseMatrix_getReference(A); |
68 |
out->n_block=n_block; |
69 |
out->n=n; |
70 |
|
71 |
if ( !(Paso_checkPtr(out->colorOf) || Paso_checkPtr(out->main_iptr) || Paso_checkPtr(out->factors)) ) { |
72 |
time0=Paso_timer(); |
73 |
Paso_Pattern_color(A->pattern,&out->num_colors,out->colorOf); |
74 |
time_color=Paso_timer()-time0; |
75 |
|
76 |
if (Paso_noError()) { |
77 |
time0=Paso_timer(); |
78 |
|
79 |
if (! (Paso_checkPtr(out->diag))) { |
80 |
if (n_block==1) { |
81 |
#pragma omp parallel for private(i, iPtr) schedule(static) |
82 |
for (i = 0; i < A->pattern->numOutput; i++) { |
83 |
out->diag[i]=1.; |
84 |
/* find main diagonal */ |
85 |
for (iPtr = A->pattern->ptr[i]; iPtr < A->pattern->ptr[i + 1]; iPtr++) { |
86 |
if (A->pattern->index[iPtr]==i) { |
87 |
iptr_main=iPtr; |
88 |
out->diag[i]=A->val[iPtr]; |
89 |
break; |
90 |
} |
91 |
} |
92 |
out->main_iptr[i]=iptr_main; |
93 |
} |
94 |
} else if (n_block==2) { |
95 |
#pragma omp parallel for private(i, iPtr) schedule(static) |
96 |
for (i = 0; i < A->pattern->numOutput; i++) { |
97 |
out->diag[i*4+0]= 1.; |
98 |
out->diag[i*4+1]= 0.; |
99 |
out->diag[i*4+2]= 0.; |
100 |
out->diag[i*4+3]= 1.; |
101 |
/* find main diagonal */ |
102 |
for (iPtr = A->pattern->ptr[i]; iPtr < A->pattern->ptr[i + 1]; iPtr++) { |
103 |
if (A->pattern->index[iPtr]==i) { |
104 |
iptr_main=iPtr; |
105 |
out->diag[i*4]= A->val[iPtr*4]; |
106 |
out->diag[i*4+1]=A->val[iPtr*4+1]; |
107 |
out->diag[i*4+2]=A->val[iPtr*4+2]; |
108 |
out->diag[i*4+3]= A->val[iPtr*4+3]; |
109 |
break; |
110 |
} |
111 |
} |
112 |
out->main_iptr[i]=iptr_main; |
113 |
} |
114 |
} else if (n_block==3) { |
115 |
#pragma omp parallel for private(i, iPtr) schedule(static) |
116 |
for (i = 0; i < A->pattern->numOutput; i++) { |
117 |
out->diag[i*9 ]=1.; |
118 |
out->diag[i*9+1]=0.; |
119 |
out->diag[i*9+2]=0.; |
120 |
out->diag[i*9+3]=0.; |
121 |
out->diag[i*9+4]=1.; |
122 |
out->diag[i*9+5]=0.; |
123 |
out->diag[i*9+6]=0.; |
124 |
out->diag[i*9+7]=0.; |
125 |
out->diag[i*9+8]=1.; |
126 |
/* find main diagonal */ |
127 |
for (iPtr = A->pattern->ptr[i]; iPtr < A->pattern->ptr[i + 1]; iPtr++) { |
128 |
if (A->pattern->index[iPtr]==i) { |
129 |
iptr_main=iPtr; |
130 |
out->diag[i*9 ]=A->val[iPtr*9 ]; |
131 |
out->diag[i*9+1]=A->val[iPtr*9+1]; |
132 |
out->diag[i*9+2]=A->val[iPtr*9+2]; |
133 |
out->diag[i*9+3]=A->val[iPtr*9+3]; |
134 |
out->diag[i*9+4]=A->val[iPtr*9+4]; |
135 |
out->diag[i*9+5]=A->val[iPtr*9+5]; |
136 |
out->diag[i*9+6]=A->val[iPtr*9+6]; |
137 |
out->diag[i*9+7]=A->val[iPtr*9+7]; |
138 |
out->diag[i*9+8]=A->val[iPtr*9+8]; |
139 |
break; |
140 |
} |
141 |
} |
142 |
out->main_iptr[i]=iptr_main; |
143 |
} |
144 |
} |
145 |
} |
146 |
|
147 |
time_fac=Paso_timer()-time0; |
148 |
} |
149 |
} |
150 |
if (Paso_noError()) { |
151 |
if (verbose) { |
152 |
printf("GS: %d color used \n",out->num_colors); |
153 |
printf("timing: GS: coloring/elemination : %e/%e\n",time_color,time_fac); |
154 |
} |
155 |
return out; |
156 |
} else { |
157 |
Paso_Solver_GS_free(out); |
158 |
return NULL; |
159 |
} |
160 |
} |
161 |
|
162 |
/**************************************************************/ |
163 |
|
164 |
/* apply GS precondition b-> x |
165 |
|
166 |
in fact it solves LD^{-1}Ux=b in the form x= U^{-1} D L^{-1}b |
167 |
|
168 |
should be called within a parallel region |
169 |
barrier synconization should be performed to make sure that the input vector available |
170 |
|
171 |
*/ |
172 |
|
173 |
void Paso_Solver_solveGS(Paso_Solver_GS * gs, double * x, double * b) { |
174 |
register dim_t i,k; |
175 |
register index_t color,iptr_ik,iptr_main; |
176 |
register double A11,A12,A21,A22,A13,A23,A33,A32,A31,S1,S2,S3,R1,R2,R3,D,S21,S22,S12,S11,S13,S23,S33,S32,S31; |
177 |
dim_t n_block=gs->n_block; |
178 |
dim_t n=gs->n; |
179 |
index_t* pivot=NULL; |
180 |
|
181 |
/* copy x into b*/ |
182 |
#pragma omp parallel for private(i) schedule(static) |
183 |
for (i=0;i<n*n_block;++i) x[i]=b[i]; |
184 |
/* forward substitution */ |
185 |
for (color=0;color<gs->num_colors;++color) { |
186 |
if (n_block==1) { |
187 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1,iptr_main) |
188 |
for (i = 0; i < n; ++i) { |
189 |
if (gs->colorOf[i]==color) { |
190 |
/* x_i=x_i-a_ik*x_k */ |
191 |
S1=x[i]; |
192 |
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
193 |
k=gs->pattern->index[iptr_ik]; |
194 |
if (gs->colorOf[k]<color) { |
195 |
R1=x[k]; |
196 |
S1-=gs->factors->val[iptr_ik]*R1; |
197 |
} |
198 |
} |
199 |
iptr_main=gs->main_iptr[i]; |
200 |
x[i]=(1/gs->factors->val[iptr_main])*S1; |
201 |
} |
202 |
} |
203 |
} else if (n_block==2) { |
204 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,iptr_main,S1,S2,R1,R2) |
205 |
for (i = 0; i < n; ++i) { |
206 |
if (gs->colorOf[i]==color) { |
207 |
/* x_i=x_i-a_ik*x_k */ |
208 |
S1=x[2*i]; |
209 |
S2=x[2*i+1]; |
210 |
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
211 |
k=gs->pattern->index[iptr_ik]; |
212 |
if (gs->colorOf[k]<color) { |
213 |
R1=x[2*k]; |
214 |
R2=x[2*k+1]; |
215 |
S1-=gs->factors->val[4*iptr_ik ]*R1+gs->factors->val[4*iptr_ik+2]*R2; |
216 |
S2-=gs->factors->val[4*iptr_ik+1]*R1+gs->factors->val[4*iptr_ik+3]*R2; |
217 |
} |
218 |
} |
219 |
iptr_main=gs->main_iptr[i]; |
220 |
A11=gs->factors->val[iptr_main*4]; |
221 |
A21=gs->factors->val[iptr_main*4+1]; |
222 |
A12=gs->factors->val[iptr_main*4+2]; |
223 |
A22=gs->factors->val[iptr_main*4+3]; |
224 |
D = A11*A22-A12*A21; |
225 |
if (ABS(D)>0.) { |
226 |
D=1./D; |
227 |
S11= A22*D; |
228 |
S21=-A21*D; |
229 |
S12=-A12*D; |
230 |
S22= A11*D; |
231 |
x[2*i ]=S11*S1+S12*S2; |
232 |
x[2*i+1]=S21*S1+S22*S2; |
233 |
} else { |
234 |
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
235 |
} |
236 |
} |
237 |
|
238 |
} |
239 |
} else if (n_block==3) { |
240 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,iptr_main,k,S1,S2,S3,R1,R2,R3) |
241 |
for (i = 0; i < n; ++i) { |
242 |
if (gs->colorOf[i]==color) { |
243 |
/* x_i=x_i-a_ik*x_k */ |
244 |
S1=x[3*i]; |
245 |
S2=x[3*i+1]; |
246 |
S3=x[3*i+2]; |
247 |
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
248 |
k=gs->pattern->index[iptr_ik]; |
249 |
if (gs->colorOf[k]<color) { |
250 |
R1=x[3*k]; |
251 |
R2=x[3*k+1]; |
252 |
R3=x[3*k+2]; |
253 |
S1-=gs->factors->val[9*iptr_ik ]*R1+gs->factors->val[9*iptr_ik+3]*R2+gs->factors->val[9*iptr_ik+6]*R3; |
254 |
S2-=gs->factors->val[9*iptr_ik+1]*R1+gs->factors->val[9*iptr_ik+4]*R2+gs->factors->val[9*iptr_ik+7]*R3; |
255 |
S3-=gs->factors->val[9*iptr_ik+2]*R1+gs->factors->val[9*iptr_ik+5]*R2+gs->factors->val[9*iptr_ik+8]*R3; |
256 |
} |
257 |
} |
258 |
iptr_main=gs->main_iptr[i]; |
259 |
A11=gs->factors->val[iptr_main*9 ]; |
260 |
A21=gs->factors->val[iptr_main*9+1]; |
261 |
A31=gs->factors->val[iptr_main*9+2]; |
262 |
A12=gs->factors->val[iptr_main*9+3]; |
263 |
A22=gs->factors->val[iptr_main*9+4]; |
264 |
A32=gs->factors->val[iptr_main*9+5]; |
265 |
A13=gs->factors->val[iptr_main*9+6]; |
266 |
A23=gs->factors->val[iptr_main*9+7]; |
267 |
A33=gs->factors->val[iptr_main*9+8]; |
268 |
D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22); |
269 |
if (ABS(D)>0.) { |
270 |
D=1./D; |
271 |
S11=(A22*A33-A23*A32)*D; |
272 |
S21=(A31*A23-A21*A33)*D; |
273 |
S31=(A21*A32-A31*A22)*D; |
274 |
S12=(A13*A32-A12*A33)*D; |
275 |
S22=(A11*A33-A31*A13)*D; |
276 |
S32=(A12*A31-A11*A32)*D; |
277 |
S13=(A12*A23-A13*A22)*D; |
278 |
S23=(A13*A21-A11*A23)*D; |
279 |
S33=(A11*A22-A12*A21)*D; |
280 |
/* a_ik=a_ii^{-1}*a_ik */ |
281 |
x[3*i ]=S11*S1+S12*S2+S13*S3; |
282 |
x[3*i+1]=S21*S1+S22*S2+S23*S3; |
283 |
x[3*i+2]=S31*S1+S32*S2+S33*S3; |
284 |
} else { |
285 |
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
286 |
} |
287 |
} |
288 |
} |
289 |
} |
290 |
} |
291 |
|
292 |
/* Multipling with diag(A) */ |
293 |
Paso_Solver_applyBlockDiagonalMatrix(gs->n_block,gs->n,gs->diag,pivot,x,x); |
294 |
|
295 |
/* backward substitution */ |
296 |
for (color=(gs->num_colors)-1;color>-1;--color) { |
297 |
if (n_block==1) { |
298 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,R1) |
299 |
for (i = 0; i < n; ++i) { |
300 |
if (gs->colorOf[i]==color) { |
301 |
/* x_i=x_i-a_ik*x_k */ |
302 |
S1=x[i]; |
303 |
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
304 |
k=gs->pattern->index[iptr_ik]; |
305 |
if (gs->colorOf[k]>color) { |
306 |
R1=x[k]; |
307 |
S1-=gs->factors->val[iptr_ik]*R1; |
308 |
} |
309 |
} |
310 |
/*x[i]=S1;*/ |
311 |
iptr_main=gs->main_iptr[i]; |
312 |
x[i]=(1/gs->factors->val[iptr_main])*S1; |
313 |
} |
314 |
} |
315 |
} else if (n_block==2) { |
316 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2) |
317 |
for (i = 0; i < n; ++i) { |
318 |
if (gs->colorOf[i]==color) { |
319 |
/* x_i=x_i-a_ik*x_k */ |
320 |
S1=x[2*i]; |
321 |
S2=x[2*i+1]; |
322 |
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
323 |
k=gs->pattern->index[iptr_ik]; |
324 |
if (gs->colorOf[k]>color) { |
325 |
R1=x[2*k]; |
326 |
R2=x[2*k+1]; |
327 |
S1-=gs->factors->val[4*iptr_ik ]*R1+gs->factors->val[4*iptr_ik+2]*R2; |
328 |
S2-=gs->factors->val[4*iptr_ik+1]*R1+gs->factors->val[4*iptr_ik+3]*R2; |
329 |
} |
330 |
} |
331 |
/*x[2*i]=S1; |
332 |
x[2*i+1]=S2;*/ |
333 |
iptr_main=gs->main_iptr[i]; |
334 |
A11=gs->factors->val[iptr_main*4]; |
335 |
A21=gs->factors->val[iptr_main*4+1]; |
336 |
A12=gs->factors->val[iptr_main*4+2]; |
337 |
A22=gs->factors->val[iptr_main*4+3]; |
338 |
D = A11*A22-A12*A21; |
339 |
if (ABS(D)>0.) { |
340 |
D=1./D; |
341 |
S11= A22*D; |
342 |
S21=-A21*D; |
343 |
S12=-A12*D; |
344 |
S22= A11*D; |
345 |
x[2*i ]=S11*S1+S12*S2; |
346 |
x[2*i+1]=S21*S1+S22*S2; |
347 |
} else { |
348 |
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
349 |
} |
350 |
|
351 |
} |
352 |
} |
353 |
} else if (n_block==3) { |
354 |
#pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3) |
355 |
for (i = 0; i < n; ++i) { |
356 |
if (gs->colorOf[i]==color) { |
357 |
/* x_i=x_i-a_ik*x_k */ |
358 |
S1=x[3*i ]; |
359 |
S2=x[3*i+1]; |
360 |
S3=x[3*i+2]; |
361 |
for (iptr_ik=gs->pattern->ptr[i];iptr_ik<gs->pattern->ptr[i+1]; ++iptr_ik) { |
362 |
k=gs->pattern->index[iptr_ik]; |
363 |
if (gs->colorOf[k]>color) { |
364 |
R1=x[3*k]; |
365 |
R2=x[3*k+1]; |
366 |
R3=x[3*k+2]; |
367 |
S1-=gs->factors->val[9*iptr_ik ]*R1+gs->factors->val[9*iptr_ik+3]*R2+gs->factors->val[9*iptr_ik+6]*R3; |
368 |
S2-=gs->factors->val[9*iptr_ik+1]*R1+gs->factors->val[9*iptr_ik+4]*R2+gs->factors->val[9*iptr_ik+7]*R3; |
369 |
S3-=gs->factors->val[9*iptr_ik+2]*R1+gs->factors->val[9*iptr_ik+5]*R2+gs->factors->val[9*iptr_ik+8]*R3; |
370 |
} |
371 |
} |
372 |
/* x[3*i]=S1; |
373 |
x[3*i+1]=S2; |
374 |
x[3*i+2]=S3; |
375 |
*/ iptr_main=gs->main_iptr[i]; |
376 |
A11=gs->factors->val[iptr_main*9 ]; |
377 |
A21=gs->factors->val[iptr_main*9+1]; |
378 |
A31=gs->factors->val[iptr_main*9+2]; |
379 |
A12=gs->factors->val[iptr_main*9+3]; |
380 |
A22=gs->factors->val[iptr_main*9+4]; |
381 |
A32=gs->factors->val[iptr_main*9+5]; |
382 |
A13=gs->factors->val[iptr_main*9+6]; |
383 |
A23=gs->factors->val[iptr_main*9+7]; |
384 |
A33=gs->factors->val[iptr_main*9+8]; |
385 |
D = A11*(A22*A33-A23*A32)+ A12*(A31*A23-A21*A33)+A13*(A21*A32-A31*A22); |
386 |
if (ABS(D)>0.) { |
387 |
D=1./D; |
388 |
S11=(A22*A33-A23*A32)*D; |
389 |
S21=(A31*A23-A21*A33)*D; |
390 |
S31=(A21*A32-A31*A22)*D; |
391 |
S12=(A13*A32-A12*A33)*D; |
392 |
S22=(A11*A33-A31*A13)*D; |
393 |
S32=(A12*A31-A11*A32)*D; |
394 |
S13=(A12*A23-A13*A22)*D; |
395 |
S23=(A13*A21-A11*A23)*D; |
396 |
S33=(A11*A22-A12*A21)*D; |
397 |
x[3*i ]=S11*S1+S12*S2+S13*S3; |
398 |
x[3*i+1]=S21*S1+S22*S2+S23*S3; |
399 |
x[3*i+2]=S31*S1+S32*S2+S33*S3; |
400 |
} else { |
401 |
Paso_setError(ZERO_DIVISION_ERROR, "Paso_Solver_getGS: non-regular main diagonal block."); |
402 |
} |
403 |
} |
404 |
} |
405 |
} |
406 |
} |
407 |
|
408 |
if (gs->sweeps>1) { |
409 |
/* Compute the residual b=b-Ax*/ |
410 |
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(DBLE(-1), gs->factors, x, DBLE(2), b); |
411 |
/* Go round again*/ |
412 |
gs->sweeps=gs->sweeps-1; |
413 |
Paso_Solver_solveGS(gs,x,b); |
414 |
} |
415 |
|
416 |
return; |
417 |
} |
418 |
|