1 |
|
2 |
/******************************************************* |
3 |
* |
4 |
* Copyright (c) 2003-2010 by University of Queensland |
5 |
* Earth Systems Science Computational Center (ESSCC) |
6 |
* http://www.uq.edu.au/esscc |
7 |
* |
8 |
* Primary Business: Queensland, Australia |
9 |
* Licensed under the Open Software License version 3.0 |
10 |
* http://www.opensource.org/licenses/osl-3.0.php |
11 |
* |
12 |
*******************************************************/ |
13 |
|
14 |
|
15 |
/**************************************************************/ |
16 |
|
17 |
/* Paso: AMG preconditioner (local version) */ |
18 |
|
19 |
/**************************************************************/ |
20 |
|
21 |
/* Author: artak@uq.edu.au, l.gross@uq.edu.au */ |
22 |
|
23 |
/**************************************************************/ |
24 |
|
25 |
#define SHOW_TIMING FALSE |
26 |
|
27 |
#include "Paso.h" |
28 |
#include "Preconditioner.h" |
29 |
#include "Options.h" |
30 |
#include "PasoUtil.h" |
31 |
#include "UMFPACK.h" |
32 |
#include "MKL.h" |
33 |
|
34 |
/**************************************************************/ |
35 |
|
36 |
/* free all memory used by AMG */ |
37 |
|
38 |
void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) { |
39 |
if (in!=NULL) { |
40 |
Paso_Preconditioner_Smoother_free(in->Smoother); |
41 |
Paso_SystemMatrix_free(in->P); |
42 |
Paso_SystemMatrix_free(in->R); |
43 |
Paso_SystemMatrix_free(in->A_C); |
44 |
Paso_Preconditioner_AMG_free(in->AMG_C); |
45 |
MEMFREE(in->r); |
46 |
MEMFREE(in->x_C); |
47 |
MEMFREE(in->b_C); |
48 |
|
49 |
MEMFREE(in); |
50 |
} |
51 |
} |
52 |
|
53 |
index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) { |
54 |
if (in->AMG_C == NULL) { |
55 |
return in->level; |
56 |
} else { |
57 |
return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C); |
58 |
} |
59 |
} |
60 |
double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) { |
61 |
if (in->AMG_C == NULL) { |
62 |
if (in->A_C == NULL) { |
63 |
return 1.; |
64 |
} else { |
65 |
return Paso_SystemMatrix_getSparsity(in->A_C); |
66 |
} |
67 |
} else { |
68 |
return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C); |
69 |
} |
70 |
} |
71 |
dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) { |
72 |
if (in->AMG_C == NULL) { |
73 |
if (in->A_C == NULL) { |
74 |
return 0; |
75 |
} else { |
76 |
return Paso_SystemMatrix_getTotalNumRows(in->A_C); |
77 |
} |
78 |
} else { |
79 |
return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C); |
80 |
} |
81 |
} |
82 |
/***************************************************************** |
83 |
|
84 |
constructs AMG |
85 |
|
86 |
******************************************************************/ |
87 |
Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) { |
88 |
|
89 |
Paso_Preconditioner_AMG* out=NULL; |
90 |
bool_t verbose=options->verbose; |
91 |
Paso_SystemMatrix *Atemp=NULL, *A_C=NULL; |
92 |
|
93 |
const dim_t my_n=Paso_SystemMatrix_getNumRows(A_p); |
94 |
const dim_t overlap_n=Paso_SystemMatrix_getColOverlap(A_p); |
95 |
const dim_t n = my_n + overlap_n; |
96 |
|
97 |
const dim_t n_block=A_p->row_block_size; |
98 |
index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F=NULL; |
99 |
dim_t n_F=0, n_C=0, i; |
100 |
double time0=0; |
101 |
const double theta = options->coarsening_threshold; |
102 |
const double tau = options->diagonal_dominance_threshold; |
103 |
const double sparsity=Paso_SystemMatrix_getSparsity(A_p); |
104 |
const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p); |
105 |
|
106 |
|
107 |
/* |
108 |
is the input matrix A suitable for coarsening |
109 |
|
110 |
*/ |
111 |
if ( (sparsity >= options->min_coarse_sparsity) || |
112 |
(total_n <= options->min_coarse_matrix_size) || |
113 |
(level > options->level_max) ) { |
114 |
|
115 |
if (verbose) { |
116 |
/* |
117 |
print stopping condition: |
118 |
- 'SPAR' = min_coarse_matrix_sparsity exceeded |
119 |
- 'SIZE' = min_coarse_matrix_size exceeded |
120 |
- 'LEVEL' = level_max exceeded |
121 |
*/ |
122 |
printf("Paso_Preconditioner: AMG: termination of coarsening by "); |
123 |
|
124 |
if (sparsity >= options->min_coarse_sparsity) |
125 |
printf("SPAR "); |
126 |
|
127 |
if (total_n <= options->min_coarse_matrix_size) |
128 |
printf("SIZE "); |
129 |
|
130 |
if (level > options->level_max) |
131 |
printf("LEVEL "); |
132 |
|
133 |
printf("\n"); |
134 |
|
135 |
printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n", |
136 |
level, options->level_max, sparsity, options->min_coarse_sparsity, total_n, options->min_coarse_matrix_size); |
137 |
|
138 |
} |
139 |
|
140 |
return NULL; |
141 |
} else { |
142 |
/* Start Coarsening : */ |
143 |
const dim_t len_S=A_p->mainBlock->pattern->len+A_p->col_coupleBlock->pattern->len; |
144 |
|
145 |
F_marker=TMPMEMALLOC(n,index_t); |
146 |
counter=TMPMEMALLOC(n,index_t); |
147 |
|
148 |
dim_t* degree_S=TMPMEMALLOC(my_n, dim_t); |
149 |
index_t *offset_S=TMPMEMALLOC(my_n, index_t); |
150 |
index_t *S=TMPMEMALLOC(len_S, index_t); |
151 |
if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S) ) ) { |
152 |
/* |
153 |
set splitting of unknows: |
154 |
|
155 |
*/ |
156 |
time0=Esys_timer(); |
157 |
if (n_block>1) { |
158 |
Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau); |
159 |
} else { |
160 |
Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau); |
161 |
} |
162 |
{ |
163 |
dim_t p; |
164 |
for (i=0; i< my_n; ++i) { |
165 |
printf("%d: ",i); |
166 |
for (p=0; p<degree_S[i];++p) printf("%d ",S[offset_S[i]+p]); |
167 |
printf("\n"); |
168 |
} |
169 |
} |
170 |
|
171 |
|
172 |
/*MPI: |
173 |
Paso_Preconditioner_AMG_RungeStuebenSearch(n, A_p->pattern->ptr, degree_S, S, F_marker, options->usePanel); |
174 |
*/ |
175 |
|
176 |
/* in BoomerAMG interpolation is used FF connectiovity is required :*/ |
177 |
/*MPI: |
178 |
if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING) |
179 |
Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker); |
180 |
*/ |
181 |
|
182 |
options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time); |
183 |
|
184 |
#ifdef AAAAA |
185 |
if (Esys_noError() ) { |
186 |
#pragma omp parallel for private(i) schedule(static) |
187 |
for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F); |
188 |
|
189 |
/* |
190 |
count number of unkowns to be eliminated: |
191 |
*/ |
192 |
n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker); |
193 |
n_C=n-n_F; |
194 |
if (verbose) printf("Paso_Preconditioner: AMG level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F); |
195 |
|
196 |
if ( n_F == 0 ) { /* is a nasty case. a direct solver should be used, return NULL */ |
197 |
out = NULL; |
198 |
} else { |
199 |
out=MEMALLOC(1,Paso_Preconditioner_AMG); |
200 |
if (! Esys_checkPtr(out)) { |
201 |
out->level = level; |
202 |
out->n = n; |
203 |
out->n_F = n_F; |
204 |
out->n_block = n_block; |
205 |
out->A_C = NULL; |
206 |
out->P = NULL; |
207 |
out->R = NULL; |
208 |
out->post_sweeps = options->post_sweeps; |
209 |
out->pre_sweeps = options->pre_sweeps; |
210 |
out->r = NULL; |
211 |
out->x_C = NULL; |
212 |
out->b_C = NULL; |
213 |
out->AMG_C = NULL; |
214 |
out->Smoother=NULL; |
215 |
} |
216 |
mask_C=TMPMEMALLOC(n,index_t); |
217 |
rows_in_F=TMPMEMALLOC(n_F,index_t); |
218 |
Esys_checkPtr(mask_C); |
219 |
Esys_checkPtr(rows_in_F); |
220 |
if ( Esys_noError() ) { |
221 |
|
222 |
out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), verbose); |
223 |
|
224 |
if (n_C != 0) { |
225 |
/* if nothing is been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */ |
226 |
|
227 |
/* allocate helpers :*/ |
228 |
out->x_C=MEMALLOC(n_block*n_C,double); |
229 |
out->b_C=MEMALLOC(n_block*n_C,double); |
230 |
out->r=MEMALLOC(n_block*n,double); |
231 |
|
232 |
Esys_checkPtr(out->r); |
233 |
Esys_checkPtr(out->x_C); |
234 |
Esys_checkPtr(out->b_C); |
235 |
|
236 |
if ( Esys_noError() ) { |
237 |
/* creates index for F:*/ |
238 |
#pragma omp parallel private(i) |
239 |
{ |
240 |
#pragma omp for schedule(static) |
241 |
for (i = 0; i < n; ++i) { |
242 |
if (F_marker[i]) rows_in_F[counter[i]]=i; |
243 |
} |
244 |
} |
245 |
/* create mask of C nodes with value >-1 gives new id */ |
246 |
i=Paso_Util_cumsum_maskedFalse(n,counter, F_marker); |
247 |
|
248 |
#pragma omp parallel for private(i) schedule(static) |
249 |
for (i = 0; i < n; ++i) { |
250 |
if (F_marker[i]) { |
251 |
mask_C[i]=-1; |
252 |
} else { |
253 |
mask_C[i]=counter[i];; |
254 |
} |
255 |
} |
256 |
/* |
257 |
get Prolongation : |
258 |
*/ |
259 |
time0=Esys_timer(); |
260 |
/*MPI: |
261 |
out->P=Paso_Preconditioner_AMG_getProlongation(A_p,A_p->pattern->ptr, degree_S,S,n_C,mask_C, options->interpolation_method); |
262 |
*/ |
263 |
if (SHOW_TIMING) printf("timing: level %d: getProlongation: %e\n",level, Esys_timer()-time0); |
264 |
} |
265 |
/* |
266 |
construct Restriction operator as transposed of Prolongation operator: |
267 |
*/ |
268 |
if ( Esys_noError()) { |
269 |
time0=Esys_timer(); |
270 |
/*MPI: |
271 |
out->R=Paso_SystemMatrix_getTranspose(out->P); |
272 |
*/ |
273 |
if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0); |
274 |
} |
275 |
/* |
276 |
construct coarse level matrix: |
277 |
*/ |
278 |
if ( Esys_noError()) { |
279 |
time0=Esys_timer(); |
280 |
/*MPI: |
281 |
Atemp=Paso_SystemMatrix_MatrixMatrix(A_p,out->P); |
282 |
A_C=Paso_SystemMatrix_MatrixMatrix(out->R,Atemp); |
283 |
|
284 |
Paso_SystemMatrix_free(Atemp); |
285 |
*/ |
286 |
|
287 |
if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0); |
288 |
} |
289 |
|
290 |
|
291 |
/* |
292 |
constructe courser level: |
293 |
|
294 |
*/ |
295 |
if ( Esys_noError()) { |
296 |
out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options); |
297 |
} |
298 |
if ( Esys_noError()) { |
299 |
if ( out->AMG_C == NULL ) { |
300 |
out->reordering = options->reordering; |
301 |
out->refinements = options->coarse_matrix_refinements; |
302 |
/* no coarse level matrix has been constructed. use direct solver */ |
303 |
#ifdef MKL |
304 |
out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_C); |
305 |
Paso_SystemMatrix_free(A_C); |
306 |
out->A_C->solver_package = PASO_MKL; |
307 |
if (verbose) printf("Paso_Preconditioner: AMG: use MKL direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block); |
308 |
#else |
309 |
#ifdef UMFPACK |
310 |
out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_C); |
311 |
Paso_SystemMatrix_free(A_C); |
312 |
out->A_C->solver_package = PASO_UMFPACK; |
313 |
if (verbose) printf("Paso_Preconditioner: AMG: use UMFPACK direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block); |
314 |
#else |
315 |
out->A_C=A_C; |
316 |
out->A_C->solver_p=Paso_Preconditioner_Smoother_alloc(out->A_C, (options->smoother == PASO_JACOBI), verbose); |
317 |
out->A_C->solver_package = PASO_SMOOTHER; |
318 |
if (verbose) printf("Paso_Preconditioner: AMG: use smoother on the coarsest level (number of unknowns = %d).\n",n_C*n_block); |
319 |
#endif |
320 |
#endif |
321 |
} else { |
322 |
/* finally we set some helpers for the solver step */ |
323 |
out->A_C=A_C; |
324 |
} |
325 |
} |
326 |
} |
327 |
} |
328 |
TMPMEMFREE(mask_C); |
329 |
TMPMEMFREE(rows_in_F); |
330 |
} |
331 |
} |
332 |
#endif |
333 |
|
334 |
} |
335 |
TMPMEMFREE(counter); |
336 |
TMPMEMFREE(F_marker); |
337 |
TMPMEMFREE(degree_S); |
338 |
TMPMEMFREE(offset_S); |
339 |
TMPMEMFREE(S); |
340 |
} |
341 |
|
342 |
if (Esys_noError()) { |
343 |
return out; |
344 |
} else { |
345 |
Paso_Preconditioner_AMG_free(out); |
346 |
return NULL; |
347 |
} |
348 |
} |
349 |
|
350 |
|
351 |
void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) { |
352 |
const dim_t n = amg->n * amg->n_block; |
353 |
double time0=0; |
354 |
const dim_t post_sweeps=amg->post_sweeps; |
355 |
const dim_t pre_sweeps=amg->pre_sweeps; |
356 |
|
357 |
/* presmoothing */ |
358 |
time0=Esys_timer(); |
359 |
Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE); |
360 |
time0=Esys_timer()-time0; |
361 |
if (SHOW_TIMING) printf("timing: level %d: Presmooting: %e\n",amg->level, time0); |
362 |
/* end of presmoothing */ |
363 |
|
364 |
if (amg->n_F < amg->n) { /* is there work on the coarse level? */ |
365 |
time0=Esys_timer(); |
366 |
Paso_Copy(n, amg->r, b); /* r <- b */ |
367 |
Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/ |
368 |
Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */ |
369 |
time0=Esys_timer()-time0; |
370 |
/* coarse level solve */ |
371 |
if ( amg->AMG_C == NULL) { |
372 |
time0=Esys_timer(); |
373 |
/* A_C is the coarsest level */ |
374 |
#ifdef FIXME |
375 |
switch (amg->A_C->solver_package) { |
376 |
case (PASO_MKL): |
377 |
Paso_MKL(amg->A_C, amg->x_C,amg->b_C, amg->reordering, amg->refinements, SHOW_TIMING); |
378 |
break; |
379 |
case (PASO_UMFPACK): |
380 |
Paso_UMFPACK(amg->A_C, amg->x_C,amg->b_C, amg->refinements, SHOW_TIMING); |
381 |
break; |
382 |
case (PASO_SMOOTHER): |
383 |
Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE); |
384 |
break; |
385 |
} |
386 |
#endif |
387 |
Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE); |
388 |
if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0); |
389 |
} else { |
390 |
Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */ |
391 |
} |
392 |
time0=time0+Esys_timer(); |
393 |
Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */ |
394 |
|
395 |
/*postsmoothing*/ |
396 |
|
397 |
/*solve Ax=b with initial guess x */ |
398 |
time0=Esys_timer(); |
399 |
Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE); |
400 |
time0=Esys_timer()-time0; |
401 |
if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0); |
402 |
/*end of postsmoothing*/ |
403 |
|
404 |
} |
405 |
return; |
406 |
} |
407 |
|
408 |
/* theta = threshold for strong connections */ |
409 |
/* tau = threshold for diagonal dominance */ |
410 |
|
411 |
/*S_i={j \in N_i; i strongly coupled to j} |
412 |
|
413 |
in the sense that |A_{ij}| >= theta * max_k |A_{ik}| |
414 |
*/ |
415 |
|
416 |
void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A, |
417 |
dim_t *degree_S, index_t *offset_S, index_t *S, |
418 |
const double theta, const double tau) |
419 |
{ |
420 |
const dim_t my_n=Paso_SystemMatrix_getNumRows(A); |
421 |
index_t iptr, i; |
422 |
|
423 |
|
424 |
#pragma omp parallel for private(i,iptr) schedule(static) |
425 |
for (i=0;i<my_n;++i) { |
426 |
register double max_offdiagonal = 0.; |
427 |
register double sum_row=0; |
428 |
register double main_row=0; |
429 |
#pragma ivdep |
430 |
for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) { |
431 |
register index_t j=A->mainBlock->pattern->index[iptr]; |
432 |
register double fnorm=ABS(A->mainBlock->val[iptr]); |
433 |
|
434 |
if( j != i) { |
435 |
max_offdiagonal = MAX(max_offdiagonal,fnorm); |
436 |
sum_row+=fnorm; |
437 |
} else { |
438 |
main_row=fnorm; |
439 |
} |
440 |
|
441 |
} |
442 |
#pragma ivdep |
443 |
for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) { |
444 |
register index_t j=A->col_coupleBlock->pattern->index[iptr]; |
445 |
register double fnorm=ABS(A->col_coupleBlock->val[iptr]); |
446 |
max_offdiagonal = MAX(max_offdiagonal,fnorm); |
447 |
sum_row+=fnorm; |
448 |
} |
449 |
|
450 |
|
451 |
const double threshold = theta*max_offdiagonal; |
452 |
register dim_t kdeg=0; |
453 |
register index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i]; |
454 |
if (tau*main_row < sum_row) { /* no diagonal domainance */ |
455 |
#pragma ivdep |
456 |
for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) { |
457 |
register index_t j=A->mainBlock->pattern->index[iptr]; |
458 |
if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) { |
459 |
S[koffset+kdeg] = j; |
460 |
kdeg++; |
461 |
} |
462 |
} |
463 |
#pragma ivdep |
464 |
for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) { |
465 |
register index_t j=A->col_coupleBlock->pattern->index[iptr]; |
466 |
if(ABS(A->col_coupleBlock->val[iptr])>threshold) { |
467 |
S[koffset+kdeg] = j + my_n; |
468 |
kdeg++; |
469 |
} |
470 |
} |
471 |
} |
472 |
offset_S[i]=koffset; |
473 |
degree_S[i]=kdeg; |
474 |
} |
475 |
} |
476 |
|
477 |
/* theta = threshold for strong connections */ |
478 |
/* tau = threshold for diagonal dominance */ |
479 |
/*S_i={j \in N_i; i strongly coupled to j} |
480 |
|
481 |
in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F |
482 |
*/ |
483 |
void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A, |
484 |
dim_t *degree_S, index_t *offset_S, index_t *S, |
485 |
const double theta, const double tau) |
486 |
|
487 |
{ |
488 |
const dim_t my_n=Paso_SystemMatrix_getNumRows(A); |
489 |
index_t iptr, i, bi; |
490 |
const dim_t n_block=A->row_block_size; |
491 |
|
492 |
|
493 |
#pragma omp parallel private(i,iptr, bi) |
494 |
{ |
495 |
dim_t max_deg=0; |
496 |
#pragma omp for schedule(static) |
497 |
for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i] |
498 |
+A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]); |
499 |
|
500 |
double *rtmp=TMPMEMALLOC(max_deg, double); |
501 |
|
502 |
#pragma omp for schedule(static) |
503 |
for (i=0;i<my_n;++i) { |
504 |
|
505 |
register double max_offdiagonal = 0.; |
506 |
register double sum_row=0; |
507 |
register double main_row=0; |
508 |
register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i]; |
509 |
|
510 |
for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) { |
511 |
register index_t j=A->mainBlock->pattern->index[iptr]; |
512 |
register double fnorm=0; |
513 |
#pragma ivdep |
514 |
for(bi=0;bi<n_block*n_block;++bi) { |
515 |
register double rtmp2 = A->mainBlock->val[iptr*n_block*n_block+bi]; |
516 |
fnorm+=rtmp2*rtmp2; |
517 |
} |
518 |
fnorm=sqrt(fnorm); |
519 |
|
520 |
rtmp[iptr+rtmp_offset]=fnorm; |
521 |
if( j != i) { |
522 |
max_offdiagonal = MAX(max_offdiagonal,fnorm); |
523 |
sum_row+=fnorm; |
524 |
} else { |
525 |
main_row=fnorm; |
526 |
} |
527 |
} |
528 |
rtmp_offset=A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]-A->col_coupleBlock->pattern->ptr[i]; |
529 |
for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) { |
530 |
register index_t j=A->col_coupleBlock->pattern->index[iptr]; |
531 |
register double fnorm=0; |
532 |
#pragma ivdep |
533 |
for(bi=0;bi<n_block*n_block;++bi) { |
534 |
register double rtmp2 = A->col_coupleBlock->val[iptr*n_block*n_block+bi]; |
535 |
fnorm+=rtmp2*rtmp2; |
536 |
} |
537 |
fnorm=sqrt(fnorm); |
538 |
|
539 |
rtmp[iptr+rtmp_offset]=fnorm; |
540 |
max_offdiagonal = MAX(max_offdiagonal,fnorm); |
541 |
sum_row+=fnorm; |
542 |
} |
543 |
|
544 |
const double threshold = theta*max_offdiagonal; |
545 |
register dim_t kdeg=0; |
546 |
register index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i]; |
547 |
|
548 |
if (tau*main_row < sum_row) { /* no diagonal domainance */ |
549 |
rtmp_offset=-A->mainBlock->pattern->ptr[i]; |
550 |
#pragma ivdep |
551 |
for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) { |
552 |
register index_t j=A->mainBlock->pattern->index[iptr]; |
553 |
if(rtmp[iptr+rtmp_offset] > threshold && i!=j) { |
554 |
S[koffset+kdeg] = j; |
555 |
kdeg++; |
556 |
} |
557 |
} |
558 |
rtmp_offset=A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]-A->col_coupleBlock->pattern->ptr[i]; |
559 |
#pragma ivdep |
560 |
for (iptr=A->col_coupleBlock->pattern->ptr[i]; iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) { |
561 |
register index_t j=A->col_coupleBlock->pattern->index[iptr]; |
562 |
if(rtmp[iptr+rtmp_offset] > threshold) { |
563 |
S[koffset+kdeg] = j + my_n; |
564 |
kdeg++; |
565 |
} |
566 |
} |
567 |
|
568 |
} |
569 |
degree_S[i]=kdeg; |
570 |
offset_S[i]=koffset; |
571 |
} |
572 |
TMPMEMFREE(rtmp); |
573 |
} /* end of parallel region */ |
574 |
|
575 |
} |
576 |
|
577 |
#ifdef AAAAA |
578 |
/* the runge stueben coarsening algorithm: */ |
579 |
void Paso_Preconditioner_AMG_RungeStuebenSearch(const dim_t n, const index_t* offset_S, |
580 |
const dim_t* degree_S, const index_t* S, |
581 |
index_t*split_marker, const bool_t usePanel) |
582 |
{ |
583 |
|
584 |
index_t *lambda=NULL, *ST=NULL, *notInPanel=NULL, *panel=NULL, lambda_max, lambda_k; |
585 |
dim_t i,k, p, q, *degree_ST=NULL, len_panel, len_panel_new; |
586 |
register index_t j, itmp; |
587 |
|
588 |
if (n<=0) return; /* make sure that the return of Paso_Util_arg_max is not pointing to nirvana */ |
589 |
|
590 |
lambda=TMPMEMALLOC(n, index_t); Esys_checkPtr(lambda); |
591 |
degree_ST=TMPMEMALLOC(n, dim_t); Esys_checkPtr(degree_ST); |
592 |
ST=TMPMEMALLOC(offset_S[n], index_t); Esys_checkPtr(ST); |
593 |
if (usePanel) { |
594 |
notInPanel=TMPMEMALLOC(n, bool_t); Esys_checkPtr(notInPanel); |
595 |
panel=TMPMEMALLOC(n, index_t); Esys_checkPtr(panel); |
596 |
} |
597 |
|
598 |
|
599 |
|
600 |
if (Esys_noError() ) { |
601 |
/* initialize split_marker and split_marker :*/ |
602 |
/* those unknows which are not influenced go into F, the rest is available for F or C */ |
603 |
#pragma omp parallel for private(i) schedule(static) |
604 |
for (i=0;i<n;++i) { |
605 |
degree_ST[i]=0; |
606 |
if (degree_S[i]>0) { |
607 |
lambda[i]=0; |
608 |
split_marker[i]=PASO_AMG_UNDECIDED; |
609 |
} else { |
610 |
split_marker[i]=PASO_AMG_IN_F; |
611 |
lambda[i]=-1; |
612 |
} |
613 |
} |
614 |
/* create transpose :*/ |
615 |
for (i=0;i<n;++i) { |
616 |
for (p=0; p<degree_S[i]; ++p) { |
617 |
j=S[offset_S[i]+p]; |
618 |
ST[offset_S[j]+degree_ST[j]]=i; |
619 |
degree_ST[j]++; |
620 |
} |
621 |
} |
622 |
/* lambda[i] = |undecided k in ST[i]| + 2 * |F-unknown in ST[i]| */ |
623 |
#pragma omp parallel for private(i, j, itmp) schedule(static) |
624 |
for (i=0;i<n;++i) { |
625 |
if (split_marker[i]==PASO_AMG_UNDECIDED) { |
626 |
itmp=lambda[i]; |
627 |
for (p=0; p<degree_ST[i]; ++p) { |
628 |
j=ST[offset_S[i]+p]; |
629 |
if (split_marker[j]==PASO_AMG_UNDECIDED) { |
630 |
itmp++; |
631 |
} else { /* at this point there are no C points */ |
632 |
itmp+=2; |
633 |
} |
634 |
} |
635 |
lambda[i]=itmp; |
636 |
} |
637 |
} |
638 |
if (usePanel) { |
639 |
#pragma omp parallel for private(i) schedule(static) |
640 |
for (i=0;i<n;++i) notInPanel[i]=TRUE; |
641 |
} |
642 |
/* start search :*/ |
643 |
i=Paso_Util_arg_max(n,lambda); |
644 |
while (lambda[i]>-1) { /* is there any undecided unknown? */ |
645 |
|
646 |
if (usePanel) { |
647 |
len_panel=0; |
648 |
do { |
649 |
/* the unknown i is moved to C */ |
650 |
split_marker[i]=PASO_AMG_IN_C; |
651 |
lambda[i]=-1; /* lambda from unavailable unknowns is set to -1 */ |
652 |
|
653 |
/* all undecided unknown strongly coupled to i are moved to F */ |
654 |
for (p=0; p<degree_ST[i]; ++p) { |
655 |
j=ST[offset_S[i]+p]; |
656 |
|
657 |
if (split_marker[j]==PASO_AMG_UNDECIDED) { |
658 |
|
659 |
split_marker[j]=PASO_AMG_IN_F; |
660 |
lambda[j]=-1; |
661 |
|
662 |
for (q=0; q<degree_ST[j]; ++q) { |
663 |
k=ST[offset_S[j]+q]; |
664 |
if (split_marker[k]==PASO_AMG_UNDECIDED) { |
665 |
lambda[k]++; |
666 |
if (notInPanel[k]) { |
667 |
notInPanel[k]=FALSE; |
668 |
panel[len_panel]=k; |
669 |
len_panel++; |
670 |
} |
671 |
|
672 |
} /* the unknown i is moved to C */ |
673 |
split_marker[i]=PASO_AMG_IN_C; |
674 |
lambda[i]=-1; /* lambda from unavailable unknowns is set to -1 */ |
675 |
} |
676 |
|
677 |
} |
678 |
} |
679 |
for (p=0; p<degree_S[i]; ++p) { |
680 |
j=S[offset_S[i]+p]; |
681 |
if (split_marker[j]==PASO_AMG_UNDECIDED) { |
682 |
lambda[j]--; |
683 |
if (notInPanel[j]) { |
684 |
notInPanel[j]=FALSE; |
685 |
panel[len_panel]=j; |
686 |
len_panel++; |
687 |
} |
688 |
} |
689 |
} |
690 |
|
691 |
/* consolidate panel */ |
692 |
/* remove lambda[q]=-1 */ |
693 |
lambda_max=-1; |
694 |
i=-1; |
695 |
len_panel_new=0; |
696 |
for (q=0; q<len_panel; q++) { |
697 |
k=panel[q]; |
698 |
lambda_k=lambda[k]; |
699 |
if (split_marker[k]==PASO_AMG_UNDECIDED) { |
700 |
panel[len_panel_new]=k; |
701 |
len_panel_new++; |
702 |
|
703 |
if (lambda_max == lambda_k) { |
704 |
if (k<i) i=k; |
705 |
} else if (lambda_max < lambda_k) { |
706 |
lambda_max =lambda_k; |
707 |
i=k; |
708 |
} |
709 |
} |
710 |
} |
711 |
len_panel=len_panel_new; |
712 |
} while (len_panel>0); |
713 |
} else { |
714 |
/* the unknown i is moved to C */ |
715 |
split_marker[i]=PASO_AMG_IN_C; |
716 |
lambda[i]=-1; /* lambda from unavailable unknowns is set to -1 */ |
717 |
|
718 |
/* all undecided unknown strongly coupled to i are moved to F */ |
719 |
for (p=0; p<degree_ST[i]; ++p) { |
720 |
j=ST[offset_S[i]+p]; |
721 |
if (split_marker[j]==PASO_AMG_UNDECIDED) { |
722 |
|
723 |
split_marker[j]=PASO_AMG_IN_F; |
724 |
lambda[j]=-1; |
725 |
|
726 |
for (q=0; q<degree_ST[j]; ++q) { |
727 |
k=ST[offset_S[j]+q]; |
728 |
if (split_marker[k]==PASO_AMG_UNDECIDED) lambda[k]++; |
729 |
} |
730 |
|
731 |
} |
732 |
} |
733 |
for (p=0; p<degree_S[i]; ++p) { |
734 |
j=S[offset_S[i]+p]; |
735 |
if(split_marker[j]==PASO_AMG_UNDECIDED) lambda[j]--; |
736 |
} |
737 |
|
738 |
} |
739 |
i=Paso_Util_arg_max(n,lambda); |
740 |
} |
741 |
|
742 |
} |
743 |
TMPMEMFREE(lambda); |
744 |
TMPMEMFREE(ST); |
745 |
TMPMEMFREE(degree_ST); |
746 |
TMPMEMFREE(panel); |
747 |
TMPMEMFREE(notInPanel); |
748 |
} |
749 |
/* ensures that two F nodes are connected via a C node :*/ |
750 |
void Paso_Preconditioner_AMG_enforceFFConnectivity(const dim_t n, const index_t* offset_S, |
751 |
const dim_t* degree_S, const index_t* S, |
752 |
index_t*split_marker) |
753 |
{ |
754 |
dim_t i, p, q; |
755 |
|
756 |
/* now we make sure that two (strongly) connected F nodes are (strongly) connected via a C node. */ |
757 |
for (i=0;i<n;++i) { |
758 |
if ( (split_marker[i]==PASO_AMG_IN_F) && (degree_S[i]>0) ) { |
759 |
for (p=0; p<degree_S[i]; ++p) { |
760 |
register index_t j=S[offset_S[i]+p]; |
761 |
if ( (split_marker[j]==PASO_AMG_IN_F) && (degree_S[j]>0) ) { |
762 |
/* i and j are now two F nodes which are strongly connected */ |
763 |
/* is there a C node they share ? */ |
764 |
register index_t sharing=-1; |
765 |
for (q=0; q<degree_S[i]; ++q) { |
766 |
index_t k=S[offset_S[i]+q]; |
767 |
if (split_marker[k]==PASO_AMG_IN_C) { |
768 |
register index_t* where_k=(index_t*)bsearch(&k, &(S[offset_S[j]]), degree_S[j], sizeof(index_t), Paso_comparIndex); |
769 |
if (where_k != NULL) { |
770 |
sharing=k; |
771 |
break; |
772 |
} |
773 |
} |
774 |
} |
775 |
if (sharing<0) { |
776 |
if (i<j) { |
777 |
split_marker[j]=PASO_AMG_IN_C; |
778 |
} else { |
779 |
split_marker[i]=PASO_AMG_IN_C; |
780 |
break; /* no point to look any further as i is now a C node */ |
781 |
} |
782 |
} |
783 |
} |
784 |
} |
785 |
} |
786 |
} |
787 |
} |
788 |
#endif |
789 |
|
790 |
#ifdef DFG |
791 |
void Paso_Preconditioner_AMG_CIJPCoarsening( ) |
792 |
{ |
793 |
|
794 |
|
795 |
const dim_t my_n; |
796 |
const dim_t overlap_n; |
797 |
const dim_t n= my_n + overlap_n; |
798 |
|
799 |
|
800 |
/* initialize split_marker and split_marker :*/ |
801 |
/* those unknows which are not influenced go into F, the rest is available for F or C */ |
802 |
#pragma omp parallel for private(i) schedule(static) |
803 |
for (i=0;i<n;++i) { |
804 |
degree_ST[i]=0; |
805 |
if (degree_S[i]>0) { |
806 |
lambda[i]=0; |
807 |
split_marker[i]=PASO_AMG_UNDECIDED; |
808 |
} else { |
809 |
split_marker[i]=PASO_AMG_IN_F; |
810 |
lambda[i]=-1; |
811 |
} |
812 |
} |
813 |
|
814 |
/* set local lambda + overlap */ |
815 |
#pragma omp parallel for private(i) |
816 |
for (i=0; i<n ++i) { |
817 |
w[i]=degree_ST[i]; |
818 |
} |
819 |
for (i=0; i<my_n; i++) { |
820 |
w2[i]=random; |
821 |
} |
822 |
|
823 |
|
824 |
/* add noise to w */ |
825 |
Paso_Coupler_add(n, w, 1., w2, col_coupler); |
826 |
|
827 |
/* */ |
828 |
global_n_C=0; |
829 |
global_n_F=..; |
830 |
|
831 |
while (global_n_C + global_n_F < global_n) { |
832 |
|
833 |
|
834 |
is_in_D[i]=FALSE; |
835 |
/* test local connectivit*/ |
836 |
/* w2[i] = max(w[k] | k in S_i or k in S^T_i */ |
837 |
#pragma omp parallel for private(i) |
838 |
for (i=0; i<n; ++i) w2[i]=0; |
839 |
|
840 |
for (i=0; i<my_n; ++i) { |
841 |
for( iPtr =0 ; iPtr < degree_S[i]; ++iPtr) { |
842 |
k=S[offset_S[i]+iPtr]; |
843 |
w2[i]=MAX(w2[i],w[k]); |
844 |
w2[k]=MAX(w2[k],w[i]); |
845 |
} |
846 |
} |
847 |
/* adjust overlaps by MAX */ |
848 |
Paso_Coupler_max(n, w2, col_coupler); |
849 |
|
850 |
/* points with w[i]>w2[i] become C nodes */ |
851 |
} |
852 |
|
853 |
} |
854 |
#endif |