/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Contents of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3827 - (show annotations)
Tue Feb 14 11:42:08 2012 UTC (7 years, 7 months ago) by lgao
File MIME type: text/plain
File size: 37238 byte(s)
 Merge branch "amg_from_3530" back into trunk.


1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: AMG preconditioner (local version) */
18
19 /**************************************************************/
20
21 /* Author: artak@uq.edu.au, l.gross@uq.edu.au */
22
23 /**************************************************************/
24
25 #define SHOW_TIMING FALSE
26 #define MY_DEBUG 0
27 #define MY_DEBUG1 1
28
29 #include "Paso.h"
30 #include "Preconditioner.h"
31 #include "Options.h"
32 #include "PasoUtil.h"
33 #include "UMFPACK.h"
34 #include "MKL.h"
35 #include<stdio.h>
36
37
38 /**************************************************************/
39
40 /* free all memory used by AMG */
41
42 void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43 if (in!=NULL) {
44 Paso_Preconditioner_Smoother_free(in->Smoother);
45 Paso_SystemMatrix_free(in->P);
46 Paso_SystemMatrix_free(in->R);
47 Paso_SystemMatrix_free(in->A_C);
48 Paso_Preconditioner_AMG_free(in->AMG_C);
49 MEMFREE(in->r);
50 MEMFREE(in->x_C);
51 MEMFREE(in->b_C);
52
53 MEMFREE(in);
54 }
55 }
56
57 index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
58 if (in->AMG_C == NULL) {
59 return in->level;
60 } else {
61 return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
62 }
63 }
64 double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
65 if (in->AMG_C == NULL) {
66 if (in->A_C == NULL) {
67 return 1.;
68 } else {
69 return Paso_SystemMatrix_getSparsity(in->A_C);
70 }
71 } else {
72 return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
73 }
74 }
75 dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
76 if (in->AMG_C == NULL) {
77 if (in->A_C == NULL) {
78 return 0;
79 } else {
80 return Paso_SystemMatrix_getTotalNumRows(in->A_C);
81 }
82 } else {
83 return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
84 }
85 }
86 /*****************************************************************
87
88 constructs AMG
89
90 ******************************************************************/
91 Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92
93 Paso_Preconditioner_AMG* out=NULL;
94 Paso_SystemMatrix *A_C;
95 bool_t verbose=options->verbose;
96
97 const dim_t my_n=A_p->mainBlock->numRows;
98 const dim_t overlap_n=A_p->row_coupleBlock->numRows;
99
100 const dim_t n = my_n + overlap_n;
101
102 const dim_t n_block=A_p->row_block_size;
103 // const dim_t n_block=A_p->block_size;
104 index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
105 dim_t i, n_F, n_C, F_flag, *F_set=NULL;
106 double time0=0;
107 const double theta = options->coarsening_threshold;
108 const double tau = options->diagonal_dominance_threshold;
109 const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
110 const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);
111 const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
112
113
114 /*
115 is the input matrix A suitable for coarsening?
116
117 */
118 if ( (sparsity >= options->min_coarse_sparsity) ||
119 // (total_n <= options->min_coarse_matrix_size) ||
120 (global_n <= options->min_coarse_matrix_size) ||
121 (level > options->level_max) ) {
122
123 if (verbose) {
124 /*
125 print stopping condition:
126 - 'SPAR' = min_coarse_matrix_sparsity exceeded
127 - 'SIZE' = min_coarse_matrix_size exceeded
128 - 'LEVEL' = level_max exceeded
129 */
130 printf("Paso_Preconditioner: AMG: termination of coarsening by ");
131
132 if (sparsity >= options->min_coarse_sparsity)
133 printf("SPAR");
134
135 if (total_n <= options->min_coarse_matrix_size)
136 printf("SIZE");
137
138 if (level > options->level_max)
139 printf("LEVEL");
140
141 printf("\n");
142
143 printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
144 level, options->level_max, sparsity, options->min_coarse_sparsity, total_n, options->min_coarse_matrix_size);
145
146 }
147
148 return NULL;
149 } else {
150 /* Start Coarsening : */
151
152 /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
153 const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
154
155 dim_t* degree_S=TMPMEMALLOC(n, dim_t);
156 index_t *offset_S=TMPMEMALLOC(n, index_t);
157 index_t *S=TMPMEMALLOC(len_S, index_t);
158 dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
159 index_t *offset_ST=TMPMEMALLOC(n, index_t);
160 index_t *ST=TMPMEMALLOC(len_S, index_t);
161
162
163 F_marker=TMPMEMALLOC(n,index_t);
164 counter=TMPMEMALLOC(n,index_t);
165
166 if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
167 || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
168 /*
169 make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
170 */
171 Paso_SystemMatrix_copyColCoupleBlock(A_p);
172 Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
173
174 /*
175 set splitting of unknows:
176
177 */
178 time0=Esys_timer();
179 if (n_block>1) {
180 Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
181 } else {
182 Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
183 }
184 Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
185 // Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
186
187 Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
188 degree_S, offset_S, S, degree_ST, offset_ST, ST,
189 A_p->col_coupler->connector,A_p->col_distribution);
190
191
192 /* in BoomerAMG if interpolation is used FF connectivity is required */
193 /*MPI:
194 if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
195 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);
196 */
197
198 options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
199 if (Esys_noError() ) {
200 #pragma omp parallel for private(i) schedule(static)
201 for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F);
202
203 /*
204 count number of unkowns to be eliminated:
205 */
206 n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
207 n_C=n-n_F;
208 if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);
209
210 /* collect n_F values on all processes, a direct solver should
211 be used if any n_F value is 0 */
212 F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
213 MPI_Allgather(&n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
214 F_flag = 1;
215 for (i=0; i<A_p->mpi_info->size; i++) {
216 if (F_set[i] == 0) {
217 F_flag = 0;
218 break;
219 }
220 }
221 TMPMEMFREE(F_set);
222
223 // if ( n_F == 0 ) { /* is a nasty case. a direct solver should be used, return NULL */
224 if (F_flag == 0) {
225 out = NULL;
226 } else {
227 out=MEMALLOC(1,Paso_Preconditioner_AMG);
228 if (! Esys_checkPtr(out)) {
229 out->level = level;
230 out->n = n;
231 out->n_F = n_F;
232 out->n_block = n_block;
233 out->A_C = NULL;
234 out->P = NULL;
235 out->R = NULL;
236 // out->post_sweeps = options->post_sweeps;
237 // out->pre_sweeps = options->pre_sweeps;
238 out->post_sweeps = 2;
239 out->pre_sweeps = 2;
240 out->r = NULL;
241 out->x_C = NULL;
242 out->b_C = NULL;
243 out->AMG_C = NULL;
244 out->Smoother=NULL;
245 }
246 mask_C=TMPMEMALLOC(n,index_t);
247 rows_in_F=TMPMEMALLOC(n_F,index_t);
248 Esys_checkPtr(mask_C);
249 Esys_checkPtr(rows_in_F);
250 if ( Esys_noError() ) {
251
252 out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
253
254 if (n_C != 0) {
255 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
256
257 /* allocate helpers :*/
258 out->x_C=MEMALLOC(n_block*n_C,double);
259 out->b_C=MEMALLOC(n_block*n_C,double);
260 out->r=MEMALLOC(n_block*n,double);
261
262 Esys_checkPtr(out->r);
263 Esys_checkPtr(out->x_C);
264 Esys_checkPtr(out->b_C);
265
266 if ( Esys_noError() ) {
267 /* creates index for F:*/
268 #pragma omp parallel private(i)
269 {
270 #pragma omp for schedule(static)
271 for (i = 0; i < n; ++i) {
272 if (F_marker[i]) rows_in_F[counter[i]]=i;
273 }
274 }
275 /* create mask of C nodes with value >-1, gives new id */
276 i=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
277 /*
278 get Prolongation :
279 */
280
281 time0=Esys_timer();
282
283 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
284
285 }
286
287 /*
288 construct Restriction operator as transposed of Prolongation operator:
289 */
290
291 if ( Esys_noError()) {
292 time0=Esys_timer();
293
294 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
295
296 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
297 }
298 /*
299 construct coarse level matrix:
300 */
301 if ( Esys_noError()) {
302 time0=Esys_timer();
303
304 A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
305
306 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
307 }
308
309 /*
310 constructe courser level:
311
312 */
313 if ( Esys_noError()) {
314 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
315 }
316
317 if ( Esys_noError()) {
318 if ( out->AMG_C == NULL ) {
319 /* merge the system matrix into 1 rank when
320 it's not suitable coarsening due to the
321 total number of unknowns are too small */
322 out->A_C=A_C;
323 out->reordering = options->reordering;
324 out->refinements = options->coarse_matrix_refinements;
325 out->verbose = verbose;
326 out->options_smoother = options->smoother;
327 } else {
328 /* finally we set some helpers for the solver step */
329 out->A_C=A_C;
330 }
331 }
332 }
333 }
334 TMPMEMFREE(mask_C);
335 TMPMEMFREE(rows_in_F);
336 }
337 }
338
339 }
340 TMPMEMFREE(counter);
341 TMPMEMFREE(F_marker);
342 TMPMEMFREE(degree_S);
343 TMPMEMFREE(offset_S);
344 TMPMEMFREE(S);
345 TMPMEMFREE(degree_ST);
346 TMPMEMFREE(offset_ST);
347 TMPMEMFREE(ST);
348
349 }
350
351 if (Esys_noError()) {
352 return out;
353 } else {
354 Paso_Preconditioner_AMG_free(out);
355 return NULL;
356 }
357 }
358
359
360 void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
361 const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
362 double time0=0;
363 const dim_t post_sweeps=amg->post_sweeps;
364 const dim_t pre_sweeps=amg->pre_sweeps;
365 int rank = A->mpi_info->rank;
366
367 /* presmoothing */
368 time0=Esys_timer();
369 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
370
371 time0=Esys_timer()-time0;
372 if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
373 /* end of presmoothing */
374
375 if (amg->n_F < amg->n) { /* is there work on the coarse level? */
376 time0=Esys_timer();
377
378 Paso_Copy(n, amg->r, b); /* r <- b */
379 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
380 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */
381
382 time0=Esys_timer()-time0;
383 /* coarse level solve */
384 if ( amg->AMG_C == NULL) {
385 time0=Esys_timer();
386 /* A_C is the coarsest level */
387 Paso_Preconditioner_AMG_mergeSolve(amg);
388
389 if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
390 } else {
391 Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */
392 }
393
394 time0=time0+Esys_timer();
395 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */
396
397 /*postsmoothing*/
398
399 /*solve Ax=b with initial guess x */
400 time0=Esys_timer();
401 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
402 time0=Esys_timer()-time0;
403 if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
404 /*end of postsmoothing*/
405 }
406
407 return;
408 }
409
410 /* theta = threshold for strong connections */
411 /* tau = threshold for diagonal dominance */
412
413 /*S_i={j \in N_i; i strongly coupled to j}
414
415 in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
416 */
417
418 void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
419 dim_t *degree_S, index_t *offset_S, index_t *S,
420 const double theta, const double tau)
421 {
422
423 const dim_t my_n=A->mainBlock->numRows;
424 const dim_t overlap_n=A->row_coupleBlock->numRows;
425
426 index_t iptr, i;
427 double *threshold_p=NULL;
428 index_t rank=A->mpi_info->rank;
429
430 threshold_p = TMPMEMALLOC(2*my_n, double);
431
432 #pragma omp parallel for private(i,iptr) schedule(static)
433 for (i=0;i<my_n;++i) {
434
435 register double max_offdiagonal = 0.;
436 register double sum_row=0;
437 register double main_row=0;
438 register dim_t kdeg=0;
439 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
440
441
442 /* collect information for row i: */
443 #pragma ivdep
444 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
445 register index_t j=A->mainBlock->pattern->index[iptr];
446 register double fnorm=ABS(A->mainBlock->val[iptr]);
447 if( j != i) {
448 max_offdiagonal = MAX(max_offdiagonal,fnorm);
449 sum_row+=fnorm;
450 } else {
451 main_row=fnorm;
452 }
453
454 }
455
456 #pragma ivdep
457 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
458 register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
459
460 max_offdiagonal = MAX(max_offdiagonal,fnorm);
461 sum_row+=fnorm;
462 }
463
464 /* inspect row i: */
465 {
466 const double threshold = theta*max_offdiagonal;
467 threshold_p[2*i+1]=threshold;
468 if (tau*main_row < sum_row) { /* no diagonal dominance */
469 threshold_p[2*i]=1;
470 #pragma ivdep
471 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
472 register index_t j=A->mainBlock->pattern->index[iptr];
473 if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
474 S[koffset+kdeg] = j;
475 kdeg++;
476 }
477 }
478 // #pragma ivdep
479 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
480 register index_t j=A->col_coupleBlock->pattern->index[iptr];
481 if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
482 S[koffset+kdeg] = j + my_n;
483 kdeg++;
484 }
485 }
486 } else {
487 threshold_p[2*i]=-1;
488 }
489 }
490 offset_S[i]=koffset;
491 degree_S[i]=kdeg;
492 }
493
494 /* now we need to distribute the threshold and the diagonal dominance indicator */
495 if (A->mpi_info->size > 1) {
496
497 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
498 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
499
500 double *remote_threshold=NULL;
501
502 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
503 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
504 Paso_Coupler_finishCollect(threshold_coupler);
505 remote_threshold=threshold_coupler->recv_buffer;
506
507 #pragma omp parallel for private(i,iptr) schedule(static)
508 for (i=0; i<overlap_n; i++) {
509 const double threshold = remote_threshold[2*i+1];
510 register dim_t kdeg=0;
511 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
512 if (remote_threshold[2*i]>0) {
513 #pragma ivdep
514 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
515 register index_t j=A->row_coupleBlock->pattern->index[iptr];
516 if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
517 S[koffset+kdeg] = j ;
518 kdeg++;
519 }
520 }
521
522 #pragma ivdep
523 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
524 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
525 if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
526 S[koffset+kdeg] = j + my_n;
527 kdeg++;
528 }
529 }
530 }
531 offset_S[i+my_n]=koffset;
532 degree_S[i+my_n]=kdeg;
533 }
534
535 Paso_Coupler_free(threshold_coupler);
536 }
537 TMPMEMFREE(threshold_p);
538 }
539
540 /* theta = threshold for strong connections */
541 /* tau = threshold for diagonal dominance */
542 /*S_i={j \in N_i; i strongly coupled to j}
543
544 in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
545 */
546 void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
547 dim_t *degree_S, index_t *offset_S, index_t *S,
548 const double theta, const double tau)
549
550 {
551 const dim_t block_size=A->block_size;
552 const dim_t my_n=A->mainBlock->numRows;
553 const dim_t overlap_n=A->row_coupleBlock->numRows;
554
555 index_t iptr, i, bi;
556 double *threshold_p=NULL;
557
558
559 threshold_p = TMPMEMALLOC(2*my_n, double);
560
561 #pragma omp parallel private(i,iptr,bi)
562 {
563
564 dim_t max_deg=0;
565 double *rtmp=NULL;
566
567 #pragma omp for schedule(static)
568 for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
569 +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
570
571 rtmp=TMPMEMALLOC(max_deg, double);
572
573 #pragma omp for schedule(static)
574 for (i=0;i<my_n;++i) {
575 register double max_offdiagonal = 0.;
576 register double sum_row=0;
577 register double main_row=0;
578 register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
579 register dim_t kdeg=0;
580 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
581
582 /* collect information for row i: */
583 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
584 register index_t j=A->mainBlock->pattern->index[iptr];
585 register double fnorm=0;
586 #pragma ivdep
587 for(bi=0;bi<block_size;++bi) {
588 register double rtmp2= A->mainBlock->val[iptr*block_size+bi];
589 fnorm+=rtmp2*rtmp2;
590 }
591 fnorm=sqrt(fnorm);
592 rtmp[iptr+rtmp_offset]=fnorm;
593
594 if( j != i) {
595 max_offdiagonal = MAX(max_offdiagonal,fnorm);
596 sum_row+=fnorm;
597 } else {
598 main_row=fnorm;
599 }
600
601 }
602
603 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
604 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
605 register double fnorm=0;
606 #pragma ivdep
607 for(bi=0;bi<block_size;++bi) {
608 register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
609 fnorm+=rtmp2*rtmp2;
610 }
611 fnorm=sqrt(fnorm);
612
613 rtmp[iptr+rtmp_offset]=fnorm;
614 max_offdiagonal = MAX(max_offdiagonal,fnorm);
615 sum_row+=fnorm;
616 }
617
618
619 /* inspect row i: */
620 {
621 const double threshold = theta*max_offdiagonal;
622 rtmp_offset=-A->mainBlock->pattern->ptr[i];
623
624 threshold_p[2*i+1]=threshold;
625 if (tau*main_row < sum_row) { /* no diagonal dominance */
626 threshold_p[2*i]=1;
627 #pragma ivdep
628 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
629 register index_t j=A->mainBlock->pattern->index[iptr];
630 if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
631 S[koffset+kdeg] = j;
632 kdeg++;
633 }
634 }
635 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
636 #pragma ivdep
637 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
638 register index_t j=A->col_coupleBlock->pattern->index[iptr];
639 if( rtmp[iptr+rtmp_offset] >threshold) {
640 S[koffset+kdeg] = j + my_n;
641 kdeg++;
642 }
643 }
644 } else {
645 threshold_p[2*i]=-1;
646 }
647 }
648 offset_S[i]=koffset;
649 degree_S[i]=kdeg;
650 }
651 TMPMEMFREE(rtmp);
652 }
653 /* now we need to distribute the threshold and the diagonal dominance indicator */
654 if (A->mpi_info->size > 1) {
655
656 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
657 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
658
659 double *remote_threshold=NULL;
660
661 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
662 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
663 Paso_Coupler_finishCollect(threshold_coupler);
664 remote_threshold=threshold_coupler->recv_buffer;
665
666 #pragma omp parallel for private(i,iptr) schedule(static)
667 for (i=0; i<overlap_n; i++) {
668
669 const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
670 register dim_t kdeg=0;
671 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
672 if (remote_threshold[2*i]>0) {
673 #pragma ivdep
674 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
675 register index_t j=A->row_coupleBlock->pattern->index[iptr];
676 register double fnorm2=0;
677 #pragma ivdepremote_threshold[2*i]
678 for(bi=0;bi<block_size;++bi) {
679 register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
680 fnorm2+=rtmp2*rtmp2;
681 }
682
683 if(fnorm2 > threshold2 ) {
684 S[koffset+kdeg] = j ;
685 kdeg++;
686 }
687 }
688
689 #pragma ivdep
690 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
691 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
692 register double fnorm2=0;
693 #pragma ivdepremote_threshold[2*i]
694 for(bi=0;bi<block_size;++bi) {
695 register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
696 fnorm2+=rtmp2*rtmp2;
697 }
698 if(fnorm2 > threshold2 && i != j) {
699 S[koffset+kdeg] = j + my_n;
700 kdeg++;
701 }
702 }
703
704 }
705 offset_S[i+my_n]=koffset;
706 degree_S[i+my_n]=kdeg;
707 }
708 Paso_Coupler_free(threshold_coupler);
709 }
710 TMPMEMFREE(threshold_p);
711 }
712
713 void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
714 const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
715 {
716 index_t i, j;
717 dim_t p;
718 dim_t len=0;
719 #pragma omp parallel for private(i) schedule(static)
720 for (i=0; i<nT ;++i) {
721 degree_ST[i]=0;
722 }
723 for (i=0; i<n ;++i) {
724 for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
725 }
726 for (i=0; i<nT ;++i) {
727 offset_ST[i]=len;
728 len+=degree_ST[i];
729 degree_ST[i]=0;
730 }
731 for (i=0; i<n ;++i) {
732 for (p=0; p<degree_S[i]; ++p) {
733 j=S[offset_S[i]+p];
734 ST[offset_ST[j]+degree_ST[j]]=i;
735 degree_ST[j]++;
736 }
737 }
738 }
739
740 int compareindex(const void *a, const void *b)
741 {
742 return (*(int *)a - *(int *)b);
743 }
744
745 void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
746 const dim_t* degree_S, const index_t* offset_S, const index_t* S,
747 const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
748 Paso_Connector* col_connector, Paso_Distribution* col_dist)
749 {
750 dim_t i, numUndefined, iter=0;
751 index_t iptr, jptr, kptr;
752 double *random=NULL, *w=NULL, *Status=NULL;
753 index_t * ST_flag=NULL;
754 index_t rank=col_connector->mpi_info->rank;
755
756 Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector ,1);
757
758 w=TMPMEMALLOC(n, double);
759 Status=TMPMEMALLOC(n, double);
760 random = Paso_Distribution_createRandomVector(col_dist,1);
761 ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
762
763 #pragma omp parallel for private(i)
764 for (i=0; i< my_n; ++i) {
765 w[i]=degree_ST[i]+random[i];
766 if (degree_ST[i] < 1) {
767 Status[i]=-100; /* F point */
768 } else {
769 Status[i]=1; /* status undefined */
770 }
771 }
772
773 #pragma omp parallel for private(i, iptr)
774 for (i=0; i< n; ++i) {
775 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
776 ST_flag[offset_ST[i]+iptr]=1;
777 }
778 }
779
780
781 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
782 printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);
783 iter=0;
784 while (numUndefined > 0) {
785 Paso_Coupler_fillOverlap(n, w, w_coupler);
786
787 /* calculate the maximum value of neighbours following active strong connections:
788 w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active */
789 #pragma omp parallel for private(i, iptr)
790 for (i=0; i<my_n; ++i) {
791 if (Status[i]>0) { /* status is still undefined */
792
793 register bool_t inD=TRUE;
794 const double wi=w[i];
795
796 for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
797 const index_t k=S[offset_S[i]+iptr];
798 const index_t* start_p = &ST[offset_ST[k]];
799 const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
800
801 if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
802 if (wi <= w[k] ) {
803 inD=FALSE;
804 break;
805 }
806 }
807
808 }
809
810 if (inD) {
811 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
812 const index_t k=ST[offset_ST[i]+iptr];
813 if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
814 if (wi <= w[k] ) {
815 inD=FALSE;
816 break;
817 }
818 }
819 }
820 }
821 if (inD) {
822 Status[i]=0.; /* is in D */
823 }
824 }
825
826 }
827
828 Paso_Coupler_fillOverlap(n, Status, w_coupler);
829
830
831 /* remove connection to D points :
832
833 for each i in D:
834 for each j in S_i:
835 w[j]--
836 ST_tag[j,i]=-1
837 for each j in ST[i]:
838 ST_tag[i,j]=-1
839 for each k in ST[j]:
840 if k in ST[i]:
841 w[j]--;
842 ST_tag[j,k]=-1
843
844 */
845 /* w is updated for local rows only */
846 {
847 #pragma omp parallel for private(i, jptr)
848 for (i=0; i< my_n; ++i) {
849
850 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
851 const index_t j=ST[offset_ST[i]+jptr];
852 if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
853 w[i]--;
854 ST_flag[offset_ST[i]+jptr]=-1;
855 }
856 }
857
858 }
859 #pragma omp parallel for private(i, jptr)
860 for (i=my_n; i< n; ++i) {
861 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
862 const index_t j = ST[offset_ST[i]+jptr];
863 if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
864 }
865 }
866
867
868 for (i=0; i< n; ++i) {
869 if ( Status[i] == 0. ) {
870
871 const index_t* start_p = &ST[offset_ST[i]];
872
873 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
874 const index_t j=ST[offset_ST[i]+jptr];
875 ST_flag[offset_ST[i]+jptr]=-1;
876 for (kptr=0; kptr<degree_ST[j]; ++kptr) {
877 const index_t k=ST[offset_ST[j]+kptr];
878 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
879 if (ST_flag[offset_ST[j]+kptr] >0) {
880 if (j< my_n ) {
881 w[j]--;
882 }
883 ST_flag[offset_ST[j]+kptr]=-1;
884 }
885 }
886 }
887 }
888 }
889 }
890 }
891
892 /* adjust status */
893 #pragma omp parallel for private(i)
894 for (i=0; i< my_n; ++i) {
895 if ( Status[i] == 0. ) {
896 Status[i] = -10; /* this is now a C point */
897 } else if (Status[i] == 1. && w[i]<1.) {
898 Status[i] = -100; /* this is now a F point */
899 }
900 }
901
902 i = numUndefined;
903 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
904 if (numUndefined == i) {
905 Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
906 return;
907 }
908
909 iter++;
910 printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);
911
912 } /* end of while loop */
913
914 /* map to output :*/
915 Paso_Coupler_fillOverlap(n, Status, w_coupler);
916 #pragma omp parallel for private(i)
917 for (i=0; i< n; ++i) {
918 if (Status[i] > -50.) {
919 split_marker[i]=PASO_AMG_IN_C;
920 } else {
921 split_marker[i]=PASO_AMG_IN_F;
922 }
923 }
924 /* clean up : */
925 Paso_Coupler_free(w_coupler);
926 TMPMEMFREE(random);
927 TMPMEMFREE(w);
928 TMPMEMFREE(Status);
929 TMPMEMFREE(ST_flag);
930
931 return;
932 }
933
934 /* Merge the system matrix which is distributed on ranks into a complete
935 matrix on rank 0, then solve this matrix on rank 0 only */
936 Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
937 index_t i, iptr, j, n, remote_n, total_n, len, offset, tag;
938 index_t row_block_size, col_block_size, block_size;
939 index_t size=A->mpi_info->size;
940 index_t rank=A->mpi_info->rank;
941 index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
942 index_t *temp_n=NULL, *temp_len=NULL;
943 double *val=NULL;
944 Paso_Pattern *pattern=NULL;
945 Paso_SparseMatrix *out=NULL;
946 #ifdef ESYS_MPI
947 MPI_Request* mpi_requests=NULL;
948 MPI_Status* mpi_stati=NULL;
949 #else
950 int *mpi_requests=NULL, *mpi_stati=NULL;
951 #endif
952
953 if (size == 1) {
954 n = A->mainBlock->numRows;
955 ptr = TMPMEMALLOC(n, index_t);
956 #pragma omp parallel for private(i)
957 for (i=0; i<n; i++) ptr[i] = i;
958 out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
959 TMPMEMFREE(ptr);
960 return out;
961 }
962
963 n = A->mainBlock->numRows;
964 block_size = A->block_size;
965
966 /* Merge MainBlock and CoupleBlock to get a complete column entries
967 for each row allocated to current rank. Output (ptr, idx, val)
968 contains all info needed from current rank to merge a system
969 matrix */
970 Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
971
972 #ifdef ESYS_MPI
973 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
974 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
975 #else
976 mpi_requests=TMPMEMALLOC(size*2,int);
977 mpi_stati=TMPMEMALLOC(size*2,int);
978 #endif
979
980 /* Now, pass all info to rank 0 and merge them into one sparse
981 matrix */
982 if (rank == 0) {
983 /* First, copy local ptr values into ptr_global */
984 total_n=Paso_SystemMatrix_getGlobalNumRows(A);
985 ptr_global = MEMALLOC(total_n+1, index_t);
986 memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
987 iptr = n+1;
988 MEMFREE(ptr);
989 temp_n = TMPMEMALLOC(size, index_t);
990 temp_len = TMPMEMALLOC(size, index_t);
991 temp_n[0] = iptr;
992
993 /* Second, receive ptr values from other ranks */
994 for (i=1; i<size; i++) {
995 remote_n = A->row_distribution->first_component[i+1] -
996 A->row_distribution->first_component[i];
997 MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
998 A->mpi_info->msg_tag_counter+i,
999 A->mpi_info->comm,
1000 &mpi_requests[i]);
1001 temp_n[i] = remote_n;
1002 iptr += remote_n;
1003 }
1004 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1005 A->mpi_info->msg_tag_counter += size;
1006
1007 /* Then, prepare to receive idx and val from other ranks */
1008 len = 0;
1009 offset = -1;
1010 for (i=0; i<size; i++) {
1011 if (temp_n[i] > 0) {
1012 offset += temp_n[i];
1013 len += ptr_global[offset];
1014 temp_len[i] = ptr_global[offset];
1015 }else
1016 temp_len[i] = 0;
1017 }
1018
1019 idx_global = MEMALLOC(len, index_t);
1020 iptr = temp_len[0];
1021 offset = n+1;
1022 for (i=1; i<size; i++) {
1023 len = temp_len[i];
1024 MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1025 A->mpi_info->msg_tag_counter+i,
1026 A->mpi_info->comm,
1027 &mpi_requests[i]);
1028 remote_n = temp_n[i];
1029 for (j=0; j<remote_n; j++) {
1030 ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1031 }
1032 offset += remote_n;
1033 iptr += len;
1034 }
1035 memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1036 MEMFREE(idx);
1037 row_block_size = A->mainBlock->row_block_size;
1038 col_block_size = A->mainBlock->col_block_size;
1039 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1040 A->mpi_info->msg_tag_counter += size;
1041 TMPMEMFREE(temp_n);
1042
1043 /* Then generate the sparse matrix */
1044 pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, total_n,
1045 total_n, ptr_global, idx_global);
1046 out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1047 row_block_size, col_block_size, FALSE);
1048 Paso_Pattern_free(pattern);
1049
1050 /* Finally, receive and copy the value */
1051 iptr = temp_len[0] * block_size;
1052 for (i=1; i<size; i++) {
1053 len = temp_len[i];
1054 MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1055 A->mpi_info->msg_tag_counter+i,
1056 A->mpi_info->comm,
1057 &mpi_requests[i]);
1058 iptr += (len * block_size);
1059 }
1060 memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1061 MEMFREE(val);
1062 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1063 A->mpi_info->msg_tag_counter += size;
1064 TMPMEMFREE(temp_len);
1065 } else { /* it's not rank 0 */
1066
1067 /* First, send out the local ptr */
1068 tag = A->mpi_info->msg_tag_counter+rank;
1069 MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1070 &mpi_requests[0]);
1071
1072 /* Next, send out the local idx */
1073 len = ptr[n];
1074 tag += size;
1075 MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1076 &mpi_requests[1]);
1077
1078 /* At last, send out the local val */
1079 len *= block_size;
1080 tag += size;
1081 MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1082 &mpi_requests[2]);
1083
1084 MPI_Waitall(3, mpi_requests, mpi_stati);
1085 A->mpi_info->msg_tag_counter = tag + size - rank;
1086 MEMFREE(ptr);
1087 MEMFREE(idx);
1088 MEMFREE(val);
1089
1090 out = NULL;
1091 }
1092
1093 TMPMEMFREE(mpi_requests);
1094 TMPMEMFREE(mpi_stati);
1095 return out;
1096 }
1097
1098
1099 void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1100 Paso_SystemMatrix *A = amg->A_C;
1101 Paso_SparseMatrix *A_D, *A_temp;
1102 double* x=NULL;
1103 double* b=NULL;
1104 index_t rank = A->mpi_info->rank;
1105 index_t size = A->mpi_info->size;
1106 index_t i, n, p, count, n_block;
1107 index_t *counts, *offset, *dist;
1108
1109 n_block = amg->n_block;
1110
1111 A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1112
1113 /* First, gather x and b into rank 0 */
1114 dist = A->pattern->input_distribution->first_component;
1115 n = Paso_SystemMatrix_getGlobalNumRows(A);
1116 b = TMPMEMALLOC(n*n_block, double);
1117 x = TMPMEMALLOC(n*n_block, double);
1118 counts = TMPMEMALLOC(size, index_t);
1119 offset = TMPMEMALLOC(size, index_t);
1120
1121 #pragma omp parallel for private(i,p)
1122 for (i=0; i<size; i++) {
1123 p = dist[i];
1124 counts[i] = (dist[i+1] - p)*n_block;
1125 offset[i] = p*n_block;
1126 }
1127 count = counts[rank];
1128 MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1129 MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1130
1131 if (rank == 0) {
1132 /* solve locally */
1133 #ifdef MKL
1134 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1135 A_temp->solver_package = PASO_MKL;
1136 Paso_SparseMatrix_free(A_D);
1137 Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1138 Paso_SparseMatrix_free(A_temp);
1139 #else
1140 #ifdef UMFPACK
1141 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1142 A_temp->solver_package = PASO_UMFPACK;
1143 Paso_SparseMatrix_free(A_D);
1144 Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1145 Paso_SparseMatrix_free(A_temp);
1146 #else
1147 A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1148 A_D->solver_package = PASO_SMOOTHER;
1149 Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1150 Paso_SparseMatrix_free(A_D);
1151 #endif
1152 #endif
1153 }
1154
1155 /* now we need to distribute the solution to all ranks */
1156 MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1157
1158 TMPMEMFREE(x);
1159 TMPMEMFREE(b);
1160 TMPMEMFREE(counts);
1161 TMPMEMFREE(offset);
1162 }

  ViewVC Help
Powered by ViewVC 1.1.26