/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Contents of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3828 - (show annotations)
Wed Feb 15 03:27:58 2012 UTC (7 years, 9 months ago) by lgao
File MIME type: text/plain
File size: 37285 byte(s)
Fix all warnings reported on guineapig.


1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: AMG preconditioner (local version) */
18
19 /**************************************************************/
20
21 /* Author: artak@uq.edu.au, l.gross@uq.edu.au */
22
23 /**************************************************************/
24
25 #define SHOW_TIMING FALSE
26 #define MY_DEBUG 0
27 #define MY_DEBUG1 1
28
29 #include "Paso.h"
30 #include "Preconditioner.h"
31 #include "Options.h"
32 #include "PasoUtil.h"
33 #include "UMFPACK.h"
34 #include "MKL.h"
35 #include<stdio.h>
36
37
38 /**************************************************************/
39
40 /* free all memory used by AMG */
41
42 void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43 if (in!=NULL) {
44 Paso_Preconditioner_Smoother_free(in->Smoother);
45 Paso_SystemMatrix_free(in->P);
46 Paso_SystemMatrix_free(in->R);
47 Paso_SystemMatrix_free(in->A_C);
48 Paso_Preconditioner_AMG_free(in->AMG_C);
49 MEMFREE(in->r);
50 MEMFREE(in->x_C);
51 MEMFREE(in->b_C);
52
53 MEMFREE(in);
54 }
55 }
56
57 index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
58 if (in->AMG_C == NULL) {
59 return in->level;
60 } else {
61 return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
62 }
63 }
64 double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
65 if (in->AMG_C == NULL) {
66 if (in->A_C == NULL) {
67 return 1.;
68 } else {
69 return Paso_SystemMatrix_getSparsity(in->A_C);
70 }
71 } else {
72 return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
73 }
74 }
75 dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
76 if (in->AMG_C == NULL) {
77 if (in->A_C == NULL) {
78 return 0;
79 } else {
80 return Paso_SystemMatrix_getTotalNumRows(in->A_C);
81 }
82 } else {
83 return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
84 }
85 }
86 /*****************************************************************
87
88 constructs AMG
89
90 ******************************************************************/
91 Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92
93 Paso_Preconditioner_AMG* out=NULL;
94 Paso_SystemMatrix *A_C=NULL;
95 bool_t verbose=options->verbose;
96
97 const dim_t my_n=A_p->mainBlock->numRows;
98 const dim_t overlap_n=A_p->row_coupleBlock->numRows;
99
100 const dim_t n = my_n + overlap_n;
101
102 const dim_t n_block=A_p->row_block_size;
103 index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
104 dim_t i, n_F, n_C, F_flag, *F_set=NULL;
105 double time0=0;
106 const double theta = options->coarsening_threshold;
107 const double tau = options->diagonal_dominance_threshold;
108 const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
109 const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);
110
111
112 /*
113 is the input matrix A suitable for coarsening?
114
115 */
116 if ( (sparsity >= options->min_coarse_sparsity) ||
117 (total_n <= options->min_coarse_matrix_size) ||
118 (level > options->level_max) ) {
119
120 if (verbose) {
121 /*
122 print stopping condition:
123 - 'SPAR' = min_coarse_matrix_sparsity exceeded
124 - 'SIZE' = min_coarse_matrix_size exceeded
125 - 'LEVEL' = level_max exceeded
126 */
127 printf("Paso_Preconditioner: AMG: termination of coarsening by ");
128
129 if (sparsity >= options->min_coarse_sparsity)
130 printf("SPAR");
131
132 if (total_n <= options->min_coarse_matrix_size)
133 printf("SIZE");
134
135 if (level > options->level_max)
136 printf("LEVEL");
137
138 printf("\n");
139
140 printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
141 level, options->level_max, sparsity, options->min_coarse_sparsity, total_n, options->min_coarse_matrix_size);
142
143 }
144
145 return NULL;
146 } else {
147 /* Start Coarsening : */
148
149 /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
150 const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
151
152 dim_t* degree_S=TMPMEMALLOC(n, dim_t);
153 index_t *offset_S=TMPMEMALLOC(n, index_t);
154 index_t *S=TMPMEMALLOC(len_S, index_t);
155 dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
156 index_t *offset_ST=TMPMEMALLOC(n, index_t);
157 index_t *ST=TMPMEMALLOC(len_S, index_t);
158
159
160 F_marker=TMPMEMALLOC(n,index_t);
161 counter=TMPMEMALLOC(n,index_t);
162
163 if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
164 || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
165 /*
166 make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
167 */
168 Paso_SystemMatrix_copyColCoupleBlock(A_p);
169 Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
170
171 /*
172 set splitting of unknows:
173
174 */
175 time0=Esys_timer();
176 if (n_block>1) {
177 Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
178 } else {
179 Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
180 }
181 Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
182 /* Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
183 */
184
185 Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
186 degree_S, offset_S, S, degree_ST, offset_ST, ST,
187 A_p->col_coupler->connector,A_p->col_distribution);
188
189
190 /* in BoomerAMG if interpolation is used FF connectivity is required */
191 /*MPI:
192 if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
193 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);
194 */
195
196 options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
197 if (Esys_noError() ) {
198 #pragma omp parallel for private(i) schedule(static)
199 for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F);
200
201 /*
202 count number of unkowns to be eliminated:
203 */
204 n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
205 n_C=n-n_F;
206 if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);
207
208 /* collect n_F values on all processes, a direct solver should
209 be used if any n_F value is 0 */
210 F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
211 #ifdef ESYS_MPI
212 MPI_Allgather(&n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
213 #endif
214 F_flag = 1;
215 for (i=0; i<A_p->mpi_info->size; i++) {
216 if (F_set[i] == 0) {
217 F_flag = 0;
218 break;
219 }
220 }
221 TMPMEMFREE(F_set);
222
223 /* if ( n_F == 0 ) { is a nasty case. a direct solver should be used, return NULL */
224 if (F_flag == 0) {
225 out = NULL;
226 } else {
227 out=MEMALLOC(1,Paso_Preconditioner_AMG);
228 if (! Esys_checkPtr(out)) {
229 out->level = level;
230 out->n = n;
231 out->n_F = n_F;
232 out->n_block = n_block;
233 out->A_C = NULL;
234 out->P = NULL;
235 out->R = NULL;
236 out->post_sweeps = options->post_sweeps;
237 out->pre_sweeps = options->pre_sweeps;
238 out->r = NULL;
239 out->x_C = NULL;
240 out->b_C = NULL;
241 out->AMG_C = NULL;
242 out->Smoother=NULL;
243 }
244 mask_C=TMPMEMALLOC(n,index_t);
245 rows_in_F=TMPMEMALLOC(n_F,index_t);
246 Esys_checkPtr(mask_C);
247 Esys_checkPtr(rows_in_F);
248 if ( Esys_noError() ) {
249
250 out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
251
252 if (n_C != 0) {
253 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
254
255 /* allocate helpers :*/
256 out->x_C=MEMALLOC(n_block*n_C,double);
257 out->b_C=MEMALLOC(n_block*n_C,double);
258 out->r=MEMALLOC(n_block*n,double);
259
260 Esys_checkPtr(out->r);
261 Esys_checkPtr(out->x_C);
262 Esys_checkPtr(out->b_C);
263
264 if ( Esys_noError() ) {
265 /* creates index for F:*/
266 #pragma omp parallel private(i)
267 {
268 #pragma omp for schedule(static)
269 for (i = 0; i < n; ++i) {
270 if (F_marker[i]) rows_in_F[counter[i]]=i;
271 }
272 }
273 /* create mask of C nodes with value >-1, gives new id */
274 i=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
275 /*
276 get Prolongation :
277 */
278
279 time0=Esys_timer();
280
281 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
282
283 }
284
285 /*
286 construct Restriction operator as transposed of Prolongation operator:
287 */
288
289 if ( Esys_noError()) {
290 time0=Esys_timer();
291
292 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
293
294 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
295 }
296 /*
297 construct coarse level matrix:
298 */
299 if ( Esys_noError()) {
300 time0=Esys_timer();
301
302 A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
303
304 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
305 }
306
307 /*
308 constructe courser level:
309
310 */
311 if ( Esys_noError()) {
312 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
313 }
314
315 if ( Esys_noError()) {
316 if ( out->AMG_C == NULL ) {
317 /* merge the system matrix into 1 rank when
318 it's not suitable coarsening due to the
319 total number of unknowns are too small */
320 out->A_C=A_C;
321 out->reordering = options->reordering;
322 out->refinements = options->coarse_matrix_refinements;
323 out->verbose = verbose;
324 out->options_smoother = options->smoother;
325 } else {
326 /* finally we set some helpers for the solver step */
327 out->A_C=A_C;
328 }
329 }
330 }
331 }
332 TMPMEMFREE(mask_C);
333 TMPMEMFREE(rows_in_F);
334 }
335 }
336
337 }
338 TMPMEMFREE(counter);
339 TMPMEMFREE(F_marker);
340 TMPMEMFREE(degree_S);
341 TMPMEMFREE(offset_S);
342 TMPMEMFREE(S);
343 TMPMEMFREE(degree_ST);
344 TMPMEMFREE(offset_ST);
345 TMPMEMFREE(ST);
346
347 }
348
349 if (Esys_noError()) {
350 return out;
351 } else {
352 Paso_Preconditioner_AMG_free(out);
353 return NULL;
354 }
355 }
356
357
358 void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
359 const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
360 double time0=0;
361 const dim_t post_sweeps=amg->post_sweeps;
362 const dim_t pre_sweeps=amg->pre_sweeps;
363
364 /* presmoothing */
365 time0=Esys_timer();
366 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
367
368 time0=Esys_timer()-time0;
369 if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
370 /* end of presmoothing */
371
372 if (amg->n_F < amg->n) { /* is there work on the coarse level? */
373 time0=Esys_timer();
374
375 Paso_Copy(n, amg->r, b); /* r <- b */
376 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
377 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */
378
379 time0=Esys_timer()-time0;
380 /* coarse level solve */
381 if ( amg->AMG_C == NULL) {
382 time0=Esys_timer();
383 /* A_C is the coarsest level */
384 Paso_Preconditioner_AMG_mergeSolve(amg);
385
386 if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
387 } else {
388 Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */
389 }
390
391 time0=time0+Esys_timer();
392 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */
393
394 /*postsmoothing*/
395
396 /*solve Ax=b with initial guess x */
397 time0=Esys_timer();
398 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
399 time0=Esys_timer()-time0;
400 if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
401 /*end of postsmoothing*/
402 }
403
404 return;
405 }
406
407 /* theta = threshold for strong connections */
408 /* tau = threshold for diagonal dominance */
409
410 /*S_i={j \in N_i; i strongly coupled to j}
411
412 in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
413 */
414
415 void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
416 dim_t *degree_S, index_t *offset_S, index_t *S,
417 const double theta, const double tau)
418 {
419
420 const dim_t my_n=A->mainBlock->numRows;
421 const dim_t overlap_n=A->row_coupleBlock->numRows;
422
423 index_t iptr, i;
424 double *threshold_p=NULL;
425
426 threshold_p = TMPMEMALLOC(2*my_n, double);
427
428 #pragma omp parallel for private(i,iptr) schedule(static)
429 for (i=0;i<my_n;++i) {
430
431 register double max_offdiagonal = 0.;
432 register double sum_row=0;
433 register double main_row=0;
434 register dim_t kdeg=0;
435 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
436
437
438 /* collect information for row i: */
439 #pragma ivdep
440 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
441 register index_t j=A->mainBlock->pattern->index[iptr];
442 register double fnorm=ABS(A->mainBlock->val[iptr]);
443 if( j != i) {
444 max_offdiagonal = MAX(max_offdiagonal,fnorm);
445 sum_row+=fnorm;
446 } else {
447 main_row=fnorm;
448 }
449
450 }
451
452 #pragma ivdep
453 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
454 register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
455
456 max_offdiagonal = MAX(max_offdiagonal,fnorm);
457 sum_row+=fnorm;
458 }
459
460 /* inspect row i: */
461 {
462 const double threshold = theta*max_offdiagonal;
463 threshold_p[2*i+1]=threshold;
464 if (tau*main_row < sum_row) { /* no diagonal dominance */
465 threshold_p[2*i]=1;
466 #pragma ivdep
467 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
468 register index_t j=A->mainBlock->pattern->index[iptr];
469 if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
470 S[koffset+kdeg] = j;
471 kdeg++;
472 }
473 }
474 #pragma ivdep
475 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
476 register index_t j=A->col_coupleBlock->pattern->index[iptr];
477 if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
478 S[koffset+kdeg] = j + my_n;
479 kdeg++;
480 }
481 }
482 } else {
483 threshold_p[2*i]=-1;
484 }
485 }
486 offset_S[i]=koffset;
487 degree_S[i]=kdeg;
488 }
489
490 /* now we need to distribute the threshold and the diagonal dominance indicator */
491 if (A->mpi_info->size > 1) {
492
493 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
494 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
495
496 double *remote_threshold=NULL;
497
498 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
499 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
500 Paso_Coupler_finishCollect(threshold_coupler);
501 remote_threshold=threshold_coupler->recv_buffer;
502
503 #pragma omp parallel for private(i,iptr) schedule(static)
504 for (i=0; i<overlap_n; i++) {
505 const double threshold = remote_threshold[2*i+1];
506 register dim_t kdeg=0;
507 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
508 if (remote_threshold[2*i]>0) {
509 #pragma ivdep
510 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
511 register index_t j=A->row_coupleBlock->pattern->index[iptr];
512 if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
513 S[koffset+kdeg] = j ;
514 kdeg++;
515 }
516 }
517
518 #pragma ivdep
519 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
520 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
521 if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
522 S[koffset+kdeg] = j + my_n;
523 kdeg++;
524 }
525 }
526 }
527 offset_S[i+my_n]=koffset;
528 degree_S[i+my_n]=kdeg;
529 }
530
531 Paso_Coupler_free(threshold_coupler);
532 }
533 TMPMEMFREE(threshold_p);
534 }
535
536 /* theta = threshold for strong connections */
537 /* tau = threshold for diagonal dominance */
538 /*S_i={j \in N_i; i strongly coupled to j}
539
540 in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
541 */
542 void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
543 dim_t *degree_S, index_t *offset_S, index_t *S,
544 const double theta, const double tau)
545
546 {
547 const dim_t block_size=A->block_size;
548 const dim_t my_n=A->mainBlock->numRows;
549 const dim_t overlap_n=A->row_coupleBlock->numRows;
550
551 index_t iptr, i, bi;
552 double *threshold_p=NULL;
553
554
555 threshold_p = TMPMEMALLOC(2*my_n, double);
556
557 #pragma omp parallel private(i,iptr,bi)
558 {
559
560 dim_t max_deg=0;
561 double *rtmp=NULL;
562
563 #pragma omp for schedule(static)
564 for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
565 +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
566
567 rtmp=TMPMEMALLOC(max_deg, double);
568
569 #pragma omp for schedule(static)
570 for (i=0;i<my_n;++i) {
571 register double max_offdiagonal = 0.;
572 register double sum_row=0;
573 register double main_row=0;
574 register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
575 register dim_t kdeg=0;
576 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
577
578 /* collect information for row i: */
579 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
580 register index_t j=A->mainBlock->pattern->index[iptr];
581 register double fnorm=0;
582 #pragma ivdep
583 for(bi=0;bi<block_size;++bi) {
584 register double rtmp2= A->mainBlock->val[iptr*block_size+bi];
585 fnorm+=rtmp2*rtmp2;
586 }
587 fnorm=sqrt(fnorm);
588 rtmp[iptr+rtmp_offset]=fnorm;
589
590 if( j != i) {
591 max_offdiagonal = MAX(max_offdiagonal,fnorm);
592 sum_row+=fnorm;
593 } else {
594 main_row=fnorm;
595 }
596
597 }
598
599 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
600 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
601 register double fnorm=0;
602 #pragma ivdep
603 for(bi=0;bi<block_size;++bi) {
604 register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
605 fnorm+=rtmp2*rtmp2;
606 }
607 fnorm=sqrt(fnorm);
608
609 rtmp[iptr+rtmp_offset]=fnorm;
610 max_offdiagonal = MAX(max_offdiagonal,fnorm);
611 sum_row+=fnorm;
612 }
613
614
615 /* inspect row i: */
616 {
617 const double threshold = theta*max_offdiagonal;
618 rtmp_offset=-A->mainBlock->pattern->ptr[i];
619
620 threshold_p[2*i+1]=threshold;
621 if (tau*main_row < sum_row) { /* no diagonal dominance */
622 threshold_p[2*i]=1;
623 #pragma ivdep
624 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
625 register index_t j=A->mainBlock->pattern->index[iptr];
626 if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
627 S[koffset+kdeg] = j;
628 kdeg++;
629 }
630 }
631 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
632 #pragma ivdep
633 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
634 register index_t j=A->col_coupleBlock->pattern->index[iptr];
635 if( rtmp[iptr+rtmp_offset] >threshold) {
636 S[koffset+kdeg] = j + my_n;
637 kdeg++;
638 }
639 }
640 } else {
641 threshold_p[2*i]=-1;
642 }
643 }
644 offset_S[i]=koffset;
645 degree_S[i]=kdeg;
646 }
647 TMPMEMFREE(rtmp);
648 }
649 /* now we need to distribute the threshold and the diagonal dominance indicator */
650 if (A->mpi_info->size > 1) {
651
652 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
653 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
654
655 double *remote_threshold=NULL;
656
657 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
658 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
659 Paso_Coupler_finishCollect(threshold_coupler);
660 remote_threshold=threshold_coupler->recv_buffer;
661
662 #pragma omp parallel for private(i,iptr) schedule(static)
663 for (i=0; i<overlap_n; i++) {
664
665 const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
666 register dim_t kdeg=0;
667 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
668 if (remote_threshold[2*i]>0) {
669 #pragma ivdep
670 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
671 register index_t j=A->row_coupleBlock->pattern->index[iptr];
672 register double fnorm2=0;
673 #pragma ivdepremote_threshold[2*i]
674 for(bi=0;bi<block_size;++bi) {
675 register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
676 fnorm2+=rtmp2*rtmp2;
677 }
678
679 if(fnorm2 > threshold2 ) {
680 S[koffset+kdeg] = j ;
681 kdeg++;
682 }
683 }
684
685 #pragma ivdep
686 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
687 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
688 register double fnorm2=0;
689 #pragma ivdepremote_threshold[2*i]
690 for(bi=0;bi<block_size;++bi) {
691 register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
692 fnorm2+=rtmp2*rtmp2;
693 }
694 if(fnorm2 > threshold2 && i != j) {
695 S[koffset+kdeg] = j + my_n;
696 kdeg++;
697 }
698 }
699
700 }
701 offset_S[i+my_n]=koffset;
702 degree_S[i+my_n]=kdeg;
703 }
704 Paso_Coupler_free(threshold_coupler);
705 }
706 TMPMEMFREE(threshold_p);
707 }
708
709 void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
710 const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
711 {
712 index_t i, j;
713 dim_t p;
714 dim_t len=0;
715 #pragma omp parallel for private(i) schedule(static)
716 for (i=0; i<nT ;++i) {
717 degree_ST[i]=0;
718 }
719 for (i=0; i<n ;++i) {
720 for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
721 }
722 for (i=0; i<nT ;++i) {
723 offset_ST[i]=len;
724 len+=degree_ST[i];
725 degree_ST[i]=0;
726 }
727 for (i=0; i<n ;++i) {
728 for (p=0; p<degree_S[i]; ++p) {
729 j=S[offset_S[i]+p];
730 ST[offset_ST[j]+degree_ST[j]]=i;
731 degree_ST[j]++;
732 }
733 }
734 }
735
736 int compareindex(const void *a, const void *b)
737 {
738 return (*(int *)a - *(int *)b);
739 }
740
741 void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
742 const dim_t* degree_S, const index_t* offset_S, const index_t* S,
743 const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
744 Paso_Connector* col_connector, Paso_Distribution* col_dist)
745 {
746 dim_t i, numUndefined, iter=0;
747 index_t iptr, jptr, kptr;
748 double *random=NULL, *w=NULL, *Status=NULL;
749 index_t * ST_flag=NULL;
750
751 Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector ,1);
752
753 w=TMPMEMALLOC(n, double);
754 Status=TMPMEMALLOC(n, double);
755 random = Paso_Distribution_createRandomVector(col_dist,1);
756 ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
757
758 #pragma omp parallel for private(i)
759 for (i=0; i< my_n; ++i) {
760 w[i]=degree_ST[i]+random[i];
761 if (degree_ST[i] < 1) {
762 Status[i]=-100; /* F point */
763 } else {
764 Status[i]=1; /* status undefined */
765 }
766 }
767
768 #pragma omp parallel for private(i, iptr)
769 for (i=0; i< n; ++i) {
770 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
771 ST_flag[offset_ST[i]+iptr]=1;
772 }
773 }
774
775
776 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
777 printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);
778 iter=0;
779 while (numUndefined > 0) {
780 Paso_Coupler_fillOverlap(n, w, w_coupler);
781
782 /* calculate the maximum value of neighbours following active strong connections:
783 w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active */
784 #pragma omp parallel for private(i, iptr)
785 for (i=0; i<my_n; ++i) {
786 if (Status[i]>0) { /* status is still undefined */
787
788 register bool_t inD=TRUE;
789 const double wi=w[i];
790
791 for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
792 const index_t k=S[offset_S[i]+iptr];
793 const index_t* start_p = &ST[offset_ST[k]];
794 const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
795
796 if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
797 if (wi <= w[k] ) {
798 inD=FALSE;
799 break;
800 }
801 }
802
803 }
804
805 if (inD) {
806 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
807 const index_t k=ST[offset_ST[i]+iptr];
808 if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
809 if (wi <= w[k] ) {
810 inD=FALSE;
811 break;
812 }
813 }
814 }
815 }
816 if (inD) {
817 Status[i]=0.; /* is in D */
818 }
819 }
820
821 }
822
823 Paso_Coupler_fillOverlap(n, Status, w_coupler);
824
825
826 /* remove connection to D points :
827
828 for each i in D:
829 for each j in S_i:
830 w[j]--
831 ST_tag[j,i]=-1
832 for each j in ST[i]:
833 ST_tag[i,j]=-1
834 for each k in ST[j]:
835 if k in ST[i]:
836 w[j]--;
837 ST_tag[j,k]=-1
838
839 */
840 /* w is updated for local rows only */
841 {
842 #pragma omp parallel for private(i, jptr)
843 for (i=0; i< my_n; ++i) {
844
845 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
846 const index_t j=ST[offset_ST[i]+jptr];
847 if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
848 w[i]--;
849 ST_flag[offset_ST[i]+jptr]=-1;
850 }
851 }
852
853 }
854 #pragma omp parallel for private(i, jptr)
855 for (i=my_n; i< n; ++i) {
856 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
857 const index_t j = ST[offset_ST[i]+jptr];
858 if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
859 }
860 }
861
862
863 for (i=0; i< n; ++i) {
864 if ( Status[i] == 0. ) {
865
866 const index_t* start_p = &ST[offset_ST[i]];
867
868 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
869 const index_t j=ST[offset_ST[i]+jptr];
870 ST_flag[offset_ST[i]+jptr]=-1;
871 for (kptr=0; kptr<degree_ST[j]; ++kptr) {
872 const index_t k=ST[offset_ST[j]+kptr];
873 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
874 if (ST_flag[offset_ST[j]+kptr] >0) {
875 if (j< my_n ) {
876 w[j]--;
877 }
878 ST_flag[offset_ST[j]+kptr]=-1;
879 }
880 }
881 }
882 }
883 }
884 }
885 }
886
887 /* adjust status */
888 #pragma omp parallel for private(i)
889 for (i=0; i< my_n; ++i) {
890 if ( Status[i] == 0. ) {
891 Status[i] = -10; /* this is now a C point */
892 } else if (Status[i] == 1. && w[i]<1.) {
893 Status[i] = -100; /* this is now a F point */
894 }
895 }
896
897 i = numUndefined;
898 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
899 if (numUndefined == i) {
900 Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
901 return;
902 }
903
904 iter++;
905 printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);
906
907 } /* end of while loop */
908
909 /* map to output :*/
910 Paso_Coupler_fillOverlap(n, Status, w_coupler);
911 #pragma omp parallel for private(i)
912 for (i=0; i< n; ++i) {
913 if (Status[i] > -50.) {
914 split_marker[i]=PASO_AMG_IN_C;
915 } else {
916 split_marker[i]=PASO_AMG_IN_F;
917 }
918 }
919 /* clean up : */
920 Paso_Coupler_free(w_coupler);
921 TMPMEMFREE(random);
922 TMPMEMFREE(w);
923 TMPMEMFREE(Status);
924 TMPMEMFREE(ST_flag);
925
926 return;
927 }
928
929 /* Merge the system matrix which is distributed on ranks into a complete
930 matrix on rank 0, then solve this matrix on rank 0 only */
931 Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
932 index_t i, iptr, j, n, remote_n, total_n, len, offset, tag;
933 index_t row_block_size, col_block_size, block_size;
934 index_t size=A->mpi_info->size;
935 index_t rank=A->mpi_info->rank;
936 index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
937 index_t *temp_n=NULL, *temp_len=NULL;
938 double *val=NULL;
939 Paso_Pattern *pattern=NULL;
940 Paso_SparseMatrix *out=NULL;
941 #ifdef ESYS_MPI
942 MPI_Request* mpi_requests=NULL;
943 MPI_Status* mpi_stati=NULL;
944 #else
945 int *mpi_requests=NULL, *mpi_stati=NULL;
946 #endif
947
948 if (size == 1) {
949 n = A->mainBlock->numRows;
950 ptr = TMPMEMALLOC(n, index_t);
951 #pragma omp parallel for private(i)
952 for (i=0; i<n; i++) ptr[i] = i;
953 out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
954 TMPMEMFREE(ptr);
955 return out;
956 }
957
958 n = A->mainBlock->numRows;
959 block_size = A->block_size;
960
961 /* Merge MainBlock and CoupleBlock to get a complete column entries
962 for each row allocated to current rank. Output (ptr, idx, val)
963 contains all info needed from current rank to merge a system
964 matrix */
965 Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
966
967 #ifdef ESYS_MPI
968 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
969 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
970 #else
971 mpi_requests=TMPMEMALLOC(size*2,int);
972 mpi_stati=TMPMEMALLOC(size*2,int);
973 #endif
974
975 /* Now, pass all info to rank 0 and merge them into one sparse
976 matrix */
977 if (rank == 0) {
978 /* First, copy local ptr values into ptr_global */
979 total_n=Paso_SystemMatrix_getGlobalNumRows(A);
980 ptr_global = MEMALLOC(total_n+1, index_t);
981 memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
982 iptr = n+1;
983 MEMFREE(ptr);
984 temp_n = TMPMEMALLOC(size, index_t);
985 temp_len = TMPMEMALLOC(size, index_t);
986 temp_n[0] = iptr;
987
988 /* Second, receive ptr values from other ranks */
989 for (i=1; i<size; i++) {
990 remote_n = A->row_distribution->first_component[i+1] -
991 A->row_distribution->first_component[i];
992 #ifdef ESYS_MPI
993 MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
994 A->mpi_info->msg_tag_counter+i,
995 A->mpi_info->comm,
996 &mpi_requests[i]);
997 #endif
998 temp_n[i] = remote_n;
999 iptr += remote_n;
1000 }
1001 #ifdef ESYS_MPI
1002 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1003 #endif
1004 A->mpi_info->msg_tag_counter += size;
1005
1006 /* Then, prepare to receive idx and val from other ranks */
1007 len = 0;
1008 offset = -1;
1009 for (i=0; i<size; i++) {
1010 if (temp_n[i] > 0) {
1011 offset += temp_n[i];
1012 len += ptr_global[offset];
1013 temp_len[i] = ptr_global[offset];
1014 }else
1015 temp_len[i] = 0;
1016 }
1017
1018 idx_global = MEMALLOC(len, index_t);
1019 iptr = temp_len[0];
1020 offset = n+1;
1021 for (i=1; i<size; i++) {
1022 len = temp_len[i];
1023 #ifdef ESYS_MPI
1024 MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1025 A->mpi_info->msg_tag_counter+i,
1026 A->mpi_info->comm,
1027 &mpi_requests[i]);
1028 #endif
1029 remote_n = temp_n[i];
1030 for (j=0; j<remote_n; j++) {
1031 ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1032 }
1033 offset += remote_n;
1034 iptr += len;
1035 }
1036 memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1037 MEMFREE(idx);
1038 row_block_size = A->mainBlock->row_block_size;
1039 col_block_size = A->mainBlock->col_block_size;
1040 #ifdef ESYS_MPI
1041 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1042 #endif
1043 A->mpi_info->msg_tag_counter += size;
1044 TMPMEMFREE(temp_n);
1045
1046 /* Then generate the sparse matrix */
1047 pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, total_n,
1048 total_n, ptr_global, idx_global);
1049 out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1050 row_block_size, col_block_size, FALSE);
1051 Paso_Pattern_free(pattern);
1052
1053 /* Finally, receive and copy the value */
1054 iptr = temp_len[0] * block_size;
1055 for (i=1; i<size; i++) {
1056 len = temp_len[i];
1057 #ifdef ESYS_MPI
1058 MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1059 A->mpi_info->msg_tag_counter+i,
1060 A->mpi_info->comm,
1061 &mpi_requests[i]);
1062 #endif
1063 iptr += (len * block_size);
1064 }
1065 memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1066 MEMFREE(val);
1067 #ifdef ESYS_MPI
1068 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1069 #endif
1070 A->mpi_info->msg_tag_counter += size;
1071 TMPMEMFREE(temp_len);
1072 } else { /* it's not rank 0 */
1073
1074 /* First, send out the local ptr */
1075 tag = A->mpi_info->msg_tag_counter+rank;
1076 #ifdef ESYS_MPI
1077 MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1078 &mpi_requests[0]);
1079 #endif
1080
1081 /* Next, send out the local idx */
1082 len = ptr[n];
1083 tag += size;
1084 #ifdef ESYS_MPI
1085 MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1086 &mpi_requests[1]);
1087 #endif
1088
1089 /* At last, send out the local val */
1090 len *= block_size;
1091 tag += size;
1092 #ifdef ESYS_MPI
1093 MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1094 &mpi_requests[2]);
1095
1096 MPI_Waitall(3, mpi_requests, mpi_stati);
1097 #endif
1098 A->mpi_info->msg_tag_counter = tag + size - rank;
1099 MEMFREE(ptr);
1100 MEMFREE(idx);
1101 MEMFREE(val);
1102
1103 out = NULL;
1104 }
1105
1106 TMPMEMFREE(mpi_requests);
1107 TMPMEMFREE(mpi_stati);
1108 return out;
1109 }
1110
1111
1112 void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1113 Paso_SystemMatrix *A = amg->A_C;
1114 Paso_SparseMatrix *A_D, *A_temp;
1115 double* x=NULL;
1116 double* b=NULL;
1117 index_t rank = A->mpi_info->rank;
1118 index_t size = A->mpi_info->size;
1119 index_t i, n, p, count, n_block;
1120 index_t *counts, *offset, *dist;
1121
1122 n_block = amg->n_block;
1123
1124 A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1125
1126 /* First, gather x and b into rank 0 */
1127 dist = A->pattern->input_distribution->first_component;
1128 n = Paso_SystemMatrix_getGlobalNumRows(A);
1129 b = TMPMEMALLOC(n*n_block, double);
1130 x = TMPMEMALLOC(n*n_block, double);
1131 counts = TMPMEMALLOC(size, index_t);
1132 offset = TMPMEMALLOC(size, index_t);
1133
1134 #pragma omp parallel for private(i,p)
1135 for (i=0; i<size; i++) {
1136 p = dist[i];
1137 counts[i] = (dist[i+1] - p)*n_block;
1138 offset[i] = p*n_block;
1139 }
1140 count = counts[rank];
1141 #ifdef ESYS_MPI
1142 MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1143 MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1144 #endif
1145
1146 if (rank == 0) {
1147 /* solve locally */
1148 #ifdef MKL
1149 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1150 A_temp->solver_package = PASO_MKL;
1151 Paso_SparseMatrix_free(A_D);
1152 Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1153 Paso_SparseMatrix_free(A_temp);
1154 #else
1155 #ifdef UMFPACK
1156 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1157 A_temp->solver_package = PASO_UMFPACK;
1158 Paso_SparseMatrix_free(A_D);
1159 Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1160 Paso_SparseMatrix_free(A_temp);
1161 #else
1162 A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1163 A_D->solver_package = PASO_SMOOTHER;
1164 Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1165 Paso_SparseMatrix_free(A_D);
1166 #endif
1167 #endif
1168 }
1169
1170 #ifdef ESYS_MPI
1171 /* now we need to distribute the solution to all ranks */
1172 MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1173 #endif
1174
1175 TMPMEMFREE(x);
1176 TMPMEMFREE(b);
1177 TMPMEMFREE(counts);
1178 TMPMEMFREE(offset);
1179 }

  ViewVC Help
Powered by ViewVC 1.1.26