/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Contents of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3886 - (show annotations)
Thu Apr 5 00:50:30 2012 UTC (7 years, 5 months ago) by gross
File MIME type: text/plain
File size: 37370 byte(s)
some minor problems with MPI AMG fixed.
1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: AMG preconditioner (local version) */
18
19 /**************************************************************/
20
21 /* Author: artak@uq.edu.au, l.gross@uq.edu.au */
22
23 /**************************************************************/
24
25 #define SHOW_TIMING FALSE
26 #define MY_DEBUG 0
27 #define MY_DEBUG1 1
28
29 #include "Paso.h"
30 #include "Preconditioner.h"
31 #include "Options.h"
32 #include "PasoUtil.h"
33 #include "UMFPACK.h"
34 #include "MKL.h"
35 #include<stdio.h>
36
37
38 /**************************************************************/
39
40 /* free all memory used by AMG */
41
42 void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43 if (in!=NULL) {
44 Paso_Preconditioner_Smoother_free(in->Smoother);
45 Paso_SystemMatrix_free(in->P);
46 Paso_SystemMatrix_free(in->R);
47 Paso_SystemMatrix_free(in->A_C);
48 Paso_Preconditioner_AMG_free(in->AMG_C);
49 MEMFREE(in->r);
50 MEMFREE(in->x_C);
51 MEMFREE(in->b_C);
52
53 MEMFREE(in);
54 }
55 }
56
57 index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
58 if (in->AMG_C == NULL) {
59 return in->level;
60 } else {
61 return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
62 }
63 }
64 double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
65 if (in->AMG_C == NULL) {
66 if (in->A_C == NULL) {
67 return 1.;
68 } else {
69 return Paso_SystemMatrix_getSparsity(in->A_C);
70 }
71 } else {
72 return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
73 }
74 }
75 dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
76 if (in->AMG_C == NULL) {
77 if (in->A_C == NULL) {
78 return 0;
79 } else {
80 return Paso_SystemMatrix_getTotalNumRows(in->A_C);
81 }
82 } else {
83 return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
84 }
85 }
86 /*****************************************************************
87
88 constructs AMG
89
90 ******************************************************************/
91 Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92
93 Paso_Preconditioner_AMG* out=NULL;
94 Paso_SystemMatrix *A_C=NULL;
95 bool_t verbose=options->verbose;
96
97 const dim_t my_n=A_p->mainBlock->numRows;
98 const dim_t overlap_n=A_p->row_coupleBlock->numRows;
99
100 const dim_t n = my_n + overlap_n;
101
102 const dim_t n_block=A_p->row_block_size;
103 index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
104 dim_t i, my_n_F, my_n_C, n_C, F_flag, *F_set=NULL, global_n_C=0, global_n_F=0, n_F;
105 double time0=0;
106 const double theta = options->coarsening_threshold;
107 const double tau = options->diagonal_dominance_threshold;
108 const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
109 const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
110
111
112 /*
113 is the input matrix A suitable for coarsening?
114
115 */
116 if ( (sparsity >= options->min_coarse_sparsity) ||
117 (global_n <= options->min_coarse_matrix_size) ||
118 (level > options->level_max) ) {
119
120 if (verbose) {
121 /*
122 print stopping condition:
123 - 'SPAR' = min_coarse_matrix_sparsity exceeded
124 - 'SIZE' = min_coarse_matrix_size exceeded
125 - 'LEVEL' = level_max exceeded
126 */
127 printf("Paso_Preconditioner: AMG: termination of coarsening by ");
128
129 if (sparsity >= options->min_coarse_sparsity)
130 printf("SPAR");
131
132 if (global_n <= options->min_coarse_matrix_size)
133 printf("SIZE");
134
135 if (level > options->level_max)
136 printf("LEVEL");
137
138 printf("\n");
139
140 printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
141 level, options->level_max, sparsity, options->min_coarse_sparsity, global_n, options->min_coarse_matrix_size);
142
143 }
144
145 return NULL;
146 } else {
147 /* Start Coarsening : */
148
149 /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
150 const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
151
152 dim_t* degree_S=TMPMEMALLOC(n, dim_t);
153 index_t *offset_S=TMPMEMALLOC(n, index_t);
154 index_t *S=TMPMEMALLOC(len_S, index_t);
155 dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
156 index_t *offset_ST=TMPMEMALLOC(n, index_t);
157 index_t *ST=TMPMEMALLOC(len_S, index_t);
158
159
160 F_marker=TMPMEMALLOC(n,index_t);
161 counter=TMPMEMALLOC(n,index_t);
162
163 if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
164 || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
165 /*
166 make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
167 */
168 Paso_SystemMatrix_copyColCoupleBlock(A_p);
169 Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
170
171 /*
172 set splitting of unknows:
173
174 */
175 time0=Esys_timer();
176 if (n_block>1) {
177 Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
178 } else {
179 Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
180 }
181 Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
182 /* Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
183 */
184
185 Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
186 degree_S, offset_S, S, degree_ST, offset_ST, ST,
187 A_p->col_coupler->connector,A_p->col_distribution);
188
189
190 /* in BoomerAMG if interpolation is used FF connectivity is required */
191 /*MPI:
192 if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
193 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);
194 */
195
196 options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
197 if (Esys_noError() ) {
198 #pragma omp parallel for private(i) schedule(static)
199 for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F);
200
201 /*
202 count number of unkowns to be eliminated:
203 */
204 my_n_F=Paso_Util_cumsum_maskedTrue(my_n,counter, F_marker);
205 n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
206 /* collect my_n_F values on all processes, a direct solver should
207 be used if any my_n_F value is 0 */
208 F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
209 #ifdef ESYS_MPI
210 MPI_Allgather(&my_n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
211 #endif
212 global_n_F=0;
213 F_flag = 1;
214 for (i=0; i<A_p->mpi_info->size; i++) {
215 global_n_F+=F_set[i];
216 if (F_set[i] == 0) F_flag = 0;
217 }
218 TMPMEMFREE(F_set);
219
220 my_n_C=my_n-my_n_F;
221 global_n_C=global_n-global_n_F;
222 if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,global_n_F,global_n_C);
223
224
225 /* if ( n_F == 0 ) { is a nasty case. a direct solver should be used, return NULL */
226 if (F_flag == 0) {
227 out = NULL;
228 } else {
229 out=MEMALLOC(1,Paso_Preconditioner_AMG);
230 if (! Esys_checkPtr(out)) {
231 out->level = level;
232 out->A_C = NULL;
233 out->P = NULL;
234 out->R = NULL;
235 out->post_sweeps = options->post_sweeps;
236 out->pre_sweeps = options->pre_sweeps;
237 out->r = NULL;
238 out->x_C = NULL;
239 out->b_C = NULL;
240 out->AMG_C = NULL;
241 out->Smoother=NULL;
242 }
243 mask_C=TMPMEMALLOC(n,index_t);
244 rows_in_F=TMPMEMALLOC(n_F,index_t);
245 Esys_checkPtr(mask_C);
246 Esys_checkPtr(rows_in_F);
247 if ( Esys_noError() ) {
248
249 out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
250
251 if (global_n_C != 0) {
252 /* create mask of C nodes with value >-1, gives new id */
253 n_C=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
254 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
255
256 /* allocate helpers :*/
257 out->x_C=MEMALLOC(n_block*my_n_C,double);
258 out->b_C=MEMALLOC(n_block*my_n_C,double);
259 out->r=MEMALLOC(n_block*my_n,double);
260
261 Esys_checkPtr(out->r);
262 Esys_checkPtr(out->x_C);
263 Esys_checkPtr(out->b_C);
264
265 if ( Esys_noError() ) {
266 /* creates index for F:*/
267 #pragma omp parallel private(i)
268 {
269 #pragma omp for schedule(static)
270 for (i = 0; i < n; ++i) {
271 if (F_marker[i]) rows_in_F[counter[i]]=i;
272 }
273 }
274 /*
275 get Prolongation :
276 */
277
278 time0=Esys_timer();
279
280 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
281
282 }
283
284 /*
285 construct Restriction operator as transposed of Prolongation operator:
286 */
287
288 if ( Esys_noError()) {
289 time0=Esys_timer();
290
291 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
292
293 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
294 }
295 /*
296 construct coarse level matrix:
297 */
298 if ( Esys_noError()) {
299 time0=Esys_timer();
300
301 A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
302
303 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
304 }
305
306 /*
307 constructe courser level:
308
309 */
310 if ( Esys_noError()) {
311 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
312 }
313
314 if ( Esys_noError()) {
315 if ( out->AMG_C == NULL ) {
316 /* merge the system matrix into 1 rank when
317 it's not suitable coarsening due to the
318 total number of unknowns are too small */
319 out->A_C=A_C;
320 out->reordering = options->reordering;
321 out->refinements = options->coarse_matrix_refinements;
322 out->verbose = verbose;
323 out->options_smoother = options->smoother;
324 } else {
325 /* finally we set some helpers for the solver step */
326 out->A_C=A_C;
327 }
328 }
329 }
330 }
331 TMPMEMFREE(mask_C);
332 TMPMEMFREE(rows_in_F);
333 }
334 }
335
336 }
337 TMPMEMFREE(counter);
338 TMPMEMFREE(F_marker);
339 TMPMEMFREE(degree_S);
340 TMPMEMFREE(offset_S);
341 TMPMEMFREE(S);
342 TMPMEMFREE(degree_ST);
343 TMPMEMFREE(offset_ST);
344 TMPMEMFREE(ST);
345
346 }
347
348 if (Esys_noError()) {
349 return out;
350 } else {
351 Paso_Preconditioner_AMG_free(out);
352 return NULL;
353 }
354 }
355
356
357 void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
358 const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
359 double time0=0;
360 const dim_t post_sweeps=amg->post_sweeps;
361 const dim_t pre_sweeps=amg->pre_sweeps;
362
363 /* presmoothing */
364 time0=Esys_timer();
365 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
366
367 time0=Esys_timer()-time0;
368 if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
369 /* end of presmoothing */
370
371 time0=Esys_timer();
372
373 Paso_Copy(n, amg->r, b); /* r <- b */
374 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
375 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */
376
377 time0=Esys_timer()-time0;
378 /* coarse level solve */
379 if ( amg->AMG_C == NULL) {
380 time0=Esys_timer();
381 /* A_C is the coarsest level */
382 Paso_Preconditioner_AMG_mergeSolve(amg);
383
384 if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
385 } else {
386 Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */
387 }
388
389 time0=time0+Esys_timer();
390 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */
391
392 /*postsmoothing*/
393
394 /*solve Ax=b with initial guess x */
395 time0=Esys_timer();
396 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
397 time0=Esys_timer()-time0;
398 if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
399 return;
400 }
401
402 /* theta = threshold for strong connections */
403 /* tau = threshold for diagonal dominance */
404
405 /*S_i={j \in N_i; i strongly coupled to j}
406
407 in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
408 */
409
410 void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
411 dim_t *degree_S, index_t *offset_S, index_t *S,
412 const double theta, const double tau)
413 {
414
415 const dim_t my_n=A->mainBlock->numRows;
416 const dim_t overlap_n=A->row_coupleBlock->numRows;
417
418 index_t iptr, i;
419 double *threshold_p=NULL;
420
421 threshold_p = TMPMEMALLOC(2*my_n, double);
422
423 #pragma omp parallel for private(i,iptr) schedule(static)
424 for (i=0;i<my_n;++i) {
425
426 register double max_offdiagonal = 0.;
427 register double sum_row=0;
428 register double main_row=0;
429 register dim_t kdeg=0;
430 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
431
432
433 /* collect information for row i: */
434 #pragma ivdep
435 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
436 register index_t j=A->mainBlock->pattern->index[iptr];
437 register double fnorm=ABS(A->mainBlock->val[iptr]);
438 if( j != i) {
439 max_offdiagonal = MAX(max_offdiagonal,fnorm);
440 sum_row+=fnorm;
441 } else {
442 main_row=fnorm;
443 }
444
445 }
446
447 #pragma ivdep
448 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
449 register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
450
451 max_offdiagonal = MAX(max_offdiagonal,fnorm);
452 sum_row+=fnorm;
453 }
454
455 /* inspect row i: */
456 {
457 const double threshold = theta*max_offdiagonal;
458 threshold_p[2*i+1]=threshold;
459 if (tau*main_row < sum_row) { /* no diagonal dominance */
460 threshold_p[2*i]=1;
461 #pragma ivdep
462 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
463 register index_t j=A->mainBlock->pattern->index[iptr];
464 if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
465 S[koffset+kdeg] = j;
466 kdeg++;
467 }
468 }
469 #pragma ivdep
470 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
471 register index_t j=A->col_coupleBlock->pattern->index[iptr];
472 if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
473 S[koffset+kdeg] = j + my_n;
474 kdeg++;
475 }
476 }
477 } else {
478 threshold_p[2*i]=-1;
479 }
480 }
481 offset_S[i]=koffset;
482 degree_S[i]=kdeg;
483 }
484
485 /* now we need to distribute the threshold and the diagonal dominance indicator */
486 if (A->mpi_info->size > 1) {
487
488 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
489 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
490
491 double *remote_threshold=NULL;
492
493 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
494 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
495 Paso_Coupler_finishCollect(threshold_coupler);
496 remote_threshold=threshold_coupler->recv_buffer;
497
498 #pragma omp parallel for private(i,iptr) schedule(static)
499 for (i=0; i<overlap_n; i++) {
500 const double threshold = remote_threshold[2*i+1];
501 register dim_t kdeg=0;
502 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
503 if (remote_threshold[2*i]>0) {
504 #pragma ivdep
505 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
506 register index_t j=A->row_coupleBlock->pattern->index[iptr];
507 if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
508 S[koffset+kdeg] = j ;
509 kdeg++;
510 }
511 }
512
513 #pragma ivdep
514 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
515 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
516 if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
517 S[koffset+kdeg] = j + my_n;
518 kdeg++;
519 }
520 }
521 }
522 offset_S[i+my_n]=koffset;
523 degree_S[i+my_n]=kdeg;
524 }
525
526 Paso_Coupler_free(threshold_coupler);
527 }
528 TMPMEMFREE(threshold_p);
529 }
530
531 /* theta = threshold for strong connections */
532 /* tau = threshold for diagonal dominance */
533 /*S_i={j \in N_i; i strongly coupled to j}
534
535 in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
536 */
537 void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
538 dim_t *degree_S, index_t *offset_S, index_t *S,
539 const double theta, const double tau)
540
541 {
542 const dim_t block_size=A->block_size;
543 const dim_t my_n=A->mainBlock->numRows;
544 const dim_t overlap_n=A->row_coupleBlock->numRows;
545
546 index_t iptr, i, bi;
547 double *threshold_p=NULL;
548
549
550 threshold_p = TMPMEMALLOC(2*my_n, double);
551
552 #pragma omp parallel private(i,iptr,bi)
553 {
554
555 dim_t max_deg=0;
556 double *rtmp=NULL;
557
558 #pragma omp for schedule(static)
559 for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
560 +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
561
562 rtmp=TMPMEMALLOC(max_deg, double);
563
564 #pragma omp for schedule(static)
565 for (i=0;i<my_n;++i) {
566 register double max_offdiagonal = 0.;
567 register double sum_row=0;
568 register double main_row=0;
569 register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
570 register dim_t kdeg=0;
571 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
572
573 /* collect information for row i: */
574 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
575 register index_t j=A->mainBlock->pattern->index[iptr];
576 register double fnorm=0;
577 #pragma ivdep
578 for(bi=0;bi<block_size;++bi) {
579 register double rtmp2= A->mainBlock->val[iptr*block_size+bi];
580 fnorm+=rtmp2*rtmp2;
581 }
582 fnorm=sqrt(fnorm);
583 rtmp[iptr+rtmp_offset]=fnorm;
584
585 if( j != i) {
586 max_offdiagonal = MAX(max_offdiagonal,fnorm);
587 sum_row+=fnorm;
588 } else {
589 main_row=fnorm;
590 }
591
592 }
593
594 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
595 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
596 register double fnorm=0;
597 #pragma ivdep
598 for(bi=0;bi<block_size;++bi) {
599 register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
600 fnorm+=rtmp2*rtmp2;
601 }
602 fnorm=sqrt(fnorm);
603
604 rtmp[iptr+rtmp_offset]=fnorm;
605 max_offdiagonal = MAX(max_offdiagonal,fnorm);
606 sum_row+=fnorm;
607 }
608
609
610 /* inspect row i: */
611 {
612 const double threshold = theta*max_offdiagonal;
613 rtmp_offset=-A->mainBlock->pattern->ptr[i];
614
615 threshold_p[2*i+1]=threshold;
616 if (tau*main_row < sum_row) { /* no diagonal dominance */
617 threshold_p[2*i]=1;
618 #pragma ivdep
619 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
620 register index_t j=A->mainBlock->pattern->index[iptr];
621 if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
622 S[koffset+kdeg] = j;
623 kdeg++;
624 }
625 }
626 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
627 #pragma ivdep
628 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
629 register index_t j=A->col_coupleBlock->pattern->index[iptr];
630 if( rtmp[iptr+rtmp_offset] >threshold) {
631 S[koffset+kdeg] = j + my_n;
632 kdeg++;
633 }
634 }
635 } else {
636 threshold_p[2*i]=-1;
637 }
638 }
639 offset_S[i]=koffset;
640 degree_S[i]=kdeg;
641 }
642 TMPMEMFREE(rtmp);
643 }
644 /* now we need to distribute the threshold and the diagonal dominance indicator */
645 if (A->mpi_info->size > 1) {
646
647 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
648 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
649
650 double *remote_threshold=NULL;
651
652 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
653 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
654 Paso_Coupler_finishCollect(threshold_coupler);
655 remote_threshold=threshold_coupler->recv_buffer;
656
657 #pragma omp parallel for private(i,iptr) schedule(static)
658 for (i=0; i<overlap_n; i++) {
659
660 const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
661 register dim_t kdeg=0;
662 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
663 if (remote_threshold[2*i]>0) {
664 #pragma ivdep
665 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
666 register index_t j=A->row_coupleBlock->pattern->index[iptr];
667 register double fnorm2=0;
668 #pragma ivdepremote_threshold[2*i]
669 for(bi=0;bi<block_size;++bi) {
670 register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
671 fnorm2+=rtmp2*rtmp2;
672 }
673
674 if(fnorm2 > threshold2 ) {
675 S[koffset+kdeg] = j ;
676 kdeg++;
677 }
678 }
679
680 #pragma ivdep
681 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
682 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
683 register double fnorm2=0;
684 #pragma ivdepremote_threshold[2*i]
685 for(bi=0;bi<block_size;++bi) {
686 register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
687 fnorm2+=rtmp2*rtmp2;
688 }
689 if(fnorm2 > threshold2 && i != j) {
690 S[koffset+kdeg] = j + my_n;
691 kdeg++;
692 }
693 }
694
695 }
696 offset_S[i+my_n]=koffset;
697 degree_S[i+my_n]=kdeg;
698 }
699 Paso_Coupler_free(threshold_coupler);
700 }
701 TMPMEMFREE(threshold_p);
702 }
703
704 void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
705 const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
706 {
707 index_t i, j;
708 dim_t p;
709 dim_t len=0;
710 #pragma omp parallel for private(i) schedule(static)
711 for (i=0; i<nT ;++i) {
712 degree_ST[i]=0;
713 }
714 for (i=0; i<n ;++i) {
715 for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
716 }
717 for (i=0; i<nT ;++i) {
718 offset_ST[i]=len;
719 len+=degree_ST[i];
720 degree_ST[i]=0;
721 }
722 for (i=0; i<n ;++i) {
723 for (p=0; p<degree_S[i]; ++p) {
724 j=S[offset_S[i]+p];
725 ST[offset_ST[j]+degree_ST[j]]=i;
726 degree_ST[j]++;
727 }
728 }
729 }
730
731 int compareindex(const void *a, const void *b)
732 {
733 return (*(int *)a - *(int *)b);
734 }
735
736 void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
737 const dim_t* degree_S, const index_t* offset_S, const index_t* S,
738 const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
739 Paso_Connector* col_connector, Paso_Distribution* col_dist)
740 {
741 dim_t i, numUndefined, iter=0;
742 index_t iptr, jptr, kptr;
743 double *random=NULL, *w=NULL, *Status=NULL;
744 index_t * ST_flag=NULL;
745
746 Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector ,1);
747
748 w=TMPMEMALLOC(n, double);
749 Status=TMPMEMALLOC(n, double);
750 random = Paso_Distribution_createRandomVector(col_dist,1);
751 ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
752
753 #pragma omp parallel for private(i)
754 for (i=0; i< my_n; ++i) {
755 w[i]=degree_ST[i]+random[i];
756 if (degree_ST[i] < 1) {
757 Status[i]=-100; /* F point */
758 } else {
759 Status[i]=1; /* status undefined */
760 }
761 }
762
763 #pragma omp parallel for private(i, iptr)
764 for (i=0; i< n; ++i) {
765 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
766 ST_flag[offset_ST[i]+iptr]=1;
767 }
768 }
769
770
771 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
772 /* printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined); */
773 iter=0;
774 while (numUndefined > 0) {
775 Paso_Coupler_fillOverlap(n, w, w_coupler);
776
777 /* calculate the maximum value of neighbours following active strong connections:
778 w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active */
779 #pragma omp parallel for private(i, iptr)
780 for (i=0; i<my_n; ++i) {
781 if (Status[i]>0) { /* status is still undefined */
782
783 register bool_t inD=TRUE;
784 const double wi=w[i];
785
786 for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
787 const index_t k=S[offset_S[i]+iptr];
788 const index_t* start_p = &ST[offset_ST[k]];
789 const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
790
791 if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
792 if (wi <= w[k] ) {
793 inD=FALSE;
794 break;
795 }
796 }
797
798 }
799
800 if (inD) {
801 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
802 const index_t k=ST[offset_ST[i]+iptr];
803 if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
804 if (wi <= w[k] ) {
805 inD=FALSE;
806 break;
807 }
808 }
809 }
810 }
811 if (inD) {
812 Status[i]=0.; /* is in D */
813 }
814 }
815
816 }
817
818 Paso_Coupler_fillOverlap(n, Status, w_coupler);
819
820
821 /* remove connection to D points :
822
823 for each i in D:
824 for each j in S_i:
825 w[j]--
826 ST_tag[j,i]=-1
827 for each j in ST[i]:
828 ST_tag[i,j]=-1
829 for each k in ST[j]:
830 if k in ST[i]:
831 w[j]--;
832 ST_tag[j,k]=-1
833
834 */
835 /* w is updated for local rows only */
836 {
837 #pragma omp parallel for private(i, jptr)
838 for (i=0; i< my_n; ++i) {
839
840 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
841 const index_t j=ST[offset_ST[i]+jptr];
842 if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
843 w[i]--;
844 ST_flag[offset_ST[i]+jptr]=-1;
845 }
846 }
847
848 }
849 #pragma omp parallel for private(i, jptr)
850 for (i=my_n; i< n; ++i) {
851 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
852 const index_t j = ST[offset_ST[i]+jptr];
853 if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
854 }
855 }
856
857
858 for (i=0; i< n; ++i) {
859 if ( Status[i] == 0. ) {
860
861 const index_t* start_p = &ST[offset_ST[i]];
862
863 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
864 const index_t j=ST[offset_ST[i]+jptr];
865 ST_flag[offset_ST[i]+jptr]=-1;
866 for (kptr=0; kptr<degree_ST[j]; ++kptr) {
867 const index_t k=ST[offset_ST[j]+kptr];
868 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
869 if (ST_flag[offset_ST[j]+kptr] >0) {
870 if (j< my_n ) {
871 w[j]--;
872 }
873 ST_flag[offset_ST[j]+kptr]=-1;
874 }
875 }
876 }
877 }
878 }
879 }
880 }
881
882 /* adjust status */
883 #pragma omp parallel for private(i)
884 for (i=0; i< my_n; ++i) {
885 if ( Status[i] == 0. ) {
886 Status[i] = -10; /* this is now a C point */
887 } else if (Status[i] == 1. && w[i]<1.) {
888 Status[i] = -100; /* this is now a F point */
889 }
890 }
891
892 i = numUndefined;
893 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
894 if (numUndefined == i) {
895 Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
896 return;
897 }
898
899 iter++;
900 /* printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined); */
901
902 } /* end of while loop */
903
904 /* map to output :*/
905 Paso_Coupler_fillOverlap(n, Status, w_coupler);
906 #pragma omp parallel for private(i)
907 for (i=0; i< n; ++i) {
908 if (Status[i] > -50.) {
909 split_marker[i]=PASO_AMG_IN_C;
910 } else {
911 split_marker[i]=PASO_AMG_IN_F;
912 }
913 }
914 /* clean up : */
915 Paso_Coupler_free(w_coupler);
916 TMPMEMFREE(random);
917 TMPMEMFREE(w);
918 TMPMEMFREE(Status);
919 TMPMEMFREE(ST_flag);
920
921 return;
922 }
923
924 /* Merge the system matrix which is distributed on ranks into a complete
925 matrix on rank 0, then solve this matrix on rank 0 only */
926 Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
927 index_t i, iptr, j, n, remote_n, global_n, len, offset, tag;
928 index_t row_block_size, col_block_size, block_size;
929 index_t size=A->mpi_info->size;
930 index_t rank=A->mpi_info->rank;
931 index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
932 index_t *temp_n=NULL, *temp_len=NULL;
933 double *val=NULL;
934 Paso_Pattern *pattern=NULL;
935 Paso_SparseMatrix *out=NULL;
936 #ifdef ESYS_MPI
937 MPI_Request* mpi_requests=NULL;
938 MPI_Status* mpi_stati=NULL;
939 #else
940 int *mpi_requests=NULL, *mpi_stati=NULL;
941 #endif
942
943 if (size == 1) {
944 n = A->mainBlock->numRows;
945 ptr = TMPMEMALLOC(n, index_t);
946 #pragma omp parallel for private(i)
947 for (i=0; i<n; i++) ptr[i] = i;
948 out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
949 TMPMEMFREE(ptr);
950 return out;
951 }
952
953 n = A->mainBlock->numRows;
954 block_size = A->block_size;
955
956 /* Merge MainBlock and CoupleBlock to get a complete column entries
957 for each row allocated to current rank. Output (ptr, idx, val)
958 contains all info needed from current rank to merge a system
959 matrix */
960 Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
961
962 #ifdef ESYS_MPI
963 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
964 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
965 #else
966 mpi_requests=TMPMEMALLOC(size*2,int);
967 mpi_stati=TMPMEMALLOC(size*2,int);
968 #endif
969
970 /* Now, pass all info to rank 0 and merge them into one sparse
971 matrix */
972 if (rank == 0) {
973 /* First, copy local ptr values into ptr_global */
974 global_n=Paso_SystemMatrix_getGlobalNumRows(A);
975 ptr_global = MEMALLOC(global_n+1, index_t);
976 memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
977 iptr = n+1;
978 MEMFREE(ptr);
979 temp_n = TMPMEMALLOC(size, index_t);
980 temp_len = TMPMEMALLOC(size, index_t);
981 temp_n[0] = iptr;
982
983 /* Second, receive ptr values from other ranks */
984 for (i=1; i<size; i++) {
985 remote_n = A->row_distribution->first_component[i+1] -
986 A->row_distribution->first_component[i];
987 #ifdef ESYS_MPI
988 MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
989 A->mpi_info->msg_tag_counter+i,
990 A->mpi_info->comm,
991 &mpi_requests[i]);
992 #endif
993 temp_n[i] = remote_n;
994 iptr += remote_n;
995 }
996 #ifdef ESYS_MPI
997 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
998 #endif
999 A->mpi_info->msg_tag_counter += size;
1000
1001 /* Then, prepare to receive idx and val from other ranks */
1002 len = 0;
1003 offset = -1;
1004 for (i=0; i<size; i++) {
1005 if (temp_n[i] > 0) {
1006 offset += temp_n[i];
1007 len += ptr_global[offset];
1008 temp_len[i] = ptr_global[offset];
1009 }else
1010 temp_len[i] = 0;
1011 }
1012
1013 idx_global = MEMALLOC(len, index_t);
1014 iptr = temp_len[0];
1015 offset = n+1;
1016 for (i=1; i<size; i++) {
1017 len = temp_len[i];
1018 #ifdef ESYS_MPI
1019 MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1020 A->mpi_info->msg_tag_counter+i,
1021 A->mpi_info->comm,
1022 &mpi_requests[i]);
1023 #endif
1024 remote_n = temp_n[i];
1025 for (j=0; j<remote_n; j++) {
1026 ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1027 }
1028 offset += remote_n;
1029 iptr += len;
1030 }
1031 memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1032 MEMFREE(idx);
1033 row_block_size = A->mainBlock->row_block_size;
1034 col_block_size = A->mainBlock->col_block_size;
1035 #ifdef ESYS_MPI
1036 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1037 #endif
1038 A->mpi_info->msg_tag_counter += size;
1039 TMPMEMFREE(temp_n);
1040
1041 /* Then generate the sparse matrix */
1042 pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, global_n,
1043 global_n, ptr_global, idx_global);
1044 out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1045 row_block_size, col_block_size, FALSE);
1046 Paso_Pattern_free(pattern);
1047
1048 /* Finally, receive and copy the value */
1049 iptr = temp_len[0] * block_size;
1050 for (i=1; i<size; i++) {
1051 len = temp_len[i];
1052 #ifdef ESYS_MPI
1053 MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1054 A->mpi_info->msg_tag_counter+i,
1055 A->mpi_info->comm,
1056 &mpi_requests[i]);
1057 #endif
1058 iptr += (len * block_size);
1059 }
1060 memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1061 MEMFREE(val);
1062 #ifdef ESYS_MPI
1063 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1064 #endif
1065 A->mpi_info->msg_tag_counter += size;
1066 TMPMEMFREE(temp_len);
1067 } else { /* it's not rank 0 */
1068
1069 /* First, send out the local ptr */
1070 tag = A->mpi_info->msg_tag_counter+rank;
1071 #ifdef ESYS_MPI
1072 MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1073 &mpi_requests[0]);
1074 #endif
1075
1076 /* Next, send out the local idx */
1077 len = ptr[n];
1078 tag += size;
1079 #ifdef ESYS_MPI
1080 MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1081 &mpi_requests[1]);
1082 #endif
1083
1084 /* At last, send out the local val */
1085 len *= block_size;
1086 tag += size;
1087 #ifdef ESYS_MPI
1088 MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1089 &mpi_requests[2]);
1090
1091 MPI_Waitall(3, mpi_requests, mpi_stati);
1092 #endif
1093 A->mpi_info->msg_tag_counter = tag + size - rank;
1094 MEMFREE(ptr);
1095 MEMFREE(idx);
1096 MEMFREE(val);
1097
1098 out = NULL;
1099 }
1100
1101 TMPMEMFREE(mpi_requests);
1102 TMPMEMFREE(mpi_stati);
1103 return out;
1104 }
1105
1106
1107 void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1108 Paso_SystemMatrix *A = amg->A_C;
1109 Paso_SparseMatrix *A_D, *A_temp;
1110 double* x=NULL;
1111 double* b=NULL;
1112 const index_t rank = A->mpi_info->rank;
1113 const index_t size = A->mpi_info->size;
1114 const n_block = A->mainBlock->row_block_size;
1115
1116 index_t i, n, p;
1117 index_t *counts, *offset, *dist;
1118 #ifdef ESYS_MPI
1119 index_t count;
1120 #endif
1121 A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1122
1123 /* First, gather x and b into rank 0 */
1124 dist = A->pattern->input_distribution->first_component;
1125 n = Paso_SystemMatrix_getGlobalNumRows(A);
1126 b = TMPMEMALLOC(n*n_block, double);
1127 x = TMPMEMALLOC(n*n_block, double);
1128 counts = TMPMEMALLOC(size, index_t);
1129 offset = TMPMEMALLOC(size, index_t);
1130
1131 #pragma omp parallel for private(i,p)
1132 for (i=0; i<size; i++) {
1133 p = dist[i];
1134 counts[i] = (dist[i+1] - p)*n_block;
1135 offset[i] = p*n_block;
1136 }
1137 #ifdef ESYS_MPI
1138 count = counts[rank];
1139 MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1140 MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1141 #endif
1142
1143 if (rank == 0) {
1144 /* solve locally */
1145 #ifdef MKL
1146 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1147 A_temp->solver_package = PASO_MKL;
1148 Paso_SparseMatrix_free(A_D);
1149 Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1150 Paso_SparseMatrix_free(A_temp);
1151 #else
1152 #ifdef UMFPACK
1153 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1154 A_temp->solver_package = PASO_UMFPACK;
1155 Paso_SparseMatrix_free(A_D);
1156 Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1157 Paso_SparseMatrix_free(A_temp);
1158 #else
1159 A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1160 A_D->solver_package = PASO_SMOOTHER;
1161 Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1162 Paso_SparseMatrix_free(A_D);
1163 #endif
1164 #endif
1165 }
1166
1167 #ifdef ESYS_MPI
1168 /* now we need to distribute the solution to all ranks */
1169 MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1170 #endif
1171
1172 TMPMEMFREE(x);
1173 TMPMEMFREE(b);
1174 TMPMEMFREE(counts);
1175 TMPMEMFREE(offset);
1176 }

  ViewVC Help
Powered by ViewVC 1.1.26