/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Contents of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3884 - (show annotations)
Wed Apr 4 04:55:43 2012 UTC (7 years, 5 months ago) by gross
File MIME type: text/plain
File size: 37478 byte(s)
some fix in AMG
1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: AMG preconditioner (local version) */
18
19 /**************************************************************/
20
21 /* Author: artak@uq.edu.au, l.gross@uq.edu.au */
22
23 /**************************************************************/
24
25 #define SHOW_TIMING FALSE
26 #define MY_DEBUG 0
27 #define MY_DEBUG1 1
28
29 #include "Paso.h"
30 #include "Preconditioner.h"
31 #include "Options.h"
32 #include "PasoUtil.h"
33 #include "UMFPACK.h"
34 #include "MKL.h"
35 #include<stdio.h>
36
37
38 /**************************************************************/
39
40 /* free all memory used by AMG */
41
42 void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43 if (in!=NULL) {
44 Paso_Preconditioner_Smoother_free(in->Smoother);
45 Paso_SystemMatrix_free(in->P);
46 Paso_SystemMatrix_free(in->R);
47 Paso_SystemMatrix_free(in->A_C);
48 Paso_Preconditioner_AMG_free(in->AMG_C);
49 MEMFREE(in->r);
50 MEMFREE(in->x_C);
51 MEMFREE(in->b_C);
52
53 MEMFREE(in);
54 }
55 }
56
57 index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
58 if (in->AMG_C == NULL) {
59 return in->level;
60 } else {
61 return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
62 }
63 }
64 double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
65 if (in->AMG_C == NULL) {
66 if (in->A_C == NULL) {
67 return 1.;
68 } else {
69 return Paso_SystemMatrix_getSparsity(in->A_C);
70 }
71 } else {
72 return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
73 }
74 }
75 dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
76 if (in->AMG_C == NULL) {
77 if (in->A_C == NULL) {
78 return 0;
79 } else {
80 return Paso_SystemMatrix_getTotalNumRows(in->A_C);
81 }
82 } else {
83 return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
84 }
85 }
86 /*****************************************************************
87
88 constructs AMG
89
90 ******************************************************************/
91 Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92
93 Paso_Preconditioner_AMG* out=NULL;
94 Paso_SystemMatrix *A_C=NULL;
95 bool_t verbose=options->verbose;
96
97 const dim_t my_n=A_p->mainBlock->numRows;
98 const dim_t overlap_n=A_p->row_coupleBlock->numRows;
99
100 const dim_t n = my_n + overlap_n;
101
102 const dim_t n_block=A_p->row_block_size;
103 index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
104 dim_t i, n_F, n_C, F_flag, *F_set=NULL, total_n_C=0, total_n_F=0;
105 double time0=0;
106 const double theta = options->coarsening_threshold;
107 const double tau = options->diagonal_dominance_threshold;
108 const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
109 const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);
110
111
112 /*
113 is the input matrix A suitable for coarsening?
114
115 */
116 if ( (sparsity >= options->min_coarse_sparsity) ||
117 (total_n <= options->min_coarse_matrix_size) ||
118 (level > options->level_max) ) {
119
120 if (verbose) {
121 /*
122 print stopping condition:
123 - 'SPAR' = min_coarse_matrix_sparsity exceeded
124 - 'SIZE' = min_coarse_matrix_size exceeded
125 - 'LEVEL' = level_max exceeded
126 */
127 printf("Paso_Preconditioner: AMG: termination of coarsening by ");
128
129 if (sparsity >= options->min_coarse_sparsity)
130 printf("SPAR");
131
132 if (total_n <= options->min_coarse_matrix_size)
133 printf("SIZE");
134
135 if (level > options->level_max)
136 printf("LEVEL");
137
138 printf("\n");
139
140 printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
141 level, options->level_max, sparsity, options->min_coarse_sparsity, total_n, options->min_coarse_matrix_size);
142
143 }
144
145 return NULL;
146 } else {
147 /* Start Coarsening : */
148
149 /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
150 const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
151
152 dim_t* degree_S=TMPMEMALLOC(n, dim_t);
153 index_t *offset_S=TMPMEMALLOC(n, index_t);
154 index_t *S=TMPMEMALLOC(len_S, index_t);
155 dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
156 index_t *offset_ST=TMPMEMALLOC(n, index_t);
157 index_t *ST=TMPMEMALLOC(len_S, index_t);
158
159
160 F_marker=TMPMEMALLOC(n,index_t);
161 counter=TMPMEMALLOC(n,index_t);
162
163 if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
164 || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
165 /*
166 make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
167 */
168 Paso_SystemMatrix_copyColCoupleBlock(A_p);
169 Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
170
171 /*
172 set splitting of unknows:
173
174 */
175 time0=Esys_timer();
176 if (n_block>1) {
177 Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
178 } else {
179 Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
180 }
181 Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
182 /* Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
183 */
184
185 Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
186 degree_S, offset_S, S, degree_ST, offset_ST, ST,
187 A_p->col_coupler->connector,A_p->col_distribution);
188
189
190 /* in BoomerAMG if interpolation is used FF connectivity is required */
191 /*MPI:
192 if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
193 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);
194 */
195
196 options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
197 if (Esys_noError() ) {
198 #pragma omp parallel for private(i) schedule(static)
199 for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F);
200
201 /*
202 count number of unkowns to be eliminated:
203 */
204 n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
205 /* collect n_F values on all processes, a direct solver should
206 be used if any n_F value is 0 */
207 F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
208 #ifdef ESYS_MPI
209 MPI_Allgather(&n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
210 #endif
211 total_n_F=0;
212 F_flag = 1;
213 for (i=0; i<A_p->mpi_info->size; i++) {
214 total_n_F+=F_set[i];
215 if (F_set[i] == 0) {
216 F_flag = 0;
217 break;
218 }
219 }
220 TMPMEMFREE(F_set);
221
222 n_C=n-n_F;
223 total_n_C=total_n-total_n_F;
224 if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,total_n_F,total_n_C);
225
226
227 /* if ( n_F == 0 ) { is a nasty case. a direct solver should be used, return NULL */
228 if (F_flag == 0) {
229 out = NULL;
230 } else {
231 out=MEMALLOC(1,Paso_Preconditioner_AMG);
232 if (! Esys_checkPtr(out)) {
233 out->level = level;
234 out->n = n;
235 out->n_F = n_F;
236 out->n_block = n_block;
237 out->A_C = NULL;
238 out->P = NULL;
239 out->R = NULL;
240 out->post_sweeps = options->post_sweeps;
241 out->pre_sweeps = options->pre_sweeps;
242 out->r = NULL;
243 out->x_C = NULL;
244 out->b_C = NULL;
245 out->AMG_C = NULL;
246 out->Smoother=NULL;
247 }
248 mask_C=TMPMEMALLOC(n,index_t);
249 rows_in_F=TMPMEMALLOC(n_F,index_t);
250 Esys_checkPtr(mask_C);
251 Esys_checkPtr(rows_in_F);
252 if ( Esys_noError() ) {
253
254 out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
255
256 if (total_n_C != 0) {
257 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
258
259 /* allocate helpers :*/
260 out->x_C=MEMALLOC(n_block*n_C,double);
261 out->b_C=MEMALLOC(n_block*n_C,double);
262 out->r=MEMALLOC(n_block*n,double);
263
264 Esys_checkPtr(out->r);
265 Esys_checkPtr(out->x_C);
266 Esys_checkPtr(out->b_C);
267
268 if ( Esys_noError() ) {
269 /* creates index for F:*/
270 #pragma omp parallel private(i)
271 {
272 #pragma omp for schedule(static)
273 for (i = 0; i < n; ++i) {
274 if (F_marker[i]) rows_in_F[counter[i]]=i;
275 }
276 }
277 /* create mask of C nodes with value >-1, gives new id */
278 i=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
279 /*
280 get Prolongation :
281 */
282
283 time0=Esys_timer();
284
285 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
286
287 }
288
289 /*
290 construct Restriction operator as transposed of Prolongation operator:
291 */
292
293 if ( Esys_noError()) {
294 time0=Esys_timer();
295
296 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
297
298 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
299 }
300 /*
301 construct coarse level matrix:
302 */
303 if ( Esys_noError()) {
304 time0=Esys_timer();
305
306 A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
307
308 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
309 }
310
311 /*
312 constructe courser level:
313
314 */
315 if ( Esys_noError()) {
316 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
317 }
318
319 if ( Esys_noError()) {
320 if ( out->AMG_C == NULL ) {
321 /* merge the system matrix into 1 rank when
322 it's not suitable coarsening due to the
323 total number of unknowns are too small */
324 out->A_C=A_C;
325 out->reordering = options->reordering;
326 out->refinements = options->coarse_matrix_refinements;
327 out->verbose = verbose;
328 out->options_smoother = options->smoother;
329 } else {
330 /* finally we set some helpers for the solver step */
331 out->A_C=A_C;
332 }
333 }
334 }
335 }
336 TMPMEMFREE(mask_C);
337 TMPMEMFREE(rows_in_F);
338 }
339 }
340
341 }
342 TMPMEMFREE(counter);
343 TMPMEMFREE(F_marker);
344 TMPMEMFREE(degree_S);
345 TMPMEMFREE(offset_S);
346 TMPMEMFREE(S);
347 TMPMEMFREE(degree_ST);
348 TMPMEMFREE(offset_ST);
349 TMPMEMFREE(ST);
350
351 }
352
353 if (Esys_noError()) {
354 return out;
355 } else {
356 Paso_Preconditioner_AMG_free(out);
357 return NULL;
358 }
359 }
360
361
362 void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
363 const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
364 double time0=0;
365 const dim_t post_sweeps=amg->post_sweeps;
366 const dim_t pre_sweeps=amg->pre_sweeps;
367
368 /* presmoothing */
369 time0=Esys_timer();
370 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
371
372 time0=Esys_timer()-time0;
373 if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
374 /* end of presmoothing */
375
376 if (amg->n_F < amg->n) { /* is there work on the coarse level? */
377 time0=Esys_timer();
378
379 Paso_Copy(n, amg->r, b); /* r <- b */
380 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
381 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */
382
383 time0=Esys_timer()-time0;
384 /* coarse level solve */
385 if ( amg->AMG_C == NULL) {
386 time0=Esys_timer();
387 /* A_C is the coarsest level */
388 Paso_Preconditioner_AMG_mergeSolve(amg);
389
390 if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
391 } else {
392 Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */
393 }
394
395 time0=time0+Esys_timer();
396 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */
397
398 /*postsmoothing*/
399
400 /*solve Ax=b with initial guess x */
401 time0=Esys_timer();
402 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
403 time0=Esys_timer()-time0;
404 if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
405 /*end of postsmoothing*/
406 }
407
408 return;
409 }
410
411 /* theta = threshold for strong connections */
412 /* tau = threshold for diagonal dominance */
413
414 /*S_i={j \in N_i; i strongly coupled to j}
415
416 in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
417 */
418
419 void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
420 dim_t *degree_S, index_t *offset_S, index_t *S,
421 const double theta, const double tau)
422 {
423
424 const dim_t my_n=A->mainBlock->numRows;
425 const dim_t overlap_n=A->row_coupleBlock->numRows;
426
427 index_t iptr, i;
428 double *threshold_p=NULL;
429
430 threshold_p = TMPMEMALLOC(2*my_n, double);
431
432 #pragma omp parallel for private(i,iptr) schedule(static)
433 for (i=0;i<my_n;++i) {
434
435 register double max_offdiagonal = 0.;
436 register double sum_row=0;
437 register double main_row=0;
438 register dim_t kdeg=0;
439 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
440
441
442 /* collect information for row i: */
443 #pragma ivdep
444 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
445 register index_t j=A->mainBlock->pattern->index[iptr];
446 register double fnorm=ABS(A->mainBlock->val[iptr]);
447 if( j != i) {
448 max_offdiagonal = MAX(max_offdiagonal,fnorm);
449 sum_row+=fnorm;
450 } else {
451 main_row=fnorm;
452 }
453
454 }
455
456 #pragma ivdep
457 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
458 register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
459
460 max_offdiagonal = MAX(max_offdiagonal,fnorm);
461 sum_row+=fnorm;
462 }
463
464 /* inspect row i: */
465 {
466 const double threshold = theta*max_offdiagonal;
467 threshold_p[2*i+1]=threshold;
468 if (tau*main_row < sum_row) { /* no diagonal dominance */
469 threshold_p[2*i]=1;
470 #pragma ivdep
471 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
472 register index_t j=A->mainBlock->pattern->index[iptr];
473 if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
474 S[koffset+kdeg] = j;
475 kdeg++;
476 }
477 }
478 #pragma ivdep
479 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
480 register index_t j=A->col_coupleBlock->pattern->index[iptr];
481 if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
482 S[koffset+kdeg] = j + my_n;
483 kdeg++;
484 }
485 }
486 } else {
487 threshold_p[2*i]=-1;
488 }
489 }
490 offset_S[i]=koffset;
491 degree_S[i]=kdeg;
492 }
493
494 /* now we need to distribute the threshold and the diagonal dominance indicator */
495 if (A->mpi_info->size > 1) {
496
497 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
498 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
499
500 double *remote_threshold=NULL;
501
502 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
503 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
504 Paso_Coupler_finishCollect(threshold_coupler);
505 remote_threshold=threshold_coupler->recv_buffer;
506
507 #pragma omp parallel for private(i,iptr) schedule(static)
508 for (i=0; i<overlap_n; i++) {
509 const double threshold = remote_threshold[2*i+1];
510 register dim_t kdeg=0;
511 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
512 if (remote_threshold[2*i]>0) {
513 #pragma ivdep
514 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
515 register index_t j=A->row_coupleBlock->pattern->index[iptr];
516 if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
517 S[koffset+kdeg] = j ;
518 kdeg++;
519 }
520 }
521
522 #pragma ivdep
523 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
524 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
525 if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
526 S[koffset+kdeg] = j + my_n;
527 kdeg++;
528 }
529 }
530 }
531 offset_S[i+my_n]=koffset;
532 degree_S[i+my_n]=kdeg;
533 }
534
535 Paso_Coupler_free(threshold_coupler);
536 }
537 TMPMEMFREE(threshold_p);
538 }
539
540 /* theta = threshold for strong connections */
541 /* tau = threshold for diagonal dominance */
542 /*S_i={j \in N_i; i strongly coupled to j}
543
544 in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
545 */
546 void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
547 dim_t *degree_S, index_t *offset_S, index_t *S,
548 const double theta, const double tau)
549
550 {
551 const dim_t block_size=A->block_size;
552 const dim_t my_n=A->mainBlock->numRows;
553 const dim_t overlap_n=A->row_coupleBlock->numRows;
554
555 index_t iptr, i, bi;
556 double *threshold_p=NULL;
557
558
559 threshold_p = TMPMEMALLOC(2*my_n, double);
560
561 #pragma omp parallel private(i,iptr,bi)
562 {
563
564 dim_t max_deg=0;
565 double *rtmp=NULL;
566
567 #pragma omp for schedule(static)
568 for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
569 +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
570
571 rtmp=TMPMEMALLOC(max_deg, double);
572
573 #pragma omp for schedule(static)
574 for (i=0;i<my_n;++i) {
575 register double max_offdiagonal = 0.;
576 register double sum_row=0;
577 register double main_row=0;
578 register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
579 register dim_t kdeg=0;
580 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
581
582 /* collect information for row i: */
583 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
584 register index_t j=A->mainBlock->pattern->index[iptr];
585 register double fnorm=0;
586 #pragma ivdep
587 for(bi=0;bi<block_size;++bi) {
588 register double rtmp2= A->mainBlock->val[iptr*block_size+bi];
589 fnorm+=rtmp2*rtmp2;
590 }
591 fnorm=sqrt(fnorm);
592 rtmp[iptr+rtmp_offset]=fnorm;
593
594 if( j != i) {
595 max_offdiagonal = MAX(max_offdiagonal,fnorm);
596 sum_row+=fnorm;
597 } else {
598 main_row=fnorm;
599 }
600
601 }
602
603 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
604 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
605 register double fnorm=0;
606 #pragma ivdep
607 for(bi=0;bi<block_size;++bi) {
608 register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
609 fnorm+=rtmp2*rtmp2;
610 }
611 fnorm=sqrt(fnorm);
612
613 rtmp[iptr+rtmp_offset]=fnorm;
614 max_offdiagonal = MAX(max_offdiagonal,fnorm);
615 sum_row+=fnorm;
616 }
617
618
619 /* inspect row i: */
620 {
621 const double threshold = theta*max_offdiagonal;
622 rtmp_offset=-A->mainBlock->pattern->ptr[i];
623
624 threshold_p[2*i+1]=threshold;
625 if (tau*main_row < sum_row) { /* no diagonal dominance */
626 threshold_p[2*i]=1;
627 #pragma ivdep
628 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
629 register index_t j=A->mainBlock->pattern->index[iptr];
630 if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
631 S[koffset+kdeg] = j;
632 kdeg++;
633 }
634 }
635 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
636 #pragma ivdep
637 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
638 register index_t j=A->col_coupleBlock->pattern->index[iptr];
639 if( rtmp[iptr+rtmp_offset] >threshold) {
640 S[koffset+kdeg] = j + my_n;
641 kdeg++;
642 }
643 }
644 } else {
645 threshold_p[2*i]=-1;
646 }
647 }
648 offset_S[i]=koffset;
649 degree_S[i]=kdeg;
650 }
651 TMPMEMFREE(rtmp);
652 }
653 /* now we need to distribute the threshold and the diagonal dominance indicator */
654 if (A->mpi_info->size > 1) {
655
656 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
657 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
658
659 double *remote_threshold=NULL;
660
661 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
662 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
663 Paso_Coupler_finishCollect(threshold_coupler);
664 remote_threshold=threshold_coupler->recv_buffer;
665
666 #pragma omp parallel for private(i,iptr) schedule(static)
667 for (i=0; i<overlap_n; i++) {
668
669 const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
670 register dim_t kdeg=0;
671 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
672 if (remote_threshold[2*i]>0) {
673 #pragma ivdep
674 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
675 register index_t j=A->row_coupleBlock->pattern->index[iptr];
676 register double fnorm2=0;
677 #pragma ivdepremote_threshold[2*i]
678 for(bi=0;bi<block_size;++bi) {
679 register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
680 fnorm2+=rtmp2*rtmp2;
681 }
682
683 if(fnorm2 > threshold2 ) {
684 S[koffset+kdeg] = j ;
685 kdeg++;
686 }
687 }
688
689 #pragma ivdep
690 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
691 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
692 register double fnorm2=0;
693 #pragma ivdepremote_threshold[2*i]
694 for(bi=0;bi<block_size;++bi) {
695 register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
696 fnorm2+=rtmp2*rtmp2;
697 }
698 if(fnorm2 > threshold2 && i != j) {
699 S[koffset+kdeg] = j + my_n;
700 kdeg++;
701 }
702 }
703
704 }
705 offset_S[i+my_n]=koffset;
706 degree_S[i+my_n]=kdeg;
707 }
708 Paso_Coupler_free(threshold_coupler);
709 }
710 TMPMEMFREE(threshold_p);
711 }
712
713 void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
714 const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
715 {
716 index_t i, j;
717 dim_t p;
718 dim_t len=0;
719 #pragma omp parallel for private(i) schedule(static)
720 for (i=0; i<nT ;++i) {
721 degree_ST[i]=0;
722 }
723 for (i=0; i<n ;++i) {
724 for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
725 }
726 for (i=0; i<nT ;++i) {
727 offset_ST[i]=len;
728 len+=degree_ST[i];
729 degree_ST[i]=0;
730 }
731 for (i=0; i<n ;++i) {
732 for (p=0; p<degree_S[i]; ++p) {
733 j=S[offset_S[i]+p];
734 ST[offset_ST[j]+degree_ST[j]]=i;
735 degree_ST[j]++;
736 }
737 }
738 }
739
/**
 * Three-way comparison of two ints, usable as a qsort/bsearch callback.
 *
 * @param a  points to the first int
 * @param b  points to the second int
 * @return   a negative value, zero or a positive value when *a is less
 *           than, equal to or greater than *b
 *
 * The result is computed from two comparisons rather than a subtraction:
 * *a - *b overflows (undefined behaviour) when the operands are far apart,
 * e.g. INT_MAX and -1.
 */
int compareindex(const void *a, const void *b)
{
    const int lhs = *(const int *)a;
    const int rhs = *(const int *)b;

    return (lhs > rhs) - (lhs < rhs);
}
744
745 void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
746 const dim_t* degree_S, const index_t* offset_S, const index_t* S,
747 const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
748 Paso_Connector* col_connector, Paso_Distribution* col_dist)
749 {
750 dim_t i, numUndefined, iter=0;
751 index_t iptr, jptr, kptr;
752 double *random=NULL, *w=NULL, *Status=NULL;
753 index_t * ST_flag=NULL;
754
755 Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector ,1);
756
757 w=TMPMEMALLOC(n, double);
758 Status=TMPMEMALLOC(n, double);
759 random = Paso_Distribution_createRandomVector(col_dist,1);
760 ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
761
762 #pragma omp parallel for private(i)
763 for (i=0; i< my_n; ++i) {
764 w[i]=degree_ST[i]+random[i];
765 if (degree_ST[i] < 1) {
766 Status[i]=-100; /* F point */
767 } else {
768 Status[i]=1; /* status undefined */
769 }
770 }
771
772 #pragma omp parallel for private(i, iptr)
773 for (i=0; i< n; ++i) {
774 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
775 ST_flag[offset_ST[i]+iptr]=1;
776 }
777 }
778
779
780 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
781 /* printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined); */
782 iter=0;
783 while (numUndefined > 0) {
784 Paso_Coupler_fillOverlap(n, w, w_coupler);
785
786 /* calculate the maximum value of neighbours following active strong connections:
787 w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active */
788 #pragma omp parallel for private(i, iptr)
789 for (i=0; i<my_n; ++i) {
790 if (Status[i]>0) { /* status is still undefined */
791
792 register bool_t inD=TRUE;
793 const double wi=w[i];
794
795 for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
796 const index_t k=S[offset_S[i]+iptr];
797 const index_t* start_p = &ST[offset_ST[k]];
798 const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
799
800 if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
801 if (wi <= w[k] ) {
802 inD=FALSE;
803 break;
804 }
805 }
806
807 }
808
809 if (inD) {
810 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
811 const index_t k=ST[offset_ST[i]+iptr];
812 if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
813 if (wi <= w[k] ) {
814 inD=FALSE;
815 break;
816 }
817 }
818 }
819 }
820 if (inD) {
821 Status[i]=0.; /* is in D */
822 }
823 }
824
825 }
826
827 Paso_Coupler_fillOverlap(n, Status, w_coupler);
828
829
830 /* remove connection to D points :
831
832 for each i in D:
833 for each j in S_i:
834 w[j]--
835 ST_tag[j,i]=-1
836 for each j in ST[i]:
837 ST_tag[i,j]=-1
838 for each k in ST[j]:
839 if k in ST[i]:
840 w[j]--;
841 ST_tag[j,k]=-1
842
843 */
844 /* w is updated for local rows only */
845 {
846 #pragma omp parallel for private(i, jptr)
847 for (i=0; i< my_n; ++i) {
848
849 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
850 const index_t j=ST[offset_ST[i]+jptr];
851 if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
852 w[i]--;
853 ST_flag[offset_ST[i]+jptr]=-1;
854 }
855 }
856
857 }
858 #pragma omp parallel for private(i, jptr)
859 for (i=my_n; i< n; ++i) {
860 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
861 const index_t j = ST[offset_ST[i]+jptr];
862 if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
863 }
864 }
865
866
867 for (i=0; i< n; ++i) {
868 if ( Status[i] == 0. ) {
869
870 const index_t* start_p = &ST[offset_ST[i]];
871
872 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
873 const index_t j=ST[offset_ST[i]+jptr];
874 ST_flag[offset_ST[i]+jptr]=-1;
875 for (kptr=0; kptr<degree_ST[j]; ++kptr) {
876 const index_t k=ST[offset_ST[j]+kptr];
877 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
878 if (ST_flag[offset_ST[j]+kptr] >0) {
879 if (j< my_n ) {
880 w[j]--;
881 }
882 ST_flag[offset_ST[j]+kptr]=-1;
883 }
884 }
885 }
886 }
887 }
888 }
889 }
890
891 /* adjust status */
892 #pragma omp parallel for private(i)
893 for (i=0; i< my_n; ++i) {
894 if ( Status[i] == 0. ) {
895 Status[i] = -10; /* this is now a C point */
896 } else if (Status[i] == 1. && w[i]<1.) {
897 Status[i] = -100; /* this is now a F point */
898 }
899 }
900
901 i = numUndefined;
902 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
903 if (numUndefined == i) {
904 Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
905 return;
906 }
907
908 iter++;
909 /* printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined); */
910
911 } /* end of while loop */
912
913 /* map to output :*/
914 Paso_Coupler_fillOverlap(n, Status, w_coupler);
915 #pragma omp parallel for private(i)
916 for (i=0; i< n; ++i) {
917 if (Status[i] > -50.) {
918 split_marker[i]=PASO_AMG_IN_C;
919 } else {
920 split_marker[i]=PASO_AMG_IN_F;
921 }
922 }
923 /* clean up : */
924 Paso_Coupler_free(w_coupler);
925 TMPMEMFREE(random);
926 TMPMEMFREE(w);
927 TMPMEMFREE(Status);
928 TMPMEMFREE(ST_flag);
929
930 return;
931 }
932
933 /* Merge the system matrix which is distributed on ranks into a complete
934 matrix on rank 0, then solve this matrix on rank 0 only */
935 Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
936 index_t i, iptr, j, n, remote_n, total_n, len, offset, tag;
937 index_t row_block_size, col_block_size, block_size;
938 index_t size=A->mpi_info->size;
939 index_t rank=A->mpi_info->rank;
940 index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
941 index_t *temp_n=NULL, *temp_len=NULL;
942 double *val=NULL;
943 Paso_Pattern *pattern=NULL;
944 Paso_SparseMatrix *out=NULL;
945 #ifdef ESYS_MPI
946 MPI_Request* mpi_requests=NULL;
947 MPI_Status* mpi_stati=NULL;
948 #else
949 int *mpi_requests=NULL, *mpi_stati=NULL;
950 #endif
951
952 if (size == 1) {
953 n = A->mainBlock->numRows;
954 ptr = TMPMEMALLOC(n, index_t);
955 #pragma omp parallel for private(i)
956 for (i=0; i<n; i++) ptr[i] = i;
957 out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
958 TMPMEMFREE(ptr);
959 return out;
960 }
961
962 n = A->mainBlock->numRows;
963 block_size = A->block_size;
964
965 /* Merge MainBlock and CoupleBlock to get a complete column entries
966 for each row allocated to current rank. Output (ptr, idx, val)
967 contains all info needed from current rank to merge a system
968 matrix */
969 Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
970
971 #ifdef ESYS_MPI
972 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
973 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
974 #else
975 mpi_requests=TMPMEMALLOC(size*2,int);
976 mpi_stati=TMPMEMALLOC(size*2,int);
977 #endif
978
979 /* Now, pass all info to rank 0 and merge them into one sparse
980 matrix */
981 if (rank == 0) {
982 /* First, copy local ptr values into ptr_global */
983 total_n=Paso_SystemMatrix_getGlobalNumRows(A);
984 ptr_global = MEMALLOC(total_n+1, index_t);
985 memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
986 iptr = n+1;
987 MEMFREE(ptr);
988 temp_n = TMPMEMALLOC(size, index_t);
989 temp_len = TMPMEMALLOC(size, index_t);
990 temp_n[0] = iptr;
991
992 /* Second, receive ptr values from other ranks */
993 for (i=1; i<size; i++) {
994 remote_n = A->row_distribution->first_component[i+1] -
995 A->row_distribution->first_component[i];
996 #ifdef ESYS_MPI
997 MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
998 A->mpi_info->msg_tag_counter+i,
999 A->mpi_info->comm,
1000 &mpi_requests[i]);
1001 #endif
1002 temp_n[i] = remote_n;
1003 iptr += remote_n;
1004 }
1005 #ifdef ESYS_MPI
1006 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1007 #endif
1008 A->mpi_info->msg_tag_counter += size;
1009
1010 /* Then, prepare to receive idx and val from other ranks */
1011 len = 0;
1012 offset = -1;
1013 for (i=0; i<size; i++) {
1014 if (temp_n[i] > 0) {
1015 offset += temp_n[i];
1016 len += ptr_global[offset];
1017 temp_len[i] = ptr_global[offset];
1018 }else
1019 temp_len[i] = 0;
1020 }
1021
1022 idx_global = MEMALLOC(len, index_t);
1023 iptr = temp_len[0];
1024 offset = n+1;
1025 for (i=1; i<size; i++) {
1026 len = temp_len[i];
1027 #ifdef ESYS_MPI
1028 MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1029 A->mpi_info->msg_tag_counter+i,
1030 A->mpi_info->comm,
1031 &mpi_requests[i]);
1032 #endif
1033 remote_n = temp_n[i];
1034 for (j=0; j<remote_n; j++) {
1035 ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1036 }
1037 offset += remote_n;
1038 iptr += len;
1039 }
1040 memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1041 MEMFREE(idx);
1042 row_block_size = A->mainBlock->row_block_size;
1043 col_block_size = A->mainBlock->col_block_size;
1044 #ifdef ESYS_MPI
1045 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1046 #endif
1047 A->mpi_info->msg_tag_counter += size;
1048 TMPMEMFREE(temp_n);
1049
1050 /* Then generate the sparse matrix */
1051 pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, total_n,
1052 total_n, ptr_global, idx_global);
1053 out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1054 row_block_size, col_block_size, FALSE);
1055 Paso_Pattern_free(pattern);
1056
1057 /* Finally, receive and copy the value */
1058 iptr = temp_len[0] * block_size;
1059 for (i=1; i<size; i++) {
1060 len = temp_len[i];
1061 #ifdef ESYS_MPI
1062 MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1063 A->mpi_info->msg_tag_counter+i,
1064 A->mpi_info->comm,
1065 &mpi_requests[i]);
1066 #endif
1067 iptr += (len * block_size);
1068 }
1069 memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1070 MEMFREE(val);
1071 #ifdef ESYS_MPI
1072 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1073 #endif
1074 A->mpi_info->msg_tag_counter += size;
1075 TMPMEMFREE(temp_len);
1076 } else { /* it's not rank 0 */
1077
1078 /* First, send out the local ptr */
1079 tag = A->mpi_info->msg_tag_counter+rank;
1080 #ifdef ESYS_MPI
1081 MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1082 &mpi_requests[0]);
1083 #endif
1084
1085 /* Next, send out the local idx */
1086 len = ptr[n];
1087 tag += size;
1088 #ifdef ESYS_MPI
1089 MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1090 &mpi_requests[1]);
1091 #endif
1092
1093 /* At last, send out the local val */
1094 len *= block_size;
1095 tag += size;
1096 #ifdef ESYS_MPI
1097 MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1098 &mpi_requests[2]);
1099
1100 MPI_Waitall(3, mpi_requests, mpi_stati);
1101 #endif
1102 A->mpi_info->msg_tag_counter = tag + size - rank;
1103 MEMFREE(ptr);
1104 MEMFREE(idx);
1105 MEMFREE(val);
1106
1107 out = NULL;
1108 }
1109
1110 TMPMEMFREE(mpi_requests);
1111 TMPMEMFREE(mpi_stati);
1112 return out;
1113 }
1114
1115
1116 void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1117 Paso_SystemMatrix *A = amg->A_C;
1118 Paso_SparseMatrix *A_D, *A_temp;
1119 double* x=NULL;
1120 double* b=NULL;
1121 index_t rank = A->mpi_info->rank;
1122 index_t size = A->mpi_info->size;
1123 index_t i, n, p, n_block;
1124 index_t *counts, *offset, *dist;
1125 #ifdef ESYS_MPI
1126 index_t count;
1127 #endif
1128 n_block = amg->n_block;
1129 A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1130
1131 /* First, gather x and b into rank 0 */
1132 dist = A->pattern->input_distribution->first_component;
1133 n = Paso_SystemMatrix_getGlobalNumRows(A);
1134 b = TMPMEMALLOC(n*n_block, double);
1135 x = TMPMEMALLOC(n*n_block, double);
1136 counts = TMPMEMALLOC(size, index_t);
1137 offset = TMPMEMALLOC(size, index_t);
1138
1139 #pragma omp parallel for private(i,p)
1140 for (i=0; i<size; i++) {
1141 p = dist[i];
1142 counts[i] = (dist[i+1] - p)*n_block;
1143 offset[i] = p*n_block;
1144 }
1145 #ifdef ESYS_MPI
1146 count = counts[rank];
1147 MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1148 MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1149 #endif
1150
1151 if (rank == 0) {
1152 /* solve locally */
1153 #ifdef MKL
1154 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1155 A_temp->solver_package = PASO_MKL;
1156 Paso_SparseMatrix_free(A_D);
1157 Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1158 Paso_SparseMatrix_free(A_temp);
1159 #else
1160 #ifdef UMFPACK
1161 A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1162 A_temp->solver_package = PASO_UMFPACK;
1163 Paso_SparseMatrix_free(A_D);
1164 Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1165 Paso_SparseMatrix_free(A_temp);
1166 #else
1167 A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1168 A_D->solver_package = PASO_SMOOTHER;
1169 Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1170 Paso_SparseMatrix_free(A_D);
1171 #endif
1172 #endif
1173 }
1174
1175 #ifdef ESYS_MPI
1176 /* now we need to distribute the solution to all ranks */
1177 MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1178 #endif
1179
1180 TMPMEMFREE(x);
1181 TMPMEMFREE(b);
1182 TMPMEMFREE(counts);
1183 TMPMEMFREE(offset);
1184 }

  ViewVC Help
Powered by ViewVC 1.1.26