/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Annotation of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3827 - (hide annotations)
Tue Feb 14 11:42:08 2012 UTC (7 years, 7 months ago) by lgao
File MIME type: text/plain
File size: 37238 byte(s)
 Merge branch "amg_from_3530" back into trunk.


1 gross 3193
2     /*******************************************************
3     *
4     * Copyright (c) 2003-2010 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13    
14    
15     /**************************************************************/
16    
17 gross 3441 /* Paso: AMG preconditioner (local version) */
18 gross 3193
19     /**************************************************************/
20    
21 caltinay 3642 /* Author: artak@uq.edu.au, l.gross@uq.edu.au */
22 gross 3193
23     /**************************************************************/
24    
25 gross 3283 #define SHOW_TIMING FALSE
26 lgao 3827 #define MY_DEBUG 0
27     #define MY_DEBUG1 1
28 gross 3283
29 gross 3193 #include "Paso.h"
30     #include "Preconditioner.h"
31     #include "Options.h"
32     #include "PasoUtil.h"
33     #include "UMFPACK.h"
34     #include "MKL.h"
35 gross 3458 #include<stdio.h>
36 gross 3193
37 gross 3458
38 gross 3193 /**************************************************************/
39    
40     /* free all memory used by AMG */
41    
42 gross 3441 void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43 gross 3193 if (in!=NULL) {
44 gross 3441 Paso_Preconditioner_Smoother_free(in->Smoother);
45     Paso_SystemMatrix_free(in->P);
46     Paso_SystemMatrix_free(in->R);
47     Paso_SystemMatrix_free(in->A_C);
48     Paso_Preconditioner_AMG_free(in->AMG_C);
49 gross 3283 MEMFREE(in->r);
50     MEMFREE(in->x_C);
51     MEMFREE(in->b_C);
52 gross 3193
53 gross 3283 MEMFREE(in);
54 gross 3193 }
55     }
56    
57 gross 3441 index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
58 gross 3402 if (in->AMG_C == NULL) {
59     return in->level;
60     } else {
61 gross 3441 return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
62 gross 3402 }
63     }
64 gross 3441 double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
65 gross 3403 if (in->AMG_C == NULL) {
66     if (in->A_C == NULL) {
67     return 1.;
68     } else {
69 gross 3442 return Paso_SystemMatrix_getSparsity(in->A_C);
70 gross 3403 }
71     } else {
72 gross 3441 return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
73 gross 3403 }
74 gross 3402 }
75 gross 3441 dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
76 gross 3402 if (in->AMG_C == NULL) {
77 gross 3403 if (in->A_C == NULL) {
78     return 0;
79     } else {
80 gross 3445 return Paso_SystemMatrix_getTotalNumRows(in->A_C);
81 gross 3403 }
82 gross 3402 } else {
83 gross 3441 return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
84 gross 3402 }
85     }
86 gross 3283 /*****************************************************************
87 gross 3193
88 gross 3283 constructs AMG
89    
90     ******************************************************************/
91 gross 3441 Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92 gross 3193
93 gross 3441 Paso_Preconditioner_AMG* out=NULL;
94 lgao 3827 Paso_SystemMatrix *A_C;
95 gross 3193 bool_t verbose=options->verbose;
96 gross 3445
97 gross 3482 const dim_t my_n=A_p->mainBlock->numRows;
98     const dim_t overlap_n=A_p->row_coupleBlock->numRows;
99    
100 gross 3445 const dim_t n = my_n + overlap_n;
101    
102 gross 3193 const dim_t n_block=A_p->row_block_size;
103 lgao 3827 // const dim_t n_block=A_p->block_size;
104     index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
105     dim_t i, n_F, n_C, F_flag, *F_set=NULL;
106 gross 3283 double time0=0;
107     const double theta = options->coarsening_threshold;
108     const double tau = options->diagonal_dominance_threshold;
109 gross 3442 const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
110     const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);
111 lgao 3827 const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
112 plaub 3436
113 lgao 3827
114 gross 3193 /*
115 caltinay 3642 is the input matrix A suitable for coarsening?
116 gross 3283
117 gross 3193 */
118 gross 3442 if ( (sparsity >= options->min_coarse_sparsity) ||
119 lgao 3827 // (total_n <= options->min_coarse_matrix_size) ||
120     (global_n <= options->min_coarse_matrix_size) ||
121 gross 3442 (level > options->level_max) ) {
122 plaub 3436
123 gross 3445 if (verbose) {
124 plaub 3436 /*
125     print stopping condition:
126     - 'SPAR' = min_coarse_matrix_sparsity exceeded
127     - 'SIZE' = min_coarse_matrix_size exceeded
128     - 'LEVEL' = level_max exceeded
129     */
130 gross 3442 printf("Paso_Preconditioner: AMG: termination of coarsening by ");
131 plaub 3436
132 gross 3442 if (sparsity >= options->min_coarse_sparsity)
133 caltinay 3642 printf("SPAR");
134 plaub 3436
135 gross 3442 if (total_n <= options->min_coarse_matrix_size)
136 caltinay 3642 printf("SIZE");
137 plaub 3436
138     if (level > options->level_max)
139 caltinay 3642 printf("LEVEL");
140 plaub 3436
141     printf("\n");
142    
143     printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
144 gross 3442 level, options->level_max, sparsity, options->min_coarse_sparsity, total_n, options->min_coarse_matrix_size);
145 plaub 3436
146 gross 3445 }
147 plaub 3436
148 gross 3445 return NULL;
149     } else {
150 gross 3451 /* Start Coarsening : */
151 gross 3482
152     /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
153 lgao 3827 const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
154 gross 3482
155     dim_t* degree_S=TMPMEMALLOC(n, dim_t);
156     index_t *offset_S=TMPMEMALLOC(n, index_t);
157 gross 3451 index_t *S=TMPMEMALLOC(len_S, index_t);
158 gross 3482 dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
159     index_t *offset_ST=TMPMEMALLOC(n, index_t);
160     index_t *ST=TMPMEMALLOC(len_S, index_t);
161    
162    
163 gross 3440 F_marker=TMPMEMALLOC(n,index_t);
164 gross 3283 counter=TMPMEMALLOC(n,index_t);
165 gross 3445
166 gross 3482 if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
167     || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
168 gross 3458 /*
169     make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
170     */
171 gross 3482 Paso_SystemMatrix_copyColCoupleBlock(A_p);
172 lgao 3827 Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
173 gross 3458
174     /*
175 gross 3283 set splitting of unknows:
176 gross 3458
177 gross 3283 */
178     time0=Esys_timer();
179     if (n_block>1) {
180 gross 3445 Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
181 gross 3283 } else {
182 gross 3445 Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
183 gross 3283 }
184 gross 3482 Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
185 lgao 3827 // Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
186    
187 gross 3482 Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
188     degree_S, offset_S, S, degree_ST, offset_ST, ST,
189     A_p->col_coupler->connector,A_p->col_distribution);
190 gross 3449
191 gross 3442
192 caltinay 3642 /* in BoomerAMG if interpolation is used FF connectivity is required */
193 gross 3442 /*MPI:
194 gross 3440 if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
195     Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);
196 gross 3442 */
197    
198 gross 3283 options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
199     if (Esys_noError() ) {
200     #pragma omp parallel for private(i) schedule(static)
201 gross 3440 for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F);
202 gross 3283
203     /*
204     count number of unkowns to be eliminated:
205     */
206 gross 3440 n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
207 gross 3283 n_C=n-n_F;
208 lgao 3827 if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);
209    
210     /* collect n_F values on all processes, a direct solver should
211     be used if any n_F value is 0 */
212     F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
213     MPI_Allgather(&n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
214     F_flag = 1;
215     for (i=0; i<A_p->mpi_info->size; i++) {
216     if (F_set[i] == 0) {
217     F_flag = 0;
218     break;
219     }
220     }
221     TMPMEMFREE(F_set);
222    
223     // if ( n_F == 0 ) { /* is a nasty case. a direct solver should be used, return NULL */
224     if (F_flag == 0) {
225 gross 3283 out = NULL;
226     } else {
227 gross 3441 out=MEMALLOC(1,Paso_Preconditioner_AMG);
228 gross 3323 if (! Esys_checkPtr(out)) {
229 gross 3283 out->level = level;
230     out->n = n;
231     out->n_F = n_F;
232     out->n_block = n_block;
233     out->A_C = NULL;
234     out->P = NULL;
235     out->R = NULL;
236 lgao 3827 // out->post_sweeps = options->post_sweeps;
237     // out->pre_sweeps = options->pre_sweeps;
238     out->post_sweeps = 2;
239     out->pre_sweeps = 2;
240 gross 3283 out->r = NULL;
241     out->x_C = NULL;
242     out->b_C = NULL;
243     out->AMG_C = NULL;
244 gross 3323 out->Smoother=NULL;
245     }
246     mask_C=TMPMEMALLOC(n,index_t);
247     rows_in_F=TMPMEMALLOC(n_F,index_t);
248     Esys_checkPtr(mask_C);
249     Esys_checkPtr(rows_in_F);
250     if ( Esys_noError() ) {
251    
252 lgao 3827 out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
253 gross 3323
254 gross 3402 if (n_C != 0) {
255 caltinay 3642 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
256 gross 3283
257 gross 3315 /* allocate helpers :*/
258     out->x_C=MEMALLOC(n_block*n_C,double);
259     out->b_C=MEMALLOC(n_block*n_C,double);
260     out->r=MEMALLOC(n_block*n,double);
261    
262     Esys_checkPtr(out->r);
263     Esys_checkPtr(out->x_C);
264     Esys_checkPtr(out->b_C);
265    
266 gross 3323 if ( Esys_noError() ) {
267     /* creates index for F:*/
268     #pragma omp parallel private(i)
269     {
270     #pragma omp for schedule(static)
271     for (i = 0; i < n; ++i) {
272 gross 3440 if (F_marker[i]) rows_in_F[counter[i]]=i;
273 gross 3323 }
274     }
275 caltinay 3642 /* create mask of C nodes with value >-1, gives new id */
276 lgao 3827 i=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
277 gross 3323 /*
278 gross 3440 get Prolongation :
279 gross 3323 */
280 lgao 3827
281 gross 3323 time0=Esys_timer();
282 lgao 3827
283     out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
284    
285 gross 3283 }
286 lgao 3827
287 gross 3323 /*
288 gross 3440 construct Restriction operator as transposed of Prolongation operator:
289 gross 3283 */
290 lgao 3827
291 gross 3315 if ( Esys_noError()) {
292     time0=Esys_timer();
293 lgao 3827
294     out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
295    
296 gross 3441 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
297 gross 3315 }
298 lgao 3827 /*
299     construct coarse level matrix:
300     */
301     if ( Esys_noError()) {
302     time0=Esys_timer();
303 gross 3442
304 lgao 3827 A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
305 gross 3315
306 lgao 3827 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
307     }
308    
309 gross 3283 /*
310     constructe courser level:
311    
312     */
313     if ( Esys_noError()) {
314 gross 3441 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
315 gross 3315 }
316 lgao 3827
317 gross 3315 if ( Esys_noError()) {
318 lgao 3827 if ( out->AMG_C == NULL ) {
319     /* merge the system matrix into 1 rank when
320     it's not suitable coarsening due to the
321     total number of unknowns are too small */
322     out->A_C=A_C;
323 gross 3283 out->reordering = options->reordering;
324 lgao 3827 out->refinements = options->coarse_matrix_refinements;
325     out->verbose = verbose;
326     out->options_smoother = options->smoother;
327     } else {
328 gross 3283 /* finally we set some helpers for the solver step */
329     out->A_C=A_C;
330 lgao 3827 }
331 gross 3283 }
332     }
333     }
334     TMPMEMFREE(mask_C);
335     TMPMEMFREE(rows_in_F);
336     }
337     }
338 gross 3445
339 gross 3193 }
340     TMPMEMFREE(counter);
341 gross 3440 TMPMEMFREE(F_marker);
342     TMPMEMFREE(degree_S);
343 gross 3445 TMPMEMFREE(offset_S);
344 gross 3303 TMPMEMFREE(S);
345 gross 3482 TMPMEMFREE(degree_ST);
346     TMPMEMFREE(offset_ST);
347     TMPMEMFREE(ST);
348    
349 gross 3445 }
350 gross 3193
351 jfenwick 3259 if (Esys_noError()) {
352 gross 3193 return out;
353     } else {
354 gross 3441 Paso_Preconditioner_AMG_free(out);
355 gross 3193 return NULL;
356     }
357     }
358    
359    
360 gross 3441 void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
361 lgao 3827 const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
362 gross 3283 double time0=0;
363     const dim_t post_sweeps=amg->post_sweeps;
364     const dim_t pre_sweeps=amg->pre_sweeps;
365 lgao 3827 int rank = A->mpi_info->rank;
366 gross 3193
367 gross 3283 /* presmoothing */
368     time0=Esys_timer();
369 gross 3442 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
370 lgao 3827
371 gross 3283 time0=Esys_timer()-time0;
372 caltinay 3642 if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
373 gross 3283 /* end of presmoothing */
374    
375     if (amg->n_F < amg->n) { /* is there work on the coarse level? */
376     time0=Esys_timer();
377 lgao 3827
378 gross 3283 Paso_Copy(n, amg->r, b); /* r <- b */
379 gross 3441 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
380 gross 3445 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */
381 lgao 3827
382 gross 3283 time0=Esys_timer()-time0;
383     /* coarse level solve */
384     if ( amg->AMG_C == NULL) {
385     time0=Esys_timer();
386     /* A_C is the coarsest level */
387 lgao 3827 Paso_Preconditioner_AMG_mergeSolve(amg);
388    
389 gross 3283 if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
390     } else {
391 gross 3441 Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */
392 lgao 3827 }
393    
394 gross 3283 time0=time0+Esys_timer();
395 gross 3445 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */
396 lgao 3827
397 gross 3283 /*postsmoothing*/
398    
399     /*solve Ax=b with initial guess x */
400     time0=Esys_timer();
401 gross 3442 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
402 gross 3283 time0=Esys_timer()-time0;
403     if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
404     /*end of postsmoothing*/
405     }
406 lgao 3827
407 gross 3283 return;
408     }
409 gross 3193
410 gross 3283 /* theta = threshold for strong connections */
411     /* tau = threshold for diagonal dominance */
412 gross 3193
413 gross 3303 /*S_i={j \in N_i; i strongly coupled to j}
414    
415     in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
416     */
417    
418 gross 3441 void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
419 gross 3445 dim_t *degree_S, index_t *offset_S, index_t *S,
420 gross 3303 const double theta, const double tau)
421 gross 3283 {
422 gross 3482
423     const dim_t my_n=A->mainBlock->numRows;
424     const dim_t overlap_n=A->row_coupleBlock->numRows;
425    
426 gross 3445 index_t iptr, i;
427 gross 3482 double *threshold_p=NULL;
428 lgao 3827 index_t rank=A->mpi_info->rank;
429 gross 3193
430 gross 3482 threshold_p = TMPMEMALLOC(2*my_n, double);
431    
432     #pragma omp parallel for private(i,iptr) schedule(static)
433     for (i=0;i<my_n;++i) {
434    
435 gross 3445 register double max_offdiagonal = 0.;
436     register double sum_row=0;
437     register double main_row=0;
438 gross 3451 register dim_t kdeg=0;
439 caltinay 3489 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
440 lgao 3827
441 gross 3482
442     /* collect information for row i: */
443 gross 3283 #pragma ivdep
444 gross 3445 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
445     register index_t j=A->mainBlock->pattern->index[iptr];
446     register double fnorm=ABS(A->mainBlock->val[iptr]);
447 gross 3283 if( j != i) {
448     max_offdiagonal = MAX(max_offdiagonal,fnorm);
449     sum_row+=fnorm;
450     } else {
451     main_row=fnorm;
452     }
453 gross 3445
454 gross 3283 }
455 lgao 3827
456 gross 3445 #pragma ivdep
457     for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
458     register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
459 lgao 3827
460 gross 3445 max_offdiagonal = MAX(max_offdiagonal,fnorm);
461     sum_row+=fnorm;
462     }
463    
464 gross 3482 /* inspect row i: */
465 gross 3451 {
466     const double threshold = theta*max_offdiagonal;
467 gross 3482 threshold_p[2*i+1]=threshold;
468 caltinay 3642 if (tau*main_row < sum_row) { /* no diagonal dominance */
469 gross 3482 threshold_p[2*i]=1;
470 gross 3451 #pragma ivdep
471     for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
472     register index_t j=A->mainBlock->pattern->index[iptr];
473     if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
474     S[koffset+kdeg] = j;
475     kdeg++;
476     }
477 gross 3283 }
478 lgao 3827 // #pragma ivdep
479 gross 3451 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
480     register index_t j=A->col_coupleBlock->pattern->index[iptr];
481     if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
482     S[koffset+kdeg] = j + my_n;
483     kdeg++;
484     }
485 gross 3445 }
486 gross 3482 } else {
487     threshold_p[2*i]=-1;
488 gross 3451 }
489 gross 3440 }
490 gross 3445 offset_S[i]=koffset;
491 gross 3440 degree_S[i]=kdeg;
492 gross 3283 }
493 lgao 3827
494 gross 3482 /* now we need to distribute the threshold and the diagonal dominance indicator */
495     if (A->mpi_info->size > 1) {
496    
497     const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
498     -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
499    
500     double *remote_threshold=NULL;
501    
502     Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
503     Paso_Coupler_startCollect(threshold_coupler,threshold_p);
504     Paso_Coupler_finishCollect(threshold_coupler);
505     remote_threshold=threshold_coupler->recv_buffer;
506    
507     #pragma omp parallel for private(i,iptr) schedule(static)
508     for (i=0; i<overlap_n; i++) {
509     const double threshold = remote_threshold[2*i+1];
510     register dim_t kdeg=0;
511 lgao 3827 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
512 gross 3482 if (remote_threshold[2*i]>0) {
513 lgao 3827 #pragma ivdep
514     for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
515 gross 3482 register index_t j=A->row_coupleBlock->pattern->index[iptr];
516     if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
517     S[koffset+kdeg] = j ;
518     kdeg++;
519     }
520 lgao 3827 }
521 gross 3482
522 lgao 3827 #pragma ivdep
523     for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
524     register index_t j=A->remote_coupleBlock->pattern->index[iptr];
525     if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
526     S[koffset+kdeg] = j + my_n;
527     kdeg++;
528     }
529     }
530 gross 3482 }
531     offset_S[i+my_n]=koffset;
532     degree_S[i+my_n]=kdeg;
533     }
534    
535     Paso_Coupler_free(threshold_coupler);
536     }
537     TMPMEMFREE(threshold_p);
538 gross 3283 }
539 gross 3193
540 gross 3283 /* theta = threshold for strong connections */
541     /* tau = threshold for diagonal dominance */
542 gross 3303 /*S_i={j \in N_i; i strongly coupled to j}
543 gross 3283
544 gross 3303 in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
545     */
546 gross 3441 void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
547 gross 3445 dim_t *degree_S, index_t *offset_S, index_t *S,
548 gross 3303 const double theta, const double tau)
549    
550 gross 3283 {
551 lgao 3827 const dim_t block_size=A->block_size;
552 gross 3482 const dim_t my_n=A->mainBlock->numRows;
553     const dim_t overlap_n=A->row_coupleBlock->numRows;
554    
555 gross 3445 index_t iptr, i, bi;
556 gross 3482 double *threshold_p=NULL;
557 gross 3283
558    
559 gross 3482 threshold_p = TMPMEMALLOC(2*my_n, double);
560 lgao 3827
561     #pragma omp parallel private(i,iptr,bi)
562 gross 3482 {
563    
564     dim_t max_deg=0;
565     double *rtmp=NULL;
566 gross 3451
567 gross 3482 #pragma omp for schedule(static)
568     for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
569     +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
570 gross 3193
571 gross 3482 rtmp=TMPMEMALLOC(max_deg, double);
572 gross 3283
573 gross 3482 #pragma omp for schedule(static)
574     for (i=0;i<my_n;++i) {
575     register double max_offdiagonal = 0.;
576     register double sum_row=0;
577     register double main_row=0;
578     register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
579     register dim_t kdeg=0;
580 caltinay 3489 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
581 gross 3193
582 gross 3482 /* collect information for row i: */
583     for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
584     register index_t j=A->mainBlock->pattern->index[iptr];
585     register double fnorm=0;
586     #pragma ivdep
587 lgao 3827 for(bi=0;bi<block_size;++bi) {
588     register double rtmp2= A->mainBlock->val[iptr*block_size+bi];
589 gross 3482 fnorm+=rtmp2*rtmp2;
590 gross 3283 }
591 gross 3482 fnorm=sqrt(fnorm);
592     rtmp[iptr+rtmp_offset]=fnorm;
593    
594     if( j != i) {
595 gross 3445 max_offdiagonal = MAX(max_offdiagonal,fnorm);
596     sum_row+=fnorm;
597 gross 3482 } else {
598     main_row=fnorm;
599 gross 3445 }
600 gross 3482
601     }
602    
603     rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
604     for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
605     register double fnorm=0;
606     #pragma ivdep
607 lgao 3827 for(bi=0;bi<block_size;++bi) {
608     register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
609 gross 3482 fnorm+=rtmp2*rtmp2;
610 gross 3283 }
611 gross 3482 fnorm=sqrt(fnorm);
612    
613     rtmp[iptr+rtmp_offset]=fnorm;
614     max_offdiagonal = MAX(max_offdiagonal,fnorm);
615     sum_row+=fnorm;
616 gross 3283 }
617 gross 3482
618    
619     /* inspect row i: */
620     {
621     const double threshold = theta*max_offdiagonal;
622     rtmp_offset=-A->mainBlock->pattern->ptr[i];
623    
624     threshold_p[2*i+1]=threshold;
625 caltinay 3642 if (tau*main_row < sum_row) { /* no diagonal dominance */
626 gross 3482 threshold_p[2*i]=1;
627     #pragma ivdep
628     for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
629     register index_t j=A->mainBlock->pattern->index[iptr];
630     if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
631     S[koffset+kdeg] = j;
632     kdeg++;
633 gross 3351 }
634 gross 3283 }
635 gross 3482 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
636     #pragma ivdep
637     for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
638     register index_t j=A->col_coupleBlock->pattern->index[iptr];
639     if( rtmp[iptr+rtmp_offset] >threshold) {
640     S[koffset+kdeg] = j + my_n;
641     kdeg++;
642 gross 3351 }
643     }
644 gross 3482 } else {
645     threshold_p[2*i]=-1;
646     }
647     }
648     offset_S[i]=koffset;
649     degree_S[i]=kdeg;
650     }
651     TMPMEMFREE(rtmp);
652     }
653     /* now we need to distribute the threshold and the diagonal dominance indicator */
654     if (A->mpi_info->size > 1) {
655    
656     const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
657     -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
658    
659     double *remote_threshold=NULL;
660    
661     Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
662     Paso_Coupler_startCollect(threshold_coupler,threshold_p);
663     Paso_Coupler_finishCollect(threshold_coupler);
664     remote_threshold=threshold_coupler->recv_buffer;
665    
666     #pragma omp parallel for private(i,iptr) schedule(static)
667     for (i=0; i<overlap_n; i++) {
668    
669     const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
670     register dim_t kdeg=0;
671 lgao 3827 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
672 gross 3482 if (remote_threshold[2*i]>0) {
673     #pragma ivdep
674     for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
675     register index_t j=A->row_coupleBlock->pattern->index[iptr];
676     register double fnorm2=0;
677     #pragma ivdepremote_threshold[2*i]
678 lgao 3827 for(bi=0;bi<block_size;++bi) {
679     register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
680 gross 3482 fnorm2+=rtmp2*rtmp2;
681 gross 3351 }
682 gross 3482
683     if(fnorm2 > threshold2 ) {
684     S[koffset+kdeg] = j ;
685     kdeg++;
686 gross 3351 }
687 gross 3283 }
688 lgao 3827
689     #pragma ivdep
690     for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
691     register index_t j=A->remote_coupleBlock->pattern->index[iptr];
692     register double fnorm2=0;
693     #pragma ivdepremote_threshold[2*i]
694     for(bi=0;bi<block_size;++bi) {
695     register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
696     fnorm2+=rtmp2*rtmp2;
697     }
698     if(fnorm2 > threshold2 && i != j) {
699     S[koffset+kdeg] = j + my_n;
700     kdeg++;
701     }
702     }
703 gross 3351
704 gross 3283 }
705 gross 3482 offset_S[i+my_n]=koffset;
706     degree_S[i+my_n]=kdeg;
707 gross 3283 }
708 gross 3482 Paso_Coupler_free(threshold_coupler);
709 gross 3283 }
710 gross 3482 TMPMEMFREE(threshold_p);
711 gross 3193 }
712 caltinay 3642
713 gross 3482 void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
714     const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
715 gross 3440 {
716 gross 3482 index_t i, j;
717     dim_t p;
718     dim_t len=0;
719 gross 3485 #pragma omp parallel for private(i) schedule(static)
720 gross 3482 for (i=0; i<nT ;++i) {
721     degree_ST[i]=0;
722     }
723     for (i=0; i<n ;++i) {
724     for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
725     }
726     for (i=0; i<nT ;++i) {
727     offset_ST[i]=len;
728     len+=degree_ST[i];
729     degree_ST[i]=0;
730     }
731     for (i=0; i<n ;++i) {
732     for (p=0; p<degree_S[i]; ++p) {
733     j=S[offset_S[i]+p];
734     ST[offset_ST[j]+degree_ST[j]]=i;
735     degree_ST[j]++;
736     }
737     }
738 gross 3440 }
739 gross 3442
740 lgao 3827 int compareindex(const void *a, const void *b)
741     {
742     return (*(int *)a - *(int *)b);
743     }
744    
745 gross 3482 void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
746     const dim_t* degree_S, const index_t* offset_S, const index_t* S,
747     const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
748     Paso_Connector* col_connector, Paso_Distribution* col_dist)
749 gross 3442 {
750 gross 3482 dim_t i, numUndefined, iter=0;
751     index_t iptr, jptr, kptr;
752     double *random=NULL, *w=NULL, *Status=NULL;
753     index_t * ST_flag=NULL;
754 lgao 3827 index_t rank=col_connector->mpi_info->rank;
755 gross 3445
756 gross 3482 Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector ,1);
757    
758     w=TMPMEMALLOC(n, double);
759     Status=TMPMEMALLOC(n, double);
760     random = Paso_Distribution_createRandomVector(col_dist,1);
761     ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
762 lgao 3827
763 gross 3482 #pragma omp parallel for private(i)
764     for (i=0; i< my_n; ++i) {
765     w[i]=degree_ST[i]+random[i];
766     if (degree_ST[i] < 1) {
767     Status[i]=-100; /* F point */
768     } else {
769     Status[i]=1; /* status undefined */
770     }
771     }
772 lgao 3827
773 gross 3482 #pragma omp parallel for private(i, iptr)
774     for (i=0; i< n; ++i) {
775     for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
776     ST_flag[offset_ST[i]+iptr]=1;
777     }
778     }
779    
780 gross 3445
781 gross 3482 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
782     printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);
783     iter=0;
784     while (numUndefined > 0) {
785     Paso_Coupler_fillOverlap(n, w, w_coupler);
786 lgao 3827
787 caltinay 3642 /* calculate the maximum value of neighbours following active strong connections:
788     w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active */
789 gross 3482 #pragma omp parallel for private(i, iptr)
790     for (i=0; i<my_n; ++i) {
791     if (Status[i]>0) { /* status is still undefined */
792 gross 3449
793 gross 3482 register bool_t inD=TRUE;
794     const double wi=w[i];
795    
796     for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
797     const index_t k=S[offset_S[i]+iptr];
798     const index_t* start_p = &ST[offset_ST[k]];
799     const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
800    
801 lgao 3827 if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
802 gross 3482 if (wi <= w[k] ) {
803     inD=FALSE;
804     break;
805     }
806     }
807    
808     }
809    
810     if (inD) {
811     for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
812     const index_t k=ST[offset_ST[i]+iptr];
813     if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
814     if (wi <= w[k] ) {
815     inD=FALSE;
816     break;
817     }
818     }
819     }
820     }
821     if (inD) {
822     Status[i]=0.; /* is in D */
823     }
824     }
825    
826 gross 3449 }
827    
828 gross 3482 Paso_Coupler_fillOverlap(n, Status, w_coupler);
829 gross 3442
830    
831 gross 3482 /* remove connection to D points :
832    
833     for each i in D:
834     for each j in S_i:
835     w[j]--
836     ST_tag[j,i]=-1
837     for each j in ST[i]:
838     ST_tag[i,j]=-1
839     for each k in ST[j]:
840     if k in ST[i]:
841     w[j]--;
842     ST_tag[j,k]=-1
843    
844     */
845     /* w is updated for local rows only */
846     {
847 gross 3485 #pragma omp parallel for private(i, jptr)
848 gross 3482 for (i=0; i< my_n; ++i) {
849 gross 3445
850 gross 3482 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
851     const index_t j=ST[offset_ST[i]+jptr];
852     if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
853     w[i]--;
854     ST_flag[offset_ST[i]+jptr]=-1;
855     }
856     }
857    
858     }
859 gross 3485 #pragma omp parallel for private(i, jptr)
860 gross 3482 for (i=my_n; i< n; ++i) {
861     for (jptr=0; jptr<degree_ST[i]; ++jptr) {
862     const index_t j = ST[offset_ST[i]+jptr];
863     if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
864     }
865     }
866 gross 3442
867 gross 3482
868     for (i=0; i< n; ++i) {
869     if ( Status[i] == 0. ) {
870 gross 3445
871 gross 3482 const index_t* start_p = &ST[offset_ST[i]];
872    
873     for (jptr=0; jptr<degree_ST[i]; ++jptr) {
874     const index_t j=ST[offset_ST[i]+jptr];
875     ST_flag[offset_ST[i]+jptr]=-1;
876     for (kptr=0; kptr<degree_ST[j]; ++kptr) {
877     const index_t k=ST[offset_ST[j]+kptr];
878     if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
879     if (ST_flag[offset_ST[j]+kptr] >0) {
880     if (j< my_n ) {
881     w[j]--;
882     }
883     ST_flag[offset_ST[j]+kptr]=-1;
884     }
885     }
886     }
887     }
888     }
889     }
890     }
891 lgao 3827
892 gross 3482 /* adjust status */
893     #pragma omp parallel for private(i)
894     for (i=0; i< my_n; ++i) {
895     if ( Status[i] == 0. ) {
896     Status[i] = -10; /* this is now a C point */
897 lgao 3827 } else if (Status[i] == 1. && w[i]<1.) {
898 gross 3482 Status[i] = -100; /* this is now a F point */
899     }
900     }
901    
902 lgao 3827 i = numUndefined;
903 gross 3482 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
904 lgao 3827 if (numUndefined == i) {
905     Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
906     return;
907     }
908    
909 gross 3482 iter++;
910     printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);
911 lgao 3827
912 gross 3482 } /* end of while loop */
913    
914     /* map to output :*/
915     Paso_Coupler_fillOverlap(n, Status, w_coupler);
916     #pragma omp parallel for private(i)
917     for (i=0; i< n; ++i) {
918     if (Status[i] > -50.) {
919     split_marker[i]=PASO_AMG_IN_C;
920     } else {
921     split_marker[i]=PASO_AMG_IN_F;
922     }
923     }
924     /* clean up : */
925     Paso_Coupler_free(w_coupler);
926     TMPMEMFREE(random);
927     TMPMEMFREE(w);
928     TMPMEMFREE(Status);
929     TMPMEMFREE(ST_flag);
930    
931     return;
932 gross 3442 }
933 lgao 3827
934     /* Merge the system matrix which is distributed on ranks into a complete
935     matrix on rank 0, then solve this matrix on rank 0 only */
936     Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
937     index_t i, iptr, j, n, remote_n, total_n, len, offset, tag;
938     index_t row_block_size, col_block_size, block_size;
939     index_t size=A->mpi_info->size;
940     index_t rank=A->mpi_info->rank;
941     index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
942     index_t *temp_n=NULL, *temp_len=NULL;
943     double *val=NULL;
944     Paso_Pattern *pattern=NULL;
945     Paso_SparseMatrix *out=NULL;
946     #ifdef ESYS_MPI
947     MPI_Request* mpi_requests=NULL;
948     MPI_Status* mpi_stati=NULL;
949     #else
950     int *mpi_requests=NULL, *mpi_stati=NULL;
951     #endif
952    
953     if (size == 1) {
954     n = A->mainBlock->numRows;
955     ptr = TMPMEMALLOC(n, index_t);
956     #pragma omp parallel for private(i)
957     for (i=0; i<n; i++) ptr[i] = i;
958     out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
959     TMPMEMFREE(ptr);
960     return out;
961     }
962    
963     n = A->mainBlock->numRows;
964     block_size = A->block_size;
965    
966     /* Merge MainBlock and CoupleBlock to get a complete column entries
967     for each row allocated to current rank. Output (ptr, idx, val)
968     contains all info needed from current rank to merge a system
969     matrix */
970     Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
971    
972     #ifdef ESYS_MPI
973     mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
974     mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
975     #else
976     mpi_requests=TMPMEMALLOC(size*2,int);
977     mpi_stati=TMPMEMALLOC(size*2,int);
978     #endif
979    
980     /* Now, pass all info to rank 0 and merge them into one sparse
981     matrix */
982     if (rank == 0) {
983     /* First, copy local ptr values into ptr_global */
984     total_n=Paso_SystemMatrix_getGlobalNumRows(A);
985     ptr_global = MEMALLOC(total_n+1, index_t);
986     memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
987     iptr = n+1;
988     MEMFREE(ptr);
989     temp_n = TMPMEMALLOC(size, index_t);
990     temp_len = TMPMEMALLOC(size, index_t);
991     temp_n[0] = iptr;
992    
993     /* Second, receive ptr values from other ranks */
994     for (i=1; i<size; i++) {
995     remote_n = A->row_distribution->first_component[i+1] -
996     A->row_distribution->first_component[i];
997     MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
998     A->mpi_info->msg_tag_counter+i,
999     A->mpi_info->comm,
1000     &mpi_requests[i]);
1001     temp_n[i] = remote_n;
1002     iptr += remote_n;
1003     }
1004     MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1005     A->mpi_info->msg_tag_counter += size;
1006    
1007     /* Then, prepare to receive idx and val from other ranks */
1008     len = 0;
1009     offset = -1;
1010     for (i=0; i<size; i++) {
1011     if (temp_n[i] > 0) {
1012     offset += temp_n[i];
1013     len += ptr_global[offset];
1014     temp_len[i] = ptr_global[offset];
1015     }else
1016     temp_len[i] = 0;
1017     }
1018    
1019     idx_global = MEMALLOC(len, index_t);
1020     iptr = temp_len[0];
1021     offset = n+1;
1022     for (i=1; i<size; i++) {
1023     len = temp_len[i];
1024     MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1025     A->mpi_info->msg_tag_counter+i,
1026     A->mpi_info->comm,
1027     &mpi_requests[i]);
1028     remote_n = temp_n[i];
1029     for (j=0; j<remote_n; j++) {
1030     ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1031     }
1032     offset += remote_n;
1033     iptr += len;
1034     }
1035     memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1036     MEMFREE(idx);
1037     row_block_size = A->mainBlock->row_block_size;
1038     col_block_size = A->mainBlock->col_block_size;
1039     MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1040     A->mpi_info->msg_tag_counter += size;
1041     TMPMEMFREE(temp_n);
1042    
1043     /* Then generate the sparse matrix */
1044     pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, total_n,
1045     total_n, ptr_global, idx_global);
1046     out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1047     row_block_size, col_block_size, FALSE);
1048     Paso_Pattern_free(pattern);
1049    
1050     /* Finally, receive and copy the value */
1051     iptr = temp_len[0] * block_size;
1052     for (i=1; i<size; i++) {
1053     len = temp_len[i];
1054     MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1055     A->mpi_info->msg_tag_counter+i,
1056     A->mpi_info->comm,
1057     &mpi_requests[i]);
1058     iptr += (len * block_size);
1059     }
1060     memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1061     MEMFREE(val);
1062     MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1063     A->mpi_info->msg_tag_counter += size;
1064     TMPMEMFREE(temp_len);
1065     } else { /* it's not rank 0 */
1066    
1067     /* First, send out the local ptr */
1068     tag = A->mpi_info->msg_tag_counter+rank;
1069     MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1070     &mpi_requests[0]);
1071    
1072     /* Next, send out the local idx */
1073     len = ptr[n];
1074     tag += size;
1075     MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1076     &mpi_requests[1]);
1077    
1078     /* At last, send out the local val */
1079     len *= block_size;
1080     tag += size;
1081     MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1082     &mpi_requests[2]);
1083    
1084     MPI_Waitall(3, mpi_requests, mpi_stati);
1085     A->mpi_info->msg_tag_counter = tag + size - rank;
1086     MEMFREE(ptr);
1087     MEMFREE(idx);
1088     MEMFREE(val);
1089    
1090     out = NULL;
1091     }
1092    
1093     TMPMEMFREE(mpi_requests);
1094     TMPMEMFREE(mpi_stati);
1095     return out;
1096     }
1097    
1098    
1099     void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1100     Paso_SystemMatrix *A = amg->A_C;
1101     Paso_SparseMatrix *A_D, *A_temp;
1102     double* x=NULL;
1103     double* b=NULL;
1104     index_t rank = A->mpi_info->rank;
1105     index_t size = A->mpi_info->size;
1106     index_t i, n, p, count, n_block;
1107     index_t *counts, *offset, *dist;
1108    
1109     n_block = amg->n_block;
1110    
1111     A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1112    
1113     /* First, gather x and b into rank 0 */
1114     dist = A->pattern->input_distribution->first_component;
1115     n = Paso_SystemMatrix_getGlobalNumRows(A);
1116     b = TMPMEMALLOC(n*n_block, double);
1117     x = TMPMEMALLOC(n*n_block, double);
1118     counts = TMPMEMALLOC(size, index_t);
1119     offset = TMPMEMALLOC(size, index_t);
1120    
1121     #pragma omp parallel for private(i,p)
1122     for (i=0; i<size; i++) {
1123     p = dist[i];
1124     counts[i] = (dist[i+1] - p)*n_block;
1125     offset[i] = p*n_block;
1126     }
1127     count = counts[rank];
1128     MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1129     MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1130    
1131     if (rank == 0) {
1132     /* solve locally */
1133     #ifdef MKL
1134     A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1135     A_temp->solver_package = PASO_MKL;
1136     Paso_SparseMatrix_free(A_D);
1137     Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1138     Paso_SparseMatrix_free(A_temp);
1139     #else
1140     #ifdef UMFPACK
1141     A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1142     A_temp->solver_package = PASO_UMFPACK;
1143     Paso_SparseMatrix_free(A_D);
1144     Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1145     Paso_SparseMatrix_free(A_temp);
1146     #else
1147     A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1148     A_D->solver_package = PASO_SMOOTHER;
1149     Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1150     Paso_SparseMatrix_free(A_D);
1151     #endif
1152     #endif
1153     }
1154    
1155     /* now we need to distribute the solution to all ranks */
1156     MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1157    
1158     TMPMEMFREE(x);
1159     TMPMEMFREE(b);
1160     TMPMEMFREE(counts);
1161     TMPMEMFREE(offset);
1162     }

  ViewVC Help
Powered by ViewVC 1.1.26