/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Annotation of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3886 - (hide annotations)
Thu Apr 5 00:50:30 2012 UTC (7 years, 5 months ago) by gross
File MIME type: text/plain
File size: 37370 byte(s)
some minor problems with MPI AMG fixed.
1 gross 3193
2     /*******************************************************
3     *
4     * Copyright (c) 2003-2010 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13    
14    
15     /**************************************************************/
16    
17 gross 3441 /* Paso: AMG preconditioner (local version) */
18 gross 3193
19     /**************************************************************/
20    
21 caltinay 3642 /* Author: artak@uq.edu.au, l.gross@uq.edu.au */
22 gross 3193
23     /**************************************************************/
24    
25 gross 3283 #define SHOW_TIMING FALSE
26 lgao 3827 #define MY_DEBUG 0
27     #define MY_DEBUG1 1
28 gross 3283
29 gross 3193 #include "Paso.h"
30     #include "Preconditioner.h"
31     #include "Options.h"
32     #include "PasoUtil.h"
33     #include "UMFPACK.h"
34     #include "MKL.h"
35 gross 3458 #include<stdio.h>
36 gross 3193
37 gross 3458
38 gross 3193 /**************************************************************/
39    
40     /* free all memory used by AMG */
41    
42 gross 3441 void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43 gross 3193 if (in!=NULL) {
44 gross 3441 Paso_Preconditioner_Smoother_free(in->Smoother);
45     Paso_SystemMatrix_free(in->P);
46     Paso_SystemMatrix_free(in->R);
47     Paso_SystemMatrix_free(in->A_C);
48     Paso_Preconditioner_AMG_free(in->AMG_C);
49 gross 3283 MEMFREE(in->r);
50     MEMFREE(in->x_C);
51     MEMFREE(in->b_C);
52 gross 3193
53 gross 3283 MEMFREE(in);
54 gross 3193 }
55     }
56    
57 gross 3441 index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
58 gross 3402 if (in->AMG_C == NULL) {
59     return in->level;
60     } else {
61 gross 3441 return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
62 gross 3402 }
63     }
64 gross 3441 double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
65 gross 3403 if (in->AMG_C == NULL) {
66     if (in->A_C == NULL) {
67     return 1.;
68     } else {
69 gross 3442 return Paso_SystemMatrix_getSparsity(in->A_C);
70 gross 3403 }
71     } else {
72 gross 3441 return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
73 gross 3403 }
74 gross 3402 }
75 gross 3441 dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
76 gross 3402 if (in->AMG_C == NULL) {
77 gross 3403 if (in->A_C == NULL) {
78     return 0;
79     } else {
80 gross 3445 return Paso_SystemMatrix_getTotalNumRows(in->A_C);
81 gross 3403 }
82 gross 3402 } else {
83 gross 3441 return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
84 gross 3402 }
85     }
86 gross 3283 /*****************************************************************
87 gross 3193
88 gross 3283 constructs AMG
89    
90     ******************************************************************/
91 gross 3441 Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92 gross 3193
93 gross 3441 Paso_Preconditioner_AMG* out=NULL;
94 lgao 3828 Paso_SystemMatrix *A_C=NULL;
95 gross 3193 bool_t verbose=options->verbose;
96 gross 3445
97 gross 3482 const dim_t my_n=A_p->mainBlock->numRows;
98     const dim_t overlap_n=A_p->row_coupleBlock->numRows;
99    
100 gross 3445 const dim_t n = my_n + overlap_n;
101    
102 gross 3193 const dim_t n_block=A_p->row_block_size;
103 lgao 3827 index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
104 gross 3886 dim_t i, my_n_F, my_n_C, n_C, F_flag, *F_set=NULL, global_n_C=0, global_n_F=0, n_F;
105 gross 3283 double time0=0;
106     const double theta = options->coarsening_threshold;
107     const double tau = options->diagonal_dominance_threshold;
108 gross 3442 const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
109 gross 3885 const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
110 plaub 3436
111 lgao 3827
112 gross 3193 /*
113 caltinay 3642 is the input matrix A suitable for coarsening?
114 gross 3283
115 gross 3193 */
116 gross 3442 if ( (sparsity >= options->min_coarse_sparsity) ||
117 gross 3885 (global_n <= options->min_coarse_matrix_size) ||
118 gross 3442 (level > options->level_max) ) {
119 plaub 3436
120 gross 3445 if (verbose) {
121 plaub 3436 /*
122     print stopping condition:
123     - 'SPAR' = min_coarse_matrix_sparsity exceeded
124     - 'SIZE' = min_coarse_matrix_size exceeded
125     - 'LEVEL' = level_max exceeded
126     */
127 gross 3442 printf("Paso_Preconditioner: AMG: termination of coarsening by ");
128 plaub 3436
129 gross 3442 if (sparsity >= options->min_coarse_sparsity)
130 caltinay 3642 printf("SPAR");
131 plaub 3436
132 gross 3885 if (global_n <= options->min_coarse_matrix_size)
133 caltinay 3642 printf("SIZE");
134 plaub 3436
135     if (level > options->level_max)
136 caltinay 3642 printf("LEVEL");
137 plaub 3436
138     printf("\n");
139    
140     printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
141 gross 3885 level, options->level_max, sparsity, options->min_coarse_sparsity, global_n, options->min_coarse_matrix_size);
142 plaub 3436
143 gross 3445 }
144 plaub 3436
145 gross 3445 return NULL;
146     } else {
147 gross 3451 /* Start Coarsening : */
148 gross 3482
149     /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
150 lgao 3827 const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
151 gross 3482
152     dim_t* degree_S=TMPMEMALLOC(n, dim_t);
153     index_t *offset_S=TMPMEMALLOC(n, index_t);
154 gross 3451 index_t *S=TMPMEMALLOC(len_S, index_t);
155 gross 3482 dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
156     index_t *offset_ST=TMPMEMALLOC(n, index_t);
157     index_t *ST=TMPMEMALLOC(len_S, index_t);
158    
159    
160 gross 3440 F_marker=TMPMEMALLOC(n,index_t);
161 gross 3283 counter=TMPMEMALLOC(n,index_t);
162 gross 3445
163 gross 3482 if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
164     || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
165 gross 3458 /*
166     make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
167     */
168 gross 3482 Paso_SystemMatrix_copyColCoupleBlock(A_p);
169 lgao 3827 Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
170 gross 3458
171     /*
172 gross 3283 set splitting of unknows:
173 gross 3458
174 gross 3283 */
175     time0=Esys_timer();
176     if (n_block>1) {
177 gross 3445 Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
178 gross 3283 } else {
179 gross 3445 Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
180 gross 3283 }
181 gross 3482 Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
182 lgao 3828 /* Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
183     */
184 lgao 3827
185 gross 3482 Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
186     degree_S, offset_S, S, degree_ST, offset_ST, ST,
187     A_p->col_coupler->connector,A_p->col_distribution);
188 gross 3449
189 gross 3442
190 caltinay 3642 /* in BoomerAMG if interpolation is used FF connectivity is required */
191 gross 3442 /*MPI:
192 gross 3440 if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
193     Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);
194 gross 3442 */
195    
196 gross 3283 options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
197     if (Esys_noError() ) {
198     #pragma omp parallel for private(i) schedule(static)
199 gross 3440 for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F);
200 gross 3283
201     /*
202     count number of unkowns to be eliminated:
203     */
204 gross 3886 my_n_F=Paso_Util_cumsum_maskedTrue(my_n,counter, F_marker);
205     n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
206     /* collect my_n_F values on all processes, a direct solver should
207     be used if any my_n_F value is 0 */
208 lgao 3827 F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
209 lgao 3828 #ifdef ESYS_MPI
210 gross 3886 MPI_Allgather(&my_n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
211 lgao 3828 #endif
212 gross 3885 global_n_F=0;
213 lgao 3827 F_flag = 1;
214     for (i=0; i<A_p->mpi_info->size; i++) {
215 gross 3885 global_n_F+=F_set[i];
216     if (F_set[i] == 0) F_flag = 0;
217 lgao 3827 }
218     TMPMEMFREE(F_set);
219 gross 3884
220 gross 3886 my_n_C=my_n-my_n_F;
221 gross 3885 global_n_C=global_n-global_n_F;
222     if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,global_n_F,global_n_C);
223 gross 3884
224 lgao 3827
225 lgao 3828 /* if ( n_F == 0 ) { is a nasty case. a direct solver should be used, return NULL */
226 lgao 3827 if (F_flag == 0) {
227 gross 3283 out = NULL;
228     } else {
229 gross 3441 out=MEMALLOC(1,Paso_Preconditioner_AMG);
230 gross 3323 if (! Esys_checkPtr(out)) {
231 gross 3283 out->level = level;
232     out->A_C = NULL;
233     out->P = NULL;
234     out->R = NULL;
235 lgao 3828 out->post_sweeps = options->post_sweeps;
236     out->pre_sweeps = options->pre_sweeps;
237 gross 3283 out->r = NULL;
238     out->x_C = NULL;
239     out->b_C = NULL;
240     out->AMG_C = NULL;
241 gross 3323 out->Smoother=NULL;
242     }
243     mask_C=TMPMEMALLOC(n,index_t);
244     rows_in_F=TMPMEMALLOC(n_F,index_t);
245     Esys_checkPtr(mask_C);
246     Esys_checkPtr(rows_in_F);
247     if ( Esys_noError() ) {
248    
249 lgao 3827 out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
250 gross 3323
251 gross 3885 if (global_n_C != 0) {
252 gross 3886 /* create mask of C nodes with value >-1, gives new id */
253     n_C=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
254     /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
255 gross 3283
256 gross 3315 /* allocate helpers :*/
257 gross 3886 out->x_C=MEMALLOC(n_block*my_n_C,double);
258     out->b_C=MEMALLOC(n_block*my_n_C,double);
259     out->r=MEMALLOC(n_block*my_n,double);
260 gross 3315
261     Esys_checkPtr(out->r);
262     Esys_checkPtr(out->x_C);
263     Esys_checkPtr(out->b_C);
264    
265 gross 3323 if ( Esys_noError() ) {
266     /* creates index for F:*/
267     #pragma omp parallel private(i)
268     {
269     #pragma omp for schedule(static)
270     for (i = 0; i < n; ++i) {
271 gross 3440 if (F_marker[i]) rows_in_F[counter[i]]=i;
272 gross 3323 }
273     }
274     /*
275 gross 3440 get Prolongation :
276 gross 3323 */
277 lgao 3827
278 gross 3323 time0=Esys_timer();
279 lgao 3827
280     out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
281    
282 gross 3283 }
283 lgao 3827
284 gross 3323 /*
285 gross 3440 construct Restriction operator as transposed of Prolongation operator:
286 gross 3283 */
287 lgao 3827
288 gross 3315 if ( Esys_noError()) {
289     time0=Esys_timer();
290 lgao 3827
291     out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
292    
293 gross 3441 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
294 gross 3315 }
295 lgao 3827 /*
296     construct coarse level matrix:
297     */
298     if ( Esys_noError()) {
299     time0=Esys_timer();
300 gross 3442
301 lgao 3827 A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
302 gross 3315
303 lgao 3827 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
304     }
305    
306 gross 3283 /*
307     constructe courser level:
308    
309     */
310     if ( Esys_noError()) {
311 gross 3441 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
312 gross 3315 }
313 lgao 3827
314 gross 3315 if ( Esys_noError()) {
315 lgao 3827 if ( out->AMG_C == NULL ) {
316     /* merge the system matrix into 1 rank when
317     it's not suitable coarsening due to the
318     total number of unknowns are too small */
319     out->A_C=A_C;
320 gross 3283 out->reordering = options->reordering;
321 lgao 3827 out->refinements = options->coarse_matrix_refinements;
322     out->verbose = verbose;
323     out->options_smoother = options->smoother;
324     } else {
325 gross 3283 /* finally we set some helpers for the solver step */
326     out->A_C=A_C;
327 lgao 3827 }
328 gross 3283 }
329     }
330     }
331     TMPMEMFREE(mask_C);
332     TMPMEMFREE(rows_in_F);
333     }
334     }
335 gross 3445
336 gross 3193 }
337     TMPMEMFREE(counter);
338 gross 3440 TMPMEMFREE(F_marker);
339     TMPMEMFREE(degree_S);
340 gross 3445 TMPMEMFREE(offset_S);
341 gross 3303 TMPMEMFREE(S);
342 gross 3482 TMPMEMFREE(degree_ST);
343     TMPMEMFREE(offset_ST);
344     TMPMEMFREE(ST);
345    
346 gross 3445 }
347 gross 3193
348 jfenwick 3259 if (Esys_noError()) {
349 gross 3193 return out;
350     } else {
351 gross 3441 Paso_Preconditioner_AMG_free(out);
352 gross 3193 return NULL;
353     }
354     }
355    
356    
357 gross 3441 void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
358 lgao 3827 const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
359 gross 3283 double time0=0;
360     const dim_t post_sweeps=amg->post_sweeps;
361     const dim_t pre_sweeps=amg->pre_sweeps;
362 gross 3193
363 gross 3283 /* presmoothing */
364     time0=Esys_timer();
365 gross 3442 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
366 lgao 3827
367 gross 3283 time0=Esys_timer()-time0;
368 caltinay 3642 if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
369 gross 3283 /* end of presmoothing */
370    
371 gross 3886 time0=Esys_timer();
372 lgao 3827
373 gross 3886 Paso_Copy(n, amg->r, b); /* r <- b */
374     Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
375     Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */
376 lgao 3827
377 gross 3886 time0=Esys_timer()-time0;
378     /* coarse level solve */
379     if ( amg->AMG_C == NULL) {
380 gross 3283 time0=Esys_timer();
381     /* A_C is the coarsest level */
382 lgao 3827 Paso_Preconditioner_AMG_mergeSolve(amg);
383    
384 gross 3283 if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
385 gross 3886 } else {
386 gross 3441 Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */
387 gross 3886 }
388 lgao 3827
389 gross 3886 time0=time0+Esys_timer();
390     Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */
391 lgao 3827
392 gross 3886 /*postsmoothing*/
393 gross 3283
394 gross 3886 /*solve Ax=b with initial guess x */
395     time0=Esys_timer();
396     Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
397     time0=Esys_timer()-time0;
398     if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
399     return;
400 gross 3283 }
401 gross 3193
402 gross 3283 /* theta = threshold for strong connections */
403     /* tau = threshold for diagonal dominance */
404 gross 3193
405 gross 3303 /*S_i={j \in N_i; i strongly coupled to j}
406    
407     in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
408     */
409    
410 gross 3441 void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
411 gross 3445 dim_t *degree_S, index_t *offset_S, index_t *S,
412 gross 3303 const double theta, const double tau)
413 gross 3283 {
414 gross 3482
415     const dim_t my_n=A->mainBlock->numRows;
416     const dim_t overlap_n=A->row_coupleBlock->numRows;
417    
418 gross 3445 index_t iptr, i;
419 gross 3482 double *threshold_p=NULL;
420 gross 3193
421 gross 3482 threshold_p = TMPMEMALLOC(2*my_n, double);
422    
423     #pragma omp parallel for private(i,iptr) schedule(static)
424     for (i=0;i<my_n;++i) {
425    
426 gross 3445 register double max_offdiagonal = 0.;
427     register double sum_row=0;
428     register double main_row=0;
429 gross 3451 register dim_t kdeg=0;
430 caltinay 3489 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
431 lgao 3827
432 gross 3482
433     /* collect information for row i: */
434 gross 3283 #pragma ivdep
435 gross 3445 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
436     register index_t j=A->mainBlock->pattern->index[iptr];
437     register double fnorm=ABS(A->mainBlock->val[iptr]);
438 gross 3283 if( j != i) {
439     max_offdiagonal = MAX(max_offdiagonal,fnorm);
440     sum_row+=fnorm;
441     } else {
442     main_row=fnorm;
443     }
444 gross 3445
445 gross 3283 }
446 lgao 3827
447 gross 3445 #pragma ivdep
448     for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
449     register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
450 lgao 3827
451 gross 3445 max_offdiagonal = MAX(max_offdiagonal,fnorm);
452     sum_row+=fnorm;
453     }
454    
455 gross 3482 /* inspect row i: */
456 gross 3451 {
457     const double threshold = theta*max_offdiagonal;
458 gross 3482 threshold_p[2*i+1]=threshold;
459 caltinay 3642 if (tau*main_row < sum_row) { /* no diagonal dominance */
460 gross 3482 threshold_p[2*i]=1;
461 gross 3451 #pragma ivdep
462     for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
463     register index_t j=A->mainBlock->pattern->index[iptr];
464     if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
465     S[koffset+kdeg] = j;
466     kdeg++;
467     }
468 gross 3283 }
469 lgao 3828 #pragma ivdep
470 gross 3451 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
471     register index_t j=A->col_coupleBlock->pattern->index[iptr];
472     if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
473     S[koffset+kdeg] = j + my_n;
474     kdeg++;
475     }
476 gross 3445 }
477 gross 3482 } else {
478     threshold_p[2*i]=-1;
479 gross 3451 }
480 gross 3440 }
481 gross 3445 offset_S[i]=koffset;
482 gross 3440 degree_S[i]=kdeg;
483 gross 3283 }
484 lgao 3827
485 gross 3482 /* now we need to distribute the threshold and the diagonal dominance indicator */
486     if (A->mpi_info->size > 1) {
487    
488     const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
489     -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
490    
491     double *remote_threshold=NULL;
492    
493     Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
494     Paso_Coupler_startCollect(threshold_coupler,threshold_p);
495     Paso_Coupler_finishCollect(threshold_coupler);
496     remote_threshold=threshold_coupler->recv_buffer;
497    
498     #pragma omp parallel for private(i,iptr) schedule(static)
499     for (i=0; i<overlap_n; i++) {
500     const double threshold = remote_threshold[2*i+1];
501     register dim_t kdeg=0;
502 lgao 3827 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
503 gross 3482 if (remote_threshold[2*i]>0) {
504 lgao 3827 #pragma ivdep
505     for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
506 gross 3482 register index_t j=A->row_coupleBlock->pattern->index[iptr];
507     if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
508     S[koffset+kdeg] = j ;
509     kdeg++;
510     }
511 lgao 3827 }
512 gross 3482
513 lgao 3827 #pragma ivdep
514     for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
515     register index_t j=A->remote_coupleBlock->pattern->index[iptr];
516     if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
517     S[koffset+kdeg] = j + my_n;
518     kdeg++;
519     }
520     }
521 gross 3482 }
522     offset_S[i+my_n]=koffset;
523     degree_S[i+my_n]=kdeg;
524     }
525    
526     Paso_Coupler_free(threshold_coupler);
527     }
528     TMPMEMFREE(threshold_p);
529 gross 3283 }
530 gross 3193
531 gross 3283 /* theta = threshold for strong connections */
532     /* tau = threshold for diagonal dominance */
533 gross 3303 /*S_i={j \in N_i; i strongly coupled to j}
534 gross 3283
535 gross 3303 in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
536     */
537 gross 3441 void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
538 gross 3445 dim_t *degree_S, index_t *offset_S, index_t *S,
539 gross 3303 const double theta, const double tau)
540    
541 gross 3283 {
542 lgao 3827 const dim_t block_size=A->block_size;
543 gross 3482 const dim_t my_n=A->mainBlock->numRows;
544     const dim_t overlap_n=A->row_coupleBlock->numRows;
545    
546 gross 3445 index_t iptr, i, bi;
547 gross 3482 double *threshold_p=NULL;
548 gross 3283
549    
550 gross 3482 threshold_p = TMPMEMALLOC(2*my_n, double);
551 lgao 3827
552     #pragma omp parallel private(i,iptr,bi)
553 gross 3482 {
554    
555     dim_t max_deg=0;
556     double *rtmp=NULL;
557 gross 3451
558 gross 3482 #pragma omp for schedule(static)
559     for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
560     +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
561 gross 3193
562 gross 3482 rtmp=TMPMEMALLOC(max_deg, double);
563 gross 3283
564 gross 3482 #pragma omp for schedule(static)
565     for (i=0;i<my_n;++i) {
566     register double max_offdiagonal = 0.;
567     register double sum_row=0;
568     register double main_row=0;
569     register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
570     register dim_t kdeg=0;
571 caltinay 3489 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
572 gross 3193
573 gross 3482 /* collect information for row i: */
574     for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
575     register index_t j=A->mainBlock->pattern->index[iptr];
576     register double fnorm=0;
577     #pragma ivdep
578 lgao 3827 for(bi=0;bi<block_size;++bi) {
579     register double rtmp2= A->mainBlock->val[iptr*block_size+bi];
580 gross 3482 fnorm+=rtmp2*rtmp2;
581 gross 3283 }
582 gross 3482 fnorm=sqrt(fnorm);
583     rtmp[iptr+rtmp_offset]=fnorm;
584    
585     if( j != i) {
586 gross 3445 max_offdiagonal = MAX(max_offdiagonal,fnorm);
587     sum_row+=fnorm;
588 gross 3482 } else {
589     main_row=fnorm;
590 gross 3445 }
591 gross 3482
592     }
593    
594     rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
595     for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
596     register double fnorm=0;
597     #pragma ivdep
598 lgao 3827 for(bi=0;bi<block_size;++bi) {
599     register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
600 gross 3482 fnorm+=rtmp2*rtmp2;
601 gross 3283 }
602 gross 3482 fnorm=sqrt(fnorm);
603    
604     rtmp[iptr+rtmp_offset]=fnorm;
605     max_offdiagonal = MAX(max_offdiagonal,fnorm);
606     sum_row+=fnorm;
607 gross 3283 }
608 gross 3482
609    
610     /* inspect row i: */
611     {
612     const double threshold = theta*max_offdiagonal;
613     rtmp_offset=-A->mainBlock->pattern->ptr[i];
614    
615     threshold_p[2*i+1]=threshold;
616 caltinay 3642 if (tau*main_row < sum_row) { /* no diagonal dominance */
617 gross 3482 threshold_p[2*i]=1;
618     #pragma ivdep
619     for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
620     register index_t j=A->mainBlock->pattern->index[iptr];
621     if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
622     S[koffset+kdeg] = j;
623     kdeg++;
624 gross 3351 }
625 gross 3283 }
626 gross 3482 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
627     #pragma ivdep
628     for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
629     register index_t j=A->col_coupleBlock->pattern->index[iptr];
630     if( rtmp[iptr+rtmp_offset] >threshold) {
631     S[koffset+kdeg] = j + my_n;
632     kdeg++;
633 gross 3351 }
634     }
635 gross 3482 } else {
636     threshold_p[2*i]=-1;
637     }
638     }
639     offset_S[i]=koffset;
640     degree_S[i]=kdeg;
641     }
642     TMPMEMFREE(rtmp);
643     }
644     /* now we need to distribute the threshold and the diagonal dominance indicator */
645     if (A->mpi_info->size > 1) {
646    
647     const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
648     -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
649    
650     double *remote_threshold=NULL;
651    
652     Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
653     Paso_Coupler_startCollect(threshold_coupler,threshold_p);
654     Paso_Coupler_finishCollect(threshold_coupler);
655     remote_threshold=threshold_coupler->recv_buffer;
656    
657     #pragma omp parallel for private(i,iptr) schedule(static)
658     for (i=0; i<overlap_n; i++) {
659    
660     const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
661     register dim_t kdeg=0;
662 lgao 3827 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
663 gross 3482 if (remote_threshold[2*i]>0) {
664     #pragma ivdep
665     for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
666     register index_t j=A->row_coupleBlock->pattern->index[iptr];
667     register double fnorm2=0;
668     #pragma ivdepremote_threshold[2*i]
669 lgao 3827 for(bi=0;bi<block_size;++bi) {
670     register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
671 gross 3482 fnorm2+=rtmp2*rtmp2;
672 gross 3351 }
673 gross 3482
674     if(fnorm2 > threshold2 ) {
675     S[koffset+kdeg] = j ;
676     kdeg++;
677 gross 3351 }
678 gross 3283 }
679 lgao 3827
680     #pragma ivdep
681     for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
682     register index_t j=A->remote_coupleBlock->pattern->index[iptr];
683     register double fnorm2=0;
684     #pragma ivdepremote_threshold[2*i]
685     for(bi=0;bi<block_size;++bi) {
686     register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
687     fnorm2+=rtmp2*rtmp2;
688     }
689     if(fnorm2 > threshold2 && i != j) {
690     S[koffset+kdeg] = j + my_n;
691     kdeg++;
692     }
693     }
694 gross 3351
695 gross 3283 }
696 gross 3482 offset_S[i+my_n]=koffset;
697     degree_S[i+my_n]=kdeg;
698 gross 3283 }
699 gross 3482 Paso_Coupler_free(threshold_coupler);
700 gross 3283 }
701 gross 3482 TMPMEMFREE(threshold_p);
702 gross 3193 }
703 caltinay 3642
704 gross 3482 void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
705     const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
706 gross 3440 {
707 gross 3482 index_t i, j;
708     dim_t p;
709     dim_t len=0;
710 gross 3485 #pragma omp parallel for private(i) schedule(static)
711 gross 3482 for (i=0; i<nT ;++i) {
712     degree_ST[i]=0;
713     }
714     for (i=0; i<n ;++i) {
715     for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
716     }
717     for (i=0; i<nT ;++i) {
718     offset_ST[i]=len;
719     len+=degree_ST[i];
720     degree_ST[i]=0;
721     }
722     for (i=0; i<n ;++i) {
723     for (p=0; p<degree_S[i]; ++p) {
724     j=S[offset_S[i]+p];
725     ST[offset_ST[j]+degree_ST[j]]=i;
726     degree_ST[j]++;
727     }
728     }
729 gross 3440 }
730 gross 3442
731 lgao 3827 int compareindex(const void *a, const void *b)
732     {
733     return (*(int *)a - *(int *)b);
734     }
735    
736 gross 3482 void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
737     const dim_t* degree_S, const index_t* offset_S, const index_t* S,
738     const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
739     Paso_Connector* col_connector, Paso_Distribution* col_dist)
740 gross 3442 {
741 gross 3482 dim_t i, numUndefined, iter=0;
742     index_t iptr, jptr, kptr;
743     double *random=NULL, *w=NULL, *Status=NULL;
744     index_t * ST_flag=NULL;
745 gross 3445
746 gross 3482 Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector ,1);
747    
748     w=TMPMEMALLOC(n, double);
749     Status=TMPMEMALLOC(n, double);
750     random = Paso_Distribution_createRandomVector(col_dist,1);
751     ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
752 lgao 3827
753 gross 3482 #pragma omp parallel for private(i)
754     for (i=0; i< my_n; ++i) {
755     w[i]=degree_ST[i]+random[i];
756     if (degree_ST[i] < 1) {
757     Status[i]=-100; /* F point */
758     } else {
759     Status[i]=1; /* status undefined */
760     }
761     }
762 lgao 3827
763 gross 3482 #pragma omp parallel for private(i, iptr)
764     for (i=0; i< n; ++i) {
765     for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
766     ST_flag[offset_ST[i]+iptr]=1;
767     }
768     }
769    
770 gross 3445
771 gross 3482 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
772 gross 3884 /* printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined); */
773 gross 3482 iter=0;
774     while (numUndefined > 0) {
775     Paso_Coupler_fillOverlap(n, w, w_coupler);
776 lgao 3827
777 caltinay 3642 /* calculate the maximum value of neighbours following active strong connections:
778     w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active */
779 gross 3482 #pragma omp parallel for private(i, iptr)
780     for (i=0; i<my_n; ++i) {
781     if (Status[i]>0) { /* status is still undefined */
782 gross 3449
783 gross 3482 register bool_t inD=TRUE;
784     const double wi=w[i];
785    
786     for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
787     const index_t k=S[offset_S[i]+iptr];
788     const index_t* start_p = &ST[offset_ST[k]];
789     const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
790    
791 lgao 3827 if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
792 gross 3482 if (wi <= w[k] ) {
793     inD=FALSE;
794     break;
795     }
796     }
797    
798     }
799    
800     if (inD) {
801     for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
802     const index_t k=ST[offset_ST[i]+iptr];
803     if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
804     if (wi <= w[k] ) {
805     inD=FALSE;
806     break;
807     }
808     }
809     }
810     }
811     if (inD) {
812     Status[i]=0.; /* is in D */
813     }
814     }
815    
816 gross 3449 }
817    
818 gross 3482 Paso_Coupler_fillOverlap(n, Status, w_coupler);
819 gross 3442
820    
821 gross 3482 /* remove connection to D points :
822    
823     for each i in D:
824     for each j in S_i:
825     w[j]--
826     ST_tag[j,i]=-1
827     for each j in ST[i]:
828     ST_tag[i,j]=-1
829     for each k in ST[j]:
830     if k in ST[i]:
831     w[j]--;
832     ST_tag[j,k]=-1
833    
834     */
835     /* w is updated for local rows only */
836     {
837 gross 3485 #pragma omp parallel for private(i, jptr)
838 gross 3482 for (i=0; i< my_n; ++i) {
839 gross 3445
840 gross 3482 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
841     const index_t j=ST[offset_ST[i]+jptr];
842     if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
843     w[i]--;
844     ST_flag[offset_ST[i]+jptr]=-1;
845     }
846     }
847    
848     }
849 gross 3485 #pragma omp parallel for private(i, jptr)
850 gross 3482 for (i=my_n; i< n; ++i) {
851     for (jptr=0; jptr<degree_ST[i]; ++jptr) {
852     const index_t j = ST[offset_ST[i]+jptr];
853     if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
854     }
855     }
856 gross 3442
857 gross 3482
858     for (i=0; i< n; ++i) {
859     if ( Status[i] == 0. ) {
860 gross 3445
861 gross 3482 const index_t* start_p = &ST[offset_ST[i]];
862    
863     for (jptr=0; jptr<degree_ST[i]; ++jptr) {
864     const index_t j=ST[offset_ST[i]+jptr];
865     ST_flag[offset_ST[i]+jptr]=-1;
866     for (kptr=0; kptr<degree_ST[j]; ++kptr) {
867     const index_t k=ST[offset_ST[j]+kptr];
868     if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
869     if (ST_flag[offset_ST[j]+kptr] >0) {
870     if (j< my_n ) {
871     w[j]--;
872     }
873     ST_flag[offset_ST[j]+kptr]=-1;
874     }
875     }
876     }
877     }
878     }
879     }
880     }
881 lgao 3827
882 gross 3482 /* adjust status */
883     #pragma omp parallel for private(i)
884     for (i=0; i< my_n; ++i) {
885     if ( Status[i] == 0. ) {
886     Status[i] = -10; /* this is now a C point */
887 lgao 3827 } else if (Status[i] == 1. && w[i]<1.) {
888 gross 3482 Status[i] = -100; /* this is now a F point */
889     }
890     }
891    
892 lgao 3827 i = numUndefined;
893 gross 3482 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
894 lgao 3827 if (numUndefined == i) {
895     Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
896     return;
897     }
898    
899 gross 3482 iter++;
900 gross 3884 /* printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined); */
901 lgao 3827
902 gross 3482 } /* end of while loop */
903    
904     /* map to output :*/
905     Paso_Coupler_fillOverlap(n, Status, w_coupler);
906     #pragma omp parallel for private(i)
907     for (i=0; i< n; ++i) {
908     if (Status[i] > -50.) {
909     split_marker[i]=PASO_AMG_IN_C;
910     } else {
911     split_marker[i]=PASO_AMG_IN_F;
912     }
913     }
914     /* clean up : */
915     Paso_Coupler_free(w_coupler);
916     TMPMEMFREE(random);
917     TMPMEMFREE(w);
918     TMPMEMFREE(Status);
919     TMPMEMFREE(ST_flag);
920    
921     return;
922 gross 3442 }
923 lgao 3827
924     /* Merge the system matrix which is distributed on ranks into a complete
925     matrix on rank 0, then solve this matrix on rank 0 only */
926     Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
927 gross 3885 index_t i, iptr, j, n, remote_n, global_n, len, offset, tag;
928 lgao 3827 index_t row_block_size, col_block_size, block_size;
929     index_t size=A->mpi_info->size;
930     index_t rank=A->mpi_info->rank;
931     index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
932     index_t *temp_n=NULL, *temp_len=NULL;
933     double *val=NULL;
934     Paso_Pattern *pattern=NULL;
935     Paso_SparseMatrix *out=NULL;
936     #ifdef ESYS_MPI
937     MPI_Request* mpi_requests=NULL;
938     MPI_Status* mpi_stati=NULL;
939     #else
940     int *mpi_requests=NULL, *mpi_stati=NULL;
941     #endif
942    
943     if (size == 1) {
944     n = A->mainBlock->numRows;
945     ptr = TMPMEMALLOC(n, index_t);
946     #pragma omp parallel for private(i)
947     for (i=0; i<n; i++) ptr[i] = i;
948     out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
949     TMPMEMFREE(ptr);
950     return out;
951     }
952    
953     n = A->mainBlock->numRows;
954     block_size = A->block_size;
955    
956     /* Merge MainBlock and CoupleBlock to get a complete column entries
957     for each row allocated to current rank. Output (ptr, idx, val)
958     contains all info needed from current rank to merge a system
959     matrix */
960     Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
961    
962     #ifdef ESYS_MPI
963     mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
964     mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
965     #else
966     mpi_requests=TMPMEMALLOC(size*2,int);
967     mpi_stati=TMPMEMALLOC(size*2,int);
968     #endif
969    
970     /* Now, pass all info to rank 0 and merge them into one sparse
971     matrix */
972     if (rank == 0) {
973     /* First, copy local ptr values into ptr_global */
974 gross 3885 global_n=Paso_SystemMatrix_getGlobalNumRows(A);
975     ptr_global = MEMALLOC(global_n+1, index_t);
976 lgao 3827 memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
977     iptr = n+1;
978     MEMFREE(ptr);
979     temp_n = TMPMEMALLOC(size, index_t);
980     temp_len = TMPMEMALLOC(size, index_t);
981     temp_n[0] = iptr;
982    
983     /* Second, receive ptr values from other ranks */
984     for (i=1; i<size; i++) {
985     remote_n = A->row_distribution->first_component[i+1] -
986     A->row_distribution->first_component[i];
987 lgao 3828 #ifdef ESYS_MPI
988 lgao 3827 MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
989     A->mpi_info->msg_tag_counter+i,
990     A->mpi_info->comm,
991     &mpi_requests[i]);
992 lgao 3828 #endif
993 lgao 3827 temp_n[i] = remote_n;
994     iptr += remote_n;
995     }
996 lgao 3828 #ifdef ESYS_MPI
997 lgao 3827 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
998 lgao 3828 #endif
999 lgao 3827 A->mpi_info->msg_tag_counter += size;
1000    
1001     /* Then, prepare to receive idx and val from other ranks */
1002     len = 0;
1003     offset = -1;
1004     for (i=0; i<size; i++) {
1005     if (temp_n[i] > 0) {
1006     offset += temp_n[i];
1007     len += ptr_global[offset];
1008     temp_len[i] = ptr_global[offset];
1009     }else
1010     temp_len[i] = 0;
1011     }
1012    
1013     idx_global = MEMALLOC(len, index_t);
1014     iptr = temp_len[0];
1015     offset = n+1;
1016     for (i=1; i<size; i++) {
1017     len = temp_len[i];
1018 lgao 3828 #ifdef ESYS_MPI
1019 lgao 3827 MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1020     A->mpi_info->msg_tag_counter+i,
1021     A->mpi_info->comm,
1022     &mpi_requests[i]);
1023 lgao 3828 #endif
1024 lgao 3827 remote_n = temp_n[i];
1025     for (j=0; j<remote_n; j++) {
1026     ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1027     }
1028     offset += remote_n;
1029     iptr += len;
1030     }
1031     memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1032     MEMFREE(idx);
1033     row_block_size = A->mainBlock->row_block_size;
1034     col_block_size = A->mainBlock->col_block_size;
1035 lgao 3828 #ifdef ESYS_MPI
1036 lgao 3827 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1037 lgao 3828 #endif
1038 lgao 3827 A->mpi_info->msg_tag_counter += size;
1039     TMPMEMFREE(temp_n);
1040    
1041     /* Then generate the sparse matrix */
1042 gross 3885 pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, global_n,
1043     global_n, ptr_global, idx_global);
1044 lgao 3827 out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1045     row_block_size, col_block_size, FALSE);
1046     Paso_Pattern_free(pattern);
1047    
1048     /* Finally, receive and copy the value */
1049     iptr = temp_len[0] * block_size;
1050     for (i=1; i<size; i++) {
1051     len = temp_len[i];
1052 lgao 3828 #ifdef ESYS_MPI
1053 lgao 3827 MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1054     A->mpi_info->msg_tag_counter+i,
1055     A->mpi_info->comm,
1056     &mpi_requests[i]);
1057 lgao 3828 #endif
1058 lgao 3827 iptr += (len * block_size);
1059     }
1060     memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1061     MEMFREE(val);
1062 lgao 3828 #ifdef ESYS_MPI
1063 lgao 3827 MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1064 lgao 3828 #endif
1065 lgao 3827 A->mpi_info->msg_tag_counter += size;
1066     TMPMEMFREE(temp_len);
1067     } else { /* it's not rank 0 */
1068    
1069     /* First, send out the local ptr */
1070     tag = A->mpi_info->msg_tag_counter+rank;
1071 lgao 3828 #ifdef ESYS_MPI
1072 lgao 3827 MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1073     &mpi_requests[0]);
1074 lgao 3828 #endif
1075 lgao 3827
1076     /* Next, send out the local idx */
1077     len = ptr[n];
1078     tag += size;
1079 lgao 3828 #ifdef ESYS_MPI
1080 lgao 3827 MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1081     &mpi_requests[1]);
1082 lgao 3828 #endif
1083 lgao 3827
1084     /* At last, send out the local val */
1085     len *= block_size;
1086     tag += size;
1087 lgao 3828 #ifdef ESYS_MPI
1088 lgao 3827 MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1089     &mpi_requests[2]);
1090    
1091     MPI_Waitall(3, mpi_requests, mpi_stati);
1092 lgao 3828 #endif
1093 lgao 3827 A->mpi_info->msg_tag_counter = tag + size - rank;
1094     MEMFREE(ptr);
1095     MEMFREE(idx);
1096     MEMFREE(val);
1097    
1098     out = NULL;
1099     }
1100    
1101     TMPMEMFREE(mpi_requests);
1102     TMPMEMFREE(mpi_stati);
1103     return out;
1104     }
1105    
1106    
1107     void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1108     Paso_SystemMatrix *A = amg->A_C;
1109     Paso_SparseMatrix *A_D, *A_temp;
1110     double* x=NULL;
1111     double* b=NULL;
1112 gross 3886 const index_t rank = A->mpi_info->rank;
1113     const index_t size = A->mpi_info->size;
1114     const n_block = A->mainBlock->row_block_size;
1115    
1116     index_t i, n, p;
1117 lgao 3827 index_t *counts, *offset, *dist;
1118 gross 3835 #ifdef ESYS_MPI
1119     index_t count;
1120     #endif
1121 lgao 3827 A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1122    
1123     /* First, gather x and b into rank 0 */
1124     dist = A->pattern->input_distribution->first_component;
1125     n = Paso_SystemMatrix_getGlobalNumRows(A);
1126     b = TMPMEMALLOC(n*n_block, double);
1127     x = TMPMEMALLOC(n*n_block, double);
1128     counts = TMPMEMALLOC(size, index_t);
1129     offset = TMPMEMALLOC(size, index_t);
1130    
1131     #pragma omp parallel for private(i,p)
1132     for (i=0; i<size; i++) {
1133     p = dist[i];
1134     counts[i] = (dist[i+1] - p)*n_block;
1135     offset[i] = p*n_block;
1136     }
1137 lgao 3828 #ifdef ESYS_MPI
1138 gross 3835 count = counts[rank];
1139     MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1140     MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1141 lgao 3828 #endif
1142 lgao 3827
1143     if (rank == 0) {
1144     /* solve locally */
1145     #ifdef MKL
1146     A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1147     A_temp->solver_package = PASO_MKL;
1148     Paso_SparseMatrix_free(A_D);
1149     Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1150     Paso_SparseMatrix_free(A_temp);
1151     #else
1152     #ifdef UMFPACK
1153     A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1154     A_temp->solver_package = PASO_UMFPACK;
1155     Paso_SparseMatrix_free(A_D);
1156     Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1157     Paso_SparseMatrix_free(A_temp);
1158     #else
1159     A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1160     A_D->solver_package = PASO_SMOOTHER;
1161     Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1162     Paso_SparseMatrix_free(A_D);
1163     #endif
1164     #endif
1165     }
1166    
1167 lgao 3828 #ifdef ESYS_MPI
1168 lgao 3827 /* now we need to distribute the solution to all ranks */
1169     MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1170 lgao 3828 #endif
1171 lgao 3827
1172     TMPMEMFREE(x);
1173     TMPMEMFREE(b);
1174     TMPMEMFREE(counts);
1175     TMPMEMFREE(offset);
1176     }

  ViewVC Help
Powered by ViewVC 1.1.26