/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Diff of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3642 by caltinay, Thu Oct 27 03:41:51 2011 UTC revision 3827 by lgao, Tue Feb 14 11:42:08 2012 UTC
# Line 23  Line 23 
23  /**************************************************************/  /**************************************************************/
24    
25  #define SHOW_TIMING FALSE  #define SHOW_TIMING FALSE
26    #define MY_DEBUG 0
27    #define MY_DEBUG1 1
28    
29  #include "Paso.h"  #include "Paso.h"
30  #include "Preconditioner.h"  #include "Preconditioner.h"
# Line 89  dim_t Paso_Preconditioner_AMG_getNumCoar Line 91  dim_t Paso_Preconditioner_AMG_getNumCoar
91  Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {  Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92    
93    Paso_Preconditioner_AMG* out=NULL;    Paso_Preconditioner_AMG* out=NULL;
94      Paso_SystemMatrix *A_C;
95    bool_t verbose=options->verbose;    bool_t verbose=options->verbose;
96    
97    const dim_t my_n=A_p->mainBlock->numRows;    const dim_t my_n=A_p->mainBlock->numRows;
# Line 97  Paso_Preconditioner_AMG* Paso_Preconditi Line 100  Paso_Preconditioner_AMG* Paso_Preconditi
100    const dim_t n = my_n + overlap_n;    const dim_t n = my_n + overlap_n;
101    
102    const dim_t n_block=A_p->row_block_size;    const dim_t n_block=A_p->row_block_size;
103    index_t* F_marker=NULL, *counter=NULL;  //  const dim_t n_block=A_p->block_size;
104    dim_t i;    index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
105      dim_t i, n_F, n_C, F_flag, *F_set=NULL;
106    double time0=0;    double time0=0;
107    const double theta = options->coarsening_threshold;    const double theta = options->coarsening_threshold;
108    const double tau = options->diagonal_dominance_threshold;    const double tau = options->diagonal_dominance_threshold;
109    const double sparsity=Paso_SystemMatrix_getSparsity(A_p);    const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
110    const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);    const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);
111      const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
112    
113    
     
114    /*    /*
115        is the input matrix A suitable for coarsening?        is the input matrix A suitable for coarsening?
116                
117    */    */
118    if ( (sparsity >= options->min_coarse_sparsity) ||    if ( (sparsity >= options->min_coarse_sparsity) ||
119         (total_n <= options->min_coarse_matrix_size) ||  //       (total_n <= options->min_coarse_matrix_size) ||
120           (global_n <= options->min_coarse_matrix_size) ||
121         (level > options->level_max) ) {         (level > options->level_max) ) {
122    
123          if (verbose) {          if (verbose) {
# Line 144  Paso_Preconditioner_AMG* Paso_Preconditi Line 150  Paso_Preconditioner_AMG* Paso_Preconditi
150       /* Start Coarsening : */       /* Start Coarsening : */
151            
152       /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */       /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
153       const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len;       const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len  + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
154    
155       dim_t* degree_S=TMPMEMALLOC(n, dim_t);       dim_t* degree_S=TMPMEMALLOC(n, dim_t);
156       index_t *offset_S=TMPMEMALLOC(n, index_t);       index_t *offset_S=TMPMEMALLOC(n, index_t);
# Line 163  Paso_Preconditioner_AMG* Paso_Preconditi Line 169  Paso_Preconditioner_AMG* Paso_Preconditi
169             make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical             make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
170      */      */
171          Paso_SystemMatrix_copyColCoupleBlock(A_p);          Paso_SystemMatrix_copyColCoupleBlock(A_p);
172          /* Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE); */          Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
173    
174      /*      /*
175            set splitting of unknows:            set splitting of unknows:
# Line 176  Paso_Preconditioner_AMG* Paso_Preconditi Line 182  Paso_Preconditioner_AMG* Paso_Preconditi
182             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
183       }       }
184       Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);       Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
185        //   Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
186  {  
    dim_t p;  
    for (i=0; i<n; ++i) {  
          printf("%d: ",i);  
         for (p=0; p<degree_S[i];++p) printf("%d ",S[offset_S[i]+p]);  
         printf("\n");  
    }  
 }  
187       Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,       Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
188                              degree_S, offset_S, S, degree_ST, offset_ST, ST,                              degree_S, offset_S, S, degree_ST, offset_ST, ST,
189                          A_p->col_coupler->connector,A_p->col_distribution);                          A_p->col_coupler->connector,A_p->col_distribution);
 Esys_setError(SYSTEM_ERROR, "AMG:DONE.");  
 return NULL;  
190                
191    
 /*MPI:  
      Paso_Preconditioner_AMG_RungeStuebenSearch(n, A_p->pattern->ptr, degree_S, S, F_marker, options->usePanel);  
 */  
       
192           /* in BoomerAMG if interpolation is used FF connectivity is required */           /* in BoomerAMG if interpolation is used FF connectivity is required */
193  /*MPI:  /*MPI:
194           if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)           if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
# Line 203  return NULL; Line 196  return NULL;
196  */  */
197    
198       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
   
 #ifdef AAAAA  
199       if (Esys_noError() ) {       if (Esys_noError() ) {
200          #pragma omp parallel for private(i) schedule(static)          #pragma omp parallel for private(i) schedule(static)
201          for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] ==  PASO_AMG_IN_F);          for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] ==  PASO_AMG_IN_F);
# Line 214  return NULL; Line 205  return NULL;
205          */          */
206          n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);          n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
207          n_C=n-n_F;          n_C=n-n_F;
208          if (verbose) printf("Paso_Preconditioner: AMG level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);          if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);
209        
210          if ( n_F == 0 ) {  /* this is a nasty case. a direct solver should be used, return NULL */              /* collect n_F values on all processes, a direct solver should
211                    be used if any n_F value is 0 */
212                F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
213                MPI_Allgather(&n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
214                F_flag = 1;
215                for (i=0; i<A_p->mpi_info->size; i++) {
216                    if (F_set[i] == 0) {
217                      F_flag = 0;
218                      break;
219                    }
220                }
221                TMPMEMFREE(F_set);
222            
223    //          if ( n_F == 0 ) {  /*  is a nasty case. a direct solver should be used, return NULL */
224                if (F_flag == 0) {
225             out = NULL;             out = NULL;
226          } else {          } else {
227             out=MEMALLOC(1,Paso_Preconditioner_AMG);             out=MEMALLOC(1,Paso_Preconditioner_AMG);
# Line 228  return NULL; Line 233  return NULL;
233            out->A_C = NULL;            out->A_C = NULL;
234            out->P = NULL;              out->P = NULL;  
235            out->R = NULL;                      out->R = NULL;          
236            out->post_sweeps = options->post_sweeps;  //        out->post_sweeps = options->post_sweeps;
237            out->pre_sweeps  = options->pre_sweeps;  //        out->pre_sweeps  = options->pre_sweeps;
238              out->post_sweeps = 2;
239              out->pre_sweeps = 2;
240            out->r = NULL;            out->r = NULL;
241            out->x_C = NULL;            out->x_C = NULL;
242            out->b_C = NULL;            out->b_C = NULL;
# Line 242  return NULL; Line 249  return NULL;
249             Esys_checkPtr(rows_in_F);             Esys_checkPtr(rows_in_F);
250             if ( Esys_noError() ) {             if ( Esys_noError() ) {
251    
252            out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), verbose);            out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
253                
254            if (n_C != 0) {            if (n_C != 0) {
255                 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */                 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
# Line 266  return NULL; Line 273  return NULL;
273                    }                    }
274                 }                 }
275                 /*  create mask of C nodes with value >-1, gives new id */                 /*  create mask of C nodes with value >-1, gives new id */
276                 i=Paso_Util_cumsum_maskedFalse(n,counter, F_marker);                 i=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
   
                #pragma omp parallel for private(i) schedule(static)  
                for (i = 0; i < n; ++i) {  
                   if  (F_marker[i]) {  
                  mask_C[i]=-1;  
                   } else {  
                  mask_C[i]=counter[i];;  
                   }  
                }  
277                 /*                 /*
278                    get Prolongation :                        get Prolongation :    
279                 */                                   */                  
280  /*MPI:  
281                 time0=Esys_timer();                 time0=Esys_timer();
282                 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,A_p->pattern->ptr, degree_S,S,n_C,mask_C, options->interpolation_method);  
283                 if (SHOW_TIMING) printf("timing: level %d: getProlongation: %e\n",level, Esys_timer()-time0);                 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
284  */  
285              }              }
286    
287              /*                    /*      
288                 construct Restriction operator as transposed of Prolongation operator:                 construct Restriction operator as transposed of Prolongation operator:
289              */              */
290  /*MPI:  
291              if ( Esys_noError()) {              if ( Esys_noError()) {
292                 time0=Esys_timer();                 time0=Esys_timer();
293                 out->R=Paso_SystemMatrix_getTranspose(out->P);  
294                   out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
295    
296                 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);                 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
297              }                    }      
298  */              /*
299              /*                          construct coarse level matrix:
300              construct coarse level matrix:                          */
301              */                          if ( Esys_noError()) {
302  /*MPI:                             time0=Esys_timer();
             if ( Esys_noError()) {  
                time0=Esys_timer();  
                Atemp=Paso_SystemMatrix_MatrixMatrix(A_p,out->P);  
                A_C=Paso_SystemMatrix_MatrixMatrix(out->R,Atemp);  
                Paso_Preconditioner_AMG_setStrongConnections  
                Paso_SystemMatrix_free(Atemp);  
303    
304                 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);                                         A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
305              }  
306  */                             if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
307                            }
308    
               
309              /*              /*
310                 constructe courser level:                 constructe courser level:
311                                
# Line 318  return NULL; Line 313  return NULL;
313              if ( Esys_noError()) {              if ( Esys_noError()) {
314                 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);                 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
315              }              }
316    
317              if ( Esys_noError()) {              if ( Esys_noError()) {
318                 if ( out->AMG_C == NULL ) {                if ( out->AMG_C == NULL ) {
319                      /* merge the system matrix into 1 rank when
320                     it's not suitable coarsening due to the
321                     total number of unknowns are too small */
322                      out->A_C=A_C;
323                    out->reordering = options->reordering;                    out->reordering = options->reordering;
324                    out->refinements = options->coarse_matrix_refinements;                                out->refinements = options->coarse_matrix_refinements;
325                    /* no coarse level matrix has been constructed. use direct solver */                    out->verbose = verbose;
326                    #ifdef MKL                    out->options_smoother = options->smoother;
327                      out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_C);                } else {
                     Paso_SystemMatrix_free(A_C);  
                     out->A_C->solver_package = PASO_MKL;  
                     if (verbose) printf("Paso_Preconditioner: AMG: use MKL direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                   #else  
                     #ifdef UMFPACK  
                        out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_C);  
                        Paso_SystemMatrix_free(A_C);  
                        out->A_C->solver_package = PASO_UMFPACK;  
                        if (verbose) printf("Paso_Preconditioner: AMG: use UMFPACK direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                     #else  
                        out->A_C=A_C;  
                        out->A_C->solver_p=Paso_Preconditioner_Smoother_alloc(out->A_C, (options->smoother == PASO_JACOBI), verbose);  
                        out->A_C->solver_package = PASO_SMOOTHER;  
                        if (verbose) printf("Paso_Preconditioner: AMG: use smoother on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                     #endif  
                   #endif  
                } else {  
328                    /* finally we set some helpers for the solver step */                    /* finally we set some helpers for the solver step */
329                    out->A_C=A_C;                    out->A_C=A_C;
330                 }                }
331              }                      }        
332            }            }
333             }             }
# Line 352  return NULL; Line 335  return NULL;
335             TMPMEMFREE(rows_in_F);             TMPMEMFREE(rows_in_F);
336          }          }
337       }       }
 #endif  
338    
339    }    }
340    TMPMEMFREE(counter);    TMPMEMFREE(counter);
# Line 376  return NULL; Line 358  return NULL;
358    
359    
360  void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {  void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
361       const dim_t n = amg->n * amg->n_block;       const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
362       double time0=0;       double time0=0;
363       const dim_t post_sweeps=amg->post_sweeps;       const dim_t post_sweeps=amg->post_sweeps;
364       const dim_t pre_sweeps=amg->pre_sweeps;       const dim_t pre_sweeps=amg->pre_sweeps;
365         int rank = A->mpi_info->rank;
366    
367       /* presmoothing */       /* presmoothing */
368       time0=Esys_timer();       time0=Esys_timer();
369       Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);       Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
370    
371       time0=Esys_timer()-time0;       time0=Esys_timer()-time0;
372       if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);       if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
373       /* end of presmoothing */       /* end of presmoothing */
374            
375       if (amg->n_F < amg->n) { /* is there work on the coarse level? */       if (amg->n_F < amg->n) { /* is there work on the coarse level? */
376           time0=Esys_timer();           time0=Esys_timer();
377    
378       Paso_Copy(n, amg->r, b);                            /*  r <- b */       Paso_Copy(n, amg->r, b);                            /*  r <- b */
379       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
380       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */
381    
382           time0=Esys_timer()-time0;           time0=Esys_timer()-time0;
383       /* coarse level solve */       /* coarse level solve */
384       if ( amg->AMG_C == NULL) {       if ( amg->AMG_C == NULL) {
385          time0=Esys_timer();          time0=Esys_timer();
386          /*  A_C is the coarsest level */          /*  A_C is the coarsest level */
387  #ifdef FIXME          Paso_Preconditioner_AMG_mergeSolve(amg);
388          switch (amg->A_C->solver_package) {  
            case (PASO_MKL):  
           Paso_MKL(amg->A_C, amg->x_C,amg->b_C, amg->reordering, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_UMFPACK):  
           Paso_UMFPACK(amg->A_C, amg->x_C,amg->b_C, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_SMOOTHER):  
           Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE);  
           break;  
         }  
 #endif  
         Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE);  
389          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
390       } else {       } else {
391          Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */          Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */
392       }         }
393    
394       time0=time0+Esys_timer();       time0=time0+Esys_timer();
395       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */           Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */    
396        
397           /*postsmoothing*/           /*postsmoothing*/
398                
399          /*solve Ax=b with initial guess x */          /*solve Ax=b with initial guess x */
# Line 427  void Paso_Preconditioner_AMG_solve(Paso_ Line 402  void Paso_Preconditioner_AMG_solve(Paso_
402          time0=Esys_timer()-time0;          time0=Esys_timer()-time0;
403          if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);          if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
404          /*end of postsmoothing*/          /*end of postsmoothing*/
       
405       }       }
406    
407       return;       return;
408  }  }
409    
# Line 450  void Paso_Preconditioner_AMG_setStrongCo Line 425  void Paso_Preconditioner_AMG_setStrongCo
425        
426     index_t iptr, i;     index_t iptr, i;
427     double *threshold_p=NULL;     double *threshold_p=NULL;
428       index_t rank=A->mpi_info->rank;
429    
430     threshold_p = TMPMEMALLOC(2*my_n, double);     threshold_p = TMPMEMALLOC(2*my_n, double);
431        
# Line 462  void Paso_Preconditioner_AMG_setStrongCo Line 437  void Paso_Preconditioner_AMG_setStrongCo
437       register double main_row=0;       register double main_row=0;
438       register dim_t kdeg=0;       register dim_t kdeg=0;
439           register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];           register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
440    
441                    
442       /* collect information for row i: */       /* collect information for row i: */
443       #pragma ivdep       #pragma ivdep
444       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
445          register index_t j=A->mainBlock->pattern->index[iptr];          register index_t j=A->mainBlock->pattern->index[iptr];
446          register double fnorm=ABS(A->mainBlock->val[iptr]);          register double fnorm=ABS(A->mainBlock->val[iptr]);
           
447          if( j != i) {          if( j != i) {
448             max_offdiagonal = MAX(max_offdiagonal,fnorm);             max_offdiagonal = MAX(max_offdiagonal,fnorm);
449             sum_row+=fnorm;             sum_row+=fnorm;
# Line 477  void Paso_Preconditioner_AMG_setStrongCo Line 452  void Paso_Preconditioner_AMG_setStrongCo
452          }          }
453    
454       }       }
455        
456       #pragma ivdep       #pragma ivdep
457       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
458          register double fnorm=ABS(A->col_coupleBlock->val[iptr]);          register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
459    
460          max_offdiagonal = MAX(max_offdiagonal,fnorm);          max_offdiagonal = MAX(max_offdiagonal,fnorm);
461          sum_row+=fnorm;          sum_row+=fnorm;
462       }       }
# Line 499  void Paso_Preconditioner_AMG_setStrongCo Line 475  void Paso_Preconditioner_AMG_setStrongCo
475               kdeg++;               kdeg++;
476                }                }
477             }             }
478             #pragma ivdep  //         #pragma ivdep
479             for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {             for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
480                register index_t j=A->col_coupleBlock->pattern->index[iptr];                register index_t j=A->col_coupleBlock->pattern->index[iptr];
481                if(ABS(A->col_coupleBlock->val[iptr])>threshold) {                if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
# Line 514  void Paso_Preconditioner_AMG_setStrongCo Line 490  void Paso_Preconditioner_AMG_setStrongCo
490           offset_S[i]=koffset;           offset_S[i]=koffset;
491       degree_S[i]=kdeg;       degree_S[i]=kdeg;
492        }        }
493    
494        /* now we need to distribute the threshold and the diagonal dominance indicator */        /* now we need to distribute the threshold and the diagonal dominance indicator */
495        if (A->mpi_info->size > 1) {        if (A->mpi_info->size > 1) {
496    
# Line 529  void Paso_Preconditioner_AMG_setStrongCo Line 506  void Paso_Preconditioner_AMG_setStrongCo
506    
507            #pragma omp parallel for private(i,iptr) schedule(static)            #pragma omp parallel for private(i,iptr) schedule(static)
508            for (i=0; i<overlap_n; i++) {            for (i=0; i<overlap_n; i++) {
           
509            const double threshold = remote_threshold[2*i+1];            const double threshold = remote_threshold[2*i+1];
510            register dim_t kdeg=0;            register dim_t kdeg=0;
511                register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i];                register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
512                if (remote_threshold[2*i]>0) {                if (remote_threshold[2*i]>0) {
513               #pragma ivdep          #pragma ivdep
514               for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
515                register index_t j=A->row_coupleBlock->pattern->index[iptr];                register index_t j=A->row_coupleBlock->pattern->index[iptr];
516                if(ABS(A->row_coupleBlock->val[iptr])>threshold) {                if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
517               S[koffset+kdeg] = j ;               S[koffset+kdeg] = j ;
518               kdeg++;               kdeg++;
519                }                }
520             }          }
521    
522            #pragma ivdep
523            for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
524              register index_t j=A->remote_coupleBlock->pattern->index[iptr];
525              if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
526                  S[koffset+kdeg] = j + my_n;
527                  kdeg++;
528              }
529            }
530                }                }
531                offset_S[i+my_n]=koffset;                offset_S[i+my_n]=koffset;
532            degree_S[i+my_n]=kdeg;            degree_S[i+my_n]=kdeg;
# Line 564  void Paso_Preconditioner_AMG_setStrongCo Line 548  void Paso_Preconditioner_AMG_setStrongCo
548                              const double theta, const double tau)                              const double theta, const double tau)
549    
550  {  {
551     const dim_t n_block=A->block_size;     const dim_t block_size=A->block_size;
552     const dim_t my_n=A->mainBlock->numRows;     const dim_t my_n=A->mainBlock->numRows;
553     const dim_t overlap_n=A->row_coupleBlock->numRows;     const dim_t overlap_n=A->row_coupleBlock->numRows;
554        
# Line 573  void Paso_Preconditioner_AMG_setStrongCo Line 557  void Paso_Preconditioner_AMG_setStrongCo
557        
558        
559     threshold_p = TMPMEMALLOC(2*my_n, double);     threshold_p = TMPMEMALLOC(2*my_n, double);
560      
561     #pragma omp parallel private(i,iptr, bi)     #pragma omp parallel private(i,iptr,bi)
562     {     {
563        
564        dim_t max_deg=0;        dim_t max_deg=0;
# Line 600  void Paso_Preconditioner_AMG_setStrongCo Line 584  void Paso_Preconditioner_AMG_setStrongCo
584          register index_t j=A->mainBlock->pattern->index[iptr];          register index_t j=A->mainBlock->pattern->index[iptr];
585          register double fnorm=0;          register double fnorm=0;
586          #pragma ivdep          #pragma ivdep
587          for(bi=0;bi<n_block;++bi) {          for(bi=0;bi<block_size;++bi) {
588              register double  rtmp2= A->mainBlock->val[iptr*n_block+bi];              register double  rtmp2= A->mainBlock->val[iptr*block_size+bi];
589             fnorm+=rtmp2*rtmp2;             fnorm+=rtmp2*rtmp2;
590          }          }
591          fnorm=sqrt(fnorm);          fnorm=sqrt(fnorm);
# Line 620  void Paso_Preconditioner_AMG_setStrongCo Line 604  void Paso_Preconditioner_AMG_setStrongCo
604       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
605          register double fnorm=0;          register double fnorm=0;
606          #pragma ivdep          #pragma ivdep
607          for(bi=0;bi<n_block;++bi) {          for(bi=0;bi<block_size;++bi) {
608             register double rtmp2 = A->col_coupleBlock->val[iptr*n_block+bi];             register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
609             fnorm+=rtmp2*rtmp2;             fnorm+=rtmp2*rtmp2;
610          }          }
611          fnorm=sqrt(fnorm);          fnorm=sqrt(fnorm);
# Line 640  void Paso_Preconditioner_AMG_setStrongCo Line 624  void Paso_Preconditioner_AMG_setStrongCo
624          threshold_p[2*i+1]=threshold;          threshold_p[2*i+1]=threshold;
625          if (tau*main_row < sum_row) { /* no diagonal dominance */          if (tau*main_row < sum_row) { /* no diagonal dominance */
626             threshold_p[2*i]=1;             threshold_p[2*i]=1;
            rtmp_offset=-A->mainBlock->pattern->ptr[i];  
627             #pragma ivdep             #pragma ivdep
628             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
629            register index_t j=A->mainBlock->pattern->index[iptr];            register index_t j=A->mainBlock->pattern->index[iptr];
# Line 685  void Paso_Preconditioner_AMG_setStrongCo Line 668  void Paso_Preconditioner_AMG_setStrongCo
668            
669       const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];       const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
670       register dim_t kdeg=0;       register dim_t kdeg=0;
671       register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i];       register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
672       if (remote_threshold[2*i]>0) {       if (remote_threshold[2*i]>0) {
673          #pragma ivdep          #pragma ivdep
674          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
675             register index_t j=A->row_coupleBlock->pattern->index[iptr];             register index_t j=A->row_coupleBlock->pattern->index[iptr];
676             register double fnorm2=0;             register double fnorm2=0;
677             #pragma ivdepremote_threshold[2*i]             #pragma ivdepremote_threshold[2*i]
678             for(bi=0;bi<n_block;++bi) {             for(bi=0;bi<block_size;++bi) {
679            register double rtmp2 = A->row_coupleBlock->val[iptr*n_block+bi];            register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
680            fnorm2+=rtmp2*rtmp2;            fnorm2+=rtmp2*rtmp2;
681             }             }
682                        
# Line 702  void Paso_Preconditioner_AMG_setStrongCo Line 685  void Paso_Preconditioner_AMG_setStrongCo
685            kdeg++;            kdeg++;
686             }             }
687          }          }
688    
689            #pragma ivdep
690                for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
691                   register index_t j=A->remote_coupleBlock->pattern->index[iptr];
692                   register double fnorm2=0;
693                   #pragma ivdepremote_threshold[2*i]
694                   for(bi=0;bi<block_size;++bi) {
695                      register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
696                      fnorm2+=rtmp2*rtmp2;
697                   }
698                   if(fnorm2 > threshold2 && i != j) {
699                      S[koffset+kdeg] = j + my_n;
700                      kdeg++;
701                   }
702                }
703                    
704       }       }
705       offset_S[i+my_n]=koffset;       offset_S[i+my_n]=koffset;
# Line 739  void Paso_Preconditioner_AMG_transposeSt Line 737  void Paso_Preconditioner_AMG_transposeSt
737     }     }
738  }  }
739    
740    int compareindex(const void *a, const void *b)
741    {
742      return (*(int *)a - *(int *)b);
743    }
744    
745  void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,  void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
746                          const dim_t* degree_S, const index_t* offset_S, const index_t* S,                          const dim_t* degree_S, const index_t* offset_S, const index_t* S,
747                          const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,                          const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
# Line 748  void Paso_Preconditioner_AMG_CIJPCoarsen Line 751  void Paso_Preconditioner_AMG_CIJPCoarsen
751    index_t iptr, jptr, kptr;    index_t iptr, jptr, kptr;
752    double *random=NULL, *w=NULL, *Status=NULL;    double *random=NULL, *w=NULL, *Status=NULL;
753    index_t * ST_flag=NULL;    index_t * ST_flag=NULL;
754      index_t rank=col_connector->mpi_info->rank;
755    
756    Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector  ,1);    Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector  ,1);
757        
# Line 755  void Paso_Preconditioner_AMG_CIJPCoarsen Line 759  void Paso_Preconditioner_AMG_CIJPCoarsen
759    Status=TMPMEMALLOC(n, double);    Status=TMPMEMALLOC(n, double);
760    random = Paso_Distribution_createRandomVector(col_dist,1);    random = Paso_Distribution_createRandomVector(col_dist,1);
761    ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);    ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
762      
763    #pragma omp parallel for private(i)    #pragma omp parallel for private(i)
764    for (i=0; i< my_n; ++i) {    for (i=0; i< my_n; ++i) {
765        w[i]=degree_ST[i]+random[i];        w[i]=degree_ST[i]+random[i];
# Line 765  void Paso_Preconditioner_AMG_CIJPCoarsen Line 769  void Paso_Preconditioner_AMG_CIJPCoarsen
769         Status[i]=1; /* status undefined */         Status[i]=1; /* status undefined */
770        }        }
771    }    }
772    
773    #pragma omp parallel for private(i, iptr)    #pragma omp parallel for private(i, iptr)
774    for (i=0; i< n; ++i) {    for (i=0; i< n; ++i) {
775        for( iptr =0 ; iptr < degree_ST[i]; ++iptr)  {        for( iptr =0 ; iptr < degree_ST[i]; ++iptr)  {
# Line 775  void Paso_Preconditioner_AMG_CIJPCoarsen Line 780  void Paso_Preconditioner_AMG_CIJPCoarsen
780        
781    numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );    numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
782    printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);    printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);
   
     
783    iter=0;    iter=0;
784    while (numUndefined > 0) {    while (numUndefined > 0) {
785       Paso_Coupler_fillOverlap(n, w, w_coupler);       Paso_Coupler_fillOverlap(n, w, w_coupler);
786       {  
     int p;  
     for (p=0; p<my_n; ++p) {  
        printf(" %d : %f %f \n",p, w[p], Status[p]);  
     }  
     for (p=my_n; p<n; ++p) {  
        printf(" %d : %f \n",p, w[p]);  
     }  
       
       
      }  
       
787        /* calculate the maximum value of neighbours following active strong connections:        /* calculate the maximum value of neighbours following active strong connections:
788          w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active  */                w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active  */      
789        #pragma omp parallel for private(i, iptr)        #pragma omp parallel for private(i, iptr)
# Line 801  void Paso_Preconditioner_AMG_CIJPCoarsen Line 793  void Paso_Preconditioner_AMG_CIJPCoarsen
793          register bool_t inD=TRUE;          register bool_t inD=TRUE;
794          const double wi=w[i];          const double wi=w[i];
795    
   
796          for( iptr =0 ; iptr < degree_S[i]; ++iptr) {          for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
   
797             const index_t k=S[offset_S[i]+iptr];             const index_t k=S[offset_S[i]+iptr];
798             const index_t* start_p = &ST[offset_ST[k]];             const index_t* start_p = &ST[offset_ST[k]];
799             const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);             const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
800    
801             if (ST_flag[(index_t)(where_p-start_p)]>0) {             if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
 printf("S: %d (%e) -> %d    (%e)\n",i, wi, k, w[k]);        
802            if (wi <= w[k] ) {            if (wi <= w[k] ) {
803               inD=FALSE;               inD=FALSE;
804               break;               break;
# Line 822  printf("S: %d (%e) -> %d   (%e)\n",i, wi, Line 811  printf("S: %d (%e) -> %d   (%e)\n",i, wi,
811            for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {            for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
812               const index_t k=ST[offset_ST[i]+iptr];               const index_t k=ST[offset_ST[i]+iptr];
813               if ( ST_flag[offset_ST[i]+iptr] > 0 ) {               if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
 printf("ST: %d (%e) -> %d   (%e)\n",i, wi, k, w[k]);        
               
814                         if (wi <= w[k] ) {                         if (wi <= w[k] ) {
815                 inD=FALSE;                 inD=FALSE;
816                 break;                 break;
# Line 833  printf("ST: %d (%e) -> %d  (%e)\n",i, wi, Line 820  printf("ST: %d (%e) -> %d  (%e)\n",i, wi,
820          }              }    
821          if (inD) {          if (inD) {
822             Status[i]=0.; /* is in D */             Status[i]=0.; /* is in D */
 printf("%d is in D\n",i);  
823          }          }
824       }       }
825            
# Line 865  printf("%d is in D\n",i); Line 851  printf("%d is in D\n",i);
851            const index_t j=ST[offset_ST[i]+jptr];            const index_t j=ST[offset_ST[i]+jptr];
852            if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {            if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
853               w[i]--;               w[i]--;
 printf("%d reduced by %d\n",i,j);  
854               ST_flag[offset_ST[i]+jptr]=-1;               ST_flag[offset_ST[i]+jptr]=-1;
855            }            }
856             }             }
# Line 887  printf("%d reduced by %d\n",i,j); Line 872  printf("%d reduced by %d\n",i,j);
872    
873               for (jptr=0; jptr<degree_ST[i]; ++jptr) {               for (jptr=0; jptr<degree_ST[i]; ++jptr) {
874              const index_t j=ST[offset_ST[i]+jptr];              const index_t j=ST[offset_ST[i]+jptr];
 printf("check connection: %d %d\n",i,j);  
875              ST_flag[offset_ST[i]+jptr]=-1;              ST_flag[offset_ST[i]+jptr]=-1;
876              for (kptr=0; kptr<degree_ST[j]; ++kptr) {              for (kptr=0; kptr<degree_ST[j]; ++kptr) {
877                 const index_t k=ST[offset_ST[j]+kptr];                 const index_t k=ST[offset_ST[j]+kptr];
 printf("check connection: %d of %d\n",k,j);  
878                 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */                 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
 printf("found!\n");  
879                    if (ST_flag[offset_ST[j]+kptr] >0) {                    if (ST_flag[offset_ST[j]+kptr] >0) {
880                   if (j< my_n ) {                   if (j< my_n ) {
881                      w[j]--;                      w[j]--;
 printf("%d reduced by %d and %d\n",j, i,k);  
882                   }                   }
883                   ST_flag[offset_ST[j]+kptr]=-1;                   ST_flag[offset_ST[j]+kptr]=-1;
884                    }                    }
# Line 907  printf("%d reduced by %d and %d\n",j, i, Line 888  printf("%d reduced by %d and %d\n",j, i,
888            }            }
889          }          }
890       }       }
891    
892       /* adjust status */       /* adjust status */
893       #pragma omp parallel for private(i)       #pragma omp parallel for private(i)
894       for (i=0; i< my_n; ++i) {       for (i=0; i< my_n; ++i) {
895          if ( Status[i] == 0. ) {          if ( Status[i] == 0. ) {
896             Status[i] = -10;   /* this is now a C point */             Status[i] = -10;   /* this is now a C point */
897          } else if ( w[i]<1.) {          } else if (Status[i] == 1. && w[i]<1.) {
898             Status[i] = -100;   /* this is now a F point */               Status[i] = -100;   /* this is now a F point */  
899          }          }
900       }       }
901            
902             i = numUndefined;
903       numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );       numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
904         if (numUndefined == i) {
905           Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
906           return;
907         }
908    
909       iter++;       iter++;
910       printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);       printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);
   } /* end of while loop */  
911    
912      } /* end of while loop */
913    
914    /* map to output :*/    /* map to output :*/
915    Paso_Coupler_fillOverlap(n, Status, w_coupler);    Paso_Coupler_fillOverlap(n, Status, w_coupler);
# Line 934  printf("%d reduced by %d and %d\n",j, i, Line 921  printf("%d reduced by %d and %d\n",j, i,
921          split_marker[i]=PASO_AMG_IN_F;          split_marker[i]=PASO_AMG_IN_F;
922       }       }
923    }    }
   
924    /* clean up : */    /* clean up : */
925    Paso_Coupler_free(w_coupler);    Paso_Coupler_free(w_coupler);
926    TMPMEMFREE(random);    TMPMEMFREE(random);
# Line 944  printf("%d reduced by %d and %d\n",j, i, Line 930  printf("%d reduced by %d and %d\n",j, i,
930        
931    return;    return;
932  }  }
933    
934    /* Merge the system matrix which is distributed on ranks into a complete
935       matrix on rank 0, then solve this matrix on rank 0 only */
936    Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
937      index_t i, iptr, j, n, remote_n, total_n, len, offset, tag;
938      index_t row_block_size, col_block_size, block_size;
939      index_t size=A->mpi_info->size;
940      index_t rank=A->mpi_info->rank;
941      index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
942      index_t *temp_n=NULL, *temp_len=NULL;
943      double  *val=NULL;
944      Paso_Pattern *pattern=NULL;
945      Paso_SparseMatrix *out=NULL;
946      #ifdef ESYS_MPI
947        MPI_Request* mpi_requests=NULL;
948        MPI_Status* mpi_stati=NULL;
949      #else
950        int *mpi_requests=NULL, *mpi_stati=NULL;
951      #endif
952    
953      if (size == 1) {
954        n = A->mainBlock->numRows;
955        ptr = TMPMEMALLOC(n, index_t);
956        #pragma omp parallel for private(i)
957        for (i=0; i<n; i++) ptr[i] = i;
958        out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
959        TMPMEMFREE(ptr);
960        return out;
961      }
962    
963      n = A->mainBlock->numRows;
964      block_size = A->block_size;
965    
966      /* Merge MainBlock and CoupleBlock to get a complete column entries
967         for each row allocated to current rank. Output (ptr, idx, val)
968         contains all info needed from current rank to merge a system
969         matrix  */
970      Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
971    
972      #ifdef ESYS_MPI
973        mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
974        mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
975      #else
976        mpi_requests=TMPMEMALLOC(size*2,int);
977        mpi_stati=TMPMEMALLOC(size*2,int);
978      #endif
979    
980      /* Now, pass all info to rank 0 and merge them into one sparse
981         matrix */
982      if (rank == 0) {
983        /* First, copy local ptr values into ptr_global */
984        total_n=Paso_SystemMatrix_getGlobalNumRows(A);
985        ptr_global = MEMALLOC(total_n+1, index_t);
986        memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
987        iptr = n+1;
988        MEMFREE(ptr);
989        temp_n = TMPMEMALLOC(size, index_t);
990        temp_len = TMPMEMALLOC(size, index_t);
991        temp_n[0] = iptr;
992        
993        /* Second, receive ptr values from other ranks */
994        for (i=1; i<size; i++) {
995          remote_n = A->row_distribution->first_component[i+1] -
996             A->row_distribution->first_component[i];
997          MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
998                A->mpi_info->msg_tag_counter+i,
999                A->mpi_info->comm,
1000                &mpi_requests[i]);
1001          temp_n[i] = remote_n;
1002          iptr += remote_n;
1003        }
1004        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1005        A->mpi_info->msg_tag_counter += size;
1006    
1007        /* Then, prepare to receive idx and val from other ranks */
1008        len = 0;
1009        offset = -1;
1010        for (i=0; i<size; i++) {
1011          if (temp_n[i] > 0) {
1012        offset += temp_n[i];
1013        len += ptr_global[offset];
1014        temp_len[i] = ptr_global[offset];
1015          }else
1016        temp_len[i] = 0;
1017        }
1018    
1019        idx_global = MEMALLOC(len, index_t);
1020        iptr = temp_len[0];
1021        offset = n+1;
1022        for (i=1; i<size; i++) {
1023          len = temp_len[i];
1024          MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1025                A->mpi_info->msg_tag_counter+i,
1026                A->mpi_info->comm,
1027                &mpi_requests[i]);
1028          remote_n = temp_n[i];
1029          for (j=0; j<remote_n; j++) {
1030        ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1031          }
1032          offset += remote_n;
1033          iptr += len;
1034        }
1035        memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1036        MEMFREE(idx);
1037        row_block_size = A->mainBlock->row_block_size;
1038        col_block_size = A->mainBlock->col_block_size;
1039        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1040        A->mpi_info->msg_tag_counter += size;
1041        TMPMEMFREE(temp_n);
1042    
1043        /* Then generate the sparse matrix */
1044        pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, total_n,
1045                total_n, ptr_global, idx_global);
1046        out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1047                row_block_size, col_block_size, FALSE);
1048        Paso_Pattern_free(pattern);
1049    
1050        /* Finally, receive and copy the value */
1051        iptr = temp_len[0] * block_size;
1052        for (i=1; i<size; i++) {
1053          len = temp_len[i];
1054          MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1055                            A->mpi_info->msg_tag_counter+i,
1056                            A->mpi_info->comm,
1057                            &mpi_requests[i]);
1058          iptr += (len * block_size);
1059        }
1060        memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1061        MEMFREE(val);
1062        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1063        A->mpi_info->msg_tag_counter += size;
1064        TMPMEMFREE(temp_len);
1065      } else { /* it's not rank 0 */
1066    
1067        /* First, send out the local ptr */
1068        tag = A->mpi_info->msg_tag_counter+rank;
1069        MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1070                &mpi_requests[0]);
1071    
1072        /* Next, send out the local idx */
1073        len = ptr[n];
1074        tag += size;
1075        MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1076                &mpi_requests[1]);
1077    
1078        /* At last, send out the local val */
1079        len *= block_size;
1080        tag += size;
1081        MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1082                &mpi_requests[2]);
1083    
1084        MPI_Waitall(3, mpi_requests, mpi_stati);
1085        A->mpi_info->msg_tag_counter = tag + size - rank;
1086        MEMFREE(ptr);
1087        MEMFREE(idx);
1088        MEMFREE(val);
1089    
1090        out = NULL;
1091      }
1092    
1093      TMPMEMFREE(mpi_requests);
1094      TMPMEMFREE(mpi_stati);
1095      return out;
1096    }
1097    
1098    
1099    void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1100      Paso_SystemMatrix *A = amg->A_C;
1101      Paso_SparseMatrix *A_D, *A_temp;
1102      double* x=NULL;
1103      double* b=NULL;
1104      index_t rank = A->mpi_info->rank;
1105      index_t size = A->mpi_info->size;
1106      index_t i, n, p, count, n_block;
1107      index_t *counts, *offset, *dist;
1108    
1109      n_block = amg->n_block;
1110    
1111      A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1112    
1113      /* First, gather x and b into rank 0 */
1114      dist = A->pattern->input_distribution->first_component;
1115      n = Paso_SystemMatrix_getGlobalNumRows(A);
1116      b = TMPMEMALLOC(n*n_block, double);
1117      x = TMPMEMALLOC(n*n_block, double);
1118      counts = TMPMEMALLOC(size, index_t);
1119      offset = TMPMEMALLOC(size, index_t);
1120    
1121      #pragma omp parallel for private(i,p)
1122      for (i=0; i<size; i++) {
1123        p = dist[i];
1124        counts[i] = (dist[i+1] - p)*n_block;
1125        offset[i] = p*n_block;
1126      }
1127      count = counts[rank];
1128      MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1129      MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1130    
1131      if (rank == 0) {
1132        /* solve locally */
1133        #ifdef MKL
1134          A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1135          A_temp->solver_package = PASO_MKL;
1136          Paso_SparseMatrix_free(A_D);
1137          Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1138          Paso_SparseMatrix_free(A_temp);
1139        #else
1140          #ifdef UMFPACK
1141        A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1142        A_temp->solver_package = PASO_UMFPACK;
1143        Paso_SparseMatrix_free(A_D);
1144        Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1145        Paso_SparseMatrix_free(A_temp);
1146          #else
1147        A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1148        A_D->solver_package = PASO_SMOOTHER;
1149        Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1150        Paso_SparseMatrix_free(A_D);
1151          #endif
1152        #endif
1153      }
1154    
1155      /* now we need to distribute the solution to all ranks */
1156      MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1157    
1158      TMPMEMFREE(x);
1159      TMPMEMFREE(b);
1160      TMPMEMFREE(counts);
1161      TMPMEMFREE(offset);
1162    }

Legend:
Removed from v.3642  
changed lines
  Added in v.3827

  ViewVC Help
Powered by ViewVC 1.1.26