/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Diff of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3485 by gross, Thu Mar 24 22:44:40 2011 UTC revision 3834 by gross, Wed Feb 15 07:09:09 2012 UTC
# Line 18  Line 18 
18    
19  /**************************************************************/  /**************************************************************/
20    
21  /* Author: artak@uq.edu.au, l.gross@uq.edu.au                                */  /* Author: artak@uq.edu.au, l.gross@uq.edu.au                 */
22    
23  /**************************************************************/  /**************************************************************/
24    
25  #define SHOW_TIMING FALSE  #define SHOW_TIMING FALSE
26    #define MY_DEBUG 0
27    #define MY_DEBUG1 1
28    
29  #include "Paso.h"  #include "Paso.h"
30  #include "Preconditioner.h"  #include "Preconditioner.h"
# Line 89  dim_t Paso_Preconditioner_AMG_getNumCoar Line 91  dim_t Paso_Preconditioner_AMG_getNumCoar
91  Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {  Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92    
93    Paso_Preconditioner_AMG* out=NULL;    Paso_Preconditioner_AMG* out=NULL;
94      Paso_SystemMatrix *A_C=NULL;
95    bool_t verbose=options->verbose;    bool_t verbose=options->verbose;
96    
97    const dim_t my_n=A_p->mainBlock->numRows;    const dim_t my_n=A_p->mainBlock->numRows;
# Line 97  Paso_Preconditioner_AMG* Paso_Preconditi Line 100  Paso_Preconditioner_AMG* Paso_Preconditi
100    const dim_t n = my_n + overlap_n;    const dim_t n = my_n + overlap_n;
101    
102    const dim_t n_block=A_p->row_block_size;    const dim_t n_block=A_p->row_block_size;
103    index_t* F_marker=NULL, *counter=NULL;    index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
104    dim_t i;    dim_t i, n_F, n_C, F_flag, *F_set=NULL;
105    double time0=0;    double time0=0;
106    const double theta = options->coarsening_threshold;    const double theta = options->coarsening_threshold;
107    const double tau = options->diagonal_dominance_threshold;    const double tau = options->diagonal_dominance_threshold;
108    const double sparsity=Paso_SystemMatrix_getSparsity(A_p);    const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
109    const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);    const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);
110    
111      
112    /*    /*
113        is the input matrix A suitable for coarsening        is the input matrix A suitable for coarsening?
114                
115    */    */
116    if ( (sparsity >= options->min_coarse_sparsity) ||    if ( (sparsity >= options->min_coarse_sparsity) ||
# Line 124  Paso_Preconditioner_AMG* Paso_Preconditi Line 127  Paso_Preconditioner_AMG* Paso_Preconditi
127            printf("Paso_Preconditioner: AMG: termination of coarsening by ");            printf("Paso_Preconditioner: AMG: termination of coarsening by ");
128    
129            if (sparsity >= options->min_coarse_sparsity)            if (sparsity >= options->min_coarse_sparsity)
130                printf("SPAR ");                printf("SPAR");
131    
132            if (total_n <= options->min_coarse_matrix_size)            if (total_n <= options->min_coarse_matrix_size)
133                printf("SIZE ");                printf("SIZE");
134    
135            if (level > options->level_max)            if (level > options->level_max)
136                printf("LEVEL ");                printf("LEVEL");
137    
138            printf("\n");            printf("\n");
139    
# Line 144  Paso_Preconditioner_AMG* Paso_Preconditi Line 147  Paso_Preconditioner_AMG* Paso_Preconditi
147       /* Start Coarsening : */       /* Start Coarsening : */
148            
149       /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */       /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
150       const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len;       const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len  + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
151    
152       dim_t* degree_S=TMPMEMALLOC(n, dim_t);       dim_t* degree_S=TMPMEMALLOC(n, dim_t);
153       index_t *offset_S=TMPMEMALLOC(n, index_t);       index_t *offset_S=TMPMEMALLOC(n, index_t);
# Line 163  Paso_Preconditioner_AMG* Paso_Preconditi Line 166  Paso_Preconditioner_AMG* Paso_Preconditi
166             make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical             make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
167      */      */
168          Paso_SystemMatrix_copyColCoupleBlock(A_p);          Paso_SystemMatrix_copyColCoupleBlock(A_p);
169          /* Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE); */          Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
170    
171      /*      /*
172            set splitting of unknows:            set splitting of unknows:
# Line 176  Paso_Preconditioner_AMG* Paso_Preconditi Line 179  Paso_Preconditioner_AMG* Paso_Preconditi
179             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
180       }       }
181       Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);       Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
182        /*   Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
183  {   */
184     dim_t p;  
    for (i=0; i<n; ++i) {  
          printf("%d: ",i);  
         for (p=0; p<degree_S[i];++p) printf("%d ",S[offset_S[i]+p]);  
         printf("\n");  
    }  
 }  
185       Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,       Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
186                              degree_S, offset_S, S, degree_ST, offset_ST, ST,                              degree_S, offset_S, S, degree_ST, offset_ST, ST,
187                          A_p->col_coupler->connector,A_p->col_distribution);                          A_p->col_coupler->connector,A_p->col_distribution);
 Esys_setError(SYSTEM_ERROR, "AMG:DONE.");  
 return NULL;  
188                
189    
190  /*MPI:           /* in BoomerAMG if interpolation is used FF connectivity is required */
      Paso_Preconditioner_AMG_RungeStuebenSearch(n, A_p->pattern->ptr, degree_S, S, F_marker, options->usePanel);  
 */  
       
          /* in BoomerAMG interpolation is used FF connectiovity is required :*/  
191  /*MPI:  /*MPI:
192           if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)           if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
193                               Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);                                 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);  
194  */  */
195    
196       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
   
 #ifdef AAAAA  
197       if (Esys_noError() ) {       if (Esys_noError() ) {
198          #pragma omp parallel for private(i) schedule(static)          #pragma omp parallel for private(i) schedule(static)
199          for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] ==  PASO_AMG_IN_F);          for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] ==  PASO_AMG_IN_F);
# Line 214  return NULL; Line 203  return NULL;
203          */          */
204          n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);          n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
205          n_C=n-n_F;          n_C=n-n_F;
206          if (verbose) printf("Paso_Preconditioner: AMG level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);          if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);
207        
208          if ( n_F == 0 ) {  /*  is a nasty case. a direct solver should be used, return NULL */              /* collect n_F values on all processes, a direct solver should
209                    be used if any n_F value is 0 */
210                F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
211            #ifdef ESYS_MPI
212                MPI_Allgather(&n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
213            #endif
214                F_flag = 1;
215                for (i=0; i<A_p->mpi_info->size; i++) {
216                    if (F_set[i] == 0) {
217                      F_flag = 0;
218                      break;
219                    }
220                }
221                TMPMEMFREE(F_set);
222            
223    /*          if ( n_F == 0 ) {  is a nasty case. a direct solver should be used, return NULL */
224                if (F_flag == 0) {
225             out = NULL;             out = NULL;
226          } else {          } else {
227             out=MEMALLOC(1,Paso_Preconditioner_AMG);             out=MEMALLOC(1,Paso_Preconditioner_AMG);
# Line 242  return NULL; Line 247  return NULL;
247             Esys_checkPtr(rows_in_F);             Esys_checkPtr(rows_in_F);
248             if ( Esys_noError() ) {             if ( Esys_noError() ) {
249    
250            out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), verbose);            out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
251                
252            if (n_C != 0) {            if (n_C != 0) {
253                 /* if nothing is been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */                 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
254        
255              /* allocate helpers :*/              /* allocate helpers :*/
256              out->x_C=MEMALLOC(n_block*n_C,double);              out->x_C=MEMALLOC(n_block*n_C,double);
# Line 265  return NULL; Line 270  return NULL;
270                   if  (F_marker[i]) rows_in_F[counter[i]]=i;                   if  (F_marker[i]) rows_in_F[counter[i]]=i;
271                    }                    }
272                 }                 }
273                 /*  create mask of C nodes with value >-1 gives new id */                 /*  create mask of C nodes with value >-1, gives new id */
274                 i=Paso_Util_cumsum_maskedFalse(n,counter, F_marker);                 i=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
   
                #pragma omp parallel for private(i) schedule(static)  
                for (i = 0; i < n; ++i) {  
                   if  (F_marker[i]) {  
                  mask_C[i]=-1;  
                   } else {  
                  mask_C[i]=counter[i];;  
                   }  
                }  
275                 /*                 /*
276                    get Prolongation :                        get Prolongation :    
277                 */                                   */                  
278    
279                 time0=Esys_timer();                 time0=Esys_timer();
280  /*MPI:  
281                 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,A_p->pattern->ptr, degree_S,S,n_C,mask_C, options->interpolation_method);                 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
282  */  
                if (SHOW_TIMING) printf("timing: level %d: getProlongation: %e\n",level, Esys_timer()-time0);  
283              }              }
284    
285              /*                    /*      
286                 construct Restriction operator as transposed of Prolongation operator:                 construct Restriction operator as transposed of Prolongation operator:
287              */              */
288    
289              if ( Esys_noError()) {              if ( Esys_noError()) {
290                 time0=Esys_timer();                 time0=Esys_timer();
291  /*MPI:  
292                 out->R=Paso_SystemMatrix_getTranspose(out->P);                 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
293  */  
294                 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);                 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
295              }                    }      
296              /*              /*
297              construct coarse level matrix:                          construct coarse level matrix:
298              */                          */
299              if ( Esys_noError()) {                          if ( Esys_noError()) {
300                 time0=Esys_timer();                             time0=Esys_timer();
 /*MPI:  
                Atemp=Paso_SystemMatrix_MatrixMatrix(A_p,out->P);  
                A_C=Paso_SystemMatrix_MatrixMatrix(out->R,Atemp);  
                Paso_Preconditioner_AMG_setStrongConnections  
                Paso_SystemMatrix_free(Atemp);  
 */  
301    
302                 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);                                         A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
303              }  
304                               if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
305                            }
306    
               
307              /*              /*
308                 constructe courser level:                 constructe courser level:
309                                
# Line 318  return NULL; Line 311  return NULL;
311              if ( Esys_noError()) {              if ( Esys_noError()) {
312                 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);                 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
313              }              }
314    
315              if ( Esys_noError()) {              if ( Esys_noError()) {
316                 if ( out->AMG_C == NULL ) {                if ( out->AMG_C == NULL ) {
317                      /* merge the system matrix into 1 rank when
318                     it's not suitable coarsening due to the
319                     total number of unknowns are too small */
320                      out->A_C=A_C;
321                    out->reordering = options->reordering;                    out->reordering = options->reordering;
322                    out->refinements = options->coarse_matrix_refinements;                                out->refinements = options->coarse_matrix_refinements;
323                    /* no coarse level matrix has been constructed. use direct solver */                    out->verbose = verbose;
324                    #ifdef MKL                    out->options_smoother = options->smoother;
325                      out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_C);                } else {
                     Paso_SystemMatrix_free(A_C);  
                     out->A_C->solver_package = PASO_MKL;  
                     if (verbose) printf("Paso_Preconditioner: AMG: use MKL direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                   #else  
                     #ifdef UMFPACK  
                        out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_C);  
                        Paso_SystemMatrix_free(A_C);  
                        out->A_C->solver_package = PASO_UMFPACK;  
                        if (verbose) printf("Paso_Preconditioner: AMG: use UMFPACK direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                     #else  
                        out->A_C=A_C;  
                        out->A_C->solver_p=Paso_Preconditioner_Smoother_alloc(out->A_C, (options->smoother == PASO_JACOBI), verbose);  
                        out->A_C->solver_package = PASO_SMOOTHER;  
                        if (verbose) printf("Paso_Preconditioner: AMG: use smoother on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                     #endif  
                   #endif  
                } else {  
326                    /* finally we set some helpers for the solver step */                    /* finally we set some helpers for the solver step */
327                    out->A_C=A_C;                    out->A_C=A_C;
328                 }                }
329              }                      }        
330            }            }
331             }             }
# Line 352  return NULL; Line 333  return NULL;
333             TMPMEMFREE(rows_in_F);             TMPMEMFREE(rows_in_F);
334          }          }
335       }       }
 #endif  
336    
337    }    }
338    TMPMEMFREE(counter);    TMPMEMFREE(counter);
# Line 376  return NULL; Line 356  return NULL;
356    
357    
358  void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {  void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
359       const dim_t n = amg->n * amg->n_block;       const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
360       double time0=0;       double time0=0;
361       const dim_t post_sweeps=amg->post_sweeps;       const dim_t post_sweeps=amg->post_sweeps;
362       const dim_t pre_sweeps=amg->pre_sweeps;       const dim_t pre_sweeps=amg->pre_sweeps;
# Line 384  void Paso_Preconditioner_AMG_solve(Paso_ Line 364  void Paso_Preconditioner_AMG_solve(Paso_
364       /* presmoothing */       /* presmoothing */
365       time0=Esys_timer();       time0=Esys_timer();
366       Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);       Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
367    
368       time0=Esys_timer()-time0;       time0=Esys_timer()-time0;
369       if (SHOW_TIMING) printf("timing: level %d: Presmooting: %e\n",amg->level, time0);       if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
370       /* end of presmoothing */       /* end of presmoothing */
371            
372       if (amg->n_F < amg->n) { /* is there work on the coarse level? */       if (amg->n_F < amg->n) { /* is there work on the coarse level? */
373           time0=Esys_timer();           time0=Esys_timer();
374    
375       Paso_Copy(n, amg->r, b);                            /*  r <- b */       Paso_Copy(n, amg->r, b);                            /*  r <- b */
376       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
377       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */
378    
379           time0=Esys_timer()-time0;           time0=Esys_timer()-time0;
380       /* coarse level solve */       /* coarse level solve */
381       if ( amg->AMG_C == NULL) {       if ( amg->AMG_C == NULL) {
382          time0=Esys_timer();          time0=Esys_timer();
383          /*  A_C is the coarsest level */          /*  A_C is the coarsest level */
384  #ifdef FIXME          Paso_Preconditioner_AMG_mergeSolve(amg);
385          switch (amg->A_C->solver_package) {  
            case (PASO_MKL):  
           Paso_MKL(amg->A_C, amg->x_C,amg->b_C, amg->reordering, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_UMFPACK):  
           Paso_UMFPACK(amg->A_C, amg->x_C,amg->b_C, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_SMOOTHER):  
           Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE);  
           break;  
         }  
 #endif  
         Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE);  
386          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
387       } else {       } else {
388          Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */          Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */
389       }         }
390    
391       time0=time0+Esys_timer();       time0=time0+Esys_timer();
392       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */           Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */    
393        
394           /*postsmoothing*/           /*postsmoothing*/
395                
396          /*solve Ax=b with initial guess x */          /*solve Ax=b with initial guess x */
# Line 427  void Paso_Preconditioner_AMG_solve(Paso_ Line 399  void Paso_Preconditioner_AMG_solve(Paso_
399          time0=Esys_timer()-time0;          time0=Esys_timer()-time0;
400          if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);          if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
401          /*end of postsmoothing*/          /*end of postsmoothing*/
       
402       }       }
403    
404       return;       return;
405  }  }
406    
# Line 451  void Paso_Preconditioner_AMG_setStrongCo Line 423  void Paso_Preconditioner_AMG_setStrongCo
423     index_t iptr, i;     index_t iptr, i;
424     double *threshold_p=NULL;     double *threshold_p=NULL;
425    
   
426     threshold_p = TMPMEMALLOC(2*my_n, double);     threshold_p = TMPMEMALLOC(2*my_n, double);
427        
428     #pragma omp parallel for private(i,iptr) schedule(static)     #pragma omp parallel for private(i,iptr) schedule(static)
# Line 461  void Paso_Preconditioner_AMG_setStrongCo Line 432  void Paso_Preconditioner_AMG_setStrongCo
432       register double sum_row=0;       register double sum_row=0;
433       register double main_row=0;       register double main_row=0;
434       register dim_t kdeg=0;       register dim_t kdeg=0;
435           const register index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];           register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
436    
437                    
438       /* collect information for row i: */       /* collect information for row i: */
439       #pragma ivdep       #pragma ivdep
440       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
441          register index_t j=A->mainBlock->pattern->index[iptr];          register index_t j=A->mainBlock->pattern->index[iptr];
442          register double fnorm=ABS(A->mainBlock->val[iptr]);          register double fnorm=ABS(A->mainBlock->val[iptr]);
           
443          if( j != i) {          if( j != i) {
444             max_offdiagonal = MAX(max_offdiagonal,fnorm);             max_offdiagonal = MAX(max_offdiagonal,fnorm);
445             sum_row+=fnorm;             sum_row+=fnorm;
# Line 477  void Paso_Preconditioner_AMG_setStrongCo Line 448  void Paso_Preconditioner_AMG_setStrongCo
448          }          }
449    
450       }       }
451        
452       #pragma ivdep       #pragma ivdep
453       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
454          register double fnorm=ABS(A->col_coupleBlock->val[iptr]);          register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
455    
456          max_offdiagonal = MAX(max_offdiagonal,fnorm);          max_offdiagonal = MAX(max_offdiagonal,fnorm);
457          sum_row+=fnorm;          sum_row+=fnorm;
458       }       }
# Line 489  void Paso_Preconditioner_AMG_setStrongCo Line 461  void Paso_Preconditioner_AMG_setStrongCo
461           {           {
462          const double threshold = theta*max_offdiagonal;          const double threshold = theta*max_offdiagonal;
463              threshold_p[2*i+1]=threshold;              threshold_p[2*i+1]=threshold;
464          if (tau*main_row < sum_row) { /* no diagonal domainance */          if (tau*main_row < sum_row) { /* no diagonal dominance */
465                 threshold_p[2*i]=1;                 threshold_p[2*i]=1;
466             #pragma ivdep             #pragma ivdep
467             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
# Line 514  void Paso_Preconditioner_AMG_setStrongCo Line 486  void Paso_Preconditioner_AMG_setStrongCo
486           offset_S[i]=koffset;           offset_S[i]=koffset;
487       degree_S[i]=kdeg;       degree_S[i]=kdeg;
488        }        }
489    
490        /* now we need to distribute the threshold and the diagonal dominance indicator */        /* now we need to distribute the threshold and the diagonal dominance indicator */
491        if (A->mpi_info->size > 1) {        if (A->mpi_info->size > 1) {
492    
# Line 529  void Paso_Preconditioner_AMG_setStrongCo Line 502  void Paso_Preconditioner_AMG_setStrongCo
502    
503            #pragma omp parallel for private(i,iptr) schedule(static)            #pragma omp parallel for private(i,iptr) schedule(static)
504            for (i=0; i<overlap_n; i++) {            for (i=0; i<overlap_n; i++) {
           
505            const double threshold = remote_threshold[2*i+1];            const double threshold = remote_threshold[2*i+1];
506            register dim_t kdeg=0;            register dim_t kdeg=0;
507                const register index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i];                register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
508                if (remote_threshold[2*i]>0) {                if (remote_threshold[2*i]>0) {
509               #pragma ivdep          #pragma ivdep
510               for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
511                register index_t j=A->row_coupleBlock->pattern->index[iptr];                register index_t j=A->row_coupleBlock->pattern->index[iptr];
512                if(ABS(A->row_coupleBlock->val[iptr])>threshold) {                if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
513               S[koffset+kdeg] = j ;               S[koffset+kdeg] = j ;
514               kdeg++;               kdeg++;
515                }                }
516             }          }
517    
518            #pragma ivdep
519            for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
520              register index_t j=A->remote_coupleBlock->pattern->index[iptr];
521              if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
522                  S[koffset+kdeg] = j + my_n;
523                  kdeg++;
524              }
525            }
526                }                }
527                offset_S[i+my_n]=koffset;                offset_S[i+my_n]=koffset;
528            degree_S[i+my_n]=kdeg;            degree_S[i+my_n]=kdeg;
# Line 564  void Paso_Preconditioner_AMG_setStrongCo Line 544  void Paso_Preconditioner_AMG_setStrongCo
544                              const double theta, const double tau)                              const double theta, const double tau)
545    
546  {  {
547     const dim_t n_block=A->block_size;     const dim_t block_size=A->block_size;
548     const dim_t my_n=A->mainBlock->numRows;     const dim_t my_n=A->mainBlock->numRows;
549     const dim_t overlap_n=A->row_coupleBlock->numRows;     const dim_t overlap_n=A->row_coupleBlock->numRows;
550        
# Line 573  void Paso_Preconditioner_AMG_setStrongCo Line 553  void Paso_Preconditioner_AMG_setStrongCo
553        
554        
555     threshold_p = TMPMEMALLOC(2*my_n, double);     threshold_p = TMPMEMALLOC(2*my_n, double);
556      
557     #pragma omp parallel private(i,iptr, bi)     #pragma omp parallel private(i,iptr,bi)
558     {     {
559        
560        dim_t max_deg=0;        dim_t max_deg=0;
# Line 593  void Paso_Preconditioner_AMG_setStrongCo Line 573  void Paso_Preconditioner_AMG_setStrongCo
573       register double main_row=0;       register double main_row=0;
574       register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];       register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
575       register dim_t kdeg=0;       register dim_t kdeg=0;
576       const register index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];       register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
577            
578       /* collect information for row i: */       /* collect information for row i: */
579       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
580          register index_t j=A->mainBlock->pattern->index[iptr];          register index_t j=A->mainBlock->pattern->index[iptr];
581          register double fnorm=0;          register double fnorm=0;
582          #pragma ivdep          #pragma ivdep
583          for(bi=0;bi<n_block;++bi) {          for(bi=0;bi<block_size;++bi) {
584              register double  rtmp2= A->mainBlock->val[iptr*n_block+bi];              register double  rtmp2= A->mainBlock->val[iptr*block_size+bi];
585             fnorm+=rtmp2*rtmp2;             fnorm+=rtmp2*rtmp2;
586          }          }
587          fnorm=sqrt(fnorm);          fnorm=sqrt(fnorm);
# Line 620  void Paso_Preconditioner_AMG_setStrongCo Line 600  void Paso_Preconditioner_AMG_setStrongCo
600       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
601          register double fnorm=0;          register double fnorm=0;
602          #pragma ivdep          #pragma ivdep
603          for(bi=0;bi<n_block;++bi) {          for(bi=0;bi<block_size;++bi) {
604             register double rtmp2 = A->col_coupleBlock->val[iptr*n_block+bi];             register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
605             fnorm+=rtmp2*rtmp2;             fnorm+=rtmp2*rtmp2;
606          }          }
607          fnorm=sqrt(fnorm);          fnorm=sqrt(fnorm);
# Line 638  void Paso_Preconditioner_AMG_setStrongCo Line 618  void Paso_Preconditioner_AMG_setStrongCo
618          rtmp_offset=-A->mainBlock->pattern->ptr[i];          rtmp_offset=-A->mainBlock->pattern->ptr[i];
619                    
620          threshold_p[2*i+1]=threshold;          threshold_p[2*i+1]=threshold;
621          if (tau*main_row < sum_row) { /* no diagonal domainance */          if (tau*main_row < sum_row) { /* no diagonal dominance */
622             threshold_p[2*i]=1;             threshold_p[2*i]=1;
            rtmp_offset=-A->mainBlock->pattern->ptr[i];  
623             #pragma ivdep             #pragma ivdep
624             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
625            register index_t j=A->mainBlock->pattern->index[iptr];            register index_t j=A->mainBlock->pattern->index[iptr];
# Line 685  void Paso_Preconditioner_AMG_setStrongCo Line 664  void Paso_Preconditioner_AMG_setStrongCo
664            
665       const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];       const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
666       register dim_t kdeg=0;       register dim_t kdeg=0;
667       const register index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i];       register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
668       if (remote_threshold[2*i]>0) {       if (remote_threshold[2*i]>0) {
669          #pragma ivdep          #pragma ivdep
670          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
671             register index_t j=A->row_coupleBlock->pattern->index[iptr];             register index_t j=A->row_coupleBlock->pattern->index[iptr];
672             register double fnorm2=0;             register double fnorm2=0;
673             #pragma ivdepremote_threshold[2*i]             #pragma ivdepremote_threshold[2*i]
674             for(bi=0;bi<n_block;++bi) {             for(bi=0;bi<block_size;++bi) {
675            register double rtmp2 = A->row_coupleBlock->val[iptr*n_block+bi];            register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
676            fnorm2+=rtmp2*rtmp2;            fnorm2+=rtmp2*rtmp2;
677             }             }
678                        
# Line 702  void Paso_Preconditioner_AMG_setStrongCo Line 681  void Paso_Preconditioner_AMG_setStrongCo
681            kdeg++;            kdeg++;
682             }             }
683          }          }
684    
685            #pragma ivdep
686                for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
687                   register index_t j=A->remote_coupleBlock->pattern->index[iptr];
688                   register double fnorm2=0;
689                   #pragma ivdepremote_threshold[2*i]
690                   for(bi=0;bi<block_size;++bi) {
691                      register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
692                      fnorm2+=rtmp2*rtmp2;
693                   }
694                   if(fnorm2 > threshold2 && i != j) {
695                      S[koffset+kdeg] = j + my_n;
696                      kdeg++;
697                   }
698                }
699                    
700       }       }
701       offset_S[i+my_n]=koffset;       offset_S[i+my_n]=koffset;
# Line 711  void Paso_Preconditioner_AMG_setStrongCo Line 705  void Paso_Preconditioner_AMG_setStrongCo
705     }     }
706     TMPMEMFREE(threshold_p);     TMPMEMFREE(threshold_p);
707  }  }
708    
709  void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,  void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
710                              const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)                              const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
711  {  {
# Line 738  void Paso_Preconditioner_AMG_transposeSt Line 733  void Paso_Preconditioner_AMG_transposeSt
733     }     }
734  }  }
735    
736    int compareindex(const void *a, const void *b)
737    {
738      return (*(int *)a - *(int *)b);
739    }
740    
741  void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,  void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
742                          const dim_t* degree_S, const index_t* offset_S, const index_t* S,                          const dim_t* degree_S, const index_t* offset_S, const index_t* S,
743                          const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,                          const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
# Line 754  void Paso_Preconditioner_AMG_CIJPCoarsen Line 754  void Paso_Preconditioner_AMG_CIJPCoarsen
754    Status=TMPMEMALLOC(n, double);    Status=TMPMEMALLOC(n, double);
755    random = Paso_Distribution_createRandomVector(col_dist,1);    random = Paso_Distribution_createRandomVector(col_dist,1);
756    ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);    ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
757      
758    #pragma omp parallel for private(i)    #pragma omp parallel for private(i)
759    for (i=0; i< my_n; ++i) {    for (i=0; i< my_n; ++i) {
760        w[i]=degree_ST[i]+random[i];        w[i]=degree_ST[i]+random[i];
# Line 764  void Paso_Preconditioner_AMG_CIJPCoarsen Line 764  void Paso_Preconditioner_AMG_CIJPCoarsen
764         Status[i]=1; /* status undefined */         Status[i]=1; /* status undefined */
765        }        }
766    }    }
767    
768    #pragma omp parallel for private(i, iptr)    #pragma omp parallel for private(i, iptr)
769    for (i=0; i< n; ++i) {    for (i=0; i< n; ++i) {
770        for( iptr =0 ; iptr < degree_ST[i]; ++iptr)  {        for( iptr =0 ; iptr < degree_ST[i]; ++iptr)  {
# Line 774  void Paso_Preconditioner_AMG_CIJPCoarsen Line 775  void Paso_Preconditioner_AMG_CIJPCoarsen
775        
776    numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );    numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
777    printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);    printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);
   
     
778    iter=0;    iter=0;
779    while (numUndefined > 0) {    while (numUndefined > 0) {
780       Paso_Coupler_fillOverlap(n, w, w_coupler);       Paso_Coupler_fillOverlap(n, w, w_coupler);
781       {  
782      int p;        /* calculate the maximum value of neighbours following active strong connections:
783      for (p=0; p<my_n; ++p) {          w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active  */      
        printf(" %d : %f %f \n",p, w[p], Status[p]);  
     }  
     for (p=my_n; p<n; ++p) {  
        printf(" %d : %f \n",p, w[p]);  
     }  
       
       
      }  
       
       /* calculate the maximum value of naigbours following active strong connections:  
         w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) conenction is still active  */        
784        #pragma omp parallel for private(i, iptr)        #pragma omp parallel for private(i, iptr)
785        for (i=0; i<my_n; ++i) {        for (i=0; i<my_n; ++i) {
786       if (Status[i]>0) { /* status is still undefined */       if (Status[i]>0) { /* status is still undefined */
# Line 800  void Paso_Preconditioner_AMG_CIJPCoarsen Line 788  void Paso_Preconditioner_AMG_CIJPCoarsen
788          register bool_t inD=TRUE;          register bool_t inD=TRUE;
789          const double wi=w[i];          const double wi=w[i];
790    
   
791          for( iptr =0 ; iptr < degree_S[i]; ++iptr) {          for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
   
792             const index_t k=S[offset_S[i]+iptr];             const index_t k=S[offset_S[i]+iptr];
793             const index_t* start_p = &ST[offset_ST[k]];             const index_t* start_p = &ST[offset_ST[k]];
794             const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);             const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
795    
796             if (ST_flag[(index_t)(where_p-start_p)]>0) {             if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
 printf("S: %d (%e) -> %d    (%e)\n",i, wi, k, w[k]);        
797            if (wi <= w[k] ) {            if (wi <= w[k] ) {
798               inD=FALSE;               inD=FALSE;
799               break;               break;
# Line 821  printf("S: %d (%e) -> %d   (%e)\n",i, wi, Line 806  printf("S: %d (%e) -> %d   (%e)\n",i, wi,
806            for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {            for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
807               const index_t k=ST[offset_ST[i]+iptr];               const index_t k=ST[offset_ST[i]+iptr];
808               if ( ST_flag[offset_ST[i]+iptr] > 0 ) {               if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
 printf("ST: %d (%e) -> %d   (%e)\n",i, wi, k, w[k]);        
               
809                         if (wi <= w[k] ) {                         if (wi <= w[k] ) {
810                 inD=FALSE;                 inD=FALSE;
811                 break;                 break;
# Line 832  printf("ST: %d (%e) -> %d  (%e)\n",i, wi, Line 815  printf("ST: %d (%e) -> %d  (%e)\n",i, wi,
815          }              }    
816          if (inD) {          if (inD) {
817             Status[i]=0.; /* is in D */             Status[i]=0.; /* is in D */
 printf("%d is in D\n",i);  
818          }          }
819       }       }
820            
# Line 864  printf("%d is in D\n",i); Line 846  printf("%d is in D\n",i);
846            const index_t j=ST[offset_ST[i]+jptr];            const index_t j=ST[offset_ST[i]+jptr];
847            if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {            if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
848               w[i]--;               w[i]--;
 printf("%d reduced by %d\n",i,j);  
849               ST_flag[offset_ST[i]+jptr]=-1;               ST_flag[offset_ST[i]+jptr]=-1;
850            }            }
851             }             }
# Line 886  printf("%d reduced by %d\n",i,j); Line 867  printf("%d reduced by %d\n",i,j);
867    
868               for (jptr=0; jptr<degree_ST[i]; ++jptr) {               for (jptr=0; jptr<degree_ST[i]; ++jptr) {
869              const index_t j=ST[offset_ST[i]+jptr];              const index_t j=ST[offset_ST[i]+jptr];
 printf("check connection: %d %d\n",i,j);  
870              ST_flag[offset_ST[i]+jptr]=-1;              ST_flag[offset_ST[i]+jptr]=-1;
871              for (kptr=0; kptr<degree_ST[j]; ++kptr) {              for (kptr=0; kptr<degree_ST[j]; ++kptr) {
872                 const index_t k=ST[offset_ST[j]+kptr];                 const index_t k=ST[offset_ST[j]+kptr];
 printf("check connection: %d of %d\n",k,j);  
873                 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */                 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
 printf("found!\n");  
874                    if (ST_flag[offset_ST[j]+kptr] >0) {                    if (ST_flag[offset_ST[j]+kptr] >0) {
875                   if (j< my_n ) {                   if (j< my_n ) {
876                      w[j]--;                      w[j]--;
 printf("%d reduced by %d and %d \n",j, i,k);  
877                   }                   }
878                   ST_flag[offset_ST[j]+kptr]=-1;                   ST_flag[offset_ST[j]+kptr]=-1;
879                    }                    }
# Line 906  printf("%d reduced by %d and %d \n",j, i Line 883  printf("%d reduced by %d and %d \n",j, i
883            }            }
884          }          }
885       }       }
886    
887       /* adjust status */       /* adjust status */
888       #pragma omp parallel for private(i)       #pragma omp parallel for private(i)
889       for (i=0; i< my_n; ++i) {       for (i=0; i< my_n; ++i) {
890          if ( Status[i] == 0. ) {          if ( Status[i] == 0. ) {
891             Status[i] = -10;   /* this is now a C point */             Status[i] = -10;   /* this is now a C point */
892          } else if ( w[i]<1.) {          } else if (Status[i] == 1. && w[i]<1.) {
893             Status[i] = -100;   /* this is now a F point */               Status[i] = -100;   /* this is now a F point */  
894          }          }
895       }       }
896            
897             i = numUndefined;
898       numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );       numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
899         if (numUndefined == i) {
900           Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
901           return;
902         }
903    
904       iter++;       iter++;
905       printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);       printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);
   } /* end of while loop */  
906    
907      } /* end of while loop */
908    
909    /* map to output :*/    /* map to output :*/
910    Paso_Coupler_fillOverlap(n, Status, w_coupler);    Paso_Coupler_fillOverlap(n, Status, w_coupler);
# Line 933  printf("%d reduced by %d and %d \n",j, i Line 916  printf("%d reduced by %d and %d \n",j, i
916          split_marker[i]=PASO_AMG_IN_F;          split_marker[i]=PASO_AMG_IN_F;
917       }       }
918    }    }
   
919    /* clean up : */    /* clean up : */
920    Paso_Coupler_free(w_coupler);    Paso_Coupler_free(w_coupler);
921    TMPMEMFREE(random);    TMPMEMFREE(random);
# Line 943  printf("%d reduced by %d and %d \n",j, i Line 925  printf("%d reduced by %d and %d \n",j, i
925        
926    return;    return;
927  }  }
928    
929    /* Merge the system matrix which is distributed on ranks into a complete
930       matrix on rank 0, then solve this matrix on rank 0 only */
931    Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
932      index_t i, iptr, j, n, remote_n, total_n, len, offset, tag;
933      index_t row_block_size, col_block_size, block_size;
934      index_t size=A->mpi_info->size;
935      index_t rank=A->mpi_info->rank;
936      index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
937      index_t *temp_n=NULL, *temp_len=NULL;
938      double  *val=NULL;
939      Paso_Pattern *pattern=NULL;
940      Paso_SparseMatrix *out=NULL;
941      #ifdef ESYS_MPI
942        MPI_Request* mpi_requests=NULL;
943        MPI_Status* mpi_stati=NULL;
944      #else
945        int *mpi_requests=NULL, *mpi_stati=NULL;
946      #endif
947    
948      if (size == 1) {
949        n = A->mainBlock->numRows;
950        ptr = TMPMEMALLOC(n, index_t);
951        #pragma omp parallel for private(i)
952        for (i=0; i<n; i++) ptr[i] = i;
953        out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
954        TMPMEMFREE(ptr);
955        return out;
956      }
957    
958      n = A->mainBlock->numRows;
959      block_size = A->block_size;
960    
961      /* Merge MainBlock and CoupleBlock to get a complete column entries
962         for each row allocated to current rank. Output (ptr, idx, val)
963         contains all info needed from current rank to merge a system
964         matrix  */
965      Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
966    
967      #ifdef ESYS_MPI
968        mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
969        mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
970      #else
971        mpi_requests=TMPMEMALLOC(size*2,int);
972        mpi_stati=TMPMEMALLOC(size*2,int);
973      #endif
974    
975      /* Now, pass all info to rank 0 and merge them into one sparse
976         matrix */
977      if (rank == 0) {
978        /* First, copy local ptr values into ptr_global */
979        total_n=Paso_SystemMatrix_getGlobalNumRows(A);
980        ptr_global = MEMALLOC(total_n+1, index_t);
981        memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
982        iptr = n+1;
983        MEMFREE(ptr);
984        temp_n = TMPMEMALLOC(size, index_t);
985        temp_len = TMPMEMALLOC(size, index_t);
986        temp_n[0] = iptr;
987        
988        /* Second, receive ptr values from other ranks */
989        for (i=1; i<size; i++) {
990          remote_n = A->row_distribution->first_component[i+1] -
991             A->row_distribution->first_component[i];
992          #ifdef ESYS_MPI
993          MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
994                A->mpi_info->msg_tag_counter+i,
995                A->mpi_info->comm,
996                &mpi_requests[i]);
997          #endif
998          temp_n[i] = remote_n;
999          iptr += remote_n;
1000        }
1001        #ifdef ESYS_MPI
1002        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1003        #endif
1004        A->mpi_info->msg_tag_counter += size;
1005    
1006        /* Then, prepare to receive idx and val from other ranks */
1007        len = 0;
1008        offset = -1;
1009        for (i=0; i<size; i++) {
1010          if (temp_n[i] > 0) {
1011        offset += temp_n[i];
1012        len += ptr_global[offset];
1013        temp_len[i] = ptr_global[offset];
1014          }else
1015        temp_len[i] = 0;
1016        }
1017    
1018        idx_global = MEMALLOC(len, index_t);
1019        iptr = temp_len[0];
1020        offset = n+1;
1021        for (i=1; i<size; i++) {
1022          len = temp_len[i];
1023          #ifdef ESYS_MPI
1024          MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1025                A->mpi_info->msg_tag_counter+i,
1026                A->mpi_info->comm,
1027                &mpi_requests[i]);
1028          #endif
1029          remote_n = temp_n[i];
1030          for (j=0; j<remote_n; j++) {
1031        ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1032          }
1033          offset += remote_n;
1034          iptr += len;
1035        }
1036        memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1037        MEMFREE(idx);
1038        row_block_size = A->mainBlock->row_block_size;
1039        col_block_size = A->mainBlock->col_block_size;
1040        #ifdef ESYS_MPI
1041        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1042        #endif
1043        A->mpi_info->msg_tag_counter += size;
1044        TMPMEMFREE(temp_n);
1045    
1046        /* Then generate the sparse matrix */
1047        pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, total_n,
1048                total_n, ptr_global, idx_global);
1049        out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1050                row_block_size, col_block_size, FALSE);
1051        Paso_Pattern_free(pattern);
1052    
1053        /* Finally, receive and copy the value */
1054        iptr = temp_len[0] * block_size;
1055        for (i=1; i<size; i++) {
1056          len = temp_len[i];
1057          #ifdef ESYS_MPI
1058          MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1059                            A->mpi_info->msg_tag_counter+i,
1060                            A->mpi_info->comm,
1061                            &mpi_requests[i]);
1062          #endif
1063          iptr += (len * block_size);
1064        }
1065        memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1066        MEMFREE(val);
1067        #ifdef ESYS_MPI
1068        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1069        #endif
1070        A->mpi_info->msg_tag_counter += size;
1071        TMPMEMFREE(temp_len);
1072      } else { /* it's not rank 0 */
1073    
1074        /* First, send out the local ptr */
1075        tag = A->mpi_info->msg_tag_counter+rank;
1076        #ifdef ESYS_MPI
1077        MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1078                &mpi_requests[0]);
1079        #endif
1080    
1081        /* Next, send out the local idx */
1082        len = ptr[n];
1083        tag += size;
1084        #ifdef ESYS_MPI
1085        MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1086                &mpi_requests[1]);
1087        #endif
1088    
1089        /* At last, send out the local val */
1090        len *= block_size;
1091        tag += size;
1092        #ifdef ESYS_MPI
1093        MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1094                &mpi_requests[2]);
1095    
1096        MPI_Waitall(3, mpi_requests, mpi_stati);
1097        #endif
1098        A->mpi_info->msg_tag_counter = tag + size - rank;
1099        MEMFREE(ptr);
1100        MEMFREE(idx);
1101        MEMFREE(val);
1102    
1103        out = NULL;
1104      }
1105    
1106      TMPMEMFREE(mpi_requests);
1107      TMPMEMFREE(mpi_stati);
1108      return out;
1109    }
1110    
1111    
1112    void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1113      Paso_SystemMatrix *A = amg->A_C;
1114      Paso_SparseMatrix *A_D, *A_temp;
1115      double* x=NULL;
1116      double* b=NULL;
1117      index_t rank = A->mpi_info->rank;
1118      index_t size = A->mpi_info->size;
1119      index_t i, n, p, n_block;
1120      index_t *counts, *offset, *dist;
1121    
1122      n_block = amg->n_block;
1123    
1124      A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1125    
1126      /* First, gather x and b into rank 0 */
1127      dist = A->pattern->input_distribution->first_component;
1128      n = Paso_SystemMatrix_getGlobalNumRows(A);
1129      b = TMPMEMALLOC(n*n_block, double);
1130      x = TMPMEMALLOC(n*n_block, double);
1131      counts = TMPMEMALLOC(size, index_t);
1132      offset = TMPMEMALLOC(size, index_t);
1133    
1134      #pragma omp parallel for private(i,p)
1135      for (i=0; i<size; i++) {
1136        p = dist[i];
1137        counts[i] = (dist[i+1] - p)*n_block;
1138        offset[i] = p*n_block;
1139      }
1140      #ifdef ESYS_MPI
1141      {
1142         index_t count = counts[rank];
1143         MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1144         MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1145      }
1146      #endif
1147    
1148      if (rank == 0) {
1149        /* solve locally */
1150        #ifdef MKL
1151          A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1152          A_temp->solver_package = PASO_MKL;
1153          Paso_SparseMatrix_free(A_D);
1154          Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1155          Paso_SparseMatrix_free(A_temp);
1156        #else
1157          #ifdef UMFPACK
1158        A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1159        A_temp->solver_package = PASO_UMFPACK;
1160        Paso_SparseMatrix_free(A_D);
1161        Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1162        Paso_SparseMatrix_free(A_temp);
1163          #else
1164        A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1165        A_D->solver_package = PASO_SMOOTHER;
1166        Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1167        Paso_SparseMatrix_free(A_D);
1168          #endif
1169        #endif
1170      }
1171    
1172      #ifdef ESYS_MPI
1173      /* now we need to distribute the solution to all ranks */
1174      MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1175      #endif
1176    
1177      TMPMEMFREE(x);
1178      TMPMEMFREE(b);
1179      TMPMEMFREE(counts);
1180      TMPMEMFREE(offset);
1181    }

Legend:
Removed from v.3485  
changed lines
  Added in v.3834

  ViewVC Help
Powered by ViewVC 1.1.26