/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Diff of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3485 by gross, Thu Mar 24 22:44:40 2011 UTC revision 3886 by gross, Thu Apr 5 00:50:30 2012 UTC
# Line 18  Line 18 
18    
19  /**************************************************************/  /**************************************************************/
20    
21  /* Author: artak@uq.edu.au, l.gross@uq.edu.au                                */  /* Author: artak@uq.edu.au, l.gross@uq.edu.au                 */
22    
23  /**************************************************************/  /**************************************************************/
24    
25  #define SHOW_TIMING FALSE  #define SHOW_TIMING FALSE
26    #define MY_DEBUG 0
27    #define MY_DEBUG1 1
28    
29  #include "Paso.h"  #include "Paso.h"
30  #include "Preconditioner.h"  #include "Preconditioner.h"
# Line 89  dim_t Paso_Preconditioner_AMG_getNumCoar Line 91  dim_t Paso_Preconditioner_AMG_getNumCoar
91  Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {  Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92    
93    Paso_Preconditioner_AMG* out=NULL;    Paso_Preconditioner_AMG* out=NULL;
94      Paso_SystemMatrix *A_C=NULL;
95    bool_t verbose=options->verbose;    bool_t verbose=options->verbose;
96    
97    const dim_t my_n=A_p->mainBlock->numRows;    const dim_t my_n=A_p->mainBlock->numRows;
# Line 97  Paso_Preconditioner_AMG* Paso_Preconditi Line 100  Paso_Preconditioner_AMG* Paso_Preconditi
100    const dim_t n = my_n + overlap_n;    const dim_t n = my_n + overlap_n;
101    
102    const dim_t n_block=A_p->row_block_size;    const dim_t n_block=A_p->row_block_size;
103    index_t* F_marker=NULL, *counter=NULL;    index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
104    dim_t i;    dim_t i, my_n_F, my_n_C, n_C, F_flag, *F_set=NULL, global_n_C=0, global_n_F=0, n_F;
105    double time0=0;    double time0=0;
106    const double theta = options->coarsening_threshold;    const double theta = options->coarsening_threshold;
107    const double tau = options->diagonal_dominance_threshold;    const double tau = options->diagonal_dominance_threshold;
108    const double sparsity=Paso_SystemMatrix_getSparsity(A_p);    const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
109    const dim_t total_n=Paso_SystemMatrix_getGlobalTotalNumRows(A_p);    const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
110    
111    
     
112    /*    /*
113        is the input matrix A suitable for coarsening        is the input matrix A suitable for coarsening?
114                
115    */    */
116    if ( (sparsity >= options->min_coarse_sparsity) ||    if ( (sparsity >= options->min_coarse_sparsity) ||
117         (total_n <= options->min_coarse_matrix_size) ||         (global_n <= options->min_coarse_matrix_size) ||
118         (level > options->level_max) ) {         (level > options->level_max) ) {
119    
120          if (verbose) {          if (verbose) {
# Line 124  Paso_Preconditioner_AMG* Paso_Preconditi Line 127  Paso_Preconditioner_AMG* Paso_Preconditi
127            printf("Paso_Preconditioner: AMG: termination of coarsening by ");            printf("Paso_Preconditioner: AMG: termination of coarsening by ");
128    
129            if (sparsity >= options->min_coarse_sparsity)            if (sparsity >= options->min_coarse_sparsity)
130                printf("SPAR ");                printf("SPAR");
131    
132            if (total_n <= options->min_coarse_matrix_size)            if (global_n <= options->min_coarse_matrix_size)
133                printf("SIZE ");                printf("SIZE");
134    
135            if (level > options->level_max)            if (level > options->level_max)
136                printf("LEVEL ");                printf("LEVEL");
137    
138            printf("\n");            printf("\n");
139    
140          printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",          printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
141             level,  options->level_max, sparsity, options->min_coarse_sparsity, total_n, options->min_coarse_matrix_size);               level,  options->level_max, sparsity, options->min_coarse_sparsity, global_n, options->min_coarse_matrix_size);  
142    
143         }         }
144    
# Line 144  Paso_Preconditioner_AMG* Paso_Preconditi Line 147  Paso_Preconditioner_AMG* Paso_Preconditi
147       /* Start Coarsening : */       /* Start Coarsening : */
148            
149       /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */       /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
150       const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len;       const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len  + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
151    
152       dim_t* degree_S=TMPMEMALLOC(n, dim_t);       dim_t* degree_S=TMPMEMALLOC(n, dim_t);
153       index_t *offset_S=TMPMEMALLOC(n, index_t);       index_t *offset_S=TMPMEMALLOC(n, index_t);
# Line 163  Paso_Preconditioner_AMG* Paso_Preconditi Line 166  Paso_Preconditioner_AMG* Paso_Preconditi
166             make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical             make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
167      */      */
168          Paso_SystemMatrix_copyColCoupleBlock(A_p);          Paso_SystemMatrix_copyColCoupleBlock(A_p);
169          /* Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE); */          Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
170    
171      /*      /*
172            set splitting of unknows:            set splitting of unknows:
# Line 176  Paso_Preconditioner_AMG* Paso_Preconditi Line 179  Paso_Preconditioner_AMG* Paso_Preconditi
179             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
180       }       }
181       Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);       Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
182        /*   Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
183  {   */
184     dim_t p;  
    for (i=0; i<n; ++i) {  
          printf("%d: ",i);  
         for (p=0; p<degree_S[i];++p) printf("%d ",S[offset_S[i]+p]);  
         printf("\n");  
    }  
 }  
185       Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,       Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
186                              degree_S, offset_S, S, degree_ST, offset_ST, ST,                              degree_S, offset_S, S, degree_ST, offset_ST, ST,
187                          A_p->col_coupler->connector,A_p->col_distribution);                          A_p->col_coupler->connector,A_p->col_distribution);
 Esys_setError(SYSTEM_ERROR, "AMG:DONE.");  
 return NULL;  
188                
189    
190  /*MPI:           /* in BoomerAMG if interpolation is used FF connectivity is required */
      Paso_Preconditioner_AMG_RungeStuebenSearch(n, A_p->pattern->ptr, degree_S, S, F_marker, options->usePanel);  
 */  
       
          /* in BoomerAMG interpolation is used FF connectiovity is required :*/  
191  /*MPI:  /*MPI:
192           if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)           if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
193                               Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);                                 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);  
194  */  */
195    
196       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
   
 #ifdef AAAAA  
197       if (Esys_noError() ) {       if (Esys_noError() ) {
198          #pragma omp parallel for private(i) schedule(static)          #pragma omp parallel for private(i) schedule(static)
199          for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] ==  PASO_AMG_IN_F);          for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] ==  PASO_AMG_IN_F);
# Line 212  return NULL; Line 201  return NULL;
201          /*          /*
202             count number of unkowns to be eliminated:             count number of unkowns to be eliminated:
203          */          */
204          n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);          my_n_F=Paso_Util_cumsum_maskedTrue(my_n,counter, F_marker);
205          n_C=n-n_F;          n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
206          if (verbose) printf("Paso_Preconditioner: AMG level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);              /* collect my_n_F values on all processes, a direct solver should
207                        be used if any my_n_F value is 0 */
208          if ( n_F == 0 ) {  /*  is a nasty case. a direct solver should be used, return NULL */              F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
209            #ifdef ESYS_MPI
210                MPI_Allgather(&my_n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
211            #endif
212                global_n_F=0;
213                F_flag = 1;
214                for (i=0; i<A_p->mpi_info->size; i++) {
215                    global_n_F+=F_set[i];
216                    if (F_set[i] == 0) F_flag = 0;
217                }
218                TMPMEMFREE(F_set);
219    
220            my_n_C=my_n-my_n_F;
221                global_n_C=global_n-global_n_F;
222            if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,global_n_F,global_n_C);
223    
224            
225    /*          if ( n_F == 0 ) {  is a nasty case. a direct solver should be used, return NULL */
226                if (F_flag == 0) {
227             out = NULL;             out = NULL;
228          } else {          } else {
229             out=MEMALLOC(1,Paso_Preconditioner_AMG);             out=MEMALLOC(1,Paso_Preconditioner_AMG);
230             if (! Esys_checkPtr(out)) {             if (! Esys_checkPtr(out)) {
231            out->level = level;            out->level = level;
           out->n = n;  
           out->n_F = n_F;  
           out->n_block = n_block;  
232            out->A_C = NULL;            out->A_C = NULL;
233            out->P = NULL;              out->P = NULL;  
234            out->R = NULL;                      out->R = NULL;          
# Line 242  return NULL; Line 246  return NULL;
246             Esys_checkPtr(rows_in_F);             Esys_checkPtr(rows_in_F);
247             if ( Esys_noError() ) {             if ( Esys_noError() ) {
248    
249            out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), verbose);            out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
250                
251            if (n_C != 0) {            if (global_n_C != 0) {
252                 /* if nothing is been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */              /*  create mask of C nodes with value >-1, gives new id */
253                n_C=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
254                /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
255        
256              /* allocate helpers :*/              /* allocate helpers :*/
257              out->x_C=MEMALLOC(n_block*n_C,double);              out->x_C=MEMALLOC(n_block*my_n_C,double);
258              out->b_C=MEMALLOC(n_block*n_C,double);              out->b_C=MEMALLOC(n_block*my_n_C,double);
259              out->r=MEMALLOC(n_block*n,double);              out->r=MEMALLOC(n_block*my_n,double);
260                            
261              Esys_checkPtr(out->r);              Esys_checkPtr(out->r);
262              Esys_checkPtr(out->x_C);              Esys_checkPtr(out->x_C);
# Line 265  return NULL; Line 271  return NULL;
271                   if  (F_marker[i]) rows_in_F[counter[i]]=i;                   if  (F_marker[i]) rows_in_F[counter[i]]=i;
272                    }                    }
273                 }                 }
                /*  create mask of C nodes with value >-1 gives new id */  
                i=Paso_Util_cumsum_maskedFalse(n,counter, F_marker);  
   
                #pragma omp parallel for private(i) schedule(static)  
                for (i = 0; i < n; ++i) {  
                   if  (F_marker[i]) {  
                  mask_C[i]=-1;  
                   } else {  
                  mask_C[i]=counter[i];;  
                   }  
                }  
274                 /*                 /*
275                    get Prolongation :                        get Prolongation :    
276                 */                                   */                  
277    
278                 time0=Esys_timer();                 time0=Esys_timer();
279  /*MPI:  
280                 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,A_p->pattern->ptr, degree_S,S,n_C,mask_C, options->interpolation_method);                 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
281  */  
                if (SHOW_TIMING) printf("timing: level %d: getProlongation: %e\n",level, Esys_timer()-time0);  
282              }              }
283    
284              /*                    /*      
285                 construct Restriction operator as transposed of Prolongation operator:                 construct Restriction operator as transposed of Prolongation operator:
286              */              */
287    
288              if ( Esys_noError()) {              if ( Esys_noError()) {
289                 time0=Esys_timer();                 time0=Esys_timer();
290  /*MPI:  
291                 out->R=Paso_SystemMatrix_getTranspose(out->P);                 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
292  */  
293                 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);                 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
294              }                    }      
295              /*              /*
296              construct coarse level matrix:                          construct coarse level matrix:
297              */                          */
298              if ( Esys_noError()) {                          if ( Esys_noError()) {
299                 time0=Esys_timer();                             time0=Esys_timer();
 /*MPI:  
                Atemp=Paso_SystemMatrix_MatrixMatrix(A_p,out->P);  
                A_C=Paso_SystemMatrix_MatrixMatrix(out->R,Atemp);  
                Paso_Preconditioner_AMG_setStrongConnections  
                Paso_SystemMatrix_free(Atemp);  
 */  
300    
301                 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);                                         A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
302              }  
303                               if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
304                            }
305    
               
306              /*              /*
307                 constructe courser level:                 constructe courser level:
308                                
# Line 318  return NULL; Line 310  return NULL;
310              if ( Esys_noError()) {              if ( Esys_noError()) {
311                 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);                 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
312              }              }
313    
314              if ( Esys_noError()) {              if ( Esys_noError()) {
315                 if ( out->AMG_C == NULL ) {                if ( out->AMG_C == NULL ) {
316                      /* merge the system matrix into 1 rank when
317                     it's not suitable coarsening due to the
318                     total number of unknowns are too small */
319                      out->A_C=A_C;
320                    out->reordering = options->reordering;                    out->reordering = options->reordering;
321                    out->refinements = options->coarse_matrix_refinements;                                out->refinements = options->coarse_matrix_refinements;
322                    /* no coarse level matrix has been constructed. use direct solver */                    out->verbose = verbose;
323                    #ifdef MKL                    out->options_smoother = options->smoother;
324                      out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_C);                } else {
                     Paso_SystemMatrix_free(A_C);  
                     out->A_C->solver_package = PASO_MKL;  
                     if (verbose) printf("Paso_Preconditioner: AMG: use MKL direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                   #else  
                     #ifdef UMFPACK  
                        out->A_C=Paso_SystemMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_C);  
                        Paso_SystemMatrix_free(A_C);  
                        out->A_C->solver_package = PASO_UMFPACK;  
                        if (verbose) printf("Paso_Preconditioner: AMG: use UMFPACK direct solver on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                     #else  
                        out->A_C=A_C;  
                        out->A_C->solver_p=Paso_Preconditioner_Smoother_alloc(out->A_C, (options->smoother == PASO_JACOBI), verbose);  
                        out->A_C->solver_package = PASO_SMOOTHER;  
                        if (verbose) printf("Paso_Preconditioner: AMG: use smoother on the coarsest level (number of unknowns = %d).\n",n_C*n_block);  
                     #endif  
                   #endif  
                } else {  
325                    /* finally we set some helpers for the solver step */                    /* finally we set some helpers for the solver step */
326                    out->A_C=A_C;                    out->A_C=A_C;
327                 }                }
328              }                      }        
329            }            }
330             }             }
# Line 352  return NULL; Line 332  return NULL;
332             TMPMEMFREE(rows_in_F);             TMPMEMFREE(rows_in_F);
333          }          }
334       }       }
 #endif  
335    
336    }    }
337    TMPMEMFREE(counter);    TMPMEMFREE(counter);
# Line 376  return NULL; Line 355  return NULL;
355    
356    
357  void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {  void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
358       const dim_t n = amg->n * amg->n_block;       const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
359       double time0=0;       double time0=0;
360       const dim_t post_sweeps=amg->post_sweeps;       const dim_t post_sweeps=amg->post_sweeps;
361       const dim_t pre_sweeps=amg->pre_sweeps;       const dim_t pre_sweeps=amg->pre_sweeps;
# Line 384  void Paso_Preconditioner_AMG_solve(Paso_ Line 363  void Paso_Preconditioner_AMG_solve(Paso_
363       /* presmoothing */       /* presmoothing */
364       time0=Esys_timer();       time0=Esys_timer();
365       Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);       Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
366    
367       time0=Esys_timer()-time0;       time0=Esys_timer()-time0;
368       if (SHOW_TIMING) printf("timing: level %d: Presmooting: %e\n",amg->level, time0);       if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
369       /* end of presmoothing */       /* end of presmoothing */
370            
371       if (amg->n_F < amg->n) { /* is there work on the coarse level? */       time0=Esys_timer();
372           time0=Esys_timer();  
373       Paso_Copy(n, amg->r, b);                            /*  r <- b */       Paso_Copy(n, amg->r, b);                            /*  r <- b */
374       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
375       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */
376           time0=Esys_timer()-time0;  
377       /* coarse level solve */       time0=Esys_timer()-time0;
378       if ( amg->AMG_C == NULL) {       /* coarse level solve */
379         if ( amg->AMG_C == NULL) {
380          time0=Esys_timer();          time0=Esys_timer();
381          /*  A_C is the coarsest level */          /*  A_C is the coarsest level */
382  #ifdef FIXME          Paso_Preconditioner_AMG_mergeSolve(amg);
383          switch (amg->A_C->solver_package) {  
            case (PASO_MKL):  
           Paso_MKL(amg->A_C, amg->x_C,amg->b_C, amg->reordering, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_UMFPACK):  
           Paso_UMFPACK(amg->A_C, amg->x_C,amg->b_C, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_SMOOTHER):  
           Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE);  
           break;  
         }  
 #endif  
         Paso_Preconditioner_Smoother_solve(amg->A_C, amg->A_C->solver_p,amg->x_C,amg->b_C,pre_sweeps+post_sweeps, FALSE);  
384          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
385       } else {       } else {
386          Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */          Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */
      }    
      time0=time0+Esys_timer();  
      Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */      
       
          /*postsmoothing*/  
         
         /*solve Ax=b with initial guess x */  
         time0=Esys_timer();  
         Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);  
         time0=Esys_timer()-time0;  
         if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);  
         /*end of postsmoothing*/  
       
387       }       }
388       return;  
389        time0=time0+Esys_timer();
390        Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */    
391    
392        /*postsmoothing*/
393          
394        /*solve Ax=b with initial guess x */
395        time0=Esys_timer();
396        Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
397        time0=Esys_timer()-time0;
398        if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
399        return;
400  }  }
401    
402  /* theta = threshold for strong connections */  /* theta = threshold for strong connections */
# Line 451  void Paso_Preconditioner_AMG_setStrongCo Line 418  void Paso_Preconditioner_AMG_setStrongCo
418     index_t iptr, i;     index_t iptr, i;
419     double *threshold_p=NULL;     double *threshold_p=NULL;
420    
   
421     threshold_p = TMPMEMALLOC(2*my_n, double);     threshold_p = TMPMEMALLOC(2*my_n, double);
422        
423     #pragma omp parallel for private(i,iptr) schedule(static)     #pragma omp parallel for private(i,iptr) schedule(static)
# Line 461  void Paso_Preconditioner_AMG_setStrongCo Line 427  void Paso_Preconditioner_AMG_setStrongCo
427       register double sum_row=0;       register double sum_row=0;
428       register double main_row=0;       register double main_row=0;
429       register dim_t kdeg=0;       register dim_t kdeg=0;
430           const register index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];           register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
431    
432                    
433       /* collect information for row i: */       /* collect information for row i: */
434       #pragma ivdep       #pragma ivdep
435       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
436          register index_t j=A->mainBlock->pattern->index[iptr];          register index_t j=A->mainBlock->pattern->index[iptr];
437          register double fnorm=ABS(A->mainBlock->val[iptr]);          register double fnorm=ABS(A->mainBlock->val[iptr]);
           
438          if( j != i) {          if( j != i) {
439             max_offdiagonal = MAX(max_offdiagonal,fnorm);             max_offdiagonal = MAX(max_offdiagonal,fnorm);
440             sum_row+=fnorm;             sum_row+=fnorm;
# Line 477  void Paso_Preconditioner_AMG_setStrongCo Line 443  void Paso_Preconditioner_AMG_setStrongCo
443          }          }
444    
445       }       }
446        
447       #pragma ivdep       #pragma ivdep
448       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
449          register double fnorm=ABS(A->col_coupleBlock->val[iptr]);          register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
450    
451          max_offdiagonal = MAX(max_offdiagonal,fnorm);          max_offdiagonal = MAX(max_offdiagonal,fnorm);
452          sum_row+=fnorm;          sum_row+=fnorm;
453       }       }
# Line 489  void Paso_Preconditioner_AMG_setStrongCo Line 456  void Paso_Preconditioner_AMG_setStrongCo
456           {           {
457          const double threshold = theta*max_offdiagonal;          const double threshold = theta*max_offdiagonal;
458              threshold_p[2*i+1]=threshold;              threshold_p[2*i+1]=threshold;
459          if (tau*main_row < sum_row) { /* no diagonal domainance */          if (tau*main_row < sum_row) { /* no diagonal dominance */
460                 threshold_p[2*i]=1;                 threshold_p[2*i]=1;
461             #pragma ivdep             #pragma ivdep
462             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
# Line 514  void Paso_Preconditioner_AMG_setStrongCo Line 481  void Paso_Preconditioner_AMG_setStrongCo
481           offset_S[i]=koffset;           offset_S[i]=koffset;
482       degree_S[i]=kdeg;       degree_S[i]=kdeg;
483        }        }
484    
485        /* now we need to distribute the threshold and the diagonal dominance indicator */        /* now we need to distribute the threshold and the diagonal dominance indicator */
486        if (A->mpi_info->size > 1) {        if (A->mpi_info->size > 1) {
487    
# Line 529  void Paso_Preconditioner_AMG_setStrongCo Line 497  void Paso_Preconditioner_AMG_setStrongCo
497    
498            #pragma omp parallel for private(i,iptr) schedule(static)            #pragma omp parallel for private(i,iptr) schedule(static)
499            for (i=0; i<overlap_n; i++) {            for (i=0; i<overlap_n; i++) {
           
500            const double threshold = remote_threshold[2*i+1];            const double threshold = remote_threshold[2*i+1];
501            register dim_t kdeg=0;            register dim_t kdeg=0;
502                const register index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i];                register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
503                if (remote_threshold[2*i]>0) {                if (remote_threshold[2*i]>0) {
504               #pragma ivdep          #pragma ivdep
505               for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
506                register index_t j=A->row_coupleBlock->pattern->index[iptr];                register index_t j=A->row_coupleBlock->pattern->index[iptr];
507                if(ABS(A->row_coupleBlock->val[iptr])>threshold) {                if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
508               S[koffset+kdeg] = j ;               S[koffset+kdeg] = j ;
509               kdeg++;               kdeg++;
510                }                }
511             }          }
512    
513            #pragma ivdep
514            for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
515              register index_t j=A->remote_coupleBlock->pattern->index[iptr];
516              if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
517                  S[koffset+kdeg] = j + my_n;
518                  kdeg++;
519              }
520            }
521                }                }
522                offset_S[i+my_n]=koffset;                offset_S[i+my_n]=koffset;
523            degree_S[i+my_n]=kdeg;            degree_S[i+my_n]=kdeg;
# Line 564  void Paso_Preconditioner_AMG_setStrongCo Line 539  void Paso_Preconditioner_AMG_setStrongCo
539                              const double theta, const double tau)                              const double theta, const double tau)
540    
541  {  {
542     const dim_t n_block=A->block_size;     const dim_t block_size=A->block_size;
543     const dim_t my_n=A->mainBlock->numRows;     const dim_t my_n=A->mainBlock->numRows;
544     const dim_t overlap_n=A->row_coupleBlock->numRows;     const dim_t overlap_n=A->row_coupleBlock->numRows;
545        
# Line 573  void Paso_Preconditioner_AMG_setStrongCo Line 548  void Paso_Preconditioner_AMG_setStrongCo
548        
549        
550     threshold_p = TMPMEMALLOC(2*my_n, double);     threshold_p = TMPMEMALLOC(2*my_n, double);
551      
552     #pragma omp parallel private(i,iptr, bi)     #pragma omp parallel private(i,iptr,bi)
553     {     {
554        
555        dim_t max_deg=0;        dim_t max_deg=0;
# Line 593  void Paso_Preconditioner_AMG_setStrongCo Line 568  void Paso_Preconditioner_AMG_setStrongCo
568       register double main_row=0;       register double main_row=0;
569       register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];       register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
570       register dim_t kdeg=0;       register dim_t kdeg=0;
571       const register index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];       register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
572            
573       /* collect information for row i: */       /* collect information for row i: */
574       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
575          register index_t j=A->mainBlock->pattern->index[iptr];          register index_t j=A->mainBlock->pattern->index[iptr];
576          register double fnorm=0;          register double fnorm=0;
577          #pragma ivdep          #pragma ivdep
578          for(bi=0;bi<n_block;++bi) {          for(bi=0;bi<block_size;++bi) {
579              register double  rtmp2= A->mainBlock->val[iptr*n_block+bi];              register double  rtmp2= A->mainBlock->val[iptr*block_size+bi];
580             fnorm+=rtmp2*rtmp2;             fnorm+=rtmp2*rtmp2;
581          }          }
582          fnorm=sqrt(fnorm);          fnorm=sqrt(fnorm);
# Line 620  void Paso_Preconditioner_AMG_setStrongCo Line 595  void Paso_Preconditioner_AMG_setStrongCo
595       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
596          register double fnorm=0;          register double fnorm=0;
597          #pragma ivdep          #pragma ivdep
598          for(bi=0;bi<n_block;++bi) {          for(bi=0;bi<block_size;++bi) {
599             register double rtmp2 = A->col_coupleBlock->val[iptr*n_block+bi];             register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
600             fnorm+=rtmp2*rtmp2;             fnorm+=rtmp2*rtmp2;
601          }          }
602          fnorm=sqrt(fnorm);          fnorm=sqrt(fnorm);
# Line 638  void Paso_Preconditioner_AMG_setStrongCo Line 613  void Paso_Preconditioner_AMG_setStrongCo
613          rtmp_offset=-A->mainBlock->pattern->ptr[i];          rtmp_offset=-A->mainBlock->pattern->ptr[i];
614                    
615          threshold_p[2*i+1]=threshold;          threshold_p[2*i+1]=threshold;
616          if (tau*main_row < sum_row) { /* no diagonal domainance */          if (tau*main_row < sum_row) { /* no diagonal dominance */
617             threshold_p[2*i]=1;             threshold_p[2*i]=1;
            rtmp_offset=-A->mainBlock->pattern->ptr[i];  
618             #pragma ivdep             #pragma ivdep
619             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {             for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
620            register index_t j=A->mainBlock->pattern->index[iptr];            register index_t j=A->mainBlock->pattern->index[iptr];
# Line 685  void Paso_Preconditioner_AMG_setStrongCo Line 659  void Paso_Preconditioner_AMG_setStrongCo
659            
660       const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];       const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
661       register dim_t kdeg=0;       register dim_t kdeg=0;
662       const register index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i];       register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
663       if (remote_threshold[2*i]>0) {       if (remote_threshold[2*i]>0) {
664          #pragma ivdep          #pragma ivdep
665          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {          for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
666             register index_t j=A->row_coupleBlock->pattern->index[iptr];             register index_t j=A->row_coupleBlock->pattern->index[iptr];
667             register double fnorm2=0;             register double fnorm2=0;
668             #pragma ivdepremote_threshold[2*i]             #pragma ivdepremote_threshold[2*i]
669             for(bi=0;bi<n_block;++bi) {             for(bi=0;bi<block_size;++bi) {
670            register double rtmp2 = A->row_coupleBlock->val[iptr*n_block+bi];            register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
671            fnorm2+=rtmp2*rtmp2;            fnorm2+=rtmp2*rtmp2;
672             }             }
673                        
# Line 702  void Paso_Preconditioner_AMG_setStrongCo Line 676  void Paso_Preconditioner_AMG_setStrongCo
676            kdeg++;            kdeg++;
677             }             }
678          }          }
679    
680            #pragma ivdep
681                for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
682                   register index_t j=A->remote_coupleBlock->pattern->index[iptr];
683                   register double fnorm2=0;
684                   #pragma ivdepremote_threshold[2*i]
685                   for(bi=0;bi<block_size;++bi) {
686                      register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
687                      fnorm2+=rtmp2*rtmp2;
688                   }
689                   if(fnorm2 > threshold2 && i != j) {
690                      S[koffset+kdeg] = j + my_n;
691                      kdeg++;
692                   }
693                }
694                    
695       }       }
696       offset_S[i+my_n]=koffset;       offset_S[i+my_n]=koffset;
# Line 711  void Paso_Preconditioner_AMG_setStrongCo Line 700  void Paso_Preconditioner_AMG_setStrongCo
700     }     }
701     TMPMEMFREE(threshold_p);     TMPMEMFREE(threshold_p);
702  }  }
703    
704  void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,  void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
705                              const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)                              const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
706  {  {
# Line 738  void Paso_Preconditioner_AMG_transposeSt Line 728  void Paso_Preconditioner_AMG_transposeSt
728     }     }
729  }  }
730    
731    int compareindex(const void *a, const void *b)
732    {
733      return (*(int *)a - *(int *)b);
734    }
735    
736  void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,  void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
737                          const dim_t* degree_S, const index_t* offset_S, const index_t* S,                          const dim_t* degree_S, const index_t* offset_S, const index_t* S,
738                          const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,                          const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
# Line 754  void Paso_Preconditioner_AMG_CIJPCoarsen Line 749  void Paso_Preconditioner_AMG_CIJPCoarsen
749    Status=TMPMEMALLOC(n, double);    Status=TMPMEMALLOC(n, double);
750    random = Paso_Distribution_createRandomVector(col_dist,1);    random = Paso_Distribution_createRandomVector(col_dist,1);
751    ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);    ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
752      
753    #pragma omp parallel for private(i)    #pragma omp parallel for private(i)
754    for (i=0; i< my_n; ++i) {    for (i=0; i< my_n; ++i) {
755        w[i]=degree_ST[i]+random[i];        w[i]=degree_ST[i]+random[i];
# Line 764  void Paso_Preconditioner_AMG_CIJPCoarsen Line 759  void Paso_Preconditioner_AMG_CIJPCoarsen
759         Status[i]=1; /* status undefined */         Status[i]=1; /* status undefined */
760        }        }
761    }    }
762    
763    #pragma omp parallel for private(i, iptr)    #pragma omp parallel for private(i, iptr)
764    for (i=0; i< n; ++i) {    for (i=0; i< n; ++i) {
765        for( iptr =0 ; iptr < degree_ST[i]; ++iptr)  {        for( iptr =0 ; iptr < degree_ST[i]; ++iptr)  {
# Line 773  void Paso_Preconditioner_AMG_CIJPCoarsen Line 769  void Paso_Preconditioner_AMG_CIJPCoarsen
769    
770        
771    numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );    numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
772    printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);    /* printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);  */
   
     
773    iter=0;    iter=0;
774    while (numUndefined > 0) {    while (numUndefined > 0) {
775       Paso_Coupler_fillOverlap(n, w, w_coupler);       Paso_Coupler_fillOverlap(n, w, w_coupler);
776       {  
777      int p;        /* calculate the maximum value of neighbours following active strong connections:
778      for (p=0; p<my_n; ++p) {          w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active  */      
        printf(" %d : %f %f \n",p, w[p], Status[p]);  
     }  
     for (p=my_n; p<n; ++p) {  
        printf(" %d : %f \n",p, w[p]);  
     }  
       
       
      }  
       
       /* calculate the maximum value of naigbours following active strong connections:  
         w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) conenction is still active  */        
779        #pragma omp parallel for private(i, iptr)        #pragma omp parallel for private(i, iptr)
780        for (i=0; i<my_n; ++i) {        for (i=0; i<my_n; ++i) {
781       if (Status[i]>0) { /* status is still undefined */       if (Status[i]>0) { /* status is still undefined */
# Line 800  void Paso_Preconditioner_AMG_CIJPCoarsen Line 783  void Paso_Preconditioner_AMG_CIJPCoarsen
783          register bool_t inD=TRUE;          register bool_t inD=TRUE;
784          const double wi=w[i];          const double wi=w[i];
785    
   
786          for( iptr =0 ; iptr < degree_S[i]; ++iptr) {          for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
   
787             const index_t k=S[offset_S[i]+iptr];             const index_t k=S[offset_S[i]+iptr];
788             const index_t* start_p = &ST[offset_ST[k]];             const index_t* start_p = &ST[offset_ST[k]];
789             const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);             const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
790    
791             if (ST_flag[(index_t)(where_p-start_p)]>0) {             if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
 printf("S: %d (%e) -> %d    (%e)\n",i, wi, k, w[k]);        
792            if (wi <= w[k] ) {            if (wi <= w[k] ) {
793               inD=FALSE;               inD=FALSE;
794               break;               break;
# Line 821  printf("S: %d (%e) -> %d   (%e)\n",i, wi, Line 801  printf("S: %d (%e) -> %d   (%e)\n",i, wi,
801            for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {            for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
802               const index_t k=ST[offset_ST[i]+iptr];               const index_t k=ST[offset_ST[i]+iptr];
803               if ( ST_flag[offset_ST[i]+iptr] > 0 ) {               if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
 printf("ST: %d (%e) -> %d   (%e)\n",i, wi, k, w[k]);        
               
804                         if (wi <= w[k] ) {                         if (wi <= w[k] ) {
805                 inD=FALSE;                 inD=FALSE;
806                 break;                 break;
# Line 832  printf("ST: %d (%e) -> %d  (%e)\n",i, wi, Line 810  printf("ST: %d (%e) -> %d  (%e)\n",i, wi,
810          }              }    
811          if (inD) {          if (inD) {
812             Status[i]=0.; /* is in D */             Status[i]=0.; /* is in D */
 printf("%d is in D\n",i);  
813          }          }
814       }       }
815            
# Line 864  printf("%d is in D\n",i); Line 841  printf("%d is in D\n",i);
841            const index_t j=ST[offset_ST[i]+jptr];            const index_t j=ST[offset_ST[i]+jptr];
842            if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {            if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
843               w[i]--;               w[i]--;
 printf("%d reduced by %d\n",i,j);  
844               ST_flag[offset_ST[i]+jptr]=-1;               ST_flag[offset_ST[i]+jptr]=-1;
845            }            }
846             }             }
# Line 886  printf("%d reduced by %d\n",i,j); Line 862  printf("%d reduced by %d\n",i,j);
862    
863               for (jptr=0; jptr<degree_ST[i]; ++jptr) {               for (jptr=0; jptr<degree_ST[i]; ++jptr) {
864              const index_t j=ST[offset_ST[i]+jptr];              const index_t j=ST[offset_ST[i]+jptr];
 printf("check connection: %d %d\n",i,j);  
865              ST_flag[offset_ST[i]+jptr]=-1;              ST_flag[offset_ST[i]+jptr]=-1;
866              for (kptr=0; kptr<degree_ST[j]; ++kptr) {              for (kptr=0; kptr<degree_ST[j]; ++kptr) {
867                 const index_t k=ST[offset_ST[j]+kptr];                 const index_t k=ST[offset_ST[j]+kptr];
 printf("check connection: %d of %d\n",k,j);  
868                 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */                 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
 printf("found!\n");  
869                    if (ST_flag[offset_ST[j]+kptr] >0) {                    if (ST_flag[offset_ST[j]+kptr] >0) {
870                   if (j< my_n ) {                   if (j< my_n ) {
871                      w[j]--;                      w[j]--;
 printf("%d reduced by %d and %d \n",j, i,k);  
872                   }                   }
873                   ST_flag[offset_ST[j]+kptr]=-1;                   ST_flag[offset_ST[j]+kptr]=-1;
874                    }                    }
# Line 906  printf("%d reduced by %d and %d \n",j, i Line 878  printf("%d reduced by %d and %d \n",j, i
878            }            }
879          }          }
880       }       }
881    
882       /* adjust status */       /* adjust status */
883       #pragma omp parallel for private(i)       #pragma omp parallel for private(i)
884       for (i=0; i< my_n; ++i) {       for (i=0; i< my_n; ++i) {
885          if ( Status[i] == 0. ) {          if ( Status[i] == 0. ) {
886             Status[i] = -10;   /* this is now a C point */             Status[i] = -10;   /* this is now a C point */
887          } else if ( w[i]<1.) {          } else if (Status[i] == 1. && w[i]<1.) {
888             Status[i] = -100;   /* this is now a F point */               Status[i] = -100;   /* this is now a F point */  
889          }          }
890       }       }
891            
892             i = numUndefined;
893       numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );       numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
894         if (numUndefined == i) {
895           Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
896           return;
897         }
898    
899       iter++;       iter++;
900       printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined);       /* printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined); */
   } /* end of while loop */  
901    
902      } /* end of while loop */
903    
904    /* map to output :*/    /* map to output :*/
905    Paso_Coupler_fillOverlap(n, Status, w_coupler);    Paso_Coupler_fillOverlap(n, Status, w_coupler);
# Line 933  printf("%d reduced by %d and %d \n",j, i Line 911  printf("%d reduced by %d and %d \n",j, i
911          split_marker[i]=PASO_AMG_IN_F;          split_marker[i]=PASO_AMG_IN_F;
912       }       }
913    }    }
   
914    /* clean up : */    /* clean up : */
915    Paso_Coupler_free(w_coupler);    Paso_Coupler_free(w_coupler);
916    TMPMEMFREE(random);    TMPMEMFREE(random);
# Line 943  printf("%d reduced by %d and %d \n",j, i Line 920  printf("%d reduced by %d and %d \n",j, i
920        
921    return;    return;
922  }  }
923    
924    /* Merge the system matrix which is distributed on ranks into a complete
925       matrix on rank 0, then solve this matrix on rank 0 only */
926    Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
927      index_t i, iptr, j, n, remote_n, global_n, len, offset, tag;
928      index_t row_block_size, col_block_size, block_size;
929      index_t size=A->mpi_info->size;
930      index_t rank=A->mpi_info->rank;
931      index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
932      index_t *temp_n=NULL, *temp_len=NULL;
933      double  *val=NULL;
934      Paso_Pattern *pattern=NULL;
935      Paso_SparseMatrix *out=NULL;
936      #ifdef ESYS_MPI
937        MPI_Request* mpi_requests=NULL;
938        MPI_Status* mpi_stati=NULL;
939      #else
940        int *mpi_requests=NULL, *mpi_stati=NULL;
941      #endif
942    
943      if (size == 1) {
944        n = A->mainBlock->numRows;
945        ptr = TMPMEMALLOC(n, index_t);
946        #pragma omp parallel for private(i)
947        for (i=0; i<n; i++) ptr[i] = i;
948        out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
949        TMPMEMFREE(ptr);
950        return out;
951      }
952    
953      n = A->mainBlock->numRows;
954      block_size = A->block_size;
955    
956      /* Merge MainBlock and CoupleBlock to get a complete column entries
957         for each row allocated to current rank. Output (ptr, idx, val)
958         contains all info needed from current rank to merge a system
959         matrix  */
960      Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
961    
962      #ifdef ESYS_MPI
963        mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
964        mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
965      #else
966        mpi_requests=TMPMEMALLOC(size*2,int);
967        mpi_stati=TMPMEMALLOC(size*2,int);
968      #endif
969    
970      /* Now, pass all info to rank 0 and merge them into one sparse
971         matrix */
972      if (rank == 0) {
973        /* First, copy local ptr values into ptr_global */
974        global_n=Paso_SystemMatrix_getGlobalNumRows(A);
975        ptr_global = MEMALLOC(global_n+1, index_t);
976        memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
977        iptr = n+1;
978        MEMFREE(ptr);
979        temp_n = TMPMEMALLOC(size, index_t);
980        temp_len = TMPMEMALLOC(size, index_t);
981        temp_n[0] = iptr;
982        
983        /* Second, receive ptr values from other ranks */
984        for (i=1; i<size; i++) {
985          remote_n = A->row_distribution->first_component[i+1] -
986             A->row_distribution->first_component[i];
987          #ifdef ESYS_MPI
988          MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
989                A->mpi_info->msg_tag_counter+i,
990                A->mpi_info->comm,
991                &mpi_requests[i]);
992          #endif
993          temp_n[i] = remote_n;
994          iptr += remote_n;
995        }
996        #ifdef ESYS_MPI
997        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
998        #endif
999        A->mpi_info->msg_tag_counter += size;
1000    
1001        /* Then, prepare to receive idx and val from other ranks */
1002        len = 0;
1003        offset = -1;
1004        for (i=0; i<size; i++) {
1005          if (temp_n[i] > 0) {
1006        offset += temp_n[i];
1007        len += ptr_global[offset];
1008        temp_len[i] = ptr_global[offset];
1009          }else
1010        temp_len[i] = 0;
1011        }
1012    
1013        idx_global = MEMALLOC(len, index_t);
1014        iptr = temp_len[0];
1015        offset = n+1;
1016        for (i=1; i<size; i++) {
1017          len = temp_len[i];
1018          #ifdef ESYS_MPI
1019          MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1020                A->mpi_info->msg_tag_counter+i,
1021                A->mpi_info->comm,
1022                &mpi_requests[i]);
1023          #endif
1024          remote_n = temp_n[i];
1025          for (j=0; j<remote_n; j++) {
1026        ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1027          }
1028          offset += remote_n;
1029          iptr += len;
1030        }
1031        memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1032        MEMFREE(idx);
1033        row_block_size = A->mainBlock->row_block_size;
1034        col_block_size = A->mainBlock->col_block_size;
1035        #ifdef ESYS_MPI
1036        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1037        #endif
1038        A->mpi_info->msg_tag_counter += size;
1039        TMPMEMFREE(temp_n);
1040    
1041        /* Then generate the sparse matrix */
1042        pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, global_n,
1043                global_n, ptr_global, idx_global);
1044        out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1045                row_block_size, col_block_size, FALSE);
1046        Paso_Pattern_free(pattern);
1047    
1048        /* Finally, receive and copy the value */
1049        iptr = temp_len[0] * block_size;
1050        for (i=1; i<size; i++) {
1051          len = temp_len[i];
1052          #ifdef ESYS_MPI
1053          MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1054                            A->mpi_info->msg_tag_counter+i,
1055                            A->mpi_info->comm,
1056                            &mpi_requests[i]);
1057          #endif
1058          iptr += (len * block_size);
1059        }
1060        memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1061        MEMFREE(val);
1062        #ifdef ESYS_MPI
1063        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1064        #endif
1065        A->mpi_info->msg_tag_counter += size;
1066        TMPMEMFREE(temp_len);
1067      } else { /* it's not rank 0 */
1068    
1069        /* First, send out the local ptr */
1070        tag = A->mpi_info->msg_tag_counter+rank;
1071        #ifdef ESYS_MPI
1072        MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1073                &mpi_requests[0]);
1074        #endif
1075    
1076        /* Next, send out the local idx */
1077        len = ptr[n];
1078        tag += size;
1079        #ifdef ESYS_MPI
1080        MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1081                &mpi_requests[1]);
1082        #endif
1083    
1084        /* At last, send out the local val */
1085        len *= block_size;
1086        tag += size;
1087        #ifdef ESYS_MPI
1088        MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1089                &mpi_requests[2]);
1090    
1091        MPI_Waitall(3, mpi_requests, mpi_stati);
1092        #endif
1093        A->mpi_info->msg_tag_counter = tag + size - rank;
1094        MEMFREE(ptr);
1095        MEMFREE(idx);
1096        MEMFREE(val);
1097    
1098        out = NULL;
1099      }
1100    
1101      TMPMEMFREE(mpi_requests);
1102      TMPMEMFREE(mpi_stati);
1103      return out;
1104    }
1105    
1106    
1107    void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1108      Paso_SystemMatrix *A = amg->A_C;
1109      Paso_SparseMatrix *A_D, *A_temp;
1110      double* x=NULL;
1111      double* b=NULL;
1112      const index_t rank = A->mpi_info->rank;
1113      const index_t size = A->mpi_info->size;
1114      const n_block = A->mainBlock->row_block_size;
1115    
1116      index_t i, n, p;
1117      index_t *counts, *offset, *dist;
1118      #ifdef ESYS_MPI
1119      index_t count;
1120      #endif
1121      A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1122    
1123      /* First, gather x and b into rank 0 */
1124      dist = A->pattern->input_distribution->first_component;
1125      n = Paso_SystemMatrix_getGlobalNumRows(A);
1126      b = TMPMEMALLOC(n*n_block, double);
1127      x = TMPMEMALLOC(n*n_block, double);
1128      counts = TMPMEMALLOC(size, index_t);
1129      offset = TMPMEMALLOC(size, index_t);
1130    
1131      #pragma omp parallel for private(i,p)
1132      for (i=0; i<size; i++) {
1133        p = dist[i];
1134        counts[i] = (dist[i+1] - p)*n_block;
1135        offset[i] = p*n_block;
1136      }
1137      #ifdef ESYS_MPI
1138      count = counts[rank];
1139      MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1140      MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1141      #endif
1142    
1143      if (rank == 0) {
1144        /* solve locally */
1145        #ifdef MKL
1146          A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1147          A_temp->solver_package = PASO_MKL;
1148          Paso_SparseMatrix_free(A_D);
1149          Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1150          Paso_SparseMatrix_free(A_temp);
1151        #else
1152          #ifdef UMFPACK
1153        A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1154        A_temp->solver_package = PASO_UMFPACK;
1155        Paso_SparseMatrix_free(A_D);
1156        Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1157        Paso_SparseMatrix_free(A_temp);
1158          #else
1159        A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1160        A_D->solver_package = PASO_SMOOTHER;
1161        Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1162        Paso_SparseMatrix_free(A_D);
1163          #endif
1164        #endif
1165      }
1166    
1167      #ifdef ESYS_MPI
1168      /* now we need to distribute the solution to all ranks */
1169      MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1170      #endif
1171    
1172      TMPMEMFREE(x);
1173      TMPMEMFREE(b);
1174      TMPMEMFREE(counts);
1175      TMPMEMFREE(offset);
1176    }

Legend:
Removed from v.3485  
changed lines
  Added in v.3886

  ViewVC Help
Powered by ViewVC 1.1.26