/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Diff of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3351 by gross, Tue Nov 16 02:39:40 2010 UTC revision 3886 by gross, Thu Apr 5 00:50:30 2012 UTC
# Line 14  Line 14 
14    
15  /**************************************************************/  /**************************************************************/
16    
17  /* Paso: AMG preconditioner                                  */  /* Paso: AMG preconditioner  (local version)                  */
18    
19  /**************************************************************/  /**************************************************************/
20    
21  /* Author: artak@uq.edu.au, l.gross@uq.edu.au                                */  /* Author: artak@uq.edu.au, l.gross@uq.edu.au                 */
22    
23  /**************************************************************/  /**************************************************************/
24    
25  #define SHOW_TIMING FALSE  #define SHOW_TIMING FALSE
26    #define MY_DEBUG 0
27    #define MY_DEBUG1 1
28    
29  #include "Paso.h"  #include "Paso.h"
30  #include "Preconditioner.h"  #include "Preconditioner.h"
# Line 30  Line 32 
32  #include "PasoUtil.h"  #include "PasoUtil.h"
33  #include "UMFPACK.h"  #include "UMFPACK.h"
34  #include "MKL.h"  #include "MKL.h"
35    #include<stdio.h>
36    
37    
38  /**************************************************************/  /**************************************************************/
39    
40  /* free all memory used by AMG                                */  /* free all memory used by AMG                                */
41    
42  void Paso_Preconditioner_LocalAMG_free(Paso_Preconditioner_LocalAMG * in) {  void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43       if (in!=NULL) {       if (in!=NULL) {
44      Paso_Preconditioner_LocalSmoother_free(in->Smoother);      Paso_Preconditioner_Smoother_free(in->Smoother);
45      Paso_SparseMatrix_free(in->P);      Paso_SystemMatrix_free(in->P);
46      Paso_SparseMatrix_free(in->R);      Paso_SystemMatrix_free(in->R);
47      Paso_SparseMatrix_free(in->A_C);      Paso_SystemMatrix_free(in->A_C);
48      Paso_Preconditioner_LocalAMG_free(in->AMG_C);      Paso_Preconditioner_AMG_free(in->AMG_C);
49      MEMFREE(in->r);      MEMFREE(in->r);
50      MEMFREE(in->x_C);      MEMFREE(in->x_C);
51      MEMFREE(in->b_C);      MEMFREE(in->b_C);
52    
       
53      MEMFREE(in);      MEMFREE(in);
54       }       }
55  }  }
56    
57    index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
58       if (in->AMG_C == NULL) {
59          return in->level;
60       } else {
61          return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
62       }
63    }
64    double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
65          if (in->AMG_C == NULL) {
66         if (in->A_C == NULL) {
67            return 1.;
68         } else {
69            return Paso_SystemMatrix_getSparsity(in->A_C);
70         }
71          } else {
72            return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
73          }
74    }
75    dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
76       if (in->AMG_C == NULL) {
77          if (in->A_C == NULL) {
78         return 0;
79          } else {
80         return Paso_SystemMatrix_getTotalNumRows(in->A_C);
81          }
82       } else {
83         return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
84       }
85    }
86  /*****************************************************************  /*****************************************************************
87    
88     constructs AMG     constructs AMG
89        
90  ******************************************************************/  ******************************************************************/
91  Paso_Preconditioner_LocalAMG* Paso_Preconditioner_LocalAMG_alloc(Paso_SparseMatrix *A_p,dim_t level,Paso_Options* options) {  Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
92    
93    Paso_Preconditioner_LocalAMG* out=NULL;    Paso_Preconditioner_AMG* out=NULL;
94      Paso_SystemMatrix *A_C=NULL;
95    bool_t verbose=options->verbose;    bool_t verbose=options->verbose;
96    
97      const dim_t my_n=A_p->mainBlock->numRows;
98      const dim_t overlap_n=A_p->row_coupleBlock->numRows;
99        
100    Paso_SparseMatrix *Atemp=NULL, *A_C=NULL;    const dim_t n = my_n + overlap_n;
101    const dim_t n=A_p->numRows;  
102    const dim_t n_block=A_p->row_block_size;    const dim_t n_block=A_p->row_block_size;
103    index_t* split_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F=NULL, *S=NULL, *degree=NULL;    index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
104    dim_t n_F=0, n_C=0, i;    dim_t i, my_n_F, my_n_C, n_C, F_flag, *F_set=NULL, global_n_C=0, global_n_F=0, n_F;
105    double time0=0;    double time0=0;
106    const double theta = options->coarsening_threshold;    const double theta = options->coarsening_threshold;
107    const double tau = options->diagonal_dominance_threshold;    const double tau = options->diagonal_dominance_threshold;
108        const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
109        const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
110    
111    
112    /*    /*
113        is the input matrix A suitable for coarsening        is the input matrix A suitable for coarsening?
114                
115    */    */
116    if ( (A_p->pattern->len >= options->min_coarse_sparsity * n * n ) || (n <= options->min_coarse_matrix_size) || (level > options->level_max) ) {    if ( (sparsity >= options->min_coarse_sparsity) ||
117       if (verbose) printf("Paso_Solver: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",         (global_n <= options->min_coarse_matrix_size) ||
118      level,  options->level_max, A_p->pattern->len/(1.*n * n), options->min_coarse_sparsity, n, options->min_coarse_matrix_size  );           (level > options->level_max) ) {
119       return NULL;  
120    }          if (verbose) {
121              /*
122                  print stopping condition:
123                          - 'SPAR' = min_coarse_matrix_sparsity exceeded
124                          - 'SIZE' = min_coarse_matrix_size exceeded
125                          - 'LEVEL' = level_max exceeded
126              */
127              printf("Paso_Preconditioner: AMG: termination of coarsening by ");
128    
129              if (sparsity >= options->min_coarse_sparsity)
130                  printf("SPAR");
131    
132              if (global_n <= options->min_coarse_matrix_size)
133                  printf("SIZE");
134    
135              if (level > options->level_max)
136                  printf("LEVEL");
137    
138              printf("\n");
139    
140            printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
141               level,  options->level_max, sparsity, options->min_coarse_sparsity, global_n, options->min_coarse_matrix_size);  
142    
143           }
144    
145           return NULL;
146      }  else {
147       /* Start Coarsening : */       /* Start Coarsening : */
148        
149         /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
150         const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len  + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
151    
152       split_marker=TMPMEMALLOC(n,index_t);       dim_t* degree_S=TMPMEMALLOC(n, dim_t);
153         index_t *offset_S=TMPMEMALLOC(n, index_t);
154         index_t *S=TMPMEMALLOC(len_S, index_t);
155         dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
156         index_t *offset_ST=TMPMEMALLOC(n, index_t);
157         index_t *ST=TMPMEMALLOC(len_S, index_t);
158        
159        
160         F_marker=TMPMEMALLOC(n,index_t);
161       counter=TMPMEMALLOC(n,index_t);       counter=TMPMEMALLOC(n,index_t);
162       degree=TMPMEMALLOC(n, dim_t);  
163       S=TMPMEMALLOC(A_p->pattern->len, index_t);       if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
164       if ( !( Esys_checkPtr(split_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree) || Esys_checkPtr(S) ) ) {      || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
165       /*      /*
166               make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
167        */
168            Paso_SystemMatrix_copyColCoupleBlock(A_p);
169            Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
170    
171        /*
172            set splitting of unknows:            set splitting of unknows:
173                  
174           */           */
175       time0=Esys_timer();       time0=Esys_timer();
176       if (n_block>1) {       if (n_block>1) {
177             Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree, S, theta,tau);             Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
178       } else {       } else {
179             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree, S, theta,tau);             Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
180       }       }
181       Paso_Preconditioner_AMG_RungeStuebenSearch(n, A_p->pattern->ptr, degree, S, split_marker);       Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
182    /*   Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
183     */
184    
185         Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
186                                degree_S, offset_S, S, degree_ST, offset_ST, ST,
187                            A_p->col_coupler->connector,A_p->col_distribution);
188          
189    
190             /* in BoomerAMG if interpolation is used FF connectivity is required */
191    /*MPI:
192             if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
193                                 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);  
194    */
195    
196       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);       options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
       
197       if (Esys_noError() ) {       if (Esys_noError() ) {
198          #pragma omp parallel for private(i) schedule(static)          #pragma omp parallel for private(i) schedule(static)
199          for (i = 0; i < n; ++i) split_marker[i]= (split_marker[i] == PASO_AMG_IN_F);          for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] ==  PASO_AMG_IN_F);
200            
201          /*          /*
202             count number of unkowns to be eliminated:             count number of unkowns to be eliminated:
203          */          */
204          n_F=Paso_Util_cumsum_maskedTrue(n,counter, split_marker);          my_n_F=Paso_Util_cumsum_maskedTrue(my_n,counter, F_marker);
205          n_C=n-n_F;          n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
206          if (verbose) printf("Paso_Solver: AMG level %d: %d unknowns are flagged for elimination. %d left.\n",level,n_F,n-n_F);              /* collect my_n_F values on all processes, a direct solver should
207                        be used if any my_n_F value is 0 */
208          if ( n_F == 0 ) {  /*  is a nasty case. a direct solver should be used, return NULL */              F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
209            #ifdef ESYS_MPI
210                MPI_Allgather(&my_n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
211            #endif
212                global_n_F=0;
213                F_flag = 1;
214                for (i=0; i<A_p->mpi_info->size; i++) {
215                    global_n_F+=F_set[i];
216                    if (F_set[i] == 0) F_flag = 0;
217                }
218                TMPMEMFREE(F_set);
219    
220            my_n_C=my_n-my_n_F;
221                global_n_C=global_n-global_n_F;
222            if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,global_n_F,global_n_C);
223    
224            
225    /*          if ( n_F == 0 ) {  is a nasty case. a direct solver should be used, return NULL */
226                if (F_flag == 0) {
227             out = NULL;             out = NULL;
228          } else {          } else {
229             out=MEMALLOC(1,Paso_Preconditioner_LocalAMG);             out=MEMALLOC(1,Paso_Preconditioner_AMG);
230             if (! Esys_checkPtr(out)) {             if (! Esys_checkPtr(out)) {
231            out->level = level;            out->level = level;
           out->n = n;  
           out->n_F = n_F;  
           out->n_block = n_block;  
232            out->A_C = NULL;            out->A_C = NULL;
233            out->P = NULL;              out->P = NULL;  
234            out->R = NULL;                      out->R = NULL;          
# Line 137  Paso_Preconditioner_LocalAMG* Paso_Preco Line 246  Paso_Preconditioner_LocalAMG* Paso_Preco
246             Esys_checkPtr(rows_in_F);             Esys_checkPtr(rows_in_F);
247             if ( Esys_noError() ) {             if ( Esys_noError() ) {
248    
249            out->Smoother = Paso_Preconditioner_LocalSmoother_alloc(A_p, (options->smoother == PASO_JACOBI), verbose);            out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
250                
251            if ( n_F < n ) { /* if nothing is been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */            if (global_n_C != 0) {
252                /*  create mask of C nodes with value >-1, gives new id */
253                n_C=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
254                /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
255        
256              /* allocate helpers :*/              /* allocate helpers :*/
257              out->x_C=MEMALLOC(n_block*n_C,double);              out->x_C=MEMALLOC(n_block*my_n_C,double);
258              out->b_C=MEMALLOC(n_block*n_C,double);              out->b_C=MEMALLOC(n_block*my_n_C,double);
259              out->r=MEMALLOC(n_block*n,double);              out->r=MEMALLOC(n_block*my_n,double);
260                            
261              Esys_checkPtr(out->r);              Esys_checkPtr(out->r);
262              Esys_checkPtr(out->x_C);              Esys_checkPtr(out->x_C);
# Line 156  Paso_Preconditioner_LocalAMG* Paso_Preco Line 268  Paso_Preconditioner_LocalAMG* Paso_Preco
268                 {                 {
269                    #pragma omp for schedule(static)                    #pragma omp for schedule(static)
270                    for (i = 0; i < n; ++i) {                    for (i = 0; i < n; ++i) {
271                   if  (split_marker[i]) rows_in_F[counter[i]]=i;                   if  (F_marker[i]) rows_in_F[counter[i]]=i;
                   }  
                }  
                /*  create mask of C nodes with value >-1 gives new id */  
                i=Paso_Util_cumsum_maskedFalse(n,counter, split_marker);  
   
                #pragma omp parallel for private(i) schedule(static)  
                for (i = 0; i < n; ++i) {  
                   if  (split_marker[i]) {  
                  mask_C[i]=-1;  
                   } else {  
                  mask_C[i]=counter[i];;  
272                    }                    }
273                 }                 }
274                 /*                 /*
275                    get Restriction :                      get Prolongation :    
276                 */                                   */                  
277    
278                 time0=Esys_timer();                 time0=Esys_timer();
279                 out->P=Paso_Preconditioner_AMG_getDirectProlongation(A_p,degree,S,n_C,mask_C);  
280                 if (SHOW_TIMING) printf("timing: level %d: getProlongation: %e\n",level, Esys_timer()-time0);                 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
281    
282              }              }
283    
284              /*                    /*      
285                 construct Prolongation operator as transposed of restriction operator:                 construct Restriction operator as transposed of Prolongation operator:
286              */              */
287    
288              if ( Esys_noError()) {              if ( Esys_noError()) {
289                 time0=Esys_timer();                 time0=Esys_timer();
290                 out->R=Paso_SparseMatrix_getTranspose(out->P);  
291                 if (SHOW_TIMING) printf("timing: level %d: Paso_SparseMatrix_getTranspose: %e\n",level,Esys_timer()-time0);                 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
292    
293                   if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
294              }                    }      
295              /*              /*
296              construct coarse level matrix:                          construct coarse level matrix:
297              */                          */
298              if ( Esys_noError()) {                          if ( Esys_noError()) {
299                 time0=Esys_timer();                             time0=Esys_timer();
300                 Atemp=Paso_SparseMatrix_MatrixMatrix(A_p,out->P);  
301                 A_C=Paso_SparseMatrix_MatrixMatrix(out->R,Atemp);                             A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
302                 Paso_SparseMatrix_free(Atemp);  
303                 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);                                         if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
304              }                          }
305    
               
306              /*              /*
307                 constructe courser level:                 constructe courser level:
308                                
309              */              */
310              if ( Esys_noError()) {              if ( Esys_noError()) {
311                 out->AMG_C=Paso_Preconditioner_LocalAMG_alloc(A_C,level+1,options);                 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
312              }              }
313    
314              if ( Esys_noError()) {              if ( Esys_noError()) {
315                 if ( out->AMG_C == NULL ) {                if ( out->AMG_C == NULL ) {
316                      /* merge the system matrix into 1 rank when
317                     it's not suitable coarsening due to the
318                     total number of unknowns are too small */
319                      out->A_C=A_C;
320                    out->reordering = options->reordering;                    out->reordering = options->reordering;
321                    out->refinements = options->coarse_matrix_refinements;                                out->refinements = options->coarse_matrix_refinements;
322                    /* no coarse level matrix has been constructed. use direct solver */                    out->verbose = verbose;
323                    #ifdef MKL                    out->options_smoother = options->smoother;
324                      out->A_C=Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_C);                } else {
                     Paso_SparseMatrix_free(A_C);  
                     out->A_C->solver_package = PASO_MKL;  
                     if (verbose) printf("Paso_Solver: AMG: use MKL direct solver on the coarsest level (number of unknowns = %d).\n",n_C);  
                   #else  
                     #ifdef UMFPACK  
                        out->A_C=Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_C);  
                        Paso_SparseMatrix_free(A_C);  
                        out->A_C->solver_package = PASO_UMFPACK;  
                        if (verbose) printf("Paso_Solver: AMG: use UMFPACK direct solver on the coarsest level (number of unknowns = %d).\n",n_C);  
                     #else  
                        out->A_C=A_C;  
                        out->A_C->solver_p=Paso_Preconditioner_LocalSmoother_alloc(out->A_C, (options->smoother == PASO_JACOBI), verbose);  
                        out->A_C->solver_package = PASO_SMOOTHER;  
                        if (verbose) printf("Paso_Solver: AMG: use smoother on the coarsest level (number of unknowns = %d).\n",n_C);  
                     #endif  
                   #endif  
                } else {  
325                    /* finally we set some helpers for the solver step */                    /* finally we set some helpers for the solver step */
326                    out->A_C=A_C;                    out->A_C=A_C;
327                 }                }
328              }                      }        
329            }            }
330             }             }
# Line 238  Paso_Preconditioner_LocalAMG* Paso_Preco Line 332  Paso_Preconditioner_LocalAMG* Paso_Preco
332             TMPMEMFREE(rows_in_F);             TMPMEMFREE(rows_in_F);
333          }          }
334       }       }
335    
336    }    }
337    TMPMEMFREE(counter);    TMPMEMFREE(counter);
338    TMPMEMFREE(split_marker);    TMPMEMFREE(F_marker);
339    TMPMEMFREE(degree);    TMPMEMFREE(degree_S);
340      TMPMEMFREE(offset_S);
341    TMPMEMFREE(S);    TMPMEMFREE(S);
342      TMPMEMFREE(degree_ST);
343      TMPMEMFREE(offset_ST);
344      TMPMEMFREE(ST);
345      
346      }
347    
348    if (Esys_noError()) {    if (Esys_noError()) {
349       return out;       return out;
350    } else  {    } else  {
351       Paso_Preconditioner_LocalAMG_free(out);       Paso_Preconditioner_AMG_free(out);
352       return NULL;       return NULL;
353    }    }
354  }  }
355    
356    
357  void Paso_Preconditioner_LocalAMG_solve(Paso_SparseMatrix* A, Paso_Preconditioner_LocalAMG * amg, double * x, double * b) {  void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
358       const dim_t n = amg->n * amg->n_block;       const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
359       double time0=0;       double time0=0;
360       const dim_t post_sweeps=amg->post_sweeps;       const dim_t post_sweeps=amg->post_sweeps;
361       const dim_t pre_sweeps=amg->pre_sweeps;       const dim_t pre_sweeps=amg->pre_sweeps;
362    
363       /* presmoothing */       /* presmoothing */
364       time0=Esys_timer();       time0=Esys_timer();
365       Paso_Preconditioner_LocalSmoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);       Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
366    
367       time0=Esys_timer()-time0;       time0=Esys_timer()-time0;
368       if (SHOW_TIMING) printf("timing: level %d: Presmooting: %e\n",amg->level, time0);       if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
369       /* end of presmoothing */       /* end of presmoothing */
370            
371       if (amg->n_F < amg->n) { /* is there work on the coarse level? */       time0=Esys_timer();
372           time0=Esys_timer();  
373       Paso_Copy(n, amg->r, b);                            /*  r <- b */       Paso_Copy(n, amg->r, b);                            /*  r <- b */
374       Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
375       Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_DIAG(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */       Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C);  /* b_c = R*r  */
376           time0=Esys_timer()-time0;  
377       /* coarse level solve */       time0=Esys_timer()-time0;
378       if ( amg->AMG_C == NULL) {       /* coarse level solve */
379         if ( amg->AMG_C == NULL) {
380          time0=Esys_timer();          time0=Esys_timer();
381          /*  A_C is the coarsest level */          /*  A_C is the coarsest level */
382          switch (amg->A_C->solver_package) {          Paso_Preconditioner_AMG_mergeSolve(amg);
383             case (PASO_MKL):  
           Paso_MKL(amg->A_C, amg->x_C,amg->b_C, amg->reordering, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_UMFPACK):  
           Paso_UMFPACK(amg->A_C, amg->x_C,amg->b_C, amg->refinements, SHOW_TIMING);  
           break;  
            case (PASO_SMOOTHER):  
           Paso_Preconditioner_LocalSmoother_solve(amg->A_C, amg->Smoother,amg->x_C,amg->b_C,pre_sweeps, FALSE);  
           break;  
         }  
384          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);          if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
385       } else {       } else {
386          Paso_Preconditioner_LocalAMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */          Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C)     */
      }    
      time0=time0+Esys_timer();  
      Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_DIAG(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */      
       
          /*postsmoothing*/  
         
         /*solve Ax=b with initial guess x */  
         time0=Esys_timer();  
         Paso_Preconditioner_LocalSmoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);  
         time0=Esys_timer()-time0;  
         if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);  
         /*end of postsmoothing*/  
       
387       }       }
388       return;  
389        time0=time0+Esys_timer();
390        Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */    
391    
392        /*postsmoothing*/
393          
394        /*solve Ax=b with initial guess x */
395        time0=Esys_timer();
396        Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
397        time0=Esys_timer()-time0;
398        if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
399        return;
400  }  }
401    
402  /* theta = threshold for strong connections */  /* theta = threshold for strong connections */
# Line 315  void Paso_Preconditioner_LocalAMG_solve( Line 407  void Paso_Preconditioner_LocalAMG_solve(
407  in the sense that |A_{ij}| >= theta * max_k |A_{ik}|  in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
408  */  */
409    
410  void Paso_Preconditioner_AMG_setStrongConnections(Paso_SparseMatrix* A,  void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
411                        dim_t *degree, index_t *S,                            dim_t *degree_S, index_t *offset_S, index_t *S,
412                        const double theta, const double tau)                        const double theta, const double tau)
413  {  {
    const dim_t n=A->numRows;  
    index_t iptr, i,j;  
    dim_t kdeg;  
    double max_offdiagonal, threshold, sum_row, main_row, fnorm;  
414    
415       const dim_t my_n=A->mainBlock->numRows;
416       const dim_t overlap_n=A->row_coupleBlock->numRows;
417      
418       index_t iptr, i;
419       double *threshold_p=NULL;
420    
421        #pragma omp parallel for private(i,iptr,max_offdiagonal, threshold,j, kdeg, sum_row, main_row, fnorm) schedule(static)     threshold_p = TMPMEMALLOC(2*my_n, double);
422        for (i=0;i<n;++i) {    
423                 #pragma omp parallel for private(i,iptr) schedule(static)
424       max_offdiagonal = 0.;     for (i=0;i<my_n;++i) {        
425       sum_row=0;      
426       main_row=0;       register double max_offdiagonal = 0.;
427         register double sum_row=0;
428         register double main_row=0;
429         register dim_t kdeg=0;
430             register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
431    
432            
433         /* collect information for row i: */
434       #pragma ivdep       #pragma ivdep
435       for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; ++iptr) {       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
436          j=A->pattern->index[iptr];          register index_t j=A->mainBlock->pattern->index[iptr];
437          fnorm=ABS(A->val[iptr]);          register double fnorm=ABS(A->mainBlock->val[iptr]);
           
438          if( j != i) {          if( j != i) {
439             max_offdiagonal = MAX(max_offdiagonal,fnorm);             max_offdiagonal = MAX(max_offdiagonal,fnorm);
440             sum_row+=fnorm;             sum_row+=fnorm;
441          } else {          } else {
442             main_row=fnorm;             main_row=fnorm;
443          }          }
444    
445       }       }
446       threshold = theta*max_offdiagonal;  
447       kdeg=0;       #pragma ivdep
448       if (tau*main_row < sum_row) { /* no diagonal domainance */       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
449          #pragma ivdep          register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
450          for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; ++iptr) {  
451             j=A->pattern->index[iptr];          max_offdiagonal = MAX(max_offdiagonal,fnorm);
452             if(ABS(A->val[iptr])>threshold && i!=j) {          sum_row+=fnorm;
           S[A->pattern->ptr[i]+kdeg] = j;  
           kdeg++;  
            }  
         }  
453       }       }
454       degree[i]=kdeg;  
455             /* inspect row i: */
456             {
457            const double threshold = theta*max_offdiagonal;
458                threshold_p[2*i+1]=threshold;
459            if (tau*main_row < sum_row) { /* no diagonal dominance */
460                   threshold_p[2*i]=1;
461               #pragma ivdep
462               for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
463                  register index_t j=A->mainBlock->pattern->index[iptr];
464                  if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
465                 S[koffset+kdeg] = j;
466                 kdeg++;
467                  }
468               }
469               #pragma ivdep
470               for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
471                  register index_t j=A->col_coupleBlock->pattern->index[iptr];
472                  if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
473                 S[koffset+kdeg] = j + my_n;
474                 kdeg++;
475                  }
476               }
477                } else {
478                   threshold_p[2*i]=-1;
479                }
480             }
481             offset_S[i]=koffset;
482         degree_S[i]=kdeg;
483        }        }
484    
485          /* now we need to distribute the threshold and the diagonal dominance indicator */
486          if (A->mpi_info->size > 1) {
487    
488              const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
489                     -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
490          
491              double *remote_threshold=NULL;
492              
493          Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector  ,2);
494              Paso_Coupler_startCollect(threshold_coupler,threshold_p);
495              Paso_Coupler_finishCollect(threshold_coupler);
496              remote_threshold=threshold_coupler->recv_buffer;
497    
498              #pragma omp parallel for private(i,iptr) schedule(static)
499              for (i=0; i<overlap_n; i++) {
500              const double threshold = remote_threshold[2*i+1];
501              register dim_t kdeg=0;
502                  register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
503                  if (remote_threshold[2*i]>0) {
504            #pragma ivdep
505            for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
506                  register index_t j=A->row_coupleBlock->pattern->index[iptr];
507                  if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
508                 S[koffset+kdeg] = j ;
509                 kdeg++;
510                  }
511            }
512    
513            #pragma ivdep
514            for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
515              register index_t j=A->remote_coupleBlock->pattern->index[iptr];
516              if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
517                  S[koffset+kdeg] = j + my_n;
518                  kdeg++;
519              }
520            }
521                  }
522                  offset_S[i+my_n]=koffset;
523              degree_S[i+my_n]=kdeg;
524              }
525    
526              Paso_Coupler_free(threshold_coupler);
527         }
528         TMPMEMFREE(threshold_p);
529  }  }
530    
531  /* theta = threshold for strong connections */  /* theta = threshold for strong connections */
# Line 366  void Paso_Preconditioner_AMG_setStrongCo Line 534  void Paso_Preconditioner_AMG_setStrongCo
534    
535  in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F  in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
536  */  */
537  void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SparseMatrix* A,  void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
538                              dim_t *degree, index_t *S,                              dim_t *degree_S, index_t *offset_S, index_t *S,
539                              const double theta, const double tau)                              const double theta, const double tau)
540    
541  {  {
542     const dim_t n_block=A->row_block_size;     const dim_t block_size=A->block_size;
543     const dim_t n=A->numRows;     const dim_t my_n=A->mainBlock->numRows;
544     index_t iptr, i,j, bi;     const dim_t overlap_n=A->row_coupleBlock->numRows;
545     dim_t kdeg, max_deg;    
546     register double max_offdiagonal, threshold, fnorm, sum_row, main_row;     index_t iptr, i, bi;
547     double *rtmp;     double *threshold_p=NULL;
548      
549        
550       threshold_p = TMPMEMALLOC(2*my_n, double);
551    
552       #pragma omp parallel private(i,iptr,bi)
553       {
554        
555        #pragma omp parallel private(i,iptr,max_offdiagonal, kdeg, threshold,j, max_deg, fnorm, sum_row, main_row, rtmp)        dim_t max_deg=0;
556        {        double *rtmp=NULL;
557       max_deg=0;  
558       #pragma omp for schedule(static)        #pragma omp for schedule(static)
559       for (i=0;i<n;++i) max_deg=MAX(max_deg, A->pattern->ptr[i+1]-A->pattern->ptr[i]);        for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
560                         +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
561                
562       rtmp=TMPMEMALLOC(max_deg, double);        rtmp=TMPMEMALLOC(max_deg, double);
563                
564       #pragma omp for schedule(static)        #pragma omp for schedule(static)
565       for (i=0;i<n;++i) {        for (i=0;i<my_n;++i) {        
566         register double max_offdiagonal = 0.;
567         register double sum_row=0;
568         register double main_row=0;
569         register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
570         register dim_t kdeg=0;
571         register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
572            
573          max_offdiagonal = 0.;       /* collect information for row i: */
574          sum_row=0;       for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
575          main_row=0;          register index_t j=A->mainBlock->pattern->index[iptr];
576          for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; ++iptr) {          register double fnorm=0;
577             j=A->pattern->index[iptr];          #pragma ivdep
578             fnorm=0;          for(bi=0;bi<block_size;++bi) {
579             #pragma ivdep              register double  rtmp2= A->mainBlock->val[iptr*block_size+bi];
580             for(bi=0;bi<n_block*n_block;++bi) fnorm+=A->val[iptr*n_block*n_block+bi]*A->val[iptr*n_block*n_block+bi];             fnorm+=rtmp2*rtmp2;
            fnorm=sqrt(fnorm);  
            rtmp[iptr-A->pattern->ptr[i]]=fnorm;  
            if( j != i) {  
           max_offdiagonal = MAX(max_offdiagonal,fnorm);  
           sum_row+=fnorm;  
            } else {  
           main_row=fnorm;  
            }  
581          }          }
582          threshold = theta*max_offdiagonal;          fnorm=sqrt(fnorm);
583            rtmp[iptr+rtmp_offset]=fnorm;
584            
585            if( j != i) {
586               max_offdiagonal = MAX(max_offdiagonal,fnorm);
587               sum_row+=fnorm;
588            } else {
589               main_row=fnorm;
590            }
591        
592         }
593                
594          kdeg=0;           rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
595          if (tau*main_row < sum_row) { /* no diagonal domainance */       for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
596            register double fnorm=0;
597            #pragma ivdep
598            for(bi=0;bi<block_size;++bi) {
599               register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
600               fnorm+=rtmp2*rtmp2;
601            }
602            fnorm=sqrt(fnorm);
603            
604            rtmp[iptr+rtmp_offset]=fnorm;
605            max_offdiagonal = MAX(max_offdiagonal,fnorm);
606            sum_row+=fnorm;
607         }
608        
609          
610         /* inspect row i: */
611         {
612            const double threshold = theta*max_offdiagonal;
613            rtmp_offset=-A->mainBlock->pattern->ptr[i];
614            
615            threshold_p[2*i+1]=threshold;
616            if (tau*main_row < sum_row) { /* no diagonal dominance */
617               threshold_p[2*i]=1;
618               #pragma ivdep
619               for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
620              register index_t j=A->mainBlock->pattern->index[iptr];
621              if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
622                 S[koffset+kdeg] = j;
623                 kdeg++;
624              }
625               }
626               rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
627             #pragma ivdep             #pragma ivdep
628             for (iptr=A->pattern->ptr[i];iptr<A->pattern->ptr[i+1]; ++iptr) {             for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
629            j=A->pattern->index[iptr];            register index_t j=A->col_coupleBlock->pattern->index[iptr];
630            if(rtmp[iptr-A->pattern->ptr[i]] > threshold && i!=j) {            if( rtmp[iptr+rtmp_offset] >threshold) {
631               S[A->pattern->ptr[i]+kdeg] = j;               S[koffset+kdeg] = j + my_n;
632               kdeg++;               kdeg++;
633            }            }
634             }             }
635            } else {
636               threshold_p[2*i]=-1;
637            }
638         }
639         offset_S[i]=koffset;
640         degree_S[i]=kdeg;
641          }
642          TMPMEMFREE(rtmp);
643       }
644       /* now we need to distribute the threshold and the diagonal dominance indicator */
645       if (A->mpi_info->size > 1) {
646          
647          const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
648                                 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
649          
650          double *remote_threshold=NULL;
651          
652          Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector  ,2);
653          Paso_Coupler_startCollect(threshold_coupler,threshold_p);
654          Paso_Coupler_finishCollect(threshold_coupler);
655          remote_threshold=threshold_coupler->recv_buffer;
656          
657          #pragma omp parallel for private(i,iptr) schedule(static)
658          for (i=0; i<overlap_n; i++) {
659        
660         const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
661         register dim_t kdeg=0;
662         register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
663         if (remote_threshold[2*i]>0) {
664            #pragma ivdep
665            for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
666               register index_t j=A->row_coupleBlock->pattern->index[iptr];
667               register double fnorm2=0;
668               #pragma ivdepremote_threshold[2*i]
669               for(bi=0;bi<block_size;++bi) {
670              register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
671              fnorm2+=rtmp2*rtmp2;
672               }
673              
674               if(fnorm2 > threshold2 ) {
675              S[koffset+kdeg] = j ;
676              kdeg++;
677               }
678          }          }
         degree[i]=kdeg;  
      }        
      TMPMEMFREE(rtmp);  
       } /* end of parallel region */  
   
 }    
679    
680  /* the runge stueben coarsening algorithm: */          #pragma ivdep
681  void Paso_Preconditioner_AMG_RungeStuebenSearch(const dim_t n, const index_t* offset,              for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
682                           const dim_t* degree, const index_t* S,                 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
683                           index_t*split_marker)                 register double fnorm2=0;
684                   #pragma ivdepremote_threshold[2*i]
685                   for(bi=0;bi<block_size;++bi) {
686                      register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
687                      fnorm2+=rtmp2*rtmp2;
688                   }
689                   if(fnorm2 > threshold2 && i != j) {
690                      S[koffset+kdeg] = j + my_n;
691                      kdeg++;
692                   }
693                }
694            
695         }
696         offset_S[i+my_n]=koffset;
697         degree_S[i+my_n]=kdeg;
698          }
699          Paso_Coupler_free(threshold_coupler);
700       }
701       TMPMEMFREE(threshold_p);
702    }
703    
704    void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
705                                const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
706  {  {
707     const bool_t usePanel=FALSE;     index_t i, j;
708         dim_t p;
709     index_t *lambda=NULL, *ST=NULL, *notInPanel=NULL, *panel=NULL, lambda_max, lambda_k;     dim_t len=0;
710     dim_t i,k, p, q, *degreeT=NULL, len_panel, len_panel_new;     #pragma omp parallel for private(i) schedule(static)
711     register index_t j, itmp;     for (i=0; i<nT ;++i) {
712            degree_ST[i]=0;
    if (n<=0) return; /* make sure that the return of Paso_Util_arg_max is not pointing to nirvana */  
     
    lambda=TMPMEMALLOC(n, index_t); Esys_checkPtr(lambda);  
    degreeT=TMPMEMALLOC(n, dim_t); Esys_checkPtr(degreeT);  
    ST=TMPMEMALLOC(offset[n], index_t);  Esys_checkPtr(ST);  
    if (usePanel) {  
       notInPanel=TMPMEMALLOC(n, bool_t); Esys_checkPtr(notInPanel);  
       panel=TMPMEMALLOC(n, index_t); Esys_checkPtr(panel);  
713     }     }
714       for (i=0; i<n ;++i) {
715          for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
716       }
717       for (i=0; i<nT ;++i) {
718          offset_ST[i]=len;
719          len+=degree_ST[i];
720          degree_ST[i]=0;
721       }
722       for (i=0; i<n ;++i) {
723          for (p=0; p<degree_S[i]; ++p) {
724           j=S[offset_S[i]+p];
725           ST[offset_ST[j]+degree_ST[j]]=i;
726           degree_ST[j]++;
727          }
728       }
729    }
730    
731    int compareindex(const void *a, const void *b)
732    {
733      return (*(int *)a - *(int *)b);
734    }
735    
736    void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
737                            const dim_t* degree_S, const index_t* offset_S, const index_t* S,
738                            const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
739                            Paso_Connector* col_connector, Paso_Distribution* col_dist)
740    {
741       dim_t i, numUndefined,   iter=0;
742      index_t iptr, jptr, kptr;
743      double *random=NULL, *w=NULL, *Status=NULL;
744      index_t * ST_flag=NULL;
745    
746      Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector  ,1);
747        
748        w=TMPMEMALLOC(n, double);
749        Status=TMPMEMALLOC(n, double);
750     if (Esys_noError() ) {    random = Paso_Distribution_createRandomVector(col_dist,1);
751        /* initialize  split_marker and split_marker :*/    ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
752        /* those unknows which are not influenced go into F, the rest is available for F or C */  
753        #pragma omp parallel for private(i) schedule(static)    #pragma omp parallel for private(i)
754        for (i=0;i<n;++i) {    for (i=0; i< my_n; ++i) {
755       degreeT[i]=0;        w[i]=degree_ST[i]+random[i];
756       if (degree[i]>0) {        if (degree_ST[i] < 1) {
757          lambda[i]=0;         Status[i]=-100; /* F point  */
758          split_marker[i]=PASO_AMG_UNDECIDED;        } else {
759       } else {         Status[i]=1; /* status undefined */
         split_marker[i]=PASO_AMG_IN_F;  
         lambda[i]=-1;  
      }  
760        }        }
761        /* create transpose :*/    }
762        for (i=0;i<n;++i) {  
763          for (p=0; p<degree[i]; ++p) {    #pragma omp parallel for private(i, iptr)
764             j=S[offset[i]+p];    for (i=0; i< n; ++i) {
765             ST[offset[j]+degreeT[j]]=i;        for( iptr =0 ; iptr < degree_ST[i]; ++iptr)  {
766             degreeT[j]++;       ST_flag[offset_ST[i]+iptr]=1;
         }  
767        }        }
768        /* lambda[i] = |undecided k in ST[i]| + 2 * |F-unknown in ST[i]| */    }  
769        #pragma omp parallel for private(i, j, itmp) schedule(static)  
770        for (i=0;i<n;++i) {    
771       if (split_marker[i]==PASO_AMG_UNDECIDED) {    numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
772          itmp=lambda[i];    /* printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined);  */
773          for (p=0; p<degreeT[i]; ++p) {    iter=0;
774             j=ST[offset[i]+p];    while (numUndefined > 0) {
775             if (split_marker[j]==PASO_AMG_UNDECIDED) {       Paso_Coupler_fillOverlap(n, w, w_coupler);
776            itmp++;  
777             } else {  /* at this point there are no C points */        /* calculate the maximum value of neighbours following active strong connections:
778            itmp+=2;          w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active  */      
779          #pragma omp parallel for private(i, iptr)
780          for (i=0; i<my_n; ++i) {
781         if (Status[i]>0) { /* status is still undefined */
782    
783            register bool_t inD=TRUE;
784            const double wi=w[i];
785    
786            for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
787               const index_t k=S[offset_S[i]+iptr];
788               const index_t* start_p = &ST[offset_ST[k]];
789               const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
790    
791               if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
792              if (wi <= w[k] ) {
793                 inD=FALSE;
794                 break;
795              }
796             }             }
797            
798            }
799            
800            if (inD) {
801              for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
802                 const index_t k=ST[offset_ST[i]+iptr];
803                 if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
804                           if (wi <= w[k] ) {
805                   inD=FALSE;
806                   break;
807                }
808                 }
809              }
810            }    
811            if (inD) {
812               Status[i]=0.; /* is in D */
813          }          }
         lambda[i]=itmp;  
814       }       }
815        
816        }        }
       if (usePanel) {  
      #pragma omp parallel for private(i) schedule(static)  
      for (i=0;i<n;++i) notInPanel[i]=TRUE;  
       }  
       /* start search :*/  
       i=Paso_Util_arg_max(n,lambda);  
       while (lambda[i]>-1) { /* is there any undecided unknown? */  
   
      if (usePanel) {  
         len_panel=0;  
         do {  
            /* the unknown i is moved to C */  
            split_marker[i]=PASO_AMG_IN_C;  
            lambda[i]=-1;  /* lambda from unavailable unknowns is set to -1 */  
             
            /* all undecided unknown strongly coupled to i are moved to F */  
            for (p=0; p<degreeT[i]; ++p) {  
           j=ST[offset[i]+p];  
             
           if (split_marker[j]==PASO_AMG_UNDECIDED) {  
               
              split_marker[j]=PASO_AMG_IN_F;  
              lambda[j]=-1;  
               
              for (q=0; q<degreeT[j]; ++q) {  
             k=ST[offset[j]+q];  
             if (split_marker[k]==PASO_AMG_UNDECIDED) {  
                lambda[k]++;  
                if (notInPanel[k]) {  
                   notInPanel[k]=FALSE;  
                   panel[len_panel]=k;  
                   len_panel++;  
                }  
817    
818              }    /* the unknown i is moved to C */        Paso_Coupler_fillOverlap(n, Status, w_coupler);
819              split_marker[i]=PASO_AMG_IN_C;  
820              lambda[i]=-1;  /* lambda from unavailable unknowns is set to -1 */  
821               }       /*   remove connection to D points :
822                    
823               for each i in D:
824              for each j in S_i:
825                 w[j]--
826                 ST_tag[j,i]=-1
827              for each j in ST[i]:
828                 ST_tag[i,j]=-1
829                 for each k in ST[j]:
830                if k in ST[i]:
831                   w[j]--;
832                ST_tag[j,k]=-1
833                
834         */
835         /* w is updated  for local rows only */
836         {
837            #pragma omp parallel for private(i, jptr)
838            for (i=0; i< my_n; ++i) {
839    
840               for (jptr=0; jptr<degree_ST[i]; ++jptr) {
841              const index_t j=ST[offset_ST[i]+jptr];
842              if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
843                 w[i]--;
844                 ST_flag[offset_ST[i]+jptr]=-1;
845            }            }
846             }             }
847             for (p=0; p<degree[i]; ++p) {            
848            j=S[offset[i]+p];          }
849            if (split_marker[j]==PASO_AMG_UNDECIDED) {          #pragma omp parallel for private(i, jptr)
850               lambda[j]--;          for (i=my_n; i< n; ++i) {
851               if (notInPanel[j]) {             for (jptr=0; jptr<degree_ST[i]; ++jptr) {
852              notInPanel[j]=FALSE;            const index_t j = ST[offset_ST[i]+jptr];
853              panel[len_panel]=j;            if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
             len_panel++;  
              }  
           }  
854             }             }
855            }
856    
857            
858            for (i=0; i< n; ++i) {
859               if ( Status[i] == 0. ) {
860    
861                         const index_t* start_p = &ST[offset_ST[i]];
862    
863             /* consolidate panel */               for (jptr=0; jptr<degree_ST[i]; ++jptr) {
864             /* remove lambda[q]=-1 */              const index_t j=ST[offset_ST[i]+jptr];
865             lambda_max=-1;              ST_flag[offset_ST[i]+jptr]=-1;
866             i=-1;              for (kptr=0; kptr<degree_ST[j]; ++kptr) {
867             len_panel_new=0;                 const index_t k=ST[offset_ST[j]+kptr];
868             for (q=0; q<len_panel; q++) {                 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
869               k=panel[q];                    if (ST_flag[offset_ST[j]+kptr] >0) {
870               lambda_k=lambda[k];                   if (j< my_n ) {
871               if (split_marker[k]==PASO_AMG_UNDECIDED) {                      w[j]--;
872              panel[len_panel_new]=k;                   }
873              len_panel_new++;                   ST_flag[offset_ST[j]+kptr]=-1;
874                      }
875              if (lambda_max == lambda_k) {                 }
                if (k<i) i=k;  
             } else if (lambda_max < lambda_k) {  
                lambda_max =lambda_k;  
                i=k;  
876              }              }
877               }               }
            }  
            len_panel=len_panel_new;  
         } while (len_panel>0);      
      } else {  
         /* the unknown i is moved to C */  
         split_marker[i]=PASO_AMG_IN_C;  
         lambda[i]=-1;  /* lambda from unavailable unknowns is set to -1 */  
           
         /* all undecided unknown strongly coupled to i are moved to F */  
         for (p=0; p<degreeT[i]; ++p) {  
            j=ST[offset[i]+p];  
            if (split_marker[j]==PASO_AMG_UNDECIDED) {  
           
           split_marker[j]=PASO_AMG_IN_F;  
           lambda[j]=-1;  
           
           for (q=0; q<degreeT[j]; ++q) {  
              k=ST[offset[j]+q];  
              if (split_marker[k]==PASO_AMG_UNDECIDED) lambda[k]++;  
878            }            }
   
            }  
879          }          }
880          for (p=0; p<degree[i]; ++p) {       }
881             j=S[offset[i]+p];  
882             if(split_marker[j]==PASO_AMG_UNDECIDED) lambda[j]--;       /* adjust status */
883         #pragma omp parallel for private(i)
884         for (i=0; i< my_n; ++i) {
885            if ( Status[i] == 0. ) {
886               Status[i] = -10;   /* this is now a C point */
887            } else if (Status[i] == 1. && w[i]<1.) {
888               Status[i] = -100;   /* this is now a F point */  
889          }          }
           
890       }       }
891       i=Paso_Util_arg_max(n,lambda);      
892         i = numUndefined;
893         numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
894         if (numUndefined == i) {
895           Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
896           return;
897         }
898    
899         iter++;
900         /* printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined); */
901    
902      } /* end of while loop */
903    
904      /* map to output :*/
905      Paso_Coupler_fillOverlap(n, Status, w_coupler);
906      #pragma omp parallel for private(i)
907      for (i=0; i< n; ++i) {
908         if (Status[i] > -50.) {
909            split_marker[i]=PASO_AMG_IN_C;
910         } else {
911            split_marker[i]=PASO_AMG_IN_F;
912         }
913      }
914      /* clean up : */
915      Paso_Coupler_free(w_coupler);
916      TMPMEMFREE(random);
917      TMPMEMFREE(w);
918      TMPMEMFREE(Status);
919      TMPMEMFREE(ST_flag);
920      
921      return;
922    }
923    
924    /* Merge the system matrix which is distributed on ranks into a complete
925       matrix on rank 0, then solve this matrix on rank 0 only */
926    Paso_SparseMatrix* Paso_Preconditioner_AMG_mergeSystemMatrix(Paso_SystemMatrix* A) {
927      index_t i, iptr, j, n, remote_n, global_n, len, offset, tag;
928      index_t row_block_size, col_block_size, block_size;
929      index_t size=A->mpi_info->size;
930      index_t rank=A->mpi_info->rank;
931      index_t *ptr=NULL, *idx=NULL, *ptr_global=NULL, *idx_global=NULL;
932      index_t *temp_n=NULL, *temp_len=NULL;
933      double  *val=NULL;
934      Paso_Pattern *pattern=NULL;
935      Paso_SparseMatrix *out=NULL;
936      #ifdef ESYS_MPI
937        MPI_Request* mpi_requests=NULL;
938        MPI_Status* mpi_stati=NULL;
939      #else
940        int *mpi_requests=NULL, *mpi_stati=NULL;
941      #endif
942    
943      if (size == 1) {
944        n = A->mainBlock->numRows;
945        ptr = TMPMEMALLOC(n, index_t);
946        #pragma omp parallel for private(i)
947        for (i=0; i<n; i++) ptr[i] = i;
948        out = Paso_SparseMatrix_getSubmatrix(A->mainBlock, n, n, ptr, ptr);
949        TMPMEMFREE(ptr);
950        return out;
951      }
952    
953      n = A->mainBlock->numRows;
954      block_size = A->block_size;
955    
956      /* Merge MainBlock and CoupleBlock to get a complete column entries
957         for each row allocated to current rank. Output (ptr, idx, val)
958         contains all info needed from current rank to merge a system
959         matrix  */
960      Paso_SystemMatrix_mergeMainAndCouple(A, &ptr, &idx, &val);
961    
962      #ifdef ESYS_MPI
963        mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
964        mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
965      #else
966        mpi_requests=TMPMEMALLOC(size*2,int);
967        mpi_stati=TMPMEMALLOC(size*2,int);
968      #endif
969    
970      /* Now, pass all info to rank 0 and merge them into one sparse
971         matrix */
972      if (rank == 0) {
973        /* First, copy local ptr values into ptr_global */
974        global_n=Paso_SystemMatrix_getGlobalNumRows(A);
975        ptr_global = MEMALLOC(global_n+1, index_t);
976        memcpy(ptr_global, ptr, (n+1) * sizeof(index_t));
977        iptr = n+1;
978        MEMFREE(ptr);
979        temp_n = TMPMEMALLOC(size, index_t);
980        temp_len = TMPMEMALLOC(size, index_t);
981        temp_n[0] = iptr;
982        
983        /* Second, receive ptr values from other ranks */
984        for (i=1; i<size; i++) {
985          remote_n = A->row_distribution->first_component[i+1] -
986             A->row_distribution->first_component[i];
987          #ifdef ESYS_MPI
988          MPI_Irecv(&(ptr_global[iptr]), remote_n, MPI_INT, i,
989                A->mpi_info->msg_tag_counter+i,
990                A->mpi_info->comm,
991                &mpi_requests[i]);
992          #endif
993          temp_n[i] = remote_n;
994          iptr += remote_n;
995        }
996        #ifdef ESYS_MPI
997        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
998        #endif
999        A->mpi_info->msg_tag_counter += size;
1000    
1001        /* Then, prepare to receive idx and val from other ranks */
1002        len = 0;
1003        offset = -1;
1004        for (i=0; i<size; i++) {
1005          if (temp_n[i] > 0) {
1006        offset += temp_n[i];
1007        len += ptr_global[offset];
1008        temp_len[i] = ptr_global[offset];
1009          }else
1010        temp_len[i] = 0;
1011        }
1012    
1013        idx_global = MEMALLOC(len, index_t);
1014        iptr = temp_len[0];
1015        offset = n+1;
1016        for (i=1; i<size; i++) {
1017          len = temp_len[i];
1018          #ifdef ESYS_MPI
1019          MPI_Irecv(&(idx_global[iptr]), len, MPI_INT, i,
1020                A->mpi_info->msg_tag_counter+i,
1021                A->mpi_info->comm,
1022                &mpi_requests[i]);
1023          #endif
1024          remote_n = temp_n[i];
1025          for (j=0; j<remote_n; j++) {
1026        ptr_global[j+offset] = ptr_global[j+offset] + iptr;
1027        }        }
1028     }        offset += remote_n;
1029     TMPMEMFREE(lambda);        iptr += len;
1030     TMPMEMFREE(ST);      }
1031     TMPMEMFREE(degreeT);      memcpy(idx_global, idx, temp_len[0] * sizeof(index_t));
1032     TMPMEMFREE(panel);      MEMFREE(idx);
1033     TMPMEMFREE(notInPanel);      row_block_size = A->mainBlock->row_block_size;
1034        col_block_size = A->mainBlock->col_block_size;
1035        #ifdef ESYS_MPI
1036        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1037        #endif
1038        A->mpi_info->msg_tag_counter += size;
1039        TMPMEMFREE(temp_n);
1040    
1041        /* Then generate the sparse matrix */
1042        pattern = Paso_Pattern_alloc(A->mainBlock->pattern->type, global_n,
1043                global_n, ptr_global, idx_global);
1044        out = Paso_SparseMatrix_alloc(A->mainBlock->type, pattern,
1045                row_block_size, col_block_size, FALSE);
1046        Paso_Pattern_free(pattern);
1047    
1048        /* Finally, receive and copy the value */
1049        iptr = temp_len[0] * block_size;
1050        for (i=1; i<size; i++) {
1051          len = temp_len[i];
1052          #ifdef ESYS_MPI
1053          MPI_Irecv(&(out->val[iptr]), len * block_size, MPI_DOUBLE, i,
1054                            A->mpi_info->msg_tag_counter+i,
1055                            A->mpi_info->comm,
1056                            &mpi_requests[i]);
1057          #endif
1058          iptr += (len * block_size);
1059        }
1060        memcpy(out->val, val, temp_len[0] * sizeof(double) * block_size);
1061        MEMFREE(val);
1062        #ifdef ESYS_MPI
1063        MPI_Waitall(size-1, &(mpi_requests[1]), mpi_stati);
1064        #endif
1065        A->mpi_info->msg_tag_counter += size;
1066        TMPMEMFREE(temp_len);
1067      } else { /* it's not rank 0 */
1068    
1069        /* First, send out the local ptr */
1070        tag = A->mpi_info->msg_tag_counter+rank;
1071        #ifdef ESYS_MPI
1072        MPI_Issend(&(ptr[1]), n, MPI_INT, 0, tag, A->mpi_info->comm,
1073                &mpi_requests[0]);
1074        #endif
1075    
1076        /* Next, send out the local idx */
1077        len = ptr[n];
1078        tag += size;
1079        #ifdef ESYS_MPI
1080        MPI_Issend(idx, len, MPI_INT, 0, tag, A->mpi_info->comm,
1081                &mpi_requests[1]);
1082        #endif
1083    
1084        /* At last, send out the local val */
1085        len *= block_size;
1086        tag += size;
1087        #ifdef ESYS_MPI
1088        MPI_Issend(val, len, MPI_DOUBLE, 0, tag, A->mpi_info->comm,
1089                &mpi_requests[2]);
1090    
1091        MPI_Waitall(3, mpi_requests, mpi_stati);
1092        #endif
1093        A->mpi_info->msg_tag_counter = tag + size - rank;
1094        MEMFREE(ptr);
1095        MEMFREE(idx);
1096        MEMFREE(val);
1097    
1098        out = NULL;
1099      }
1100    
1101      TMPMEMFREE(mpi_requests);
1102      TMPMEMFREE(mpi_stati);
1103      return out;
1104    }
1105    
1106    
1107    void Paso_Preconditioner_AMG_mergeSolve(Paso_Preconditioner_AMG * amg) {
1108      Paso_SystemMatrix *A = amg->A_C;
1109      Paso_SparseMatrix *A_D, *A_temp;
1110      double* x=NULL;
1111      double* b=NULL;
1112      const index_t rank = A->mpi_info->rank;
1113      const index_t size = A->mpi_info->size;
1114      const n_block = A->mainBlock->row_block_size;
1115    
1116      index_t i, n, p;
1117      index_t *counts, *offset, *dist;
1118      #ifdef ESYS_MPI
1119      index_t count;
1120      #endif
1121      A_D = Paso_Preconditioner_AMG_mergeSystemMatrix(A);
1122    
1123      /* First, gather x and b into rank 0 */
1124      dist = A->pattern->input_distribution->first_component;
1125      n = Paso_SystemMatrix_getGlobalNumRows(A);
1126      b = TMPMEMALLOC(n*n_block, double);
1127      x = TMPMEMALLOC(n*n_block, double);
1128      counts = TMPMEMALLOC(size, index_t);
1129      offset = TMPMEMALLOC(size, index_t);
1130    
1131      #pragma omp parallel for private(i,p)
1132      for (i=0; i<size; i++) {
1133        p = dist[i];
1134        counts[i] = (dist[i+1] - p)*n_block;
1135        offset[i] = p*n_block;
1136      }
1137      #ifdef ESYS_MPI
1138      count = counts[rank];
1139      MPI_Gatherv(amg->b_C, count, MPI_DOUBLE, b, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1140      MPI_Gatherv(amg->x_C, count, MPI_DOUBLE, x, counts, offset, MPI_DOUBLE, 0, A->mpi_info->comm);
1141      #endif
1142    
1143      if (rank == 0) {
1144        /* solve locally */
1145        #ifdef MKL
1146          A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_OFFSET1, A_D);
1147          A_temp->solver_package = PASO_MKL;
1148          Paso_SparseMatrix_free(A_D);
1149          Paso_MKL(A_temp, x, b, amg->reordering, amg->refinements, SHOW_TIMING);
1150          Paso_SparseMatrix_free(A_temp);
1151        #else
1152          #ifdef UMFPACK
1153        A_temp = Paso_SparseMatrix_unroll(MATRIX_FORMAT_BLK1 + MATRIX_FORMAT_CSC, A_D);
1154        A_temp->solver_package = PASO_UMFPACK;
1155        Paso_SparseMatrix_free(A_D);
1156        Paso_UMFPACK(A_temp, x, b, amg->refinements, SHOW_TIMING);
1157        Paso_SparseMatrix_free(A_temp);
1158          #else
1159        A_D->solver_p = Paso_Preconditioner_LocalSmoother_alloc(A_D, (amg->options_smoother == PASO_JACOBI), amg->verbose);
1160        A_D->solver_package = PASO_SMOOTHER;
1161        Paso_Preconditioner_LocalSmoother_solve(A_D, A_D->solver_p, x, b, amg->pre_sweeps+amg->post_sweeps, FALSE);
1162        Paso_SparseMatrix_free(A_D);
1163          #endif
1164        #endif
1165      }
1166    
1167      #ifdef ESYS_MPI
1168      /* now we need to distribute the solution to all ranks */
1169      MPI_Scatterv(x, counts, offset, MPI_DOUBLE, amg->x_C, count, MPI_DOUBLE, 0, A->mpi_info->comm);
1170      #endif
1171    
1172      TMPMEMFREE(x);
1173      TMPMEMFREE(b);
1174      TMPMEMFREE(counts);
1175      TMPMEMFREE(offset);
1176  }  }

Legend:
Removed from v.3351  
changed lines
  Added in v.3886

  ViewVC Help
Powered by ViewVC 1.1.26