/[escript]/trunk/paso/src/AMG_Interpolation.cpp
ViewVC logotype

Diff of /trunk/paso/src/AMG_Interpolation.cpp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/doubleplusgood/paso/src/AMG_Interpolation.c revision 4257 by jfenwick, Wed Feb 27 03:42:40 2013 UTC trunk/paso/src/AMG_Interpolation.cpp revision 4836 by caltinay, Mon Apr 7 05:51:55 2014 UTC
# Line 1  Line 1 
1  /*****************************************************************************  /*****************************************************************************
2  *  *
3  * Copyright (c) 2003-2013 by University of Queensland  * Copyright (c) 2003-2014 by University of Queensland
4  * http://www.uq.edu.au  * http://www.uq.edu.au
5  *  *
6  * Primary Business: Queensland, Australia  * Primary Business: Queensland, Australia
# Line 8  Line 8 
8  * http://www.opensource.org/licenses/osl-3.0.php  * http://www.opensource.org/licenses/osl-3.0.php
9  *  *
10  * Development until 2012 by Earth Systems Science Computational Center (ESSCC)  * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11  * Development since 2012 by School of Earth Sciences  * Development 2012-2013 by School of Earth Sciences
12    * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13  *  *
14  *****************************************************************************/  *****************************************************************************/
15    
16    
17  /************************************************************************************/  /****************************************************************************/
18    
19  /* Paso: defines AMG Interpolation                            */  /* Paso: defines AMG Interpolation                            */
20    
21  /************************************************************************************/  /****************************************************************************/
22    
23  /* Author: l.gao@uq.edu.au                                    */  /* Author: l.gao@uq.edu.au                                    */
24    
25  /************************************************************************************/  /****************************************************************************/
26    
27  #include "Paso.h"  #include "Paso.h"
28  #include "SparseMatrix.h"  #include "SparseMatrix.h"
29  #include "PasoUtil.h"  #include "PasoUtil.h"
30  #include "Preconditioner.h"  #include "Preconditioner.h"
31    
32  /************************************************************************************  /****************************************************************************
33    
34      Methods nessecary for AMG preconditioner      Methods necessary for AMG preconditioner
35    
36      construct n_C x n_C interpolation operator A_C from matrix A      construct n_C x n_C interpolation operator A_C from matrix A
37      and prolongation matrix P.      and prolongation matrix P.
38    
39      The coarsening operator A_C is defined by RAP where R=P^T.      The coarsening operator A_C is defined by RAP where R=P^T.
40    
41  *************************************************************************************/  *****************************************************************************/
42    
43  /* Extend system matrix B with extra two sparse matrixes:  /* Extend system matrix B with extra two sparse matrices:
44      B_ext_main and B_ext_couple          B_ext_main and B_ext_couple
45     The combination of this two sparse matrixes represents       The combination of this two sparse matrices represents  
46     the portion of B that is stored on neighbor procs and     the portion of B that is stored on neighbour procs and
47     needed locally for triple matrix product (B^T) A B.     needed locally for triple matrix product (B^T) A B.
48    
49     FOR NOW, we assume that the system matrix B has a NULL     FOR NOW, we assume that the system matrix B has a NULL
# Line 56  Line 57 
57          A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
58          rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
59     on q. */     on q. */
60  void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)  void Paso_Preconditioner_AMG_extendB(paso::SystemMatrix_ptr A,
61                                         paso::SystemMatrix_ptr B)
62  {  {
63    Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;      paso::Pattern_ptr pattern_main, pattern_couple;
64    Paso_Coupler *coupler=NULL;      paso::Coupler_ptr coupler;
65    Paso_SharedComponents *send=NULL, *recv=NULL;      paso::SharedComponents_ptr send, recv;
66    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;      double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
67    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;      index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
68    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;      index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
69    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;      index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
70    index_t *idx_m=NULL, *idx_c=NULL;      index_t *idx_m=NULL, *idx_c=NULL;
71    index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;      index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
72    index_t offset, len, block_size, block_size_size, max_num_cols;      index_t offset, len, block_size, block_size_size, max_num_cols;
73    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;      index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
74    dim_t *recv_degree=NULL, *send_degree=NULL;      dim_t *recv_degree=NULL, *send_degree=NULL;
75    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;      dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
76    
77    if (size == 1) return;      if (size == 1) return;
78    
79    if (B->row_coupleBlock) {      B->row_coupleBlock.reset();
80      Paso_SparseMatrix_free(B->row_coupleBlock);  
81      B->row_coupleBlock = NULL;      if (B->remote_coupleBlock.get()) {
82    }          Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
83            return;
84    if (B->row_coupleBlock || B->remote_coupleBlock) {      }
     Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");  
     return;  
   }  
85    
86    /* sending/receiving unknown's global ID */      /* sending/receiving unknown's global ID */
87    num_main_cols = B->mainBlock->numCols;      num_main_cols = B->mainBlock->numCols;
88    cols = TMPMEMALLOC(num_main_cols, double);      cols = new double[num_main_cols];
89    offset = B->col_distribution->first_component[rank];      offset = B->col_distribution->first_component[rank];
90    #pragma omp parallel for private(i) schedule(static)      #pragma omp parallel for private(i) schedule(static)
91    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;      for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
92    if (B->global_id == NULL) {      if (B->global_id == NULL) {
93      coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);          coupler.reset(new paso::Coupler(B->col_coupler->connector, 1));
94      Paso_Coupler_startCollect(coupler, cols);          coupler->startCollect(cols);
95    }      }
96    
97    recv_buf = TMPMEMALLOC(size, index_t);      recv_buf = new index_t[size];
98    recv_degree = TMPMEMALLOC(size, dim_t);      recv_degree = new dim_t[size];
99    recv_offset = TMPMEMALLOC(size+1, index_t);      recv_offset = new index_t[size+1];
100    #pragma omp parallel for private(i) schedule(static)    #pragma omp parallel for private(i) schedule(static)
101    for (i=0; i<size; i++){      for (i=0; i<size; i++){
102      recv_buf[i] = 0;          recv_buf[i] = 0;
103      recv_degree[i] = 1;          recv_degree[i] = 1;
104      recv_offset[i] = i;          recv_offset[i] = i;
105    }      }
106    
107    block_size = B->block_size;    block_size = B->block_size;
108    block_size_size = block_size * sizeof(double);    block_size_size = block_size * sizeof(double);
# Line 113  void Paso_Preconditioner_AMG_extendB(Pas Line 112  void Paso_Preconditioner_AMG_extendB(Pas
112    num_neighbors = send->numNeighbors;    num_neighbors = send->numNeighbors;
113    p = send->offsetInShared[num_neighbors];    p = send->offsetInShared[num_neighbors];
114    len = p * B->col_distribution->first_component[size];    len = p * B->col_distribution->first_component[size];
115    send_buf = TMPMEMALLOC(len * block_size, double);    send_buf = new double[len * block_size];
116    send_idx = TMPMEMALLOC(len, index_t);    send_idx = new index_t[len];
117    send_offset = TMPMEMALLOC((p+1)*2, index_t);    send_offset = new index_t[(p+1)*2];
118    send_degree = TMPMEMALLOC(num_neighbors, dim_t);    send_degree = new dim_t[num_neighbors];
119    i = num_main_cols + num_couple_cols;    i = num_main_cols + num_couple_cols;
120    send_m = TMPMEMALLOC(i * block_size, double);    send_m = new double[i * block_size];
121    send_c = TMPMEMALLOC(i * block_size, double);    send_c = new double[i * block_size];
122    idx_m = TMPMEMALLOC(i, index_t);    idx_m = new index_t[i];
123    idx_c = TMPMEMALLOC(i, index_t);    idx_c = new index_t[i];
124    
125    /* waiting for receiving unknown's global ID */    /* waiting for receiving unknown's global ID */
126    if (B->global_id == NULL) {    if (B->global_id == NULL) {
127      Paso_Coupler_finishCollect(coupler);        coupler->finishCollect();
128      global_id = MEMALLOC(num_couple_cols, index_t);      global_id = new index_t[num_couple_cols];
129      #pragma omp parallel for private(i) schedule(static)      #pragma omp parallel for private(i) schedule(static)
130      for (i=0; i<num_couple_cols; ++i)      for (i=0; i<num_couple_cols; ++i)
131          global_id[i] = coupler->recv_buffer[i];          global_id[i] = coupler->recv_buffer[i];
132      B->global_id = global_id;      B->global_id = global_id;
     Paso_Coupler_free(coupler);  
133    } else    } else
134      global_id = B->global_id;      global_id = B->global_id;
135    
# Line 148  void Paso_Preconditioner_AMG_extendB(Pas Line 146  void Paso_Preconditioner_AMG_extendB(Pas
146      recv_offset[i] = len;      recv_offset[i] = len;
147      len += recv_buf[i];      len += recv_buf[i];
148      if (max_num_cols < recv_buf[i])      if (max_num_cols < recv_buf[i])
149      max_num_cols = recv_buf[i];          max_num_cols = recv_buf[i];
150    }    }
151    recv_offset[size] = len;    recv_offset[size] = len;
152    cols_array = TMPMEMALLOC(len, index_t);    cols_array = new index_t[len];
153    #ifdef ESYS_MPI    #ifdef ESYS_MPI
154    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
155    #endif    #endif
# Line 159  void Paso_Preconditioner_AMG_extendB(Pas Line 157  void Paso_Preconditioner_AMG_extendB(Pas
157    /* first, prepare the ptr_ptr to be received */    /* first, prepare the ptr_ptr to be received */
158    q = recv->numNeighbors;    q = recv->numNeighbors;
159    len = recv->offsetInShared[q];    len = recv->offsetInShared[q];
160    ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);    ptr_ptr = new index_t[(len+1) * 2];
161    for (p=0; p<q; p++) {    for (p=0; p<q; p++) {
162      row = recv->offsetInShared[p];      row = recv->offsetInShared[p];
163      m = recv->offsetInShared[p + 1];      m = recv->offsetInShared[p + 1];
# Line 181  void Paso_Preconditioner_AMG_extendB(Pas Line 179  void Paso_Preconditioner_AMG_extendB(Pas
179      m_ub = B->col_distribution->first_component[neighbor + 1];      m_ub = B->col_distribution->first_component[neighbor + 1];
180      j_ub = send->offsetInShared[p + 1];      j_ub = send->offsetInShared[p + 1];
181      for (j=send->offsetInShared[p]; j<j_ub; j++) {      for (j=send->offsetInShared[p]; j<j_ub; j++) {
182      row = send->shared[j];          row = send->shared[j];
183      l_m = 0;          l_m = 0;
184      l_c = 0;          l_c = 0;
185      k_ub = B->col_coupleBlock->pattern->ptr[row + 1];          k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
186      k_lb = B->col_coupleBlock->pattern->ptr[row];          k_lb = B->col_coupleBlock->pattern->ptr[row];
187    
188      /* check part of col_coupleBlock for data to be passed @row */          /* check part of col_coupleBlock for data to be passed @row */
189      for (k=k_lb; k<k_ub; k++) {          for (k=k_lb; k<k_ub; k++) {
       m = global_id[B->col_coupleBlock->pattern->index[k]];  
       if (m > offset) break;  
       if (m>= m_lb && m < m_ub) {  
         /* data to be passed to sparse matrix B_ext_main */  
         idx_m[l_m] = m - m_lb;  
         memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);  
         l_m++;  
       } else {  
         /* data to be passed to sparse matrix B_ext_couple */  
         idx_c[l_c] = m;  
         memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);  
         l_c++;  
       }  
     }  
     k_lb = k;  
   
     /* check mainBlock for data to be passed @row to sparse  
        matrix B_ext_couple */  
     k_ub = B->mainBlock->pattern->ptr[row + 1];  
     k = B->mainBlock->pattern->ptr[row];  
     memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));  
     for (; k<k_ub; k++) {  
       m = B->mainBlock->pattern->index[k] + offset;  
       idx_c[l_c] = m;  
       l_c++;  
     }  
   
     /* check the rest part of col_coupleBlock for data to  
        be passed @row to sparse matrix B_ext_couple */  
     k = k_lb;  
     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];  
     for (k=k_lb; k<k_ub; k++) {  
190            m = global_id[B->col_coupleBlock->pattern->index[k]];            m = global_id[B->col_coupleBlock->pattern->index[k]];
191        if (m>= m_lb && m < m_ub) {            if (m > offset) break;
192          /* data to be passed to sparse matrix B_ext_main */            if (m>= m_lb && m < m_ub) {
193          idx_m[l_m] = m - m_lb;              /* data to be passed to sparse matrix B_ext_main */
194          memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);              idx_m[l_m] = m - m_lb;
195          l_m++;              memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
196        } else {              l_m++;
197          /* data to be passed to sparse matrix B_ext_couple */            } else {
198          idx_c[l_c] = m;              /* data to be passed to sparse matrix B_ext_couple */
199          memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);              idx_c[l_c] = m;
200          l_c++;              memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
201        }              l_c++;
202      }            }
203            }
204            k_lb = k;
205    
206      memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);          /* check mainBlock for data to be passed @row to sparse
207      memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));             matrix B_ext_couple */
208            k_ub = B->mainBlock->pattern->ptr[row + 1];
209            k = B->mainBlock->pattern->ptr[row];
210            memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
211            for (; k<k_ub; k++) {
212              m = B->mainBlock->pattern->index[k] + offset;
213              idx_c[l_c] = m;
214              l_c++;
215            }
216    
217            /* check the rest part of col_coupleBlock for data to
218               be passed @row to sparse matrix B_ext_couple */
219            k = k_lb;
220            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
221            for (k=k_lb; k<k_ub; k++) {
222              m = global_id[B->col_coupleBlock->pattern->index[k]];
223              if (m>= m_lb && m < m_ub) {
224                /* data to be passed to sparse matrix B_ext_main */
225                idx_m[l_m] = m - m_lb;
226                memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
227                l_m++;
228              } else {
229                /* data to be passed to sparse matrix B_ext_couple */
230                idx_c[l_c] = m;
231                memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
232                l_c++;
233              }
234            }
235    
236            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
237            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
238          send_offset[2*i] = l_m;          send_offset[2*i] = l_m;
239      len += l_m;          len += l_m;
240      memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);          memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
241      memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));          memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
242      send_offset[2*i+1] = l_c;          send_offset[2*i+1] = l_c;
243      len += l_c;          len += l_c;
244      i++;          i++;
245      }      }
246    
247      /* sending */      /* sending */
# Line 256  void Paso_Preconditioner_AMG_extendB(Pas Line 254  void Paso_Preconditioner_AMG_extendB(Pas
254      send_degree[p] = len;      send_degree[p] = len;
255      i0 = i;      i0 = i;
256    }    }
257    TMPMEMFREE(send_m);    delete[] send_m;
258    TMPMEMFREE(send_c);    delete[] send_c;
259    TMPMEMFREE(idx_m);    delete[] idx_m;
260    TMPMEMFREE(idx_c);    delete[] idx_c;
261    
262    
263    q = recv->numNeighbors;    q = recv->numNeighbors;
264    len = recv->offsetInShared[q];    len = recv->offsetInShared[q];
265    ptr_main = MEMALLOC((len+1), index_t);    ptr_main = new index_t[(len+1)];
266    ptr_couple = MEMALLOC((len+1), index_t);    ptr_couple = new index_t[(len+1)];
267    
268    #ifdef ESYS_MPI    #ifdef ESYS_MPI
269    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
270                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
271                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
272    #endif    #endif
273    A->mpi_info->msg_tag_counter += size;    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
274    
275    j = 0;    j = 0;
276    k = 0;    k = 0;
# Line 285  void Paso_Preconditioner_AMG_extendB(Pas Line 283  void Paso_Preconditioner_AMG_extendB(Pas
283      ptr_couple[i+1] = k;      ptr_couple[i+1] = k;
284    }    }
285    
286    TMPMEMFREE(ptr_ptr);      delete[] ptr_ptr;  
287    idx_main = MEMALLOC(j, index_t);    idx_main = new index_t[j];
288    idx_couple = MEMALLOC(k, index_t);    idx_couple = new index_t[k];
289    ptr_idx = TMPMEMALLOC(j+k, index_t);    ptr_idx = new index_t[j+k];
290    ptr_val = TMPMEMALLOC((j+k) * block_size, double);    ptr_val = new double[(j+k) * block_size];
291    
292    /* send/receive index array */    /* send/receive index array */
293    j=0;    j=0;
# Line 299  void Paso_Preconditioner_AMG_extendB(Pas Line 297  void Paso_Preconditioner_AMG_extendB(Pas
297      m = recv->offsetInShared[p+1];      m = recv->offsetInShared[p+1];
298      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
299      if (i > 0) {      if (i > 0) {
300      k_ub ++;          k_ub ++;
301      #ifdef ESYS_MPI          #ifdef ESYS_MPI
302          MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],          MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
303                  A->mpi_info->msg_tag_counter+recv->neighbor[p],                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
304                  A->mpi_info->comm,                  A->mpi_info->comm,
305                  &(A->col_coupler->mpi_requests[p]));                  &(A->col_coupler->mpi_requests[p]));
306      #endif          #endif
307      }      }
308      j += i;      j += i;
309    }    }
# Line 315  void Paso_Preconditioner_AMG_extendB(Pas Line 313  void Paso_Preconditioner_AMG_extendB(Pas
313    for (p=0; p<num_neighbors; p++) {    for (p=0; p<num_neighbors; p++) {
314      i = send_degree[p] - j;      i = send_degree[p] - j;
315      if (i > 0){      if (i > 0){
316      k_ub ++;          k_ub ++;
317      #ifdef ESYS_MPI          #ifdef ESYS_MPI
318          MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],          MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
319                  A->mpi_info->msg_tag_counter+rank,                  A->mpi_info->msg_tag_counter+rank,
320                  A->mpi_info->comm,                  A->mpi_info->comm,
321                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
322      #endif          #endif
323      }      }
324      j = send_degree[p];      j = send_degree[p];
325    }    }
# Line 331  void Paso_Preconditioner_AMG_extendB(Pas Line 329  void Paso_Preconditioner_AMG_extendB(Pas
329                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
330                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
331    #endif    #endif
332    A->mpi_info->msg_tag_counter += size;    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
333    
334    #pragma omp parallel for private(i,j,k,m,p) schedule(static)    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
335    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
# Line 339  void Paso_Preconditioner_AMG_extendB(Pas Line 337  void Paso_Preconditioner_AMG_extendB(Pas
337      k = ptr_main[i+1];      k = ptr_main[i+1];
338      m = ptr_couple[i];      m = ptr_couple[i];
339      for (p=j; p<k; p++) {      for (p=j; p<k; p++) {
340      idx_main[p] = ptr_idx[m+p];          idx_main[p] = ptr_idx[m+p];
341      }      }
342      j = ptr_couple[i+1];      j = ptr_couple[i+1];
343      for (p=m; p<j; p++) {      for (p=m; p<j; p++) {
344      idx_couple[p] = ptr_idx[k+p];          idx_couple[p] = ptr_idx[k+p];
345      }      }
346    }    }
347    TMPMEMFREE(ptr_idx);    delete[] ptr_idx;
348    
349    /* allocate pattern and sparsematrix for B_ext_main */    /* allocate pattern and sparsematrix for B_ext_main */
350    pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,    pattern_main.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
351                  len, num_main_cols, ptr_main, idx_main);                  len, num_main_cols, ptr_main, idx_main));
352    B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,    B->row_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
353          pattern_main, B->row_block_size, B->col_block_size,                  pattern_main, B->row_block_size, B->col_block_size,
354          FALSE);                  false));
   Paso_Pattern_free(pattern_main);  
355    
356    /* allocate pattern and sparsematrix for B_ext_couple */    /* allocate pattern and sparsematrix for B_ext_couple */
357    pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,    pattern_couple.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
358                  len, B->col_distribution->first_component[size],                  len, B->col_distribution->first_component[size],
359          ptr_couple, idx_couple);                  ptr_couple, idx_couple));
360    B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,    B->remote_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
361                  pattern_couple, B->row_block_size, B->col_block_size,                  pattern_couple, B->row_block_size, B->col_block_size,
362                  FALSE);                  false));
   Paso_Pattern_free(pattern_couple);  
363    
364    /* send/receive value array */    /* send/receive value array */
365    j=0;    j=0;
# Line 400  void Paso_Preconditioner_AMG_extendB(Pas Line 396  void Paso_Preconditioner_AMG_extendB(Pas
396                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
397                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
398    #endif    #endif
399    A->mpi_info->msg_tag_counter += size;    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
400    
401    #pragma omp parallel for private(i,j,k,m,p) schedule(static)    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
402    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
# Line 408  void Paso_Preconditioner_AMG_extendB(Pas Line 404  void Paso_Preconditioner_AMG_extendB(Pas
404      k = ptr_main[i+1];      k = ptr_main[i+1];
405      m = ptr_couple[i];      m = ptr_couple[i];
406      for (p=j; p<k; p++) {      for (p=j; p<k; p++) {
407      memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);          memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
408      }      }
409      j = ptr_couple[i+1];      j = ptr_couple[i+1];
410      for (p=m; p<j; p++) {      for (p=m; p<j; p++) {
411      memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);          memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
412      }      }
413    }    }
414    TMPMEMFREE(ptr_val);    delete[] ptr_val;
415    
416    
417    /* release all temp memory allocation */      /* release all temp memory allocation */
418    TMPMEMFREE(cols);      delete[] cols;
419    TMPMEMFREE(cols_array);      delete[] cols_array;
420    TMPMEMFREE(recv_offset);      delete[] recv_offset;
421    TMPMEMFREE(recv_degree);      delete[] recv_degree;
422    TMPMEMFREE(recv_buf);      delete[] recv_buf;
423    TMPMEMFREE(send_buf);      delete[] send_buf;
424    TMPMEMFREE(send_offset);      delete[] send_offset;
425    TMPMEMFREE(send_degree);      delete[] send_degree;
426    TMPMEMFREE(send_idx);      delete[] send_idx;
427  }  }
428    
429  /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)  /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)
430     has the same number of rows as P->col_coupleBlock->numCols. Now, we need     has the same number of rows as P->col_coupleBlock->numCols. Now, we need
431     to copy block of data in T to neighbor processors, defined by     to copy block of data in T to neighbour processors, defined by
432      P->col_coupler->connector->recv->neighbor[k] where k is in          P->col_coupler->connector->recv->neighbor[k] where k is in
433      [0, P->col_coupler->connector->recv->numNeighbors).          [0, P->col_coupler->connector->recv->numNeighbors).
434     Rows to be copied to neighbor processor k is in the list defined by     Rows to be copied to neighbor processor k is in the list defined by
435      P->col_coupler->connector->recv->offsetInShared[k] ...          P->col_coupler->connector->recv->offsetInShared[k] ...
436      P->col_coupler->connector->recv->offsetInShared[k+1]  */          P->col_coupler->connector->recv->offsetInShared[k+1]  */
437  void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,  void Paso_Preconditioner_AMG_CopyRemoteData(paso::SystemMatrix_ptr P,
438      index_t **p_ptr, index_t **p_idx, double **p_val,          index_t **p_ptr, index_t **p_idx, double **p_val,
439      index_t *global_id, index_t block_size)          index_t *global_id, index_t block_size)
440  {  {
441    Paso_SharedComponents *send=NULL, *recv=NULL;      paso::SharedComponents_ptr send, recv;
442    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;      index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
443    index_t i, j, p, m, n, size;      index_t i, j, p, m, n, size;
444    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;      index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
445    index_t *ptr=*p_ptr, *idx=*p_idx;      index_t *ptr=*p_ptr, *idx=*p_idx;
446    double  *val=*p_val, *recv_val=NULL;      double  *val=*p_val, *recv_val=NULL;
447    #ifdef ESYS_MPI    #ifdef ESYS_MPI
448    index_t rank = P->mpi_info->rank;      index_t rank = P->mpi_info->rank;
449    #endif    #endif
450    
451    size = P->mpi_info->size;      size = P->mpi_info->size;
452    send = P->col_coupler->connector->recv;      send = P->col_coupler->connector->recv;
453    recv = P->col_coupler->connector->send;      recv = P->col_coupler->connector->send;
454    send_neighbors = send->numNeighbors;      send_neighbors = send->numNeighbors;
455    recv_neighbors = recv->numNeighbors;      recv_neighbors = recv->numNeighbors;
456    send_rows = P->col_coupleBlock->numCols;      send_rows = P->col_coupleBlock->numCols;
457    recv_rows = recv->offsetInShared[recv_neighbors];      recv_rows = recv->offsetInShared[recv_neighbors];
458    
459    send_degree = TMPMEMALLOC(send_rows, index_t);      send_degree = new index_t[send_rows];
460    recv_ptr = MEMALLOC(recv_rows + 1, index_t);      recv_ptr = new index_t[recv_rows + 1];
461    #pragma omp for schedule(static) private(i)    #pragma omp for schedule(static) private(i)
462    for (i=0; i<send_rows; i++)      for (i=0; i<send_rows; i++)
463      send_degree[i] = ptr[i+1] - ptr[i];          send_degree[i] = ptr[i+1] - ptr[i];
464    
465    /* First, send/receive the degree */    /* First, send/receive the degree */
466    for (p=0; p<recv_neighbors; p++) { /* Receiving */    for (p=0; p<recv_neighbors; p++) { /* Receiving */
# Line 472  void Paso_Preconditioner_AMG_CopyRemoteD Line 468  void Paso_Preconditioner_AMG_CopyRemoteD
468      n = recv->offsetInShared[p+1];      n = recv->offsetInShared[p+1];
469      #ifdef ESYS_MPI      #ifdef ESYS_MPI
470      MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],      MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
471          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
472          P->mpi_info->comm,                  P->mpi_info->comm,
473          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
474      #endif      #endif
475    }    }
476    for (p=0; p<send_neighbors; p++) { /* Sending */    for (p=0; p<send_neighbors; p++) { /* Sending */
# Line 482  void Paso_Preconditioner_AMG_CopyRemoteD Line 478  void Paso_Preconditioner_AMG_CopyRemoteD
478      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
479      #ifdef ESYS_MPI      #ifdef ESYS_MPI
480      MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],      MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
481          P->mpi_info->msg_tag_counter + rank,                  P->mpi_info->msg_tag_counter + rank,
482          P->mpi_info->comm,                  P->mpi_info->comm,
483          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
484      #endif      #endif
485    }    }
486    #ifdef ESYS_MPI    #ifdef ESYS_MPI
487    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
488          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
489          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
490    #endif    #endif
491    P->mpi_info->msg_tag_counter += size;    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
492    
493    TMPMEMFREE(send_degree);    delete[] send_degree;
494    m = Paso_Util_cumsum(recv_rows, recv_ptr);    m = Paso_Util_cumsum(recv_rows, recv_ptr);
495    recv_ptr[recv_rows] = m;    recv_ptr[recv_rows] = m;
496    recv_idx = MEMALLOC(m, index_t);    recv_idx = new index_t[m];
497    recv_val = MEMALLOC(m * block_size, double);    recv_val = new double[m * block_size];
498    
499    /* Next, send/receive the index array */    /* Next, send/receive the index array */
500    j = 0;    j = 0;
# Line 509  void Paso_Preconditioner_AMG_CopyRemoteD Line 505  void Paso_Preconditioner_AMG_CopyRemoteD
505      if (i > 0) {      if (i > 0) {
506        #ifdef ESYS_MPI        #ifdef ESYS_MPI
507        MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],        MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
508          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
509          P->mpi_info->comm,                  P->mpi_info->comm,
510          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
511        #endif        #endif
512      }      }
513      j += i;      j += i;
# Line 523  void Paso_Preconditioner_AMG_CopyRemoteD Line 519  void Paso_Preconditioner_AMG_CopyRemoteD
519      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
520      i = ptr[n] - ptr[m];      i = ptr[n] - ptr[m];
521      if (i >0) {      if (i >0) {
522      #ifdef ESYS_MPI          #ifdef ESYS_MPI
523      MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],          MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
524          P->mpi_info->msg_tag_counter + rank,                  P->mpi_info->msg_tag_counter + rank,
525          P->mpi_info->comm,                  P->mpi_info->comm,
526          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
527      #endif          #endif
528      j += i;          j += i;
529      }      }
530    }    }
531    #ifdef ESYS_MPI    #ifdef ESYS_MPI
532    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
533          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
534          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
535    #endif    #endif
536    P->mpi_info->msg_tag_counter += size;    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
537    
538    /* Last, send/receive the data array */    /* Last, send/receive the data array */
539    j = 0;    j = 0;
# Line 548  void Paso_Preconditioner_AMG_CopyRemoteD Line 544  void Paso_Preconditioner_AMG_CopyRemoteD
544      #ifdef ESYS_MPI      #ifdef ESYS_MPI
545      if (i > 0)      if (i > 0)
546        MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],        MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
547          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
548          P->mpi_info->comm,                  P->mpi_info->comm,
549          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
550      #endif      #endif
551      j += (i*block_size);      j += (i*block_size);
552    }    }
# Line 561  void Paso_Preconditioner_AMG_CopyRemoteD Line 557  void Paso_Preconditioner_AMG_CopyRemoteD
557      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
558      i = ptr[n] - ptr[m];      i = ptr[n] - ptr[m];
559      if (i >0) {      if (i >0) {
560      #ifdef ESYS_MPI          #ifdef ESYS_MPI
561      MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],          MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
562          P->mpi_info->msg_tag_counter + rank,                  P->mpi_info->msg_tag_counter + rank,
563          P->mpi_info->comm,                  P->mpi_info->comm,
564          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
565      #endif          #endif
566      j += i * block_size;          j += i * block_size;
567      }      }
568    }    }
569    #ifdef ESYS_MPI    #ifdef ESYS_MPI
570    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
571          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
572          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
573    #endif    #endif
574    P->mpi_info->msg_tag_counter += size;    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
575    
576    /* Clean up and return with recevied ptr, index and data arrays */    /* Clean up and return with received ptr, index and data arrays */
577    MEMFREE(ptr);    delete[] ptr;
578    MEMFREE(idx);    delete[] idx;
579    MEMFREE(val);    delete[] val;
580    *p_ptr = recv_ptr;    *p_ptr = recv_ptr;
581    *p_idx = recv_idx;    *p_idx = recv_idx;
582    *p_val = recv_val;    *p_val = recv_val;
583  }  }
584    
585  Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(  paso::SystemMatrix_ptr Paso_Preconditioner_AMG_buildInterpolationOperator(
586      Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)          paso::SystemMatrix_ptr A, paso::SystemMatrix_ptr P,
587            paso::SystemMatrix_ptr R)
588  {  {
589     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
590     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;     paso::SystemMatrix_ptr out;
591     Paso_SystemMatrix *out=NULL;     paso::SystemMatrixPattern_ptr pattern;
592     Paso_SystemMatrixPattern *pattern=NULL;     paso::Distribution_ptr input_dist, output_dist;
593     Paso_Distribution *input_dist=NULL, *output_dist=NULL;     paso::Connector_ptr col_connector, row_connector;
    Paso_SharedComponents *send =NULL, *recv=NULL;  
    Paso_Connector *col_connector=NULL, *row_connector=NULL;  
    Paso_Pattern *main_pattern=NULL;  
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;  
594     const dim_t row_block_size=A->row_block_size;     const dim_t row_block_size=A->row_block_size;
595     const dim_t col_block_size=A->col_block_size;     const dim_t col_block_size=A->col_block_size;
596     const dim_t block_size = A->block_size;     const dim_t block_size = A->block_size;
# Line 631  Paso_SystemMatrix* Paso_Preconditioner_A Line 624  Paso_SystemMatrix* Paso_Preconditioner_A
624  /*   if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))  /*   if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
625       return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/       return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
626    
627     /* two sparse matrixes R_main and R_couple will be generate, as the     /* two sparse matrices R_main and R_couple will be generate, as the
628        transpose of P_main and P_col_couple, respectively. Note that,        transpose of P_main and P_col_couple, respectively. Note that,
629        R_couple is actually the row_coupleBlock of R (R=P^T) */        R_couple is actually the row_coupleBlock of R (R=P^T) */
630     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);     paso::SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
631       paso::SparseMatrix_ptr R_couple;
632     if (size > 1)     if (size > 1)
633       R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);       R_couple = P->col_coupleBlock->getTranspose();
634    
635     /* generate P_ext, i.e. portion of P that is tored on neighbor procs     /* generate P_ext, i.e. portion of P that is stored on neighbor procs
636        and needed locally for triple matrix product RAP        and needed locally for triple matrix product RAP
637        to be specific, P_ext (on processor k) are group of rows in P, where        to be specific, P_ext (on processor k) are group of rows in P, where
638        the list of rows from processor q is given by        the list of rows from processor q is given by
639      A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
640      rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
641        on q.        on q.
642        P_ext is represented by two sparse matrixes P_ext_main and        P_ext is represented by two sparse matrices P_ext_main and
643        P_ext_couple */        P_ext_couple */
644     Paso_Preconditioner_AMG_extendB(A, P);     Paso_Preconditioner_AMG_extendB(A, P);
645    
# Line 668  Paso_SystemMatrix* Paso_Preconditioner_A Line 662  Paso_SystemMatrix* Paso_Preconditioner_A
662     num_Pext_cols = 0;     num_Pext_cols = 0;
663     if (P->global_id) {     if (P->global_id) {
664       /* first count the number of cols "num_Pext_cols" in both P_ext_couple       /* first count the number of cols "num_Pext_cols" in both P_ext_couple
665      and P->col_coupleBlock */          and P->col_coupleBlock */
666       iptr = 0;       iptr = 0;
667       if (num_Pcouple_cols || sum > 0) {       if (num_Pcouple_cols || sum > 0) {
668      temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);          temp = new index_t[num_Pcouple_cols+sum];
669      #pragma omp parallel for lastprivate(iptr) schedule(static)          #pragma omp parallel for lastprivate(iptr) schedule(static)
670      for (iptr=0; iptr<sum; iptr++){          for (iptr=0; iptr<sum; iptr++){
671        temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];            temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
672      }          }
673      for (j=0; j<num_Pcouple_cols; j++, iptr++){          for (j=0; j<num_Pcouple_cols; j++, iptr++){
674        temp[iptr] = P->global_id[j];            temp[iptr] = P->global_id[j];
675      }          }
676       }       }
677       if (iptr) {       if (iptr) {
678      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
679        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_Pext_cols = 1;
680      #else          i = temp[0];
681        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
682      #endif            if (temp[j] > i) {
683      num_Pext_cols = 1;              i = temp[j];
684      i = temp[0];              temp[num_Pext_cols++] = i;
685      for (j=1; j<iptr; j++) {            }
686        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_Pext_cols++] = i;  
       }  
     }  
687       }       }
688       /* resize the pattern of P_ext_couple */       /* resize the pattern of P_ext_couple */
689       if(num_Pext_cols){       if(num_Pext_cols){
690      global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);          global_id_P = new index_t[num_Pext_cols];
691      #pragma omp parallel for private(i) schedule(static)          #pragma omp parallel for private(i) schedule(static)
692      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
693        global_id_P[i] = temp[i];            global_id_P[i] = temp[i];
694       }       }
695       if (num_Pcouple_cols || sum > 0)       if (num_Pcouple_cols || sum > 0)
696      TMPMEMFREE(temp);          delete[] temp;
697       #pragma omp parallel for private(i, where_p) schedule(static)       #pragma omp parallel for private(i, where_p) schedule(static)
698       for (i=0; i<sum; i++) {       for (i=0; i<sum; i++) {
699      where_p = (index_t *)bsearch(          where_p = (index_t *)bsearch(
700              &(P->remote_coupleBlock->pattern->index[i]),                          &(P->remote_coupleBlock->pattern->index[i]),
701              global_id_P, num_Pext_cols,                          global_id_P, num_Pext_cols,
702                          sizeof(index_t), Paso_comparIndex);                          sizeof(index_t), paso::comparIndex);
703      P->remote_coupleBlock->pattern->index[i] =          P->remote_coupleBlock->pattern->index[i] =
704              (index_t)(where_p -global_id_P);                          (index_t)(where_p -global_id_P);
705       }         }  
706    
707       /* build the mapping */       /* build the mapping */
708       if (num_Pcouple_cols) {       if (num_Pcouple_cols) {
709      Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);          Pcouple_to_Pext = new index_t[num_Pcouple_cols];
710      iptr = 0;          iptr = 0;
711      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
712        if (global_id_P[i] == P->global_id[iptr]) {            if (global_id_P[i] == P->global_id[iptr]) {
713          Pcouple_to_Pext[iptr++] = i;              Pcouple_to_Pext[iptr++] = i;
714          if (iptr == num_Pcouple_cols) break;              if (iptr == num_Pcouple_cols) break;
715        }            }
716       }       }
717     }     }
718    
719     /* alloc and initialize the makers */     /* alloc and initialise the makers */
720     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
721     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
722     A_marker = TMPMEMALLOC(num_A_cols, index_t);     A_marker = new index_t[num_A_cols];
723     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
724     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
725     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
# Line 742  Paso_SystemMatrix* Paso_Preconditioner_A Line 732  Paso_SystemMatrix* Paso_Preconditioner_A
732       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
733       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
734       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
735      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
736      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
737      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
738      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
739        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
740    
741        /* check whether entry RA[i_r, i2] has been previously visited.            /* check whether entry RA[i_r, i2] has been previously visited.
742           RAP new entry is possible only if entry RA[i_r, i2] has not               RAP new entry is possible only if entry RA[i_r, i2] has not
743           been visited yet */               been visited yet */
744        if (A_marker[i2] != i_r) {            if (A_marker[i2] != i_r) {
745          /* first, mark entry RA[i_r,i2] as visited */              /* first, mark entry RA[i_r,i2] as visited */
746          A_marker[i2] = i_r;              A_marker[i2] = i_r;
747    
748          /* then loop over elements in row i2 of P_ext_main */              /* then loop over elements in row i2 of P_ext_main */
749          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
750          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
751          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
752    
753          /* check whether entry RAP[i_r,i_c] has been created.                  /* check whether entry RAP[i_r,i_c] has been created.
754             If not yet created, create the entry and increase                     If not yet created, create the entry and increase
755             the total number of elements in RAP_ext */                     the total number of elements in RAP_ext */
756          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
757            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
758            sum++;                    sum++;
759          }                  }
760          }              }
761    
762          /* loop over elements in row i2 of P_ext_couple, do the same */              /* loop over elements in row i2 of P_ext_couple, do the same */
763          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
764          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
765          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
766          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
767            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
768            sum++;                    sum++;
769          }                  }
770          }              }
771        }            }
772      }          }
773    
774      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
775         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
776      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
777      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
778        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
779        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
780          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
781          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
782          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
783          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
784          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
785            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
786            sum++;                    sum++;
787          }                  }
788          }              }
789          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
790          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
791          /* note that we need to map the column index in                  /* note that we need to map the column index in
792             P->col_coupleBlock back into the column index in                     P->col_coupleBlock back into the column index in
793             P_ext_couple */                     P_ext_couple */
794          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
795          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
796            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
797            sum++;                    sum++;
798          }                  }
799          }              }
800        }            }
801      }          }
802       }       }
803     }     }
804    
805     /* Now we have the number of non-zero elements in RAP_ext, allocate     /* Now we have the number of non-zero elements in RAP_ext, allocate
806        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
807     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);     RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
808     RAP_ext_idx = MEMALLOC(sum, index_t);     RAP_ext_idx = new index_t[sum];
809     RAP_ext_val = MEMALLOC(sum * block_size, double);     RAP_ext_val = new double[sum * block_size];
810     RA_val = TMPMEMALLOC(block_size, double);     RA_val = new double[block_size];
811     RAP_val = TMPMEMALLOC(block_size, double);     RAP_val = new double[block_size];
812    
813     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
814     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
# Line 833  Paso_SystemMatrix* Paso_Preconditioner_A Line 823  Paso_SystemMatrix* Paso_Preconditioner_A
823       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
824       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
825       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
826      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
827      R_val = &(R_couple->val[j1*P_block_size]);          R_val = &(R_couple->val[j1*P_block_size]);
828    
829      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
830      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
831      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
832        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
833        temp_val = &(A->col_coupleBlock->val[j2*block_size]);            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
834        for (irb=0; irb<row_block_size; irb++)            for (irb=0; irb<row_block_size; irb++)
835              for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
836                  RA_val[irb+row_block_size*icb] = ZERO;                  RA_val[irb+row_block_size*icb] = ZERO;
837        for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
838          rtmp=R_val[irb];              rtmp=R_val[irb];
839          for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
840          RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
841          }              }
842        }            }
843    
844        /* check whether entry RA[i_r, i2] has been previously visited.            /* check whether entry RA[i_r, i2] has been previously visited.
845           RAP new entry is possible only if entry RA[i_r, i2] has not               RAP new entry is possible only if entry RA[i_r, i2] has not
846           been visited yet */               been visited yet */
847        if (A_marker[i2] != i_r) {            if (A_marker[i2] != i_r) {
848          /* first, mark entry RA[i_r,i2] as visited */              /* first, mark entry RA[i_r,i2] as visited */
849          A_marker[i2] = i_r;              A_marker[i2] = i_r;
850    
851          /* then loop over elements in row i2 of P_ext_main */              /* then loop over elements in row i2 of P_ext_main */
852          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
853          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
854          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
855          temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
856          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
857            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
858              RAP_val[irb+row_block_size*icb] = ZERO;                      RAP_val[irb+row_block_size*icb] = ZERO;
859          for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
860            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
861            for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
862              RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
863            }                    }
864          }                  }
865    
866          /* check whether entry RAP[i_r,i_c] has been created.                  /* check whether entry RAP[i_r,i_c] has been created.
867             If not yet created, create the entry and increase                     If not yet created, create the entry and increase
868             the total number of elements in RAP_ext */                     the total number of elements in RAP_ext */
869          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
870            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
871            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
872            RAP_ext_idx[sum] = i_c + offset;                    RAP_ext_idx[sum] = i_c + offset;
873            sum++;                    sum++;
874          } else {                  } else {
875            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
876            for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
877              temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
878            }                    }
879          }                  }
880          }              }
881    
882          /* loop over elements in row i2 of P_ext_couple, do the same */              /* loop over elements in row i2 of P_ext_couple, do the same */
883          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
884          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
885          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
886                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
887          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
888            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
889              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
890                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
891            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
892                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
893                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
894                    }                    }
895                  }                  }
896    
897          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
898            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
899            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
900            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
901            sum++;                    sum++;
902          } else {                  } else {
903                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
904                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
905                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
906                    }                    }
907          }                  }
908          }              }
909    
910        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.            /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
911           Only the contributions are added. */               Only the contributions are added. */
912        } else {            } else {
913          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
914          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
915          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
916                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
917          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
918            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
919              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
920                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
921            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
922                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
923                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
924                    }                    }
# Line 938  Paso_SystemMatrix* Paso_Preconditioner_A Line 928  Paso_SystemMatrix* Paso_Preconditioner_A
928                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
929                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
930                  }                  }
931          }              }
932          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
933          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
934          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
935                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
936          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
937            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
938              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
939                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
940            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
941                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
942                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
943                    }                    }
# Line 957  Paso_SystemMatrix* Paso_Preconditioner_A Line 947  Paso_SystemMatrix* Paso_Preconditioner_A
947                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
948                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
949                  }                  }
950          }              }
951        }            }
952      }          }
953    
954      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
955         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
956      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
957      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
958        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
959            temp_val = &(A->mainBlock->val[j2*block_size]);            temp_val = &(A->mainBlock->val[j2*block_size]);
960        for (irb=0; irb<row_block_size; irb++)            for (irb=0; irb<row_block_size; irb++)
961          for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
962          RA_val[irb+row_block_size*icb]=ZERO;                  RA_val[irb+row_block_size*icb]=ZERO;
963            for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
964          rtmp=R_val[irb];              rtmp=R_val[irb];
965              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
966                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
967              }              }
968            }            }
969    
970        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
971          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
972          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
973          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
974          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
975                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
976          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
977            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
978              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
979                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
980            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
981                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
982                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
983                    }                    }
984                  }                  }
985    
986          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
987            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
988            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
989            RAP_ext_idx[sum] = i_c + offset;                    RAP_ext_idx[sum] = i_c + offset;
990            sum++;                    sum++;
991          } else {                  } else {
992                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
993                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
994                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
995                    }                    }
996          }                  }
997          }              }
998          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
999          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1000          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1001                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1002          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1003                    for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1004              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1005                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1006            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1007                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1008                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1009                    }                    }
1010                  }                  }
1011    
1012          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
1013            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
1014            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1015                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1016            sum++;                    sum++;
1017          } else {                  } else {
1018                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1019                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1020                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1021                    }                    }
1022          }                  }
1023          }              }
1024        } else {            } else {
1025          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1026          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1027          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
1028                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
1029          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1030            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1031              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1032                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1033            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1034                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1035                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1036                    }                    }
# Line 1050  Paso_SystemMatrix* Paso_Preconditioner_A Line 1040  Paso_SystemMatrix* Paso_Preconditioner_A
1040                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1041                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1042                  }                  }
1043          }              }
1044          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1045          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1046          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1047                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1048          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1049            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1050              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1051                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1052              rtmp=temp_val[icb];                      rtmp=temp_val[icb];
1053                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1054                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1055                    }                    }
# Line 1069  Paso_SystemMatrix* Paso_Preconditioner_A Line 1059  Paso_SystemMatrix* Paso_Preconditioner_A
1059                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1060                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1061                  }                  }
1062          }              }
1063        }            }
1064      }          }
1065       }       }
1066       RAP_ext_ptr[i_r+1] = sum;       RAP_ext_ptr[i_r+1] = sum;
1067     }     }
1068     TMPMEMFREE(P_marker);     delete[] P_marker;
1069     TMPMEMFREE(Pcouple_to_Pext);     delete[] Pcouple_to_Pext;
1070    
1071     /* Now we have part of RAP[r,c] where row "r" is the list of rows     /* Now we have part of RAP[r,c] where row "r" is the list of rows
1072        which is given by the column list of P->col_coupleBlock, and        which is given by the column list of P->col_coupleBlock, and
1073        column "c" is the list of columns which possiblely covers the        column "c" is the list of columns which possibly covers the
1074        whole column range of systme martris P. This part of data is to        whole column range of system matrix P. This part of data is to
1075        be passed to neighboring processors, and added to corresponding        be passed to neighbouring processors, and added to corresponding
1076        RAP[r,c] entries in the neighboring processors */        RAP[r,c] entries in the neighbouring processors */
1077     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1078          &RAP_ext_val, global_id_P, block_size);                  &RAP_ext_val, global_id_P, block_size);
1079    
1080     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1081     sum = RAP_ext_ptr[num_RAPext_rows];     sum = RAP_ext_ptr[num_RAPext_rows];
1082     num_RAPext_cols = 0;     num_RAPext_cols = 0;
1083     if (num_Pext_cols || sum > 0) {     if (num_Pext_cols || sum > 0) {
1084       temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);       temp = new index_t[num_Pext_cols+sum];
1085       j1_ub = offset + num_Pmain_cols;       j1_ub = offset + num_Pmain_cols;
1086       for (i=0, iptr=0; i<sum; i++) {       for (i=0, iptr=0; i<sum; i++) {
1087      if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */          if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */
1088        temp[iptr++] = RAP_ext_idx[i];          /* XXX */            temp[iptr++] = RAP_ext_idx[i];                  /* XXX */
1089       }       }
1090       for (j=0; j<num_Pext_cols; j++, iptr++){       for (j=0; j<num_Pext_cols; j++, iptr++){
1091      temp[iptr] = global_id_P[j];          temp[iptr] = global_id_P[j];
1092       }       }
1093    
1094       if (iptr) {       if (iptr) {
1095      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1096        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_RAPext_cols = 1;
1097      #else          i = temp[0];
1098        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
1099      #endif            if (temp[j] > i) {
1100      num_RAPext_cols = 1;              i = temp[j];
1101      i = temp[0];              temp[num_RAPext_cols++] = i;
1102      for (j=1; j<iptr; j++) {            }
1103        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_RAPext_cols++] = i;  
       }  
     }  
1104       }       }
1105     }     }
1106    
1107     /* resize the pattern of P_ext_couple */     /* resize the pattern of P_ext_couple */
1108     if(num_RAPext_cols){     if(num_RAPext_cols){
1109       global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);       global_id_RAP = new index_t[num_RAPext_cols];
1110       #pragma omp parallel for private(i) schedule(static)       #pragma omp parallel for private(i) schedule(static)
1111       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1112      global_id_RAP[i] = temp[i];          global_id_RAP[i] = temp[i];
1113     }     }
1114     if (num_Pext_cols || sum > 0)     if (num_Pext_cols || sum > 0)
1115       TMPMEMFREE(temp);       delete[] temp;
1116     j1_ub = offset + num_Pmain_cols;     j1_ub = offset + num_Pmain_cols;
1117     #pragma omp parallel for private(i, where_p) schedule(static)     #pragma omp parallel for private(i, where_p) schedule(static)
1118     for (i=0; i<sum; i++) {     for (i=0; i<sum; i++) {
1119       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1120      where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,          where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1121  /*XXX*/         num_RAPext_cols, sizeof(index_t), Paso_comparIndex);  /*XXX*/                 num_RAPext_cols, sizeof(index_t), paso::comparIndex);
1122      RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);          RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1123       } else       } else
1124      RAP_ext_idx[i] = RAP_ext_idx[i] - offset;          RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1125     }     }
1126    
1127     /* build the mapping */     /* build the mapping */
1128     if (num_Pcouple_cols) {     if (num_Pcouple_cols) {
1129       Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);       Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1130       iptr = 0;       iptr = 0;
1131       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1132      if (global_id_RAP[i] == P->global_id[iptr]) {          if (global_id_RAP[i] == P->global_id[iptr]) {
1133        Pcouple_to_RAP[iptr++] = i;            Pcouple_to_RAP[iptr++] = i;
1134        if (iptr == num_Pcouple_cols) break;            if (iptr == num_Pcouple_cols) break;
1135      }          }
1136     }     }
1137    
1138     if (num_Pext_cols) {     if (num_Pext_cols) {
1139       Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);       Pext_to_RAP = new index_t[num_Pext_cols];
1140       iptr = 0;       iptr = 0;
1141       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1142      if (global_id_RAP[i] == global_id_P[iptr]) {          if (global_id_RAP[i] == global_id_P[iptr]) {
1143        Pext_to_RAP[iptr++] = i;            Pext_to_RAP[iptr++] = i;
1144        if (iptr == num_Pext_cols) break;            if (iptr == num_Pext_cols) break;
1145      }          }
1146     }     }
1147    
1148     if (global_id_P){     if (global_id_P){
1149       TMPMEMFREE(global_id_P);       delete[] global_id_P;
1150       global_id_P = NULL;       global_id_P = NULL;
1151     }     }
1152    
1153     /* alloc and initialize the makers */     /* alloc and initialise the makers */
1154     sum = num_RAPext_cols + num_Pmain_cols;     sum = num_RAPext_cols + num_Pmain_cols;
1155     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
1156     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
1157     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
1158     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
# Line 1184  Paso_SystemMatrix* Paso_Preconditioner_A Line 1170  Paso_SystemMatrix* Paso_Preconditioner_A
1170       row_marker_ext = j;       row_marker_ext = j;
1171    
1172       /* Mark the diagonal element RAP[i_r, i_r], and other elements       /* Mark the diagonal element RAP[i_r, i_r], and other elements
1173      in RAP_ext */          in RAP_ext */
1174       P_marker[i_r] = i;       P_marker[i_r] = i;
1175       i++;       i++;
1176       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
1177      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1178        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
1179          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1180            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
1181            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
1182          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
1183            P_marker[i_c] = i;                    P_marker[i_c] = i;
1184            i++;                    i++;
1185          }                  }
1186            } else {                } else {
1187          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
1188            P_marker[i_c] = j;                    P_marker[i_c] = j;
1189            j++;                    j++;
1190          }                  }
1191            }                }
1192          }              }
1193          break;              break;
1194        }            }
1195      }          }
1196       }       }
1197    
1198       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
1199       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
1200       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1201      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
1202    
1203            /* then, loop over elements in row i1 of A->col_coupleBlock */
1204            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1205            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1206              i2 = A->col_coupleBlock->pattern->index[j2];
1207    
1208              /* check whether entry RA[i_r, i2] has been previously visited.
1209                 RAP new entry is possible only if entry RA[i_r, i2] has not
1210                 been visited yet */
1211              if (A_marker[i2] != i_r) {
1212                /* first, mark entry RA[i_r,i2] as visited */
1213                A_marker[i2] = i_r;
1214    
1215                /* then loop over elements in row i2 of P_ext_main */
1216                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1217                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1218                    i_c = P->row_coupleBlock->pattern->index[j3];
1219    
1220                    /* check whether entry RAP[i_r,i_c] has been created.
1221                       If not yet created, create the entry and increase
1222                       the total number of elements in RAP_ext */
1223                    if (P_marker[i_c] < row_marker) {
1224                      P_marker[i_c] = i;
1225                      i++;
1226                    }
1227                }
1228    
1229                /* loop over elements in row i2 of P_ext_couple, do the same */
1230                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1231                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232                    i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1233                    if (P_marker[i_c] < row_marker_ext) {
1234                      P_marker[i_c] = j;
1235                      j++;
1236                    }
1237                }
1238              }
1239            }
1240    
1241      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* now loop over elements in row i1 of A->mainBlock, repeat
1242      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];             the process we have done to block A->col_coupleBlock */
1243      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          j2_ub = A->mainBlock->pattern->ptr[i1+1];
1244        i2 = A->col_coupleBlock->pattern->index[j2];          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1245              i2 = A->mainBlock->pattern->index[j2];
1246        /* check whether entry RA[i_r, i2] has been previously visited.            if (A_marker[i2 + num_Acouple_cols] != i_r) {
1247           RAP new entry is possible only if entry RA[i_r, i2] has not              A_marker[i2 + num_Acouple_cols] = i_r;
1248           been visited yet */              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1249        if (A_marker[i2] != i_r) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1250          /* first, mark entry RA[i_r,i2] as visited */                  i_c = P->mainBlock->pattern->index[j3];
1251          A_marker[i2] = i_r;                  if (P_marker[i_c] < row_marker) {
1252                      P_marker[i_c] = i;
1253          /* then loop over elements in row i2 of P_ext_main */                    i++;
1254          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];                  }
1255          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              }
1256          i_c = P->row_coupleBlock->pattern->index[j3];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1257                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1258          /* check whether entry RAP[i_r,i_c] has been created.                  /* note that we need to map the column index in
1259             If not yet created, create the entry and increase                     P->col_coupleBlock back into the column index in
1260             the total number of elements in RAP_ext */                     P_ext_couple */
1261          if (P_marker[i_c] < row_marker) {                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1262            P_marker[i_c] = i;                  if (P_marker[i_c] < row_marker_ext) {
1263            i++;                    P_marker[i_c] = j;
1264          }                    j++;
1265          }                  }
1266                }
1267          /* loop over elements in row i2 of P_ext_couple, do the same */            }
1268          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];          }
         for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         if (P_marker[i_c] < row_marker_ext) {  
           P_marker[i_c] = j;  
           j++;  
         }  
         }  
       }  
     }  
   
     /* now loop over elements in row i1 of A->mainBlock, repeat  
        the process we have done to block A->col_coupleBlock */  
     j2_ub = A->mainBlock->pattern->ptr[i1+1];  
     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
       i2 = A->mainBlock->pattern->index[j2];  
       if (A_marker[i2 + num_Acouple_cols] != i_r) {  
         A_marker[i2 + num_Acouple_cols] = i_r;  
         j3_ub = P->mainBlock->pattern->ptr[i2+1];  
         for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->mainBlock->pattern->index[j3];  
         if (P_marker[i_c] < row_marker) {  
           P_marker[i_c] = i;  
           i++;  
         }  
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         /* note that we need to map the column index in  
            P->col_coupleBlock back into the column index in  
            P_ext_couple */  
         i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         if (P_marker[i_c] < row_marker_ext) {  
           P_marker[i_c] = j;  
           j++;  
         }  
         }  
       }  
     }  
1269       }       }
1270     }     }
1271    
# Line 1287  Paso_SystemMatrix* Paso_Preconditioner_A Line 1273  Paso_SystemMatrix* Paso_Preconditioner_A
1273        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1274        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1275        RAP_couple */        RAP_couple */
1276     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_main_ptr = new index_t[num_Pmain_cols+1];
1277     RAP_main_idx = MEMALLOC(i, index_t);     RAP_main_idx = new index_t[i];
1278     RAP_main_val = TMPMEMALLOC(i * block_size, double);     RAP_main_val = new double[i * block_size];
1279     RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1280     RAP_couple_idx = MEMALLOC(j, index_t);     RAP_couple_idx = new index_t[j];
1281     RAP_couple_val = TMPMEMALLOC(j * block_size, double);     RAP_couple_val = new double[j * block_size];
1282    
1283     RAP_main_ptr[num_Pmain_cols] = i;     RAP_main_ptr[num_Pmain_cols] = i;
1284     RAP_couple_ptr[num_Pmain_cols] = j;     RAP_couple_ptr[num_Pmain_cols] = j;
# Line 1314  Paso_SystemMatrix* Paso_Preconditioner_A Line 1300  Paso_SystemMatrix* Paso_Preconditioner_A
1300       RAP_couple_ptr[i_r] = row_marker_ext;       RAP_couple_ptr[i_r] = row_marker_ext;
1301    
1302       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1303      in row i_r of RAP_ext */          in row i_r of RAP_ext */
1304       P_marker[i_r] = i;       P_marker[i_r] = i;
1305       for (ib=0; ib<block_size; ib++)       for (ib=0; ib<block_size; ib++)
1306         RAP_main_val[i*block_size+ib] = ZERO;         RAP_main_val[i*block_size+ib] = ZERO;
# Line 1322  Paso_SystemMatrix* Paso_Preconditioner_A Line 1308  Paso_SystemMatrix* Paso_Preconditioner_A
1308       i++;       i++;
1309    
1310       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
1311      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1312        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
1313          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1314            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
1315            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
1316          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
1317            P_marker[i_c] = i;                    P_marker[i_c] = i;
1318            memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));                    memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1319            RAP_main_idx[i] = i_c;                    RAP_main_idx[i] = i_c;
1320            i++;                    i++;
1321          } else {                  } else {
1322            t1_val = &(RAP_ext_val[k*block_size]);                    t1_val = &(RAP_ext_val[k*block_size]);
1323            t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);                    t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1324            for (ib=0; ib<block_size; ib++)                    for (ib=0; ib<block_size; ib++)
1325              t2_val[ib] += t1_val[ib];                      t2_val[ib] += t1_val[ib];
1326          }                  }
1327            } else {                } else {
1328          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
1329            P_marker[i_c] = j;                    P_marker[i_c] = j;
1330            memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));                    memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1331            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
1332            j++;                    j++;
1333          } else {                  } else {
1334            t1_val = &(RAP_ext_val[k*block_size]);                    t1_val = &(RAP_ext_val[k*block_size]);
1335            t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);                    t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1336            for (ib=0; ib<block_size; ib++)                    for (ib=0; ib<block_size; ib++)
1337              t2_val[ib] += t1_val[ib];                      t2_val[ib] += t1_val[ib];
1338          }                  }
1339            }                }
1340          }              }
1341          break;              break;
1342        }            }
1343      }          }
1344       }       }
1345    
1346       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
1347       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
1348       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1349      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
1350      R_val = &(R_main->val[j1*P_block_size]);          R_val = &(R_main->val[j1*P_block_size]);
1351    
1352      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
1353      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1354      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1355        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
1356            temp_val = &(A->col_coupleBlock->val[j2*block_size]);            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1357        for (irb=0; irb<row_block_size; irb++)            for (irb=0; irb<row_block_size; irb++)
1358          for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
1359          RA_val[irb+row_block_size*icb]=ZERO;                  RA_val[irb+row_block_size*icb]=ZERO;
1360            for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
1361          rtmp=R_val[irb];              rtmp=R_val[irb];
1362              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
1363                  RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1364              }              }
1365            }            }
1366    
1367    
1368        /* check whether entry RA[i_r, i2] has been previously visited.            /* check whether entry RA[i_r, i2] has been previously visited.
1369           RAP new entry is possible only if entry RA[i_r, i2] has not               RAP new entry is possible only if entry RA[i_r, i2] has not
1370           been visited yet */               been visited yet */
1371        if (A_marker[i2] != i_r) {            if (A_marker[i2] != i_r) {
1372          /* first, mark entry RA[i_r,i2] as visited */              /* first, mark entry RA[i_r,i2] as visited */
1373          A_marker[i2] = i_r;              A_marker[i2] = i_r;
1374    
1375          /* then loop over elements in row i2 of P_ext_main */              /* then loop over elements in row i2 of P_ext_main */
1376          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1377          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1378          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
1379                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1380          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1381            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1382              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1383                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1384            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1385                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1386                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1387                    }                    }
1388                  }                  }
1389    
1390    
1391          /* check whether entry RAP[i_r,i_c] has been created.                  /* check whether entry RAP[i_r,i_c] has been created.
1392             If not yet created, create the entry and increase                     If not yet created, create the entry and increase
1393             the total number of elements in RAP_ext */                     the total number of elements in RAP_ext */
1394          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
1395            P_marker[i_c] = i;                    P_marker[i_c] = i;
1396            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1397            RAP_main_idx[i] = i_c;                    RAP_main_idx[i] = i_c;
1398            i++;                    i++;
1399          } else {                  } else {
1400                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1401                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1402                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1403                    }                    }
1404          }                  }
1405          }              }
1406    
1407          /* loop over elements in row i2 of P_ext_couple, do the same */              /* loop over elements in row i2 of P_ext_couple, do the same */
1408          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1409          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1410          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1411                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1412          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1413            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1414              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1415                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1416            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1417                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1418                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1419                    }                    }
1420                  }                  }
1421    
1422          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
1423            P_marker[i_c] = j;                    P_marker[i_c] = j;
1424            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1425            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
1426            j++;                    j++;
1427          } else {                  } else {
1428                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1429                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1430                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1431                    }                    }
1432          }                  }
1433          }              }
1434    
1435        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.            /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1436           Only the contributions are added. */               Only the contributions are added. */
1437        } else {            } else {
1438          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1439          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1440          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
1441                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1442          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1443            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1444              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1445                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1446            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1447                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1448                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1449                    }                    }
# Line 1467  Paso_SystemMatrix* Paso_Preconditioner_A Line 1453  Paso_SystemMatrix* Paso_Preconditioner_A
1453                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1454                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1455                  }                  }
1456          }              }
1457          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1458          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1459          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1460                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1461          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1462            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1463              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1464                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1465            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1466                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1467                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1468                    }                    }
# Line 1486  Paso_SystemMatrix* Paso_Preconditioner_A Line 1472  Paso_SystemMatrix* Paso_Preconditioner_A
1472                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1473                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1474                  }                  }
1475          }              }
1476        }            }
1477      }          }
1478    
1479      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
1480         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
1481      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
1482      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1483        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
1484            temp_val = &(A->mainBlock->val[j2*block_size]);            temp_val = &(A->mainBlock->val[j2*block_size]);
1485        for (irb=0; irb<row_block_size; irb++)            for (irb=0; irb<row_block_size; irb++)
1486          for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
1487          RA_val[irb+row_block_size*icb]=ZERO;                  RA_val[irb+row_block_size*icb]=ZERO;
1488            for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
1489          rtmp=R_val[irb];              rtmp=R_val[irb];
1490              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
1491                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1492              }              }
1493            }            }
1494    
1495        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
1496          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
1497          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1498          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1499          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
1500                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
1501          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1502            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1503              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1504                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1505            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1506                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1507                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1508                    }                    }
1509                  }                  }
1510          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
1511            P_marker[i_c] = i;                    P_marker[i_c] = i;
1512            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1513            RAP_main_idx[i] = i_c;                    RAP_main_idx[i] = i_c;
1514            i++;                    i++;
1515          } else {                  } else {
1516                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1517                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1518                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1519                    }                    }
1520          }                  }
1521          }              }
1522          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1523          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1524          /* note that we need to map the column index in                  /* note that we need to map the column index in
1525             P->col_coupleBlock back into the column index in                     P->col_coupleBlock back into the column index in
1526             P_ext_couple */                     P_ext_couple */
1527          i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1528                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1529          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1530            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1531              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1532                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1533            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1534                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1535                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1536                    }                    }
1537                  }                  }
1538    
1539          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
1540            P_marker[i_c] = j;                    P_marker[i_c] = j;
1541            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1542            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
1543            j++;                    j++;
1544          } else {                  } else {
1545                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1546                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1547                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1548                    }                    }
1549          }                  }
1550          }              }
1551    
1552        } else {            } else {
1553          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1554          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1555          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
1556                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
1557          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1558            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1559              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1560                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1561            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1562                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1563                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1564                    }                    }
# Line 1582  Paso_SystemMatrix* Paso_Preconditioner_A Line 1568  Paso_SystemMatrix* Paso_Preconditioner_A
1568                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1569                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1570                  }                  }
1571          }              }
1572          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1573          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1574          i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1575                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1576          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1577            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1578              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1579                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1580            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
1581                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1582                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1583                    }                    }
# Line 1601  Paso_SystemMatrix* Paso_Preconditioner_A Line 1587  Paso_SystemMatrix* Paso_Preconditioner_A
1587                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1588                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1589                  }                  }
1590          }              }
1591        }            }
1592      }          }
1593       }       }
1594    
1595       /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */       /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1596       if (i > row_marker) {       if (i > row_marker) {
1597      offset = i - row_marker;          offset = i - row_marker;
1598      temp = TMPMEMALLOC(offset, index_t);          temp = new index_t[offset];
1599      #pragma omp parallel for schedule(static) private(iptr)          #pragma omp parallel for schedule(static) private(iptr)
1600      for (iptr=0; iptr<offset; iptr++)          for (iptr=0; iptr<offset; iptr++)
1601        temp[iptr] = RAP_main_idx[row_marker+iptr];            temp[iptr] = RAP_main_idx[row_marker+iptr];
1602      if (offset > 0) {          if (offset > 0) {
1603        #ifdef USE_QSORTG              qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1604          qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);          }
1605        #else          temp_val = new double[offset * block_size];
1606          qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);          #pragma omp parallel for schedule(static) private(iptr,k)
1607        #endif          for (iptr=0; iptr<offset; iptr++){
1608      }            k = P_marker[temp[iptr]];
1609      temp_val = TMPMEMALLOC(offset * block_size, double);            memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1610      #pragma omp parallel for schedule(static) private(iptr,k)            P_marker[temp[iptr]] = iptr + row_marker;
1611      for (iptr=0; iptr<offset; iptr++){          }
1612        k = P_marker[temp[iptr]];          #pragma omp parallel for schedule(static) private(iptr)
1613        memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));          for (iptr=0; iptr<offset; iptr++){
1614        P_marker[temp[iptr]] = iptr + row_marker;            RAP_main_idx[row_marker+iptr] = temp[iptr];
1615      }            memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1616      #pragma omp parallel for schedule(static) private(iptr)          }
1617      for (iptr=0; iptr<offset; iptr++){          delete[] temp;
1618        RAP_main_idx[row_marker+iptr] = temp[iptr];          delete[] temp_val;
       memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));  
     }  
     TMPMEMFREE(temp);  
     TMPMEMFREE(temp_val);  
1619       }       }
1620       if (j > row_marker_ext) {       if (j > row_marker_ext) {
1621          offset = j - row_marker_ext;          offset = j - row_marker_ext;
1622          temp = TMPMEMALLOC(offset, index_t);          temp = new index_t[offset];
1623      #pragma omp parallel for schedule(static) private(iptr)          #pragma omp parallel for schedule(static) private(iptr)
1624          for (iptr=0; iptr<offset; iptr++)          for (iptr=0; iptr<offset; iptr++)
1625            temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];            temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1626          if (offset > 0) {          if (offset > 0) {
1627            #ifdef USE_QSORTG              qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
             qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);  
           #else  
             qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);  
           #endif  
1628          }          }
1629          temp_val = TMPMEMALLOC(offset * block_size, double);          temp_val = new double[offset * block_size];
1630      #pragma omp parallel for schedule(static) private(iptr, k)          #pragma omp parallel for schedule(static) private(iptr, k)
1631          for (iptr=0; iptr<offset; iptr++){          for (iptr=0; iptr<offset; iptr++){
1632            k = P_marker[temp[iptr] + num_Pmain_cols];            k = P_marker[temp[iptr] + num_Pmain_cols];
1633        memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));            memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1634            P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;            P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1635          }          }
1636      #pragma omp parallel for schedule(static) private(iptr)          #pragma omp parallel for schedule(static) private(iptr)
1637          for (iptr=0; iptr<offset; iptr++){          for (iptr=0; iptr<offset; iptr++){
1638            RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];            RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1639        memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));            memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1640          }          }
1641          TMPMEMFREE(temp);          delete[] temp;
1642          TMPMEMFREE(temp_val);          delete[] temp_val;
1643       }       }
1644     }     }
1645    
1646     TMPMEMFREE(RA_val);     delete[] RA_val;
1647     TMPMEMFREE(RAP_val);     delete[] RAP_val;
1648     TMPMEMFREE(A_marker);     delete[] A_marker;
1649     TMPMEMFREE(Pext_to_RAP);     delete[] Pext_to_RAP;
1650     TMPMEMFREE(Pcouple_to_RAP);     delete[] Pcouple_to_RAP;
1651     MEMFREE(RAP_ext_ptr);     delete[] RAP_ext_ptr;
1652     MEMFREE(RAP_ext_idx);     delete[] RAP_ext_idx;
1653     MEMFREE(RAP_ext_val);     delete[] RAP_ext_val;
1654     Paso_SparseMatrix_free(R_main);     R_main.reset();
1655     Paso_SparseMatrix_free(R_couple);     R_couple.reset();
1656    
1657     /* Check whether there are empty columns in RAP_couple */     /* Check whether there are empty columns in RAP_couple */
1658     #pragma omp parallel for schedule(static) private(i)     #pragma omp parallel for schedule(static) private(i)
# Line 1685  Paso_SystemMatrix* Paso_Preconditioner_A Line 1663  Paso_SystemMatrix* Paso_Preconditioner_A
1663     for (i=0; i<j; i++) {     for (i=0; i<j; i++) {
1664       i1 = RAP_couple_idx[i];       i1 = RAP_couple_idx[i];
1665       if (P_marker[i1]) {       if (P_marker[i1]) {
1666      P_marker[i1] = 0;          P_marker[i1] = 0;
1667      k++;          k++;
1668       }       }
1669     }     }
1670    
1671     /* empty columns is found */     /* empty columns is found */
1672     if (k < num_RAPext_cols) {     if (k < num_RAPext_cols) {
1673       temp = TMPMEMALLOC(k, index_t);       temp = new index_t[k];
1674       k = 0;       k = 0;
1675       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1676      if (!P_marker[i]) {          if (!P_marker[i]) {
1677        P_marker[i] = k;            P_marker[i] = k;
1678        temp[k] = global_id_RAP[i];            temp[k] = global_id_RAP[i];
1679        k++;            k++;
1680      }          }
1681       #pragma omp parallel for schedule(static) private(i, i1)       #pragma omp parallel for schedule(static) private(i, i1)
1682       for (i=0; i<j; i++) {       for (i=0; i<j; i++) {
1683      i1 = RAP_couple_idx[i];          i1 = RAP_couple_idx[i];
1684      RAP_couple_idx[i] = P_marker[i1];          RAP_couple_idx[i] = P_marker[i1];
1685       }       }
1686       num_RAPext_cols = k;       num_RAPext_cols = k;
1687       TMPMEMFREE(global_id_RAP);       delete[] global_id_RAP;
1688       global_id_RAP = temp;       global_id_RAP = temp;
1689     }     }
1690     TMPMEMFREE(P_marker);     delete[] P_marker;
1691    
1692     /******************************************************/     /******************************************************/
1693     /* Start to create the coasre level System Matrix A_c */     /* Start to create the coarse level System Matrix A_c */
1694     /******************************************************/     /******************************************************/
1695     /* first, prepare the sender/receiver for the col_connector */     /* first, prepare the sender/receiver for the col_connector */
1696     dist = P->pattern->input_distribution->first_component;     dist = P->pattern->input_distribution->first_component;
1697     recv_len = TMPMEMALLOC(size, dim_t);     recv_len = new dim_t[size];
1698     send_len = TMPMEMALLOC(size, dim_t);     send_len = new dim_t[size];
1699     neighbor = TMPMEMALLOC(size, Esys_MPI_rank);     neighbor = new Esys_MPI_rank[size];
1700     offsetInShared = TMPMEMALLOC(size+1, index_t);     offsetInShared = new index_t[size+1];
1701     shared = TMPMEMALLOC(num_RAPext_cols, index_t);     shared = new index_t[num_RAPext_cols];
1702     memset(recv_len, 0, sizeof(dim_t) * size);     memset(recv_len, 0, sizeof(dim_t) * size);
1703     memset(send_len, 0, sizeof(dim_t) * size);     memset(send_len, 0, sizeof(dim_t) * size);
1704     num_neighbors = 0;     num_neighbors = 0;
# Line 1728  Paso_SystemMatrix* Paso_Preconditioner_A Line 1706  Paso_SystemMatrix* Paso_Preconditioner_A
1706     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1707       shared[i] = i + num_Pmain_cols;       shared[i] = i + num_Pmain_cols;
1708       if (k <= global_id_RAP[i]) {       if (k <= global_id_RAP[i]) {
1709      if (recv_len[j] > 0) {          if (recv_len[j] > 0) {
1710        neighbor[num_neighbors] = j;            neighbor[num_neighbors] = j;
1711        num_neighbors ++;            num_neighbors ++;
1712        offsetInShared[num_neighbors] = i;            offsetInShared[num_neighbors] = i;
1713      }          }
1714      while (k <= global_id_RAP[i]) {          while (k <= global_id_RAP[i]) {
1715        j++;            j++;
1716        k = dist[j+1];            k = dist[j+1];
1717      }          }
1718       }       }
1719       recv_len[j] ++;       recv_len[j] ++;
1720     }     }
# Line 1745  Paso_SystemMatrix* Paso_Preconditioner_A Line 1723  Paso_SystemMatrix* Paso_Preconditioner_A
1723       num_neighbors ++;       num_neighbors ++;
1724       offsetInShared[num_neighbors] = i;       offsetInShared[num_neighbors] = i;
1725     }     }
1726     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,  
1727              neighbor, shared, offsetInShared, 1, 0, mpi_info);     paso::SharedComponents_ptr recv(new paso::SharedComponents(
1728                   num_Pmain_cols, num_neighbors, neighbor, shared,
1729                   offsetInShared, 1, 0, mpi_info));
1730    
1731     #ifdef ESYS_MPI     #ifdef ESYS_MPI
1732     MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);     MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1733     #endif     #endif
1734    
1735     #ifdef ESYS_MPI     #ifdef ESYS_MPI
1736       mpi_requests=TMPMEMALLOC(size*2,MPI_Request);       mpi_requests=new MPI_Request[size*2];
1737       mpi_stati=TMPMEMALLOC(size*2,MPI_Status);       mpi_stati=new MPI_Status[size*2];
1738     #else     #else
1739       mpi_requests=TMPMEMALLOC(size*2,int);       mpi_requests=new int[size*2];
1740       mpi_stati=TMPMEMALLOC(size*2,int);       mpi_stati=new int[size*2];
1741     #endif     #endif
1742     num_neighbors = 0;     num_neighbors = 0;
1743     j = 0;     j = 0;
1744     offsetInShared[0] = 0;     offsetInShared[0] = 0;
1745     for (i=0; i<size; i++) {     for (i=0; i<size; i++) {
1746       if (send_len[i] > 0) {       if (send_len[i] > 0) {
1747      neighbor[num_neighbors] = i;          neighbor[num_neighbors] = i;
1748      num_neighbors ++;          num_neighbors ++;
1749      j += send_len[i];          j += send_len[i];
1750      offsetInShared[num_neighbors] = j;          offsetInShared[num_neighbors] = j;
1751       }       }
1752     }     }
1753     TMPMEMFREE(shared);     delete[] shared;
1754     shared = TMPMEMALLOC(j, index_t);     shared = new index_t[j];
1755     for (i=0, j=0; i<num_neighbors; i++) {     for (i=0, j=0; i<num_neighbors; i++) {
1756       k = neighbor[i];       k = neighbor[i];
1757       #ifdef ESYS_MPI       #ifdef ESYS_MPI
1758       MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,       MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1759          mpi_info->msg_tag_counter+k,                  mpi_info->msg_tag_counter+k,
1760          mpi_info->comm, &mpi_requests[i]);                  mpi_info->comm, &mpi_requests[i]);
1761       #endif       #endif
1762       j += send_len[k];       j += send_len[k];
1763     }     }
# Line 1785  Paso_SystemMatrix* Paso_Preconditioner_A Line 1765  Paso_SystemMatrix* Paso_Preconditioner_A
1765       k = recv->neighbor[i];       k = recv->neighbor[i];
1766       #ifdef ESYS_MPI       #ifdef ESYS_MPI
1767       MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,       MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1768          mpi_info->msg_tag_counter+rank,                  mpi_info->msg_tag_counter+rank,
1769          mpi_info->comm, &mpi_requests[i+num_neighbors]);                  mpi_info->comm, &mpi_requests[i+num_neighbors]);
1770       #endif       #endif
1771       j += recv_len[k];       j += recv_len[k];
1772     }     }
1773     #ifdef ESYS_MPI     #ifdef ESYS_MPI
1774     MPI_Waitall(num_neighbors + recv->numNeighbors,     MPI_Waitall(num_neighbors + recv->numNeighbors,
1775          mpi_requests, mpi_stati);                  mpi_requests, mpi_stati);
1776     #endif     #endif
1777     mpi_info->msg_tag_counter += size;     ESYS_MPI_INC_COUNTER(*mpi_info, size);
1778    
1779     j = offsetInShared[num_neighbors];     j = offsetInShared[num_neighbors];
1780     offset = dist[rank];     offset = dist[rank];
1781     #pragma omp parallel for schedule(static) private(i)     #pragma omp parallel for schedule(static) private(i)
1782     for (i=0; i<j; i++) shared[i] = shared[i] - offset;     for (i=0; i<j; i++) shared[i] = shared[i] - offset;
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,  
             neighbor, shared, offsetInShared, 1, 0, mpi_info);  
1783    
1784     col_connector = Paso_Connector_alloc(send, recv);     paso::SharedComponents_ptr send(new paso::SharedComponents(
1785     TMPMEMFREE(shared);                 num_Pmain_cols, num_neighbors, neighbor, shared,
1786     Paso_SharedComponents_free(recv);                 offsetInShared, 1, 0, mpi_info));
1787     Paso_SharedComponents_free(send);  
1788       col_connector.reset(new paso::Connector(send, recv));
1789       delete[] shared;
1790    
1791     /* now, create row distribution (output_distri) and col     /* now, create row distribution (output_distri) and col
1792        distribution (input_distribution) */        distribution (input_distribution) */
1793     input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);     input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1794     output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);     output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1795    
1796     /* then, prepare the sender/receiver for the row_connector, first, prepare     /* then, prepare the sender/receiver for the row_connector, first, prepare
1797        the information for sender */        the information for sender */
1798     sum = RAP_couple_ptr[num_Pmain_cols];     sum = RAP_couple_ptr[num_Pmain_cols];
1799     len = TMPMEMALLOC(size, dim_t);     len = new dim_t[size];
1800     send_ptr = TMPMEMALLOC(size, index_t*);     send_ptr = new index_t*[size];
1801     send_idx = TMPMEMALLOC(size, index_t*);     send_idx = new index_t*[size];
1802     #pragma omp parallel for schedule(static) private(i)     #pragma omp parallel for schedule(static) private(i)
1803     for (i=0; i<size; i++) {     for (i=0; i<size; i++) {
1804       send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);       send_ptr[i] = new index_t[num_Pmain_cols];
1805       send_idx[i] = TMPMEMALLOC(sum, index_t);       send_idx[i] = new index_t[sum];
1806       memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);       memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1807     }     }
1808     memset(len, 0, sizeof(dim_t) * size);     memset(len, 0, sizeof(dim_t) * size);
# Line 1832  Paso_SystemMatrix* Paso_Preconditioner_A Line 1812  Paso_SystemMatrix* Paso_Preconditioner_A
1812       i1 = RAP_couple_ptr[i_r];       i1 = RAP_couple_ptr[i_r];
1813       i2 = RAP_couple_ptr[i_r+1];       i2 = RAP_couple_ptr[i_r+1];
1814       if (i2 > i1) {       if (i2 > i1) {
1815      /* then row i_r will be in the sender of row_connector, now check          /* then row i_r will be in the sender of row_connector, now check
1816         how many neighbors i_r needs to be send to */             how many neighbours i_r needs to be send to */
1817      for (j=i1; j<i2; j++) {          for (j=i1; j<i2; j++) {
1818        i_c = RAP_couple_idx[j];            i_c = RAP_couple_idx[j];
1819        /* find out the corresponding neighbor "p" of column i_c */            /* find out the corresponding neighbor "p" of column i_c */
1820        for (p=0; p<recv->numNeighbors; p++) {            for (p=0; p<recv->numNeighbors; p++) {
1821          if (i_c < recv->offsetInShared[p+1]) {              if (i_c < recv->offsetInShared[p+1]) {
1822            k = recv->neighbor[p];                k = recv->neighbor[p];
1823            if (send_ptr[k][i_r] == 0) sum++;                if (send_ptr[k][i_r] == 0) sum++;
1824            send_ptr[k][i_r] ++;                send_ptr[k][i_r] ++;
1825            send_idx[k][len[k]] = global_id_RAP[i_c];                send_idx[k][len[k]] = global_id_RAP[i_c];
1826            len[k] ++;                len[k] ++;
1827            break;                break;
1828          }              }
1829        }            }
1830      }          }
1831       }       }
1832     }     }
1833     if (global_id_RAP) {     if (global_id_RAP) {
1834       TMPMEMFREE(global_id_RAP);       delete[] global_id_RAP;
1835       global_id_RAP = NULL;       global_id_RAP = NULL;
1836     }     }
1837    
1838     /* now allocate the sender */     /* now allocate the sender */
1839     shared = TMPMEMALLOC(sum, index_t);     shared = new index_t[sum];
1840     memset(send_len, 0, sizeof(dim_t) * size);     memset(send_len, 0, sizeof(dim_t) * size);
1841     num_neighbors=0;     num_neighbors=0;
1842     offsetInShared[0] = 0;     offsetInShared[0] = 0;
1843     for (p=0, k=0; p<size; p++) {     for (p=0, k=0; p<size; p++) {
1844       for (i=0; i<num_Pmain_cols; i++) {       for (i=0; i<num_Pmain_cols; i++) {
1845      if (send_ptr[p][i] > 0) {          if (send_ptr[p][i] > 0) {
1846        shared[k] = i;            shared[k] = i;
1847        k++;            k++;
1848        send_ptr[p][send_len[p]] = send_ptr[p][i];            send_ptr[p][send_len[p]] = send_ptr[p][i];
1849        send_len[p]++;            send_len[p]++;
1850      }          }
1851       }       }
1852       if (k > offsetInShared[num_neighbors]) {       if (k > offsetInShared[num_neighbors]) {
1853      neighbor[num_neighbors] = p;          neighbor[num_neighbors] = p;
1854      num_neighbors ++;          num_neighbors ++;
1855      offsetInShared[num_neighbors] = k;          offsetInShared[num_neighbors] = k;
1856       }       }
1857     }     }
1858     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,     send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1859                          neighbor, shared, offsetInShared, 1, 0, mpi_info);                 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1860    
1861     /* send/recv number of rows will be sent from current proc     /* send/recv number of rows will be sent from current proc
1862        recover info for the receiver of row_connector from the sender */        recover info for the receiver of row_connector from the sender */
# Line 1888  Paso_SystemMatrix* Paso_Preconditioner_A Line 1868  Paso_SystemMatrix* Paso_Preconditioner_A
1868     j = 0;     j = 0;
1869     for (i=0; i<size; i++) {     for (i=0; i<size; i++) {
1870       if (i != rank && recv_len[i] > 0) {       if (i != rank && recv_len[i] > 0) {
1871      neighbor[num_neighbors] = i;          neighbor[num_neighbors] = i;
1872      num_neighbors ++;          num_neighbors ++;
1873      j += recv_len[i];          j += recv_len[i];
1874      offsetInShared[num_neighbors] = j;          offsetInShared[num_neighbors] = j;
1875       }       }
1876     }     }
1877     TMPMEMFREE(shared);     delete[] shared;
1878     TMPMEMFREE(recv_len);     delete[] recv_len;
1879     shared = TMPMEMALLOC(j, index_t);     shared = new index_t[j];
1880     k = offsetInShared[num_neighbors];     k = offsetInShared[num_neighbors];
1881     #pragma omp parallel for schedule(static) private(i)     #pragma omp parallel for schedule(static) private(i)
1882     for (i=0; i<k; i++) {     for (i=0; i<k; i++) {
1883       shared[i] = i + num_Pmain_cols;       shared[i] = i + num_Pmain_cols;
1884     }     }
1885     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,     recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1886                          neighbor, shared, offsetInShared, 1, 0, mpi_info);                 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1887     row_connector = Paso_Connector_alloc(send, recv);     row_connector.reset(new paso::Connector(send, recv));
1888     TMPMEMFREE(shared);     delete[] shared;
    Paso_SharedComponents_free(recv);  
    Paso_SharedComponents_free(send);  
1889    
1890     /* send/recv pattern->ptr for rowCoupleBlock */     /* send/recv pattern->ptr for rowCoupleBlock */
1891     num_RAPext_rows = offsetInShared[num_neighbors];     num_RAPext_rows = offsetInShared[num_neighbors];
1892     row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);     row_couple_ptr = new index_t[num_RAPext_rows+1];
1893     for (p=0; p<num_neighbors; p++) {     for (p=0; p<num_neighbors; p++) {
1894       j = offsetInShared[p];       j = offsetInShared[p];
1895       i = offsetInShared[p+1];       i = offsetInShared[p+1];
1896       #ifdef ESYS_MPI       #ifdef ESYS_MPI
1897       MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],       MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1898          mpi_info->msg_tag_counter+neighbor[p],                  mpi_info->msg_tag_counter+neighbor[p],
1899          mpi_info->comm, &mpi_requests[p]);                  mpi_info->comm, &mpi_requests[p]);
1900       #endif       #endif
1901     }     }
1902     send = row_connector->send;     send = row_connector->send;
1903     for (p=0; p<send->numNeighbors; p++) {     for (p=0; p<send->numNeighbors; p++) {
1904       #ifdef ESYS_MPI       #ifdef ESYS_MPI
1905       MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],       MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1906          MPI_INT, send->neighbor[p],                  MPI_INT, send->neighbor[p],
1907          mpi_info->msg_tag_counter+rank,                  mpi_info->msg_tag_counter+rank,
1908          mpi_info->comm, &mpi_requests[p+num_neighbors]);                  mpi_info->comm, &mpi_requests[p+num_neighbors]);
1909       #endif       #endif
1910     }     }
1911     #ifdef ESYS_MPI     #ifdef ESYS_MPI
1912     MPI_Waitall(num_neighbors + send->numNeighbors,     MPI_Waitall(num_neighbors + send->numNeighbors,
1913      mpi_requests, mpi_stati);          mpi_requests, mpi_stati);
1914     #endif     #endif
1915     mpi_info->msg_tag_counter += size;     ESYS_MPI_INC_COUNTER(*mpi_info, size);
1916     TMPMEMFREE(send_len);     delete[] send_len;
1917    
1918     sum = 0;     sum = 0;
1919     for (i=0; i<num_RAPext_rows; i++) {     for (i=0; i<num_RAPext_rows; i++) {
# Line 1947  Paso_SystemMatrix* Paso_Preconditioner_A Line 1925  Paso_SystemMatrix* Paso_Preconditioner_A
1925    
1926     /* send/recv pattern->index for rowCoupleBlock */     /* send/recv pattern->index for rowCoupleBlock */
1927     k = row_couple_ptr[num_RAPext_rows];     k = row_couple_ptr[num_RAPext_rows];
1928     row_couple_idx = MEMALLOC(k, index_t);     row_couple_idx = new index_t[k];
1929     for (p=0; p<num_neighbors; p++) {     for (p=0; p<num_neighbors; p++) {
1930       j1 = row_couple_ptr[offsetInShared[p]];       j1 = row_couple_ptr[offsetInShared[p]];
1931       j2 = row_couple_ptr[offsetInShared[p+1]];       j2 = row_couple_ptr[offsetInShared[p+1]];
1932       #ifdef ESYS_MPI       #ifdef ESYS_MPI
1933       MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],       MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1934          mpi_info->msg_tag_counter+neighbor[p],                  mpi_info->msg_tag_counter+neighbor[p],
1935          mpi_info->comm, &mpi_requests[p]);                  mpi_info->comm, &mpi_requests[p]);
1936       #endif       #endif
1937     }     }
1938     for (p=0; p<send->numNeighbors; p++) {     for (p=0; p<send->numNeighbors; p++) {
1939       #ifdef ESYS_MPI       #ifdef ESYS_MPI
1940       MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],       MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1941          MPI_INT, send->neighbor[p],                  MPI_INT, send->neighbor[p],
1942          mpi_info->msg_tag_counter+rank,                  mpi_info->msg_tag_counter+rank,
1943          mpi_info->comm, &mpi_requests[p+num_neighbors]);                  mpi_info->comm, &mpi_requests[p+num_neighbors]);
1944       #endif       #endif
1945     }     }
1946     #ifdef ESYS_MPI     #ifdef ESYS_MPI
1947     MPI_Waitall(num_neighbors + send->numNeighbors,     MPI_Waitall(num_neighbors + send->numNeighbors,
1948          mpi_requests, mpi_stati);                  mpi_requests, mpi_stati);
1949     #endif     #endif
1950     mpi_info->msg_tag_counter += size;     ESYS_MPI_INC_COUNTER(*mpi_info, size);
1951    
1952     offset = input_dist->first_component[rank];      offset = input_dist->first_component[rank];
1953     k = row_couple_ptr[num_RAPext_rows];      k = row_couple_ptr[num_RAPext_rows];
1954     #pragma omp parallel for schedule(static) private(i)      #pragma omp parallel for schedule(static) private(i)
1955     for (i=0; i<k; i++) {      for (i=0; i<k; i++) {
1956       row_couple_idx[i] -= offset;          row_couple_idx[i] -= offset;
1957     }      }
1958     #pragma omp parallel for schedule(static) private(i)      #pragma omp parallel for schedule(static) private(i)
1959     for (i=0; i<size; i++) {      for (i=0; i<size; i++) {
1960       TMPMEMFREE(send_ptr[i]);          delete[] send_ptr[i];
1961       TMPMEMFREE(send_idx[i]);          delete[] send_idx[i];
1962     }      }
1963     TMPMEMFREE(send_ptr);      delete[] send_ptr;
1964     TMPMEMFREE(send_idx);      delete[] send_idx;
1965     TMPMEMFREE(len);      delete[] len;
1966     TMPMEMFREE(offsetInShared);      delete[] offsetInShared;
1967     TMPMEMFREE(neighbor);      delete[] neighbor;
1968     TMPMEMFREE(mpi_requests);      delete[] mpi_requests;
1969     TMPMEMFREE(mpi_stati);      delete[] mpi_stati;
1970     Esys_MPIInfo_free(mpi_info);      Esys_MPIInfo_free(mpi_info);
1971    
1972     /* Now, we can create pattern for mainBlock and coupleBlock */      /* Now, we can create pattern for mainBlock and coupleBlock */
1973     main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,      paso::Pattern_ptr main_pattern(new paso::Pattern(MATRIX_FORMAT_DEFAULT,
1974              num_Pmain_cols, RAP_main_ptr, RAP_main_idx);                 num_Pmain_cols, num_Pmain_cols, RAP_main_ptr, RAP_main_idx));
1975     col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,      paso::Pattern_ptr col_couple_pattern(new paso::Pattern(
1976              num_Pmain_cols, num_RAPext_cols,                 MATRIX_FORMAT_DEFAULT, num_Pmain_cols, num_RAPext_cols,
1977              RAP_couple_ptr, RAP_couple_idx);                 RAP_couple_ptr, RAP_couple_idx));
1978     row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,      paso::Pattern_ptr row_couple_pattern(new paso::Pattern(
1979              num_RAPext_rows, num_Pmain_cols,                 MATRIX_FORMAT_DEFAULT, num_RAPext_rows, num_Pmain_cols,
1980              row_couple_ptr, row_couple_idx);                 row_couple_ptr, row_couple_idx));
1981    
1982     /* next, create the system matrix */      /* next, create the system matrix */
1983     pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,      pattern.reset(new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1984                  output_dist, input_dist, main_pattern, col_couple_pattern,                  output_dist, input_dist, main_pattern, col_couple_pattern,
1985                  row_couple_pattern, col_connector, row_connector);                  row_couple_pattern, col_connector, row_connector));
1986     out = Paso_SystemMatrix_alloc(A->type, pattern,      out.reset(new paso::SystemMatrix(A->type, pattern, row_block_size,
1987                  row_block_size, col_block_size, FALSE);                                      col_block_size, false));
1988    
1989     /* finally, fill in the data*/      /* finally, fill in the data*/
1990     memcpy(out->mainBlock->val, RAP_main_val,      memcpy(out->mainBlock->val, RAP_main_val,
1991          out->mainBlock->len* sizeof(double));                  out->mainBlock->len* sizeof(double));
1992     memcpy(out->col_coupleBlock->val, RAP_couple_val,      memcpy(out->col_coupleBlock->val, RAP_couple_val,
1993          out->col_coupleBlock->len * sizeof(double));                  out->col_coupleBlock->len * sizeof(double));
1994    
1995     /* Clean up */      delete[] RAP_main_val;
1996     Paso_SystemMatrixPattern_free(pattern);      delete[] RAP_couple_val;
1997     Paso_Pattern_free(main_pattern);      if (!Esys_noError()) {
1998     Paso_Pattern_free(col_couple_pattern);          out.reset();
1999     Paso_Pattern_free(row_couple_pattern);      }
2000     Paso_Connector_free(col_connector);      return out;
    Paso_Connector_free(row_connector);  
    Paso_Distribution_free(output_dist);  
    Paso_Distribution_free(input_dist);  
    TMPMEMFREE(RAP_main_val);  
    TMPMEMFREE(RAP_couple_val);  
    if (Esys_noError()) {  
      return out;  
    } else {  
      Paso_SystemMatrix_free(out);  
      return NULL;  
    }  
2001  }  }
2002    
2003    
2004  Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(  paso::SystemMatrix_ptr Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2005      Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)          paso::SystemMatrix_ptr A, paso::SystemMatrix_ptr P,
2006            paso::SystemMatrix_ptr R)
2007  {  {
2008     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2009     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;     paso::SystemMatrix_ptr out;
2010     Paso_SystemMatrix *out=NULL;     paso::SystemMatrixPattern_ptr pattern;
2011     Paso_SystemMatrixPattern *pattern=NULL;     paso::Distribution_ptr input_dist, output_dist;
2012     Paso_Distribution *input_dist=NULL, *output_dist=NULL;     paso::SharedComponents_ptr send, recv;
2013     Paso_SharedComponents *send =NULL, *recv=NULL;     paso::Connector_ptr col_connector, row_connector;
    Paso_Connector *col_connector=NULL, *row_connector=NULL;  
    Paso_Pattern *main_pattern=NULL;  
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;  
2014     const dim_t row_block_size=A->row_block_size;     const dim_t row_block_size=A->row_block_size;
2015     const dim_t col_block_size=A->col_block_size;     const dim_t col_block_size=A->col_block_size;
2016     const dim_t block_size = A->block_size;     const dim_t block_size = A->block_size;
# Line 2075  Paso_SystemMatrix* Paso_Preconditioner_A Line 2040  Paso_SystemMatrix* Paso_Preconditioner_A
2040       int *mpi_requests=NULL, *mpi_stati=NULL;       int *mpi_requests=NULL, *mpi_stati=NULL;
2041     #endif     #endif
2042    
2043     /* two sparse matrixes R_main and R_couple will be generate, as the     /* two sparse matrices R_main and R_couple will be generate, as the
2044        transpose of P_main and P_col_couple, respectively. Note that,        transpose of P_main and P_col_couple, respectively. Note that,
2045        R_couple is actually the row_coupleBlock of R (R=P^T) */        R_couple is actually the row_coupleBlock of R (R=P^T) */
2046     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);     paso::SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
2047     R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);     paso::SparseMatrix_ptr R_couple(P->col_coupleBlock->getTranspose());
2048    
2049     /* generate P_ext, i.e. portion of P that is tored on neighbor procs     /* generate P_ext, i.e. portion of P that is stored on neighbor procs
2050        and needed locally for triple matrix product RAP        and needed locally for triple matrix product RAP
2051        to be specific, P_ext (on processor k) are group of rows in P, where        to be specific, P_ext (on processor k) are group of rows in P, where
2052        the list of rows from processor q is given by        the list of rows from processor q is given by
2053      A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
2054      rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2055        on q.        on q.
2056        P_ext is represented by two sparse matrixes P_ext_main and        P_ext is represented by two sparse matrices P_ext_main and
2057        P_ext_couple */        P_ext_couple */
2058     Paso_Preconditioner_AMG_extendB(A, P);     Paso_Preconditioner_AMG_extendB(A, P);
2059    
# Line 2105  Paso_SystemMatrix* Paso_Preconditioner_A Line 2070  Paso_SystemMatrix* Paso_Preconditioner_A
2070     num_Pext_cols = 0;     num_Pext_cols = 0;
2071     if (P->global_id) {     if (P->global_id) {
2072       /* first count the number of cols "num_Pext_cols" in both P_ext_couple       /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2073      and P->col_coupleBlock */          and P->col_coupleBlock */
2074       iptr = 0;       iptr = 0;
2075       if (num_Pcouple_cols || sum > 0) {       if (num_Pcouple_cols || sum > 0) {
2076      temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);          temp = new index_t[num_Pcouple_cols+sum];
2077      for (; iptr<sum; iptr++){          for (; iptr<sum; iptr++){
2078        temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];            temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2079      }          }
2080      for (j=0; j<num_Pcouple_cols; j++, iptr++){          for (j=0; j<num_Pcouple_cols; j++, iptr++){
2081        temp[iptr] = P->global_id[j];            temp[iptr] = P->global_id[j];
2082      }          }
2083       }       }
2084       if (iptr) {       if (iptr) {
2085      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2086        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_Pext_cols = 1;
2087      #else          i = temp[0];
2088        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
2089      #endif            if (temp[j] > i) {
2090      num_Pext_cols = 1;              i = temp[j];
2091      i = temp[0];              temp[num_Pext_cols++] = i;
2092      for (j=1; j<iptr; j++) {            }
2093        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_Pext_cols++] = i;  
       }  
     }  
2094       }       }
2095    
2096       /* resize the pattern of P_ext_couple */       /* resize the pattern of P_ext_couple */
2097       if(num_Pext_cols){       if(num_Pext_cols){
2098      global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);          global_id_P = new index_t[num_Pext_cols];
2099      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
2100        global_id_P[i] = temp[i];            global_id_P[i] = temp[i];
2101       }       }
2102       if (num_Pcouple_cols || sum > 0)       if (num_Pcouple_cols || sum > 0)
2103      TMPMEMFREE(temp);          delete[] temp;
2104       for (i=0; i<sum; i++) {       for (i=0; i<sum; i++) {
2105      where_p = (index_t *)bsearch(          where_p = (index_t *)bsearch(
2106              &(P->remote_coupleBlock->pattern->index[i]),                          &(P->remote_coupleBlock->pattern->index[i]),
2107              global_id_P, num_Pext_cols,                          global_id_P, num_Pext_cols,
2108                          sizeof(index_t), Paso_comparIndex);                          sizeof(index_t), paso::comparIndex);
2109      P->remote_coupleBlock->pattern->index[i] =          P->remote_coupleBlock->pattern->index[i] =
2110              (index_t)(where_p -global_id_P);                          (index_t)(where_p -global_id_P);
2111       }         }  
2112    
2113       /* build the mapping */       /* build the mapping */
2114       if (num_Pcouple_cols) {       if (num_Pcouple_cols) {
2115      Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);          Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2116      iptr = 0;          iptr = 0;
2117      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
2118        if (global_id_P[i] == P->global_id[iptr]) {            if (global_id_P[i] == P->global_id[iptr]) {
2119          Pcouple_to_Pext[iptr++] = i;              Pcouple_to_Pext[iptr++] = i;
2120          if (iptr == num_Pcouple_cols) break;              if (iptr == num_Pcouple_cols) break;
2121        }            }
2122       }       }
2123     }     }
2124    
2125     /* alloc and initialize the makers */     /* alloc and initialise the makers */
2126     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
2127     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
2128     A_marker = TMPMEMALLOC(num_A_cols, index_t);     A_marker = new index_t[num_A_cols];
2129     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2130     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
2131     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
# Line 2177  Paso_SystemMatrix* Paso_Preconditioner_A Line 2138  Paso_SystemMatrix* Paso_Preconditioner_A
2138       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
2139       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
2140       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2141      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
2142      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
2143      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2144      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2145        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
2146    
2147        /* check whether entry RA[i_r, i2] has been previously visited.            /* check whether entry RA[i_r, i2] has been previously visited.
2148           RAP new entry is possible only if entry RA[i_r, i2] has not               RAP new entry is possible only if entry RA[i_r, i2] has not
2149           been visited yet */               been visited yet */
2150        if (A_marker[i2] != i_r) {            if (A_marker[i2] != i_r) {
2151          /* first, mark entry RA[i_r,i2] as visited */              /* first, mark entry RA[i_r,i2] as visited */
2152          A_marker[i2] = i_r;              A_marker[i2] = i_r;
2153    
2154          /* then loop over elements in row i2 of P_ext_main */              /* then loop over elements in row i2 of P_ext_main */
2155          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2156          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2157          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
2158    
2159          /* check whether entry RAP[i_r,i_c] has been created.                  /* check whether entry RAP[i_r,i_c] has been created.
2160             If not yet created, create the entry and increase                     If not yet created, create the entry and increase
2161             the total number of elements in RAP_ext */                     the total number of elements in RAP_ext */
2162          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2163            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2164            sum++;                    sum++;
2165          }                  }
2166          }              }
2167    
2168          /* loop over elements in row i2 of P_ext_couple, do the same */              /* loop over elements in row i2 of P_ext_couple, do the same */
2169          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2170          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2171          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2172          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2173            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2174            sum++;                    sum++;
2175          }                  }
2176          }              }
2177        }            }
2178      }          }
2179    
2180      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
2181         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
2182      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2183      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2184        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
2185        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2186          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
2187          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2188          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2189          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
2190          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2191            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2192            sum++;                    sum++;
2193          }                  }
2194          }              }
2195          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2196          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2197          /* note that we need to map the column index in                  /* note that we need to map the column index in
2198             P->col_coupleBlock back into the column index in                     P->col_coupleBlock back into the column index in
2199             P_ext_couple */                     P_ext_couple */
2200          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2201          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2202            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2203            sum++;                    sum++;
2204          }                  }
2205          }              }
2206        }            }
2207      }          }
2208       }       }
2209     }     }
2210    
2211     /* Now we have the number of non-zero elements in RAP_ext, allocate     /* Now we have the number of non-zero elements in RAP_ext, allocate
2212        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2213     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);     RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2214     RAP_ext_idx = MEMALLOC(sum, index_t);     RAP_ext_idx = new index_t[sum];
2215     RAP_ext_val = MEMALLOC(sum * block_size, double);     RAP_ext_val = new double[sum * block_size];
2216     RA_val = TMPMEMALLOC(block_size, double);     RA_val = new double[block_size];
2217     RAP_val = TMPMEMALLOC(block_size, double);     RAP_val = new double[block_size];
2218    
2219     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
2220     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
# Line 2268  Paso_SystemMatrix* Paso_Preconditioner_A Line 2229  Paso_SystemMatrix* Paso_Preconditioner_A
2229       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
2230       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
2231       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2232      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
2233      R_val = &(R_couple->val[j1*block_size]);          R_val = &(R_couple->val[j1*block_size]);
2234    
2235            /* then, loop over elements in row i1 of A->col_coupleBlock */
2236            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2237            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2238              i2 = A->col_coupleBlock->pattern->index[j2];
2239              temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2240              for (irb=0; irb<row_block_size; irb++) {
2241                for (icb=0; icb<col_block_size; icb++) {
2242                    rtmp = ZERO;
2243                    for (ib=0; ib<col_block_size; ib++) {
2244                      rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2245                    }
2246                    RA_val[irb+row_block_size*icb]=rtmp;
2247                }
2248              }
2249    
2250              /* check whether entry RA[i_r, i2] has been previously visited.
2251                 RAP new entry is possible only if entry RA[i_r, i2] has not
2252                 been visited yet */
2253              if (A_marker[i2] != i_r) {
2254                /* first, mark entry RA[i_r,i2] as visited */
2255                A_marker[i2] = i_r;
2256    
2257                /* then loop over elements in row i2 of P_ext_main */
2258                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2259                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2260                    i_c = P->row_coupleBlock->pattern->index[j3];
2261                    temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2262                    for (irb=0; irb<row_block_size; irb++) {
2263                      for (icb=0; icb<col_block_size; icb++) {
2264                        rtmp = ZERO;
2265                        for (ib=0; ib<col_block_size; ib++) {
2266                            rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2267                        }
2268                        RAP_val[irb+row_block_size*icb]=rtmp;
2269                      }
2270                    }
2271    
2272    
2273                    /* check whether entry RAP[i_r,i_c] has been created.
2274                       If not yet created, create the entry and increase
2275                       the total number of elements in RAP_ext */
2276                    if (P_marker[i_c] < row_marker) {
2277                      P_marker[i_c] = sum;
2278                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2279                      RAP_ext_idx[sum] = i_c + offset;
2280                      sum++;
2281                    } else {
2282                      temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2283                      for (ib=0; ib<block_size; ib++) {
2284                        temp_val[ib] += RAP_val[ib];
2285                      }
2286                    }
2287                }
2288    
2289      /* then, loop over elements in row i1 of A->col_coupleBlock */              /* loop over elements in row i2 of P_ext_couple, do the same */
2290      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2291      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2292        i2 = A->col_coupleBlock->pattern->index[j2];                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
       temp_val = &(A->col_coupleBlock->val[j2*block_size]);  
       for (irb=0; irb<row_block_size; irb++) {  
         for (icb=0; icb<col_block_size; icb++) {  
         rtmp = ZERO;  
         for (ib=0; ib<col_block_size; ib++) {  
           rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];  
         }  
         RA_val[irb+row_block_size*icb]=rtmp;  
         }  
       }  
   
       /* check whether entry RA[i_r, i2] has been previously visited.  
          RAP new entry is possible only if entry RA[i_r, i2] has not  
          been visited yet */  
       if (A_marker[i2] != i_r) {  
         /* first, mark entry RA[i_r,i2] as visited */  
         A_marker[i2] = i_r;  
   
         /* then loop over elements in row i2 of P_ext_main */  
         j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->row_coupleBlock->pattern->index[j3];  
         temp_val = &(P->row_coupleBlock->val[j3*block_size]);  
         for (irb=0; irb<row_block_size; irb++) {  
           for (icb=0; icb<col_block_size; icb++) {  
             rtmp = ZERO;  
             for (ib=0; ib<col_block_size; ib++) {  
             rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];  
             }  
             RAP_val[irb+row_block_size*icb]=rtmp;  
           }  
         }  
   
   
         /* check whether entry RAP[i_r,i_c] has been created.  
            If not yet created, create the entry and increase  
            the total number of elements in RAP_ext */  
         if (P_marker[i_c] < row_marker) {  
           P_marker[i_c] = sum;  
           memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));  
           RAP_ext_idx[sum] = i_c + offset;  
           sum++;  
         } else {  
           temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);  
           for (ib=0; ib<block_size; ib++) {  
             temp_val[ib] += RAP_val[ib];  
           }  
         }  
         }  
   
         /* loop over elements in row i2 of P_ext_couple, do the same */  
         j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;  
2293                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2294                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2295                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2340  Paso_SystemMatrix* Paso_Preconditioner_A Line 2301  Paso_SystemMatrix* Paso_Preconditioner_A
2301                    }                    }
2302                  }                  }
2303    
2304          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2305            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2306            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2307            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2308            sum++;                    sum++;
2309          } else {                  } else {
2310                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2311                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2312                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2313                    }                    }
2314          }                  }
2315          }              }
2316    
2317        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.            /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2318           Only the contributions are added. */               Only the contributions are added. */
2319        } else {            } else {
2320          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2321          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2322          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
2323                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2324                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2325                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2374  Paso_SystemMatrix* Paso_Preconditioner_A Line 2335  Paso_SystemMatrix* Paso_Preconditioner_A
2335                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2336                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2337                  }                  }
2338          }              }
2339          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2340          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2341          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2342                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2343                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2344                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2393  Paso_SystemMatrix* Paso_Preconditioner_A Line 2354  Paso_SystemMatrix* Paso_Preconditioner_A
2354                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2355                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2356                  }                  }
2357          }              }
2358        }            }
2359      }          }
2360    
2361      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
2362         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
2363      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2364      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2365        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
2366            temp_val = &(A->mainBlock->val[j2*block_size]);            temp_val = &(A->mainBlock->val[j2*block_size]);
2367            for (irb=0; irb<row_block_size; irb++) {            for (irb=0; irb<row_block_size; irb++) {
2368              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
# Line 2413  Paso_SystemMatrix* Paso_Preconditioner_A Line 2374  Paso_SystemMatrix* Paso_Preconditioner_A
2374              }              }
2375            }            }
2376    
2377        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2378          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
2379          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2380          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2381          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
2382                  temp_val = &(P->mainBlock->val[j3*block_size]);                  temp_val = &(P->mainBlock->val[j3*block_size]);
2383                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2384                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2429  Paso_SystemMatrix* Paso_Preconditioner_A Line 2390  Paso_SystemMatrix* Paso_Preconditioner_A
2390                    }                    }
2391                  }                  }
2392    
2393          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2394            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2395            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2396            RAP_ext_idx[sum] = i_c + offset;                    RAP_ext_idx[sum] = i_c + offset;
2397            sum++;                    sum++;
2398          } else {                  } else {
2399                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2400                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2401                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2402                    }                    }
2403          }                  }
2404          }              }
2405          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2406          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2407          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2408                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2409                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2410                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2455  Paso_SystemMatrix* Paso_Preconditioner_A Line 2416  Paso_SystemMatrix* Paso_Preconditioner_A
2416                    }                    }
2417                  }                  }
2418    
2419          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2420            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2421            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2422                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2423            sum++;                    sum++;
2424          } else {                  } else {
2425                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2426                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2427                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2428                    }                    }
2429          }                  }
2430          }              }
2431        } else {            } else {
2432          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2433          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2434          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
2435                  temp_val = &(P->mainBlock->val[j3*block_size]);                  temp_val = &(P->mainBlock->val[j3*block_size]);
2436                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2437                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2486  Paso_SystemMatrix* Paso_Preconditioner_A Line 2447  Paso_SystemMatrix* Paso_Preconditioner_A
2447                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2448                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2449                  }                  }
2450          }              }
2451          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2452          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2453          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2454                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2455                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2456                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2505  Paso_SystemMatrix* Paso_Preconditioner_A Line 2466  Paso_SystemMatrix* Paso_Preconditioner_A
2466                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2467                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2468                  }                  }
2469          }              }
2470        }            }
2471      }          }
2472       }       }
2473       RAP_ext_ptr[i_r+1] = sum;       RAP_ext_ptr[i_r+1] = sum;
2474     }     }
2475     TMPMEMFREE(P_marker);     delete[] P_marker;
2476     TMPMEMFREE(Pcouple_to_Pext);     delete[] Pcouple_to_Pext;
2477    
2478     /* Now we have part of RAP[r,c] where row "r" is the list of rows     /* Now we have part of RAP[r,c] where row "r" is the list of rows
2479        which is given by the column list of P->col_coupleBlock, and        which is given by the column list of P->col_coupleBlock, and
2480        column "c" is the list of columns which possiblely covers the        column "c" is the list of columns which possibly covers the
2481        whole column range of systme martris P. This part of data is to        whole column range of system matrix P. This part of data is to
2482        be passed to neighboring processors, and added to corresponding        be passed to neighbouring processors, and added to corresponding
2483        RAP[r,c] entries in the neighboring processors */        RAP[r,c] entries in the neighbouring processors */
2484     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2485          &RAP_ext_val, global_id_P, block_size);                  &RAP_ext_val, global_id_P, block_size);
2486    
2487     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2488     sum = RAP_ext_ptr[num_RAPext_rows];     sum = RAP_ext_ptr[num_RAPext_rows];
2489     num_RAPext_cols = 0;     num_RAPext_cols = 0;
2490     if (num_Pext_cols || sum > 0) {     if (num_Pext_cols || sum > 0) {
2491       temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);       temp = new index_t[num_Pext_cols+sum];
2492       j1_ub = offset + num_Pmain_cols;       j1_ub = offset + num_Pmain_cols;
2493       for (i=0, iptr=0; i<sum; i++) {       for (i=0, iptr=0; i<sum; i++) {
2494      if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */          if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */
2495        temp[iptr++] = RAP_ext_idx[i];          /* XXX */            temp[iptr++] = RAP_ext_idx[i];                  /* XXX */
2496       }       }
2497       for (j=0; j<num_Pext_cols; j++, iptr++){       for (j=0; j<num_Pext_cols; j++, iptr++){
2498      temp[iptr] = global_id_P[j];          temp[iptr] = global_id_P[j];
2499       }       }
2500    
2501       if (iptr) {       if (iptr) {
2502      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2503        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_RAPext_cols = 1;
2504      #else          i = temp[0];
2505        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
2506      #endif            if (temp[j] > i) {
2507      num_RAPext_cols = 1;              i = temp[j];
2508      i = temp[0];              temp[num_RAPext_cols++] = i;
2509      for (j=1; j<iptr; j++) {            }
2510        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_RAPext_cols++] = i;  
       }  
     }  
2511       }       }
2512     }     }
2513    
2514     /* resize the pattern of P_ext_couple */     /* resize the pattern of P_ext_couple */
2515     if(num_RAPext_cols){     if(num_RAPext_cols){
2516       global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);       global_id_RAP = new index_t[num_RAPext_cols];
2517       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
2518      global_id_RAP[i] = temp[i];          global_id_RAP[i] = temp[i];
2519     }     }
2520     if (num_Pext_cols || sum > 0)     if (num_Pext_cols || sum > 0)
2521       TMPMEMFREE(temp);       delete[] temp;
2522     j1_ub = offset + num_Pmain_cols;     j1_ub = offset + num_Pmain_cols;
2523     for (i=0; i<sum; i++) {     for (i=0; i<sum; i++) {
2524       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2525      where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,          where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2526  /*XXX*/         num_RAPext_cols, sizeof(index_t), Paso_comparIndex);  /*XXX*/                 num_RAPext_cols, sizeof(index_t), paso::comparIndex);
2527      RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);          RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2528       } else       } else
2529      RAP_ext_idx[i] = RAP_ext_idx[i] - offset;          RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2530     }     }
2531    
2532     /* build the mapping */     /* build the mapping */
2533     if (num_Pcouple_cols) {     if (num_Pcouple_cols) {
2534       Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);       Pcouple_to_RAP = new index_t[num_Pcouple_cols];
2535       iptr = 0;       iptr = 0;
2536       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
2537      if (global_id_RAP[i] == P->global_id[iptr]) {          if (global_id_RAP[i] == P->global_id[iptr]) {
2538        Pcouple_to_RAP[iptr++] = i;            Pcouple_to_RAP[iptr++] = i;
2539        if (iptr == num_Pcouple_cols) break;            if (iptr == num_Pcouple_cols) break;
2540      }          }
2541     }     }
2542    
2543     if (num_Pext_cols) {     if (num_Pext_cols) {
2544       Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);       Pext_to_RAP = new index_t[num_Pext_cols];
2545       iptr = 0;       iptr = 0;
2546       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
2547      if (global_id_RAP[i] == global_id_P[iptr]) {          if (global_id_RAP[i] == global_id_P[iptr]) {
2548        Pext_to_RAP[iptr++] = i;            Pext_to_RAP[iptr++] = i;
2549        if (iptr == num_Pext_cols) break;            if (iptr == num_Pext_cols) break;
2550      }          }
2551     }     }
2552    
2553     if (global_id_P){     if (global_id_P){
2554       TMPMEMFREE(global_id_P);       delete[] global_id_P;
2555       global_id_P = NULL;       global_id_P = NULL;
2556     }     }
2557    
2558     /* alloc and initialize the makers */     /* alloc and initialise the makers */
2559     sum = num_RAPext_cols + num_Pmain_cols;     sum = num_RAPext_cols + num_Pmain_cols;
2560     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
2561     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2562     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
2563     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
# Line 2618  Paso_SystemMatrix* Paso_Preconditioner_A Line 2575  Paso_SystemMatrix* Paso_Preconditioner_A
2575       row_marker_ext = j;       row_marker_ext = j;
2576    
2577       /* Mark the diagonal element RAP[i_r, i_r], and other elements       /* Mark the diagonal element RAP[i_r, i_r], and other elements
2578      in RAP_ext */          in RAP_ext */
2579       P_marker[i_r] = i;       P_marker[i_r] = i;
2580       i++;       i++;
2581       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
2582      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2583        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
2584          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2585            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
2586            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
2587          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2588            P_marker[i_c] = i;                    P_marker[i_c] = i;
2589            i++;                    i++;
2590          }                  }
2591            } else {                } else {
2592          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
2593            P_marker[i_c] = j;                    P_marker[i_c] = j;
2594            j++;                    j++;
2595          }                  }
2596            }                }
2597          }              }
2598          break;              break;
2599        }            }
2600      }          }
2601       }       }
2602    
2603       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
2604       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
2605       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2606      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
2607    
2608            /* then, loop over elements in row i1 of A->col_coupleBlock */
2609            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2610            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2611              i2 = A->col_coupleBlock->pattern->index[j2];
2612    
2613              /* check whether entry RA[i_r, i2] has been previously visited.
2614                 RAP new entry is possible only if entry RA[i_r, i2] has not
2615                 been visited yet */
2616              if (A_marker[i2] != i_r) {
2617                /* first, mark entry RA[i_r,i2] as visited */
2618                A_marker[i2] = i_r;
2619    
2620                /* then loop over elements in row i2 of P_ext_main */
2621                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2622                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2623                    i_c = P->row_coupleBlock->pattern->index[j3];
2624    
2625                    /* check whether entry RAP[i_r,i_c] has been created.
2626                       If not yet created, create the entry and increase
2627                       the total number of elements in RAP_ext */
2628                    if (P_marker[i_c] < row_marker) {
2629                      P_marker[i_c] = i;
2630                      i++;
2631                    }
2632                }
2633    
2634                /* loop over elements in row i2 of P_ext_couple, do the same */
2635                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2636                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2637                    i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2638                    if (P_marker[i_c] < row_marker_ext) {
2639                      P_marker[i_c] = j;
2640                      j++;
2641                    }
2642                }
2643              }
2644            }
2645    
2646      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* now loop over elements in row i1 of A->mainBlock, repeat
2647      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];             the process we have done to block A->col_coupleBlock */
2648      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2649        i2 = A->col_coupleBlock->pattern->index[j2];          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2650              i2 = A->mainBlock->pattern->index[j2];
2651        /* check whether entry RA[i_r, i2] has been previously visited.            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2652           RAP new entry is possible only if entry RA[i_r, i2] has not              A_marker[i2 + num_Acouple_cols] = i_r;
2653           been visited yet */              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2654        if (A_marker[i2] != i_r) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2655          /* first, mark entry RA[i_r,i2] as visited */                  i_c = P->mainBlock->pattern->index[j3];
2656          A_marker[i2] = i_r;                  if (P_marker[i_c] < row_marker) {
2657                      P_marker[i_c] = i;
2658          /* then loop over elements in row i2 of P_ext_main */                    i++;
2659          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];                  }
2660          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              }
2661          i_c = P->row_coupleBlock->pattern->index[j3];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2662                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2663          /* check whether entry RAP[i_r,i_c] has been created.                  /* note that we need to map the column index in
2664             If not yet created, create the entry and increase                     P->col_coupleBlock back into the column index in
2665             the total number of elements in RAP_ext */                     P_ext_couple */
2666          if (P_marker[i_c] < row_marker) {                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2667            P_marker[i_c] = i;                  if (P_marker[i_c] < row_marker_ext) {
2668            i++;                    P_marker[i_c] = j;
2669          }                    j++;
2670          }                  }
2671                }
2672          /* loop over elements in row i2 of P_ext_couple, do the same */            }
2673          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];          }
         for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         if (P_marker[i_c] < row_marker_ext) {  
           P_marker[i_c] = j;  
           j++;  
         }  
         }  
       }  
     }  
   
     /* now loop over elements in row i1 of A->mainBlock, repeat  
        the process we have done to block A->col_coupleBlock */  
     j2_ub = A->mainBlock->pattern->ptr[i1+1];  
     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
       i2 = A->mainBlock->pattern->index[j2];  
       if (A_marker[i2 + num_Acouple_cols] != i_r) {  
         A_marker[i2 + num_Acouple_cols] = i_r;  
         j3_ub = P->mainBlock->pattern->ptr[i2+1];  
         for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->mainBlock->pattern->index[j3];  
         if (P_marker[i_c] < row_marker) {  
           P_marker[i_c] = i;  
           i++;  
         }  
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         /* note that we need to map the column index in  
            P->col_coupleBlock back into the column index in  
            P_ext_couple */  
         i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         if (P_marker[i_c] < row_marker_ext) {  
           P_marker[i_c] = j;  
           j++;  
         }  
         }  
       }  
     }  
2674       }       }
2675     }     }
2676    
# Line 2721  Paso_SystemMatrix* Paso_Preconditioner_A Line 2678  Paso_SystemMatrix* Paso_Preconditioner_A
2678        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2679        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2680        RAP_couple */        RAP_couple */
2681     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_main_ptr = new index_t[num_Pmain_cols+1];
2682     RAP_main_idx = MEMALLOC(i, index_t);     RAP_main_idx = new index_t[i];
2683     RAP_main_val = TMPMEMALLOC(i * block_size, double);     RAP_main_val = new double[i * block_size];
2684     RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_couple_ptr = new index_t[num_Pmain_cols+1];
2685     RAP_couple_idx = MEMALLOC(j, index_t);     RAP_couple_idx = new index_t[j];
2686     RAP_couple_val = TMPMEMALLOC(j * block_size, double);     RAP_couple_val = new double[j * block_size];
2687    
2688     RAP_main_ptr[num_Pmain_cols] = i;     RAP_main_ptr[num_Pmain_cols] = i;
2689     RAP_couple_ptr[num_Pmain_cols] = j;     RAP_couple_ptr[num_Pmain_cols] = j;
# Line 2748  Paso_SystemMatrix* Paso_Preconditioner_A Line 2705  Paso_SystemMatrix* Paso_Preconditioner_A
2705       RAP_couple_ptr[i_r] = row_marker_ext;       RAP_couple_ptr[i_r] = row_marker_ext;
2706    
2707       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
2708      in row i_r of RAP_ext */          in row i_r of RAP_ext */
2709       P_marker[i_r] = i;       P_marker[i_r] = i;
2710       RAP_main_val[i] = ZERO;       RAP_main_val[i] = ZERO;
2711       RAP_main_idx[i] = i_r;       RAP_main_idx[i] = i_r;
2712       i++;       i++;
2713    
2714       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
2715      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2716        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
2717          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2718            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
2719            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
2720          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2721            P_marker[i_c] = i;                    P_marker[i_c] = i;
2722            memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));                    memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2723            RAP_main_idx[i] = i_c;                    RAP_main_idx[i] = i_c;
2724            i++;                    i++;
2725          } else {                  } else {
2726            temp_val = &(RAP_ext_val[k*block_size]);                    temp_val = &(RAP_ext_val[k*block_size]);
2727            RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);                    RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2728            for (ib=0; ib<block_size; ib++)                    for (ib=0; ib<block_size; ib++)
2729              RAP_val[ib] += temp_val[ib];                      RAP_val[ib] += temp_val[ib];
2730          }                  }
2731            } else {                } else {
2732          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
2733            P_marker[i_c] = j;                    P_marker[i_c] = j;
2734            memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));                    memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2735            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
2736            j++;                    j++;
2737          } else {                  } else {
2738            temp_val = &(RAP_ext_val[k*block_size]);                    temp_val = &(RAP_ext_val[k*block_size]);
2739            RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);                    RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2740            for (ib=0; ib<block_size; ib++)                    for (ib=0; ib<block_size; ib++)
2741              RAP_val[ib] += temp_val[ib];                      RAP_val[ib] += temp_val[ib];
2742          }                  }
2743            }                }
2744          }              }
2745          break;              break;
2746        }            }
2747      }          }
2748       }       }
2749    
2750       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
2751       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
2752       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2753      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
2754      R_val = &(R_main->val[j1*block_size]);          R_val = &(R_main->val[j1*block_size]);
2755    
2756      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
2757      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2758      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2759        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
2760            temp_val = &(A->col_coupleBlock->val[j2*block_size]);            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2761            for (irb=0; irb<row_block_size; irb++) {            for (irb=0; irb<row_block_size; irb++) {
2762              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
# Line 2812  Paso_SystemMatrix* Paso_Preconditioner_A Line 2769  Paso_SystemMatrix* Paso_Preconditioner_A
2769            }            }
2770    
2771    
2772        /* check whether entry RA[i_r, i2] has been previously visited.            /* check whether entry RA[i_r, i2] has been previously visited.
2773           RAP new entry is possible only if entry RA[i_r, i2] has not               RAP new entry is possible only if entry RA[i_r, i2] has not
2774           been visited yet */               been visited yet */
2775        if (A_marker[i2] != i_r) {            if (A_marker[i2] != i_r) {
2776          /* first, mark entry RA[i_r,i2] as visited */              /* first, mark entry RA[i_r,i2] as visited */
2777          A_marker[i2] = i_r;              A_marker[i2] = i_r;
2778    
2779          /* then loop over elements in row i2 of P_ext_main */              /* then loop over elements in row i2 of P_ext_main */
2780          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2781          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2782          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
2783                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2784                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2785                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2835  Paso_SystemMatrix* Paso_Preconditioner_A Line 2792  Paso_SystemMatrix* Paso_Preconditioner_A
2792                  }                  }
2793    
2794    
2795          /* check whether entry RAP[i_r,i_c] has been created.                  /* check whether entry RAP[i_r,i_c] has been created.
2796             If not yet created, create the entry and increase                     If not yet created, create the entry and increase
2797             the total number of elements in RAP_ext */                     the total number of elements in RAP_ext */
2798          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2799            P_marker[i_c] = i;                    P_marker[i_c] = i;
2800            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2801            RAP_main_idx[i] = i_c;                    RAP_main_idx[i] = i_c;
2802            i++;                    i++;
2803          } else {                  } else {
2804                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2805                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2806                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2807                    }                    }
2808          }                  }
2809          }              }
2810    
2811          /* loop over elements in row i2 of P_ext_couple, do the same */              /* loop over elements in row i2 of P_ext_couple, do the same */
2812          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2813          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2814          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2815                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2816                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2817                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2866  Paso_SystemMatrix* Paso_Preconditioner_A Line 2823  Paso_SystemMatrix* Paso_Preconditioner_A
2823                    }                    }
2824                  }                  }
2825    
2826          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
2827            P_marker[i_c] = j;                    P_marker[i_c] = j;
2828            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2829            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
2830            j++;                    j++;
2831          } else {                  } else {
2832                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2833                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2834                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2835                    }                    }
2836          }                  }
2837          }              }
2838    
2839        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.            /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2840           Only the contributions are added. */               Only the contributions are added. */
2841        } else {            } else {
2842          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2843          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2844          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
2845                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2846                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2847                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2900  Paso_SystemMatrix* Paso_Preconditioner_A Line 2857  Paso_SystemMatrix* Paso_Preconditioner_A
2857                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2858                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2859                  }                  }
2860          }              }
2861          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2862          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2863          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2864                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2865                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2866                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2919  Paso_SystemMatrix* Paso_Preconditioner_A Line 2876  Paso_SystemMatrix* Paso_Preconditioner_A
2876                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2877                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2878                  }                  }
2879          }              }
2880        }            }
2881      }          }
2882    
2883      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
2884         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
2885      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2886      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2887        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
2888            temp_val = &(A->mainBlock->val[j2*block_size]);            temp_val = &(A->mainBlock->val[j2*block_size]);
2889            for (irb=0; irb<row_block_size; irb++) {            for (irb=0; irb<row_block_size; irb++) {
2890              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
# Line 2939  Paso_SystemMatrix* Paso_Preconditioner_A Line 2896  Paso_SystemMatrix* Paso_Preconditioner_A
2896              }              }
2897            }            }
2898    
2899        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2900          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
2901          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2902          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2903          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
2904                  temp_val = &(P->mainBlock->val[j3*block_size]);                  temp_val = &(P->mainBlock->val[j3*block_size]);
2905                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2906                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2955  Paso_SystemMatrix* Paso_Preconditioner_A Line 2912  Paso_SystemMatrix* Paso_Preconditioner_A
2912                    }                    }
2913                  }                  }
2914    
2915          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2916            P_marker[i_c] = i;                    P_marker[i_c] = i;
2917            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2918            RAP_main_idx[i] = i_c;                    RAP_main_idx[i] = i_c;
2919            i++;                    i++;
2920          } else {                  } else {
2921                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2922                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2923                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2924                    }                    }
2925          }                  }
2926          }              }
2927          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2928          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2929          /* note that we need to map the column index in                  /* note that we need to map the column index in
2930             P->col_coupleBlock back into the column index in                     P->col_coupleBlock back into the column index in
2931             P_ext_couple */                     P_ext_couple */
2932          i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2933                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2934                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2935                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2984  Paso_SystemMatrix* Paso_Preconditioner_A Line 2941  Paso_SystemMatrix* Paso_Preconditioner_A
2941                    }                    }
2942                  }                  }
2943    
2944          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
2945            P_marker[i_c] = j;                    P_marker[i_c] = j;
2946            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));                    memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2947            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
2948            j++;                    j++;
2949          } else {                  } else {
2950                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2951                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2952                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2953                    }                    }
2954          }                  }
2955          }              }
2956    
2957        } else {            } else {
2958          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2959          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2960          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
2961                  temp_val = &(P->mainBlock->val[j3*block_size]);                  temp_val = &(P->mainBlock->val[j3*block_size]);
2962                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2963                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 3016  Paso_SystemMatrix* Paso_Preconditioner_A Line 2973  Paso_SystemMatrix* Paso_Preconditioner_A
2973                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2974                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2975                  }                  }
2976          }              }
2977          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2978          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2979          i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2980                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);