/[escript]/trunk/paso/src/AMG_Interpolation.cpp
ViewVC logotype

Diff of /trunk/paso/src/AMG_Interpolation.cpp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/amg_from_3530/paso/src/AMG_Interpolation.c revision 3763 by lgao, Tue Jan 10 05:10:17 2012 UTC trunk/paso/src/AMG_Interpolation.cpp revision 4836 by caltinay, Mon Apr 7 05:51:55 2014 UTC
# Line 1  Line 1 
1  //  /*****************************************************************************
 /*******************************************************  
2  *  *
3  * Copyright (c) 2003-2011 by University of Queensland  * Copyright (c) 2003-2014 by University of Queensland
4  * Earth Systems Science Computational Center (ESSCC)  * http://www.uq.edu.au
 * http://www.uq.edu.au/esscc  
5  *  *
6  * Primary Business: Queensland, Australia  * Primary Business: Queensland, Australia
7  * Licensed under the Open Software License version 3.0  * Licensed under the Open Software License version 3.0
8  * http://www.opensource.org/licenses/osl-3.0.php  * http://www.opensource.org/licenses/osl-3.0.php
9  *  *
10  *******************************************************/  * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11    * Development 2012-2013 by School of Earth Sciences
12    * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13    *
14    *****************************************************************************/
15    
16    
17  /**************************************************************/  /****************************************************************************/
18    
19  /* Paso: defines AMG Interpolation                            */  /* Paso: defines AMG Interpolation                            */
20    
21  /**************************************************************/  /****************************************************************************/
22    
23  /* Author: l.gao@uq.edu.au                                    */  /* Author: l.gao@uq.edu.au                                    */
24    
25  /**************************************************************/  /****************************************************************************/
26    
27  #include "Paso.h"  #include "Paso.h"
28  #include "SparseMatrix.h"  #include "SparseMatrix.h"
29  #include "PasoUtil.h"  #include "PasoUtil.h"
30  #include "Preconditioner.h"  #include "Preconditioner.h"
31    
32  /**************************************************************  /****************************************************************************
33    
34      Methods nessecary for AMG preconditioner      Methods necessary for AMG preconditioner
35    
36      construct n_C x n_C interpolation operator A_C from matrix A      construct n_C x n_C interpolation operator A_C from matrix A
37      and prolongation matrix P.      and prolongation matrix P.
38    
39      The coarsening operator A_C is defined by RAP where R=P^T.      The coarsening operator A_C is defined by RAP where R=P^T.
40    
41  ***************************************************************/  *****************************************************************************/
   
 #define MY_DEBUG 0  
 #define MY_DEBUG1 1  
42    
43  /* Extend system matrix B with extra two sparse matrixes:  /* Extend system matrix B with extra two sparse matrices:
44      B_ext_main and B_ext_couple          B_ext_main and B_ext_couple
45     The combination of this two sparse matrixes represents       The combination of this two sparse matrices represents  
46     the portion of B that is stored on neighbor procs and     the portion of B that is stored on neighbour procs and
47     needed locally for triple matrix product (B^T) A B.     needed locally for triple matrix product (B^T) A B.
48    
49     FOR NOW, we assume that the system matrix B has a NULL     FOR NOW, we assume that the system matrix B has a NULL
# Line 58  Line 57 
57          A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
58          rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
59     on q. */     on q. */
60  void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)  void Paso_Preconditioner_AMG_extendB(paso::SystemMatrix_ptr A,
61                                         paso::SystemMatrix_ptr B)
62  {  {
63    Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;      paso::Pattern_ptr pattern_main, pattern_couple;
64    Paso_Coupler *coupler=NULL;      paso::Coupler_ptr coupler;
65    Paso_SharedComponents *send=NULL, *recv=NULL;      paso::SharedComponents_ptr send, recv;
66    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;      double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
67    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;      index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
68    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;      index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
69    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;      index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
70    index_t *idx_m=NULL, *idx_c=NULL;      index_t *idx_m=NULL, *idx_c=NULL;
71    index_t i, j, k, m, n, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c;      index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
72    index_t offset, len, block_size, block_size_size, max_num_cols;      index_t offset, len, block_size, block_size_size, max_num_cols;
73    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;      index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
74    dim_t *recv_degree=NULL, *send_degree=NULL;      dim_t *recv_degree=NULL, *send_degree=NULL;
75    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;      dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
76    
77    if (size == 1) return;      if (size == 1) return;
78    
79    if (B->row_coupleBlock) {      B->row_coupleBlock.reset();
80      Paso_SparseMatrix_free(B->row_coupleBlock);  
81      B->row_coupleBlock = NULL;      if (B->remote_coupleBlock.get()) {
82    }          Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
83            return;
84    if (B->row_coupleBlock || B->remote_coupleBlock) {      }
     Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");  
     return;  
   }  
   
 if (MY_DEBUG1) fprintf(stderr, "CP1_1\n");  
85    
86    /* sending/receiving unknown's global ID */      /* sending/receiving unknown's global ID */
87    num_main_cols = B->mainBlock->numCols;      num_main_cols = B->mainBlock->numCols;
88    cols = TMPMEMALLOC(num_main_cols, double);      cols = new double[num_main_cols];
89    offset = B->col_distribution->first_component[rank];      offset = B->col_distribution->first_component[rank];
90    #pragma omp parallel for private(i) schedule(static)      #pragma omp parallel for private(i) schedule(static)
91    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;      for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
92    if (B->global_id == NULL) {      if (B->global_id == NULL) {
93      coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);          coupler.reset(new paso::Coupler(B->col_coupler->connector, 1));
94      Paso_Coupler_startCollect(coupler, cols);          coupler->startCollect(cols);
95    }      }
96    
97    recv_buf = TMPMEMALLOC(size, index_t);      recv_buf = new index_t[size];
98    recv_degree = TMPMEMALLOC(size, dim_t);      recv_degree = new dim_t[size];
99    recv_offset = TMPMEMALLOC(size+1, index_t);      recv_offset = new index_t[size+1];
100    #pragma omp parallel for private(i) schedule(static)    #pragma omp parallel for private(i) schedule(static)
101    for (i=0; i<size; i++){      for (i=0; i<size; i++){
102      recv_buf[i] = 0;          recv_buf[i] = 0;
103      recv_degree[i] = 1;          recv_degree[i] = 1;
104      recv_offset[i] = i;          recv_offset[i] = i;
105    }      }
106    
107    block_size = B->col_coupler->block_size;    block_size = B->block_size;
108    block_size_size = block_size * sizeof(double);    block_size_size = block_size * sizeof(double);
109    num_couple_cols = B->col_coupleBlock->numCols;    num_couple_cols = B->col_coupleBlock->numCols;
110    send = A->col_coupler->connector->send;    send = A->col_coupler->connector->send;
111    recv = A->col_coupler->connector->recv;    recv = A->col_coupler->connector->recv;
112    num_neighbors = send->numNeighbors;    num_neighbors = send->numNeighbors;
113    p = send->offsetInShared[num_neighbors];    p = send->offsetInShared[num_neighbors];
114    len = p * num_main_cols * 2;   /*XXX used to be max_num_cols, do I mean num_main_cols?? */    len = p * B->col_distribution->first_component[size];
115    send_buf = TMPMEMALLOC(len * block_size, double);    send_buf = new double[len * block_size];
116    send_idx = TMPMEMALLOC(len, index_t);    send_idx = new index_t[len];
117    send_offset = TMPMEMALLOC((p+1)*2, index_t);    send_offset = new index_t[(p+1)*2];
118    send_degree = TMPMEMALLOC(num_neighbors, dim_t);    send_degree = new dim_t[num_neighbors];
119    i = num_main_cols + num_couple_cols;    i = num_main_cols + num_couple_cols;
120    send_m = TMPMEMALLOC(i * block_size, double);    send_m = new double[i * block_size];
121    send_c = TMPMEMALLOC(i * block_size, double);    send_c = new double[i * block_size];
122    idx_m = TMPMEMALLOC(i, index_t);    idx_m = new index_t[i];
123    idx_c = TMPMEMALLOC(i, index_t);    idx_c = new index_t[i];
124    
125    /* waiting for receiving unknown's global ID */    /* waiting for receiving unknown's global ID */
126    if (B->global_id == NULL) {    if (B->global_id == NULL) {
127      Paso_Coupler_finishCollect(coupler);        coupler->finishCollect();
128      global_id = MEMALLOC(num_couple_cols, index_t);      global_id = new index_t[num_couple_cols];
129      #pragma omp parallel for private(i) schedule(static)      #pragma omp parallel for private(i) schedule(static)
130      for (i=0; i<num_couple_cols; ++i)      for (i=0; i<num_couple_cols; ++i)
131          global_id[i] = coupler->recv_buffer[i];          global_id[i] = coupler->recv_buffer[i];
132      B->global_id = global_id;      B->global_id = global_id;
     Paso_Coupler_free(coupler);  
133    } else    } else
134      global_id = B->global_id;      global_id = B->global_id;
135    
136    /* distribute the number of cols in current col_coupleBlock to all ranks */    /* distribute the number of cols in current col_coupleBlock to all ranks */
137      #ifdef ESYS_MPI
138    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
139      #endif
 if (MY_DEBUG1) fprintf(stderr, "CP1_2\n");  
140    
141    /* distribute global_ids of cols to be considered to all ranks*/    /* distribute global_ids of cols to be considered to all ranks*/
142    len = 0;    len = 0;
# Line 152  if (MY_DEBUG1) fprintf(stderr, "CP1_2\n" Line 146  if (MY_DEBUG1) fprintf(stderr, "CP1_2\n"
146      recv_offset[i] = len;      recv_offset[i] = len;
147      len += recv_buf[i];      len += recv_buf[i];
148      if (max_num_cols < recv_buf[i])      if (max_num_cols < recv_buf[i])
149      max_num_cols = recv_buf[i];          max_num_cols = recv_buf[i];
150    }    }
151    recv_offset[size] = len;    recv_offset[size] = len;
152    cols_array = TMPMEMALLOC(len, index_t);    cols_array = new index_t[len];
153      #ifdef ESYS_MPI
154    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
155      #endif
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_3\n", rank);  
 if (MY_DEBUG) {  
     int i=0;  
     char hostname[256];  
     gethostname(hostname, sizeof(hostname));  
     fprintf(stderr, "rank %d PID %d on %s ready for attach\n", A->mpi_info->rank, getpid(), hostname);  
     if (rank == 1) {  
       fflush(stdout);  
       while (0 == i)  
         sleep(5);  
     }  
 }  
156    
157    /* first, prepare the ptr_ptr to be received */    /* first, prepare the ptr_ptr to be received */
158    q = recv->numNeighbors;    q = recv->numNeighbors;
159    len = recv->offsetInShared[q];    len = recv->offsetInShared[q];
160    ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);    ptr_ptr = new index_t[(len+1) * 2];
161    for (p=0; p<q; p++) {    for (p=0; p<q; p++) {
162      row = recv->offsetInShared[p];      row = recv->offsetInShared[p];
163      m = recv->offsetInShared[p + 1];      m = recv->offsetInShared[p + 1];
164      MPI_Irecv(&(ptr_ptr[row]), 2 * (m-row), MPI_INT, recv->neighbor[p],      #ifdef ESYS_MPI
165        MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
166                  A->mpi_info->msg_tag_counter+recv->neighbor[p],                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
167                  A->mpi_info->comm,                  A->mpi_info->comm,
168                  &(A->col_coupler->mpi_requests[p]));                  &(A->col_coupler->mpi_requests[p]));
169        #endif
170    }    }
171    
172    /* now prepare the rows to be sent (the degree, the offset and the data) */    /* now prepare the rows to be sent (the degree, the offset and the data) */
173    len = 0;    len = 0;
174      i0 = 0;
175    for (p=0; p<num_neighbors; p++) {    for (p=0; p<num_neighbors; p++) {
176      i = 0;      i = i0;
177      neighbor = send->neighbor[p];      neighbor = send->neighbor[p];
178      m_lb = B->col_distribution->first_component[neighbor];      m_lb = B->col_distribution->first_component[neighbor];
179      m_ub = B->col_distribution->first_component[neighbor + 1];      m_ub = B->col_distribution->first_component[neighbor + 1];
180      j_ub = send->offsetInShared[p + 1];      j_ub = send->offsetInShared[p + 1];
181      for (j=send->offsetInShared[p]; j<j_ub; j++) {      for (j=send->offsetInShared[p]; j<j_ub; j++) {
182      row = send->shared[j];          row = send->shared[j];
183      l_m = 0;          l_m = 0;
184      l_c = 0;          l_c = 0;
185      k_ub = B->col_coupleBlock->pattern->ptr[row + 1];          k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
186      k_lb = B->col_coupleBlock->pattern->ptr[row];          k_lb = B->col_coupleBlock->pattern->ptr[row];
187    
188      /* check part of col_coupleBlock for data to be passed @row */          /* check part of col_coupleBlock for data to be passed @row */
189  //  q = B->mainBlock->pattern->index[B->mainBlock->pattern->ptr[row]] + offset;          for (k=k_lb; k<k_ub; k++) {
     for (k=k_lb; k<k_ub; k++) {  
       m = global_id[B->col_coupleBlock->pattern->index[k]];  
 //    if (m > q) break;  
       if (m > offset) break;  
       if (m>= m_lb && m < m_ub) {  
         /* data to be passed to sparse matrix B_ext_main */  
         idx_m[l_m] = m - m_lb;  
         memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);  
         l_m++;  
       } else {  
         /* data to be passed to sparse matrix B_ext_couple */  
         idx_c[l_c] = m;  
 if (MY_DEBUG) fprintf(stderr, "rank%d (1) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);  
         memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);  
         l_c++;  
       }  
     }  
     k_lb = k;  
   
     /* check mainBlock for data to be passed @row to sparse  
        matrix B_ext_couple */  
     k_ub = B->mainBlock->pattern->ptr[row + 1];  
     k = B->mainBlock->pattern->ptr[row];  
     memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));  
     for (; k<k_ub; k++) {  
       m = B->mainBlock->pattern->index[k] + offset;  
 if (MY_DEBUG) fprintf(stderr, "rank%d (2) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);  
       idx_c[l_c] = m;  
       l_c++;  
     }  
   
     /* check the rest part of col_coupleBlock for data to  
        be passed @row to sparse matrix B_ext_couple */  
     k = k_lb;  
     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];  
     for (k=k_lb; k<k_ub; k++) {  
190            m = global_id[B->col_coupleBlock->pattern->index[k]];            m = global_id[B->col_coupleBlock->pattern->index[k]];
191        if (m>= m_lb && i < m_ub) {            if (m > offset) break;
192          /* data to be passed to sparse matrix B_ext_main */            if (m>= m_lb && m < m_ub) {
193          idx_m[l_m] = m - m_lb;              /* data to be passed to sparse matrix B_ext_main */
194          memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);              idx_m[l_m] = m - m_lb;
195          l_m++;              memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
196        } else {              l_m++;
197          /* data to be passed to sparse matrix B_ext_couple */            } else {
198          idx_c[l_c] = m;              /* data to be passed to sparse matrix B_ext_couple */
199  if (MY_DEBUG) fprintf(stderr, "rank%d (3) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);              idx_c[l_c] = m;
200          memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);              memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
201          l_c++;              l_c++;
202        }            }
203      }          }
204            k_lb = k;
205  if (MY_DEBUG) fprintf(stderr, "rank %d p %d i %d j %d l_c %d l_m %d ub %d\n", rank, p, i, j, l_c, l_m, num_couple_cols + num_main_cols);  
206      memcpy(&(send_buf[len]), &(send_m[0]), block_size_size*l_m);          /* check mainBlock for data to be passed @row to sparse
207      memcpy(&(send_idx[len]), &(idx_m[0]), l_m * sizeof(index_t));             matrix B_ext_couple */
208            k_ub = B->mainBlock->pattern->ptr[row + 1];
209            k = B->mainBlock->pattern->ptr[row];
210            memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
211            for (; k<k_ub; k++) {
212              m = B->mainBlock->pattern->index[k] + offset;
213              idx_c[l_c] = m;
214              l_c++;
215            }
216    
217            /* check the rest part of col_coupleBlock for data to
218               be passed @row to sparse matrix B_ext_couple */
219            k = k_lb;
220            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
221            for (k=k_lb; k<k_ub; k++) {
222              m = global_id[B->col_coupleBlock->pattern->index[k]];
223              if (m>= m_lb && m < m_ub) {
224                /* data to be passed to sparse matrix B_ext_main */
225                idx_m[l_m] = m - m_lb;
226                memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
227                l_m++;
228              } else {
229                /* data to be passed to sparse matrix B_ext_couple */
230                idx_c[l_c] = m;
231                memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
232                l_c++;
233              }
234            }
235    
236            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
237            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
238          send_offset[2*i] = l_m;          send_offset[2*i] = l_m;
239      len += l_m;          len += l_m;
240      memcpy(&(send_buf[len]), &(send_c[0]), block_size_size*l_c);          memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
241      memcpy(&(send_idx[len]), &(idx_c[0]), l_c * sizeof(index_t));          memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
242      send_offset[2*i+1] = l_c;          send_offset[2*i+1] = l_c;
243      len += l_c;          len += l_c;
244                i++;
     i++;  
245      }      }
246    
247      /* sending */      /* sending */
248      MPI_Issend(send_offset, 2*i, MPI_INT, send->neighbor[p],      #ifdef ESYS_MPI
249        MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
250                  A->mpi_info->msg_tag_counter+rank,                  A->mpi_info->msg_tag_counter+rank,
251                  A->mpi_info->comm,                  A->mpi_info->comm,
252                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
253        #endif
254      send_degree[p] = len;      send_degree[p] = len;
255        i0 = i;
256    }    }
257    TMPMEMFREE(send_m);    delete[] send_m;
258    TMPMEMFREE(send_c);    delete[] send_c;
259    TMPMEMFREE(idx_m);    delete[] idx_m;
260    TMPMEMFREE(idx_c);    delete[] idx_c;
261    
262    
263    q = recv->numNeighbors;    q = recv->numNeighbors;
264    len = recv->offsetInShared[q];    len = recv->offsetInShared[q];
265    ptr_main = MEMALLOC((len+1), index_t);    ptr_main = new index_t[(len+1)];
266    ptr_couple = MEMALLOC((len+1), index_t);    ptr_couple = new index_t[(len+1)];
   
267    
268      #ifdef ESYS_MPI
269    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
270                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
271                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
272    A->mpi_info->msg_tag_counter += size;    #endif
273      ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
274    
275    j = 0;    j = 0;
276    k = 0;    k = 0;
277    ptr_main[0] = 0;    ptr_main[0] = 0;
278    ptr_couple[0] = 0;    ptr_couple[0] = 0;
 //  #pragma omp parallel for private(i,j,k) schedule(static)  
279    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
280      j += ptr_ptr[2*i];      j += ptr_ptr[2*i];
281      k += ptr_ptr[2*i+1];      k += ptr_ptr[2*i+1];
# Line 301  if (MY_DEBUG) fprintf(stderr, "rank %d p Line 283  if (MY_DEBUG) fprintf(stderr, "rank %d p
283      ptr_couple[i+1] = k;      ptr_couple[i+1] = k;
284    }    }
285    
286    TMPMEMFREE(ptr_ptr);      delete[] ptr_ptr;  
287    idx_main = MEMALLOC(j, index_t);    idx_main = new index_t[j];
288    idx_couple = MEMALLOC(k, index_t);    idx_couple = new index_t[k];
289    ptr_idx = TMPMEMALLOC(j+k, index_t);    ptr_idx = new index_t[j+k];
290    ptr_val = TMPMEMALLOC(j+k, double);    ptr_val = new double[(j+k) * block_size];
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_4_2 %d %d %d\n", rank, j, k, len);  
291    
292    /* send/receive index array */    /* send/receive index array */
293    j=0;    j=0;
# Line 316  if (MY_DEBUG1) fprintf(stderr, "rank %d Line 297  if (MY_DEBUG1) fprintf(stderr, "rank %d
297      m = recv->offsetInShared[p+1];      m = recv->offsetInShared[p+1];
298      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
299      if (i > 0) {      if (i > 0) {
300      k_ub ++;          k_ub ++;
301            #ifdef ESYS_MPI
302          MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],          MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
303                  A->mpi_info->msg_tag_counter+recv->neighbor[p],                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
304                  A->mpi_info->comm,                  A->mpi_info->comm,
305                  &(A->col_coupler->mpi_requests[p]));                  &(A->col_coupler->mpi_requests[p]));
306      } else {          #endif
 if (MY_DEBUG) fprintf(stderr, "rank%d k %d m %d main(%d %d) couple(%d %d)\n", rank, k, m, ptr_main[m], ptr_main[k], ptr_couple[m], ptr_couple[k]);  
307      }      }
308      j += i;      j += i;
309    }    }
310    
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_4_3  %d %d\n", rank, recv->numNeighbors, k_ub);  
311    j=0;    j=0;
312    k_ub = 0;    k_ub = 0;
313    for (p=0; p<num_neighbors; p++) {    for (p=0; p<num_neighbors; p++) {
314      i = send_degree[p] - j;      i = send_degree[p] - j;
315      if (i > 0){      if (i > 0){
316      k_ub ++;          k_ub ++;
317            #ifdef ESYS_MPI
318          MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],          MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
319                  A->mpi_info->msg_tag_counter+rank,                  A->mpi_info->msg_tag_counter+rank,
320                  A->mpi_info->comm,                  A->mpi_info->comm,
321                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
322      } else {          #endif
 if (MY_DEBUG) fprintf(stderr, "rank%d send_degree %d, p %d, j %d\n", rank, send_degree[p], p, j);  
323      }      }
324      j = send_degree[p];      j = send_degree[p];
325    }    }
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_4_4 %d %d\n", rank,num_neighbors, k_ub);  
326    
327      #ifdef ESYS_MPI
328    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
329                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
330                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
331    A->mpi_info->msg_tag_counter += size;    #endif
332  if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_5 %d %d %d\n", rank, len, ptr_main[len], ptr_couple[len]);    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
333    
334    #pragma omp parallel for private(i,j,k,m,p) schedule(static)    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
335    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
# Line 357  if (MY_DEBUG1) fprintf(stderr, "rank %d Line 337  if (MY_DEBUG1) fprintf(stderr, "rank %d
337      k = ptr_main[i+1];      k = ptr_main[i+1];
338      m = ptr_couple[i];      m = ptr_couple[i];
339      for (p=j; p<k; p++) {      for (p=j; p<k; p++) {
340      idx_main[p] = ptr_idx[m+p];          idx_main[p] = ptr_idx[m+p];
341      }      }
342      j = ptr_couple[i+1];      j = ptr_couple[i+1];
343      for (p=m; p<j; p++) {      for (p=m; p<j; p++) {
344      idx_couple[p] = ptr_idx[k+p];          idx_couple[p] = ptr_idx[k+p];
345      }      }
346    }    }
347    TMPMEMFREE(ptr_idx);    delete[] ptr_idx;
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_5_1\n", rank);  
348    
349    /* allocate pattern and sparsematrix for B_ext_main */    /* allocate pattern and sparsematrix for B_ext_main */
350    pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,    pattern_main.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
351                  len, num_main_cols, ptr_main, idx_main);                  len, num_main_cols, ptr_main, idx_main));
352    B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,    B->row_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
353          pattern_main, B->row_block_size, B->col_block_size,                  pattern_main, B->row_block_size, B->col_block_size,
354          FALSE);                  false));
   Paso_Pattern_free(pattern_main);  
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5_2\n", rank);  
355    
356    /* allocate pattern and sparsematrix for B_ext_couple */    /* allocate pattern and sparsematrix for B_ext_couple */
357    pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,    pattern_couple.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
358                  len, B->col_distribution->first_component[size],                  len, B->col_distribution->first_component[size],
359          ptr_couple, idx_couple);                  ptr_couple, idx_couple));
360    B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,    B->remote_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
361                  pattern_couple, B->row_block_size, B->col_block_size,                  pattern_couple, B->row_block_size, B->col_block_size,
362                  FALSE);                  false));
   Paso_Pattern_free(pattern_couple);  
   
 if (MY_DEBUG1) fprintf(stderr, "CP1_6\n");  
363    
364    /* send/receive value array */    /* send/receive value array */
365    j=0;    j=0;
# Line 393  if (MY_DEBUG1) fprintf(stderr, "CP1_6\n" Line 367  if (MY_DEBUG1) fprintf(stderr, "CP1_6\n"
367      k = recv->offsetInShared[p];      k = recv->offsetInShared[p];
368      m = recv->offsetInShared[p+1];      m = recv->offsetInShared[p+1];
369      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
370        #ifdef ESYS_MPI
371      if (i > 0)      if (i > 0)
372          MPI_Irecv(&(ptr_val[j]), i * block_size,          MPI_Irecv(&(ptr_val[j]), i * block_size,
373                  MPI_DOUBLE, recv->neighbor[p],                  MPI_DOUBLE, recv->neighbor[p],
374                  A->mpi_info->msg_tag_counter+recv->neighbor[p],                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
375                  A->mpi_info->comm,                  A->mpi_info->comm,
376                  &(A->col_coupler->mpi_requests[p]));                  &(A->col_coupler->mpi_requests[p]));
377      j += i;      #endif
378        j += (i * block_size);
379    }    }
380    
381    j=0;    j=0;
382    for (p=0; p<num_neighbors; p++) {    for (p=0; p<num_neighbors; p++) {
383      i = send_degree[p] - j;      i = send_degree[p] - j;
384        #ifdef ESYS_MPI
385      if (i > 0)      if (i > 0)
386          MPI_Issend(&(send_buf[j]), i*block_size, MPI_DOUBLE, send->neighbor[p],          MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
387                  A->mpi_info->msg_tag_counter+rank,                  A->mpi_info->msg_tag_counter+rank,
388                  A->mpi_info->comm,                  A->mpi_info->comm,
389                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
390      j = send_degree[p];      #endif
391        j = send_degree[p] ;
392    }    }
393    
394      #ifdef ESYS_MPI
395    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
396                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
397                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
398    A->mpi_info->msg_tag_counter += size;    #endif
399  if (MY_DEBUG1) fprintf(stderr, "CP1_7\n");    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
400    
401    #pragma omp parallel for private(i,j,k,m,p) schedule(static)    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
402    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
# Line 425  if (MY_DEBUG1) fprintf(stderr, "CP1_7\n" Line 404  if (MY_DEBUG1) fprintf(stderr, "CP1_7\n"
404      k = ptr_main[i+1];      k = ptr_main[i+1];
405      m = ptr_couple[i];      m = ptr_couple[i];
406      for (p=j; p<k; p++) {      for (p=j; p<k; p++) {
407      B->row_coupleBlock->val[p] = ptr_val[m+p];          memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
408      }      }
409      j = ptr_couple[i+1];      j = ptr_couple[i+1];
410      for (p=m; p<j; p++) {      for (p=m; p<j; p++) {
411      B->remote_coupleBlock->val[p] = ptr_val[k+p];          memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
412      }      }
413    }    }
414    TMPMEMFREE(ptr_val);    delete[] ptr_val;
415    
416    /* release all temp memory allocation */  
417    TMPMEMFREE(cols);      /* release all temp memory allocation */
418    TMPMEMFREE(cols_array);      delete[] cols;
419    TMPMEMFREE(recv_offset);      delete[] cols_array;
420    TMPMEMFREE(recv_degree);      delete[] recv_offset;
421    TMPMEMFREE(recv_buf);      delete[] recv_degree;
422    TMPMEMFREE(send_buf);      delete[] recv_buf;
423    TMPMEMFREE(send_offset);      delete[] send_buf;
424    TMPMEMFREE(send_degree);      delete[] send_offset;
425    TMPMEMFREE(send_idx);      delete[] send_degree;
426        delete[] send_idx;
427  }  }
428    
429  /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)  /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)
430     has the same number of rows as P->col_coupleBlock->numCols. Now, we need     has the same number of rows as P->col_coupleBlock->numCols. Now, we need
431     to copy block of data in T to neighbor processors, defined by     to copy block of data in T to neighbour processors, defined by
432      P->col_coupler->connector->recv->neighbor[k] where k is in          P->col_coupler->connector->recv->neighbor[k] where k is in
433      [0, P->col_coupler->connector->recv->numNeighbors).          [0, P->col_coupler->connector->recv->numNeighbors).
434     Rows to be copied to neighbor processor k is in the list defined by     Rows to be copied to neighbor processor k is in the list defined by
435      P->col_coupler->connector->recv->offsetInShared[k] ...          P->col_coupler->connector->recv->offsetInShared[k] ...
436      P->col_coupler->connector->recv->offsetInShared[k+1]  */          P->col_coupler->connector->recv->offsetInShared[k+1]  */
437  void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,  void Paso_Preconditioner_AMG_CopyRemoteData(paso::SystemMatrix_ptr P,
438      index_t **p_ptr, index_t **p_idx, double **p_val, index_t *global_id)          index_t **p_ptr, index_t **p_idx, double **p_val,
439            index_t *global_id, index_t block_size)
440  {  {
441    Paso_SharedComponents *send=NULL, *recv=NULL;      paso::SharedComponents_ptr send, recv;
442    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;      index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
443    index_t i, j, p, m, n, rank, size, offset;      index_t i, j, p, m, n, size;
444    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;      index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
445    index_t *ptr=*p_ptr, *idx=*p_idx;      index_t *ptr=*p_ptr, *idx=*p_idx;
446    double  *val=*p_val, *recv_val=NULL;      double  *val=*p_val, *recv_val=NULL;
447      #ifdef ESYS_MPI
448    rank = P->mpi_info->rank;      index_t rank = P->mpi_info->rank;
449    size = P->mpi_info->size;    #endif
450    send = P->col_coupler->connector->recv;  
451    recv = P->col_coupler->connector->send;      size = P->mpi_info->size;
452    send_neighbors = send->numNeighbors;      send = P->col_coupler->connector->recv;
453    recv_neighbors = recv->numNeighbors;      recv = P->col_coupler->connector->send;
454    send_rows = P->col_coupleBlock->numCols;      send_neighbors = send->numNeighbors;
455    recv_rows = recv->offsetInShared[recv_neighbors];      recv_neighbors = recv->numNeighbors;
456    offset = P->col_distribution->first_component[rank];      send_rows = P->col_coupleBlock->numCols;
457        recv_rows = recv->offsetInShared[recv_neighbors];
458    
459    send_degree = TMPMEMALLOC(send_rows, index_t);      send_degree = new index_t[send_rows];
460    recv_ptr = MEMALLOC(recv_rows + 1, index_t);      recv_ptr = new index_t[recv_rows + 1];
461    #pragma omp for schedule(static) private(i)    #pragma omp for schedule(static) private(i)
462    for (i=0; i<send_rows; i++)      for (i=0; i<send_rows; i++)
463      send_degree[i] = ptr[i+1] - ptr[i];          send_degree[i] = ptr[i+1] - ptr[i];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP5_1 srows %d rrows %d\n", rank, send_rows, recv_rows);  
464    
465    /* First, send/receive the degree */    /* First, send/receive the degree */
466    for (p=0; p<recv_neighbors; p++) { /* Receiving */    for (p=0; p<recv_neighbors; p++) { /* Receiving */
467      m = recv->offsetInShared[p];      m = recv->offsetInShared[p];
468      n = recv->offsetInShared[p+1];      n = recv->offsetInShared[p+1];
469        #ifdef ESYS_MPI
470      MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],      MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
471          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
472          P->mpi_info->comm,                  P->mpi_info->comm,
473          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
474        #endif
475    }    }
476    for (p=0; p<send_neighbors; p++) { /* Sending */    for (p=0; p<send_neighbors; p++) { /* Sending */
477      m = send->offsetInShared[p];      m = send->offsetInShared[p];
478      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
479        #ifdef ESYS_MPI
480      MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],      MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
481          P->mpi_info->msg_tag_counter + rank,                  P->mpi_info->msg_tag_counter + rank,
482          P->mpi_info->comm,                  P->mpi_info->comm,
483          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
484        #endif
485    }    }
486      #ifdef ESYS_MPI
487    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
488          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
489          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
490    P->mpi_info->msg_tag_counter += size;    #endif
491  if (MY_DEBUG) fprintf(stderr, "rank %d CP5_3 %d %d\n", rank, recv_neighbors, send_neighbors);    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
492    
493    TMPMEMFREE(send_degree);    delete[] send_degree;
494    m = Paso_Util_cumsum(recv_rows, recv_ptr);    m = Paso_Util_cumsum(recv_rows, recv_ptr);
495    recv_ptr[recv_rows] = m;    recv_ptr[recv_rows] = m;
496    recv_idx = MEMALLOC(m, index_t);    recv_idx = new index_t[m];
497    recv_val = MEMALLOC(m, double);    recv_val = new double[m * block_size];
 if (MY_DEBUG) fprintf(stderr, "rank %d receive size %d\n", rank, m);  
498    
499    /* Next, send/receive the index array */    /* Next, send/receive the index array */
500    j = 0;    j = 0;
# Line 518  if (MY_DEBUG) fprintf(stderr, "rank %d r Line 503  if (MY_DEBUG) fprintf(stderr, "rank %d r
503      n = recv->offsetInShared[p+1];      n = recv->offsetInShared[p+1];
504      i = recv_ptr[n] - recv_ptr[m];      i = recv_ptr[n] - recv_ptr[m];
505      if (i > 0) {      if (i > 0) {
506  if (MY_DEBUG) fprintf(stderr, "rank %d recv message size %d from %d (%d, %d, %d, %d, %d) \n", rank, i, recv->neighbor[p], p, m, n, recv_ptr[n], recv_ptr[m]);        #ifdef ESYS_MPI
507        MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],        MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
508          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
509          P->mpi_info->comm,                  P->mpi_info->comm,
510          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
511      } else        #endif
512  if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty recv message %d %d %d %d %d %d\n", rank, recv_neighbors, p, m, n, recv_ptr[n], recv_ptr[m]);      }
513      j += i;      j += i;
514    }    }
515    
# Line 534  if (MY_DEBUG) fprintf(stderr, "rank %d W Line 519  if (MY_DEBUG) fprintf(stderr, "rank %d W
519      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
520      i = ptr[n] - ptr[m];      i = ptr[n] - ptr[m];
521      if (i >0) {      if (i >0) {
522  if (MY_DEBUG) fprintf(stderr, "rank %d send message size %d to %d (%d, %d, %d, %d, %d) \n", rank, i, send->neighbor[p], p, m, n, ptr[n], ptr[m]);          #ifdef ESYS_MPI
523      MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],          MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
524          P->mpi_info->msg_tag_counter + rank,                  P->mpi_info->msg_tag_counter + rank,
525          P->mpi_info->comm,                  P->mpi_info->comm,
526          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
527      j += i;          #endif
528      } else          j += i;
529  if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty send message %d %d %d %d %d %d\n", rank, send_neighbors, p, m, n, ptr[n], ptr[m]);      }
530    }    }
531      #ifdef ESYS_MPI
532    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
533          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
534          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
535    P->mpi_info->msg_tag_counter += size;    #endif
536  if (MY_DEBUG) fprintf(stderr, "rank %d CP5_5\n", rank);    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
537    
538    /* Last, send/receive the data array */    /* Last, send/receive the data array */
539    j = 0;    j = 0;
# Line 555  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 541  if (MY_DEBUG) fprintf(stderr, "rank %d C
541      m = recv->offsetInShared[p];      m = recv->offsetInShared[p];
542      n = recv->offsetInShared[p+1];      n = recv->offsetInShared[p+1];
543      i = recv_ptr[n] - recv_ptr[m];      i = recv_ptr[n] - recv_ptr[m];
544        #ifdef ESYS_MPI
545      if (i > 0)      if (i > 0)
546        MPI_Irecv(&(recv_val[j]), i, MPI_DOUBLE, recv->neighbor[p],        MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
547          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
548          P->mpi_info->comm,                  P->mpi_info->comm,
549          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
550      j += i;      #endif
551        j += (i*block_size);
552    }    }
553    
554    j = 0;    j = 0;
# Line 569  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 557  if (MY_DEBUG) fprintf(stderr, "rank %d C
557      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
558      i = ptr[n] - ptr[m];      i = ptr[n] - ptr[m];
559      if (i >0) {      if (i >0) {
560      MPI_Issend(&(val[j]), i, MPI_DOUBLE, send->neighbor[p],          #ifdef ESYS_MPI
561          P->mpi_info->msg_tag_counter + rank,          MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
562          P->mpi_info->comm,                  P->mpi_info->msg_tag_counter + rank,
563          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  P->mpi_info->comm,
564      j += i;                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
565            #endif
566            j += i * block_size;
567      }      }
568    }    }
569      #ifdef ESYS_MPI
570    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
571          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
572          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
573    P->mpi_info->msg_tag_counter += size;    #endif
574  if (MY_DEBUG) fprintf(stderr, "rank %d CP5_7\n", rank);    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
575    
576    /* Clean up and return with recevied ptr, index and data arrays */    /* Clean up and return with received ptr, index and data arrays */
577    MEMFREE(ptr);    delete[] ptr;
578    MEMFREE(idx);    delete[] idx;
579    MEMFREE(val);    delete[] val;
 if (MY_DEBUG) fprintf(stderr, "rank %d len %d\n", rank, recv_ptr[recv_rows]);  
580    *p_ptr = recv_ptr;    *p_ptr = recv_ptr;
581    *p_idx = recv_idx;    *p_idx = recv_idx;
582    *p_val = recv_val;    *p_val = recv_val;
583  }  }
584    
585  Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(  paso::SystemMatrix_ptr Paso_Preconditioner_AMG_buildInterpolationOperator(
586      Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)          paso::SystemMatrix_ptr A, paso::SystemMatrix_ptr P,
587            paso::SystemMatrix_ptr R)
588  {  {
589     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
590     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;     paso::SystemMatrix_ptr out;
591     Paso_SystemMatrix *out=NULL;     paso::SystemMatrixPattern_ptr pattern;
592     Paso_SystemMatrixPattern *pattern=NULL;     paso::Distribution_ptr input_dist, output_dist;
593     Paso_Distribution *input_dist=NULL, *output_dist=NULL;     paso::Connector_ptr col_connector, row_connector;
    Paso_SharedComponents *send =NULL, *recv=NULL;  
    Paso_Connector *col_connector=NULL, *row_connector=NULL;  
    Paso_Coupler *coupler=NULL;  
    Paso_Pattern *main_pattern=NULL;  
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;  
594     const dim_t row_block_size=A->row_block_size;     const dim_t row_block_size=A->row_block_size;
595     const dim_t col_block_size=A->col_block_size;     const dim_t col_block_size=A->col_block_size;
596     const dim_t num_threads=omp_get_max_threads();     const dim_t block_size = A->block_size;
597       const dim_t P_block_size = P->block_size;
598     const double ZERO = 0.0;     const double ZERO = 0.0;
599     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
600     double RAP_val, RA_val, R_val, *temp_val=NULL;     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
601     index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;     index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
602     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
603     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
# Line 618  Paso_SystemMatrix* Paso_Preconditioner_A Line 605  Paso_SystemMatrix* Paso_Preconditioner_A
605     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
606     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
607     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
608     index_t sum, i, j, k, iptr;     index_t sum, i, j, k, iptr, irb, icb, ib;
609     index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;     index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
610     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
611     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
612     index_t row_marker_ext, *where_p=NULL;     index_t row_marker_ext, *where_p=NULL;
613     index_t **send_ptr=NULL, **send_idx=NULL;     index_t **send_ptr=NULL, **send_idx=NULL;
614     dim_t l, p, q, global_label, num_neighbors;     dim_t p, num_neighbors;
615     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
616     Esys_MPI_rank *neighbor=NULL;     Esys_MPI_rank *neighbor=NULL;
617     #ifdef ESYS_MPI     #ifdef ESYS_MPI
# Line 634  Paso_SystemMatrix* Paso_Preconditioner_A Line 621  Paso_SystemMatrix* Paso_Preconditioner_A
621       int *mpi_requests=NULL, *mpi_stati=NULL;       int *mpi_requests=NULL, *mpi_stati=NULL;
622     #endif     #endif
623    
624     if (size == 1) return NULL;  /*   if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
625     /* two sparse matrixes R_main and R_couple will be generate, as the       return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
626    
627       /* two sparse matrices R_main and R_couple will be generate, as the
628        transpose of P_main and P_col_couple, respectively. Note that,        transpose of P_main and P_col_couple, respectively. Note that,
629        R_couple is actually the row_coupleBlock of R (R=P^T) */        R_couple is actually the row_coupleBlock of R (R=P^T) */
630     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);     paso::SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
631     R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);     paso::SparseMatrix_ptr R_couple;
632       if (size > 1)
633         R_couple = P->col_coupleBlock->getTranspose();
634    
635       /* generate P_ext, i.e. portion of P that is stored on neighbor procs
636          and needed locally for triple matrix product RAP
637          to be specific, P_ext (on processor k) are group of rows in P, where
638          the list of rows from processor q is given by
639            A->col_coupler->connector->send->shared[rPtr...]
640            rPtr=A->col_coupler->connector->send->OffsetInShared[k]
641          on q.
642          P_ext is represented by two sparse matrices P_ext_main and
643          P_ext_couple */
644       Paso_Preconditioner_AMG_extendB(A, P);
645    
646       /* count the number of cols in P_ext_couple, resize the pattern of
647          sparse matrix P_ext_couple with new compressed order, and then
648          build the col id mapping from P->col_coupleBlock to
649          P_ext_couple */  
650       num_Pmain_cols = P->mainBlock->numCols;
651       if (size > 1) {
652         num_Pcouple_cols = P->col_coupleBlock->numCols;
653         num_Acouple_cols = A->col_coupleBlock->numCols;
654         sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
655       } else {
656         num_Pcouple_cols = 0;
657         num_Acouple_cols = 0;
658         sum = 0;
659       }
660       num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
661       offset = P->col_distribution->first_component[rank];
662       num_Pext_cols = 0;
663       if (P->global_id) {
664         /* first count the number of cols "num_Pext_cols" in both P_ext_couple
665            and P->col_coupleBlock */
666         iptr = 0;
667         if (num_Pcouple_cols || sum > 0) {
668            temp = new index_t[num_Pcouple_cols+sum];
669            #pragma omp parallel for lastprivate(iptr) schedule(static)
670            for (iptr=0; iptr<sum; iptr++){
671              temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
672            }
673            for (j=0; j<num_Pcouple_cols; j++, iptr++){
674              temp[iptr] = P->global_id[j];
675            }
676         }
677         if (iptr) {
678              qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
679            num_Pext_cols = 1;
680            i = temp[0];
681            for (j=1; j<iptr; j++) {
682              if (temp[j] > i) {
683                i = temp[j];
684                temp[num_Pext_cols++] = i;
685              }
686            }
687         }
688         /* resize the pattern of P_ext_couple */
689         if(num_Pext_cols){
690            global_id_P = new index_t[num_Pext_cols];
691            #pragma omp parallel for private(i) schedule(static)
692            for (i=0; i<num_Pext_cols; i++)
693              global_id_P[i] = temp[i];
694         }
695         if (num_Pcouple_cols || sum > 0)
696            delete[] temp;
697         #pragma omp parallel for private(i, where_p) schedule(static)
698         for (i=0; i<sum; i++) {
699            where_p = (index_t *)bsearch(
700                            &(P->remote_coupleBlock->pattern->index[i]),
701                            global_id_P, num_Pext_cols,
702                            sizeof(index_t), paso::comparIndex);
703            P->remote_coupleBlock->pattern->index[i] =
704                            (index_t)(where_p -global_id_P);
705         }  
706    
707         /* build the mapping */
708         if (num_Pcouple_cols) {
709            Pcouple_to_Pext = new index_t[num_Pcouple_cols];
710            iptr = 0;
711            for (i=0; i<num_Pext_cols; i++)
712              if (global_id_P[i] == P->global_id[iptr]) {
713                Pcouple_to_Pext[iptr++] = i;
714                if (iptr == num_Pcouple_cols) break;
715              }
716         }
717       }
718    
719       /* alloc and initialise the makers */
720       sum = num_Pext_cols + num_Pmain_cols;
721       P_marker = new index_t[sum];
722       A_marker = new index_t[num_A_cols];
723       #pragma omp parallel for private(i) schedule(static)
724       for (i=0; i<sum; i++) P_marker[i] = -1;
725       #pragma omp parallel for private(i) schedule(static)
726       for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
727    
728       /* Now, count the size of RAP_ext. Start with rows in R_couple */
729       sum = 0;
730       for (i_r=0; i_r<num_Pcouple_cols; i_r++){
731         row_marker = sum;
732         /* then, loop over elements in row i_r of R_couple */
733         j1_ub = R_couple->pattern->ptr[i_r+1];
734         for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
735            i1 = R_couple->pattern->index[j1];
736            /* then, loop over elements in row i1 of A->col_coupleBlock */
737            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
738            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
739              i2 = A->col_coupleBlock->pattern->index[j2];
740    
741              /* check whether entry RA[i_r, i2] has been previously visited.
742                 RAP new entry is possible only if entry RA[i_r, i2] has not
743                 been visited yet */
744              if (A_marker[i2] != i_r) {
745                /* first, mark entry RA[i_r,i2] as visited */
746                A_marker[i2] = i_r;
747    
748                /* then loop over elements in row i2 of P_ext_main */
749                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
750                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
751                    i_c = P->row_coupleBlock->pattern->index[j3];
752    
753                    /* check whether entry RAP[i_r,i_c] has been created.
754                       If not yet created, create the entry and increase
755                       the total number of elements in RAP_ext */
756                    if (P_marker[i_c] < row_marker) {
757                      P_marker[i_c] = sum;
758                      sum++;
759                    }
760                }
761    
762                /* loop over elements in row i2 of P_ext_couple, do the same */
763                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
764                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
765                    i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
766                    if (P_marker[i_c] < row_marker) {
767                      P_marker[i_c] = sum;
768                      sum++;
769                    }
770                }
771              }
772            }
773    
774            /* now loop over elements in row i1 of A->mainBlock, repeat
775               the process we have done to block A->col_coupleBlock */
776            j2_ub = A->mainBlock->pattern->ptr[i1+1];
777            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
778              i2 = A->mainBlock->pattern->index[j2];
779              if (A_marker[i2 + num_Acouple_cols] != i_r) {
780                A_marker[i2 + num_Acouple_cols] = i_r;
781                j3_ub = P->mainBlock->pattern->ptr[i2+1];
782                for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
783                    i_c = P->mainBlock->pattern->index[j3];
784                    if (P_marker[i_c] < row_marker) {
785                      P_marker[i_c] = sum;
786                      sum++;
787                    }
788                }
789                j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
790                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
791                    /* note that we need to map the column index in
792                       P->col_coupleBlock back into the column index in
793                       P_ext_couple */
794                    i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
795                    if (P_marker[i_c] < row_marker) {
796                      P_marker[i_c] = sum;
797                      sum++;
798                    }
799                }
800              }
801            }
802         }
803       }
804    
805       /* Now we have the number of non-zero elements in RAP_ext, allocate
806          PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
807       RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
808       RAP_ext_idx = new index_t[sum];
809       RAP_ext_val = new double[sum * block_size];
810       RA_val = new double[block_size];
811       RAP_val = new double[block_size];
812    
813       /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
814       sum = num_Pext_cols + num_Pmain_cols;
815       #pragma omp parallel for private(i) schedule(static)
816       for (i=0; i<sum; i++) P_marker[i] = -1;
817       #pragma omp parallel for private(i) schedule(static)
818       for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
819       sum = 0;
820       RAP_ext_ptr[0] = 0;
821       for (i_r=0; i_r<num_Pcouple_cols; i_r++){
822         row_marker = sum;
823         /* then, loop over elements in row i_r of R_couple */
824         j1_ub = R_couple->pattern->ptr[i_r+1];
825         for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
826            i1 = R_couple->pattern->index[j1];
827            R_val = &(R_couple->val[j1*P_block_size]);
828    
829            /* then, loop over elements in row i1 of A->col_coupleBlock */
830            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
831            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
832              i2 = A->col_coupleBlock->pattern->index[j2];
833              temp_val = &(A->col_coupleBlock->val[j2*block_size]);
834              for (irb=0; irb<row_block_size; irb++)
835                for (icb=0; icb<col_block_size; icb++)
836                    RA_val[irb+row_block_size*icb] = ZERO;
837              for (irb=0; irb<P_block_size; irb++) {
838                rtmp=R_val[irb];
839                for (icb=0; icb<col_block_size; icb++) {
840                    RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
841                }
842              }
843    
844              /* check whether entry RA[i_r, i2] has been previously visited.
845                 RAP new entry is possible only if entry RA[i_r, i2] has not
846                 been visited yet */
847              if (A_marker[i2] != i_r) {
848                /* first, mark entry RA[i_r,i2] as visited */
849                A_marker[i2] = i_r;
850    
851                /* then loop over elements in row i2 of P_ext_main */
852                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
853                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
854                    i_c = P->row_coupleBlock->pattern->index[j3];
855                    temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
856                    for (irb=0; irb<row_block_size; irb++)
857                      for (icb=0; icb<col_block_size; icb++)
858                        RAP_val[irb+row_block_size*icb] = ZERO;
859                    for (icb=0; icb<P_block_size; icb++) {
860                      rtmp = temp_val[icb];
861                      for (irb=0; irb<row_block_size; irb++) {
862                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
863                      }
864                    }
865    
866                    /* check whether entry RAP[i_r,i_c] has been created.
867                       If not yet created, create the entry and increase
868                       the total number of elements in RAP_ext */
869                    if (P_marker[i_c] < row_marker) {
870                      P_marker[i_c] = sum;
871                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
872                      RAP_ext_idx[sum] = i_c + offset;
873                      sum++;
874                    } else {
875                      temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
876                      for (ib=0; ib<block_size; ib++) {
877                        temp_val[ib] += RAP_val[ib];
878                      }
879                    }
880                }
881    
882                /* loop over elements in row i2 of P_ext_couple, do the same */
883                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
884                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
885                    i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
886                    temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
887                    for (irb=0; irb<row_block_size; irb++)
888                      for (icb=0; icb<col_block_size; icb++)
889                        RAP_val[irb+row_block_size*icb]=ZERO;
890                    for (icb=0; icb<P_block_size; icb++) {
891                      rtmp = temp_val[icb];
892                      for (irb=0; irb<row_block_size; irb++) {
893                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
894                      }
895                    }
896    
897                    if (P_marker[i_c] < row_marker) {
898                      P_marker[i_c] = sum;
899                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
900                      RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
901                      sum++;
902                    } else {
903                      temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
904                      for (ib=0; ib<block_size; ib++) {
905                        temp_val[ib] += RAP_val[ib];
906                      }
907                    }
908                }
909    
910              /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
911                 Only the contributions are added. */
912              } else {
913                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
914                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
915                    i_c = P->row_coupleBlock->pattern->index[j3];
916                    temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
917                    for (irb=0; irb<row_block_size; irb++)
918                      for (icb=0; icb<col_block_size; icb++)
919                        RAP_val[irb+row_block_size*icb]=ZERO;
920                    for (icb=0; icb<P_block_size; icb++) {
921                      rtmp = temp_val[icb];
922                      for (irb=0; irb<row_block_size; irb++) {
923                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
924                      }
925                    }
926    
927                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
928                    for (ib=0; ib<block_size; ib++) {
929                      temp_val[ib] += RAP_val[ib];
930                    }
931                }
932                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
933                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
934                    i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
935                    temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
936                    for (irb=0; irb<row_block_size; irb++)
937                      for (icb=0; icb<col_block_size; icb++)
938                        RAP_val[irb+row_block_size*icb]=ZERO;
939                    for (icb=0; icb<P_block_size; icb++) {
940                      rtmp = temp_val[icb];
941                      for (irb=0; irb<row_block_size; irb++) {
942                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
943                      }
944                    }
945    
946                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
947                    for (ib=0; ib<block_size; ib++) {
948                      temp_val[ib] += RAP_val[ib];
949                    }
950                }
951              }
952            }
953    
954            /* now loop over elements in row i1 of A->mainBlock, repeat
955               the process we have done to block A->col_coupleBlock */
956            j2_ub = A->mainBlock->pattern->ptr[i1+1];
957            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
958              i2 = A->mainBlock->pattern->index[j2];
959              temp_val = &(A->mainBlock->val[j2*block_size]);
960              for (irb=0; irb<row_block_size; irb++)
961                for (icb=0; icb<col_block_size; icb++)
962                    RA_val[irb+row_block_size*icb]=ZERO;
963              for (irb=0; irb<P_block_size; irb++) {
964                rtmp=R_val[irb];
965                for (icb=0; icb<col_block_size; icb++) {
966                    RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
967                }
968              }
969    
970              if (A_marker[i2 + num_Acouple_cols] != i_r) {
971                A_marker[i2 + num_Acouple_cols] = i_r;
972                j3_ub = P->mainBlock->pattern->ptr[i2+1];
973                for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
974                    i_c = P->mainBlock->pattern->index[j3];
975                    temp_val = &(P->mainBlock->val[j3*P_block_size]);
976                    for (irb=0; irb<row_block_size; irb++)
977                      for (icb=0; icb<col_block_size; icb++)
978                        RAP_val[irb+row_block_size*icb]=ZERO;
979                    for (icb=0; icb<P_block_size; icb++) {
980                      rtmp = temp_val[icb];
981                      for (irb=0; irb<row_block_size; irb++) {
982                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
983                      }
984                    }
985    
986                    if (P_marker[i_c] < row_marker) {
987                      P_marker[i_c] = sum;
988                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
989                      RAP_ext_idx[sum] = i_c + offset;
990                      sum++;
991                    } else {
992                      temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
993                      for (ib=0; ib<block_size; ib++) {
994                        temp_val[ib] += RAP_val[ib];
995                      }
996                    }
997                }
998                j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
999                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1000                    i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1001                    temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1002                    for (irb=0; irb<row_block_size; irb++)
1003                      for (icb=0; icb<col_block_size; icb++)
1004                        RAP_val[irb+row_block_size*icb]=ZERO;
1005                    for (icb=0; icb<P_block_size; icb++) {
1006                      rtmp=temp_val[icb];
1007                      for (irb=0; irb<row_block_size; irb++) {
1008                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1009                      }
1010                    }
1011    
1012                    if (P_marker[i_c] < row_marker) {
1013                      P_marker[i_c] = sum;
1014                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1015                      RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1016                      sum++;
1017                    } else {
1018                      temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1019                      for (ib=0; ib<block_size; ib++) {
1020                        temp_val[ib] += RAP_val[ib];
1021                      }
1022                    }
1023                }
1024              } else {
1025                j3_ub = P->mainBlock->pattern->ptr[i2+1];
1026                for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1027                    i_c = P->mainBlock->pattern->index[j3];
1028                    temp_val = &(P->mainBlock->val[j3*P_block_size]);
1029                    for (irb=0; irb<row_block_size; irb++)
1030                      for (icb=0; icb<col_block_size; icb++)
1031                        RAP_val[irb+row_block_size*icb]=ZERO;
1032                    for (icb=0; icb<P_block_size; icb++) {
1033                      rtmp=temp_val[icb];
1034                      for (irb=0; irb<row_block_size; irb++) {
1035                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1036                      }
1037                    }
1038    
1039                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1040                    for (ib=0; ib<block_size; ib++) {
1041                      temp_val[ib] += RAP_val[ib];
1042                    }
1043                }
1044                j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1045                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1046                    i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1047                    temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1048                    for (irb=0; irb<row_block_size; irb++)
1049                      for (icb=0; icb<col_block_size; icb++)
1050                        RAP_val[irb+row_block_size*icb]=ZERO;
1051                    for (icb=0; icb<P_block_size; icb++) {
1052                        rtmp=temp_val[icb];
1053                      for (irb=0; irb<row_block_size; irb++) {
1054                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1055                      }
1056                    }
1057    
1058                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1059                    for (ib=0; ib<block_size; ib++) {
1060                      temp_val[ib] += RAP_val[ib];
1061                    }
1062                }
1063              }
1064            }
1065         }
1066         RAP_ext_ptr[i_r+1] = sum;
1067       }
1068       delete[] P_marker;
1069       delete[] Pcouple_to_Pext;
1070    
1071       /* Now we have part of RAP[r,c] where row "r" is the list of rows
1072          which is given by the column list of P->col_coupleBlock, and
1073          column "c" is the list of columns which possibly covers the
1074          whole column range of system matrix P. This part of data is to
1075          be passed to neighbouring processors, and added to corresponding
1076          RAP[r,c] entries in the neighbouring processors */
1077       Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1078                    &RAP_ext_val, global_id_P, block_size);
1079    
1080       num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1081       sum = RAP_ext_ptr[num_RAPext_rows];
1082       num_RAPext_cols = 0;
1083       if (num_Pext_cols || sum > 0) {
1084         temp = new index_t[num_Pext_cols+sum];
1085         j1_ub = offset + num_Pmain_cols;
1086         for (i=0, iptr=0; i<sum; i++) {
1087            if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */
1088              temp[iptr++] = RAP_ext_idx[i];                  /* XXX */
1089         }
1090         for (j=0; j<num_Pext_cols; j++, iptr++){
1091            temp[iptr] = global_id_P[j];
1092         }
1093    
1094         if (iptr) {
1095              qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1096            num_RAPext_cols = 1;
1097            i = temp[0];
1098            for (j=1; j<iptr; j++) {
1099              if (temp[j] > i) {
1100                i = temp[j];
1101                temp[num_RAPext_cols++] = i;
1102              }
1103            }
1104         }
1105       }
1106    
1107       /* resize the pattern of P_ext_couple */
1108       if(num_RAPext_cols){
1109         global_id_RAP = new index_t[num_RAPext_cols];
1110         #pragma omp parallel for private(i) schedule(static)
1111         for (i=0; i<num_RAPext_cols; i++)
1112            global_id_RAP[i] = temp[i];
1113       }
1114       if (num_Pext_cols || sum > 0)
1115         delete[] temp;
1116       j1_ub = offset + num_Pmain_cols;
1117       #pragma omp parallel for private(i, where_p) schedule(static)
1118       for (i=0; i<sum; i++) {
1119         if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1120            where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1121    /*XXX*/                 num_RAPext_cols, sizeof(index_t), paso::comparIndex);
1122            RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1123         } else
1124            RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1125       }
1126    
1127       /* build the mapping */
1128       if (num_Pcouple_cols) {
1129         Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1130         iptr = 0;
1131         for (i=0; i<num_RAPext_cols; i++)
1132            if (global_id_RAP[i] == P->global_id[iptr]) {
1133              Pcouple_to_RAP[iptr++] = i;
1134              if (iptr == num_Pcouple_cols) break;
1135            }
1136       }
1137    
1138       if (num_Pext_cols) {
1139         Pext_to_RAP = new index_t[num_Pext_cols];
1140         iptr = 0;
1141         for (i=0; i<num_RAPext_cols; i++)
1142            if (global_id_RAP[i] == global_id_P[iptr]) {
1143              Pext_to_RAP[iptr++] = i;
1144              if (iptr == num_Pext_cols) break;
1145            }
1146       }
1147    
1148       if (global_id_P){
1149         delete[] global_id_P;
1150         global_id_P = NULL;
1151       }
1152    
1153       /* alloc and initialise the makers */
1154       sum = num_RAPext_cols + num_Pmain_cols;
1155       P_marker = new index_t[sum];
1156       #pragma omp parallel for private(i) schedule(static)
1157       for (i=0; i<sum; i++) P_marker[i] = -1;
1158       #pragma omp parallel for private(i) schedule(static)
1159       for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1160    
1161       /* Now, count the size of RAP. Start with rows in R_main */
1162       num_neighbors = P->col_coupler->connector->send->numNeighbors;
1163       offsetInShared = P->col_coupler->connector->send->offsetInShared;
1164       shared = P->col_coupler->connector->send->shared;
1165       i = 0;
1166       j = 0;
1167       for (i_r=0; i_r<num_Pmain_cols; i_r++){
1168         /* Mark the start of row for both main block and couple block */
1169         row_marker = i;
1170         row_marker_ext = j;
1171    
1172         /* Mark the diagonal element RAP[i_r, i_r], and other elements
1173            in RAP_ext */
1174         P_marker[i_r] = i;
1175         i++;
1176         for (j1=0; j1<num_neighbors; j1++) {
1177            for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1178              if (shared[j2] == i_r) {
1179                for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1180                  i_c = RAP_ext_idx[k];
1181                  if (i_c < num_Pmain_cols) {
1182                    if (P_marker[i_c] < row_marker) {
1183                      P_marker[i_c] = i;
1184                      i++;
1185                    }
1186                  } else {
1187                    if (P_marker[i_c] < row_marker_ext) {
1188                      P_marker[i_c] = j;
1189                      j++;
1190                    }
1191                  }
1192                }
1193                break;
1194              }
1195            }
1196         }
1197    
1198         /* then, loop over elements in row i_r of R_main */
1199         j1_ub = R_main->pattern->ptr[i_r+1];
1200         for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1201            i1 = R_main->pattern->index[j1];
1202    
1203            /* then, loop over elements in row i1 of A->col_coupleBlock */
1204            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1205            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1206              i2 = A->col_coupleBlock->pattern->index[j2];
1207    
1208              /* check whether entry RA[i_r, i2] has been previously visited.
1209                 RAP new entry is possible only if entry RA[i_r, i2] has not
1210                 been visited yet */
1211              if (A_marker[i2] != i_r) {
1212                /* first, mark entry RA[i_r,i2] as visited */
1213                A_marker[i2] = i_r;
1214    
1215                /* then loop over elements in row i2 of P_ext_main */
1216                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1217                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1218                    i_c = P->row_coupleBlock->pattern->index[j3];
1219    
1220                    /* check whether entry RAP[i_r,i_c] has been created.
1221                       If not yet created, create the entry and increase
1222                       the total number of elements in RAP_ext */
1223                    if (P_marker[i_c] < row_marker) {
1224                      P_marker[i_c] = i;
1225                      i++;
1226                    }
1227                }
1228    
1229                /* loop over elements in row i2 of P_ext_couple, do the same */
1230                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1231                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232                    i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1233                    if (P_marker[i_c] < row_marker_ext) {
1234                      P_marker[i_c] = j;
1235                      j++;
1236                    }
1237                }
1238              }
1239            }
1240    
1241            /* now loop over elements in row i1 of A->mainBlock, repeat
1242               the process we have done to block A->col_coupleBlock */
1243            j2_ub = A->mainBlock->pattern->ptr[i1+1];
1244            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1245              i2 = A->mainBlock->pattern->index[j2];
1246              if (A_marker[i2 + num_Acouple_cols] != i_r) {
1247                A_marker[i2 + num_Acouple_cols] = i_r;
1248                j3_ub = P->mainBlock->pattern->ptr[i2+1];
1249                for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1250                    i_c = P->mainBlock->pattern->index[j3];
1251                    if (P_marker[i_c] < row_marker) {
1252                      P_marker[i_c] = i;
1253                      i++;
1254                    }
1255                }
1256                j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1257                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1258                    /* note that we need to map the column index in
1259                       P->col_coupleBlock back into the column index in
1260                       P_ext_couple */
1261                    i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1262                    if (P_marker[i_c] < row_marker_ext) {
1263                      P_marker[i_c] = j;
1264                      j++;
1265                    }
1266                }
1267              }
1268            }
1269         }
1270       }
1271    
1272       /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1273          Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1274          and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1275          RAP_couple */
1276       RAP_main_ptr = new index_t[num_Pmain_cols+1];
1277       RAP_main_idx = new index_t[i];
1278       RAP_main_val = new double[i * block_size];
1279       RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1280       RAP_couple_idx = new index_t[j];
1281       RAP_couple_val = new double[j * block_size];
1282    
1283       RAP_main_ptr[num_Pmain_cols] = i;
1284       RAP_couple_ptr[num_Pmain_cols] = j;
1285    
1286       #pragma omp parallel for private(i) schedule(static)
1287       for (i=0; i<sum; i++) P_marker[i] = -1;
1288       #pragma omp parallel for private(i) schedule(static)
1289       for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1290    
1291       /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1292          in R_main */
1293       i = 0;
1294       j = 0;
1295       for (i_r=0; i_r<num_Pmain_cols; i_r++){
1296         /* Mark the start of row for both main block and couple block */
1297         row_marker = i;
1298         row_marker_ext = j;
1299         RAP_main_ptr[i_r] = row_marker;
1300         RAP_couple_ptr[i_r] = row_marker_ext;
1301    
1302         /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1303            in row i_r of RAP_ext */
1304         P_marker[i_r] = i;
1305         for (ib=0; ib<block_size; ib++)
1306           RAP_main_val[i*block_size+ib] = ZERO;
1307         RAP_main_idx[i] = i_r;
1308         i++;
1309    
1310         for (j1=0; j1<num_neighbors; j1++) {
1311            for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1312              if (shared[j2] == i_r) {
1313                for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1314                  i_c = RAP_ext_idx[k];
1315                  if (i_c < num_Pmain_cols) {
1316                    if (P_marker[i_c] < row_marker) {
1317                      P_marker[i_c] = i;
1318                      memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1319                      RAP_main_idx[i] = i_c;
1320                      i++;
1321                    } else {
1322                      t1_val = &(RAP_ext_val[k*block_size]);
1323                      t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1324                      for (ib=0; ib<block_size; ib++)
1325                        t2_val[ib] += t1_val[ib];
1326                    }
1327                  } else {
1328                    if (P_marker[i_c] < row_marker_ext) {
1329                      P_marker[i_c] = j;
1330                      memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1331                      RAP_couple_idx[j] = i_c - num_Pmain_cols;
1332                      j++;
1333                    } else {
1334                      t1_val = &(RAP_ext_val[k*block_size]);
1335                      t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1336                      for (ib=0; ib<block_size; ib++)
1337                        t2_val[ib] += t1_val[ib];
1338                    }
1339                  }
1340                }
1341                break;
1342              }
1343            }
1344         }
1345    
1346         /* then, loop over elements in row i_r of R_main */
1347         j1_ub = R_main->pattern->ptr[i_r+1];
1348         for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1349            i1 = R_main->pattern->index[j1];
1350            R_val = &(R_main->val[j1*P_block_size]);
1351    
1352            /* then, loop over elements in row i1 of A->col_coupleBlock */
1353            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1354            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1355              i2 = A->col_coupleBlock->pattern->index[j2];
1356              temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1357              for (irb=0; irb<row_block_size; irb++)
1358                for (icb=0; icb<col_block_size; icb++)
1359                    RA_val[irb+row_block_size*icb]=ZERO;
1360              for (irb=0; irb<P_block_size; irb++) {
1361                rtmp=R_val[irb];
1362                for (icb=0; icb<col_block_size; icb++) {
1363                    RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1364                }
1365              }
1366    
1367    
1368              /* check whether entry RA[i_r, i2] has been previously visited.
1369                 RAP new entry is possible only if entry RA[i_r, i2] has not
1370                 been visited yet */
1371              if (A_marker[i2] != i_r) {
1372                /* first, mark entry RA[i_r,i2] as visited */
1373                A_marker[i2] = i_r;
1374    
1375                /* then loop over elements in row i2 of P_ext_main */
1376                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1377                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1378                    i_c = P->row_coupleBlock->pattern->index[j3];
1379                    temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1380                    for (irb=0; irb<row_block_size; irb++)
1381                      for (icb=0; icb<col_block_size; icb++)
1382                        RAP_val[irb+row_block_size*icb]=ZERO;
1383                    for (icb=0; icb<P_block_size; icb++) {
1384                      rtmp=temp_val[icb];
1385                      for (irb=0; irb<row_block_size; irb++) {
1386                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1387                      }
1388                    }
1389    
1390    
1391                    /* check whether entry RAP[i_r,i_c] has been created.
1392                       If not yet created, create the entry and increase
1393                       the total number of elements in RAP_ext */
1394                    if (P_marker[i_c] < row_marker) {
1395                      P_marker[i_c] = i;
1396                      memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1397                      RAP_main_idx[i] = i_c;
1398                      i++;
1399                    } else {
1400                      temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1401                      for (ib=0; ib<block_size; ib++) {
1402                        temp_val[ib] += RAP_val[ib];
1403                      }
1404                    }
1405                }
1406    
1407                /* loop over elements in row i2 of P_ext_couple, do the same */
1408                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1409                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1410                    i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1411                    temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1412                    for (irb=0; irb<row_block_size; irb++)
1413                      for (icb=0; icb<col_block_size; icb++)
1414                        RAP_val[irb+row_block_size*icb]=ZERO;
1415                    for (icb=0; icb<P_block_size; icb++) {
1416                      rtmp=temp_val[icb];
1417                      for (irb=0; irb<row_block_size; irb++) {
1418                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1419                      }
1420                    }
1421    
1422                    if (P_marker[i_c] < row_marker_ext) {
1423                      P_marker[i_c] = j;
1424                      memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1425                      RAP_couple_idx[j] = i_c - num_Pmain_cols;
1426                      j++;
1427                    } else {
1428                      temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1429                      for (ib=0; ib<block_size; ib++) {
1430                        temp_val[ib] += RAP_val[ib];
1431                      }
1432                    }
1433                }
1434    
1435              /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1436                 Only the contributions are added. */
1437              } else {
1438                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1439                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1440                    i_c = P->row_coupleBlock->pattern->index[j3];
1441                    temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1442                    for (irb=0; irb<row_block_size; irb++)
1443                      for (icb=0; icb<col_block_size; icb++)
1444                        RAP_val[irb+row_block_size*icb]=ZERO;
1445                    for (icb=0; icb<P_block_size; icb++) {
1446                      rtmp=temp_val[icb];
1447                      for (irb=0; irb<row_block_size; irb++) {
1448                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1449                      }
1450                    }
1451    
1452                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1453                    for (ib=0; ib<block_size; ib++) {
1454                      temp_val[ib] += RAP_val[ib];
1455                    }
1456                }
1457                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1458                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1459                    i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1460                    temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1461                    for (irb=0; irb<row_block_size; irb++)
1462                      for (icb=0; icb<col_block_size; icb++)
1463                        RAP_val[irb+row_block_size*icb]=ZERO;
1464                    for (icb=0; icb<P_block_size; icb++) {
1465                      rtmp=temp_val[icb];
1466                      for (irb=0; irb<row_block_size; irb++) {
1467                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1468                      }
1469                    }
1470    
1471                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1472                    for (ib=0; ib<block_size; ib++) {
1473                      temp_val[ib] += RAP_val[ib];
1474                    }
1475                }
1476              }
1477            }
1478    
1479            /* now loop over elements in row i1 of A->mainBlock, repeat
1480               the process we have done to block A->col_coupleBlock */
1481            j2_ub = A->mainBlock->pattern->ptr[i1+1];
1482            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1483              i2 = A->mainBlock->pattern->index[j2];
1484              temp_val = &(A->mainBlock->val[j2*block_size]);
1485              for (irb=0; irb<row_block_size; irb++)
1486                for (icb=0; icb<col_block_size; icb++)
1487                    RA_val[irb+row_block_size*icb]=ZERO;
1488              for (irb=0; irb<P_block_size; irb++) {
1489                rtmp=R_val[irb];
1490                for (icb=0; icb<col_block_size; icb++) {
1491                    RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1492                }
1493              }
1494    
1495              if (A_marker[i2 + num_Acouple_cols] != i_r) {
1496                A_marker[i2 + num_Acouple_cols] = i_r;
1497                j3_ub = P->mainBlock->pattern->ptr[i2+1];
1498                for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1499                    i_c = P->mainBlock->pattern->index[j3];
1500                    temp_val = &(P->mainBlock->val[j3*P_block_size]);
1501                    for (irb=0; irb<row_block_size; irb++)
1502                      for (icb=0; icb<col_block_size; icb++)
1503                        RAP_val[irb+row_block_size*icb]=ZERO;
1504                    for (icb=0; icb<P_block_size; icb++) {
1505                      rtmp=temp_val[icb];
1506                      for (irb=0; irb<row_block_size; irb++) {
1507                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1508                      }
1509                    }
1510                    if (P_marker[i_c] < row_marker) {
1511                      P_marker[i_c] = i;
1512                      memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1513                      RAP_main_idx[i] = i_c;
1514                      i++;
1515                    } else {
1516                      temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1517                      for (ib=0; ib<block_size; ib++) {
1518                        temp_val[ib] += RAP_val[ib];
1519                      }
1520                    }
1521                }
1522                j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1523                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1524                    /* note that we need to map the column index in
1525                       P->col_coupleBlock back into the column index in
1526                       P_ext_couple */
1527                    i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1528                    temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1529                    for (irb=0; irb<row_block_size; irb++)
1530                      for (icb=0; icb<col_block_size; icb++)
1531                        RAP_val[irb+row_block_size*icb]=ZERO;
1532                    for (icb=0; icb<P_block_size; icb++) {
1533                      rtmp=temp_val[icb];
1534                      for (irb=0; irb<row_block_size; irb++) {
1535                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1536                      }
1537                    }
1538    
1539                    if (P_marker[i_c] < row_marker_ext) {
1540                      P_marker[i_c] = j;
1541                      memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1542                      RAP_couple_idx[j] = i_c - num_Pmain_cols;
1543                      j++;
1544                    } else {
1545                      temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1546                      for (ib=0; ib<block_size; ib++) {
1547                        temp_val[ib] += RAP_val[ib];
1548                      }
1549                    }
1550                }
1551    
1552              } else {
1553                j3_ub = P->mainBlock->pattern->ptr[i2+1];
1554                for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1555                    i_c = P->mainBlock->pattern->index[j3];
1556                    temp_val = &(P->mainBlock->val[j3*P_block_size]);
1557                    for (irb=0; irb<row_block_size; irb++)
1558                      for (icb=0; icb<col_block_size; icb++)
1559                        RAP_val[irb+row_block_size*icb]=ZERO;
1560                    for (icb=0; icb<P_block_size; icb++) {
1561                      rtmp=temp_val[icb];
1562                      for (irb=0; irb<row_block_size; irb++) {
1563                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1564                      }
1565                    }
1566    
1567                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1568                    for (ib=0; ib<block_size; ib++) {
1569                      temp_val[ib] += RAP_val[ib];
1570                    }
1571                }
1572                j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1573                for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1574                    i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1575                    temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1576                    for (irb=0; irb<row_block_size; irb++)
1577                      for (icb=0; icb<col_block_size; icb++)
1578                        RAP_val[irb+row_block_size*icb]=ZERO;
1579                    for (icb=0; icb<P_block_size; icb++) {
1580                      rtmp = temp_val[icb];
1581                      for (irb=0; irb<row_block_size; irb++) {
1582                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1583                      }
1584                    }
1585    
1586                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1587                    for (ib=0; ib<block_size; ib++) {
1588                      temp_val[ib] += RAP_val[ib];
1589                    }
1590                }
1591              }
1592            }
1593         }
1594    
1595         /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1596         if (i > row_marker) {
1597            offset = i - row_marker;
1598            temp = new index_t[offset];
1599            #pragma omp parallel for schedule(static) private(iptr)
1600            for (iptr=0; iptr<offset; iptr++)
1601              temp[iptr] = RAP_main_idx[row_marker+iptr];
1602            if (offset > 0) {
1603                qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1604            }
1605            temp_val = new double[offset * block_size];
1606            #pragma omp parallel for schedule(static) private(iptr,k)
1607            for (iptr=0; iptr<offset; iptr++){
1608              k = P_marker[temp[iptr]];
1609              memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1610              P_marker[temp[iptr]] = iptr + row_marker;
1611            }
1612            #pragma omp parallel for schedule(static) private(iptr)
1613            for (iptr=0; iptr<offset; iptr++){
1614              RAP_main_idx[row_marker+iptr] = temp[iptr];
1615              memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1616            }
1617            delete[] temp;
1618            delete[] temp_val;
1619         }
1620         if (j > row_marker_ext) {
1621            offset = j - row_marker_ext;
1622            temp = new index_t[offset];
1623            #pragma omp parallel for schedule(static) private(iptr)
1624            for (iptr=0; iptr<offset; iptr++)
1625              temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1626            if (offset > 0) {
1627                qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1628            }
1629            temp_val = new double[offset * block_size];
1630            #pragma omp parallel for schedule(static) private(iptr, k)
1631            for (iptr=0; iptr<offset; iptr++){
1632              k = P_marker[temp[iptr] + num_Pmain_cols];
1633              memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1634              P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1635            }
1636            #pragma omp parallel for schedule(static) private(iptr)
1637            for (iptr=0; iptr<offset; iptr++){
1638              RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1639              memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1640            }
1641            delete[] temp;
1642            delete[] temp_val;
1643         }
1644       }
1645    
1646       delete[] RA_val;
1647       delete[] RAP_val;
1648       delete[] A_marker;
1649       delete[] Pext_to_RAP;
1650       delete[] Pcouple_to_RAP;
1651       delete[] RAP_ext_ptr;
1652       delete[] RAP_ext_idx;
1653       delete[] RAP_ext_val;
1654       R_main.reset();
1655       R_couple.reset();
1656    
1657       /* Check whether there are empty columns in RAP_couple */
1658       #pragma omp parallel for schedule(static) private(i)
1659       for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1660       /* num of non-empty columns is stored in "k" */
1661       k = 0;  
1662       j = RAP_couple_ptr[num_Pmain_cols];
1663       for (i=0; i<j; i++) {
1664         i1 = RAP_couple_idx[i];
1665         if (P_marker[i1]) {
1666            P_marker[i1] = 0;
1667            k++;
1668         }
1669       }
1670    
1671       /* empty columns is found */
1672       if (k < num_RAPext_cols) {
1673         temp = new index_t[k];
1674         k = 0;
1675         for (i=0; i<num_RAPext_cols; i++)
1676            if (!P_marker[i]) {
1677              P_marker[i] = k;
1678              temp[k] = global_id_RAP[i];
1679              k++;
1680            }
1681         #pragma omp parallel for schedule(static) private(i, i1)
1682         for (i=0; i<j; i++) {
1683            i1 = RAP_couple_idx[i];
1684            RAP_couple_idx[i] = P_marker[i1];
1685         }
1686         num_RAPext_cols = k;
1687         delete[] global_id_RAP;
1688         global_id_RAP = temp;
1689       }
1690       delete[] P_marker;
1691    
1692       /******************************************************/
1693       /* Start to create the coarse level System Matrix A_c */
1694       /******************************************************/
1695       /* first, prepare the sender/receiver for the col_connector */
1696       dist = P->pattern->input_distribution->first_component;
1697       recv_len = new dim_t[size];
1698       send_len = new dim_t[size];
1699       neighbor = new Esys_MPI_rank[size];
1700       offsetInShared = new index_t[size+1];
1701       shared = new index_t[num_RAPext_cols];
1702       memset(recv_len, 0, sizeof(dim_t) * size);
1703       memset(send_len, 0, sizeof(dim_t) * size);
1704       num_neighbors = 0;
1705       offsetInShared[0] = 0;
1706       for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1707         shared[i] = i + num_Pmain_cols;
1708         if (k <= global_id_RAP[i]) {
1709            if (recv_len[j] > 0) {
1710              neighbor[num_neighbors] = j;
1711              num_neighbors ++;
1712              offsetInShared[num_neighbors] = i;
1713            }
1714            while (k <= global_id_RAP[i]) {
1715              j++;
1716              k = dist[j+1];
1717            }
1718         }
1719         recv_len[j] ++;
1720       }
1721       if (recv_len[j] > 0) {
1722         neighbor[num_neighbors] = j;
1723         num_neighbors ++;
1724         offsetInShared[num_neighbors] = i;
1725       }
1726    
1727       paso::SharedComponents_ptr recv(new paso::SharedComponents(
1728                   num_Pmain_cols, num_neighbors, neighbor, shared,
1729                   offsetInShared, 1, 0, mpi_info));
1730    
1731  if (MY_DEBUG) fprintf(stderr, "CP1\n");     #ifdef ESYS_MPI
1732       MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1733       #endif
1734    
1735       #ifdef ESYS_MPI
1736         mpi_requests=new MPI_Request[size*2];
1737         mpi_stati=new MPI_Status[size*2];
1738       #else
1739         mpi_requests=new int[size*2];
1740         mpi_stati=new int[size*2];
1741       #endif
1742       num_neighbors = 0;
1743       j = 0;
1744       offsetInShared[0] = 0;
1745       for (i=0; i<size; i++) {
1746         if (send_len[i] > 0) {
1747            neighbor[num_neighbors] = i;
1748            num_neighbors ++;
1749            j += send_len[i];
1750            offsetInShared[num_neighbors] = j;
1751         }
1752       }
1753       delete[] shared;
1754       shared = new index_t[j];
1755       for (i=0, j=0; i<num_neighbors; i++) {
1756         k = neighbor[i];
1757         #ifdef ESYS_MPI
1758         MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1759                    mpi_info->msg_tag_counter+k,
1760                    mpi_info->comm, &mpi_requests[i]);
1761         #endif
1762         j += send_len[k];
1763       }
1764       for (i=0, j=0; i<recv->numNeighbors; i++) {
1765         k = recv->neighbor[i];
1766         #ifdef ESYS_MPI
1767         MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1768                    mpi_info->msg_tag_counter+rank,
1769                    mpi_info->comm, &mpi_requests[i+num_neighbors]);
1770         #endif
1771         j += recv_len[k];
1772       }
1773       #ifdef ESYS_MPI
1774       MPI_Waitall(num_neighbors + recv->numNeighbors,
1775                    mpi_requests, mpi_stati);
1776       #endif
1777       ESYS_MPI_INC_COUNTER(*mpi_info, size);
1778    
1779       j = offsetInShared[num_neighbors];
1780       offset = dist[rank];
1781       #pragma omp parallel for schedule(static) private(i)
1782       for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1783    
1784     /* generate P_ext, i.e. portion of P that is tored on neighbor procs     paso::SharedComponents_ptr send(new paso::SharedComponents(
1785                   num_Pmain_cols, num_neighbors, neighbor, shared,
1786                   offsetInShared, 1, 0, mpi_info));
1787    
1788       col_connector.reset(new paso::Connector(send, recv));
1789       delete[] shared;
1790    
1791       /* now, create row distribution (output_distri) and col
1792          distribution (input_distribution) */
1793       input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1794       output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1795    
1796       /* then, prepare the sender/receiver for the row_connector, first, prepare
1797          the information for sender */
1798       sum = RAP_couple_ptr[num_Pmain_cols];
1799       len = new dim_t[size];
1800       send_ptr = new index_t*[size];
1801       send_idx = new index_t*[size];
1802       #pragma omp parallel for schedule(static) private(i)
1803       for (i=0; i<size; i++) {
1804         send_ptr[i] = new index_t[num_Pmain_cols];
1805         send_idx[i] = new index_t[sum];
1806         memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1807       }
1808       memset(len, 0, sizeof(dim_t) * size);
1809       recv = col_connector->recv;
1810       sum=0;
1811       for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1812         i1 = RAP_couple_ptr[i_r];
1813         i2 = RAP_couple_ptr[i_r+1];
1814         if (i2 > i1) {
1815            /* then row i_r will be in the sender of row_connector, now check
1816               how many neighbours i_r needs to be send to */
1817            for (j=i1; j<i2; j++) {
1818              i_c = RAP_couple_idx[j];
1819              /* find out the corresponding neighbor "p" of column i_c */
1820              for (p=0; p<recv->numNeighbors; p++) {
1821                if (i_c < recv->offsetInShared[p+1]) {
1822                  k = recv->neighbor[p];
1823                  if (send_ptr[k][i_r] == 0) sum++;
1824                  send_ptr[k][i_r] ++;
1825                  send_idx[k][len[k]] = global_id_RAP[i_c];
1826                  len[k] ++;
1827                  break;
1828                }
1829              }
1830            }
1831         }
1832       }
1833       if (global_id_RAP) {
1834         delete[] global_id_RAP;
1835         global_id_RAP = NULL;
1836       }
1837    
1838       /* now allocate the sender */
1839       shared = new index_t[sum];
1840       memset(send_len, 0, sizeof(dim_t) * size);
1841       num_neighbors=0;
1842       offsetInShared[0] = 0;
1843       for (p=0, k=0; p<size; p++) {
1844         for (i=0; i<num_Pmain_cols; i++) {
1845            if (send_ptr[p][i] > 0) {
1846              shared[k] = i;
1847              k++;
1848              send_ptr[p][send_len[p]] = send_ptr[p][i];
1849              send_len[p]++;
1850            }
1851         }
1852         if (k > offsetInShared[num_neighbors]) {
1853            neighbor[num_neighbors] = p;
1854            num_neighbors ++;
1855            offsetInShared[num_neighbors] = k;
1856         }
1857       }
1858       send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1859                   neighbor, shared, offsetInShared, 1, 0, mpi_info));
1860    
1861       /* send/recv number of rows will be sent from current proc
1862          recover info for the receiver of row_connector from the sender */
1863       #ifdef ESYS_MPI
1864       MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1865       #endif
1866       num_neighbors = 0;
1867       offsetInShared[0] = 0;
1868       j = 0;
1869       for (i=0; i<size; i++) {
1870         if (i != rank && recv_len[i] > 0) {
1871            neighbor[num_neighbors] = i;
1872            num_neighbors ++;
1873            j += recv_len[i];
1874            offsetInShared[num_neighbors] = j;
1875         }
1876       }
1877       delete[] shared;
1878       delete[] recv_len;
1879       shared = new index_t[j];
1880       k = offsetInShared[num_neighbors];
1881       #pragma omp parallel for schedule(static) private(i)
1882       for (i=0; i<k; i++) {
1883         shared[i] = i + num_Pmain_cols;
1884       }
1885       recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1886                   neighbor, shared, offsetInShared, 1, 0, mpi_info));
1887       row_connector.reset(new paso::Connector(send, recv));
1888       delete[] shared;
1889    
1890       /* send/recv pattern->ptr for rowCoupleBlock */
1891       num_RAPext_rows = offsetInShared[num_neighbors];
1892       row_couple_ptr = new index_t[num_RAPext_rows+1];
1893       for (p=0; p<num_neighbors; p++) {
1894         j = offsetInShared[p];
1895         i = offsetInShared[p+1];
1896         #ifdef ESYS_MPI
1897         MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1898                    mpi_info->msg_tag_counter+neighbor[p],
1899                    mpi_info->comm, &mpi_requests[p]);
1900         #endif
1901       }
1902       send = row_connector->send;
1903       for (p=0; p<send->numNeighbors; p++) {
1904         #ifdef ESYS_MPI
1905         MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1906                    MPI_INT, send->neighbor[p],
1907                    mpi_info->msg_tag_counter+rank,
1908                    mpi_info->comm, &mpi_requests[p+num_neighbors]);
1909         #endif
1910       }
1911       #ifdef ESYS_MPI
1912       MPI_Waitall(num_neighbors + send->numNeighbors,
1913            mpi_requests, mpi_stati);
1914       #endif
1915       ESYS_MPI_INC_COUNTER(*mpi_info, size);
1916       delete[] send_len;
1917    
1918       sum = 0;
1919       for (i=0; i<num_RAPext_rows; i++) {
1920         k = row_couple_ptr[i];
1921         row_couple_ptr[i] = sum;
1922         sum += k;
1923       }
1924       row_couple_ptr[num_RAPext_rows] = sum;
1925    
1926       /* send/recv pattern->index for rowCoupleBlock */
1927       k = row_couple_ptr[num_RAPext_rows];
1928       row_couple_idx = new index_t[k];
1929       for (p=0; p<num_neighbors; p++) {
1930         j1 = row_couple_ptr[offsetInShared[p]];
1931         j2 = row_couple_ptr[offsetInShared[p+1]];
1932         #ifdef ESYS_MPI
1933         MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1934                    mpi_info->msg_tag_counter+neighbor[p],
1935                    mpi_info->comm, &mpi_requests[p]);
1936         #endif
1937       }
1938       for (p=0; p<send->numNeighbors; p++) {
1939         #ifdef ESYS_MPI
1940         MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1941                    MPI_INT, send->neighbor[p],
1942                    mpi_info->msg_tag_counter+rank,
1943                    mpi_info->comm, &mpi_requests[p+num_neighbors]);
1944         #endif
1945       }
1946       #ifdef ESYS_MPI
1947       MPI_Waitall(num_neighbors + send->numNeighbors,
1948                    mpi_requests, mpi_stati);
1949       #endif
1950       ESYS_MPI_INC_COUNTER(*mpi_info, size);
1951    
1952        offset = input_dist->first_component[rank];
1953        k = row_couple_ptr[num_RAPext_rows];
1954        #pragma omp parallel for schedule(static) private(i)
1955        for (i=0; i<k; i++) {
1956            row_couple_idx[i] -= offset;
1957        }
1958        #pragma omp parallel for schedule(static) private(i)
1959        for (i=0; i<size; i++) {
1960            delete[] send_ptr[i];
1961            delete[] send_idx[i];
1962        }
1963        delete[] send_ptr;
1964        delete[] send_idx;
1965        delete[] len;
1966        delete[] offsetInShared;
1967        delete[] neighbor;
1968        delete[] mpi_requests;
1969        delete[] mpi_stati;
1970        Esys_MPIInfo_free(mpi_info);
1971    
1972        /* Now, we can create pattern for mainBlock and coupleBlock */
1973        paso::Pattern_ptr main_pattern(new paso::Pattern(MATRIX_FORMAT_DEFAULT,
1974                   num_Pmain_cols, num_Pmain_cols, RAP_main_ptr, RAP_main_idx));
1975        paso::Pattern_ptr col_couple_pattern(new paso::Pattern(
1976                   MATRIX_FORMAT_DEFAULT, num_Pmain_cols, num_RAPext_cols,
1977                   RAP_couple_ptr, RAP_couple_idx));
1978        paso::Pattern_ptr row_couple_pattern(new paso::Pattern(
1979                   MATRIX_FORMAT_DEFAULT, num_RAPext_rows, num_Pmain_cols,
1980                   row_couple_ptr, row_couple_idx));
1981    
1982        /* next, create the system matrix */
1983        pattern.reset(new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1984                    output_dist, input_dist, main_pattern, col_couple_pattern,
1985                    row_couple_pattern, col_connector, row_connector));
1986        out.reset(new paso::SystemMatrix(A->type, pattern, row_block_size,
1987                                        col_block_size, false));
1988    
1989        /* finally, fill in the data*/
1990        memcpy(out->mainBlock->val, RAP_main_val,
1991                    out->mainBlock->len* sizeof(double));
1992        memcpy(out->col_coupleBlock->val, RAP_couple_val,
1993                    out->col_coupleBlock->len * sizeof(double));
1994    
1995        delete[] RAP_main_val;
1996        delete[] RAP_couple_val;
1997        if (!Esys_noError()) {
1998            out.reset();
1999        }
2000        return out;
2001    }
2002    
2003    
2004    paso::SystemMatrix_ptr Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2005            paso::SystemMatrix_ptr A, paso::SystemMatrix_ptr P,
2006            paso::SystemMatrix_ptr R)
2007    {
2008       Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2009       paso::SystemMatrix_ptr out;
2010       paso::SystemMatrixPattern_ptr pattern;
2011       paso::Distribution_ptr input_dist, output_dist;
2012       paso::SharedComponents_ptr send, recv;
2013       paso::Connector_ptr col_connector, row_connector;
2014       const dim_t row_block_size=A->row_block_size;
2015       const dim_t col_block_size=A->col_block_size;
2016       const dim_t block_size = A->block_size;
2017       const double ZERO = 0.0;
2018       double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2019       double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2020       index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2021       index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2022       index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2023       index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2024       index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2025       index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2026       index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2027       index_t sum, i, j, k, iptr, irb, icb, ib;
2028       index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2029       index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2030       index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2031       index_t row_marker_ext, *where_p=NULL;
2032       index_t **send_ptr=NULL, **send_idx=NULL;
2033       dim_t p, num_neighbors;
2034       dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2035       Esys_MPI_rank *neighbor=NULL;
2036       #ifdef ESYS_MPI
2037         MPI_Request* mpi_requests=NULL;
2038         MPI_Status* mpi_stati=NULL;
2039       #else
2040         int *mpi_requests=NULL, *mpi_stati=NULL;
2041       #endif
2042    
2043       /* two sparse matrices R_main and R_couple will be generate, as the
2044          transpose of P_main and P_col_couple, respectively. Note that,
2045          R_couple is actually the row_coupleBlock of R (R=P^T) */
2046       paso::SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
2047       paso::SparseMatrix_ptr R_couple(P->col_coupleBlock->getTranspose());
2048    
2049       /* generate P_ext, i.e. portion of P that is stored on neighbor procs
2050        and needed locally for triple matrix product RAP        and needed locally for triple matrix product RAP
2051        to be specific, P_ext (on processor k) are group of rows in P, where        to be specific, P_ext (on processor k) are group of rows in P, where
2052        the list of rows from processor q is given by        the list of rows from processor q is given by
2053      A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
2054      rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2055        on q.        on q.
2056        P_ext is represented by two sparse matrixes P_ext_main and        P_ext is represented by two sparse matrices P_ext_main and
2057        P_ext_couple */        P_ext_couple */
2058     Paso_Preconditioner_AMG_extendB(A, P);     Paso_Preconditioner_AMG_extendB(A, P);
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP2\n", rank);  
2059    
2060     /* count the number of cols in P_ext_couple, resize the pattern of     /* count the number of cols in P_ext_couple, resize the pattern of
2061        sparse matrix P_ext_couple with new compressed order, and then        sparse matrix P_ext_couple with new compressed order, and then
# Line 668  if (MY_DEBUG1) fprintf(stderr, "rank %d Line 2070  if (MY_DEBUG1) fprintf(stderr, "rank %d
2070     num_Pext_cols = 0;     num_Pext_cols = 0;
2071     if (P->global_id) {     if (P->global_id) {
2072       /* first count the number of cols "num_Pext_cols" in both P_ext_couple       /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2073      and P->col_coupleBlock */          and P->col_coupleBlock */
2074       iptr = 0;       iptr = 0;
2075       if (num_Pcouple_cols || sum > 0) {       if (num_Pcouple_cols || sum > 0) {
2076      temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);          temp = new index_t[num_Pcouple_cols+sum];
2077      for (; iptr<sum; iptr++)          for (; iptr<sum; iptr++){
2078        temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];            temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2079      for (j=0; j<num_Pcouple_cols; j++, iptr++)          }
2080        temp[iptr] = P->global_id[j];          for (j=0; j<num_Pcouple_cols; j++, iptr++){
2081              temp[iptr] = P->global_id[j];
2082            }
2083       }       }
2084       if (iptr) {       if (iptr) {
2085      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2086        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_Pext_cols = 1;
2087      #else          i = temp[0];
2088        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
2089      #endif            if (temp[j] > i) {
2090      num_Pext_cols = 1;              i = temp[j];
2091      i = temp[0];              temp[num_Pext_cols++] = i;
2092      for (j=1; j<iptr; j++) {            }
2093        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_Pext_cols++] = i;  
       }  
     }  
2094       }       }
2095    
2096       /* resize the pattern of P_ext_couple */       /* resize the pattern of P_ext_couple */
2097       if(num_Pext_cols){       if(num_Pext_cols){
2098      global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);          global_id_P = new index_t[num_Pext_cols];
2099      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
2100        global_id_P[i] = temp[i];            global_id_P[i] = temp[i];
2101       }       }
2102       if (num_Pcouple_cols || sum > 0)       if (num_Pcouple_cols || sum > 0)
2103      TMPMEMFREE(temp);          delete[] temp;
2104       for (i=0; i<sum; i++) {       for (i=0; i<sum; i++) {
2105      where_p = (index_t *)bsearch(          where_p = (index_t *)bsearch(
2106              &(P->remote_coupleBlock->pattern->index[i]),                          &(P->remote_coupleBlock->pattern->index[i]),
2107              global_id_P, num_Pext_cols,                          global_id_P, num_Pext_cols,
2108                          sizeof(index_t), Paso_comparIndex);                          sizeof(index_t), paso::comparIndex);
2109      P->remote_coupleBlock->pattern->index[i] =          P->remote_coupleBlock->pattern->index[i] =
2110              (index_t)(where_p -global_id_P);                          (index_t)(where_p -global_id_P);
2111       }         }  
2112    
2113       /* build the mapping */       /* build the mapping */
2114       if (num_Pcouple_cols) {       if (num_Pcouple_cols) {
2115      Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);          Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2116      iptr = 0;          iptr = 0;
2117      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
2118        if (global_id_P[i] == P->global_id[iptr]) {            if (global_id_P[i] == P->global_id[iptr]) {
2119          Pcouple_to_Pext[iptr++] = i;              Pcouple_to_Pext[iptr++] = i;
2120          if (iptr == num_Pcouple_cols) break;              if (iptr == num_Pcouple_cols) break;
2121        }            }
2122       }       }
2123     }     }
2124    
2125  if (MY_DEBUG) fprintf(stderr, "CP3\n");     /* alloc and initialise the makers */
 if (MY_DEBUG && rank == 0) {  
 fprintf(stderr, "Rank %d Now Prolongation Matrix ************************\n", rank);  
 Paso_SystemMatrix_print(P);  
 fprintf(stderr, "Rank %d Now Restriction Matrix ************************\n", rank);  
 Paso_SystemMatrix_print(R);  
 }  
   
    /* alloc and initialize the makers */  
2126     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
2127     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
2128     A_marker = TMPMEMALLOC(num_A_cols, index_t);     A_marker = new index_t[num_A_cols];
2129     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2130     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
2131     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2132     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2133    
 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_1 %d\n", rank, num_Pcouple_cols);  
2134     /* Now, count the size of RAP_ext. Start with rows in R_couple */     /* Now, count the size of RAP_ext. Start with rows in R_couple */
2135     sum = 0;     sum = 0;
2136     for (i_r=0; i_r<num_Pcouple_cols; i_r++){     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2137       row_marker = sum;       row_marker = sum;
2138       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
2139       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_2 %d (%d %d)\n", rank, i_r, R_couple->pattern->ptr[i_r], j1_ub);  
2140       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2141      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
2142  if (MY_DEBUG) fprintf(stderr, "rank %d CP3_3 %d %d\n", rank, j1, i1);          /* then, loop over elements in row i1 of A->col_coupleBlock */
2143      /* then, loop over elements in row i1 of A->col_coupleBlock */          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2144      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2145  if (MY_DEBUG) fprintf(stderr, "rank %d CP3_4 %d %d %d\n", rank, j1, i1, j2_ub);            i2 = A->col_coupleBlock->pattern->index[j2];
2146      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
2147        i2 = A->col_coupleBlock->pattern->index[j2];            /* check whether entry RA[i_r, i2] has been previously visited.
2148                 RAP new entry is possible only if entry RA[i_r, i2] has not
2149        /* check whether entry RA[i_r, i2] has been previously visited.               been visited yet */
2150           RAP new entry is possible only if entry RA[i_r, i2] has not            if (A_marker[i2] != i_r) {
2151           been visited yet */              /* first, mark entry RA[i_r,i2] as visited */
2152        if (A_marker[i2] != i_r) {              A_marker[i2] = i_r;
2153          /* first, mark entry RA[i_r,i2] as visited */  
2154          A_marker[i2] = i_r;              /* then loop over elements in row i2 of P_ext_main */
2155                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2156          /* then loop over elements in row i2 of P_ext_main */              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2157          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];                  i_c = P->row_coupleBlock->pattern->index[j3];
2158          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
2159          i_c = P->row_coupleBlock->pattern->index[j3];                  /* check whether entry RAP[i_r,i_c] has been created.
2160                       If not yet created, create the entry and increase
2161          /* check whether entry RAP[i_r,i_c] has been created.                     the total number of elements in RAP_ext */
2162             If not yet created, create the entry and increase                  if (P_marker[i_c] < row_marker) {
2163             the total number of elements in RAP_ext */                    P_marker[i_c] = sum;
2164          if (P_marker[i_c] < row_marker) {                    sum++;
2165            P_marker[i_c] = sum;                  }
2166            sum++;              }
2167          }  
2168          }              /* loop over elements in row i2 of P_ext_couple, do the same */
2169                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2170          /* loop over elements in row i2 of P_ext_couple, do the same */              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2171          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2172          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  if (P_marker[i_c] < row_marker) {
2173          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                    P_marker[i_c] = sum;
2174          if (P_marker[i_c] < row_marker) {                    sum++;
2175            P_marker[i_c] = sum;                  }
2176            sum++;              }
2177          }            }
2178          }          }
2179        }  
2180      }          /* now loop over elements in row i1 of A->mainBlock, repeat
2181               the process we have done to block A->col_coupleBlock */
2182      /* now loop over elements in row i1 of A->mainBlock, repeat          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2183         the process we have done to block A->col_coupleBlock */          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2184      j2_ub = A->mainBlock->pattern->ptr[i1+1];            i2 = A->mainBlock->pattern->index[j2];
2185      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2186        i2 = A->mainBlock->pattern->index[j2];              A_marker[i2 + num_Acouple_cols] = i_r;
2187        if (A_marker[i2 + num_Acouple_cols] != i_r) {              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2188          A_marker[i2 + num_Acouple_cols] = i_r;              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2189          j3_ub = P->mainBlock->pattern->ptr[i2+1];                  i_c = P->mainBlock->pattern->index[j3];
2190          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  if (P_marker[i_c] < row_marker) {
2191          i_c = P->mainBlock->pattern->index[j3];                    P_marker[i_c] = sum;
2192          if (P_marker[i_c] < row_marker) {                    sum++;
2193            P_marker[i_c] = sum;                  }
2194            sum++;              }
2195          }              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2196          }              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2197          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];                  /* note that we need to map the column index in
2198          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                     P->col_coupleBlock back into the column index in
2199          /* note that we need to map the column index in                     P_ext_couple */
2200             P->col_coupleBlock back into the column index in                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2201             P_ext_couple */                  if (P_marker[i_c] < row_marker) {
2202          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                    P_marker[i_c] = sum;
2203  if (MY_DEBUG1 && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d-%d out of %d+%d) 5\n", rank, i_c, P->col_coupleBlock->pattern->index[j3], num_Pext_cols, num_Pmain_cols);                    sum++;
2204          if (P_marker[i_c] < row_marker) {                  }
2205            P_marker[i_c] = sum;              }
2206            sum++;            }
2207          }          }
         }  
       }  
     }  
2208       }       }
2209     }     }
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP4 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);  
2210    
2211     /* Now we have the number of non-zero elements in RAP_ext, allocate     /* Now we have the number of non-zero elements in RAP_ext, allocate
2212        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2213     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);     RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2214     RAP_ext_idx = MEMALLOC(sum, index_t);     RAP_ext_idx = new index_t[sum];
2215     RAP_ext_val = MEMALLOC(sum, double);     RAP_ext_val = new double[sum * block_size];
2216       RA_val = new double[block_size];
2217       RAP_val = new double[block_size];
2218    
2219     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
2220     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
# Line 834  if (MY_DEBUG1) fprintf(stderr, "rank %d Line 2222  if (MY_DEBUG1) fprintf(stderr, "rank %d
2222     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
2223     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2224     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_1 sum %d\n", rank, sum);  
2225     sum = 0;     sum = 0;
2226     RAP_ext_ptr[0] = 0;     RAP_ext_ptr[0] = 0;
2227     for (i_r=0; i_r<num_Pcouple_cols; i_r++){     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2228       row_marker = sum;       row_marker = sum;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_2 i_r %d sum %d\n", rank, i_r, sum);  
2229       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
2230       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
2231       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2232      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
2233      R_val = R_couple->val[j1];          R_val = &(R_couple->val[j1*block_size]);
2234    
2235      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
2236      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2237      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2238        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
2239        RA_val = R_val * A->col_coupleBlock->val[j2];            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2240              for (irb=0; irb<row_block_size; irb++) {
2241        /* check whether entry RA[i_r, i2] has been previously visited.              for (icb=0; icb<col_block_size; icb++) {
2242           RAP new entry is possible only if entry RA[i_r, i2] has not                  rtmp = ZERO;
2243           been visited yet */                  for (ib=0; ib<col_block_size; ib++) {
2244        if (A_marker[i2] != i_r) {                    rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2245          /* first, mark entry RA[i_r,i2] as visited */                  }
2246          A_marker[i2] = i_r;                  RA_val[irb+row_block_size*icb]=rtmp;
2247                }
2248          /* then loop over elements in row i2 of P_ext_main */            }
2249          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];  
2250          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {            /* check whether entry RA[i_r, i2] has been previously visited.
2251          i_c = P->row_coupleBlock->pattern->index[j3];               RAP new entry is possible only if entry RA[i_r, i2] has not
2252          RAP_val = RA_val * P->row_coupleBlock->val[j3];               been visited yet */
2253              if (A_marker[i2] != i_r) {
2254          /* check whether entry RAP[i_r,i_c] has been created.              /* first, mark entry RA[i_r,i2] as visited */
2255             If not yet created, create the entry and increase              A_marker[i2] = i_r;
2256             the total number of elements in RAP_ext */  
2257          if (P_marker[i_c] < row_marker) {              /* then loop over elements in row i2 of P_ext_main */
2258            P_marker[i_c] = sum;              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2259            RAP_ext_val[sum] = RAP_val;              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2260            RAP_ext_idx[sum] = i_c + offset;                  i_c = P->row_coupleBlock->pattern->index[j3];
2261  if (MY_DEBUG){                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2262  if (rank == 1) fprintf(stderr, "1-ext[%d (%d,%d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);                  for (irb=0; irb<row_block_size; irb++) {
2263  }                    for (icb=0; icb<col_block_size; icb++) {
2264            sum++;                      rtmp = ZERO;
2265          } else {                      for (ib=0; ib<col_block_size; ib++) {
2266            RAP_ext_val[P_marker[i_c]] += RAP_val;                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2267  if (MY_DEBUG){                      }
2268  if (rank == 1) fprintf(stderr, "1-ext[(%d,%d)]+=%f\n", i_r, i_c, RAP_val);                      RAP_val[irb+row_block_size*icb]=rtmp;
2269  }                    }
2270          }                  }
2271          }  
2272    
2273          /* loop over elements in row i2 of P_ext_couple, do the same */                  /* check whether entry RAP[i_r,i_c] has been created.
2274          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];                     If not yet created, create the entry and increase
2275          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                     the total number of elements in RAP_ext */
2276          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                  if (P_marker[i_c] < row_marker) {
2277          RAP_val = RA_val * P->remote_coupleBlock->val[j3];                    P_marker[i_c] = sum;
2278                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2279          if (P_marker[i_c] < row_marker) {                    RAP_ext_idx[sum] = i_c + offset;
2280            P_marker[i_c] = sum;                    sum++;
2281            RAP_ext_val[sum] = RAP_val;                  } else {
2282            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2283  if (MY_DEBUG){                    for (ib=0; ib<block_size; ib++) {
2284  if (rank == 1) fprintf(stderr, "ext[%d (%d,%d)]=%f (R%f, A(%d %d)%f, P%f)\n", sum, i_r, i_c, RAP_ext_val[sum], R_val, i1, i2, A->col_coupleBlock->val[j2], P->remote_coupleBlock->val[j3]);                      temp_val[ib] += RAP_val[ib];
2285  }                    }
2286            sum++;                  }
2287          } else {              }
2288            RAP_ext_val[P_marker[i_c]] += RAP_val;  
2289  if (MY_DEBUG){              /* loop over elements in row i2 of P_ext_couple, do the same */
2290  if (rank == 1) fprintf(stderr, "ext[(%d,%d)]+=%f\n", i_r, i_c, RAP_val);              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2291  }              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2292          }                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2293          }                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2294                    for (irb=0; irb<row_block_size; irb++) {
2295                      for (icb=0; icb<col_block_size; icb++) {
2296                        rtmp = ZERO;
2297                        for (ib=0; ib<col_block_size; ib++) {
2298                            rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2299                        }
2300                        RAP_val[irb+row_block_size*icb]=rtmp;
2301                      }
2302                    }
2303    
2304                    if (P_marker[i_c] < row_marker) {
2305                      P_marker[i_c] = sum;
2306                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2307                      RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2308                      sum++;
2309                    } else {
2310                      temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2311                      for (ib=0; ib<block_size; ib++) {
2312                        temp_val[ib] += RAP_val[ib];
2313                      }
2314                    }
2315                }
2316    
2317              /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2318                 Only the contributions are added. */
2319              } else {
2320                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2321                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2322                    i_c = P->row_coupleBlock->pattern->index[j3];
2323                    temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2324                    for (irb=0; irb<row_block_size; irb++) {
2325                      for (icb=0; icb<col_block_size; icb++) {
2326                        rtmp = ZERO;
2327                        for (ib=0; ib<col_block_size; ib++) {
2328                            rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2329                        }
2330                        RAP_val[irb+row_block_size*icb]=rtmp;
2331                      }
2332                    }
2333    
2334                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2335                    for (ib=0; ib<block_size; ib++) {
2336                      temp_val[ib] += RAP_val[ib];
2337                    }
2338                }
2339                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2340                for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2341                    i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2342                    temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2343                    for (irb=0; irb<row_block_size; irb++) {
2344                      for (icb=0; icb<col_block_size; icb++) {
2345                        rtmp = ZERO;
2346                        for (ib=0; ib<col_block_size; ib++) {
2347                            rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2348                        }
2349                        RAP_val[irb+row_block_size*icb]=rtmp;
2350                      }
2351                    }
2352    
2353                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2354                    for (ib=0; ib<block_size; ib++) {
2355                      temp_val[ib] += RAP_val[ib];
2356                    }
2357                }
2358              }
2359            }
2360    
2361        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.          /* now loop over elements in row i1 of A->mainBlock, repeat
2362           Only the contributions are added. */             the process we have done to block A->col_coupleBlock */
2363        } else {          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2364          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2365          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {            i2 = A->mainBlock->pattern->index[j2];
2366          i_c = P->row_coupleBlock->pattern->index[j3];            temp_val = &(A->mainBlock->val[j2*block_size]);
2367          RAP_val = RA_val * P->row_coupleBlock->val[j3];            for (irb=0; irb<row_block_size; irb++) {
2368          RAP_ext_val[P_marker[i_c]] += RAP_val;              for (icb=0; icb<col_block_size; icb++) {
2369  if (MY_DEBUG){                  rtmp = ZERO;
2370  if (rank == 1) fprintf(stderr, "Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);                  for (ib=0; ib<col_block_size; ib++) {
2371  }                    rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2372          }                  }
2373          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];                  RA_val[irb+row_block_size*icb]=rtmp;
2374          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              }
2375          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;            }
2376          RAP_val = RA_val * P->remote_coupleBlock->val[j3];  
2377          RAP_ext_val[P_marker[i_c]] += RAP_val;            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2378  if (MY_DEBUG){              A_marker[i2 + num_Acouple_cols] = i_r;
2379  if (rank == 1) fprintf(stderr, "Visited ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2380  }              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2381          }                  i_c = P->mainBlock->pattern->index[j3];
2382        }                  temp_val = &(P->mainBlock->val[j3*block_size]);
2383      }                  for (irb=0; irb<row_block_size; irb++) {
2384                      for (icb=0; icb<col_block_size; icb++) {
2385      /* now loop over elements in row i1 of A->mainBlock, repeat                      rtmp = ZERO;
2386         the process we have done to block A->col_coupleBlock */                      for (ib=0; ib<col_block_size; ib++) {
2387      j2_ub = A->mainBlock->pattern->ptr[i1+1];                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2388      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {                      }
2389        i2 = A->mainBlock->pattern->index[j2];                      RAP_val[irb+row_block_size*icb]=rtmp;
2390        RA_val = R_val * A->mainBlock->val[j2];                    }
2391        if (A_marker[i2 + num_Acouple_cols] != i_r) {                  }
2392          A_marker[i2 + num_Acouple_cols] = i_r;  
2393          j3_ub = P->mainBlock->pattern->ptr[i2+1];                  if (P_marker[i_c] < row_marker) {
2394          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                    P_marker[i_c] = sum;
2395          i_c = P->mainBlock->pattern->index[j3];                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2396          RAP_val = RA_val * P->mainBlock->val[j3];                    RAP_ext_idx[sum] = i_c + offset;
2397          if (P_marker[i_c] < row_marker) {                    sum++;
2398            P_marker[i_c] = sum;                  } else {
2399            RAP_ext_val[sum] = RAP_val;                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2400            RAP_ext_idx[sum] = i_c + offset;                    for (ib=0; ib<block_size; ib++) {
2401  if (MY_DEBUG){                      temp_val[ib] += RAP_val[ib];
2402  if (rank == 1) fprintf(stderr, "Main 1-ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);                    }
2403  }                  }
2404            sum++;              }
2405          } else {              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2406            RAP_ext_val[P_marker[i_c]] += RAP_val;              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2407  if (MY_DEBUG){                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2408  if (rank == 1) fprintf(stderr, "Main 1-ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2409  }                  for (irb=0; irb<row_block_size; irb++) {
2410          }                    for (icb=0; icb<col_block_size; icb++) {
2411          }                      rtmp = ZERO;
2412          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];                      for (ib=0; ib<col_block_size; ib++) {
2413          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2414          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                      }
2415  if (MY_DEBUG1 && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 1\n", rank, i_c, num_Pext_cols + num_Pmain_cols);                      RAP_val[irb+row_block_size*icb]=rtmp;
2416          RAP_val = RA_val * P->col_coupleBlock->val[j3];                    }
2417          if (P_marker[i_c] < row_marker) {                  }
2418            P_marker[i_c] = sum;  
2419            RAP_ext_val[sum] = RAP_val;                  if (P_marker[i_c] < row_marker) {
2420                      P_marker[i_c] = sum;
2421                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2422                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2423  if (MY_DEBUG){                    sum++;
2424  if (rank == 1) fprintf(stderr, "Main ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);                  } else {
2425  }                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2426            sum++;                    for (ib=0; ib<block_size; ib++) {
2427          } else {                      temp_val[ib] += RAP_val[ib];
2428            RAP_ext_val[P_marker[i_c]] += RAP_val;                    }
2429  if (MY_DEBUG){                  }
2430  if (rank == 1) fprintf(stderr, "Main ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);              }
2431  }            } else {
2432          }              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2433          }              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2434        } else {                  i_c = P->mainBlock->pattern->index[j3];
2435          j3_ub = P->mainBlock->pattern->ptr[i2+1];                  temp_val = &(P->mainBlock->val[j3*block_size]);
2436          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  for (irb=0; irb<row_block_size; irb++) {
2437          i_c = P->mainBlock->pattern->index[j3];                    for (icb=0; icb<col_block_size; icb++) {
2438          RAP_val = RA_val * P->mainBlock->val[j3];                      rtmp = ZERO;
2439          RAP_ext_val[P_marker[i_c]] += RAP_val;                      for (ib=0; ib<col_block_size; ib++) {
2440  if (MY_DEBUG){                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2441  if (rank == 1) fprintf(stderr, "Main Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);                      }
2442  }                      RAP_val[irb+row_block_size*icb]=rtmp;
2443          }                    }
2444          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];                  }
2445          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
2446          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2447          RAP_val = RA_val * P->col_coupleBlock->val[j3];                  for (ib=0; ib<block_size; ib++) {
2448          RAP_ext_val[P_marker[i_c]] += RAP_val;                    temp_val[ib] += RAP_val[ib];
2449  if (MY_DEBUG){                  }
2450  if (rank == 1) fprintf(stderr, "Main Visited ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);              }
2451  }              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2452          }              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2453        }                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2454      }                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2455                    for (irb=0; irb<row_block_size; irb++) {
2456                      for (icb=0; icb<col_block_size; icb++) {
2457                        rtmp = ZERO;
2458                        for (ib=0; ib<col_block_size; ib++) {
2459                            rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2460                        }
2461                        RAP_val[irb+row_block_size*icb]=rtmp;
2462                      }
2463                    }
2464    
2465                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2466                    for (ib=0; ib<block_size; ib++) {
2467                      temp_val[ib] += RAP_val[ib];
2468                    }
2469                }
2470              }
2471            }
2472       }       }
2473       RAP_ext_ptr[i_r+1] = sum;       RAP_ext_ptr[i_r+1] = sum;
2474     }     }
2475     TMPMEMFREE(P_marker);     delete[] P_marker;
2476     TMPMEMFREE(Pcouple_to_Pext);     delete[] Pcouple_to_Pext;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP5 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);  
   
 if (MY_DEBUG) {  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d RAP_ext_idx[%d] = (", rank, sum);  
 for (i=0; i<sum; i++) {  
   sprintf(str2, "%d ", RAP_ext_idx[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
2477    
2478     /* Now we have part of RAP[r,c] where row "r" is the list of rows     /* Now we have part of RAP[r,c] where row "r" is the list of rows
2479        which is given by the column list of P->col_coupleBlock, and        which is given by the column list of P->col_coupleBlock, and
2480        column "c" is the list of columns which possiblely covers the        column "c" is the list of columns which possibly covers the
2481        whole column range of systme martris P. This part of data is to        whole column range of system matrix P. This part of data is to
2482        be passed to neighboring processors, and added to corresponding        be passed to neighbouring processors, and added to corresponding
2483        RAP[r,c] entries in the neighboring processors */        RAP[r,c] entries in the neighbouring processors */
2484     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2485          &RAP_ext_val, global_id_P);                  &RAP_ext_val, global_id_P, block_size);
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP6\n", rank);  
2486    
2487     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2488     sum = RAP_ext_ptr[num_RAPext_rows];     sum = RAP_ext_ptr[num_RAPext_rows];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP6_0 %d %d (%d %d)\n", rank, sum, num_Pext_cols, num_RAPext_rows, num_Pcouple_cols);  
2489     num_RAPext_cols = 0;     num_RAPext_cols = 0;
2490     if (num_Pext_cols || sum > 0) {     if (num_Pext_cols || sum > 0) {
2491       temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);       temp = new index_t[num_Pext_cols+sum];
2492       j1_ub = offset + num_Pmain_cols;       j1_ub = offset + num_Pmain_cols;
2493       for (i=0, iptr=0; i<sum; i++) {       for (i=0, iptr=0; i<sum; i++) {
2494      if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */ {          if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */
2495        temp[iptr++] = RAP_ext_idx[i];          /* XXX */            temp[iptr++] = RAP_ext_idx[i];                  /* XXX */
2496  if (MY_DEBUG && rank ==0) fprintf(stderr, "RAP_ext_idx[%d]=%d\n", i, RAP_ext_idx[i]);       }
2497  }       for (j=0; j<num_Pext_cols; j++, iptr++){
2498            temp[iptr] = global_id_P[j];
2499       }       }
      for (j=0; j<num_Pext_cols; j++, iptr++)  
     temp[iptr] = global_id_P[j];  
2500    
2501       if (iptr) {       if (iptr) {
2502      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2503        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_RAPext_cols = 1;
2504      #else          i = temp[0];
2505        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
2506      #endif            if (temp[j] > i) {
2507      num_RAPext_cols = 1;              i = temp[j];
2508      i = temp[0];              temp[num_RAPext_cols++] = i;
2509      for (j=1; j<iptr; j++) {            }
2510        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_RAPext_cols++] = i;  
       }  
     }  
   
 if (MY_DEBUG) {  
 char *str1, *str2;  
 index_t my_sum=iptr, i;  
 str1 = TMPMEMALLOC(my_sum*20+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d temp[%d] = (", rank, my_sum);  
 for (i=0; i<my_sum; i++){  
   sprintf(str2, "%d ", temp[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
2511       }       }
2512     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP6_1 %d %d\n", rank, iptr, num_RAPext_cols);  
2513    
2514     /* resize the pattern of P_ext_couple */     /* resize the pattern of P_ext_couple */
2515     if(num_RAPext_cols){     if(num_RAPext_cols){
2516       global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);       global_id_RAP = new index_t[num_RAPext_cols];
2517       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
2518      global_id_RAP[i] = temp[i];          global_id_RAP[i] = temp[i];
2519     }     }
 if (MY_DEBUG) {  
 char *str1, *str2;  
 index_t my_sum=num_RAPext_cols, i;  
 str1 = TMPMEMALLOC(my_sum*20+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, my_sum);  
 for (i=0; i<my_sum; i++){  
   sprintf(str2, "%d ", global_id_RAP[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
2520     if (num_Pext_cols || sum > 0)     if (num_Pext_cols || sum > 0)
2521       TMPMEMFREE(temp);       delete[] temp;
2522     j1_ub = offset + num_Pmain_cols;     j1_ub = offset + num_Pmain_cols;
2523     for (i=0; i<sum; i++) {     for (i=0; i<sum; i++) {
2524       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2525      where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,          where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2526  /*XXX*/         num_RAPext_cols, sizeof(index_t), Paso_comparIndex);  /*XXX*/                 num_RAPext_cols, sizeof(index_t), paso::comparIndex);
2527      RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);          RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2528       } else       } else
2529      RAP_ext_idx[i] = RAP_ext_idx[i] - offset;          RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2530     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP7 %d\n", rank, num_Pext_cols);  
2531    
2532     /* build the mapping */     /* build the mapping */
2533     if (num_Pcouple_cols) {     if (num_Pcouple_cols) {
2534       Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);       Pcouple_to_RAP = new index_t[num_Pcouple_cols];
2535       iptr = 0;       iptr = 0;
2536       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
2537      if (global_id_RAP[i] == P->global_id[iptr]) {          if (global_id_RAP[i] == P->global_id[iptr]) {
2538        Pcouple_to_RAP[iptr++] = i;            Pcouple_to_RAP[iptr++] = i;
2539        if (iptr == num_Pcouple_cols) break;            if (iptr == num_Pcouple_cols) break;
2540      }          }
2541     }     }
2542    
2543     if (num_Pext_cols) {     if (num_Pext_cols) {
2544       Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);       Pext_to_RAP = new index_t[num_Pext_cols];
2545       iptr = 0;       iptr = 0;
2546       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
2547      if (global_id_RAP[i] == global_id_P[iptr]) {          if (global_id_RAP[i] == global_id_P[iptr]) {
2548        Pext_to_RAP[iptr++] = i;            Pext_to_RAP[iptr++] = i;
2549        if (iptr == num_Pext_cols) break;            if (iptr == num_Pext_cols) break;
2550      }          }
2551     }     }
2552    
2553     if (global_id_P){     if (global_id_P){
2554       TMPMEMFREE(global_id_P);       delete[] global_id_P;
2555       global_id_P = NULL;       global_id_P = NULL;
2556     }     }
2557    
2558     /* alloc and initialize the makers */     /* alloc and initialise the makers */
2559     sum = num_RAPext_cols + num_Pmain_cols;     sum = num_RAPext_cols + num_Pmain_cols;
2560     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
2561     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2562     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
2563     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2564     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
 if (MY_DEBUG1) fprintf(stderr, "rank %d CP8 %d\n", rank, num_Pmain_cols);  
2565    
2566     /* Now, count the size of RAP. Start with rows in R_main */     /* Now, count the size of RAP. Start with rows in R_main */
2567     num_neighbors = P->col_coupler->connector->send->numNeighbors;     num_neighbors = P->col_coupler->connector->send->numNeighbors;
# Line 1158  if (MY_DEBUG1) fprintf(stderr, "rank %d Line 2575  if (MY_DEBUG1) fprintf(stderr, "rank %d
2575       row_marker_ext = j;       row_marker_ext = j;
2576    
2577       /* Mark the diagonal element RAP[i_r, i_r], and other elements       /* Mark the diagonal element RAP[i_r, i_r], and other elements
2578      in RAP_ext */          in RAP_ext */
2579       P_marker[i_r] = i;       P_marker[i_r] = i;
2580       i++;       i++;
2581       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
2582      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2583        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
2584          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2585            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
2586            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
2587          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2588            P_marker[i_c] = i;                    P_marker[i_c] = i;
2589            i++;                    i++;
2590          }                  }
2591            } else {                } else {
2592          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
2593            P_marker[i_c] = j;                    P_marker[i_c] = j;
2594            j++;                    j++;
2595          }                  }
2596            }                }
2597          }              }
2598          break;              break;
2599        }            }
2600      }          }
2601       }       }
2602    
2603       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
2604       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
2605       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2606      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
2607    
2608      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
2609      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2610      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2611        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
2612    
2613        /* check whether entry RA[i_r, i2] has been previously visited.            /* check whether entry RA[i_r, i2] has been previously visited.
2614           RAP new entry is possible only if entry RA[i_r, i2] has not               RAP new entry is possible only if entry RA[i_r, i2] has not
2615           been visited yet */               been visited yet */
2616        if (A_marker[i2] != i_r) {            if (A_marker[i2] != i_r) {
2617          /* first, mark entry RA[i_r,i2] as visited */              /* first, mark entry RA[i_r,i2] as visited */
2618          A_marker[i2] = i_r;              A_marker[i2] = i_r;
2619    
2620          /* then loop over elements in row i2 of P_ext_main */              /* then loop over elements in row i2 of P_ext_main */
2621          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2622          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2623          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
2624    
2625          /* check whether entry RAP[i_r,i_c] has been created.                  /* check whether entry RAP[i_r,i_c] has been created.
2626             If not yet created, create the entry and increase                     If not yet created, create the entry and increase
2627             the total number of elements in RAP_ext */                     the total number of elements in RAP_ext */
2628          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2629            P_marker[i_c] = i;                    P_marker[i_c] = i;
2630            i++;                    i++;
2631          }                  }
2632          }              }
2633    
2634          /* loop over elements in row i2 of P_ext_couple, do the same */              /* loop over elements in row i2 of P_ext_couple, do the same */
2635          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2636          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2637          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2638          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
2639            P_marker[i_c] = j;                    P_marker[i_c] = j;
2640            j++;                    j++;
2641          }                  }
2642          }              }
2643        }            }
2644      }          }
2645    
2646      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
2647         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
2648      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2649      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2650        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
2651        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2652          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
2653          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2654          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2655          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
2656          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2657            P_marker[i_c] = i;                    P_marker[i_c] = i;
2658            i++;                    i++;
2659          }                  }
2660          }              }
2661          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2662          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2663          /* note that we need to map the column index in                  /* note that we need to map the column index in
2664             P->col_coupleBlock back into the column index in                     P->col_coupleBlock back into the column index in
2665             P_ext_couple */                     P_ext_couple */
2666          i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2667  if (MY_DEBUG1 && (i_c <0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 2\n", rank, i_c, sum);                  if (P_marker[i_c] < row_marker_ext) {
2668          if (P_marker[i_c] < row_marker_ext) {                    P_marker[i_c] = j;
2669            P_marker[i_c] = j;                    j++;
2670            j++;                  }
2671          }              }
2672          }            }
2673        }          }
     }  
2674       }       }
2675     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP9 main_size%d couple_size%d\n", rank, i, j);  
2676    
2677     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
2678        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2679        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2680        RAP_couple */        RAP_couple */
2681     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_main_ptr = new index_t[num_Pmain_cols+1];
2682     RAP_main_idx = MEMALLOC(i, index_t);     RAP_main_idx = new index_t[i];
2683     RAP_main_val = TMPMEMALLOC(i, double);     RAP_main_val = new double[i * block_size];
2684     RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_couple_ptr = new index_t[num_Pmain_cols+1];
2685     RAP_couple_idx = MEMALLOC(j, index_t);     RAP_couple_idx = new index_t[j];
2686     RAP_couple_val = TMPMEMALLOC(j, double);     RAP_couple_val = new double[j * block_size];
2687    
2688     RAP_main_ptr[num_Pmain_cols] = i;     RAP_main_ptr[num_Pmain_cols] = i;
2689     RAP_couple_ptr[num_Pmain_cols] = j;     RAP_couple_ptr[num_Pmain_cols] = j;
# Line 1290  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 2705  if (MY_DEBUG) fprintf(stderr, "rank %d C
2705       RAP_couple_ptr[i_r] = row_marker_ext;       RAP_couple_ptr[i_r] = row_marker_ext;
2706    
2707       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
2708      in row i_r of RAP_ext */          in row i_r of RAP_ext */
2709       P_marker[i_r] = i;       P_marker[i_r] = i;
2710       RAP_main_val[i] = ZERO;       RAP_main_val[i] = ZERO;
2711       RAP_main_idx[i] = i_r;       RAP_main_idx[i] = i_r;
2712       i++;       i++;
2713    
2714       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
2715      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2716        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
2717          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2718            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
2719            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
2720          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2721            P_marker[i_c] = i;                    P_marker[i_c] = i;
2722            RAP_main_val[i] = RAP_ext_val[k];                    memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2723  if (MY_DEBUG){                    RAP_main_idx[i] = i_c;
2724  if (rank == 0 && i == 2) fprintf(stderr, "ext %f\n", RAP_ext_val[k]);                    i++;
2725  }                  } else {
2726            RAP_main_idx[i] = i_c;                    temp_val = &(RAP_ext_val[k*block_size]);
2727            i++;                    RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2728          } else {                    for (ib=0; ib<block_size; ib++)
2729            RAP_main_val[P_marker[i_c]] += RAP_ext_val[k];                      RAP_val[ib] += temp_val[ib];
2730  if (MY_DEBUG){                  }
2731  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "i_c %d ext %f\n", i_c, RAP_ext_val[k]);                } else {
2732  }                    if (P_marker[i_c] < row_marker_ext) {
2733          }                    P_marker[i_c] = j;
2734            } else {                    memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2735          if (P_marker[i_c] < row_marker_ext) {                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
2736            P_marker[i_c] = j;                    j++;
2737            RAP_couple_val[j] = RAP_ext_val[k];                  } else {
2738            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    temp_val = &(RAP_ext_val[k*block_size]);
2739            j++;                    RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2740          } else {                    for (ib=0; ib<block_size; ib++)
2741            RAP_couple_val[P_marker[i_c]] += RAP_ext_val[k];                      RAP_val[ib] += temp_val[ib];
2742          }                  }
2743            }                }
2744          }              }
2745          break;              break;
2746        }            }
2747      }          }
2748       }       }
2749    
2750       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
2751       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
2752       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2753      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
2754      R_val = R_main->val[j1];          R_val = &(R_main->val[j1*block_size]);
2755    
2756      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
2757      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2758      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2759        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
2760        RA_val = R_val * A->col_coupleBlock->val[j2];            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2761              for (irb=0; irb<row_block_size; irb++) {
2762        /* check whether entry RA[i_r, i2] has been previously visited.              for (icb=0; icb<col_block_size; icb++) {
2763           RAP new entry is possible only if entry RA[i_r, i2] has not                  rtmp = ZERO;
2764           been visited yet */                  for (ib=0; ib<col_block_size; ib++) {
2765        if (A_marker[i2] != i_r) {                    rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2766          /* first, mark entry RA[i_r,i2] as visited */                  }
2767          A_marker[i2] = i_r;                  RA_val[irb+row_block_size*icb]=rtmp;
2768                }
2769          /* then loop over elements in row i2 of P_ext_main */            }
2770          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];  
2771          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
2772          i_c = P->row_coupleBlock->pattern->index[j3];            /* check whether entry RA[i_r, i2] has been previously visited.
2773          RAP_val = RA_val * P->row_coupleBlock->val[j3];               RAP new entry is possible only if entry RA[i_r, i2] has not
2774                 been visited yet */
2775          /* check whether entry RAP[i_r,i_c] has been created.            if (A_marker[i2] != i_r) {
2776             If not yet created, create the entry and increase              /* first, mark entry RA[i_r,i2] as visited */
2777             the total number of elements in RAP_ext */              A_marker[i2] = i_r;
2778          if (P_marker[i_c] < row_marker) {  
2779            P_marker[i_c] = i;              /* then loop over elements in row i2 of P_ext_main */
2780            RAP_main_val[i] = RAP_val;              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2781  if (MY_DEBUG){              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2782  if (rank == 0 && i == 2) fprintf(stderr, "R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);                  i_c = P->row_coupleBlock->pattern->index[j3];
2783  }                    temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2784            RAP_main_idx[i] = i_c;                  for (irb=0; irb<row_block_size; irb++) {
2785            i++;                    for (icb=0; icb<col_block_size; icb++) {
2786          } else {                      rtmp = ZERO;
2787            RAP_main_val[P_marker[i_c]] += RAP_val;                      for (ib=0; ib<col_block_size; ib++) {
2788  if (MY_DEBUG){                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2789  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1 , R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);                      }
2790  }                      RAP_val[irb+row_block_size*icb]=rtmp;
2791          }                    }
2792          }                  }
2793    
2794          /* loop over elements in row i2 of P_ext_couple, do the same */  
2795          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];                  /* check whether entry RAP[i_r,i_c] has been created.
2796          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                     If not yet created, create the entry and increase
2797          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                     the total number of elements in RAP_ext */
2798          RAP_val = RA_val * P->remote_coupleBlock->val[j3];                  if (P_marker[i_c] < row_marker) {
2799          if (P_marker[i_c] < row_marker_ext) {                    P_marker[i_c] = i;
2800            P_marker[i_c] = j;                    memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2801            RAP_couple_val[j] = RAP_val;                    RAP_main_idx[i] = i_c;
2802            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    i++;
2803            j++;                  } else {
2804          } else {                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2805            RAP_couple_val[P_marker[i_c]] += RAP_val;                    for (ib=0; ib<block_size; ib++) {
2806          }                      temp_val[ib] += RAP_val[ib];
2807          }                    }
2808                    }
2809        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.              }
2810           Only the contributions are added. */  
2811        } else {              /* loop over elements in row i2 of P_ext_couple, do the same */
2812          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2813          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2814          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2815          RAP_val = RA_val * P->row_coupleBlock->val[j3];                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2816          RAP_main_val[P_marker[i_c]] += RAP_val;                  for (irb=0; irb<row_block_size; irb++) {
2817  if (MY_DEBUG){                    for (icb=0; icb<col_block_size; icb++) {
2818  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);                      rtmp = ZERO;
2819  }                      for (ib=0; ib<col_block_size; ib++) {
2820          }                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2821          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];                      }
2822          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                      RAP_val[irb+row_block_size*icb]=rtmp;
2823          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                    }
2824          RAP_val = RA_val * P->remote_coupleBlock->val[j3];                  }
2825          RAP_couple_val[P_marker[i_c]] += RAP_val;  
2826          }                  if (P_marker[i_c] < row_marker_ext) {
2827        }                    P_marker[i_c] = j;
2828      }                    memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2829                      RAP_couple_idx[j] = i_c - num_Pmain_cols;
2830      /* now loop over elements in row i1 of A->mainBlock, repeat                    j++;
2831         the process we have done to block A->col_coupleBlock */                  } else {
2832      j2_ub = A->mainBlock->pattern->ptr[i1+1];                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2833      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {                    for (ib=0; ib<block_size; ib++) {
2834        i2 = A->mainBlock->pattern->index[j2];                      temp_val[ib] += RAP_val[ib];
2835        RA_val = R_val * A->mainBlock->val[j2];                    }
2836        if (A_marker[i2 + num_Acouple_cols] != i_r) {                  }
2837          A_marker[i2 + num_Acouple_cols] = i_r;              }
2838          j3_ub = P->mainBlock->pattern->ptr[i2+1];  
2839          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {            /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2840          i_c = P->mainBlock->pattern->index[j3];               Only the contributions are added. */
2841          RAP_val = RA_val * P->mainBlock->val[j3];            } else {
2842          if (P_marker[i_c] < row_marker) {              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2843            P_marker[i_c] = i;              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2844            RAP_main_val[i] = RAP_val;                  i_c = P->row_coupleBlock->pattern->index[j3];
2845  if (MY_DEBUG){                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2846  if (rank == 0 && i == 2) fprintf(stderr, "Main R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);                  for (irb=0; irb<row_block_size; irb++) {
2847  }                    for (icb=0; icb<col_block_size; icb++) {
2848            RAP_main_idx[i] = i_c;                      rtmp = ZERO;
2849            i++;                      for (ib=0; ib<col_block_size; ib++) {
2850          } else {                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2851            RAP_main_val[P_marker[i_c]] += RAP_val;                      }
2852  if (MY_DEBUG){                        RAP_val[irb+row_block_size*icb]=rtmp;
2853  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);                    }
2854  }                  }
2855          }  
2856          }                  temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2857          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];                  for (ib=0; ib<block_size; ib++) {
2858          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                    temp_val[ib] += RAP_val[ib];
2859          /* note that we need to map the column index in                  }
2860             P->col_coupleBlock back into the column index in              }
2861             P_ext_couple */              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2862          i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2863  if (MY_DEBUG1 && (i_c < 0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d--%d,%d) 3\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2864          RAP_val = RA_val * P->col_coupleBlock->val[j3];                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2865          if (P_marker[i_c] < row_marker_ext) {                  for (irb=0; irb<row_block_size; irb++) {
2866            P_marker[i_c] = j;                    for (icb=0; icb<col_block_size; icb++) {
2867            RAP_couple_val[j] = RAP_val;                      rtmp = ZERO;
2868            RAP_couple_idx[j] = i_c - num_Pmain_cols;                      for (ib=0; ib<col_block_size; ib++) {
2869            j++;                          rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2870          } else {                      }
2871            RAP_couple_val[P_marker[i_c]] += RAP_val;                      RAP_val[irb+row_block_size*icb]=rtmp;
2872          }                    }
2873          }                  }
2874    
2875        } else {                  temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2876          j3_ub = P->mainBlock->pattern->ptr[i2+1];                  for (ib=0; ib<block_size; ib++) {
2877          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                    temp_val[ib] += RAP_val[ib];
2878          i_c = P->mainBlock->pattern->index[j3];                  }
2879          RAP_val = RA_val * P->mainBlock->val[j3];              }
2880          RAP_main_val[P_marker[i_c]] += RAP_val;            }
2881  if (MY_DEBUG){          }
2882  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main Visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);  
2883  }          /* now loop over elements in row i1 of A->mainBlock, repeat
2884          }             the process we have done to block A->col_coupleBlock */
2885          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2886          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2887          i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;            i2 = A->mainBlock->pattern->index[j2];
2888  if (MY_DEBUG1 && i_c >= sum) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d -- %d,%d) 4\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);            temp_val = &(A->mainBlock->val[j2*block_size]);
2889          RAP_val = RA_val * P->col_coupleBlock->val[j3];            for (irb=0; irb<row_block_size; irb++) {
2890          RAP_couple_val[P_marker[i_c]] += RAP_val;              for (icb=0; icb<col_block_size; icb++) {
2891          }                  rtmp = ZERO;
2892        }                  for (ib=0; ib<col_block_size; ib++) {