/[escript]/trunk/paso/src/AMG_Interpolation.cpp
ViewVC logotype

Diff of /trunk/paso/src/AMG_Interpolation.cpp

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/amg_from_3530/paso/src/AMG_Interpolation.c revision 3817 by lgao, Fri Feb 10 03:43:35 2012 UTC trunk/paso/src/AMG_Interpolation.cpp revision 4836 by caltinay, Mon Apr 7 05:51:55 2014 UTC
# Line 1  Line 1 
1  //  /*****************************************************************************
 /*******************************************************  
2  *  *
3  * Copyright (c) 2003-2011 by University of Queensland  * Copyright (c) 2003-2014 by University of Queensland
4  * Earth Systems Science Computational Center (ESSCC)  * http://www.uq.edu.au
 * http://www.uq.edu.au/esscc  
5  *  *
6  * Primary Business: Queensland, Australia  * Primary Business: Queensland, Australia
7  * Licensed under the Open Software License version 3.0  * Licensed under the Open Software License version 3.0
8  * http://www.opensource.org/licenses/osl-3.0.php  * http://www.opensource.org/licenses/osl-3.0.php
9  *  *
10  *******************************************************/  * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11    * Development 2012-2013 by School of Earth Sciences
12    * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13    *
14    *****************************************************************************/
15    
16    
17  /**************************************************************/  /****************************************************************************/
18    
19  /* Paso: defines AMG Interpolation                            */  /* Paso: defines AMG Interpolation                            */
20    
21  /**************************************************************/  /****************************************************************************/
22    
23  /* Author: l.gao@uq.edu.au                                    */  /* Author: l.gao@uq.edu.au                                    */
24    
25  /**************************************************************/  /****************************************************************************/
26    
27  #include "Paso.h"  #include "Paso.h"
28  #include "SparseMatrix.h"  #include "SparseMatrix.h"
29  #include "PasoUtil.h"  #include "PasoUtil.h"
30  #include "Preconditioner.h"  #include "Preconditioner.h"
31    
32  /**************************************************************  /****************************************************************************
33    
34      Methods nessecary for AMG preconditioner      Methods necessary for AMG preconditioner
35    
36      construct n_C x n_C interpolation operator A_C from matrix A      construct n_C x n_C interpolation operator A_C from matrix A
37      and prolongation matrix P.      and prolongation matrix P.
38    
39      The coarsening operator A_C is defined by RAP where R=P^T.      The coarsening operator A_C is defined by RAP where R=P^T.
40    
41  ***************************************************************/  *****************************************************************************/
   
 #define MY_DEBUG 0  
 #define MY_DEBUG1 1  
42    
43  /* Extend system matrix B with extra two sparse matrixes:  /* Extend system matrix B with extra two sparse matrices:
44      B_ext_main and B_ext_couple          B_ext_main and B_ext_couple
45     The combination of this two sparse matrixes represents       The combination of this two sparse matrices represents  
46     the portion of B that is stored on neighbor procs and     the portion of B that is stored on neighbour procs and
47     needed locally for triple matrix product (B^T) A B.     needed locally for triple matrix product (B^T) A B.
48    
49     FOR NOW, we assume that the system matrix B has a NULL     FOR NOW, we assume that the system matrix B has a NULL
# Line 58  Line 57 
57          A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
58          rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
59     on q. */     on q. */
60  void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)  void Paso_Preconditioner_AMG_extendB(paso::SystemMatrix_ptr A,
61                                         paso::SystemMatrix_ptr B)
62  {  {
63    Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;      paso::Pattern_ptr pattern_main, pattern_couple;
64    Paso_Coupler *coupler=NULL;      paso::Coupler_ptr coupler;
65    Paso_SharedComponents *send=NULL, *recv=NULL;      paso::SharedComponents_ptr send, recv;
66    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;      double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
67    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;      index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
68    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;      index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
69    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;      index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
70    index_t *idx_m=NULL, *idx_c=NULL;      index_t *idx_m=NULL, *idx_c=NULL;
71    index_t i, j, k, m, n, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;      index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
72    index_t offset, len, block_size, block_size_size, max_num_cols;      index_t offset, len, block_size, block_size_size, max_num_cols;
73    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;      index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
74    dim_t *recv_degree=NULL, *send_degree=NULL;      dim_t *recv_degree=NULL, *send_degree=NULL;
75    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;      dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
76    
77    if (size == 1) return;      if (size == 1) return;
78    
79    if (B->row_coupleBlock) {      B->row_coupleBlock.reset();
80      Paso_SparseMatrix_free(B->row_coupleBlock);  
81      B->row_coupleBlock = NULL;      if (B->remote_coupleBlock.get()) {
82    }          Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
83            return;
84    if (B->row_coupleBlock || B->remote_coupleBlock) {      }
     Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");  
     return;  
   }  
   
 if (MY_DEBUG) fprintf(stderr, "CP1_1\n");  
85    
86    /* sending/receiving unknown's global ID */      /* sending/receiving unknown's global ID */
87    num_main_cols = B->mainBlock->numCols;      num_main_cols = B->mainBlock->numCols;
88    cols = TMPMEMALLOC(num_main_cols, double);      cols = new double[num_main_cols];
89    offset = B->col_distribution->first_component[rank];      offset = B->col_distribution->first_component[rank];
90    #pragma omp parallel for private(i) schedule(static)      #pragma omp parallel for private(i) schedule(static)
91    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;      for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
92    if (B->global_id == NULL) {      if (B->global_id == NULL) {
93      coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);          coupler.reset(new paso::Coupler(B->col_coupler->connector, 1));
94      Paso_Coupler_startCollect(coupler, cols);          coupler->startCollect(cols);
95    }      }
96    
97    recv_buf = TMPMEMALLOC(size, index_t);      recv_buf = new index_t[size];
98    recv_degree = TMPMEMALLOC(size, dim_t);      recv_degree = new dim_t[size];
99    recv_offset = TMPMEMALLOC(size+1, index_t);      recv_offset = new index_t[size+1];
100    #pragma omp parallel for private(i) schedule(static)    #pragma omp parallel for private(i) schedule(static)
101    for (i=0; i<size; i++){      for (i=0; i<size; i++){
102      recv_buf[i] = 0;          recv_buf[i] = 0;
103      recv_degree[i] = 1;          recv_degree[i] = 1;
104      recv_offset[i] = i;          recv_offset[i] = i;
105    }      }
106    
107    block_size = B->block_size;    block_size = B->block_size;
108    block_size_size = block_size * sizeof(double);    block_size_size = block_size * sizeof(double);
# Line 117  if (MY_DEBUG) fprintf(stderr, "CP1_1\n") Line 112  if (MY_DEBUG) fprintf(stderr, "CP1_1\n")
112    num_neighbors = send->numNeighbors;    num_neighbors = send->numNeighbors;
113    p = send->offsetInShared[num_neighbors];    p = send->offsetInShared[num_neighbors];
114    len = p * B->col_distribution->first_component[size];    len = p * B->col_distribution->first_component[size];
115    send_buf = TMPMEMALLOC(len * block_size, double);    send_buf = new double[len * block_size];
116    send_idx = TMPMEMALLOC(len, index_t);    send_idx = new index_t[len];
117    send_offset = TMPMEMALLOC((p+1)*2, index_t);    send_offset = new index_t[(p+1)*2];
118    send_degree = TMPMEMALLOC(num_neighbors, dim_t);    send_degree = new dim_t[num_neighbors];
119    i = num_main_cols + num_couple_cols;    i = num_main_cols + num_couple_cols;
120    send_m = TMPMEMALLOC(i * block_size, double);    send_m = new double[i * block_size];
121    send_c = TMPMEMALLOC(i * block_size, double);    send_c = new double[i * block_size];
122    idx_m = TMPMEMALLOC(i, index_t);    idx_m = new index_t[i];
123    idx_c = TMPMEMALLOC(i, index_t);    idx_c = new index_t[i];
124    
125    /* waiting for receiving unknown's global ID */    /* waiting for receiving unknown's global ID */
126    if (B->global_id == NULL) {    if (B->global_id == NULL) {
127      Paso_Coupler_finishCollect(coupler);        coupler->finishCollect();
128      global_id = MEMALLOC(num_couple_cols, index_t);      global_id = new index_t[num_couple_cols];
129      #pragma omp parallel for private(i) schedule(static)      #pragma omp parallel for private(i) schedule(static)
130      for (i=0; i<num_couple_cols; ++i)      for (i=0; i<num_couple_cols; ++i)
131          global_id[i] = coupler->recv_buffer[i];          global_id[i] = coupler->recv_buffer[i];
132      B->global_id = global_id;      B->global_id = global_id;
     Paso_Coupler_free(coupler);  
133    } else    } else
134      global_id = B->global_id;      global_id = B->global_id;
135    
136    /* distribute the number of cols in current col_coupleBlock to all ranks */    /* distribute the number of cols in current col_coupleBlock to all ranks */
137      #ifdef ESYS_MPI
138    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
139      #endif
 if (MY_DEBUG) fprintf(stderr, "CP1_2\n");  
140    
141    /* distribute global_ids of cols to be considered to all ranks*/    /* distribute global_ids of cols to be considered to all ranks*/
142    len = 0;    len = 0;
# Line 152  if (MY_DEBUG) fprintf(stderr, "CP1_2\n") Line 146  if (MY_DEBUG) fprintf(stderr, "CP1_2\n")
146      recv_offset[i] = len;      recv_offset[i] = len;
147      len += recv_buf[i];      len += recv_buf[i];
148      if (max_num_cols < recv_buf[i])      if (max_num_cols < recv_buf[i])
149      max_num_cols = recv_buf[i];          max_num_cols = recv_buf[i];
150    }    }
151    recv_offset[size] = len;    recv_offset[size] = len;
152    cols_array = TMPMEMALLOC(len, index_t);    cols_array = new index_t[len];
153      #ifdef ESYS_MPI
154    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
155      #endif
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_3\n", rank);  
 if (MY_DEBUG) {  
     int i=0;  
     char hostname[256];  
     gethostname(hostname, sizeof(hostname));  
     fprintf(stderr, "rank %d PID %d on %s ready for attach\n", A->mpi_info->rank, getpid(), hostname);  
     if (rank == 1) {  
       fflush(stdout);  
       while (0 == i)  
         sleep(5);  
     }  
 }  
156    
157    /* first, prepare the ptr_ptr to be received */    /* first, prepare the ptr_ptr to be received */
158    q = recv->numNeighbors;    q = recv->numNeighbors;
159    len = recv->offsetInShared[q];    len = recv->offsetInShared[q];
160    ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);    ptr_ptr = new index_t[(len+1) * 2];
161    for (p=0; p<q; p++) {    for (p=0; p<q; p++) {
162      row = recv->offsetInShared[p];      row = recv->offsetInShared[p];
163      m = recv->offsetInShared[p + 1];      m = recv->offsetInShared[p + 1];
164        #ifdef ESYS_MPI
165      MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],      MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
166                  A->mpi_info->msg_tag_counter+recv->neighbor[p],                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
167                  A->mpi_info->comm,                  A->mpi_info->comm,
168                  &(A->col_coupler->mpi_requests[p]));                  &(A->col_coupler->mpi_requests[p]));
169  if(MY_DEBUG) fprintf(stderr, "rank %d receive %d from %d tag %d (numNeighbors %d)\n", rank, 2*(m-row), recv->neighbor[p], A->mpi_info->msg_tag_counter+recv->neighbor[p], q);      #endif
170    }    }
171    
 if (MY_DEBUG && rank == 1){  
 //Paso_SystemMatrix_print(B);  
 char *str1, *str2;  
 int sum, rank, i, iPtr, ib, block_size;  
 sum = B->col_coupleBlock->numCols*2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "B-col-coupleB: %d rows\n-----------\n", sum);  
 for (i=23; i<24; i++) {  
   sprintf(str2, "Row %d: ",i);  
   strcat(str1, str2);  
   for (iPtr =B->col_coupleBlock->pattern->ptr[i]; iPtr<B->col_coupleBlock->pattern->ptr[i+1]; ++iPtr) {  
     sprintf(str1, "%s (%d %d %f %f", str1, B->global_id[B->col_coupleBlock->pattern->index[iPtr]], iPtr, B->col_coupleBlock->val[iPtr*2], B->col_coupleBlock->val[iPtr*2+1]);  
     for (ib = 0; ib<block_size; ib++) {  
 //      sprintf(str1, "%s %f", str1, B->col_coupleBlock->val[iPtr*block_size+ib]);  
     }  
     sprintf(str1, "%s) ", str1);  
   }  
   sprintf(str1, "%s\n", str1);  
 }  
 fprintf(stderr, "%s\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
172    /* now prepare the rows to be sent (the degree, the offset and the data) */    /* now prepare the rows to be sent (the degree, the offset and the data) */
173    len = 0;    len = 0;
174    i0 = 0;    i0 = 0;
# Line 218  TMPMEMFREE(str2); Line 178  TMPMEMFREE(str2);
178      m_lb = B->col_distribution->first_component[neighbor];      m_lb = B->col_distribution->first_component[neighbor];
179      m_ub = B->col_distribution->first_component[neighbor + 1];      m_ub = B->col_distribution->first_component[neighbor + 1];
180      j_ub = send->offsetInShared[p + 1];      j_ub = send->offsetInShared[p + 1];
 if (MY_DEBUG1 && rank == 1 && p == 0) fprintf(stderr, "rank%d neighbor %d m_lb %d m_ub %d offset %d\n", rank, neighbor, m_lb, m_ub, offset);  
181      for (j=send->offsetInShared[p]; j<j_ub; j++) {      for (j=send->offsetInShared[p]; j<j_ub; j++) {
182      row = send->shared[j];          row = send->shared[j];
183      l_m = 0;          l_m = 0;
184      l_c = 0;          l_c = 0;
185      k_ub = B->col_coupleBlock->pattern->ptr[row + 1];          k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
186      k_lb = B->col_coupleBlock->pattern->ptr[row];          k_lb = B->col_coupleBlock->pattern->ptr[row];
187    
188      /* check part of col_coupleBlock for data to be passed @row */          /* check part of col_coupleBlock for data to be passed @row */
189  //  q = B->mainBlock->pattern->index[B->mainBlock->pattern->ptr[row]] + offset;          for (k=k_lb; k<k_ub; k++) {
     for (k=k_lb; k<k_ub; k++) {  
       m = global_id[B->col_coupleBlock->pattern->index[k]];  
 //    if (m > q) break;  
       if (m > offset) break;  
 if (MY_DEBUG) fprintf(stderr, "rank%d (1) row %d m %d k %d\n", rank, row, m, B->col_coupleBlock->pattern->index[k]);  
       if (m>= m_lb && m < m_ub) {  
         /* data to be passed to sparse matrix B_ext_main */  
         idx_m[l_m] = m - m_lb;  
         memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);  
 if (MY_DEBUG && rank == 1 && p == 0 && j == 23)  
 fprintf(stderr, "[1] l_m %d idx %d val %f %f k%d row%d\n", l_m, m - m_lb, B->col_coupleBlock->val[block_size*k], send_m[l_m*block_size+1], k, row);  
         l_m++;  
       } else {  
         /* data to be passed to sparse matrix B_ext_couple */  
         idx_c[l_c] = m;  
 if (MY_DEBUG) fprintf(stderr, "rank%d (1) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);  
         memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);  
         l_c++;  
       }  
     }  
     k_lb = k;  
   
     /* check mainBlock for data to be passed @row to sparse  
        matrix B_ext_couple */  
     k_ub = B->mainBlock->pattern->ptr[row + 1];  
     k = B->mainBlock->pattern->ptr[row];  
     memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));  
     for (; k<k_ub; k++) {  
       m = B->mainBlock->pattern->index[k] + offset;  
 if (MY_DEBUG) fprintf(stderr, "rank%d (2) row %d m %d k %d\n", rank, row, m, B->mainBlock->pattern->index[k]);  
 if (MY_DEBUG) fprintf(stderr, "rank%d (2) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);  
       idx_c[l_c] = m;  
       l_c++;  
     }  
   
     /* check the rest part of col_coupleBlock for data to  
        be passed @row to sparse matrix B_ext_couple */  
     k = k_lb;  
     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];  
     for (k=k_lb; k<k_ub; k++) {  
190            m = global_id[B->col_coupleBlock->pattern->index[k]];            m = global_id[B->col_coupleBlock->pattern->index[k]];
191  if (MY_DEBUG) fprintf(stderr, "rank%d (3) row %d m %d k %d\n", rank, row, m, B->col_coupleBlock->pattern->index[k]);            if (m > offset) break;
192        if (m>= m_lb && m < m_ub) {            if (m>= m_lb && m < m_ub) {
193          /* data to be passed to sparse matrix B_ext_main */              /* data to be passed to sparse matrix B_ext_main */
194          idx_m[l_m] = m - m_lb;              idx_m[l_m] = m - m_lb;
195          memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);              memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
196  if (MY_DEBUG && rank == 1 && p == 0 && j == 23)              l_m++;
197  fprintf(stderr, "l_m %d idx %d val %f\n", l_m, m - m_lb, B->col_coupleBlock->val[block_size*k]);            } else {
198          l_m++;              /* data to be passed to sparse matrix B_ext_couple */
199        } else {              idx_c[l_c] = m;
200          /* data to be passed to sparse matrix B_ext_couple */              memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
201          idx_c[l_c] = m;              l_c++;
202  if (MY_DEBUG) fprintf(stderr, "rank%d (3) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);            }
203          memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);          }
204          l_c++;          k_lb = k;
205        }  
206      }          /* check mainBlock for data to be passed @row to sparse
207               matrix B_ext_couple */
208  if (MY_DEBUG && rank == 2) fprintf(stderr, "rank %d p %d i %d j %d l_c %d l_m %d ub %d\n", rank, p, i, j, l_c, l_m, num_couple_cols + num_main_cols);          k_ub = B->mainBlock->pattern->ptr[row + 1];
209      memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);          k = B->mainBlock->pattern->ptr[row];
210      memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));          memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
211            for (; k<k_ub; k++) {
212              m = B->mainBlock->pattern->index[k] + offset;
213              idx_c[l_c] = m;
214              l_c++;
215            }
216    
217            /* check the rest part of col_coupleBlock for data to
218               be passed @row to sparse matrix B_ext_couple */
219            k = k_lb;
220            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
221            for (k=k_lb; k<k_ub; k++) {
222              m = global_id[B->col_coupleBlock->pattern->index[k]];
223              if (m>= m_lb && m < m_ub) {
224                /* data to be passed to sparse matrix B_ext_main */
225                idx_m[l_m] = m - m_lb;
226                memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
227                l_m++;
228              } else {
229                /* data to be passed to sparse matrix B_ext_couple */
230                idx_c[l_c] = m;
231                memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
232                l_c++;
233              }
234            }
235    
236            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
237            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
238          send_offset[2*i] = l_m;          send_offset[2*i] = l_m;
239      len += l_m;          len += l_m;
240      memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);          memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
241      memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));          memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
242      send_offset[2*i+1] = l_c;          send_offset[2*i+1] = l_c;
243      len += l_c;          len += l_c;
244  if (MY_DEBUG && rank == 0) {          i++;
 int sum = l_m+l_c, my_i;  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d send_idx[%d] offset %d m%d c%d= (", rank, sum, len-sum, l_m, l_c);  
 for (my_i=len-sum; my_i<len; my_i++){  
   sprintf(str2, "%d ", send_idx[my_i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
     i++;  
245      }      }
246    
 if (MY_DEBUG && rank == 1) {  
 int my_i,sum = len;  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d send_idx[%d] = (", rank, sum);  
 for (my_i=0; my_i<sum; my_i++){  
   sprintf(str2, "%d ", send_idx[my_i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
   
247      /* sending */      /* sending */
248        #ifdef ESYS_MPI
249      MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],      MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
250                  A->mpi_info->msg_tag_counter+rank,                  A->mpi_info->msg_tag_counter+rank,
251                  A->mpi_info->comm,                  A->mpi_info->comm,
252                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
253  if(MY_DEBUG) fprintf(stderr, "rank %d send %d to %d tag %d (offset %d numNeighbors %d)\n", rank, 2*(i-i0), send->neighbor[p], A->mpi_info->msg_tag_counter+rank, i0, send->numNeighbors);      #endif
254      send_degree[p] = len;      send_degree[p] = len;
255      i0 = i;      i0 = i;
256    }    }
257    TMPMEMFREE(send_m);    delete[] send_m;
258    TMPMEMFREE(send_c);    delete[] send_c;
259    TMPMEMFREE(idx_m);    delete[] idx_m;
260    TMPMEMFREE(idx_c);    delete[] idx_c;
261    
262    
263    q = recv->numNeighbors;    q = recv->numNeighbors;
264    len = recv->offsetInShared[q];    len = recv->offsetInShared[q];
265    ptr_main = MEMALLOC((len+1), index_t);    ptr_main = new index_t[(len+1)];
266    ptr_couple = MEMALLOC((len+1), index_t);    ptr_couple = new index_t[(len+1)];
   
267    
268      #ifdef ESYS_MPI
269    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
270                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
271                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
272    A->mpi_info->msg_tag_counter += size;    #endif
273      ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
 if (MY_DEBUG) {  
 int sum = recv->offsetInShared[recv->numNeighbors] * 2;  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d ptr_ptr[%d] = (", rank, sum);  
 for (i=0; i<sum; i++){  
   sprintf(str2, "%d ", ptr_ptr[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
274    
275    j = 0;    j = 0;
276    k = 0;    k = 0;
277    ptr_main[0] = 0;    ptr_main[0] = 0;
278    ptr_couple[0] = 0;    ptr_couple[0] = 0;
 //  #pragma omp parallel for private(i,j,k) schedule(static)  
279    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
280      j += ptr_ptr[2*i];      j += ptr_ptr[2*i];
281      k += ptr_ptr[2*i+1];      k += ptr_ptr[2*i+1];
# Line 380  TMPMEMFREE(str2); Line 283  TMPMEMFREE(str2);
283      ptr_couple[i+1] = k;      ptr_couple[i+1] = k;
284    }    }
285    
286    TMPMEMFREE(ptr_ptr);      delete[] ptr_ptr;  
287    idx_main = MEMALLOC(j, index_t);    idx_main = new index_t[j];
288    idx_couple = MEMALLOC(k, index_t);    idx_couple = new index_t[k];
289    ptr_idx = TMPMEMALLOC(j+k, index_t);    ptr_idx = new index_t[j+k];
290    ptr_val = TMPMEMALLOC((j+k) * block_size, double);    ptr_val = new double[(j+k) * block_size];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_4_2 %d %d %d\n", rank, j, k, len);  
291    
292    /* send/receive index array */    /* send/receive index array */
293    j=0;    j=0;
# Line 395  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 297  if (MY_DEBUG) fprintf(stderr, "rank %d C
297      m = recv->offsetInShared[p+1];      m = recv->offsetInShared[p+1];
298      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
299      if (i > 0) {      if (i > 0) {
300      k_ub ++;          k_ub ++;
301            #ifdef ESYS_MPI
302          MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],          MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
303                  A->mpi_info->msg_tag_counter+recv->neighbor[p],                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
304                  A->mpi_info->comm,                  A->mpi_info->comm,
305                  &(A->col_coupler->mpi_requests[p]));                  &(A->col_coupler->mpi_requests[p]));
306  if(MY_DEBUG) fprintf(stderr, "rank %d (INDEX) recv %d from %d tag %d (%d %d %d %d %d %d)\n", rank, i, recv->neighbor[p], A->mpi_info->msg_tag_counter+recv->neighbor[p], m, k, ptr_main[m], ptr_main[k], ptr_couple[m], ptr_couple[k]);          #endif
     } else {  
 if (MY_DEBUG) fprintf(stderr, "rank%d k %d m %d main(%d %d) couple(%d %d)\n", rank, k, m, ptr_main[m], ptr_main[k], ptr_couple[m], ptr_couple[k]);  
307      }      }
308      j += i;      j += i;
309    }    }
310    
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_4_3  %d %d\n", rank, recv->numNeighbors, k_ub);  
311    j=0;    j=0;
312    k_ub = 0;    k_ub = 0;
313    for (p=0; p<num_neighbors; p++) {    for (p=0; p<num_neighbors; p++) {
314      i = send_degree[p] - j;      i = send_degree[p] - j;
315      if (i > 0){      if (i > 0){
316      k_ub ++;          k_ub ++;
317  if (MY_DEBUG && rank == 0 && send->neighbor[p] == 1) {          #ifdef ESYS_MPI
 int sum = i, my_i;  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d send_idx[%d] offset %d = (", rank, sum, j);  
 for (my_i=0; my_i<sum; my_i++){  
   sprintf(str2, "%d ", send_idx[j+my_i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
318          MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],          MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
319                  A->mpi_info->msg_tag_counter+rank,                  A->mpi_info->msg_tag_counter+rank,
320                  A->mpi_info->comm,                  A->mpi_info->comm,
321                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
322  if(MY_DEBUG) fprintf(stderr, "rank %d (INDEX) send %d to %d tag %d\n", rank, i, send->neighbor[p], A->mpi_info->msg_tag_counter+rank);          #endif
     } else {  
 if (MY_DEBUG) fprintf(stderr, "rank%d send_degree %d, p %d, j %d\n", rank, send_degree[p], p, j);  
323      }      }
324      j = send_degree[p];      j = send_degree[p];
325    }    }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_4_4 %d %d\n", rank,num_neighbors, k_ub);  
326    
327      #ifdef ESYS_MPI
328    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
329                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
330                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
331    A->mpi_info->msg_tag_counter += size;    #endif
332  if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5 %d %d %d\n", rank, len, ptr_main[len], ptr_couple[len]);    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
 if (MY_DEBUG && rank == 3) {  
 int my_i, sum1 = recv->offsetInShared[recv->numNeighbors];  
 int sum = ptr_main[sum1] + ptr_couple[sum1];  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d ptr_idx[%d] = (", rank, sum);  
 for (my_i=0; my_i<500; my_i++){  
   sprintf(str2, "%d ", ptr_idx[my_i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
333    
334  //  #pragma omp parallel for private(i,j,k,m,p) schedule(static)    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
335    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
336      j = ptr_main[i];      j = ptr_main[i];
337      k = ptr_main[i+1];      k = ptr_main[i+1];
338      m = ptr_couple[i];      m = ptr_couple[i];
339      for (p=j; p<k; p++) {      for (p=j; p<k; p++) {
340      idx_main[p] = ptr_idx[m+p];          idx_main[p] = ptr_idx[m+p];
341      }      }
342      j = ptr_couple[i+1];      j = ptr_couple[i+1];
343      for (p=m; p<j; p++) {      for (p=m; p<j; p++) {
344      idx_couple[p] = ptr_idx[k+p];          idx_couple[p] = ptr_idx[k+p];
345      }      }
346    }    }
347    TMPMEMFREE(ptr_idx);    delete[] ptr_idx;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5_1 %d %d %d\n", rank, num_main_cols, len, B->mainBlock->numRows);  
 if (MY_DEBUG && rank == 3) {  
 int sum = len, sum1 = ptr_main[sum];  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum1*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d ptr_main[%d] = (", rank, sum);  
 for (i=0; i<sum+1; i++){  
   sprintf(str2, "%d ", ptr_main[i]);  
   strcat(str1, str2);  
 }  
 //fprintf(stderr, "%s)\n", str1);  
 sprintf(str1, "rank %d idx_main[%d] = (", rank, sum1);  
 for (i=500; i<700; i++){  
   sprintf(str2, "%d-%d ", i, idx_main[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
348    
349    /* allocate pattern and sparsematrix for B_ext_main */    /* allocate pattern and sparsematrix for B_ext_main */
350    pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,    pattern_main.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
351                  len, num_main_cols, ptr_main, idx_main);                  len, num_main_cols, ptr_main, idx_main));
352    B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,    B->row_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
353          pattern_main, B->row_block_size, B->col_block_size,                  pattern_main, B->row_block_size, B->col_block_size,
354          FALSE);                  false));
   Paso_Pattern_free(pattern_main);  
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5_2\n", rank);  
355    
356    /* allocate pattern and sparsematrix for B_ext_couple */    /* allocate pattern and sparsematrix for B_ext_couple */
357    pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,    pattern_couple.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
358                  len, B->col_distribution->first_component[size],                  len, B->col_distribution->first_component[size],
359          ptr_couple, idx_couple);                  ptr_couple, idx_couple));
360    B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,    B->remote_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
361                  pattern_couple, B->row_block_size, B->col_block_size,                  pattern_couple, B->row_block_size, B->col_block_size,
362                  FALSE);                  false));
   Paso_Pattern_free(pattern_couple);  
   
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_6\n", rank);  
363    
364    /* send/receive value array */    /* send/receive value array */
365    j=0;    j=0;
# Line 523  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 367  if (MY_DEBUG) fprintf(stderr, "rank %d C
367      k = recv->offsetInShared[p];      k = recv->offsetInShared[p];
368      m = recv->offsetInShared[p+1];      m = recv->offsetInShared[p+1];
369      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];      i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
370        #ifdef ESYS_MPI
371      if (i > 0)      if (i > 0)
372          MPI_Irecv(&(ptr_val[j]), i * block_size,          MPI_Irecv(&(ptr_val[j]), i * block_size,
373                  MPI_DOUBLE, recv->neighbor[p],                  MPI_DOUBLE, recv->neighbor[p],
374                  A->mpi_info->msg_tag_counter+recv->neighbor[p],                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
375                  A->mpi_info->comm,                  A->mpi_info->comm,
376                  &(A->col_coupler->mpi_requests[p]));                  &(A->col_coupler->mpi_requests[p]));
377        #endif
378      j += (i * block_size);      j += (i * block_size);
379    }    }
380    
381    j=0;    j=0;
382    for (p=0; p<num_neighbors; p++) {    for (p=0; p<num_neighbors; p++) {
383      i = send_degree[p] - j;      i = send_degree[p] - j;
384        #ifdef ESYS_MPI
385      if (i > 0)      if (i > 0)
386          MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],          MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
387                  A->mpi_info->msg_tag_counter+rank,                  A->mpi_info->msg_tag_counter+rank,
388                  A->mpi_info->comm,                  A->mpi_info->comm,
389                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));                  &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
390        #endif
391      j = send_degree[p] ;      j = send_degree[p] ;
392    }    }
393    
394      #ifdef ESYS_MPI
395    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
396                  A->col_coupler->mpi_requests,                  A->col_coupler->mpi_requests,
397                  A->col_coupler->mpi_stati);                  A->col_coupler->mpi_stati);
398    A->mpi_info->msg_tag_counter += size;    #endif
399  if (MY_DEBUG) fprintf(stderr, "CP1_7\n");    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
400    
401    #pragma omp parallel for private(i,j,k,m,p) schedule(static)    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
402    for (i=0; i<len; i++) {    for (i=0; i<len; i++) {
# Line 555  if (MY_DEBUG) fprintf(stderr, "CP1_7\n") Line 404  if (MY_DEBUG) fprintf(stderr, "CP1_7\n")
404      k = ptr_main[i+1];      k = ptr_main[i+1];
405      m = ptr_couple[i];      m = ptr_couple[i];
406      for (p=j; p<k; p++) {      for (p=j; p<k; p++) {
407      memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);          memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
 //  B->row_coupleBlock->val[p*block_size] = ptr_val[(m+p)*block_size];  
408      }      }
409      j = ptr_couple[i+1];      j = ptr_couple[i+1];
410      for (p=m; p<j; p++) {      for (p=m; p<j; p++) {
411      memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);          memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
 //  B->remote_coupleBlock->val[p*block_size] = ptr_val[(k+p)*block_size];  
412      }      }
413    }    }
414    TMPMEMFREE(ptr_val);    delete[] ptr_val;
415    
416    
417    /* release all temp memory allocation */      /* release all temp memory allocation */
418    TMPMEMFREE(cols);      delete[] cols;
419    TMPMEMFREE(cols_array);      delete[] cols_array;
420    TMPMEMFREE(recv_offset);      delete[] recv_offset;
421    TMPMEMFREE(recv_degree);      delete[] recv_degree;
422    TMPMEMFREE(recv_buf);      delete[] recv_buf;
423    TMPMEMFREE(send_buf);      delete[] send_buf;
424    TMPMEMFREE(send_offset);      delete[] send_offset;
425    TMPMEMFREE(send_degree);      delete[] send_degree;
426    TMPMEMFREE(send_idx);      delete[] send_idx;
427  }  }
428    
429  /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)  /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)
430     has the same number of rows as P->col_coupleBlock->numCols. Now, we need     has the same number of rows as P->col_coupleBlock->numCols. Now, we need
431     to copy block of data in T to neighbor processors, defined by     to copy block of data in T to neighbour processors, defined by
432      P->col_coupler->connector->recv->neighbor[k] where k is in          P->col_coupler->connector->recv->neighbor[k] where k is in
433      [0, P->col_coupler->connector->recv->numNeighbors).          [0, P->col_coupler->connector->recv->numNeighbors).
434     Rows to be copied to neighbor processor k is in the list defined by     Rows to be copied to neighbor processor k is in the list defined by
435      P->col_coupler->connector->recv->offsetInShared[k] ...          P->col_coupler->connector->recv->offsetInShared[k] ...
436      P->col_coupler->connector->recv->offsetInShared[k+1]  */          P->col_coupler->connector->recv->offsetInShared[k+1]  */
437  void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,  void Paso_Preconditioner_AMG_CopyRemoteData(paso::SystemMatrix_ptr P,
438      index_t **p_ptr, index_t **p_idx, double **p_val,          index_t **p_ptr, index_t **p_idx, double **p_val,
439      index_t *global_id, index_t block_size)          index_t *global_id, index_t block_size)
440  {  {
441    Paso_SharedComponents *send=NULL, *recv=NULL;      paso::SharedComponents_ptr send, recv;
442    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;      index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
443    index_t i, j, p, m, n, rank, size, offset;      index_t i, j, p, m, n, size;
444    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;      index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
445    index_t *ptr=*p_ptr, *idx=*p_idx;      index_t *ptr=*p_ptr, *idx=*p_idx;
446    double  *val=*p_val, *recv_val=NULL;      double  *val=*p_val, *recv_val=NULL;
447      #ifdef ESYS_MPI
448    rank = P->mpi_info->rank;      index_t rank = P->mpi_info->rank;
449    size = P->mpi_info->size;    #endif
450    send = P->col_coupler->connector->recv;  
451    recv = P->col_coupler->connector->send;      size = P->mpi_info->size;
452    send_neighbors = send->numNeighbors;      send = P->col_coupler->connector->recv;
453    recv_neighbors = recv->numNeighbors;      recv = P->col_coupler->connector->send;
454    send_rows = P->col_coupleBlock->numCols;      send_neighbors = send->numNeighbors;
455    recv_rows = recv->offsetInShared[recv_neighbors];      recv_neighbors = recv->numNeighbors;
456    offset = P->col_distribution->first_component[rank];      send_rows = P->col_coupleBlock->numCols;
457        recv_rows = recv->offsetInShared[recv_neighbors];
458    
459    send_degree = TMPMEMALLOC(send_rows, index_t);      send_degree = new index_t[send_rows];
460    recv_ptr = MEMALLOC(recv_rows + 1, index_t);      recv_ptr = new index_t[recv_rows + 1];
461    #pragma omp for schedule(static) private(i)    #pragma omp for schedule(static) private(i)
462    for (i=0; i<send_rows; i++)      for (i=0; i<send_rows; i++)
463      send_degree[i] = ptr[i+1] - ptr[i];          send_degree[i] = ptr[i+1] - ptr[i];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP5_1 srows %d rrows %d\n", rank, send_rows, recv_rows);  
464    
465    /* First, send/receive the degree */    /* First, send/receive the degree */
466    for (p=0; p<recv_neighbors; p++) { /* Receiving */    for (p=0; p<recv_neighbors; p++) { /* Receiving */
467      m = recv->offsetInShared[p];      m = recv->offsetInShared[p];
468      n = recv->offsetInShared[p+1];      n = recv->offsetInShared[p+1];
469        #ifdef ESYS_MPI
470      MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],      MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
471          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
472          P->mpi_info->comm,                  P->mpi_info->comm,
473          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
474        #endif
475    }    }
476    for (p=0; p<send_neighbors; p++) { /* Sending */    for (p=0; p<send_neighbors; p++) { /* Sending */
477      m = send->offsetInShared[p];      m = send->offsetInShared[p];
478      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
479        #ifdef ESYS_MPI
480      MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],      MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
481          P->mpi_info->msg_tag_counter + rank,                  P->mpi_info->msg_tag_counter + rank,
482          P->mpi_info->comm,                  P->mpi_info->comm,
483          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
484        #endif
485    }    }
486      #ifdef ESYS_MPI
487    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
488          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
489          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
490    P->mpi_info->msg_tag_counter += size;    #endif
491  if (MY_DEBUG) fprintf(stderr, "rank %d CP5_3 %d %d\n", rank, recv_neighbors, send_neighbors);    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
492    
493    TMPMEMFREE(send_degree);    delete[] send_degree;
494    m = Paso_Util_cumsum(recv_rows, recv_ptr);    m = Paso_Util_cumsum(recv_rows, recv_ptr);
495    recv_ptr[recv_rows] = m;    recv_ptr[recv_rows] = m;
496    recv_idx = MEMALLOC(m, index_t);    recv_idx = new index_t[m];
497    recv_val = MEMALLOC(m * block_size, double);    recv_val = new double[m * block_size];
 if (MY_DEBUG) fprintf(stderr, "rank %d receive size %d\n", rank, m);  
498    
499    /* Next, send/receive the index array */    /* Next, send/receive the index array */
500    j = 0;    j = 0;
# Line 652  if (MY_DEBUG) fprintf(stderr, "rank %d r Line 503  if (MY_DEBUG) fprintf(stderr, "rank %d r
503      n = recv->offsetInShared[p+1];      n = recv->offsetInShared[p+1];
504      i = recv_ptr[n] - recv_ptr[m];      i = recv_ptr[n] - recv_ptr[m];
505      if (i > 0) {      if (i > 0) {
506  if (MY_DEBUG) fprintf(stderr, "rank %d recv message size %d from %d (%d, %d, %d, %d, %d) \n", rank, i, recv->neighbor[p], p, m, n, recv_ptr[n], recv_ptr[m]);        #ifdef ESYS_MPI
507        MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],        MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
508          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
509          P->mpi_info->comm,                  P->mpi_info->comm,
510          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
511      } else        #endif
512  if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty recv message %d %d %d %d %d %d\n", rank, recv_neighbors, p, m, n, recv_ptr[n], recv_ptr[m]);      }
513      j += i;      j += i;
514    }    }
515    
# Line 668  if (MY_DEBUG) fprintf(stderr, "rank %d W Line 519  if (MY_DEBUG) fprintf(stderr, "rank %d W
519      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
520      i = ptr[n] - ptr[m];      i = ptr[n] - ptr[m];
521      if (i >0) {      if (i >0) {
522  if (MY_DEBUG) fprintf(stderr, "rank %d send message size %d to %d (%d, %d, %d, %d, %d) \n", rank, i, send->neighbor[p], p, m, n, ptr[n], ptr[m]);          #ifdef ESYS_MPI
523      MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],          MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
524          P->mpi_info->msg_tag_counter + rank,                  P->mpi_info->msg_tag_counter + rank,
525          P->mpi_info->comm,                  P->mpi_info->comm,
526          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
527      j += i;          #endif
528      } else          j += i;
529  if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty send message %d %d %d %d %d %d\n", rank, send_neighbors, p, m, n, ptr[n], ptr[m]);      }
530    }    }
531      #ifdef ESYS_MPI
532    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
533          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
534          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
535    P->mpi_info->msg_tag_counter += size;    #endif
536  if (MY_DEBUG) fprintf(stderr, "rank %d CP5_5\n", rank);    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
537    
538    /* Last, send/receive the data array */    /* Last, send/receive the data array */
539    j = 0;    j = 0;
# Line 689  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 541  if (MY_DEBUG) fprintf(stderr, "rank %d C
541      m = recv->offsetInShared[p];      m = recv->offsetInShared[p];
542      n = recv->offsetInShared[p+1];      n = recv->offsetInShared[p+1];
543      i = recv_ptr[n] - recv_ptr[m];      i = recv_ptr[n] - recv_ptr[m];
544        #ifdef ESYS_MPI
545      if (i > 0)      if (i > 0)
546        MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],        MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
547          P->mpi_info->msg_tag_counter + recv->neighbor[p],                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
548          P->mpi_info->comm,                  P->mpi_info->comm,
549          &(P->col_coupler->mpi_requests[p]));                  &(P->col_coupler->mpi_requests[p]));
550        #endif
551      j += (i*block_size);      j += (i*block_size);
552    }    }
553    
# Line 703  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 557  if (MY_DEBUG) fprintf(stderr, "rank %d C
557      n = send->offsetInShared[p+1];      n = send->offsetInShared[p+1];
558      i = ptr[n] - ptr[m];      i = ptr[n] - ptr[m];
559      if (i >0) {      if (i >0) {
560      MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],          #ifdef ESYS_MPI
561          P->mpi_info->msg_tag_counter + rank,          MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
562          P->mpi_info->comm,                  P->mpi_info->msg_tag_counter + rank,
563          &(P->col_coupler->mpi_requests[p+recv_neighbors]));                  P->mpi_info->comm,
564      j += i * block_size;                  &(P->col_coupler->mpi_requests[p+recv_neighbors]));
565            #endif
566            j += i * block_size;
567      }      }
568    }    }
569      #ifdef ESYS_MPI
570    MPI_Waitall(send_neighbors+recv_neighbors,    MPI_Waitall(send_neighbors+recv_neighbors,
571          P->col_coupler->mpi_requests,                  P->col_coupler->mpi_requests,
572          P->col_coupler->mpi_stati);                  P->col_coupler->mpi_stati);
573    P->mpi_info->msg_tag_counter += size;    #endif
574  if (MY_DEBUG) fprintf(stderr, "rank %d CP5_7\n", rank);    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
575    
576    /* Clean up and return with recevied ptr, index and data arrays */    /* Clean up and return with received ptr, index and data arrays */
577    MEMFREE(ptr);    delete[] ptr;
578    MEMFREE(idx);    delete[] idx;
579    MEMFREE(val);    delete[] val;
 if (MY_DEBUG) fprintf(stderr, "rank %d len %d\n", rank, recv_ptr[recv_rows]);  
580    *p_ptr = recv_ptr;    *p_ptr = recv_ptr;
581    *p_idx = recv_idx;    *p_idx = recv_idx;
582    *p_val = recv_val;    *p_val = recv_val;
583  }  }
584    
585  Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(  paso::SystemMatrix_ptr Paso_Preconditioner_AMG_buildInterpolationOperator(
586      Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)          paso::SystemMatrix_ptr A, paso::SystemMatrix_ptr P,
587            paso::SystemMatrix_ptr R)
588  {  {
589     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
590     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;     paso::SystemMatrix_ptr out;
591     Paso_SystemMatrix *out=NULL;     paso::SystemMatrixPattern_ptr pattern;
592     Paso_SystemMatrixPattern *pattern=NULL;     paso::Distribution_ptr input_dist, output_dist;
593     Paso_Distribution *input_dist=NULL, *output_dist=NULL;     paso::Connector_ptr col_connector, row_connector;
    Paso_SharedComponents *send =NULL, *recv=NULL;  
    Paso_Connector *col_connector=NULL, *row_connector=NULL;  
    Paso_Coupler *coupler=NULL;  
    Paso_Pattern *main_pattern=NULL;  
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;  
594     const dim_t row_block_size=A->row_block_size;     const dim_t row_block_size=A->row_block_size;
595     const dim_t col_block_size=A->col_block_size;     const dim_t col_block_size=A->col_block_size;
596     const dim_t block_size = A->block_size;     const dim_t block_size = A->block_size;
597     const dim_t P_block_size = P->block_size;     const dim_t P_block_size = P->block_size;
    const dim_t num_threads=omp_get_max_threads();  
598     const double ZERO = 0.0;     const double ZERO = 0.0;
599     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
600     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
# Line 760  Paso_SystemMatrix* Paso_Preconditioner_A Line 611  Paso_SystemMatrix* Paso_Preconditioner_A
611     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
612     index_t row_marker_ext, *where_p=NULL;     index_t row_marker_ext, *where_p=NULL;
613     index_t **send_ptr=NULL, **send_idx=NULL;     index_t **send_ptr=NULL, **send_idx=NULL;
614     dim_t l, p, q, global_label, num_neighbors;     dim_t p, num_neighbors;
615     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
616     Esys_MPI_rank *neighbor=NULL;     Esys_MPI_rank *neighbor=NULL;
617     #ifdef ESYS_MPI     #ifdef ESYS_MPI
# Line 770  Paso_SystemMatrix* Paso_Preconditioner_A Line 621  Paso_SystemMatrix* Paso_Preconditioner_A
621       int *mpi_requests=NULL, *mpi_stati=NULL;       int *mpi_requests=NULL, *mpi_stati=NULL;
622     #endif     #endif
623    
624  //   if (size == 1) return NULL;  /*   if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
625  //   if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))       return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
 //     return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);  
626    
627     /* two sparse matrixes R_main and R_couple will be generate, as the     /* two sparse matrices R_main and R_couple will be generate, as the
628        transpose of P_main and P_col_couple, respectively. Note that,        transpose of P_main and P_col_couple, respectively. Note that,
629        R_couple is actually the row_coupleBlock of R (R=P^T) */        R_couple is actually the row_coupleBlock of R (R=P^T) */
630     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);     paso::SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
631       paso::SparseMatrix_ptr R_couple;
632     if (size > 1)     if (size > 1)
633       R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);       R_couple = P->col_coupleBlock->getTranspose();
   
 if (MY_DEBUG) fprintf(stderr, "rank %d CP1\n", rank);  
634    
635     /* generate P_ext, i.e. portion of P that is tored on neighbor procs     /* generate P_ext, i.e. portion of P that is stored on neighbor procs
636        and needed locally for triple matrix product RAP        and needed locally for triple matrix product RAP
637        to be specific, P_ext (on processor k) are group of rows in P, where        to be specific, P_ext (on processor k) are group of rows in P, where
638        the list of rows from processor q is given by        the list of rows from processor q is given by
639      A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
640      rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
641        on q.        on q.
642        P_ext is represented by two sparse matrixes P_ext_main and        P_ext is represented by two sparse matrices P_ext_main and
643        P_ext_couple */        P_ext_couple */
644     Paso_Preconditioner_AMG_extendB(A, P);     Paso_Preconditioner_AMG_extendB(A, P);
 if (MY_DEBUG) fprintf(stderr, "rank %d CP2\n", rank);  
645    
646     /* count the number of cols in P_ext_couple, resize the pattern of     /* count the number of cols in P_ext_couple, resize the pattern of
647        sparse matrix P_ext_couple with new compressed order, and then        sparse matrix P_ext_couple with new compressed order, and then
# Line 814  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 662  if (MY_DEBUG) fprintf(stderr, "rank %d C
662     num_Pext_cols = 0;     num_Pext_cols = 0;
663     if (P->global_id) {     if (P->global_id) {
664       /* first count the number of cols "num_Pext_cols" in both P_ext_couple       /* first count the number of cols "num_Pext_cols" in both P_ext_couple
665      and P->col_coupleBlock */          and P->col_coupleBlock */
666       iptr = 0;       iptr = 0;
667       if (num_Pcouple_cols || sum > 0) {       if (num_Pcouple_cols || sum > 0) {
668      temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);          temp = new index_t[num_Pcouple_cols+sum];
669      for (; iptr<sum; iptr++){          #pragma omp parallel for lastprivate(iptr) schedule(static)
670        temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];          for (iptr=0; iptr<sum; iptr++){
671  if (MY_DEBUG && rank ==1)            temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
672  fprintf(stderr, "rank %d remote_block[%d] = %d\n", rank, iptr, temp[iptr]);          }
673      }          for (j=0; j<num_Pcouple_cols; j++, iptr++){
674      for (j=0; j<num_Pcouple_cols; j++, iptr++){            temp[iptr] = P->global_id[j];
675        temp[iptr] = P->global_id[j];          }
 if (MY_DEBUG && rank ==1)  
 fprintf(stderr, "rank %d global_id[%d] = %d\n", rank, j, P->global_id[j]);  
     }  
676       }       }
677       if (iptr) {       if (iptr) {
678      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
679        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_Pext_cols = 1;
680      #else          i = temp[0];
681        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
682      #endif            if (temp[j] > i) {
683      num_Pext_cols = 1;              i = temp[j];
684      i = temp[0];              temp[num_Pext_cols++] = i;
685      for (j=1; j<iptr; j++) {            }
686        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_Pext_cols++] = i;  
       }  
     }  
687       }       }
 if (MY_DEBUG) fprintf(stderr, "rank%d CP2_1\n", rank);  
688       /* resize the pattern of P_ext_couple */       /* resize the pattern of P_ext_couple */
689       if(num_Pext_cols){       if(num_Pext_cols){
690      global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);          global_id_P = new index_t[num_Pext_cols];
691      for (i=0; i<num_Pext_cols; i++)          #pragma omp parallel for private(i) schedule(static)
692        global_id_P[i] = temp[i];          for (i=0; i<num_Pext_cols; i++)
693  if (MY_DEBUG && rank == 1) {            global_id_P[i] = temp[i];
 int my_i, sum=num_Pext_cols;  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d global_id_P[%d] = (", rank, sum);  
 for (my_i=0; my_i<sum; my_i++) {  
   sprintf(str2, "%d ", global_id_P[my_i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
   
694       }       }
695       if (num_Pcouple_cols || sum > 0)       if (num_Pcouple_cols || sum > 0)
696      TMPMEMFREE(temp);          delete[] temp;
697         #pragma omp parallel for private(i, where_p) schedule(static)
698       for (i=0; i<sum; i++) {       for (i=0; i<sum; i++) {
699      where_p = (index_t *)bsearch(          where_p = (index_t *)bsearch(
700              &(P->remote_coupleBlock->pattern->index[i]),                          &(P->remote_coupleBlock->pattern->index[i]),
701              global_id_P, num_Pext_cols,                          global_id_P, num_Pext_cols,
702                          sizeof(index_t), Paso_comparIndex);                          sizeof(index_t), paso::comparIndex);
703      P->remote_coupleBlock->pattern->index[i] =          P->remote_coupleBlock->pattern->index[i] =
704              (index_t)(where_p -global_id_P);                          (index_t)(where_p -global_id_P);
705       }         }  
706    
707       /* build the mapping */       /* build the mapping */
708       if (num_Pcouple_cols) {       if (num_Pcouple_cols) {
709      Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);          Pcouple_to_Pext = new index_t[num_Pcouple_cols];
710      iptr = 0;          iptr = 0;
711      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
712        if (global_id_P[i] == P->global_id[iptr]) {            if (global_id_P[i] == P->global_id[iptr]) {
713          Pcouple_to_Pext[iptr++] = i;              Pcouple_to_Pext[iptr++] = i;
714          if (iptr == num_Pcouple_cols) break;              if (iptr == num_Pcouple_cols) break;
715        }            }
716       }       }
717     }     }
718    
719  if (MY_DEBUG) fprintf(stderr, "CP3\n");     /* alloc and initialise the makers */
 if (MY_DEBUG && rank == 0) {  
 fprintf(stderr, "Rank %d Now Prolongation Matrix ************************\n", rank);  
 Paso_SystemMatrix_print(P);  
 fprintf(stderr, "Rank %d Now Restriction Matrix ************************\n", rank);  
 Paso_SystemMatrix_print(R);  
 }  
   
    /* alloc and initialize the makers */  
720     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
721     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
722     A_marker = TMPMEMALLOC(num_A_cols, index_t);     A_marker = new index_t[num_A_cols];
723     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
724     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
725     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
726     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
727    
 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_1 %d\n", rank, num_Pcouple_cols);  
728     /* Now, count the size of RAP_ext. Start with rows in R_couple */     /* Now, count the size of RAP_ext. Start with rows in R_couple */
729     sum = 0;     sum = 0;
730     for (i_r=0; i_r<num_Pcouple_cols; i_r++){     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
731       row_marker = sum;       row_marker = sum;
732       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
733       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_2 %d (%d %d)\n", rank, i_r, R_couple->pattern->ptr[i_r], j1_ub);  
734       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
735      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
736  if (MY_DEBUG) fprintf(stderr, "rank %d CP3_3 %d %d\n", rank, j1, i1);          /* then, loop over elements in row i1 of A->col_coupleBlock */
737      /* then, loop over elements in row i1 of A->col_coupleBlock */          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
738      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
739  if (MY_DEBUG) fprintf(stderr, "rank %d CP3_4 %d %d %d\n", rank, j1, i1, j2_ub);            i2 = A->col_coupleBlock->pattern->index[j2];
740      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
741        i2 = A->col_coupleBlock->pattern->index[j2];            /* check whether entry RA[i_r, i2] has been previously visited.
742                 RAP new entry is possible only if entry RA[i_r, i2] has not
743        /* check whether entry RA[i_r, i2] has been previously visited.               been visited yet */
744           RAP new entry is possible only if entry RA[i_r, i2] has not            if (A_marker[i2] != i_r) {
745           been visited yet */              /* first, mark entry RA[i_r,i2] as visited */
746        if (A_marker[i2] != i_r) {              A_marker[i2] = i_r;
747          /* first, mark entry RA[i_r,i2] as visited */  
748          A_marker[i2] = i_r;              /* then loop over elements in row i2 of P_ext_main */
749                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
750          /* then loop over elements in row i2 of P_ext_main */              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
751          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];                  i_c = P->row_coupleBlock->pattern->index[j3];
752          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
753          i_c = P->row_coupleBlock->pattern->index[j3];                  /* check whether entry RAP[i_r,i_c] has been created.
754                       If not yet created, create the entry and increase
755          /* check whether entry RAP[i_r,i_c] has been created.                     the total number of elements in RAP_ext */
756             If not yet created, create the entry and increase                  if (P_marker[i_c] < row_marker) {
757             the total number of elements in RAP_ext */                    P_marker[i_c] = sum;
758          if (P_marker[i_c] < row_marker) {                    sum++;
759            P_marker[i_c] = sum;                  }
760            sum++;              }
761          }  
762          }              /* loop over elements in row i2 of P_ext_couple, do the same */
763                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
764          /* loop over elements in row i2 of P_ext_couple, do the same */              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
765          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
766          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  if (P_marker[i_c] < row_marker) {
767          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                    P_marker[i_c] = sum;
768          if (P_marker[i_c] < row_marker) {                    sum++;
769            P_marker[i_c] = sum;                  }
770            sum++;              }
771          }            }
772          }          }
773        }  
774      }          /* now loop over elements in row i1 of A->mainBlock, repeat
775               the process we have done to block A->col_coupleBlock */
776      /* now loop over elements in row i1 of A->mainBlock, repeat          j2_ub = A->mainBlock->pattern->ptr[i1+1];
777         the process we have done to block A->col_coupleBlock */          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
778      j2_ub = A->mainBlock->pattern->ptr[i1+1];            i2 = A->mainBlock->pattern->index[j2];
779      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
780        i2 = A->mainBlock->pattern->index[j2];              A_marker[i2 + num_Acouple_cols] = i_r;
781        if (A_marker[i2 + num_Acouple_cols] != i_r) {              j3_ub = P->mainBlock->pattern->ptr[i2+1];
782          A_marker[i2 + num_Acouple_cols] = i_r;              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
783          j3_ub = P->mainBlock->pattern->ptr[i2+1];                  i_c = P->mainBlock->pattern->index[j3];
784          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  if (P_marker[i_c] < row_marker) {
785          i_c = P->mainBlock->pattern->index[j3];                    P_marker[i_c] = sum;
786          if (P_marker[i_c] < row_marker) {                    sum++;
787            P_marker[i_c] = sum;                  }
788            sum++;              }
789          }              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
790          }              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
791          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];                  /* note that we need to map the column index in
792          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                     P->col_coupleBlock back into the column index in
793          /* note that we need to map the column index in                     P_ext_couple */
794             P->col_coupleBlock back into the column index in                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
795             P_ext_couple */                  if (P_marker[i_c] < row_marker) {
796          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                    P_marker[i_c] = sum;
797  if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d-%d out of %d+%d) 5\n", rank, i_c, P->col_coupleBlock->pattern->index[j3], num_Pext_cols, num_Pmain_cols);                    sum++;
798          if (P_marker[i_c] < row_marker) {                  }
799            P_marker[i_c] = sum;              }
800            sum++;            }
801          }          }
         }  
       }  
     }  
802       }       }
803     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);  
804    
805     /* Now we have the number of non-zero elements in RAP_ext, allocate     /* Now we have the number of non-zero elements in RAP_ext, allocate
806        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
807     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);     RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
808     RAP_ext_idx = MEMALLOC(sum, index_t);     RAP_ext_idx = new index_t[sum];
809     RAP_ext_val = MEMALLOC(sum * block_size, double);     RAP_ext_val = new double[sum * block_size];
810     RA_val = TMPMEMALLOC(block_size, double);     RA_val = new double[block_size];
811     RAP_val = TMPMEMALLOC(block_size, double);     RAP_val = new double[block_size];
812    
813     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
814     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
# Line 1003  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 816  if (MY_DEBUG) fprintf(stderr, "rank %d C
816     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
817     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
818     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_1 sum %d\n", rank, sum);  
819     sum = 0;     sum = 0;
820     RAP_ext_ptr[0] = 0;     RAP_ext_ptr[0] = 0;
821     for (i_r=0; i_r<num_Pcouple_cols; i_r++){     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
822       row_marker = sum;       row_marker = sum;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_2 i_r %d sum %d\n", rank, i_r, sum);  
823       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
824       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
825       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
826      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
827      R_val = &(R_couple->val[j1*P_block_size]);          R_val = &(R_couple->val[j1*P_block_size]);
828    
829      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
830      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
831      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
832        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
833        //RA_val = R_val * A->col_coupleBlock->val[j2];            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
834        temp_val = &(A->col_coupleBlock->val[j2*block_size]);            for (irb=0; irb<row_block_size; irb++)
       for (irb=0; irb<row_block_size; irb++)  
835              for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
836                  RA_val[irb+row_block_size*icb] = ZERO;                  RA_val[irb+row_block_size*icb] = ZERO;
837        for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
838          rtmp=R_val[irb];              rtmp=R_val[irb];
839          for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
840          RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
841          }              }
842        }            }
   
       /* check whether entry RA[i_r, i2] has been previously visited.  
          RAP new entry is possible only if entry RA[i_r, i2] has not  
          been visited yet */  
       if (A_marker[i2] != i_r) {  
         /* first, mark entry RA[i_r,i2] as visited */  
         A_marker[i2] = i_r;  
   
         /* then loop over elements in row i2 of P_ext_main */  
         j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->row_coupleBlock->pattern->index[j3];  
 //      RAP_val = RA_val * P->row_coupleBlock->val[j3];  
         temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);  
         for (irb=0; irb<row_block_size; irb++)  
           for (icb=0; icb<col_block_size; icb++)  
             RAP_val[irb+row_block_size*icb] = ZERO;  
         for (icb=0; icb<P_block_size; icb++) {  
           rtmp = temp_val[icb];  
           for (irb=0; irb<row_block_size; irb++) {  
             RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;  
           }  
         }  
   
         /* check whether entry RAP[i_r,i_c] has been created.  
            If not yet created, create the entry and increase  
            the total number of elements in RAP_ext */  
         if (P_marker[i_c] < row_marker) {  
           P_marker[i_c] = sum;  
           //RAP_ext_val[sum*block_size] = RAP_val;  
           memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));  
           RAP_ext_idx[sum] = i_c + offset;  
 if (MY_DEBUG){  
 if (rank == 1) fprintf(stderr, "1-ext[%d (%d,%d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum*block_size]);  
 }  
           sum++;  
         } else {  
           //RAP_ext_val[P_marker[i_c] * block_size] += RAP_val;  
           temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);  
           for (ib=0; ib<block_size; ib++) {  
             temp_val[ib] += RAP_val[ib];  
           }  
             
 if (MY_DEBUG){  
 if (rank == 1) fprintf(stderr, "1-ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);  
 }  
         }  
         }  
843    
844          /* loop over elements in row i2 of P_ext_couple, do the same */            /* check whether entry RA[i_r, i2] has been previously visited.
845          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];               RAP new entry is possible only if entry RA[i_r, i2] has not
846          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {               been visited yet */
847          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;            if (A_marker[i2] != i_r) {
848          //RAP_val = RA_val * P->remote_coupleBlock->val[j3];              /* first, mark entry RA[i_r,i2] as visited */
849                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);              A_marker[i2] = i_r;
850          for (irb=0; irb<row_block_size; irb++)  
851            for (icb=0; icb<col_block_size; icb++)              /* then loop over elements in row i2 of P_ext_main */
852              RAP_val[irb+row_block_size*icb]=ZERO;              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
853                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
854                    i_c = P->row_coupleBlock->pattern->index[j3];
855                    temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
856                    for (irb=0; irb<row_block_size; irb++)
857                      for (icb=0; icb<col_block_size; icb++)
858                        RAP_val[irb+row_block_size*icb] = ZERO;
859                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
860            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
861                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
862                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
863                    }                    }
864                  }                  }
865    
866          if (P_marker[i_c] < row_marker) {                  /* check whether entry RAP[i_r,i_c] has been created.
867            P_marker[i_c] = sum;                     If not yet created, create the entry and increase
868            //RAP_ext_val[sum] = RAP_val;                     the total number of elements in RAP_ext */
869            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                  if (P_marker[i_c] < row_marker) {
870            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    P_marker[i_c] = sum;
871  if (MY_DEBUG){                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
872  if (rank == 1) fprintf(stderr, "ext[%d (%d,%d)]=%f (R%f, A(%d %d)%f, P%f)\n", sum, i_r, i_c, RAP_ext_val[sum], *R_val, i1, i2, A->col_coupleBlock->val[j2], P->remote_coupleBlock->val[j3]);                    RAP_ext_idx[sum] = i_c + offset;
873  }                    sum++;
874            sum++;                  } else {
         } else {  
           //RAP_ext_val[P_marker[i_c]] += RAP_val;  
875                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
876                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
877                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
878                    }                    }
879                    }
880                }
881    
882  if (MY_DEBUG){              /* loop over elements in row i2 of P_ext_couple, do the same */
883  if (rank == 1) fprintf(stderr, "ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
884  }              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
885          }                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
886          }                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
887                    for (irb=0; irb<row_block_size; irb++)
888                      for (icb=0; icb<col_block_size; icb++)
889                        RAP_val[irb+row_block_size*icb]=ZERO;
890                    for (icb=0; icb<P_block_size; icb++) {
891                      rtmp = temp_val[icb];
892                      for (irb=0; irb<row_block_size; irb++) {
893                        RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
894                      }
895                    }
896    
897        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.                  if (P_marker[i_c] < row_marker) {
898           Only the contributions are added. */                    P_marker[i_c] = sum;
899        } else {                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
900          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
901          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                    sum++;
902          i_c = P->row_coupleBlock->pattern->index[j3];                  } else {
903          //RAP_val = RA_val * P->row_coupleBlock->val[j3];                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
904                      for (ib=0; ib<block_size; ib++) {
905                        temp_val[ib] += RAP_val[ib];
906                      }
907                    }
908                }
909    
910              /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
911                 Only the contributions are added. */
912              } else {
913                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
914                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
915                    i_c = P->row_coupleBlock->pattern->index[j3];
916                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
917          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
918            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
919              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
920                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
921            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
922                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
923                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
924                    }                    }
925                  }                  }
926    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
927                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
928                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
929                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
930                  }                  }
931                }
932  if (MY_DEBUG){              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
933  if (rank == 1) fprintf(stderr, "Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
934  }                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
         }  
         j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;  
         //RAP_val = RA_val * P->remote_coupleBlock->val[j3];  
935                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
936          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
937            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
938              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
939                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
940            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
941                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
942                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
943                    }                    }
944                  }                  }
945    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
946                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
947                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
948                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
949                  }                  }
950                }
951              }
952            }
953    
954  if (MY_DEBUG){          /* now loop over elements in row i1 of A->mainBlock, repeat
955  if (rank == 1) fprintf(stderr, "Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);             the process we have done to block A->col_coupleBlock */
956  }          j2_ub = A->mainBlock->pattern->ptr[i1+1];
957          }          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
958        }            i2 = A->mainBlock->pattern->index[j2];
     }  
   
     /* now loop over elements in row i1 of A->mainBlock, repeat  
        the process we have done to block A->col_coupleBlock */  
     j2_ub = A->mainBlock->pattern->ptr[i1+1];  
     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
       i2 = A->mainBlock->pattern->index[j2];  
       //RA_val = R_val * A->mainBlock->val[j2];  
959            temp_val = &(A->mainBlock->val[j2*block_size]);            temp_val = &(A->mainBlock->val[j2*block_size]);
960        for (irb=0; irb<row_block_size; irb++)            for (irb=0; irb<row_block_size; irb++)
961          for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
962          RA_val[irb+row_block_size*icb]=ZERO;                  RA_val[irb+row_block_size*icb]=ZERO;
963            for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
964          rtmp=R_val[irb];              rtmp=R_val[irb];
965              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
966                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
967              }              }
968            }            }
969    
970        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
971          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
972          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
973          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
974          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
         //RAP_val = RA_val * P->mainBlock->val[j3];  
975                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
976          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
977            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
978              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
979                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
980            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
981                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
982                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
983                    }                    }
984                  }                  }
985    
986          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
987            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
988            //RAP_ext_val[sum] = RAP_val;                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
989            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    RAP_ext_idx[sum] = i_c + offset;
990            RAP_ext_idx[sum] = i_c + offset;                    sum++;
991  if (MY_DEBUG){                  } else {
 if (rank == 1) fprintf(stderr, "Main 1-ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);  
 }  
           sum++;  
         } else {  
           //RAP_ext_val[P_marker[i_c]] += RAP_val;  
992                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
993                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
994                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
995                    }                    }
996                    }
997  if (MY_DEBUG){              }
998  if (rank == 1) fprintf(stderr, "Main 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
999  }              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1000          }                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
 if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 1\n", rank, i_c, num_Pext_cols + num_Pmain_cols);  
         //RAP_val = RA_val * P->col_coupleBlock->val[j3];  
1001                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1002          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1003                    for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1004              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1005                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1006            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1007                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1008                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1009                    }                    }
1010                  }                  }
1011    
1012          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
1013            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
1014            //RAP_ext_val[sum] = RAP_val;                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
           memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));  
1015                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1016  if (MY_DEBUG){                    sum++;
1017  if (rank == 1) fprintf(stderr, "Main ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);                  } else {
 }  
           sum++;  
         } else {  
           //RAP_ext_val[P_marker[i_c]] += RAP_val;  
1018                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1019                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1020                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1021                    }                    }
1022                    }
1023  if (MY_DEBUG){              }
1024  if (rank == 1) fprintf(stderr, "Main ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);            } else {
1025  }              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1026          }              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1027          }                  i_c = P->mainBlock->pattern->index[j3];
       } else {  
         j3_ub = P->mainBlock->pattern->ptr[i2+1];  
         for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->mainBlock->pattern->index[j3];  
         //RAP_val = RA_val * P->mainBlock->val[j3];  
1028                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
1029          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1030            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1031              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1032                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1033            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1034                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1035                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1036                    }                    }
1037                  }                  }
1038    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
1039                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1040                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1041                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1042                  }                  }
1043                }
1044  if (MY_DEBUG){              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1045  if (rank == 1) fprintf(stderr, "Main Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1046  }                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         //RAP_val = RA_val * P->col_coupleBlock->val[j3];  
1047                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1048          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1049            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1050              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1051                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1052              rtmp=temp_val[icb];                      rtmp=temp_val[icb];
1053                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1054                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1055                    }                    }
1056                  }                  }
1057    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
1058                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1059                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1060                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1061                  }                  }
1062                }
1063  if (MY_DEBUG){            }
1064  if (rank == 1) fprintf(stderr, "Main Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);          }
 }  
         }  
       }  
     }  
1065       }       }
1066       RAP_ext_ptr[i_r+1] = sum;       RAP_ext_ptr[i_r+1] = sum;
1067     }     }
1068     TMPMEMFREE(P_marker);     delete[] P_marker;
1069     TMPMEMFREE(Pcouple_to_Pext);     delete[] Pcouple_to_Pext;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP5 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);  
   
 if (MY_DEBUG) {  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d RAP_ext_idx[%d] = (", rank, sum);  
 for (i=0; i<sum; i++) {  
   sprintf(str2, "%d ", RAP_ext_idx[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
1070    
1071     /* Now we have part of RAP[r,c] where row "r" is the list of rows     /* Now we have part of RAP[r,c] where row "r" is the list of rows
1072        which is given by the column list of P->col_coupleBlock, and        which is given by the column list of P->col_coupleBlock, and
1073        column "c" is the list of columns which possiblely covers the        column "c" is the list of columns which possibly covers the
1074        whole column range of systme martris P. This part of data is to        whole column range of system matrix P. This part of data is to
1075        be passed to neighboring processors, and added to corresponding        be passed to neighbouring processors, and added to corresponding
1076        RAP[r,c] entries in the neighboring processors */        RAP[r,c] entries in the neighbouring processors */
1077     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1078          &RAP_ext_val, global_id_P, block_size);                  &RAP_ext_val, global_id_P, block_size);
 if (MY_DEBUG) fprintf(stderr, "rank %d CP6\n", rank);  
1079    
1080     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1081     sum = RAP_ext_ptr[num_RAPext_rows];     sum = RAP_ext_ptr[num_RAPext_rows];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP6_0 %d %d (%d %d)\n", rank, sum, num_Pext_cols, num_RAPext_rows, num_Pcouple_cols);  
1082     num_RAPext_cols = 0;     num_RAPext_cols = 0;
1083     if (num_Pext_cols || sum > 0) {     if (num_Pext_cols || sum > 0) {
1084       temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);       temp = new index_t[num_Pext_cols+sum];
1085       j1_ub = offset + num_Pmain_cols;       j1_ub = offset + num_Pmain_cols;
1086       for (i=0, iptr=0; i<sum; i++) {       for (i=0, iptr=0; i<sum; i++) {
1087      if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */ {          if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */
1088        temp[iptr++] = RAP_ext_idx[i];          /* XXX */            temp[iptr++] = RAP_ext_idx[i];                  /* XXX */
 if (MY_DEBUG && rank ==1) fprintf(stderr, "RAP_ext_idx[%d]=%d\n", i, RAP_ext_idx[i]);  
 }  
1089       }       }
1090       for (j=0; j<num_Pext_cols; j++, iptr++){       for (j=0; j<num_Pext_cols; j++, iptr++){
1091      temp[iptr] = global_id_P[j];          temp[iptr] = global_id_P[j];
1092       }       }
1093    
1094       if (iptr) {       if (iptr) {
1095      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1096        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_RAPext_cols = 1;
1097      #else          i = temp[0];
1098        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
1099      #endif            if (temp[j] > i) {
1100      num_RAPext_cols = 1;              i = temp[j];
1101      i = temp[0];              temp[num_RAPext_cols++] = i;
1102      for (j=1; j<iptr; j++) {            }
1103        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_RAPext_cols++] = i;  
       }  
     }  
   
 if (MY_DEBUG) {  
 char *str1, *str2;  
 index_t my_sum=iptr, i;  
 str1 = TMPMEMALLOC(my_sum*20+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d temp[%d] = (", rank, my_sum);  
 for (i=0; i<my_sum; i++){  
   sprintf(str2, "%d ", temp[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
1104       }       }
1105     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP6_1 %d %d\n", rank, iptr, num_RAPext_cols);  
1106    
1107     /* resize the pattern of P_ext_couple */     /* resize the pattern of P_ext_couple */
1108     if(num_RAPext_cols){     if(num_RAPext_cols){
1109       global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);       global_id_RAP = new index_t[num_RAPext_cols];
1110         #pragma omp parallel for private(i) schedule(static)
1111       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1112      global_id_RAP[i] = temp[i];          global_id_RAP[i] = temp[i];
1113     }     }
 if (MY_DEBUG) {  
 char *str1, *str2;  
 index_t my_sum=num_RAPext_cols, i;  
 str1 = TMPMEMALLOC(my_sum*20+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, my_sum);  
 for (i=0; i<my_sum; i++){  
   sprintf(str2, "%d ", global_id_RAP[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
1114     if (num_Pext_cols || sum > 0)     if (num_Pext_cols || sum > 0)
1115       TMPMEMFREE(temp);       delete[] temp;
1116     j1_ub = offset + num_Pmain_cols;     j1_ub = offset + num_Pmain_cols;
1117       #pragma omp parallel for private(i, where_p) schedule(static)
1118     for (i=0; i<sum; i++) {     for (i=0; i<sum; i++) {
1119       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){       if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1120      where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,          where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1121  /*XXX*/         num_RAPext_cols, sizeof(index_t), Paso_comparIndex);  /*XXX*/                 num_RAPext_cols, sizeof(index_t), paso::comparIndex);
1122      RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);          RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1123       } else       } else
1124      RAP_ext_idx[i] = RAP_ext_idx[i] - offset;          RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1125     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP7 %d\n", rank, num_Pext_cols);  
1126    
1127     /* build the mapping */     /* build the mapping */
1128     if (num_Pcouple_cols) {     if (num_Pcouple_cols) {
1129       Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);       Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1130       iptr = 0;       iptr = 0;
1131       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1132      if (global_id_RAP[i] == P->global_id[iptr]) {          if (global_id_RAP[i] == P->global_id[iptr]) {
1133        Pcouple_to_RAP[iptr++] = i;            Pcouple_to_RAP[iptr++] = i;
1134        if (iptr == num_Pcouple_cols) break;            if (iptr == num_Pcouple_cols) break;
1135      }          }
1136     }     }
1137    
1138     if (num_Pext_cols) {     if (num_Pext_cols) {
1139       Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);       Pext_to_RAP = new index_t[num_Pext_cols];
1140       iptr = 0;       iptr = 0;
1141       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1142      if (global_id_RAP[i] == global_id_P[iptr]) {          if (global_id_RAP[i] == global_id_P[iptr]) {
1143        Pext_to_RAP[iptr++] = i;            Pext_to_RAP[iptr++] = i;
1144        if (iptr == num_Pext_cols) break;            if (iptr == num_Pext_cols) break;
1145      }          }
1146     }     }
1147    
1148     if (global_id_P){     if (global_id_P){
1149       TMPMEMFREE(global_id_P);       delete[] global_id_P;
1150       global_id_P = NULL;       global_id_P = NULL;
1151     }     }
1152    
1153     /* alloc and initialize the makers */     /* alloc and initialise the makers */
1154     sum = num_RAPext_cols + num_Pmain_cols;     sum = num_RAPext_cols + num_Pmain_cols;
1155     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
1156     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
1157     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
1158     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
1159     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP8 %d\n", rank, num_Pmain_cols);  
1160    
1161     /* Now, count the size of RAP. Start with rows in R_main */     /* Now, count the size of RAP. Start with rows in R_main */
1162     num_neighbors = P->col_coupler->connector->send->numNeighbors;     num_neighbors = P->col_coupler->connector->send->numNeighbors;
# Line 1479  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 1170  if (MY_DEBUG) fprintf(stderr, "rank %d C
1170       row_marker_ext = j;       row_marker_ext = j;
1171    
1172       /* Mark the diagonal element RAP[i_r, i_r], and other elements       /* Mark the diagonal element RAP[i_r, i_r], and other elements
1173      in RAP_ext */          in RAP_ext */
1174       P_marker[i_r] = i;       P_marker[i_r] = i;
1175       i++;       i++;
1176       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
1177      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1178        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
1179          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1180            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
1181            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
1182          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
1183            P_marker[i_c] = i;                    P_marker[i_c] = i;
1184            i++;                    i++;
1185          }                  }
1186            } else {                } else {
1187          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
1188            P_marker[i_c] = j;                    P_marker[i_c] = j;
1189            j++;                    j++;
1190          }                  }
1191            }                }
1192          }              }
1193          break;              break;
1194        }            }
1195      }          }
1196       }       }
1197    
1198       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
1199       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
1200       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1201      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
1202    
1203            /* then, loop over elements in row i1 of A->col_coupleBlock */
1204            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1205            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1206              i2 = A->col_coupleBlock->pattern->index[j2];
1207    
1208              /* check whether entry RA[i_r, i2] has been previously visited.
1209                 RAP new entry is possible only if entry RA[i_r, i2] has not
1210                 been visited yet */
1211              if (A_marker[i2] != i_r) {
1212                /* first, mark entry RA[i_r,i2] as visited */
1213                A_marker[i2] = i_r;
1214    
1215                /* then loop over elements in row i2 of P_ext_main */
1216                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1217                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1218                    i_c = P->row_coupleBlock->pattern->index[j3];
1219    
1220                    /* check whether entry RAP[i_r,i_c] has been created.
1221                       If not yet created, create the entry and increase
1222                       the total number of elements in RAP_ext */
1223                    if (P_marker[i_c] < row_marker) {
1224                      P_marker[i_c] = i;
1225                      i++;
1226                    }
1227                }
1228    
1229      /* then, loop over elements in row i1 of A->col_coupleBlock */              /* loop over elements in row i2 of P_ext_couple, do the same */
1230      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1231      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232        i2 = A->col_coupleBlock->pattern->index[j2];                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1233                    if (P_marker[i_c] < row_marker_ext) {
1234        /* check whether entry RA[i_r, i2] has been previously visited.                    P_marker[i_c] = j;
1235           RAP new entry is possible only if entry RA[i_r, i2] has not                    j++;
1236           been visited yet */                  }
1237        if (A_marker[i2] != i_r) {              }
1238          /* first, mark entry RA[i_r,i2] as visited */            }
1239          A_marker[i2] = i_r;          }
1240    
1241          /* then loop over elements in row i2 of P_ext_main */          /* now loop over elements in row i1 of A->mainBlock, repeat
1242          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];             the process we have done to block A->col_coupleBlock */
1243          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {          j2_ub = A->mainBlock->pattern->ptr[i1+1];
1244          i_c = P->row_coupleBlock->pattern->index[j3];          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1245              i2 = A->mainBlock->pattern->index[j2];
1246          /* check whether entry RAP[i_r,i_c] has been created.            if (A_marker[i2 + num_Acouple_cols] != i_r) {
1247             If not yet created, create the entry and increase              A_marker[i2 + num_Acouple_cols] = i_r;
1248             the total number of elements in RAP_ext */              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1249          if (P_marker[i_c] < row_marker) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1250            P_marker[i_c] = i;                  i_c = P->mainBlock->pattern->index[j3];
1251            i++;                  if (P_marker[i_c] < row_marker) {
1252          }                    P_marker[i_c] = i;
1253          }                    i++;
1254                    }
1255          /* loop over elements in row i2 of P_ext_couple, do the same */              }
1256          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1257          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1258          i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                  /* note that we need to map the column index in
1259          if (P_marker[i_c] < row_marker_ext) {                     P->col_coupleBlock back into the column index in
1260            P_marker[i_c] = j;                     P_ext_couple */
1261            j++;                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1262          }                  if (P_marker[i_c] < row_marker_ext) {
1263          }                    P_marker[i_c] = j;
1264        }                    j++;
1265      }                  }
1266                }
1267      /* now loop over elements in row i1 of A->mainBlock, repeat            }
1268         the process we have done to block A->col_coupleBlock */          }
     j2_ub = A->mainBlock->pattern->ptr[i1+1];  
     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
       i2 = A->mainBlock->pattern->index[j2];  
       if (A_marker[i2 + num_Acouple_cols] != i_r) {  
         A_marker[i2 + num_Acouple_cols] = i_r;  
         j3_ub = P->mainBlock->pattern->ptr[i2+1];  
         for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->mainBlock->pattern->index[j3];  
         if (P_marker[i_c] < row_marker) {  
           P_marker[i_c] = i;  
           i++;  
         }  
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         /* note that we need to map the column index in  
            P->col_coupleBlock back into the column index in  
            P_ext_couple */  
         i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
 if (MY_DEBUG && (i_c <0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 2\n", rank, i_c, sum);  
         if (P_marker[i_c] < row_marker_ext) {  
           P_marker[i_c] = j;  
           j++;  
         }  
         }  
       }  
     }  
1269       }       }
1270     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP9 main_size%d couple_size%d\n", rank, i, j);  
1271    
1272     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1273        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,        Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1274        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for        and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1275        RAP_couple */        RAP_couple */
1276     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_main_ptr = new index_t[num_Pmain_cols+1];
1277     RAP_main_idx = MEMALLOC(i, index_t);     RAP_main_idx = new index_t[i];
1278     RAP_main_val = TMPMEMALLOC(i * block_size, double);     RAP_main_val = new double[i * block_size];
1279     RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);     RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1280     RAP_couple_idx = MEMALLOC(j, index_t);     RAP_couple_idx = new index_t[j];
1281     RAP_couple_val = TMPMEMALLOC(j * block_size, double);     RAP_couple_val = new double[j * block_size];
1282    
1283     RAP_main_ptr[num_Pmain_cols] = i;     RAP_main_ptr[num_Pmain_cols] = i;
1284     RAP_couple_ptr[num_Pmain_cols] = j;     RAP_couple_ptr[num_Pmain_cols] = j;
# Line 1611  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 1300  if (MY_DEBUG) fprintf(stderr, "rank %d C
1300       RAP_couple_ptr[i_r] = row_marker_ext;       RAP_couple_ptr[i_r] = row_marker_ext;
1301    
1302       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements       /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1303      in row i_r of RAP_ext */          in row i_r of RAP_ext */
1304       P_marker[i_r] = i;       P_marker[i_r] = i;
1305       for (ib=0; ib<block_size; ib++)       for (ib=0; ib<block_size; ib++)
1306         RAP_main_val[i*block_size+ib] = ZERO;         RAP_main_val[i*block_size+ib] = ZERO;
# Line 1619  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 1308  if (MY_DEBUG) fprintf(stderr, "rank %d C
1308       i++;       i++;
1309    
1310       for (j1=0; j1<num_neighbors; j1++) {       for (j1=0; j1<num_neighbors; j1++) {
1311      for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {          for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1312        if (shared[j2] == i_r) {            if (shared[j2] == i_r) {
1313          for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {              for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1314            i_c = RAP_ext_idx[k];                i_c = RAP_ext_idx[k];
1315            if (i_c < num_Pmain_cols) {                if (i_c < num_Pmain_cols) {
1316  if (MY_DEBUG){                  if (P_marker[i_c] < row_marker) {
1317  if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: ext %f\n", RAP_ext_val[k*block_size]);                    P_marker[i_c] = i;
1318  }                    memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1319          if (P_marker[i_c] < row_marker) {                    RAP_main_idx[i] = i_c;
1320            P_marker[i_c] = i;                    i++;
1321            //RAP_main_val[i] = RAP_ext_val[k];                  } else {
1322            memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));                    t1_val = &(RAP_ext_val[k*block_size]);
1323  if (MY_DEBUG){                    t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1324  if (rank == 0 && i == 2) fprintf(stderr, "ext %f\n", RAP_ext_val[k*block_size]);                    for (ib=0; ib<block_size; ib++)
1325  }                      t2_val[ib] += t1_val[ib];
1326            RAP_main_idx[i] = i_c;                  }
1327            i++;                } else {
1328          } else {                  if (P_marker[i_c] < row_marker_ext) {
1329            //RAP_main_val[P_marker[i_c]] += RAP_ext_val[k];                    P_marker[i_c] = j;
1330            t1_val = &(RAP_ext_val[k*block_size]);                    memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1331            t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
1332            for (ib=0; ib<block_size; ib++)                    j++;
1333              t2_val[ib] += t1_val[ib];                  } else {
1334  if (MY_DEBUG){                    t1_val = &(RAP_ext_val[k*block_size]);
1335  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "i_c %d ext %f\n", i_c, RAP_ext_val[k]);                    t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1336  }                      for (ib=0; ib<block_size; ib++)
1337          }                      t2_val[ib] += t1_val[ib];
1338            } else {                  }
1339          if (P_marker[i_c] < row_marker_ext) {                }
1340            P_marker[i_c] = j;              }
1341            //RAP_couple_val[j] = RAP_ext_val[k];              break;
1342            memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));            }
1343            RAP_couple_idx[j] = i_c - num_Pmain_cols;          }
           j++;  
         } else {  
           //RAP_couple_val[P_marker[i_c]] += RAP_ext_val[k];  
           t1_val = &(RAP_ext_val[k*block_size]);  
           t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);  
           for (ib=0; ib<block_size; ib++)  
             t2_val[ib] += t1_val[ib];  
         }  
           }  
         }  
         break;  
       }  
     }  
1344       }       }
1345    
1346       /* then, loop over elements in row i_r of R_main */       /* then, loop over elements in row i_r of R_main */
1347       j1_ub = R_main->pattern->ptr[i_r+1];       j1_ub = R_main->pattern->ptr[i_r+1];
1348       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1349      i1 = R_main->pattern->index[j1];          i1 = R_main->pattern->index[j1];
1350      R_val = &(R_main->val[j1*P_block_size]);          R_val = &(R_main->val[j1*P_block_size]);
1351    
1352      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
1353      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1354      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1355        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
       //RA_val = R_val * A->col_coupleBlock->val[j2];  
1356            temp_val = &(A->col_coupleBlock->val[j2*block_size]);            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1357        for (irb=0; irb<row_block_size; irb++)            for (irb=0; irb<row_block_size; irb++)
1358          for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
1359          RA_val[irb+row_block_size*icb]=ZERO;                  RA_val[irb+row_block_size*icb]=ZERO;
1360            for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
1361          rtmp=R_val[irb];              rtmp=R_val[irb];
1362              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
1363                  RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1364              }              }
1365            }            }
1366    
1367    
1368        /* check whether entry RA[i_r, i2] has been previously visited.            /* check whether entry RA[i_r, i2] has been previously visited.
1369           RAP new entry is possible only if entry RA[i_r, i2] has not               RAP new entry is possible only if entry RA[i_r, i2] has not
1370           been visited yet */               been visited yet */
1371        if (A_marker[i2] != i_r) {            if (A_marker[i2] != i_r) {
1372          /* first, mark entry RA[i_r,i2] as visited */              /* first, mark entry RA[i_r,i2] as visited */
1373          A_marker[i2] = i_r;              A_marker[i2] = i_r;
1374    
1375          /* then loop over elements in row i2 of P_ext_main */              /* then loop over elements in row i2 of P_ext_main */
1376          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1377          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1378          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
         //RAP_val = RA_val * P->row_coupleBlock->val[j3];  
1379                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1380          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1381            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1382              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1383                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1384            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1385                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1386                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1387                    }                    }
1388                  }                  }
1389    
1390    
1391          /* check whether entry RAP[i_r,i_c] has been created.                  /* check whether entry RAP[i_r,i_c] has been created.
1392             If not yet created, create the entry and increase                     If not yet created, create the entry and increase
1393             the total number of elements in RAP_ext */                     the total number of elements in RAP_ext */
1394  if (MY_DEBUG){                  if (P_marker[i_c] < row_marker) {
1395  if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP %.20f R %f %f A %f P %f %f [%d %d]\n", RAP_val[0], R_val[0], R_val[1], A->col_coupleBlock->val[j2*block_size], temp_val[0], temp_val[1], i2, j3);                    P_marker[i_c] = i;
1396  }                    memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1397          if (P_marker[i_c] < row_marker) {                    RAP_main_idx[i] = i_c;
1398            P_marker[i_c] = i;                    i++;
1399            //RAP_main_val[i] = RAP_val;                  } else {
           memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));  
 if (MY_DEBUG){  
 if (rank == 0 && i == 2) fprintf(stderr, "R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);  
 }    
           RAP_main_idx[i] = i_c;  
           i++;  
         } else {  
           //RAP_main_val[P_marker[i_c]] += RAP_val;  
1400                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1401                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1402                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1403                    }                    }
1404                    }
1405                }
1406    
1407  if (MY_DEBUG){              /* loop over elements in row i2 of P_ext_couple, do the same */
1408  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1 , *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1409  }              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1410          }                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         }  
   
         /* loop over elements in row i2 of P_ext_couple, do the same */  
         j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         //RAP_val = RA_val * P->remote_coupleBlock->val[j3];  
1411                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1412          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1413            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1414              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1415                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1416            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1417                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1418                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1419                    }                    }
1420                  }                  }
1421    
1422          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
1423            P_marker[i_c] = j;                    P_marker[i_c] = j;
1424            //RAP_couple_val[j] = RAP_val;                    memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1425            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
1426            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    j++;
1427            j++;                  } else {
         } else {  
           //RAP_couple_val[P_marker[i_c]] += RAP_val;  
1428                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1429                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1430                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1431                    }                    }
1432          }                  }
1433          }              }
1434    
1435        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.            /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1436           Only the contributions are added. */               Only the contributions are added. */
1437        } else {            } else {
1438          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1439          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1440          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
         //RAP_val = RA_val * P->row_coupleBlock->val[j3];  
1441                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1442          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1443            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1444              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1445                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1446            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1447                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1448                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1449                    }                    }
1450                  }                  }
1451    
         //RAP_main_val[P_marker[i_c]] += RAP_val;  
 if (MY_DEBUG){  
 if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[2] %.20f\n", RAP_val[0]);  
 }  
1452                  temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1453                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1454                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1455                  }                  }
1456                }
1457  if (MY_DEBUG){              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1458  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1459  }                  i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         }  
         j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         //RAP_val = RA_val * P->remote_coupleBlock->val[j3];  
1460                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1461          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1462            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1463              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1464                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1465            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1466                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1467                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1468                    }                    }
1469                  }                  }
1470    
         //RAP_couple_val[P_marker[i_c]] += RAP_val;  
1471                  temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1472                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1473                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1474                  }                  }
1475          }              }
1476        }            }
1477      }          }
1478    
1479      /* now loop over elements in row i1 of A->mainBlock, repeat          /* now loop over elements in row i1 of A->mainBlock, repeat
1480         the process we have done to block A->col_coupleBlock */             the process we have done to block A->col_coupleBlock */
1481      j2_ub = A->mainBlock->pattern->ptr[i1+1];          j2_ub = A->mainBlock->pattern->ptr[i1+1];
1482      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1483        i2 = A->mainBlock->pattern->index[j2];            i2 = A->mainBlock->pattern->index[j2];
       //RA_val = R_val * A->mainBlock->val[j2];  
1484            temp_val = &(A->mainBlock->val[j2*block_size]);            temp_val = &(A->mainBlock->val[j2*block_size]);
1485        for (irb=0; irb<row_block_size; irb++)            for (irb=0; irb<row_block_size; irb++)
1486          for (icb=0; icb<col_block_size; icb++)              for (icb=0; icb<col_block_size; icb++)
1487          RA_val[irb+row_block_size*icb]=ZERO;                  RA_val[irb+row_block_size*icb]=ZERO;
1488            for (irb=0; irb<P_block_size; irb++) {            for (irb=0; irb<P_block_size; irb++) {
1489          rtmp=R_val[irb];              rtmp=R_val[irb];
1490              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
1491                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];                  RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1492              }              }
1493            }            }
1494    
1495        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
1496          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
1497          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1498          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1499          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
         //RAP_val = RA_val * P->mainBlock->val[j3];  
1500                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
1501          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1502            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1503              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1504                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1505            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1506                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1507                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1508                    }                    }
1509                  }                  }
1510                    if (P_marker[i_c] < row_marker) {
1511  if (MY_DEBUG){                    P_marker[i_c] = i;
1512  if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[3] %.20f R %f %f A %f P %f %f\n [%d %d]", RAP_val[0], R_val[0], R_val[1], A->mainBlock->val[j2*block_size], temp_val[0], temp_val[1], i2, j3);                    memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1513  }                    RAP_main_idx[i] = i_c;
1514          if (P_marker[i_c] < row_marker) {                    i++;
1515            P_marker[i_c] = i;                  } else {
           //RAP_main_val[i] = RAP_val;  
           memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));  
 if (MY_DEBUG){  
 if (rank == 0 && i == 2) fprintf(stderr, "Main R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);  
 }  
           RAP_main_idx[i] = i_c;  
           i++;  
         } else {  
           //RAP_main_val[P_marker[i_c]] += RAP_val;  
1516                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1517                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1518                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1519                    }                    }
1520                    }
1521  if (MY_DEBUG){                }
1522  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1523  }              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1524          }                  /* note that we need to map the column index in
1525          }                     P->col_coupleBlock back into the column index in
1526          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];                     P_ext_couple */
1527          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         /* note that we need to map the column index in  
            P->col_coupleBlock back into the column index in  
            P_ext_couple */  
         i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
 if (MY_DEBUG && (i_c < 0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d--%d,%d) 3\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);  
         //RAP_val = RA_val * P->col_coupleBlock->val[j3];  
1528                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1529          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1530            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1531              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1532                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1533            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1534                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1535                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1536                    }                    }
1537                  }                  }
1538    
1539          if (P_marker[i_c] < row_marker_ext) {                  if (P_marker[i_c] < row_marker_ext) {
1540            P_marker[i_c] = j;                    P_marker[i_c] = j;
1541            //RAP_couple_val[j] = RAP_val;                    memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1542            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));                    RAP_couple_idx[j] = i_c - num_Pmain_cols;
1543            RAP_couple_idx[j] = i_c - num_Pmain_cols;                    j++;
1544            j++;                  } else {
         } else {  
           //RAP_couple_val[P_marker[i_c]] += RAP_val;  
1545                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1546                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
1547                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
1548                    }                    }
1549          }                  }
1550          }              }
1551    
1552        } else {            } else {
1553          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
1554          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1555          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
         //RAP_val = RA_val * P->mainBlock->val[j3];  
1556                  temp_val = &(P->mainBlock->val[j3*P_block_size]);                  temp_val = &(P->mainBlock->val[j3*P_block_size]);
1557          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1558            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1559              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1560                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1561            rtmp=temp_val[icb];                    rtmp=temp_val[icb];
1562                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1563                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1564                    }                    }
1565                  }                  }
1566    
         //RAP_main_val[P_marker[i_c]] += RAP_val;  
 if (MY_DEBUG){  
 if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[4] %f\n", RAP_val[0]);  
 }  
1567                  temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1568                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1569                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1570                  }                  }
1571                }
1572  if (MY_DEBUG){              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1573  if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main Visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1574  }                  i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
 if (MY_DEBUG && i_c >= sum) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d -- %d,%d) 4\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);  
         //RAP_val = RA_val * P->col_coupleBlock->val[j3];  
1575                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1576          for (irb=0; irb<row_block_size; irb++)                  for (irb=0; irb<row_block_size; irb++)
1577            for (icb=0; icb<col_block_size; icb++)                    for (icb=0; icb<col_block_size; icb++)
1578              RAP_val[irb+row_block_size*icb]=ZERO;                      RAP_val[irb+row_block_size*icb]=ZERO;
1579                  for (icb=0; icb<P_block_size; icb++) {                  for (icb=0; icb<P_block_size; icb++) {
1580            rtmp = temp_val[icb];                    rtmp = temp_val[icb];
1581                    for (irb=0; irb<row_block_size; irb++) {                    for (irb=0; irb<row_block_size; irb++) {
1582                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;                      RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1583                    }                    }
1584                  }                  }
1585    
         //RAP_couple_val[P_marker[i_c]] += RAP_val;  
1586                  temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1587                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
1588                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
1589                  }                  }
1590          }              }
1591        }            }
1592      }          }
1593       }       }
1594    
1595       /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */       /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1596       if (i > row_marker) {       if (i > row_marker) {
1597      offset = i - row_marker;          offset = i - row_marker;
1598      temp = TMPMEMALLOC(offset, index_t);          temp = new index_t[offset];
1599      for (iptr=0; iptr<offset; iptr++)          #pragma omp parallel for schedule(static) private(iptr)
1600        temp[iptr] = RAP_main_idx[row_marker+iptr];          for (iptr=0; iptr<offset; iptr++)
1601      if (offset > 0) {            temp[iptr] = RAP_main_idx[row_marker+iptr];
1602        #ifdef USE_QSORTG          if (offset > 0) {
1603          qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);              qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1604        #else          }
1605          qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);          temp_val = new double[offset * block_size];
1606        #endif          #pragma omp parallel for schedule(static) private(iptr,k)
1607      }          for (iptr=0; iptr<offset; iptr++){
1608      temp_val = TMPMEMALLOC(offset * block_size, double);            k = P_marker[temp[iptr]];
1609      for (iptr=0; iptr<offset; iptr++){            memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1610        k = P_marker[temp[iptr]];            P_marker[temp[iptr]] = iptr + row_marker;
1611        //temp_val[iptr] = RAP_main_val[k];          }
1612        memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));          #pragma omp parallel for schedule(static) private(iptr)
1613        P_marker[temp[iptr]] = iptr + row_marker;          for (iptr=0; iptr<offset; iptr++){
1614      }            RAP_main_idx[row_marker+iptr] = temp[iptr];
1615      for (iptr=0; iptr<offset; iptr++){            memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1616        RAP_main_idx[row_marker+iptr] = temp[iptr];          }
1617        //RAP_main_val[row_marker+iptr] = temp_val[iptr];          delete[] temp;
1618        memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));          delete[] temp_val;
     }  
     TMPMEMFREE(temp);  
     TMPMEMFREE(temp_val);  
1619       }       }
1620       if (j > row_marker_ext) {       if (j > row_marker_ext) {
1621          offset = j - row_marker_ext;          offset = j - row_marker_ext;
1622          temp = TMPMEMALLOC(offset, index_t);          temp = new index_t[offset];
1623            #pragma omp parallel for schedule(static) private(iptr)
1624          for (iptr=0; iptr<offset; iptr++)          for (iptr=0; iptr<offset; iptr++)
1625            temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];            temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1626          if (offset > 0) {          if (offset > 0) {
1627            #ifdef USE_QSORTG              qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
             qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);  
           #else  
             qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);  
           #endif  
1628          }          }
1629          temp_val = TMPMEMALLOC(offset * block_size, double);          temp_val = new double[offset * block_size];
1630            #pragma omp parallel for schedule(static) private(iptr, k)
1631          for (iptr=0; iptr<offset; iptr++){          for (iptr=0; iptr<offset; iptr++){
 if (MY_DEBUG && temp[iptr] >= num_RAPext_cols)  
 fprintf(stderr, "rank%d index %d exceed %d\n", rank, temp[iptr], num_RAPext_cols);  
1632            k = P_marker[temp[iptr] + num_Pmain_cols];            k = P_marker[temp[iptr] + num_Pmain_cols];
1633            //temp_val[iptr] = RAP_couple_val[k];            memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
       memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));  
1634            P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;            P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1635          }          }
1636            #pragma omp parallel for schedule(static) private(iptr)
1637          for (iptr=0; iptr<offset; iptr++){          for (iptr=0; iptr<offset; iptr++){
1638            RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];            RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1639            //RAP_couple_val[row_marker_ext+iptr] = temp_val[iptr];            memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
       memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));  
1640          }          }
1641          TMPMEMFREE(temp);          delete[] temp;
1642          TMPMEMFREE(temp_val);          delete[] temp_val;
1643       }       }
1644     }     }
1645    
1646     TMPMEMFREE(RA_val);     delete[] RA_val;
1647     TMPMEMFREE(RAP_val);     delete[] RAP_val;
1648     TMPMEMFREE(A_marker);     delete[] A_marker;
1649     TMPMEMFREE(Pext_to_RAP);     delete[] Pext_to_RAP;
1650     TMPMEMFREE(Pcouple_to_RAP);     delete[] Pcouple_to_RAP;
1651     MEMFREE(RAP_ext_ptr);     delete[] RAP_ext_ptr;
1652     MEMFREE(RAP_ext_idx);     delete[] RAP_ext_idx;
1653     MEMFREE(RAP_ext_val);     delete[] RAP_ext_val;
1654     Paso_SparseMatrix_free(R_main);     R_main.reset();
1655     Paso_SparseMatrix_free(R_couple);     R_couple.reset();
   
 if (MY_DEBUG){  
 if (rank == 0) fprintf(stderr, "A[%d]=%f A[%d]=%f A[%d]=%f A[%d]=%f \n", RAP_main_idx[0], RAP_main_val[0], RAP_main_idx[1], RAP_main_val[1], RAP_main_idx[2], RAP_main_val[2], RAP_main_idx[3], RAP_main_val[3]);  
 }  
 if (MY_DEBUG) fprintf(stderr, "rank %d CP10\n", rank);  
 if (MY_DEBUG && rank == 1) {  
 int sum = num_RAPext_cols, my_i;  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, sum);  
 for (my_i=0; my_i<sum; my_i++){  
   sprintf(str2, "%d ", global_id_RAP[my_i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 sprintf(str1, "rank %d distribution = (", rank);  
 for (my_i=0; my_i<size; my_i++){  
   sprintf(str2, "%d ", P->pattern->input_distribution->first_component[my_i+1]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
   
1656    
1657     /* Check whether there are empty columns in RAP_couple */     /* Check whether there are empty columns in RAP_couple */
1658     #pragma omp parallel for schedule(static) private(i)     #pragma omp parallel for schedule(static) private(i)
# Line 2077  TMPMEMFREE(str2); Line 1660  TMPMEMFREE(str2);
1660     /* num of non-empty columns is stored in "k" */     /* num of non-empty columns is stored in "k" */
1661     k = 0;       k = 0;  
1662     j = RAP_couple_ptr[num_Pmain_cols];     j = RAP_couple_ptr[num_Pmain_cols];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP10_1 %d %d %d\n", rank, num_RAPext_cols, num_Pmain_cols, j);  
1663     for (i=0; i<j; i++) {     for (i=0; i<j; i++) {
1664       i1 = RAP_couple_idx[i];       i1 = RAP_couple_idx[i];
1665       if (P_marker[i1]) {       if (P_marker[i1]) {
1666      P_marker[i1] = 0;          P_marker[i1] = 0;
1667      k++;          k++;
1668       }       }
1669     }     }
1670    
 if (MY_DEBUG) fprintf(stderr, "rank %d CP10_2 %d\n", rank, k);  
1671     /* empty columns is found */     /* empty columns is found */
1672     if (k < num_RAPext_cols) {     if (k < num_RAPext_cols) {
1673       temp = TMPMEMALLOC(k, index_t);       temp = new index_t[k];
1674       k = 0;       k = 0;
1675       for (i=0; i<num_RAPext_cols; i++)       for (i=0; i<num_RAPext_cols; i++)
1676      if (!P_marker[i]) {          if (!P_marker[i]) {
1677        P_marker[i] = k;            P_marker[i] = k;
1678        temp[k] = global_id_RAP[i];            temp[k] = global_id_RAP[i];
1679        k++;            k++;
1680      }          }
1681         #pragma omp parallel for schedule(static) private(i, i1)
1682       for (i=0; i<j; i++) {       for (i=0; i<j; i++) {
1683      i1 = RAP_couple_idx[i];          i1 = RAP_couple_idx[i];
1684      RAP_couple_idx[i] = P_marker[i1];          RAP_couple_idx[i] = P_marker[i1];
1685       }       }
1686       num_RAPext_cols = k;       num_RAPext_cols = k;
1687       TMPMEMFREE(global_id_RAP);       delete[] global_id_RAP;
1688       global_id_RAP = temp;       global_id_RAP = temp;
1689     }     }
1690     TMPMEMFREE(P_marker);     delete[] P_marker;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP11\n", rank);  
1691    
1692     /******************************************************/     /******************************************************/
1693     /* Start to create the coasre level System Matrix A_c */     /* Start to create the coarse level System Matrix A_c */
1694     /******************************************************/     /******************************************************/
1695     /* first, prepare the sender/receiver for the col_connector */     /* first, prepare the sender/receiver for the col_connector */
1696     dist = P->pattern->input_distribution->first_component;     dist = P->pattern->input_distribution->first_component;
1697     recv_len = TMPMEMALLOC(size, dim_t);     recv_len = new dim_t[size];
1698     send_len = TMPMEMALLOC(size, dim_t);     send_len = new dim_t[size];
1699     neighbor = TMPMEMALLOC(size, Esys_MPI_rank);     neighbor = new Esys_MPI_rank[size];
1700     offsetInShared = TMPMEMALLOC(size+1, index_t);     offsetInShared = new index_t[size+1];
1701     shared = TMPMEMALLOC(num_RAPext_cols, index_t);     shared = new index_t[num_RAPext_cols];
1702     memset(recv_len, 0, sizeof(dim_t) * size);     memset(recv_len, 0, sizeof(dim_t) * size);
1703     memset(send_len, 0, sizeof(dim_t) * size);     memset(send_len, 0, sizeof(dim_t) * size);
1704     num_neighbors = 0;     num_neighbors = 0;
1705     offsetInShared[0] = 0;     offsetInShared[0] = 0;
1706     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
 if (MY_DEBUG && rank == 1) fprintf(stderr, "i%d j%d k%d %d %d\n", i, j, k, global_id_RAP[i], recv_len[j]);  
1707       shared[i] = i + num_Pmain_cols;       shared[i] = i + num_Pmain_cols;
1708       if (k <= global_id_RAP[i]) {       if (k <= global_id_RAP[i]) {
1709      if (recv_len[j] > 0) {          if (recv_len[j] > 0) {
1710        neighbor[num_neighbors] = j;            neighbor[num_neighbors] = j;
1711        num_neighbors ++;            num_neighbors ++;
1712        offsetInShared[num_neighbors] = i;            offsetInShared[num_neighbors] = i;
1713      }          }
1714      while (k <= global_id_RAP[i]) {          while (k <= global_id_RAP[i]) {
1715        j++;            j++;
1716        k = dist[j+1];            k = dist[j+1];
1717      }          }
1718       }       }
1719       recv_len[j] ++;       recv_len[j] ++;
1720     }     }
# Line 2143  if (MY_DEBUG && rank == 1) fprintf(stder Line 1723  if (MY_DEBUG && rank == 1) fprintf(stder
1723       num_neighbors ++;       num_neighbors ++;
1724       offsetInShared[num_neighbors] = i;       offsetInShared[num_neighbors] = i;
1725     }     }
    recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,  
             neighbor, shared, offsetInShared, 1, 0, mpi_info);  
1726    
1727       paso::SharedComponents_ptr recv(new paso::SharedComponents(
1728                   num_Pmain_cols, num_neighbors, neighbor, shared,
1729                   offsetInShared, 1, 0, mpi_info));
1730    
1731       #ifdef ESYS_MPI
1732     MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);     MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1733  if (MY_DEBUG) fprintf(stderr, "rank %d CP12 %d %d\n", rank, num_neighbors, num_RAPext_cols);     #endif
1734    
1735     #ifdef ESYS_MPI     #ifdef ESYS_MPI
1736       mpi_requests=TMPMEMALLOC(size*2,MPI_Request);       mpi_requests=new MPI_Request[size*2];
1737       mpi_stati=TMPMEMALLOC(size*2,MPI_Status);       mpi_stati=new MPI_Status[size*2];
1738     #else     #else
1739       mpi_requests=TMPMEMALLOC(size*2,int);       mpi_requests=new int[size*2];
1740       mpi_stati=TMPMEMALLOC(size*2,int);       mpi_stati=new int[size*2];
1741     #endif     #endif
1742     num_neighbors = 0;     num_neighbors = 0;
1743     j = 0;     j = 0;
1744     offsetInShared[0] = 0;     offsetInShared[0] = 0;
1745     for (i=0; i<size; i++) {     for (i=0; i<size; i++) {
1746       if (send_len[i] > 0) {       if (send_len[i] > 0) {
1747      neighbor[num_neighbors] = i;          neighbor[num_neighbors] = i;
1748      num_neighbors ++;          num_neighbors ++;
1749      j += send_len[i];          j += send_len[i];
1750      offsetInShared[num_neighbors] = j;          offsetInShared[num_neighbors] = j;
1751       }       }
1752     }     }
1753     TMPMEMFREE(shared);     delete[] shared;
1754     shared = TMPMEMALLOC(j, index_t);     shared = new index_t[j];
1755     for (i=0, j=0; i<num_neighbors; i++) {     for (i=0, j=0; i<num_neighbors; i++) {
1756       k = neighbor[i];       k = neighbor[i];
1757         #ifdef ESYS_MPI
1758       MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,       MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1759          mpi_info->msg_tag_counter+k,                  mpi_info->msg_tag_counter+k,
1760          mpi_info->comm, &mpi_requests[i]);                  mpi_info->comm, &mpi_requests[i]);
1761         #endif
1762       j += send_len[k];       j += send_len[k];
1763     }     }
1764     for (i=0, j=0; i<recv->numNeighbors; i++) {     for (i=0, j=0; i<recv->numNeighbors; i++) {
1765       k = recv->neighbor[i];       k = recv->neighbor[i];
1766         #ifdef ESYS_MPI
1767       MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,       MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1768          mpi_info->msg_tag_counter+rank,                  mpi_info->msg_tag_counter+rank,
1769          mpi_info->comm, &mpi_requests[i+num_neighbors]);                  mpi_info->comm, &mpi_requests[i+num_neighbors]);
1770         #endif
1771       j += recv_len[k];       j += recv_len[k];
1772     }     }
1773       #ifdef ESYS_MPI
1774     MPI_Waitall(num_neighbors + recv->numNeighbors,     MPI_Waitall(num_neighbors + recv->numNeighbors,
1775          mpi_requests, mpi_stati);                  mpi_requests, mpi_stati);
1776     mpi_info->msg_tag_counter += size;     #endif
1777  if (MY_DEBUG) fprintf(stderr, "rank %d CP13\n", rank);     ESYS_MPI_INC_COUNTER(*mpi_info, size);
1778    
1779     j = offsetInShared[num_neighbors];     j = offsetInShared[num_neighbors];
1780     offset = dist[rank];     offset = dist[rank];
1781       #pragma omp parallel for schedule(static) private(i)
1782     for (i=0; i<j; i++) shared[i] = shared[i] - offset;     for (i=0; i<j; i++) shared[i] = shared[i] - offset;
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,  
             neighbor, shared, offsetInShared, 1, 0, mpi_info);  
1783    
1784     col_connector = Paso_Connector_alloc(send, recv);     paso::SharedComponents_ptr send(new paso::SharedComponents(
1785     TMPMEMFREE(shared);                 num_Pmain_cols, num_neighbors, neighbor, shared,
1786     Paso_SharedComponents_free(recv);                 offsetInShared, 1, 0, mpi_info));
1787     Paso_SharedComponents_free(send);  
1788  if (MY_DEBUG) fprintf(stderr, "rank %d CP14\n", rank);     col_connector.reset(new paso::Connector(send, recv));
1789       delete[] shared;
1790    
1791     /* now, create row distribution (output_distri) and col     /* now, create row distribution (output_distri) and col
1792        distribution (input_distribution) */        distribution (input_distribution) */
1793     input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);     input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1794     output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);     output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_1\n", rank);  
1795    
1796     /* then, prepare the sender/receiver for the row_connector, first, prepare     /* then, prepare the sender/receiver for the row_connector, first, prepare
1797        the information for sender */        the information for sender */
1798     sum = RAP_couple_ptr[num_Pmain_cols];     sum = RAP_couple_ptr[num_Pmain_cols];
1799     len = TMPMEMALLOC(size, dim_t);     len = new dim_t[size];
1800     send_ptr = TMPMEMALLOC(size, index_t*);     send_ptr = new index_t*[size];
1801     send_idx = TMPMEMALLOC(size, index_t*);     send_idx = new index_t*[size];
1802       #pragma omp parallel for schedule(static) private(i)
1803     for (i=0; i<size; i++) {     for (i=0; i<size; i++) {
1804       send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);       send_ptr[i] = new index_t[num_Pmain_cols];
1805       send_idx[i] = TMPMEMALLOC(sum, index_t);       send_idx[i] = new index_t[sum];
1806       memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);       memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1807     }     }
1808     memset(len, 0, sizeof(dim_t) * size);     memset(len, 0, sizeof(dim_t) * size);
 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_2 %d\n", rank, sum);  
1809     recv = col_connector->recv;     recv = col_connector->recv;
1810     sum=0;     sum=0;
1811     for (i_r=0; i_r<num_Pmain_cols; i_r++) {     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1812       i1 = RAP_couple_ptr[i_r];       i1 = RAP_couple_ptr[i_r];
1813       i2 = RAP_couple_ptr[i_r+1];       i2 = RAP_couple_ptr[i_r+1];
1814       if (i2 > i1) {       if (i2 > i1) {
1815      /* then row i_r will be in the sender of row_connector, now check          /* then row i_r will be in the sender of row_connector, now check
1816         how many neighbors i_r needs to be send to */             how many neighbours i_r needs to be send to */
1817      for (j=i1; j<i2; j++) {          for (j=i1; j<i2; j++) {
1818        i_c = RAP_couple_idx[j];            i_c = RAP_couple_idx[j];
1819        /* find out the corresponding neighbor "p" of column i_c */            /* find out the corresponding neighbor "p" of column i_c */
1820        for (p=0; p<recv->numNeighbors; p++) {            for (p=0; p<recv->numNeighbors; p++) {
1821          if (i_c < recv->offsetInShared[p+1]) {              if (i_c < recv->offsetInShared[p+1]) {
1822            k = recv->neighbor[p];                k = recv->neighbor[p];
1823  if (MY_DEBUG && rank == 1 && k == 1)                if (send_ptr[k][i_r] == 0) sum++;
1824  fprintf(stderr, "******* i_r %d i_c %d p %d ub %d numNeighbors %d\n", i_r, i_c, p, recv->offsetInShared[p+1], recv->numNeighbors);                send_ptr[k][i_r] ++;
1825            if (send_ptr[k][i_r] == 0) sum++;                send_idx[k][len[k]] = global_id_RAP[i_c];
1826            send_ptr[k][i_r] ++;                len[k] ++;
1827            send_idx[k][len[k]] = global_id_RAP[i_c];                break;
1828            len[k] ++;              }
1829            break;            }
1830          }          }
       }  
     }  
1831       }       }
1832     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_3 %d\n", rank, send_ptr[1][0]);  
1833     if (global_id_RAP) {     if (global_id_RAP) {
1834       TMPMEMFREE(global_id_RAP);       delete[] global_id_RAP;
1835       global_id_RAP = NULL;       global_id_RAP = NULL;
1836     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_4 %d sum%d cols%d\n", rank, len[0], sum, num_RAPext_cols);  
1837    
1838     /* now allocate the sender */     /* now allocate the sender */
1839     shared = TMPMEMALLOC(sum, index_t);     shared = new index_t[sum];
1840     memset(send_len, 0, sizeof(dim_t) * size);     memset(send_len, 0, sizeof(dim_t) * size);
1841     num_neighbors=0;     num_neighbors=0;
1842     offsetInShared[0] = 0;     offsetInShared[0] = 0;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_5 %d\n", rank, size);  
1843     for (p=0, k=0; p<size; p++) {     for (p=0, k=0; p<size; p++) {
 if (MY_DEBUG && rank == 1) fprintf(stderr, "rank %d CP14_6 %d %d %d\n", rank, p, k, num_neighbors);  
1844       for (i=0; i<num_Pmain_cols; i++) {       for (i=0; i<num_Pmain_cols; i++) {
1845      if (send_ptr[p][i] > 0) {          if (send_ptr[p][i] > 0) {
1846        shared[k] = i;            shared[k] = i;
1847        k++;            k++;
1848        send_ptr[p][send_len[p]] = send_ptr[p][i];            send_ptr[p][send_len[p]] = send_ptr[p][i];
1849        send_len[p]++;            send_len[p]++;
1850      }          }
1851       }       }
1852       if (k > offsetInShared[num_neighbors]) {       if (k > offsetInShared[num_neighbors]) {
1853      neighbor[num_neighbors] = p;          neighbor[num_neighbors] = p;
1854      num_neighbors ++;          num_neighbors ++;
1855      offsetInShared[num_neighbors] = k;          offsetInShared[num_neighbors] = k;
1856       }       }
1857     }     }
1858  if (MY_DEBUG) fprintf(stderr, "rank %d CP14_8 %d %d %d %d %d\n", rank, k, send_len[0], send_len[1], offsetInShared[1], k);     send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1859     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,                 neighbor, shared, offsetInShared, 1, 0, mpi_info));
                         neighbor, shared, offsetInShared, 1, 0, mpi_info);  
1860    
1861     /* send/recv number of rows will be sent from current proc     /* send/recv number of rows will be sent from current proc
1862        recover info for the receiver of row_connector from the sender */        recover info for the receiver of row_connector from the sender */
1863       #ifdef ESYS_MPI
1864     MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);     MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1865  if (MY_DEBUG) fprintf(stderr, "rank %d CP15 %d %d\n", rank, recv_len[0], recv_len[1]);     #endif
1866     num_neighbors = 0;     num_neighbors = 0;
1867     offsetInShared[0] = 0;     offsetInShared[0] = 0;
1868     j = 0;     j = 0;
1869     for (i=0; i<size; i++) {     for (i=0; i<size; i++) {
1870       if (i != rank && recv_len[i] > 0) {       if (i != rank && recv_len[i] > 0) {
1871      neighbor[num_neighbors] = i;          neighbor[num_neighbors] = i;
1872      num_neighbors ++;          num_neighbors ++;
1873      j += recv_len[i];          j += recv_len[i];
1874      offsetInShared[num_neighbors] = j;          offsetInShared[num_neighbors] = j;
1875       }       }
1876     }     }
1877  if (MY_DEBUG) fprintf(stderr, "rank %d CP15_1\n", rank);     delete[] shared;
1878     TMPMEMFREE(shared);     delete[] recv_len;
1879     TMPMEMFREE(recv_len);     shared = new index_t[j];
    shared = TMPMEMALLOC(j, index_t);  
1880     k = offsetInShared[num_neighbors];     k = offsetInShared[num_neighbors];
1881       #pragma omp parallel for schedule(static) private(i)
1882     for (i=0; i<k; i++) {     for (i=0; i<k; i++) {
1883       shared[i] = i + num_Pmain_cols;       shared[i] = i + num_Pmain_cols;
1884     }     }
1885  if (MY_DEBUG) fprintf(stderr, "rank %d CP15_2\n", rank);     recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1886     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,                 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1887                          neighbor, shared, offsetInShared, 1, 0, mpi_info);     row_connector.reset(new paso::Connector(send, recv));
1888  if (MY_DEBUG) fprintf(stderr, "rank %d CP15_3\n", rank);     delete[] shared;
    row_connector = Paso_Connector_alloc(send, recv);  
    TMPMEMFREE(shared);  
    Paso_SharedComponents_free(recv);  
    Paso_SharedComponents_free(send);  
 if (MY_DEBUG) fprintf(stderr, "rank %d CP16 %d R%d S%d\n", rank, k, num_neighbors, send->numNeighbors);  
1889    
1890     /* send/recv pattern->ptr for rowCoupleBlock */     /* send/recv pattern->ptr for rowCoupleBlock */
1891     num_RAPext_rows = offsetInShared[num_neighbors];     num_RAPext_rows = offsetInShared[num_neighbors];
1892     row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);     row_couple_ptr = new index_t[num_RAPext_rows+1];
1893     for (p=0; p<num_neighbors; p++) {     for (p=0; p<num_neighbors; p++) {
1894       j = offsetInShared[p];       j = offsetInShared[p];
1895       i = offsetInShared[p+1];       i = offsetInShared[p+1];
1896         #ifdef ESYS_MPI
1897       MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],       MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1898          mpi_info->msg_tag_counter+neighbor[p],                  mpi_info->msg_tag_counter+neighbor[p],
1899          mpi_info->comm, &mpi_requests[p]);                  mpi_info->comm, &mpi_requests[p]);
1900  if (MY_DEBUG) fprintf(stderr, "rank %d RECV %d from %d tag %d\n", rank, i-j, neighbor[p], mpi_info->msg_tag_counter+neighbor[p]);       #endif
1901     }     }
1902     send = row_connector->send;     send = row_connector->send;
1903     for (p=0; p<send->numNeighbors; p++) {     for (p=0; p<send->numNeighbors; p++) {
1904         #ifdef ESYS_MPI
1905       MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],       MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1906          MPI_INT, send->neighbor[p],                  MPI_INT, send->neighbor[p],
1907          mpi_info->msg_tag_counter+rank,                  mpi_info->msg_tag_counter+rank,
1908          mpi_info->comm, &mpi_requests[p+num_neighbors]);                  mpi_info->comm, &mpi_requests[p+num_neighbors]);
1909  if (MY_DEBUG) fprintf(stderr, "rank %d SEND %d TO %d tag %d\n", rank, send_len[send->neighbor[p]], send->neighbor[p], mpi_info->msg_tag_counter+rank);       #endif
1910     }     }
1911       #ifdef ESYS_MPI
1912     MPI_Waitall(num_neighbors + send->numNeighbors,     MPI_Waitall(num_neighbors + send->numNeighbors,
1913      mpi_requests, mpi_stati);          mpi_requests, mpi_stati);
1914     mpi_info->msg_tag_counter += size;     #endif
1915     TMPMEMFREE(send_len);     ESYS_MPI_INC_COUNTER(*mpi_info, size);
1916  if (MY_DEBUG) fprintf(stderr, "rank %d CP17\n", rank);     delete[] send_len;
1917    
1918     sum = 0;     sum = 0;
1919     for (i=0; i<num_RAPext_rows; i++) {     for (i=0; i<num_RAPext_rows; i++) {
# Line 2346  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 1925  if (MY_DEBUG) fprintf(stderr, "rank %d C
1925    
1926     /* send/recv pattern->index for rowCoupleBlock */     /* send/recv pattern->index for rowCoupleBlock */
1927     k = row_couple_ptr[num_RAPext_rows];     k = row_couple_ptr[num_RAPext_rows];
1928     row_couple_idx = MEMALLOC(k, index_t);     row_couple_idx = new index_t[k];
1929     for (p=0; p<num_neighbors; p++) {     for (p=0; p<num_neighbors; p++) {
1930       j1 = row_couple_ptr[offsetInShared[p]];       j1 = row_couple_ptr[offsetInShared[p]];
1931       j2 = row_couple_ptr[offsetInShared[p+1]];       j2 = row_couple_ptr[offsetInShared[p+1]];
1932         #ifdef ESYS_MPI
1933       MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],       MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1934          mpi_info->msg_tag_counter+neighbor[p],                  mpi_info->msg_tag_counter+neighbor[p],
1935          mpi_info->comm, &mpi_requests[p]);                  mpi_info->comm, &mpi_requests[p]);
1936         #endif
1937     }     }
1938     for (p=0; p<send->numNeighbors; p++) {     for (p=0; p<send->numNeighbors; p++) {
1939         #ifdef ESYS_MPI
1940       MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],       MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1941          MPI_INT, send->neighbor[p],                  MPI_INT, send->neighbor[p],
1942          mpi_info->msg_tag_counter+rank,                  mpi_info->msg_tag_counter+rank,
1943          mpi_info->comm, &mpi_requests[p+num_neighbors]);                  mpi_info->comm, &mpi_requests[p+num_neighbors]);
1944         #endif
1945     }     }
1946       #ifdef ESYS_MPI
1947     MPI_Waitall(num_neighbors + send->numNeighbors,     MPI_Waitall(num_neighbors + send->numNeighbors,
1948          mpi_requests, mpi_stati);                  mpi_requests, mpi_stati);
1949     mpi_info->msg_tag_counter += size;     #endif
1950       ESYS_MPI_INC_COUNTER(*mpi_info, size);
    offset = input_dist->first_component[rank];  
    k = row_couple_ptr[num_RAPext_rows];  
    for (i=0; i<k; i++) {  
      row_couple_idx[i] -= offset;  
    }  
   
    for (i=0; i<size; i++) {  
      TMPMEMFREE(send_ptr[i]);  
      TMPMEMFREE(send_idx[i]);  
    }  
    TMPMEMFREE(send_ptr);  
    TMPMEMFREE(send_idx);  
    TMPMEMFREE(len);  
    TMPMEMFREE(offsetInShared);  
    TMPMEMFREE(neighbor);  
    TMPMEMFREE(mpi_requests);  
    TMPMEMFREE(mpi_stati);  
    Esys_MPIInfo_free(mpi_info);  
 if (MY_DEBUG) fprintf(stderr, "rank %d CP18\n", rank);  
1951    
1952     /* Now, we can create pattern for mainBlock and coupleBlock */      offset = input_dist->first_component[rank];
1953     main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,      k = row_couple_ptr[num_RAPext_rows];
1954              num_Pmain_cols, RAP_main_ptr, RAP_main_idx);      #pragma omp parallel for schedule(static) private(i)
1955     col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,      for (i=0; i<k; i++) {
1956              num_Pmain_cols, num_RAPext_cols,          row_couple_idx[i] -= offset;
1957              RAP_couple_ptr, RAP_couple_idx);      }
1958     row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,      #pragma omp parallel for schedule(static) private(i)
1959              num_RAPext_rows, num_Pmain_cols,      for (i=0; i<size; i++) {
1960              row_couple_ptr, row_couple_idx);          delete[] send_ptr[i];
1961            delete[] send_idx[i];
1962        }
1963        delete[] send_ptr;
1964        delete[] send_idx;
1965        delete[] len;
1966        delete[] offsetInShared;
1967        delete[] neighbor;
1968        delete[] mpi_requests;
1969        delete[] mpi_stati;
1970        Esys_MPIInfo_free(mpi_info);
1971    
1972        /* Now, we can create pattern for mainBlock and coupleBlock */
1973        paso::Pattern_ptr main_pattern(new paso::Pattern(MATRIX_FORMAT_DEFAULT,
1974                   num_Pmain_cols, num_Pmain_cols, RAP_main_ptr, RAP_main_idx));
1975        paso::Pattern_ptr col_couple_pattern(new paso::Pattern(
1976                   MATRIX_FORMAT_DEFAULT, num_Pmain_cols, num_RAPext_cols,
1977                   RAP_couple_ptr, RAP_couple_idx));
1978        paso::Pattern_ptr row_couple_pattern(new paso::Pattern(
1979                   MATRIX_FORMAT_DEFAULT, num_RAPext_rows, num_Pmain_cols,
1980                   row_couple_ptr, row_couple_idx));
1981    
1982     /* next, create the system matrix */      /* next, create the system matrix */
1983     pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,      pattern.reset(new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1984                  output_dist, input_dist, main_pattern, col_couple_pattern,                  output_dist, input_dist, main_pattern, col_couple_pattern,
1985                  row_couple_pattern, col_connector, row_connector);                  row_couple_pattern, col_connector, row_connector));
1986     out = Paso_SystemMatrix_alloc(A->type, pattern,      out.reset(new paso::SystemMatrix(A->type, pattern, row_block_size,
1987                  row_block_size, col_block_size, FALSE);                                      col_block_size, false));
1988  if (MY_DEBUG) fprintf(stderr, "rank %d CP19\n", rank);  
1989        /* finally, fill in the data*/
1990     /* finally, fill in the data*/      memcpy(out->mainBlock->val, RAP_main_val,
1991     memcpy(out->mainBlock->val, RAP_main_val,                  out->mainBlock->len* sizeof(double));
1992          out->mainBlock->len* sizeof(double));      memcpy(out->col_coupleBlock->val, RAP_couple_val,
1993     memcpy(out->col_coupleBlock->val, RAP_couple_val,                  out->col_coupleBlock->len * sizeof(double));
1994          out->col_coupleBlock->len * sizeof(double));  
1995        delete[] RAP_main_val;
1996     /* Clean up */      delete[] RAP_couple_val;
1997     Paso_SystemMatrixPattern_free(pattern);      if (!Esys_noError()) {
1998     Paso_Pattern_free(main_pattern);          out.reset();
1999     Paso_Pattern_free(col_couple_pattern);      }
2000     Paso_Pattern_free(row_couple_pattern);      return out;
    Paso_Connector_free(col_connector);  
    Paso_Connector_free(row_connector);  
    Paso_Distribution_free(output_dist);  
    Paso_Distribution_free(input_dist);  
    TMPMEMFREE(RAP_main_val);  
    TMPMEMFREE(RAP_couple_val);  
    if (Esys_noError()) {  
      return out;  
    } else {  
      Paso_SystemMatrix_free(out);  
      return NULL;  
    }  
2001  }  }
2002    
2003    
2004  Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(  paso::SystemMatrix_ptr Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2005      Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)          paso::SystemMatrix_ptr A, paso::SystemMatrix_ptr P,
2006            paso::SystemMatrix_ptr R)
2007  {  {
2008     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2009     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;     paso::SystemMatrix_ptr out;
2010     Paso_SystemMatrix *out=NULL;     paso::SystemMatrixPattern_ptr pattern;
2011     Paso_SystemMatrixPattern *pattern=NULL;     paso::Distribution_ptr input_dist, output_dist;
2012     Paso_Distribution *input_dist=NULL, *output_dist=NULL;     paso::SharedComponents_ptr send, recv;
2013     Paso_SharedComponents *send =NULL, *recv=NULL;     paso::Connector_ptr col_connector, row_connector;
    Paso_Connector *col_connector=NULL, *row_connector=NULL;  
    Paso_Coupler *coupler=NULL;  
    Paso_Pattern *main_pattern=NULL;  
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;  
2014     const dim_t row_block_size=A->row_block_size;     const dim_t row_block_size=A->row_block_size;
2015     const dim_t col_block_size=A->col_block_size;     const dim_t col_block_size=A->col_block_size;
2016     const dim_t block_size = A->block_size;     const dim_t block_size = A->block_size;
    const dim_t P_block_size = P->block_size;  
    const dim_t num_threads=omp_get_max_threads();  
2017     const double ZERO = 0.0;     const double ZERO = 0.0;
2018     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2019     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
# Line 2462  Paso_SystemMatrix* Paso_Preconditioner_A Line 2030  Paso_SystemMatrix* Paso_Preconditioner_A
2030     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2031     index_t row_marker_ext, *where_p=NULL;     index_t row_marker_ext, *where_p=NULL;
2032     index_t **send_ptr=NULL, **send_idx=NULL;     index_t **send_ptr=NULL, **send_idx=NULL;
2033     dim_t l, p, q, global_label, num_neighbors;     dim_t p, num_neighbors;
2034     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2035     Esys_MPI_rank *neighbor=NULL;     Esys_MPI_rank *neighbor=NULL;
2036     #ifdef ESYS_MPI     #ifdef ESYS_MPI
# Line 2472  Paso_SystemMatrix* Paso_Preconditioner_A Line 2040  Paso_SystemMatrix* Paso_Preconditioner_A
2040       int *mpi_requests=NULL, *mpi_stati=NULL;       int *mpi_requests=NULL, *mpi_stati=NULL;
2041     #endif     #endif
2042    
2043     /* two sparse matrixes R_main and R_couple will be generate, as the     /* two sparse matrices R_main and R_couple will be generate, as the
2044        transpose of P_main and P_col_couple, respectively. Note that,        transpose of P_main and P_col_couple, respectively. Note that,
2045        R_couple is actually the row_coupleBlock of R (R=P^T) */        R_couple is actually the row_coupleBlock of R (R=P^T) */
2046     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);     paso::SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
2047     R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);     paso::SparseMatrix_ptr R_couple(P->col_coupleBlock->getTranspose());
2048    
2049  if (MY_DEBUG) fprintf(stderr, "CP1\n");     /* generate P_ext, i.e. portion of P that is stored on neighbor procs
   
    /* generate P_ext, i.e. portion of P that is tored on neighbor procs  
2050        and needed locally for triple matrix product RAP        and needed locally for triple matrix product RAP
2051        to be specific, P_ext (on processor k) are group of rows in P, where        to be specific, P_ext (on processor k) are group of rows in P, where
2052        the list of rows from processor q is given by        the list of rows from processor q is given by
2053      A->col_coupler->connector->send->shared[rPtr...]          A->col_coupler->connector->send->shared[rPtr...]
2054      rPtr=A->col_coupler->connector->send->OffsetInShared[k]          rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2055        on q.        on q.
2056        P_ext is represented by two sparse matrixes P_ext_main and        P_ext is represented by two sparse matrices P_ext_main and
2057        P_ext_couple */        P_ext_couple */
2058     Paso_Preconditioner_AMG_extendB(A, P);     Paso_Preconditioner_AMG_extendB(A, P);
 if (MY_DEBUG) fprintf(stderr, "rank %d CP2\n", rank);  
2059    
2060     /* count the number of cols in P_ext_couple, resize the pattern of     /* count the number of cols in P_ext_couple, resize the pattern of
2061        sparse matrix P_ext_couple with new compressed order, and then        sparse matrix P_ext_couple with new compressed order, and then
# Line 2505  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 2070  if (MY_DEBUG) fprintf(stderr, "rank %d C
2070     num_Pext_cols = 0;     num_Pext_cols = 0;
2071     if (P->global_id) {     if (P->global_id) {
2072       /* first count the number of cols "num_Pext_cols" in both P_ext_couple       /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2073      and P->col_coupleBlock */          and P->col_coupleBlock */
2074       iptr = 0;       iptr = 0;
2075       if (num_Pcouple_cols || sum > 0) {       if (num_Pcouple_cols || sum > 0) {
2076      temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);          temp = new index_t[num_Pcouple_cols+sum];
2077      for (; iptr<sum; iptr++){          for (; iptr<sum; iptr++){
2078        temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];            temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2079  if (MY_DEBUG && rank ==1)          }
2080  fprintf(stderr, "rank %d remote_block[%d] = %d\n", rank, iptr, temp[iptr]);          for (j=0; j<num_Pcouple_cols; j++, iptr++){
2081      }            temp[iptr] = P->global_id[j];
2082      for (j=0; j<num_Pcouple_cols; j++, iptr++){          }
       temp[iptr] = P->global_id[j];  
 if (MY_DEBUG && rank ==1)  
 fprintf(stderr, "rank %d global_id[%d] = %d\n", rank, j, P->global_id[j]);  
     }  
2083       }       }
2084       if (iptr) {       if (iptr) {
2085      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2086        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_Pext_cols = 1;
2087      #else          i = temp[0];
2088        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
2089      #endif            if (temp[j] > i) {
2090      num_Pext_cols = 1;              i = temp[j];
2091      i = temp[0];              temp[num_Pext_cols++] = i;
2092      for (j=1; j<iptr; j++) {            }
2093        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_Pext_cols++] = i;  
       }  
     }  
2094       }       }
2095    
2096       /* resize the pattern of P_ext_couple */       /* resize the pattern of P_ext_couple */
2097       if(num_Pext_cols){       if(num_Pext_cols){
2098      global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);          global_id_P = new index_t[num_Pext_cols];
2099      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
2100        global_id_P[i] = temp[i];            global_id_P[i] = temp[i];
 if (MY_DEBUG && rank == 1) {  
 int my_i, sum=num_Pext_cols;  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(100, char);  
 sprintf(str1, "rank %d global_id_P[%d] = (", rank, sum);  
 for (my_i=0; my_i<sum; my_i++) {  
   sprintf(str2, "%d ", global_id_P[my_i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
   
2101       }       }
2102       if (num_Pcouple_cols || sum > 0)       if (num_Pcouple_cols || sum > 0)
2103      TMPMEMFREE(temp);          delete[] temp;
2104       for (i=0; i<sum; i++) {       for (i=0; i<sum; i++) {
2105      where_p = (index_t *)bsearch(          where_p = (index_t *)bsearch(
2106              &(P->remote_coupleBlock->pattern->index[i]),                          &(P->remote_coupleBlock->pattern->index[i]),
2107              global_id_P, num_Pext_cols,                          global_id_P, num_Pext_cols,
2108                          sizeof(index_t), Paso_comparIndex);                          sizeof(index_t), paso::comparIndex);
2109      P->remote_coupleBlock->pattern->index[i] =          P->remote_coupleBlock->pattern->index[i] =
2110              (index_t)(where_p -global_id_P);                          (index_t)(where_p -global_id_P);
2111       }         }  
2112    
2113       /* build the mapping */       /* build the mapping */
2114       if (num_Pcouple_cols) {       if (num_Pcouple_cols) {
2115      Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);          Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2116      iptr = 0;          iptr = 0;
2117      for (i=0; i<num_Pext_cols; i++)          for (i=0; i<num_Pext_cols; i++)
2118        if (global_id_P[i] == P->global_id[iptr]) {            if (global_id_P[i] == P->global_id[iptr]) {
2119          Pcouple_to_Pext[iptr++] = i;              Pcouple_to_Pext[iptr++] = i;
2120          if (iptr == num_Pcouple_cols) break;              if (iptr == num_Pcouple_cols) break;
2121        }            }
2122       }       }
2123     }     }
2124    
2125  if (MY_DEBUG) fprintf(stderr, "CP3\n");     /* alloc and initialise the makers */
 if (MY_DEBUG && rank == 0) {  
 fprintf(stderr, "Rank %d Now Prolongation Matrix ************************\n", rank);  
 Paso_SystemMatrix_print(P);  
 fprintf(stderr, "Rank %d Now Restriction Matrix ************************\n", rank);  
 Paso_SystemMatrix_print(R);  
 }  
   
    /* alloc and initialize the makers */  
2126     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
2127     P_marker = TMPMEMALLOC(sum, index_t);     P_marker = new index_t[sum];
2128     A_marker = TMPMEMALLOC(num_A_cols, index_t);     A_marker = new index_t[num_A_cols];
2129     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2130     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
2131     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2132     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2133    
 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_1 %d\n", rank, num_Pcouple_cols);  
2134     /* Now, count the size of RAP_ext. Start with rows in R_couple */     /* Now, count the size of RAP_ext. Start with rows in R_couple */
2135     sum = 0;     sum = 0;
2136     for (i_r=0; i_r<num_Pcouple_cols; i_r++){     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2137       row_marker = sum;       row_marker = sum;
2138       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
2139       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_2 %d (%d %d)\n", rank, i_r, R_couple->pattern->ptr[i_r], j1_ub);  
2140       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2141      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
2142  if (MY_DEBUG) fprintf(stderr, "rank %d CP3_3 %d %d\n", rank, j1, i1);          /* then, loop over elements in row i1 of A->col_coupleBlock */
2143      /* then, loop over elements in row i1 of A->col_coupleBlock */          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2144      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2145  if (MY_DEBUG) fprintf(stderr, "rank %d CP3_4 %d %d %d\n", rank, j1, i1, j2_ub);            i2 = A->col_coupleBlock->pattern->index[j2];
2146      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
2147        i2 = A->col_coupleBlock->pattern->index[j2];            /* check whether entry RA[i_r, i2] has been previously visited.
2148                 RAP new entry is possible only if entry RA[i_r, i2] has not
2149        /* check whether entry RA[i_r, i2] has been previously visited.               been visited yet */
2150           RAP new entry is possible only if entry RA[i_r, i2] has not            if (A_marker[i2] != i_r) {
2151           been visited yet */              /* first, mark entry RA[i_r,i2] as visited */
2152        if (A_marker[i2] != i_r) {              A_marker[i2] = i_r;
2153          /* first, mark entry RA[i_r,i2] as visited */  
2154          A_marker[i2] = i_r;              /* then loop over elements in row i2 of P_ext_main */
2155                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2156          /* then loop over elements in row i2 of P_ext_main */              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2157          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];                  i_c = P->row_coupleBlock->pattern->index[j3];
2158          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
2159          i_c = P->row_coupleBlock->pattern->index[j3];                  /* check whether entry RAP[i_r,i_c] has been created.
2160                       If not yet created, create the entry and increase
2161          /* check whether entry RAP[i_r,i_c] has been created.                     the total number of elements in RAP_ext */
2162             If not yet created, create the entry and increase                  if (P_marker[i_c] < row_marker) {
2163             the total number of elements in RAP_ext */                    P_marker[i_c] = sum;
2164          if (P_marker[i_c] < row_marker) {                    sum++;
2165            P_marker[i_c] = sum;                  }
2166            sum++;              }
2167          }  
2168          }              /* loop over elements in row i2 of P_ext_couple, do the same */
2169                j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2170          /* loop over elements in row i2 of P_ext_couple, do the same */              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2171          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2172          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  if (P_marker[i_c] < row_marker) {
2173          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;                    P_marker[i_c] = sum;
2174          if (P_marker[i_c] < row_marker) {                    sum++;
2175            P_marker[i_c] = sum;                  }
2176            sum++;              }
2177          }            }
2178          }          }
2179        }  
2180      }          /* now loop over elements in row i1 of A->mainBlock, repeat
2181               the process we have done to block A->col_coupleBlock */
2182      /* now loop over elements in row i1 of A->mainBlock, repeat          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2183         the process we have done to block A->col_coupleBlock */          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2184      j2_ub = A->mainBlock->pattern->ptr[i1+1];            i2 = A->mainBlock->pattern->index[j2];
2185      for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2186        i2 = A->mainBlock->pattern->index[j2];              A_marker[i2 + num_Acouple_cols] = i_r;
2187        if (A_marker[i2 + num_Acouple_cols] != i_r) {              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2188          A_marker[i2 + num_Acouple_cols] = i_r;              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2189          j3_ub = P->mainBlock->pattern->ptr[i2+1];                  i_c = P->mainBlock->pattern->index[j3];
2190          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                  if (P_marker[i_c] < row_marker) {
2191          i_c = P->mainBlock->pattern->index[j3];                    P_marker[i_c] = sum;
2192          if (P_marker[i_c] < row_marker) {                    sum++;
2193            P_marker[i_c] = sum;                  }
2194            sum++;              }
2195          }              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2196          }              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2197          j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];                  /* note that we need to map the column index in
2198          for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {                     P->col_coupleBlock back into the column index in
2199          /* note that we need to map the column index in                     P_ext_couple */
2200             P->col_coupleBlock back into the column index in                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2201             P_ext_couple */                  if (P_marker[i_c] < row_marker) {
2202          i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;                    P_marker[i_c] = sum;
2203  if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d-%d out of %d+%d) 5\n", rank, i_c, P->col_coupleBlock->pattern->index[j3], num_Pext_cols, num_Pmain_cols);                    sum++;
2204          if (P_marker[i_c] < row_marker) {                  }
2205            P_marker[i_c] = sum;              }
2206            sum++;            }
2207          }          }
         }  
       }  
     }  
2208       }       }
2209     }     }
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);  
2210    
2211     /* Now we have the number of non-zero elements in RAP_ext, allocate     /* Now we have the number of non-zero elements in RAP_ext, allocate
2212        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */        PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2213     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);     RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2214     RAP_ext_idx = MEMALLOC(sum, index_t);     RAP_ext_idx = new index_t[sum];
2215     RAP_ext_val = MEMALLOC(sum * block_size, double);     RAP_ext_val = new double[sum * block_size];
2216     RA_val = TMPMEMALLOC(block_size, double);     RA_val = new double[block_size];
2217     RAP_val = TMPMEMALLOC(block_size, double);     RAP_val = new double[block_size];
2218    
2219     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
2220     sum = num_Pext_cols + num_Pmain_cols;     sum = num_Pext_cols + num_Pmain_cols;
# Line 2694  if (MY_DEBUG) fprintf(stderr, "rank %d C Line 2222  if (MY_DEBUG) fprintf(stderr, "rank %d C
2222     for (i=0; i<sum; i++) P_marker[i] = -1;     for (i=0; i<sum; i++) P_marker[i] = -1;
2223     #pragma omp parallel for private(i) schedule(static)     #pragma omp parallel for private(i) schedule(static)
2224     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_1 sum %d\n", rank, sum);  
2225     sum = 0;     sum = 0;
2226     RAP_ext_ptr[0] = 0;     RAP_ext_ptr[0] = 0;
2227     for (i_r=0; i_r<num_Pcouple_cols; i_r++){     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2228       row_marker = sum;       row_marker = sum;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_2 i_r %d sum %d\n", rank, i_r, sum);  
2229       /* then, loop over elements in row i_r of R_couple */       /* then, loop over elements in row i_r of R_couple */
2230       j1_ub = R_couple->pattern->ptr[i_r+1];       j1_ub = R_couple->pattern->ptr[i_r+1];
2231       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){       for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2232      i1 = R_couple->pattern->index[j1];          i1 = R_couple->pattern->index[j1];
2233      R_val = &(R_couple->val[j1*block_size]);          R_val = &(R_couple->val[j1*block_size]);
2234    
2235      /* then, loop over elements in row i1 of A->col_coupleBlock */          /* then, loop over elements in row i1 of A->col_coupleBlock */
2236      j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];          j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2237      for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {          for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2238        i2 = A->col_coupleBlock->pattern->index[j2];            i2 = A->col_coupleBlock->pattern->index[j2];
2239        //RA_val = R_val * A->col_coupleBlock->val[j2];            temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2240        temp_val = &(A->col_coupleBlock->val[j2*block_size]);            for (irb=0; irb<row_block_size; irb++) {
2241        for (irb=0; irb<row_block_size; irb++) {              for (icb=0; icb<col_block_size; icb++) {
2242          for (icb=0; icb<col_block_size; icb++) {                  rtmp = ZERO;
2243          rtmp = ZERO;                  for (ib=0; ib<col_block_size; ib++) {
2244          for (ib=0; ib<col_block_size; ib++) {                    rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2245            rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];                  }
2246          }                  RA_val[irb+row_block_size*icb]=rtmp;
2247          RA_val[irb+row_block_size*icb]=rtmp;              }
2248          }            }
       }  
   
       /* check whether entry RA[i_r, i2] has been previously visited.  
          RAP new entry is possible only if entry RA[i_r, i2] has not  
          been visited yet */  
       if (A_marker[i2] != i_r) {  
         /* first, mark entry RA[i_r,i2] as visited */  
         A_marker[i2] = i_r;  
   
         /* then loop over elements in row i2 of P_ext_main */  
         j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->row_coupleBlock->pattern->index[j3];  
 //      RAP_val = RA_val * P->row_coupleBlock->val[j3];  
         temp_val = &(P->row_coupleBlock->val[j3*block_size]);  
         for (irb=0; irb<row_block_size; irb++) {  
           for (icb=0; icb<col_block_size; icb++) {  
             rtmp = ZERO;  
             for (ib=0; ib<col_block_size; ib++) {  
             rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];  
             }  
             RAP_val[irb+row_block_size*icb]=rtmp;  
           }  
         }  
   
   
         /* check whether entry RAP[i_r,i_c] has been created.  
            If not yet created, create the entry and increase  
            the total number of elements in RAP_ext */  
         if (P_marker[i_c] < row_marker) {  
           P_marker[i_c] = sum;  
           //RAP_ext_val[sum*block_size] = RAP_val;  
           memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));  
           RAP_ext_idx[sum] = i_c + offset;  
 if (MY_DEBUG){  
 if (rank == 1) fprintf(stderr, "1-ext[%d (%d,%d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum*block_size]);  
 }  
           sum++;  
         } else {  
           //RAP_ext_val[P_marker[i_c] * block_size] += RAP_val;  
           temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);  
           for (ib=0; ib<block_size; ib++) {  
             temp_val[ib] += RAP_val[ib];  
           }  
             
 if (MY_DEBUG){  
 if (rank == 1) fprintf(stderr, "1-ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);  
 }  
         }  
         }  
2249    
2250          /* loop over elements in row i2 of P_ext_couple, do the same */            /* check whether entry RA[i_r, i2] has been previously visited.
2251          j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];               RAP new entry is possible only if entry RA[i_r, i2] has not
2252          for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {               been visited yet */
2253          i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;            if (A_marker[i2] != i_r) {
2254          //RAP_val = RA_val * P->remote_coupleBlock->val[j3];              /* first, mark entry RA[i_r,i2] as visited */
2255                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);              A_marker[i2] = i_r;
2256    
2257                /* then loop over elements in row i2 of P_ext_main */
2258                j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2259                for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2260                    i_c = P->row_coupleBlock->pattern->index[j3];
2261                    temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2262                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2263                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
2264                      rtmp = ZERO;                      rtmp = ZERO;
# Line 2787  if (rank == 1) fprintf(stderr, "1-ext[(% Line 2269  if (rank == 1) fprintf(stderr, "1-ext[(%
2269                    }                    }
2270                  }                  }
2271    
2272          if (P_marker[i_c] < row_marker) {  
2273            P_marker[i_c] = sum;                  /* check whether entry RAP[i_r,i_c] has been created.
2274            //RAP_ext_val[sum] = RAP_val;                     If not yet created, create the entry and increase
2275            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                     the total number of elements in RAP_ext */
2276            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                  if (P_marker[i_c] < row_marker) {
2277  if (MY_DEBUG){                    P_marker[i_c] = sum;
2278  if (rank == 1) fprintf(stderr, "ext[%d (%d,%d)]=%f (R%f, A(%d %d)%f, P%f)\n", sum, i_r, i_c, RAP_ext_val[sum], *R_val, i1, i2, A->col_coupleBlock->val[j2], P->remote_coupleBlock->val[j3]);                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2279  }                    RAP_ext_idx[sum] = i_c + offset;
2280            sum++;                    sum++;
2281          } else {                  } else {
           //RAP_ext_val[P_marker[i_c]] += RAP_val;  
2282                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2283                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2284                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2285                    }                    }
2286                    }
2287                }
2288    
2289  if (MY_DEBUG){              /* loop over elements in row i2 of P_ext_couple, do the same */
2290  if (rank == 1) fprintf(stderr, "ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2291  }              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2292          }                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2293          }                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2294                    for (irb=0; irb<row_block_size; irb++) {
2295                      for (icb=0; icb<col_block_size; icb++) {
2296                        rtmp = ZERO;
2297                        for (ib=0; ib<col_block_size; ib++) {
2298                            rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2299                        }
2300                        RAP_val[irb+row_block_size*icb]=rtmp;
2301                      }
2302                    }
2303    
2304                    if (P_marker[i_c] < row_marker) {
2305                      P_marker[i_c] = sum;
2306                      memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2307                      RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2308                      sum++;
2309                    } else {
2310                      temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2311                      for (ib=0; ib<block_size; ib++) {
2312                        temp_val[ib] += RAP_val[ib];
2313                      }
2314                    }
2315                }
2316    
2317        /* If entry RA[i_r, i2] is visited, no new RAP entry is created.            /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2318           Only the contributions are added. */               Only the contributions are added. */
2319        } else {            } else {
2320          j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];              j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2321          for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2322          i_c = P->row_coupleBlock->pattern->index[j3];                  i_c = P->row_coupleBlock->pattern->index[j3];
         //RAP_val = RA_val * P->row_coupleBlock->val[j3];  
2323                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);                  temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2324                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2325                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2827  if (rank == 1) fprintf(stderr, "ext[(%d, Line 2331  if (rank == 1) fprintf(stderr, "ext[(%d,
2331                    }                    }
2332                  }                  }
2333    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
2334                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2335                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2336                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2337                  }                  }
2338                }
2339  if (MY_DEBUG){              j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2340  if (rank == 1) fprintf(stderr, "Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);              for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2341  }                  i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
         }  
         j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;  
         //RAP_val = RA_val * P->remote_coupleBlock->val[j3];  
2342                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);                  temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2343                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2344                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2852  if (rank == 1) fprintf(stderr, "Visited Line 2350  if (rank == 1) fprintf(stderr, "Visited
2350                    }                    }
2351                  }                  }
2352    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
2353                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2354                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2355                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2356                  }                  }
2357                }
2358              }
2359            }
2360    
2361  if (MY_DEBUG){          /* now loop over elements in row i1 of A->mainBlock, repeat
2362  if (rank == 1) fprintf(stderr, "Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);             the process we have done to block A->col_coupleBlock */
2363  }          j2_ub = A->mainBlock->pattern->ptr[i1+1];
2364          }          for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2365        }            i2 = A->mainBlock->pattern->index[j2];
     }  
   
     /* now loop over elements in row i1 of A->mainBlock, repeat  
        the process we have done to block A->col_coupleBlock */  
     j2_ub = A->mainBlock->pattern->ptr[i1+1];  
     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {  
       i2 = A->mainBlock->pattern->index[j2];  
       //RA_val = R_val * A->mainBlock->val[j2];  
2366            temp_val = &(A->mainBlock->val[j2*block_size]);            temp_val = &(A->mainBlock->val[j2*block_size]);
2367            for (irb=0; irb<row_block_size; irb++) {            for (irb=0; irb<row_block_size; irb++) {
2368              for (icb=0; icb<col_block_size; icb++) {              for (icb=0; icb<col_block_size; icb++) {
# Line 2882  if (rank == 1) fprintf(stderr, "Visited Line 2374  if (rank == 1) fprintf(stderr, "Visited
2374              }              }
2375            }            }
2376    
2377        if (A_marker[i2 + num_Acouple_cols] != i_r) {            if (A_marker[i2 + num_Acouple_cols] != i_r) {
2378          A_marker[i2 + num_Acouple_cols] = i_r;              A_marker[i2 + num_Acouple_cols] = i_r;
2379          j3_ub = P->mainBlock->pattern->ptr[i2+1];              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2380          for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2381          i_c = P->mainBlock->pattern->index[j3];                  i_c = P->mainBlock->pattern->index[j3];
         //RAP_val = RA_val * P->mainBlock->val[j3];  
2382                  temp_val = &(P->mainBlock->val[j3*block_size]);                  temp_val = &(P->mainBlock->val[j3*block_size]);
2383                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2384                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2899  if (rank == 1) fprintf(stderr, "Visited Line 2390  if (rank == 1) fprintf(stderr, "Visited
2390                    }                    }
2391                  }                  }
2392    
2393          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2394            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2395            //RAP_ext_val[sum] = RAP_val;                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2396            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));                    RAP_ext_idx[sum] = i_c + offset;
2397            RAP_ext_idx[sum] = i_c + offset;                    sum++;
2398  if (MY_DEBUG){                  } else {
 if (rank == 1) fprintf(stderr, "Main 1-ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);  
 }  
           sum++;  
         } else {  
           //RAP_ext_val[P_marker[i_c]] += RAP_val;  
2399                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2400                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2401                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2402                    }                    }
2403                    }
2404  if (MY_DEBUG){              }
2405  if (rank == 1) fprintf(stderr, "Main 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2406  }              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2407          }                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
 if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 1\n", rank, i_c, num_Pext_cols + num_Pmain_cols);  
         //RAP_val = RA_val * P->col_coupleBlock->val[j3];  
2408                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2409                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2410                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2936  if (MY_DEBUG && (i_c < 0 || i_c >= num_P Line 2416  if (MY_DEBUG && (i_c < 0 || i_c >= num_P
2416                    }                    }
2417                  }                  }
2418    
2419          if (P_marker[i_c] < row_marker) {                  if (P_marker[i_c] < row_marker) {
2420            P_marker[i_c] = sum;                    P_marker[i_c] = sum;
2421            //RAP_ext_val[sum] = RAP_val;                    memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
           memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));  
2422                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];                    RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2423  if (MY_DEBUG){                    sum++;
2424  if (rank == 1) fprintf(stderr, "Main ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);                  } else {
 }  
           sum++;  
         } else {  
           //RAP_ext_val[P_marker[i_c]] += RAP_val;  
2425                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                    temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2426                    for (ib=0; ib<block_size; ib++) {                    for (ib=0; ib<block_size; ib++) {
2427                      temp_val[ib] += RAP_val[ib];                      temp_val[ib] += RAP_val[ib];
2428                    }                    }
2429                    }
2430  if (MY_DEBUG){              }
2431  if (rank == 1) fprintf(stderr, "Main ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);            } else {
2432  }              j3_ub = P->mainBlock->pattern->ptr[i2+1];
2433          }              for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2434          }                  i_c = P->mainBlock->pattern->index[j3];
       } else {  
         j3_ub = P->mainBlock->pattern->ptr[i2+1];  
         for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = P->mainBlock->pattern->index[j3];  
         //RAP_val = RA_val * P->mainBlock->val[j3];  
2435                  temp_val = &(P->mainBlock->val[j3*block_size]);                  temp_val = &(P->mainBlock->val[j3*block_size]);
2436                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2437                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2973  if (rank == 1) fprintf(stderr, "Main ext Line 2443  if (rank == 1) fprintf(stderr, "Main ext
2443                    }                    }
2444                  }                  }
2445    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
2446                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2447                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2448                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2449                  }                  }
2450                }
2451  if (MY_DEBUG){              j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2452  if (rank == 1) fprintf(stderr, "Main Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);              for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2453  }                  i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
         }  
         j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];  
         for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {  
         i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;  
         //RAP_val = RA_val * P->col_coupleBlock->val[j3];  
2454                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);                  temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2455                  for (irb=0; irb<row_block_size; irb++) {                  for (irb=0; irb<row_block_size; irb++) {
2456                    for (icb=0; icb<col_block_size; icb++) {                    for (icb=0; icb<col_block_size; icb++) {
# Line 2998  if (rank == 1) fprintf(stderr, "Main Vis Line 2462  if (rank == 1) fprintf(stderr, "Main Vis
2462                    }                    }
2463                  }                  }
2464    
         //RAP_ext_val[P_marker[i_c]] += RAP_val;  
2465                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);                  temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2466                  for (ib=0; ib<block_size; ib++) {                  for (ib=0; ib<block_size; ib++) {
2467                    temp_val[ib] += RAP_val[ib];                    temp_val[ib] += RAP_val[ib];
2468                  }                  }
2469                }
2470  if (MY_DEBUG){            }
2471  if (rank == 1) fprintf(stderr, "Main Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);          }
 }  
         }  
       }  
     }  
2472       }       }
2473       RAP_ext_ptr[i_r+1] = sum;       RAP_ext_ptr[i_r+1] = sum;
2474     }     }
2475     TMPMEMFREE(P_marker);     delete[] P_marker;
2476     TMPMEMFREE(Pcouple_to_Pext);     delete[] Pcouple_to_Pext;
 if (MY_DEBUG) fprintf(stderr, "rank %d CP5 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);  
   
 if (MY_DEBUG) {  
 char *str1, *str2;  
 str1 = TMPMEMALLOC(sum*100+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d RAP_ext_idx[%d] = (", rank, sum);  
 for (i=0; i<sum; i++) {  
   sprintf(str2, "%d ", RAP_ext_idx[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
2477    
2478     /* Now we have part of RAP[r,c] where row "r" is the list of rows     /* Now we have part of RAP[r,c] where row "r" is the list of rows
2479        which is given by the column list of P->col_coupleBlock, and        which is given by the column list of P->col_coupleBlock, and
2480        column "c" is the list of columns which possiblely covers the        column "c" is the list of columns which possibly covers the
2481        whole column range of systme martris P. This part of data is to        whole column range of system matrix P. This part of data is to
2482        be passed to neighboring processors, and added to corresponding        be passed to neighbouring processors, and added to corresponding
2483        RAP[r,c] entries in the neighboring processors */        RAP[r,c] entries in the neighbouring processors */
2484     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2485          &RAP_ext_val, global_id_P, block_size);                  &RAP_ext_val, global_id_P, block_size);
 if (MY_DEBUG) fprintf(stderr, "rank %d CP6\n", rank);  
2486    
2487     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2488     sum = RAP_ext_ptr[num_RAPext_rows];     sum = RAP_ext_ptr[num_RAPext_rows];
 if (MY_DEBUG) fprintf(stderr, "rank %d CP6_0 %d %d (%d %d)\n", rank, sum, num_Pext_cols, num_RAPext_rows, num_Pcouple_cols);  
2489     num_RAPext_cols = 0;     num_RAPext_cols = 0;
2490     if (num_Pext_cols || sum > 0) {     if (num_Pext_cols || sum > 0) {
2491       temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);       temp = new index_t[num_Pext_cols+sum];
2492       j1_ub = offset + num_Pmain_cols;       j1_ub = offset + num_Pmain_cols;
2493       for (i=0, iptr=0; i<sum; i++) {       for (i=0, iptr=0; i<sum; i++) {
2494      if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */ {          if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub)  /* XXX */
2495        temp[iptr++] = RAP_ext_idx[i];          /* XXX */            temp[iptr++] = RAP_ext_idx[i];                  /* XXX */
 if (MY_DEBUG && rank ==1) fprintf(stderr, "RAP_ext_idx[%d]=%d\n", i, RAP_ext_idx[i]);  
 }  
2496       }       }
2497       for (j=0; j<num_Pext_cols; j++, iptr++){       for (j=0; j<num_Pext_cols; j++, iptr++){
2498      temp[iptr] = global_id_P[j];          temp[iptr] = global_id_P[j];
2499       }       }
2500    
2501       if (iptr) {       if (iptr) {
2502      #ifdef USE_QSORTG            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2503        qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          num_RAPext_cols = 1;
2504      #else          i = temp[0];
2505        qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);          for (j=1; j<iptr; j++) {
2506      #endif            if (temp[j] > i) {
2507      num_RAPext_cols = 1;              i = temp[j];
2508      i = temp[0];              temp[num_RAPext_cols++] = i;
2509      for (j=1; j<iptr; j++) {            }
2510        if (temp[j] > i) {          }
         i = temp[j];  
         temp[num_RAPext_cols++] = i;  
       }  
     }  
   
 if (MY_DEBUG) {  
 char *str1, *str2;  
 index_t my_sum=iptr, i;  
 str1 = TMPMEMALLOC(my_sum*20+100, char);  
 str2 = TMPMEMALLOC(15, char);  
 sprintf(str1, "rank %d temp[%d] = (", rank, my_sum);  
 for (i=0; i<my_sum; i++){  
   sprintf(str2, "%d ", temp[i]);  
   strcat(str1, str2);  
 }  
 fprintf(stderr, "%s)\n", str1);  
 TMPMEMFREE(str1);  
 TMPMEMFREE(str2);  
 }  
2511       }       }
2512     }     }