trunk/paso/src/AMG_Interpolation.cpp
/*****************************************************************************
*
* Copyright (c) 2003-2014 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development 2012-2013 by School of Earth Sciences
* Development from 2014 by Centre for Geoscience Computing (GeoComp)
*
*****************************************************************************/


/****************************************************************************/

/* Paso: defines AMG Interpolation */

/****************************************************************************/

/* Author: l.gao@uq.edu.au */

/****************************************************************************/

#include "Paso.h"
#include "SparseMatrix.h"
#include "PasoUtil.h"
#include "Preconditioner.h"

/****************************************************************************

   Methods necessary for AMG preconditioner

   construct n_C x n_C interpolation operator A_C from matrix A
   and prolongation matrix P.

   The coarsening operator A_C is defined by RAP where R=P^T.

*****************************************************************************/
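
/* Illustration only (not part of Paso): with R = P^T, the Galerkin coarse
   operator in dense index notation is
       A_C[i][j] = sum_k sum_l P[k][i] * A[k][l] * P[l][j]
   i.e. for hypothetical dense arrays A (n x n) and P (n x n_C):

       for (i=0; i<n_C; i++)
           for (j=0; j<n_C; j++) {
               A_C[i][j] = 0.0;
               for (k=0; k<n; k++)
                   for (l=0; l<n; l++)
                       A_C[i][j] += P[k][i] * A[k][l] * P[l][j];
           }

   The routines below compute this same product for distributed block-sparse
   matrices, which is what makes the extra communication steps necessary. */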

/* Extend the system matrix B with two extra sparse matrices:
        B_ext_main and B_ext_couple
   The combination of these two sparse matrices represents
   the portion of B that is stored on neighbour procs and
   needed locally for the triple matrix product (B^T) A B.

   FOR NOW, we assume that the system matrix B has a NULL
   row_coupleBlock and a NULL remote_coupleBlock. Based on
   this assumption, we use the link row_coupleBlock for sparse
   matrix B_ext_main, and the link remote_coupleBlock for sparse
   matrix B_ext_couple.

   To be specific, B_ext (on processor k) is a group of rows
   in B, where the list of rows from processor q is given by
        A->col_coupler->connector->send->shared[rPtr...]
        rPtr=A->col_coupler->connector->send->OffsetInShared[k]
   on q. */
void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
{
    paso::Pattern *pattern_main=NULL, *pattern_couple=NULL;
    paso::Coupler *coupler=NULL;
    paso::SharedComponents_ptr send, recv;
    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
    index_t *idx_m=NULL, *idx_c=NULL;
    index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
    index_t offset, len, block_size, block_size_size, max_num_cols;
    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
    dim_t *recv_degree=NULL, *send_degree=NULL;
    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;

    if (size == 1) return;

    if (B->row_coupleBlock) {
        paso::SparseMatrix_free(B->row_coupleBlock);
        B->row_coupleBlock = NULL;
    }

    if (B->row_coupleBlock || B->remote_coupleBlock) {
        Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
        return;
    }

    /* send/receive the global IDs of the unknowns */
    num_main_cols = B->mainBlock->numCols;
    cols = new double[num_main_cols];
    offset = B->col_distribution->first_component[rank];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
    if (B->global_id == NULL) {
        coupler = paso::Coupler_alloc(B->col_coupler->connector, 1);
        paso::Coupler_startCollect(coupler, cols);
    }

    recv_buf = new index_t[size];
    recv_degree = new dim_t[size];
    recv_offset = new index_t[size+1];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<size; i++){
        recv_buf[i] = 0;
        recv_degree[i] = 1;
        recv_offset[i] = i;
    }

    block_size = B->block_size;
    block_size_size = block_size * sizeof(double);
    num_couple_cols = B->col_coupleBlock->numCols;
    send = A->col_coupler->connector->send;
    recv = A->col_coupler->connector->recv;
    num_neighbors = send->numNeighbors;
    p = send->offsetInShared[num_neighbors];
    len = p * B->col_distribution->first_component[size];
    send_buf = new double[len * block_size];
    send_idx = new index_t[len];
    send_offset = new index_t[(p+1)*2];
    send_degree = new dim_t[num_neighbors];
    i = num_main_cols + num_couple_cols;
    send_m = new double[i * block_size];
    send_c = new double[i * block_size];
    idx_m = new index_t[i];
    idx_c = new index_t[i];

    /* wait until the global IDs of the unknowns have been received */
    if (B->global_id == NULL) {
        paso::Coupler_finishCollect(coupler);
        global_id = new index_t[num_couple_cols];
        #pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_couple_cols; ++i)
            global_id[i] = coupler->recv_buffer[i];
        B->global_id = global_id;
        paso::Coupler_free(coupler);
    } else
        global_id = B->global_id;

    /* distribute the number of cols in current col_coupleBlock to all ranks */
    #ifdef ESYS_MPI
    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
    #endif

    /* distribute global_ids of cols to be considered to all ranks */
    len = 0;
    max_num_cols = 0;
    for (i=0; i<size; i++){
        recv_degree[i] = recv_buf[i];
        recv_offset[i] = len;
        len += recv_buf[i];
        if (max_num_cols < recv_buf[i])
            max_num_cols = recv_buf[i];
    }
    recv_offset[size] = len;
    cols_array = new index_t[len];
    #ifdef ESYS_MPI
    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
    #endif
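
    /* Note: after the two Allgatherv calls above, cols_array holds the
       concatenation of every rank's col_coupleBlock global IDs; the chunk
       contributed by rank r starts at recv_offset[r] and contains
       recv_degree[r] entries. */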

    /* first, prepare the ptr_ptr to be received */
    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_ptr = new index_t[(len+1) * 2];
    for (p=0; p<q; p++) {
        row = recv->offsetInShared[p];
        m = recv->offsetInShared[p + 1];
        #ifdef ESYS_MPI
        MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
                  A->mpi_info->comm,
                  &(A->col_coupler->mpi_requests[p]));
        #endif
    }

    /* now prepare the rows to be sent (the degree, the offset and the data) */
    len = 0;
    i0 = 0;
    for (p=0; p<num_neighbors; p++) {
        i = i0;
        neighbor = send->neighbor[p];
        m_lb = B->col_distribution->first_component[neighbor];
        m_ub = B->col_distribution->first_component[neighbor + 1];
        j_ub = send->offsetInShared[p + 1];
        for (j=send->offsetInShared[p]; j<j_ub; j++) {
            row = send->shared[j];
            l_m = 0;
            l_c = 0;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            k_lb = B->col_coupleBlock->pattern->ptr[row];

            /* check part of col_coupleBlock for data to be passed @row */
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m > offset) break;
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }
            k_lb = k;

            /* check mainBlock for data to be passed @row to sparse
               matrix B_ext_couple */
            k_ub = B->mainBlock->pattern->ptr[row + 1];
            k = B->mainBlock->pattern->ptr[row];
            memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
            for (; k<k_ub; k++) {
                m = B->mainBlock->pattern->index[k] + offset;
                idx_c[l_c] = m;
                l_c++;
            }

            /* check the rest of col_coupleBlock for data to
               be passed @row to sparse matrix B_ext_couple */
            k = k_lb;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }

            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
            send_offset[2*i] = l_m;
            len += l_m;
            memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
            memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
            send_offset[2*i+1] = l_c;
            len += l_c;
            i++;
        }

        /* sending */
        #ifdef ESYS_MPI
        MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
                   A->mpi_info->msg_tag_counter+rank,
                   A->mpi_info->comm,
                   &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
        #endif
        send_degree[p] = len;
        i0 = i;
    }
    delete[] send_m;
    delete[] send_c;
    delete[] idx_m;
    delete[] idx_c;


    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_main = new index_t[(len+1)];
    ptr_couple = new index_t[(len+1)];

    #ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    #endif
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

    j = 0;
    k = 0;
    ptr_main[0] = 0;
    ptr_couple[0] = 0;
    for (i=0; i<len; i++) {
        j += ptr_ptr[2*i];
        k += ptr_ptr[2*i+1];
        ptr_main[i+1] = j;
        ptr_couple[i+1] = k;
    }
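
    /* ptr_ptr interleaves, per received row, the number of entries destined
       for B_ext_main and for B_ext_couple; the loop above splits it into
       two cumulative CSR pointer arrays, ptr_main and ptr_couple. */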

    delete[] ptr_ptr;
    idx_main = new index_t[j];
    idx_couple = new index_t[k];
    ptr_idx = new index_t[j+k];
    ptr_val = new double[(j+k) * block_size];

    /* send/receive index array */
    j = 0;
    k_ub = 0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        if (i > 0) {
            k_ub++;
            #ifdef ESYS_MPI
            MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
                      A->mpi_info->msg_tag_counter+recv->neighbor[p],
                      A->mpi_info->comm,
                      &(A->col_coupler->mpi_requests[p]));
            #endif
        }
        j += i;
    }

    j = 0;
    k_ub = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
        if (i > 0){
            k_ub++;
            #ifdef ESYS_MPI
            MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
                       A->mpi_info->msg_tag_counter+rank,
                       A->mpi_info->comm,
                       &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
            #endif
        }
        j = send_degree[p];
    }

    #ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    #endif
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            idx_main[p] = ptr_idx[m+p];
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            idx_couple[p] = ptr_idx[k+p];
        }
    }
    delete[] ptr_idx;

    /* allocate pattern and sparse matrix for B_ext_main */
    pattern_main = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, num_main_cols, ptr_main, idx_main);
    B->row_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_main, B->row_block_size, B->col_block_size,
                FALSE);
    paso::Pattern_free(pattern_main);

    /* allocate pattern and sparse matrix for B_ext_couple */
    pattern_couple = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, B->col_distribution->first_component[size],
                ptr_couple, idx_couple);
    B->remote_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_couple, B->row_block_size, B->col_block_size,
                FALSE);
    paso::Pattern_free(pattern_couple);

    /* send/receive value array */
    j = 0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        #ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(ptr_val[j]), i * block_size,
                      MPI_DOUBLE, recv->neighbor[p],
                      A->mpi_info->msg_tag_counter+recv->neighbor[p],
                      A->mpi_info->comm,
                      &(A->col_coupler->mpi_requests[p]));
        #endif
        j += (i * block_size);
    }

    j = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
        #ifdef ESYS_MPI
        if (i > 0)
            MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
                       A->mpi_info->msg_tag_counter+rank,
                       A->mpi_info->comm,
                       &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
        #endif
        j = send_degree[p];
    }

    #ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    #endif
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
        }
    }
    delete[] ptr_val;


    /* release all temporary memory allocations */
    delete[] cols;
    delete[] cols_array;
    delete[] recv_offset;
    delete[] recv_degree;
    delete[] recv_buf;
    delete[] send_buf;
    delete[] send_offset;
    delete[] send_degree;
    delete[] send_idx;
}

/* The sparse matrix T defined by T(ptr, idx, val) has the same number of
   rows as P->col_coupleBlock->numCols. Now we need to copy blocks of data
   in T to the neighbour processors, defined by
        P->col_coupler->connector->recv->neighbor[k] where k is in
        [0, P->col_coupler->connector->recv->numNeighbors).
   The rows to be copied to neighbour processor k are given by the list
        P->col_coupler->connector->recv->offsetInShared[k] ...
        P->col_coupler->connector->recv->offsetInShared[k+1] */
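/* The exchange below proceeds in three passes, each an Irecv/Issend pair
   followed by a Waitall: first the row degrees, then the column indices,
   then the value blocks. Note that "send" here is assigned the coupler's
   recv connector and "recv" the send connector, because the data flows in
   the opposite direction to the usual coupler exchange. */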
void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
        index_t **p_ptr, index_t **p_idx, double **p_val,
        index_t *global_id, index_t block_size)
{
    paso::SharedComponents_ptr send, recv;
    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
    index_t i, j, p, m, n, size;
    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
    index_t *ptr=*p_ptr, *idx=*p_idx;
    double *val=*p_val, *recv_val=NULL;
    #ifdef ESYS_MPI
    index_t rank = P->mpi_info->rank;
    #endif

    size = P->mpi_info->size;
    send = P->col_coupler->connector->recv;
    recv = P->col_coupler->connector->send;
    send_neighbors = send->numNeighbors;
    recv_neighbors = recv->numNeighbors;
    send_rows = P->col_coupleBlock->numCols;
    recv_rows = recv->offsetInShared[recv_neighbors];

    send_degree = new index_t[send_rows];
    recv_ptr = new index_t[recv_rows + 1];
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<send_rows; i++)
        send_degree[i] = ptr[i+1] - ptr[i];

    /* First, send/receive the degree */
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        #ifdef ESYS_MPI
        MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
                  P->mpi_info->comm,
                  &(P->col_coupler->mpi_requests[p]));
        #endif
    }
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        #ifdef ESYS_MPI
        MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
                   P->mpi_info->msg_tag_counter + rank,
                   P->mpi_info->comm,
                   &(P->col_coupler->mpi_requests[p+recv_neighbors]));
        #endif
    }
    #ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
    #endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    delete[] send_degree;
    m = Paso_Util_cumsum(recv_rows, recv_ptr);
    recv_ptr[recv_rows] = m;
    recv_idx = new index_t[m];
    recv_val = new double[m * block_size];

    /* Next, send/receive the index array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
        if (i > 0) {
            #ifdef ESYS_MPI
            MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
                      P->mpi_info->msg_tag_counter + recv->neighbor[p],
                      P->mpi_info->comm,
                      &(P->col_coupler->mpi_requests[p]));
            #endif
        }
        j += i;
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
            #ifdef ESYS_MPI
            MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
                       P->mpi_info->msg_tag_counter + rank,
                       P->mpi_info->comm,
                       &(P->col_coupler->mpi_requests[p+recv_neighbors]));
            #endif
            j += i;
        }
    }
    #ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
    #endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    /* Last, send/receive the data array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
        #ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
                      P->mpi_info->msg_tag_counter + recv->neighbor[p],
                      P->mpi_info->comm,
                      &(P->col_coupler->mpi_requests[p]));
        #endif
        j += (i*block_size);
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
            #ifdef ESYS_MPI
            MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
                       P->mpi_info->msg_tag_counter + rank,
                       P->mpi_info->comm,
                       &(P->col_coupler->mpi_requests[p+recv_neighbors]));
            #endif
            j += i * block_size;
        }
    }
    #ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
    #endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    /* Clean up and return with the received ptr, index and data arrays */
    delete[] ptr;
    delete[] idx;
    delete[] val;
    *p_ptr = recv_ptr;
    *p_idx = recv_idx;
    *p_val = recv_val;
}

Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
        Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
{
    Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
    paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
    Paso_SystemMatrix *out=NULL;
    paso::SystemMatrixPattern *pattern=NULL;
    paso::Distribution_ptr input_dist, output_dist;
    paso::Connector *col_connector=NULL, *row_connector=NULL;
    paso::Pattern *main_pattern=NULL;
    paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern=NULL;
    const dim_t row_block_size=A->row_block_size;
    const dim_t col_block_size=A->col_block_size;
    const dim_t block_size = A->block_size;
    const dim_t P_block_size = P->block_size;
    const double ZERO = 0.0;
    double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
    double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
    index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
    index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
    index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
    index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
    index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
    index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
    index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
    index_t sum, i, j, k, iptr, irb, icb, ib;
    index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
    index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
    index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
    index_t row_marker_ext, *where_p=NULL;
    index_t **send_ptr=NULL, **send_idx=NULL;
    dim_t p, num_neighbors;
    dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
    Esys_MPI_rank *neighbor=NULL;
    #ifdef ESYS_MPI
    MPI_Request* mpi_requests=NULL;
    MPI_Status* mpi_stati=NULL;
    #else
    int *mpi_requests=NULL, *mpi_stati=NULL;
    #endif

/*  if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
        return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/

    /* two sparse matrices R_main and R_couple will be generated, as the
       transposes of P_main and P_col_couple, respectively. Note that
       R_couple is actually the row_coupleBlock of R (R=P^T) */
    R_main = paso::SparseMatrix_getTranspose(P->mainBlock);
    if (size > 1)
        R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock);

    /* generate P_ext, i.e. the portion of P that is stored on neighbour
       procs and needed locally for the triple matrix product RAP.
       To be specific, P_ext (on processor k) is a group of rows in P, where
       the list of rows from processor q is given by
            A->col_coupler->connector->send->shared[rPtr...]
            rPtr=A->col_coupler->connector->send->OffsetInShared[k]
       on q.
       P_ext is represented by the two sparse matrices P_ext_main and
       P_ext_couple */
    Paso_Preconditioner_AMG_extendB(A, P);

    /* count the number of cols in P_ext_couple, resize the pattern of
       sparse matrix P_ext_couple with the new compressed order, and then
       build the col id mapping from P->col_coupleBlock to P_ext_couple */
    num_Pmain_cols = P->mainBlock->numCols;
    if (size > 1) {
        num_Pcouple_cols = P->col_coupleBlock->numCols;
        num_Acouple_cols = A->col_coupleBlock->numCols;
        sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
    } else {
        num_Pcouple_cols = 0;
        num_Acouple_cols = 0;
        sum = 0;
    }
    num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
    offset = P->col_distribution->first_component[rank];
    num_Pext_cols = 0;
    if (P->global_id) {
        /* first count the number of cols "num_Pext_cols" in both
           P_ext_couple and P->col_coupleBlock */
        iptr = 0;
        if (num_Pcouple_cols || sum > 0) {
            temp = new index_t[num_Pcouple_cols+sum];
            #pragma omp parallel for lastprivate(iptr) schedule(static)
            for (iptr=0; iptr<sum; iptr++){
                temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
            }
            for (j=0; j<num_Pcouple_cols; j++, iptr++){
                temp[iptr] = P->global_id[j];
            }
        }
        if (iptr) {
            #ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
            #else
            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
            #endif
            num_Pext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_Pext_cols++] = i;
                }
            }
        }
        /* resize the pattern of P_ext_couple */
        if (num_Pext_cols) {
            global_id_P = new index_t[num_Pext_cols];
            #pragma omp parallel for private(i) schedule(static)
            for (i=0; i<num_Pext_cols; i++)
                global_id_P[i] = temp[i];
        }
        if (num_Pcouple_cols || sum > 0)
            delete[] temp;
        #pragma omp parallel for private(i, where_p) schedule(static)
        for (i=0; i<sum; i++) {
            where_p = (index_t *)bsearch(
                            &(P->remote_coupleBlock->pattern->index[i]),
                            global_id_P, num_Pext_cols,
                            sizeof(index_t), paso::comparIndex);
            P->remote_coupleBlock->pattern->index[i] =
                            (index_t)(where_p - global_id_P);
        }

        /* build the mapping */
        if (num_Pcouple_cols) {
            Pcouple_to_Pext = new index_t[num_Pcouple_cols];
            iptr = 0;
            for (i=0; i<num_Pext_cols; i++)
                if (global_id_P[i] == P->global_id[iptr]) {
                    Pcouple_to_Pext[iptr++] = i;
                    if (iptr == num_Pcouple_cols) break;
                }
        }
    }

    /* allocate and initialise the markers */
    sum = num_Pext_cols + num_Pmain_cols;
    P_marker = new index_t[sum];
    A_marker = new index_t[num_A_cols];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;

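    /* The product is formed in two passes, in the style of Gustavson's
       sparse matrix multiply: a symbolic pass that only counts the
       non-zeros of each row of RAP, followed by a numeric pass that fills
       the values. P_marker records, per output row, which RAP columns have
       already been created; A_marker records which rows of P have already
       been visited via RA, so each contribution is generated exactly once. */
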
    /* Now, count the size of RAP_ext. Start with rows in R_couple */
    sum = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++){
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_couple->pattern->index[j1];
            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];

                /* check whether entry RA[i_r, i2] has been previously
                   visited. A new RAP entry is possible only if entry
                   RA[i_r, i2] has not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index
                           in P_ext_couple */
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }
        }
    }

    /* Now we have the number of non-zero elements in RAP_ext, allocate
       RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
    RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
    RAP_ext_idx = new index_t[sum];
    RAP_ext_val = new double[sum * block_size];
    RA_val = new double[block_size];
    RAP_val = new double[block_size];
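
    /* RA_val and RAP_val are per-entry scratch buffers, each holding one
       dense row_block_size x col_block_size block with entry (irb,icb)
       stored at index irb + row_block_size*icb (column-major), which is
       how the block-multiply loops below address them. */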

    /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
    sum = num_Pext_cols + num_Pmain_cols;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
    sum = 0;
    RAP_ext_ptr[0] = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++){
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_couple->pattern->index[j1];
            R_val = &(R_couple->val[j1*P_block_size]);

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];
                temp_val = &(A->col_coupleBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                /* check whether entry RA[i_r, i2] has been previously
                   visited. A new RAP entry is possible only if entry
                   RA[i_r, i2] has not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                /* If entry RA[i_r, i2] has been visited, no new RAP entry
                   is created. Only the contributions are added. */
                } else {
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                temp_val = &(A->mainBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                } else {
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }
        }
        RAP_ext_ptr[i_r+1] = sum;
    }
    delete[] P_marker;
    delete[] Pcouple_to_Pext;

    /* Now we have the part of RAP[r,c] where row "r" ranges over the column
       list of P->col_coupleBlock and column "c" ranges over a list of
       columns that possibly covers the whole column range of the system
       matrix P. This data is now passed to the neighbouring processors and
       added to the corresponding RAP[r,c] entries there. */
    Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
            &RAP_ext_val, global_id_P, block_size);
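
    /* After the call above, RAP_ext_ptr/idx/val no longer hold the locally
       computed coupling rows: they have been swapped for the rows that the
       neighbouring processors computed for coarse points owned by this
       rank, with column indices still expressed as global IDs. */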

    num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
    sum = RAP_ext_ptr[num_RAPext_rows];
    num_RAPext_cols = 0;
    if (num_Pext_cols || sum > 0) {
        temp = new index_t[num_Pext_cols+sum];
        j1_ub = offset + num_Pmain_cols;
        for (i=0, iptr=0; i<sum; i++) {
            if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
                temp[iptr++] = RAP_ext_idx[i];                      /* XXX */
        }
        for (j=0; j<num_Pext_cols; j++, iptr++){
            temp[iptr] = global_id_P[j];
        }

        if (iptr) {
            #ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
            #else
            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
            #endif
            num_RAPext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_RAPext_cols++] = i;
                }
            }
        }
    }

    /* resize the pattern of P_ext_couple */
    if (num_RAPext_cols) {
        global_id_RAP = new index_t[num_RAPext_cols];
        #pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_RAPext_cols; i++)
            global_id_RAP[i] = temp[i];
    }
    if (num_Pext_cols || sum > 0)
        delete[] temp;
    j1_ub = offset + num_Pmain_cols;
    #pragma omp parallel for private(i, where_p) schedule(static)
    for (i=0; i<sum; i++) {
        if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
            where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
    /*XXX*/                 num_RAPext_cols, sizeof(index_t), paso::comparIndex);
            RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
        } else
            RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
    }

    /* build the mapping */
    if (num_Pcouple_cols) {
        Pcouple_to_RAP = new index_t[num_Pcouple_cols];
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == P->global_id[iptr]) {
                Pcouple_to_RAP[iptr++] = i;
                if (iptr == num_Pcouple_cols) break;
            }
    }

    if (num_Pext_cols) {
        Pext_to_RAP = new index_t[num_Pext_cols];
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == global_id_P[iptr]) {
                Pext_to_RAP[iptr++] = i;
                if (iptr == num_Pext_cols) break;
            }
    }

    if (global_id_P){
        delete[] global_id_P;
        global_id_P = NULL;
    }

    /* allocate and initialise the markers */
    sum = num_RAPext_cols + num_Pmain_cols;
    P_marker = new index_t[sum];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;

    /* Now, count the size of RAP. Start with rows in R_main */
    num_neighbors = P->col_coupler->connector->send->numNeighbors;
    offsetInShared = P->col_coupler->connector->send->offsetInShared;
    shared = P->col_coupler->connector->send->shared;
    i = 0;
    j = 0;
    for (i_r=0; i_r<num_Pmain_cols; i_r++){
        /* Mark the start of row for both main block and couple block */
        row_marker = i;
        row_marker_ext = j;

        /* Mark the diagonal element RAP[i_r, i_r], and other elements
           in RAP_ext */
        P_marker[i_r] = i;
        i++;
        for (j1=0; j1<num_neighbors; j1++) {
            for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
                if (shared[j2] == i_r) {
                    for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
                        i_c = RAP_ext_idx[k];
                        if (i_c < num_Pmain_cols) {
                            if (P_marker[i_c] < row_marker) {
                                P_marker[i_c] = i;
                                i++;
                            }
                        } else {
                            if (P_marker[i_c] < row_marker_ext) {
                                P_marker[i_c] = j;
                                j++;
                            }
                        }
                    }
                    break;
                }
            }
        }

        /* then, loop over elements in row i_r of R_main */
        j1_ub = R_main->pattern->ptr[i_r+1];
        for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_main->pattern->index[j1];

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];

                /* check whether entry RA[i_r, i2] has been previously
                   visited. A new RAP entry is possible only if entry
                   RA[i_r, i2] has not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            i++;
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            j++;
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            i++;
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index
                           in P_ext_couple */
                        i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            j++;
                        }
                    }
                }
            }
        }
    }

    /* Now we have the number of non-zero elements in RAP_main and
       RAP_couple. Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for
       RAP_main, and RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
       RAP_couple */
    RAP_main_ptr = new index_t[num_Pmain_cols+1];
    RAP_main_idx = new index_t[i];
    RAP_main_val = new double[i * block_size];
    RAP_couple_ptr = new index_t[num_Pmain_cols+1];
    RAP_couple_idx = new index_t[j];
    RAP_couple_val = new double[j * block_size];

    RAP_main_ptr[num_Pmain_cols] = i;
    RAP_couple_ptr[num_Pmain_cols] = j;
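
    /* At this point i and j are the exact non-zero counts of RAP_main and
       RAP_couple determined by the symbolic pass, so the arrays above are
       sized exactly; the numeric pass below re-runs the same traversal to
       fill in the indices and value blocks. */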
1299    
1300     #pragma omp parallel for private(i) schedule(static)
1301     for (i=0; i<sum; i++) P_marker[i] = -1;
1302     #pragma omp parallel for private(i) schedule(static)
1303     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1304    
1305     /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1306     in R_main */
1307     i = 0;
1308     j = 0;
1309     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1310     /* Mark the start of row for both main block and couple block */
1311     row_marker = i;
1312     row_marker_ext = j;
1313     RAP_main_ptr[i_r] = row_marker;
1314     RAP_couple_ptr[i_r] = row_marker_ext;
1315    
1316     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1317     in row i_r of RAP_ext */
1318     P_marker[i_r] = i;
1319     for (ib=0; ib<block_size; ib++)
1320     RAP_main_val[i*block_size+ib] = ZERO;
1321     RAP_main_idx[i] = i_r;
1322     i++;
1323    
1324     for (j1=0; j1<num_neighbors; j1++) {
1325     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1326     if (shared[j2] == i_r) {
1327     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1328     i_c = RAP_ext_idx[k];
1329     if (i_c < num_Pmain_cols) {
1330     if (P_marker[i_c] < row_marker) {
1331     P_marker[i_c] = i;
1332     memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1333     RAP_main_idx[i] = i_c;
1334     i++;
1335     } else {
1336     t1_val = &(RAP_ext_val[k*block_size]);
1337     t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1338     for (ib=0; ib<block_size; ib++)
1339     t2_val[ib] += t1_val[ib];
1340     }
1341     } else {
1342     if (P_marker[i_c] < row_marker_ext) {
1343     P_marker[i_c] = j;
1344     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1345     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1346     j++;
1347     } else {
1348     t1_val = &(RAP_ext_val[k*block_size]);
1349     t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1350     for (ib=0; ib<block_size; ib++)
1351     t2_val[ib] += t1_val[ib];
1352     }
1353     }
1354     }
1355     break;
1356     }
1357     }
1358     }
1359    
1360     /* then, loop over elements in row i_r of R_main */
1361     j1_ub = R_main->pattern->ptr[i_r+1];
1362     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1363     i1 = R_main->pattern->index[j1];
1364     R_val = &(R_main->val[j1*P_block_size]);
1365    
1366     /* then, loop over elements in row i1 of A->col_coupleBlock */
1367     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1368     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1369     i2 = A->col_coupleBlock->pattern->index[j2];
1370     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1371     for (irb=0; irb<row_block_size; irb++)
1372     for (icb=0; icb<col_block_size; icb++)
1373     RA_val[irb+row_block_size*icb]=ZERO;
1374     for (irb=0; irb<P_block_size; irb++) {
1375     rtmp=R_val[irb];
1376     for (icb=0; icb<col_block_size; icb++) {
1377     RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1378     }
1379     }
1380    
1381    
1382     /* check whether entry RA[i_r, i2] has been previously visited.
1383     RAP new entry is possible only if entry RA[i_r, i2] has not
1384     been visited yet */
1385     if (A_marker[i2] != i_r) {
1386     /* first, mark entry RA[i_r,i2] as visited */
1387     A_marker[i2] = i_r;
1388    
1389     /* then loop over elements in row i2 of P_ext_main */
1390     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1391     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1392     i_c = P->row_coupleBlock->pattern->index[j3];
1393     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1394     for (irb=0; irb<row_block_size; irb++)
1395     for (icb=0; icb<col_block_size; icb++)
1396     RAP_val[irb+row_block_size*icb]=ZERO;
1397     for (icb=0; icb<P_block_size; icb++) {
1398     rtmp=temp_val[icb];
1399     for (irb=0; irb<row_block_size; irb++) {
1400     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1401     }
1402     }
1403    
1404    
1405     /* check whether entry RAP[i_r,i_c] has been created.
1406     If not yet created, create the entry and increase
1407     the total number of elements in RAP_ext */
1408     if (P_marker[i_c] < row_marker) {
1409     P_marker[i_c] = i;
1410     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1411     RAP_main_idx[i] = i_c;
1412     i++;
1413     } else {
1414     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1415     for (ib=0; ib<block_size; ib++) {
1416     temp_val[ib] += RAP_val[ib];
1417     }
1418     }
1419     }
1420    
1421     /* loop over elements in row i2 of P_ext_couple, do the same */
1422     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1423     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1424     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1425     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1426     for (irb=0; irb<row_block_size; irb++)
1427     for (icb=0; icb<col_block_size; icb++)
1428     RAP_val[irb+row_block_size*icb]=ZERO;
1429     for (icb=0; icb<P_block_size; icb++) {
1430     rtmp=temp_val[icb];
1431     for (irb=0; irb<row_block_size; irb++) {
1432     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1433     }
1434     }
1435    
1436     if (P_marker[i_c] < row_marker_ext) {
1437     P_marker[i_c] = j;
1438     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1439     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1440     j++;
1441     } else {
1442     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1443     for (ib=0; ib<block_size; ib++) {
1444     temp_val[ib] += RAP_val[ib];
1445     }
1446     }
1447     }
1448    
1449     /* If entry RA[i_r, i2] has already been visited, no new RAP entry
1450     is created; only the contributions are accumulated. */
1451     } else {
1452     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1453     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1454     i_c = P->row_coupleBlock->pattern->index[j3];
1455     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1456     for (irb=0; irb<row_block_size; irb++)
1457     for (icb=0; icb<col_block_size; icb++)
1458     RAP_val[irb+row_block_size*icb]=ZERO;
1459     for (icb=0; icb<P_block_size; icb++) {
1460     rtmp=temp_val[icb];
1461     for (irb=0; irb<row_block_size; irb++) {
1462     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1463     }
1464     }
1465    
1466     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1467     for (ib=0; ib<block_size; ib++) {
1468     temp_val[ib] += RAP_val[ib];
1469     }
1470     }
1471     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1472     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1473     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1474     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1475     for (irb=0; irb<row_block_size; irb++)
1476     for (icb=0; icb<col_block_size; icb++)
1477     RAP_val[irb+row_block_size*icb]=ZERO;
1478     for (icb=0; icb<P_block_size; icb++) {
1479     rtmp=temp_val[icb];
1480     for (irb=0; irb<row_block_size; irb++) {
1481     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1482     }
1483     }
1484    
1485     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1486     for (ib=0; ib<block_size; ib++) {
1487     temp_val[ib] += RAP_val[ib];
1488     }
1489     }
1490     }
1491     }
1492    
1493     /* now loop over elements in row i1 of A->mainBlock and repeat
1494     the process applied to block A->col_coupleBlock above */
1495     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1496     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1497     i2 = A->mainBlock->pattern->index[j2];
1498     temp_val = &(A->mainBlock->val[j2*block_size]);
1499     for (irb=0; irb<row_block_size; irb++)
1500     for (icb=0; icb<col_block_size; icb++)
1501     RA_val[irb+row_block_size*icb]=ZERO;
1502     for (irb=0; irb<P_block_size; irb++) {
1503     rtmp=R_val[irb];
1504     for (icb=0; icb<col_block_size; icb++) {
1505     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1506     }
1507     }
1508    
1509     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1510     A_marker[i2 + num_Acouple_cols] = i_r;
1511     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1512     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1513     i_c = P->mainBlock->pattern->index[j3];
1514     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1515     for (irb=0; irb<row_block_size; irb++)
1516     for (icb=0; icb<col_block_size; icb++)
1517     RAP_val[irb+row_block_size*icb]=ZERO;
1518     for (icb=0; icb<P_block_size; icb++) {
1519     rtmp=temp_val[icb];
1520     for (irb=0; irb<row_block_size; irb++) {
1521     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1522     }
1523     }
1524     if (P_marker[i_c] < row_marker) {
1525     P_marker[i_c] = i;
1526     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1527     RAP_main_idx[i] = i_c;
1528     i++;
1529     } else {
1530     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1531     for (ib=0; ib<block_size; ib++) {
1532     temp_val[ib] += RAP_val[ib];
1533     }
1534     }
1535     }
1536     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1537     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1538     /* note that we need to map the column index in
1539     P->col_coupleBlock back into the column index in
1540     P_ext_couple */
1541     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1542     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1543     for (irb=0; irb<row_block_size; irb++)
1544     for (icb=0; icb<col_block_size; icb++)
1545     RAP_val[irb+row_block_size*icb]=ZERO;
1546     for (icb=0; icb<P_block_size; icb++) {
1547     rtmp=temp_val[icb];
1548     for (irb=0; irb<row_block_size; irb++) {
1549     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1550     }
1551     }
1552    
1553     if (P_marker[i_c] < row_marker_ext) {
1554     P_marker[i_c] = j;
1555     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1556     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1557     j++;
1558     } else {
1559     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1560     for (ib=0; ib<block_size; ib++) {
1561     temp_val[ib] += RAP_val[ib];
1562     }
1563     }
1564     }
1565    
1566     } else {
1567     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1568     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1569     i_c = P->mainBlock->pattern->index[j3];
1570     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1571     for (irb=0; irb<row_block_size; irb++)
1572     for (icb=0; icb<col_block_size; icb++)
1573     RAP_val[irb+row_block_size*icb]=ZERO;
1574     for (icb=0; icb<P_block_size; icb++) {
1575     rtmp=temp_val[icb];
1576     for (irb=0; irb<row_block_size; irb++) {
1577     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1578     }
1579     }
1580    
1581     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1582     for (ib=0; ib<block_size; ib++) {
1583     temp_val[ib] += RAP_val[ib];
1584     }
1585     }
1586     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1587     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1588     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1589     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1590     for (irb=0; irb<row_block_size; irb++)
1591     for (icb=0; icb<col_block_size; icb++)
1592     RAP_val[irb+row_block_size*icb]=ZERO;
1593     for (icb=0; icb<P_block_size; icb++) {
1594     rtmp = temp_val[icb];
1595     for (irb=0; irb<row_block_size; irb++) {
1596     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1597     }
1598     }
1599    
1600     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1601     for (ib=0; ib<block_size; ib++) {
1602     temp_val[ib] += RAP_val[ib];
1603     }
1604     }
1605     }
1606     }
1607     }
1608    
1609     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
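    /* (the CSR pattern requires ascending column indices within each row;
       P_marker is updated alongside the sort so that the value blocks can
       be permuted to match the sorted indices) */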
1610     if (i > row_marker) {
1611     offset = i - row_marker;
1612 jfenwick 4275 temp = new index_t[offset];
1613 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1614 lgao 3779 for (iptr=0; iptr<offset; iptr++)
1615     temp[iptr] = RAP_main_idx[row_marker+iptr];
1616     if (offset > 0) {
1617     #ifdef USE_QSORTG
1618 caltinay 4803 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1619 lgao 3779 #else
1620 caltinay 4803 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1621 lgao 3779 #endif
1622     }
1623 jfenwick 4275 temp_val = new double[offset * block_size];
1624 lgao 3821 #pragma omp parallel for schedule(static) private(iptr,k)
1625 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1626     k = P_marker[temp[iptr]];
1627     memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1628     P_marker[temp[iptr]] = iptr + row_marker;
1629     }
1630 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1631 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1632     RAP_main_idx[row_marker+iptr] = temp[iptr];
1633     memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1634     }
1635 jfenwick 4275 delete[] temp;
1636     delete[] temp_val;
1637 lgao 3779 }
1638     if (j > row_marker_ext) {
1639     offset = j - row_marker_ext;
1640 jfenwick 4275 temp = new index_t[offset];
1641 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1642 lgao 3779 for (iptr=0; iptr<offset; iptr++)
1643     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1644     if (offset > 0) {
1645     #ifdef USE_QSORTG
1646 caltinay 4803 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1647 lgao 3779 #else
1648 caltinay 4803 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1649 lgao 3779 #endif
1650     }
1651 jfenwick 4275 temp_val = new double[offset * block_size];
1652 lgao 3821 #pragma omp parallel for schedule(static) private(iptr, k)
1653 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1654     k = P_marker[temp[iptr] + num_Pmain_cols];
1655     memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1656     P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1657     }
1658 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1659 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1660     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1661     memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1662     }
1663 jfenwick 4275 delete[] temp;
1664     delete[] temp_val;
1665 lgao 3779 }
1666     }
1667    
1668 jfenwick 4275 delete[] RA_val;
1669     delete[] RAP_val;
1670     delete[] A_marker;
1671     delete[] Pext_to_RAP;
1672     delete[] Pcouple_to_RAP;
1673     delete[] RAP_ext_ptr;
1674     delete[] RAP_ext_idx;
1675     delete[] RAP_ext_val;
1676 caltinay 4797 paso::SparseMatrix_free(R_main);
1677     paso::SparseMatrix_free(R_couple);
1678 lgao 3779
1679     /* Check whether there are empty columns in RAP_couple */
1680     #pragma omp parallel for schedule(static) private(i)
1681     for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1682     /* the number of non-empty columns is stored in "k" */
1683     k = 0;
1684     j = RAP_couple_ptr[num_Pmain_cols];
1685     for (i=0; i<j; i++) {
1686     i1 = RAP_couple_idx[i];
1687     if (P_marker[i1]) {
1688     P_marker[i1] = 0;
1689     k++;
1690     }
1691     }
1692    
1693     /* empty columns were found: compress them out */
1694     if (k < num_RAPext_cols) {
1695 jfenwick 4275 temp = new index_t[k];
1696 lgao 3779 k = 0;
1697     for (i=0; i<num_RAPext_cols; i++)
1698     if (!P_marker[i]) {
1699     P_marker[i] = k;
1700     temp[k] = global_id_RAP[i];
1701     k++;
1702     }
1703 lgao 3821 #pragma omp parallel for schedule(static) private(i, i1)
1704 lgao 3779 for (i=0; i<j; i++) {
1705     i1 = RAP_couple_idx[i];
1706     RAP_couple_idx[i] = P_marker[i1];
1707     }
1708     num_RAPext_cols = k;
1709 jfenwick 4275 delete[] global_id_RAP;
1710 lgao 3779 global_id_RAP = temp;
1711     }
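    /* After this compression global_id_RAP lists only the global columns
       that are actually referenced, and RAP_couple_idx has been renumbered
       to match the compacted column set. */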
1712 jfenwick 4275 delete[] P_marker;
1713 lgao 3779
1714     /******************************************************/
1715 jfenwick 4345 /* Start to create the coarse level System Matrix A_c */
1716 lgao 3779 /******************************************************/
1717     /* first, prepare the sender/receiver for the col_connector */
1718     dist = P->pattern->input_distribution->first_component;
1719 jfenwick 4275 recv_len = new dim_t[size];
1720     send_len = new dim_t[size];
1721     neighbor = new Esys_MPI_rank[size];
1722     offsetInShared = new index_t[size+1];
1723     shared = new index_t[num_RAPext_cols];
1724 lgao 3779 memset(recv_len, 0, sizeof(dim_t) * size);
1725     memset(send_len, 0, sizeof(dim_t) * size);
1726     num_neighbors = 0;
1727     offsetInShared[0] = 0;
1728     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1729     shared[i] = i + num_Pmain_cols;
1730     if (k <= global_id_RAP[i]) {
1731     if (recv_len[j] > 0) {
1732     neighbor[num_neighbors] = j;
1733     num_neighbors ++;
1734     offsetInShared[num_neighbors] = i;
1735     }
1736     while (k <= global_id_RAP[i]) {
1737     j++;
1738     k = dist[j+1];
1739     }
1740     }
1741     recv_len[j] ++;
1742     }
1743     if (recv_len[j] > 0) {
1744     neighbor[num_neighbors] = j;
1745     num_neighbors ++;
1746     offsetInShared[num_neighbors] = i;
1747     }
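    /* Ownership of a global column follows the input distribution "dist":
       id belongs to rank q iff dist[q] <= id < dist[q+1]. The sweep above
       relies on global_id_RAP being sorted, so the external columns can be
       grouped by owning rank in a single pass. */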
1748    
1749 caltinay 4816 paso::SharedComponents_ptr recv(new paso::SharedComponents(
1750     num_Pmain_cols, num_neighbors, neighbor, shared,
1751     offsetInShared, 1, 0, mpi_info));
1752    
1753 lgao 3828 #ifdef ESYS_MPI
1754 lgao 3779 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1755 lgao 3828 #endif
1756 lgao 3779
1757     #ifdef ESYS_MPI
1758 jfenwick 4275 mpi_requests=new MPI_Request[size*2];
1759     mpi_stati=new MPI_Status[size*2];
1760 lgao 3779 #else
1761 jfenwick 4275 mpi_requests=new int[size*2];
1762     mpi_stati=new int[size*2];
1763 lgao 3779 #endif
1764     num_neighbors = 0;
1765     j = 0;
1766     offsetInShared[0] = 0;
1767     for (i=0; i<size; i++) {
1768     if (send_len[i] > 0) {
1769     neighbor[num_neighbors] = i;
1770     num_neighbors ++;
1771     j += send_len[i];
1772     offsetInShared[num_neighbors] = j;
1773     }
1774     }
1775 jfenwick 4275 delete[] shared;
1776     shared = new index_t[j];
1777 lgao 3779 for (i=0, j=0; i<num_neighbors; i++) {
1778     k = neighbor[i];
1779 lgao 3828 #ifdef ESYS_MPI
1780 lgao 3779 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1781     mpi_info->msg_tag_counter+k,
1782     mpi_info->comm, &mpi_requests[i]);
1783 lgao 3828 #endif
1784 lgao 3779 j += send_len[k];
1785     }
1786     for (i=0, j=0; i<recv->numNeighbors; i++) {
1787     k = recv->neighbor[i];
1788 lgao 3828 #ifdef ESYS_MPI
1789 lgao 3779 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1790     mpi_info->msg_tag_counter+rank,
1791     mpi_info->comm, &mpi_requests[i+num_neighbors]);
1792 lgao 3828 #endif
1793 lgao 3779 j += recv_len[k];
1794     }
1795 lgao 3828 #ifdef ESYS_MPI
1796 lgao 3779 MPI_Waitall(num_neighbors + recv->numNeighbors,
1797     mpi_requests, mpi_stati);
1798 lgao 3828 #endif
1799 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
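    /* Each rank has now told the owners of its external columns which of
       their rows it references (by global id); the received lists form the
       send side of col_connector once they are localised below. */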
1800 lgao 3779
1801     j = offsetInShared[num_neighbors];
1802     offset = dist[rank];
1803 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1804 lgao 3779 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1805    
1806 caltinay 4816 paso::SharedComponents_ptr send(new paso::SharedComponents(
1807     num_Pmain_cols, num_neighbors, neighbor, shared,
1808     offsetInShared, 1, 0, mpi_info));
1809    
1810 caltinay 4801 col_connector = paso::Connector_alloc(send, recv);
1811 jfenwick 4275 delete[] shared;
1812 lgao 3779
1813     /* now, create the row distribution (output_dist) and column
1814     distribution (input_dist) */
1815 caltinay 4814 input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1816     output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1817 lgao 3779
1818     /* then, prepare the sender/receiver for the row_connector; first,
1819     prepare the information for the sender */
1820     sum = RAP_couple_ptr[num_Pmain_cols];
1821 jfenwick 4275 len = new dim_t[size];
1822     send_ptr = new index_t*[size];
1823     send_idx = new index_t*[size];
1824 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1825 lgao 3779 for (i=0; i<size; i++) {
1826 jfenwick 4275 send_ptr[i] = new index_t[num_Pmain_cols];
1827     send_idx[i] = new index_t[sum];
1828 lgao 3779 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1829     }
1830     memset(len, 0, sizeof(dim_t) * size);
1831     recv = col_connector->recv;
1832     sum=0;
1833     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1834     i1 = RAP_couple_ptr[i_r];
1835     i2 = RAP_couple_ptr[i_r+1];
1836     if (i2 > i1) {
1837     /* then row i_r will be in the sender of row_connector; now check
1838 jfenwick 4345 how many neighbours row i_r needs to be sent to */
1839 lgao 3779 for (j=i1; j<i2; j++) {
1840     i_c = RAP_couple_idx[j];
1841     /* find out the corresponding neighbor "p" of column i_c */
1842     for (p=0; p<recv->numNeighbors; p++) {
1843     if (i_c < recv->offsetInShared[p+1]) {
1844     k = recv->neighbor[p];
1845     if (send_ptr[k][i_r] == 0) sum++;
1846     send_ptr[k][i_r] ++;
1847     send_idx[k][len[k]] = global_id_RAP[i_c];
1848     len[k] ++;
1849     break;
1850     }
1851     }
1852     }
1853     }
1854     }
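    /* send_ptr[p][i_r] now counts the couple-block entries of row i_r that
       are owned by rank p, send_idx[p] lists their global column ids, and
       "sum" counts the distinct (row, neighbour) pairs, i.e. the length of
       the sender's shared list allocated below. */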
1855     if (global_id_RAP) {
1856 jfenwick 4275 delete[] global_id_RAP;
1857 lgao 3779 global_id_RAP = NULL;
1858     }
1859    
1860     /* now allocate the sender */
1861 jfenwick 4275 shared = new index_t[sum];
1862 lgao 3779 memset(send_len, 0, sizeof(dim_t) * size);
1863     num_neighbors=0;
1864     offsetInShared[0] = 0;
1865     for (p=0, k=0; p<size; p++) {
1866     for (i=0; i<num_Pmain_cols; i++) {
1867     if (send_ptr[p][i] > 0) {
1868     shared[k] = i;
1869     k++;
1870     send_ptr[p][send_len[p]] = send_ptr[p][i];
1871     send_len[p]++;
1872     }
1873     }
1874     if (k > offsetInShared[num_neighbors]) {
1875     neighbor[num_neighbors] = p;
1876     num_neighbors ++;
1877     offsetInShared[num_neighbors] = k;
1878     }
1879     }
1880 caltinay 4816 send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1881     neighbor, shared, offsetInShared, 1, 0, mpi_info));
1882 lgao 3779
1883     /* send/recv the number of rows to be sent from the current proc;
1884     this recovers the receiver info of row_connector from the sender */
1885 lgao 3828 #ifdef ESYS_MPI
1886 lgao 3779 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1887 lgao 3828 #endif
1888 lgao 3779 num_neighbors = 0;
1889     offsetInShared[0] = 0;
1890     j = 0;
1891     for (i=0; i<size; i++) {
1892     if (i != rank && recv_len[i] > 0) {
1893     neighbor[num_neighbors] = i;
1894     num_neighbors ++;
1895     j += recv_len[i];
1896     offsetInShared[num_neighbors] = j;
1897     }
1898     }
1899 jfenwick 4275 delete[] shared;
1900     delete[] recv_len;
1901     shared = new index_t[j];
1902 lgao 3779 k = offsetInShared[num_neighbors];
1903 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1904 lgao 3779 for (i=0; i<k; i++) {
1905     shared[i] = i + num_Pmain_cols;
1906     }
1907 caltinay 4816 recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1908     neighbor, shared, offsetInShared, 1, 0, mpi_info));
1909 caltinay 4801 row_connector = paso::Connector_alloc(send, recv);
1910 jfenwick 4275 delete[] shared;
1911 lgao 3779
1912     /* send/recv pattern->ptr for rowCoupleBlock */
1913     num_RAPext_rows = offsetInShared[num_neighbors];
1914 jfenwick 4275 row_couple_ptr = new index_t[num_RAPext_rows+1];
1915 lgao 3779 for (p=0; p<num_neighbors; p++) {
1916     j = offsetInShared[p];
1917     i = offsetInShared[p+1];
1918 lgao 3828 #ifdef ESYS_MPI
1919 lgao 3779 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1920     mpi_info->msg_tag_counter+neighbor[p],
1921     mpi_info->comm, &mpi_requests[p]);
1922 lgao 3828 #endif
1923 lgao 3779 }
1924     send = row_connector->send;
1925     for (p=0; p<send->numNeighbors; p++) {
1926 lgao 3828 #ifdef ESYS_MPI
1927 lgao 3779 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1928     MPI_INT, send->neighbor[p],
1929     mpi_info->msg_tag_counter+rank,
1930     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1931 lgao 3828 #endif
1932 lgao 3779 }
1933 lgao 3828 #ifdef ESYS_MPI
1934 lgao 3779 MPI_Waitall(num_neighbors + send->numNeighbors,
1935     mpi_requests, mpi_stati);
1936 lgao 3828 #endif
1937 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1938 jfenwick 4275 delete[] send_len;
1939 lgao 3779
1940     sum = 0;
1941     for (i=0; i<num_RAPext_rows; i++) {
1942     k = row_couple_ptr[i];
1943     row_couple_ptr[i] = sum;
1944     sum += k;
1945     }
1946     row_couple_ptr[num_RAPext_rows] = sum;
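    /* The per-row entry counts received above are turned in place into CSR
       row offsets by an exclusive prefix sum; e.g. counts {2, 0, 3} become
       offsets {0, 2, 2, 5}. */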
1947    
1948     /* send/recv pattern->index for rowCoupleBlock */
1949     k = row_couple_ptr[num_RAPext_rows];
1950 jfenwick 4275 row_couple_idx = new index_t[k];
1951 lgao 3779 for (p=0; p<num_neighbors; p++) {
1952     j1 = row_couple_ptr[offsetInShared[p]];
1953     j2 = row_couple_ptr[offsetInShared[p+1]];
1954 lgao 3828 #ifdef ESYS_MPI
1955 lgao 3779 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1956     mpi_info->msg_tag_counter+neighbor[p],
1957     mpi_info->comm, &mpi_requests[p]);
1958 lgao 3828 #endif
1959 lgao 3779 }
1960     for (p=0; p<send->numNeighbors; p++) {
1961 lgao 3828 #ifdef ESYS_MPI
1962 lgao 3779 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1963     MPI_INT, send->neighbor[p],
1964     mpi_info->msg_tag_counter+rank,
1965     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1966 lgao 3828 #endif
1967 lgao 3779 }
1968 lgao 3828 #ifdef ESYS_MPI
1969 lgao 3779 MPI_Waitall(num_neighbors + send->numNeighbors,
1970     mpi_requests, mpi_stati);
1971 lgao 3828 #endif
1972 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1973 lgao 3779
1974     offset = input_dist->first_component[rank];
1975     k = row_couple_ptr[num_RAPext_rows];
1976 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1977 lgao 3779 for (i=0; i<k; i++) {
1978     row_couple_idx[i] -= offset;
1979     }
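    /* row_couple_idx arrived holding global column ids; subtracting the
       first column owned by this rank turns them into local indices into
       the main-block column range. */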
1980 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1981 lgao 3779 for (i=0; i<size; i++) {
1982 jfenwick 4275 delete[] send_ptr[i];
1983     delete[] send_idx[i];
1984 lgao 3779 }
1985 jfenwick 4275 delete[] send_ptr;
1986     delete[] send_idx;
1987     delete[] len;
1988     delete[] offsetInShared;
1989     delete[] neighbor;
1990     delete[] mpi_requests;
1991     delete[] mpi_stati;
1992 lgao 3779 Esys_MPIInfo_free(mpi_info);
1993    
1994     /* Now, we can create the patterns for mainBlock and the couple blocks */
1995 caltinay 4803 main_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1996 lgao 3779 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1997 caltinay 4803 col_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1998 lgao 3779 num_Pmain_cols, num_RAPext_cols,
1999     RAP_couple_ptr, RAP_couple_idx);
2000 caltinay 4803 row_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2001 lgao 3779 num_RAPext_rows, num_Pmain_cols,
2002     row_couple_ptr, row_couple_idx);
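    /* The three patterns map one-to-one onto the blocks of the coarse
       matrix: mainBlock (local rows x local cols), col_coupleBlock (local
       rows x external cols) and rowCoupleBlock (external rows x local
       cols). */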
2003    
2004     /* next, create the system matrix */
2005 caltinay 4800 pattern = new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
2006 lgao 3779 output_dist, input_dist, main_pattern, col_couple_pattern,
2007     row_couple_pattern, col_connector, row_connector);
2008     out = Paso_SystemMatrix_alloc(A->type, pattern,
2009     row_block_size, col_block_size, FALSE);
2010    
2011     /* finally, fill in the data */
2012     memcpy(out->mainBlock->val, RAP_main_val,
2013     out->mainBlock->len* sizeof(double));
2014     memcpy(out->col_coupleBlock->val, RAP_couple_val,
2015     out->col_coupleBlock->len * sizeof(double));
2016    
2017     /* Clean up */
2018 caltinay 4800 paso::SystemMatrixPattern_free(pattern);
2019 caltinay 4803 paso::Pattern_free(main_pattern);
2020     paso::Pattern_free(col_couple_pattern);
2021     paso::Pattern_free(row_couple_pattern);
2022 caltinay 4801 paso::Connector_free(col_connector);
2023     paso::Connector_free(row_connector);
2024 jfenwick 4275 delete[] RAP_main_val;
2025     delete[] RAP_couple_val;
2026 lgao 3779 if (Esys_noError()) {
2027     return out;
2028     } else {
2029     Paso_SystemMatrix_free(out);
2030     return NULL;
2031     }
2032     }
2033    
2034    
2035     Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2036     Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2037     {
2038     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2039 caltinay 4797 paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
2040 lgao 3779 Paso_SystemMatrix *out=NULL;
2041 caltinay 4800 paso::SystemMatrixPattern *pattern=NULL;
2042 caltinay 4814 paso::Distribution_ptr input_dist, output_dist;
2043 caltinay 4816 paso::SharedComponents_ptr send, recv;
2044 caltinay 4801 paso::Connector *col_connector=NULL, *row_connector=NULL;
2045 caltinay 4803 paso::Pattern *main_pattern=NULL;
2046     paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2047 lgao 3779 const dim_t row_block_size=A->row_block_size;
2048     const dim_t col_block_size=A->col_block_size;
2049     const dim_t block_size = A->block_size;
2050     const double ZERO = 0.0;
2051     double *RAP_main_val=NULL, *RAP_couple_val=