/[escript]/trunk/paso/src/AMG_Interpolation.cpp
ViewVC logotype

Annotation of /trunk/paso/src/AMG_Interpolation.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4836 - (hide annotations)
Mon Apr 7 05:51:55 2014 UTC (5 years, 8 months ago) by caltinay
File size: 132966 byte(s)
"Some" SystemMatrix clean up.....

1 jfenwick 3981 /*****************************************************************************
2 lgao 3763 *
3 jfenwick 4657 * Copyright (c) 2003-2014 by University of Queensland
4 jfenwick 3981 * http://www.uq.edu.au
5 lgao 3763 *
6     * Primary Business: Queensland, Australia
7     * Licensed under the Open Software License version 3.0
8     * http://www.opensource.org/licenses/osl-3.0.php
9     *
10 jfenwick 3981 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 jfenwick 4657 * Development 2012-2013 by School of Earth Sciences
12     * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 jfenwick 3981 *
14     *****************************************************************************/
15 lgao 3763
16    
17 caltinay 4816 /****************************************************************************/
18 lgao 3763
19     /* Paso: defines AMG Interpolation */
20    
21 caltinay 4816 /****************************************************************************/
22 lgao 3763
23     /* Author: l.gao@uq.edu.au */
24    
25 caltinay 4816 /****************************************************************************/
26 lgao 3763
27     #include "Paso.h"
28     #include "SparseMatrix.h"
29     #include "PasoUtil.h"
30     #include "Preconditioner.h"
31    
32 caltinay 4816 /****************************************************************************
33 lgao 3763
34 jfenwick 4345 Methods necessary for AMG preconditioner
35 lgao 3763
36     construct n_C x n_C interpolation operator A_C from matrix A
37     and prolongation matrix P.
38    
39     The coarsening operator A_C is defined by RAP where R=P^T.
40    
41 caltinay 4816 *****************************************************************************/
42 lgao 3763
43 jfenwick 4345 /* Extend system matrix B with extra two sparse matrices:
44 caltinay 4829 B_ext_main and B_ext_couple
The combination of these two sparse matrices represents
46     the portion of B that is stored on neighbour procs and
47 lgao 3763 needed locally for triple matrix product (B^T) A B.
48    
49     FOR NOW, we assume that the system matrix B has a NULL
50     row_coupleBlock and a NULL remote_coupleBlock. Based on
51     this assumption, we use link row_coupleBlock for sparse
52     matrix B_ext_main, and link remote_coupleBlock for sparse
53     matrix B_ext_couple.
54    
55     To be specific, B_ext (on processor k) are group of rows
56     in B, where the list of rows from processor q is given by
57     A->col_coupler->connector->send->shared[rPtr...]
58     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
59     on q. */
60 caltinay 4836 void Paso_Preconditioner_AMG_extendB(paso::SystemMatrix_ptr A,
61     paso::SystemMatrix_ptr B)
62 lgao 3763 {
63 caltinay 4819 paso::Pattern_ptr pattern_main, pattern_couple;
64 caltinay 4829 paso::Coupler_ptr coupler;
65     paso::SharedComponents_ptr send, recv;
66     double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
67     index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
68     index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
69     index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
70     index_t *idx_m=NULL, *idx_c=NULL;
71     index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
72     index_t offset, len, block_size, block_size_size, max_num_cols;
73     index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
74     dim_t *recv_degree=NULL, *send_degree=NULL;
75     dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
76 lgao 3763
77 caltinay 4829 if (size == 1) return;
78 lgao 3763
79 caltinay 4829 B->row_coupleBlock.reset();
80 lgao 3763
81 caltinay 4829 if (B->remote_coupleBlock.get()) {
82     Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
83     return;
84     }
85 lgao 3763
86 caltinay 4829 /* sending/receiving unknown's global ID */
87     num_main_cols = B->mainBlock->numCols;
88     cols = new double[num_main_cols];
89     offset = B->col_distribution->first_component[rank];
90     #pragma omp parallel for private(i) schedule(static)
91     for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
92     if (B->global_id == NULL) {
93     coupler.reset(new paso::Coupler(B->col_coupler->connector, 1));
94     coupler->startCollect(cols);
95     }
96 lgao 3763
97 caltinay 4829 recv_buf = new index_t[size];
98     recv_degree = new dim_t[size];
99     recv_offset = new index_t[size+1];
100 lgao 3763 #pragma omp parallel for private(i) schedule(static)
101 caltinay 4829 for (i=0; i<size; i++){
102     recv_buf[i] = 0;
103     recv_degree[i] = 1;
104     recv_offset[i] = i;
105     }
106 lgao 3763
107 lgao 3779 block_size = B->block_size;
108 lgao 3763 block_size_size = block_size * sizeof(double);
109     num_couple_cols = B->col_coupleBlock->numCols;
110     send = A->col_coupler->connector->send;
111     recv = A->col_coupler->connector->recv;
112     num_neighbors = send->numNeighbors;
113     p = send->offsetInShared[num_neighbors];
114 lgao 3767 len = p * B->col_distribution->first_component[size];
115 jfenwick 4275 send_buf = new double[len * block_size];
116     send_idx = new index_t[len];
117     send_offset = new index_t[(p+1)*2];
118     send_degree = new dim_t[num_neighbors];
119 lgao 3763 i = num_main_cols + num_couple_cols;
120 jfenwick 4275 send_m = new double[i * block_size];
121     send_c = new double[i * block_size];
122     idx_m = new index_t[i];
123     idx_c = new index_t[i];
124 lgao 3763
125     /* waiting for receiving unknown's global ID */
126     if (B->global_id == NULL) {
127 caltinay 4817 coupler->finishCollect();
128 jfenwick 4275 global_id = new index_t[num_couple_cols];
129 lgao 3763 #pragma omp parallel for private(i) schedule(static)
130     for (i=0; i<num_couple_cols; ++i)
131     global_id[i] = coupler->recv_buffer[i];
132     B->global_id = global_id;
133     } else
134     global_id = B->global_id;
135    
136     /* distribute the number of cols in current col_coupleBlock to all ranks */
137 lgao 3828 #ifdef ESYS_MPI
138 lgao 3763 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
139 lgao 3828 #endif
140 lgao 3763
141     /* distribute global_ids of cols to be considered to all ranks*/
142     len = 0;
143     max_num_cols = 0;
144     for (i=0; i<size; i++){
145     recv_degree[i] = recv_buf[i];
146     recv_offset[i] = len;
147     len += recv_buf[i];
148     if (max_num_cols < recv_buf[i])
149 caltinay 4829 max_num_cols = recv_buf[i];
150 lgao 3763 }
151     recv_offset[size] = len;
152 jfenwick 4275 cols_array = new index_t[len];
153 lgao 3828 #ifdef ESYS_MPI
154 lgao 3763 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
155 lgao 3828 #endif
156 lgao 3763
157     /* first, prepare the ptr_ptr to be received */
158     q = recv->numNeighbors;
159     len = recv->offsetInShared[q];
160 jfenwick 4275 ptr_ptr = new index_t[(len+1) * 2];
161 lgao 3763 for (p=0; p<q; p++) {
162     row = recv->offsetInShared[p];
163     m = recv->offsetInShared[p + 1];
164 lgao 3828 #ifdef ESYS_MPI
165 lgao 3767 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
166 lgao 3763 A->mpi_info->msg_tag_counter+recv->neighbor[p],
167     A->mpi_info->comm,
168     &(A->col_coupler->mpi_requests[p]));
169 lgao 3828 #endif
170 lgao 3763 }
171    
172     /* now prepare the rows to be sent (the degree, the offset and the data) */
173     len = 0;
174 lgao 3817 i0 = 0;
175 lgao 3763 for (p=0; p<num_neighbors; p++) {
176 lgao 3817 i = i0;
177 lgao 3763 neighbor = send->neighbor[p];
178     m_lb = B->col_distribution->first_component[neighbor];
179     m_ub = B->col_distribution->first_component[neighbor + 1];
180     j_ub = send->offsetInShared[p + 1];
181     for (j=send->offsetInShared[p]; j<j_ub; j++) {
182 caltinay 4829 row = send->shared[j];
183     l_m = 0;
184     l_c = 0;
185     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
186     k_lb = B->col_coupleBlock->pattern->ptr[row];
187 lgao 3763
188 caltinay 4829 /* check part of col_coupleBlock for data to be passed @row */
189     for (k=k_lb; k<k_ub; k++) {
190     m = global_id[B->col_coupleBlock->pattern->index[k]];
191     if (m > offset) break;
192     if (m>= m_lb && m < m_ub) {
193     /* data to be passed to sparse matrix B_ext_main */
194     idx_m[l_m] = m - m_lb;
195     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
196     l_m++;
197     } else {
198     /* data to be passed to sparse matrix B_ext_couple */
199     idx_c[l_c] = m;
200     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
201     l_c++;
202     }
203     }
204     k_lb = k;
205 lgao 3763
206 caltinay 4829 /* check mainBlock for data to be passed @row to sparse
207     matrix B_ext_couple */
208     k_ub = B->mainBlock->pattern->ptr[row + 1];
209     k = B->mainBlock->pattern->ptr[row];
210     memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
211     for (; k<k_ub; k++) {
212     m = B->mainBlock->pattern->index[k] + offset;
213     idx_c[l_c] = m;
214     l_c++;
215     }
216 lgao 3763
217 caltinay 4829 /* check the rest part of col_coupleBlock for data to
218     be passed @row to sparse matrix B_ext_couple */
219     k = k_lb;
220     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
221     for (k=k_lb; k<k_ub; k++) {
222 lgao 3763 m = global_id[B->col_coupleBlock->pattern->index[k]];
223 caltinay 4829 if (m>= m_lb && m < m_ub) {
224     /* data to be passed to sparse matrix B_ext_main */
225     idx_m[l_m] = m - m_lb;
226     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
227     l_m++;
228     } else {
229     /* data to be passed to sparse matrix B_ext_couple */
230     idx_c[l_c] = m;
231     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
232     l_c++;
233     }
234     }
235 lgao 3763
236 caltinay 4829 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
237     memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
238 lgao 3763 send_offset[2*i] = l_m;
239 caltinay 4829 len += l_m;
240     memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
241     memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
242     send_offset[2*i+1] = l_c;
243     len += l_c;
244     i++;
245 lgao 3763 }
246    
247     /* sending */
248 lgao 3828 #ifdef ESYS_MPI
249 lgao 3817 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
250 lgao 3763 A->mpi_info->msg_tag_counter+rank,
251     A->mpi_info->comm,
252     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
253 lgao 3828 #endif
254 lgao 3763 send_degree[p] = len;
255 lgao 3817 i0 = i;
256 lgao 3763 }
257 jfenwick 4275 delete[] send_m;
258     delete[] send_c;
259     delete[] idx_m;
260     delete[] idx_c;
261 lgao 3763
262    
263     q = recv->numNeighbors;
264     len = recv->offsetInShared[q];
265 jfenwick 4275 ptr_main = new index_t[(len+1)];
266     ptr_couple = new index_t[(len+1)];
267 lgao 3763
268 lgao 3828 #ifdef ESYS_MPI
269 lgao 3763 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
270     A->col_coupler->mpi_requests,
271     A->col_coupler->mpi_stati);
272 lgao 3828 #endif
273 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
274 lgao 3763
275     j = 0;
276     k = 0;
277     ptr_main[0] = 0;
278     ptr_couple[0] = 0;
279     for (i=0; i<len; i++) {
280     j += ptr_ptr[2*i];
281     k += ptr_ptr[2*i+1];
282     ptr_main[i+1] = j;
283     ptr_couple[i+1] = k;
284     }
285    
286 jfenwick 4275 delete[] ptr_ptr;
287     idx_main = new index_t[j];
288     idx_couple = new index_t[k];
289     ptr_idx = new index_t[j+k];
290     ptr_val = new double[(j+k) * block_size];
291 lgao 3763
292     /* send/receive index array */
293     j=0;
294     k_ub = 0;
295     for (p=0; p<recv->numNeighbors; p++) {
296     k = recv->offsetInShared[p];
297     m = recv->offsetInShared[p+1];
298     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
299     if (i > 0) {
300 caltinay 4829 k_ub ++;
301     #ifdef ESYS_MPI
302 lgao 3763 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
303     A->mpi_info->msg_tag_counter+recv->neighbor[p],
304     A->mpi_info->comm,
305     &(A->col_coupler->mpi_requests[p]));
306 caltinay 4829 #endif
307 lgao 3763 }
308     j += i;
309     }
310    
311     j=0;
312     k_ub = 0;
313     for (p=0; p<num_neighbors; p++) {
314     i = send_degree[p] - j;
315     if (i > 0){
316 caltinay 4829 k_ub ++;
317     #ifdef ESYS_MPI
318 lgao 3763 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
319     A->mpi_info->msg_tag_counter+rank,
320     A->mpi_info->comm,
321     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
322 caltinay 4829 #endif
323 lgao 3763 }
324     j = send_degree[p];
325     }
326    
327 lgao 3828 #ifdef ESYS_MPI
328 lgao 3763 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
329     A->col_coupler->mpi_requests,
330     A->col_coupler->mpi_stati);
331 lgao 3828 #endif
332 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
333 lgao 3763
334 lgao 3821 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
335 lgao 3763 for (i=0; i<len; i++) {
336     j = ptr_main[i];
337     k = ptr_main[i+1];
338     m = ptr_couple[i];
339     for (p=j; p<k; p++) {
340 caltinay 4829 idx_main[p] = ptr_idx[m+p];
341 lgao 3763 }
342     j = ptr_couple[i+1];
343     for (p=m; p<j; p++) {
344 caltinay 4829 idx_couple[p] = ptr_idx[k+p];
345 lgao 3763 }
346     }
347 jfenwick 4275 delete[] ptr_idx;
348 lgao 3763
349     /* allocate pattern and sparsematrix for B_ext_main */
350 caltinay 4819 pattern_main.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
351     len, num_main_cols, ptr_main, idx_main));
352 caltinay 4829 B->row_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
353     pattern_main, B->row_block_size, B->col_block_size,
354     false));
355 lgao 3763
356     /* allocate pattern and sparsematrix for B_ext_couple */
357 caltinay 4819 pattern_couple.reset(new paso::Pattern(B->col_coupleBlock->pattern->type,
358 lgao 3763 len, B->col_distribution->first_component[size],
359 caltinay 4819 ptr_couple, idx_couple));
360 caltinay 4829 B->remote_coupleBlock.reset(new paso::SparseMatrix(B->col_coupleBlock->type,
361 lgao 3763 pattern_couple, B->row_block_size, B->col_block_size,
362 caltinay 4829 false));
363 lgao 3763
364     /* send/receive value array */
365     j=0;
366     for (p=0; p<recv->numNeighbors; p++) {
367     k = recv->offsetInShared[p];
368     m = recv->offsetInShared[p+1];
369     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
370 lgao 3828 #ifdef ESYS_MPI
371 lgao 3763 if (i > 0)
372     MPI_Irecv(&(ptr_val[j]), i * block_size,
373     MPI_DOUBLE, recv->neighbor[p],
374     A->mpi_info->msg_tag_counter+recv->neighbor[p],
375     A->mpi_info->comm,
376     &(A->col_coupler->mpi_requests[p]));
377 lgao 3828 #endif
378 lgao 3779 j += (i * block_size);
379 lgao 3763 }
380    
381     j=0;
382     for (p=0; p<num_neighbors; p++) {
383     i = send_degree[p] - j;
384 lgao 3828 #ifdef ESYS_MPI
385 lgao 3763 if (i > 0)
386 lgao 3779 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
387 lgao 3763 A->mpi_info->msg_tag_counter+rank,
388     A->mpi_info->comm,
389     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
390 lgao 3828 #endif
391 lgao 3779 j = send_degree[p] ;
392 lgao 3763 }
393    
394 lgao 3828 #ifdef ESYS_MPI
395 lgao 3763 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
396     A->col_coupler->mpi_requests,
397     A->col_coupler->mpi_stati);
398 lgao 3828 #endif
399 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
400 lgao 3763
401     #pragma omp parallel for private(i,j,k,m,p) schedule(static)
402     for (i=0; i<len; i++) {
403     j = ptr_main[i];
404     k = ptr_main[i+1];
405     m = ptr_couple[i];
406     for (p=j; p<k; p++) {
407 caltinay 4829 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
408 lgao 3763 }
409     j = ptr_couple[i+1];
410     for (p=m; p<j; p++) {
411 caltinay 4829 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
412 lgao 3763 }
413     }
414 jfenwick 4275 delete[] ptr_val;
415 lgao 3763
416 lgao 3817
417 caltinay 4829 /* release all temp memory allocation */
418     delete[] cols;
419     delete[] cols_array;
420     delete[] recv_offset;
421     delete[] recv_degree;
422     delete[] recv_buf;
423     delete[] send_buf;
424     delete[] send_offset;
425     delete[] send_degree;
426     delete[] send_idx;
427 lgao 3763 }
428    
/* As defined, sparse matrix (let's call it T) defined by T(ptr, idx, val)
430     has the same number of rows as P->col_coupleBlock->numCols. Now, we need
431 jfenwick 4345 to copy block of data in T to neighbour processors, defined by
432 caltinay 4829 P->col_coupler->connector->recv->neighbor[k] where k is in
433     [0, P->col_coupler->connector->recv->numNeighbors).
434 lgao 3763 Rows to be copied to neighbor processor k is in the list defined by
435 caltinay 4829 P->col_coupler->connector->recv->offsetInShared[k] ...
436     P->col_coupler->connector->recv->offsetInShared[k+1] */
/* Exchange rows of the CSR matrix T(*p_ptr, *p_idx, *p_val) with neighbour
   ranks as described in the comment above.  On return the three arrays are
   replaced (in place, via the pointer-to-pointer arguments) by the received
   rows; the original arrays are freed here.
   Note the deliberate role swap: rows are SENT along P's recv connector and
   RECEIVED along P's send connector, i.e. data flows opposite to the usual
   coupler direction.  `global_id` is part of the documented interface but is
   not referenced in this body.  The exchange proceeds in three phases --
   degrees, column indices, values -- each ended by MPI_Waitall. */
void Paso_Preconditioner_AMG_CopyRemoteData(paso::SystemMatrix_ptr P,
        index_t **p_ptr, index_t **p_idx, double **p_val,
        index_t *global_id, index_t block_size)
{
    paso::SharedComponents_ptr send, recv;
    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
    index_t i, j, p, m, n, size;
    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
    index_t *ptr=*p_ptr, *idx=*p_idx;
    double *val=*p_val, *recv_val=NULL;
#ifdef ESYS_MPI
    index_t rank = P->mpi_info->rank;
#endif

    size = P->mpi_info->size;
    /* reversed on purpose: we push rows back along the recv side */
    send = P->col_coupler->connector->recv;
    recv = P->col_coupler->connector->send;
    send_neighbors = send->numNeighbors;
    recv_neighbors = recv->numNeighbors;
    send_rows = P->col_coupleBlock->numCols;
    recv_rows = recv->offsetInShared[recv_neighbors];

    send_degree = new index_t[send_rows];
    recv_ptr = new index_t[recv_rows + 1];
    /* per-row nonzero counts of the outgoing matrix */
    #pragma omp for schedule(static) private(i)
    for (i=0; i<send_rows; i++)
        send_degree[i] = ptr[i+1] - ptr[i];

    /* First, send/receive the degree */
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
                P->mpi_info->msg_tag_counter + recv->neighbor[p],
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p]));
#endif
    }
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
                P->mpi_info->msg_tag_counter + rank,
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    delete[] send_degree;
    /* convert received degrees into a CSR row-pointer array; m = total nnz */
    m = Paso_Util_cumsum(recv_rows, recv_ptr);
    recv_ptr[recv_rows] = m;
    recv_idx = new index_t[m];
    recv_val = new double[m * block_size];

    /* Next, send/receive the index array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
                P->mpi_info->msg_tag_counter + recv->neighbor[p],
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p]));
#endif
        }
        j += i;
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i >0) {
#ifdef ESYS_MPI
            MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
                P->mpi_info->msg_tag_counter + rank,
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    /* Last, send/receive the data array (block_size doubles per nonzero) */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
                P->mpi_info->msg_tag_counter + recv->neighbor[p],
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p]));
#endif
        j += (i*block_size);
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i >0) {
#ifdef ESYS_MPI
            MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
                P->mpi_info->msg_tag_counter + rank,
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i * block_size;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    /* Clean up and return with received ptr, index and data arrays */
    delete[] ptr;
    delete[] idx;
    delete[] val;
    *p_ptr = recv_ptr;
    *p_idx = recv_idx;
    *p_val = recv_val;
}
584    
585 caltinay 4836 paso::SystemMatrix_ptr Paso_Preconditioner_AMG_buildInterpolationOperator(
586     paso::SystemMatrix_ptr A, paso::SystemMatrix_ptr P,
587     paso::SystemMatrix_ptr R)
588 lgao 3763 {
589     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
590 caltinay 4836 paso::SystemMatrix_ptr out;
591 caltinay 4818 paso::SystemMatrixPattern_ptr pattern;
592 caltinay 4814 paso::Distribution_ptr input_dist, output_dist;
593 caltinay 4817 paso::Connector_ptr col_connector, row_connector;
594 lgao 3763 const dim_t row_block_size=A->row_block_size;
595     const dim_t col_block_size=A->col_block_size;
596 lgao 3779 const dim_t block_size = A->block_size;
597     const dim_t P_block_size = P->block_size;
598 lgao 3763 const double ZERO = 0.0;
599     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
600 lgao 3779 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
601 lgao 3763 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
602     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
603     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
604     index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
605     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
606     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
607     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
608 lgao 3779 index_t sum, i, j, k, iptr, irb, icb, ib;
609 lgao 3763 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
610     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
611     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
612     index_t row_marker_ext, *where_p=NULL;
613     index_t **send_ptr=NULL, **send_idx=NULL;
614 lgao 3828 dim_t p, num_neighbors;
615 lgao 3763 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
616     Esys_MPI_rank *neighbor=NULL;
617     #ifdef ESYS_MPI
618     MPI_Request* mpi_requests=NULL;
619     MPI_Status* mpi_stati=NULL;
620     #else
621     int *mpi_requests=NULL, *mpi_stati=NULL;
622     #endif
623    
624 lgao 3828 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
625     return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
626 lgao 3779
627 jfenwick 4345 /* two sparse matrices R_main and R_couple will be generate, as the
628 lgao 3763 transpose of P_main and P_col_couple, respectively. Note that,
629     R_couple is actually the row_coupleBlock of R (R=P^T) */
630 caltinay 4829 paso::SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
631     paso::SparseMatrix_ptr R_couple;
632 lgao 3817 if (size > 1)
633 caltinay 4829 R_couple = P->col_coupleBlock->getTranspose();
634 lgao 3763
635 jfenwick 4345 /* generate P_ext, i.e. portion of P that is stored on neighbor procs
636 lgao 3779 and needed locally for triple matrix product RAP
637     to be specific, P_ext (on processor k) are group of rows in P, where
638     the list of rows from processor q is given by
639 caltinay 4829 A->col_coupler->connector->send->shared[rPtr...]
640     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
641 lgao 3779 on q.
642 jfenwick 4345 P_ext is represented by two sparse matrices P_ext_main and
643 lgao 3779 P_ext_couple */
644     Paso_Preconditioner_AMG_extendB(A, P);
645    
646     /* count the number of cols in P_ext_couple, resize the pattern of
647     sparse matrix P_ext_couple with new compressed order, and then
648     build the col id mapping from P->col_coupleBlock to
649     P_ext_couple */
650     num_Pmain_cols = P->mainBlock->numCols;
651 lgao 3817 if (size > 1) {
652     num_Pcouple_cols = P->col_coupleBlock->numCols;
653     num_Acouple_cols = A->col_coupleBlock->numCols;
654     sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
655     } else {
656     num_Pcouple_cols = 0;
657     num_Acouple_cols = 0;
658     sum = 0;
659     }
660 lgao 3779 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
661     offset = P->col_distribution->first_component[rank];
662     num_Pext_cols = 0;
663     if (P->global_id) {
664     /* first count the number of cols "num_Pext_cols" in both P_ext_couple
665 caltinay 4829 and P->col_coupleBlock */
666 lgao 3779 iptr = 0;
667     if (num_Pcouple_cols || sum > 0) {
668 caltinay 4829 temp = new index_t[num_Pcouple_cols+sum];
669     #pragma omp parallel for lastprivate(iptr) schedule(static)
670     for (iptr=0; iptr<sum; iptr++){
671     temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
672     }
673     for (j=0; j<num_Pcouple_cols; j++, iptr++){
674     temp[iptr] = P->global_id[j];
675     }
676 lgao 3779 }
677     if (iptr) {
678 caltinay 4829 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
679     num_Pext_cols = 1;
680     i = temp[0];
681     for (j=1; j<iptr; j++) {
682     if (temp[j] > i) {
683     i = temp[j];
684     temp[num_Pext_cols++] = i;
685     }
686     }
687 lgao 3779 }
688     /* resize the pattern of P_ext_couple */
689     if(num_Pext_cols){
690 caltinay 4829 global_id_P = new index_t[num_Pext_cols];
691     #pragma omp parallel for private(i) schedule(static)
692     for (i=0; i<num_Pext_cols; i++)
693     global_id_P[i] = temp[i];
694 lgao 3779 }
695     if (num_Pcouple_cols || sum > 0)
696 caltinay 4829 delete[] temp;
697 lgao 3821 #pragma omp parallel for private(i, where_p) schedule(static)
698 lgao 3779 for (i=0; i<sum; i++) {
699 caltinay 4829 where_p = (index_t *)bsearch(
700     &(P->remote_coupleBlock->pattern->index[i]),
701     global_id_P, num_Pext_cols,
702 caltinay 4803 sizeof(index_t), paso::comparIndex);
703 caltinay 4829 P->remote_coupleBlock->pattern->index[i] =
704     (index_t)(where_p -global_id_P);
705 lgao 3779 }
706    
707     /* build the mapping */
708     if (num_Pcouple_cols) {
709 caltinay 4829 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
710     iptr = 0;
711     for (i=0; i<num_Pext_cols; i++)
712     if (global_id_P[i] == P->global_id[iptr]) {
713     Pcouple_to_Pext[iptr++] = i;
714     if (iptr == num_Pcouple_cols) break;
715     }
716 lgao 3779 }
717     }
718    
719 jfenwick 4345 /* alloc and initialise the makers */
720 lgao 3779 sum = num_Pext_cols + num_Pmain_cols;
721 jfenwick 4275 P_marker = new index_t[sum];
722     A_marker = new index_t[num_A_cols];
723 lgao 3779 #pragma omp parallel for private(i) schedule(static)
724     for (i=0; i<sum; i++) P_marker[i] = -1;
725     #pragma omp parallel for private(i) schedule(static)
726     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
727    
728     /* Now, count the size of RAP_ext. Start with rows in R_couple */
729     sum = 0;
730     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
731     row_marker = sum;
732     /* then, loop over elements in row i_r of R_couple */
733     j1_ub = R_couple->pattern->ptr[i_r+1];
734     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
735 caltinay 4829 i1 = R_couple->pattern->index[j1];
736     /* then, loop over elements in row i1 of A->col_coupleBlock */
737     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
738     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
739     i2 = A->col_coupleBlock->pattern->index[j2];
740 lgao 3779
741 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
742     RAP new entry is possible only if entry RA[i_r, i2] has not
743     been visited yet */
744     if (A_marker[i2] != i_r) {
745     /* first, mark entry RA[i_r,i2] as visited */
746     A_marker[i2] = i_r;
747 lgao 3779
748 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
749     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
750     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
751     i_c = P->row_coupleBlock->pattern->index[j3];
752 lgao 3779
753 caltinay 4829 /* check whether entry RAP[i_r,i_c] has been created.
754     If not yet created, create the entry and increase
755     the total number of elements in RAP_ext */
756     if (P_marker[i_c] < row_marker) {
757     P_marker[i_c] = sum;
758     sum++;
759     }
760     }
761 lgao 3779
762 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
763     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
764     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
765     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
766     if (P_marker[i_c] < row_marker) {
767     P_marker[i_c] = sum;
768     sum++;
769     }
770     }
771     }
772     }
773 lgao 3779
774 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
775     the process we have done to block A->col_coupleBlock */
776     j2_ub = A->mainBlock->pattern->ptr[i1+1];
777     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
778     i2 = A->mainBlock->pattern->index[j2];
779     if (A_marker[i2 + num_Acouple_cols] != i_r) {
780     A_marker[i2 + num_Acouple_cols] = i_r;
781     j3_ub = P->mainBlock->pattern->ptr[i2+1];
782     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
783     i_c = P->mainBlock->pattern->index[j3];
784     if (P_marker[i_c] < row_marker) {
785     P_marker[i_c] = sum;
786     sum++;
787     }
788     }
789     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
790     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
791     /* note that we need to map the column index in
792     P->col_coupleBlock back into the column index in
793     P_ext_couple */
794     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
795     if (P_marker[i_c] < row_marker) {
796     P_marker[i_c] = sum;
797     sum++;
798     }
799     }
800     }
801     }
802 lgao 3779 }
803     }
804    
805     /* Now we have the number of non-zero elements in RAP_ext, allocate
806     PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
807 jfenwick 4275 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
808     RAP_ext_idx = new index_t[sum];
809     RAP_ext_val = new double[sum * block_size];
810     RA_val = new double[block_size];
811     RAP_val = new double[block_size];
812 lgao 3779
813     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
814     sum = num_Pext_cols + num_Pmain_cols;
815     #pragma omp parallel for private(i) schedule(static)
816     for (i=0; i<sum; i++) P_marker[i] = -1;
817     #pragma omp parallel for private(i) schedule(static)
818     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
819     sum = 0;
820     RAP_ext_ptr[0] = 0;
821     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
822     row_marker = sum;
823     /* then, loop over elements in row i_r of R_couple */
824     j1_ub = R_couple->pattern->ptr[i_r+1];
825     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
826 caltinay 4829 i1 = R_couple->pattern->index[j1];
827     R_val = &(R_couple->val[j1*P_block_size]);
828 lgao 3779
829 caltinay 4829 /* then, loop over elements in row i1 of A->col_coupleBlock */
830     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
831     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
832     i2 = A->col_coupleBlock->pattern->index[j2];
833     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
834     for (irb=0; irb<row_block_size; irb++)
835 lgao 3779 for (icb=0; icb<col_block_size; icb++)
836     RA_val[irb+row_block_size*icb] = ZERO;
837 caltinay 4829 for (irb=0; irb<P_block_size; irb++) {
838     rtmp=R_val[irb];
839     for (icb=0; icb<col_block_size; icb++) {
840     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
841     }
842     }
843 lgao 3779
844 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
845     RAP new entry is possible only if entry RA[i_r, i2] has not
846     been visited yet */
847     if (A_marker[i2] != i_r) {
848     /* first, mark entry RA[i_r,i2] as visited */
849     A_marker[i2] = i_r;
850 lgao 3779
851 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
852     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
853     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
854     i_c = P->row_coupleBlock->pattern->index[j3];
855     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
856     for (irb=0; irb<row_block_size; irb++)
857     for (icb=0; icb<col_block_size; icb++)
858     RAP_val[irb+row_block_size*icb] = ZERO;
859     for (icb=0; icb<P_block_size; icb++) {
860     rtmp = temp_val[icb];
861     for (irb=0; irb<row_block_size; irb++) {
862     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
863     }
864     }
865 lgao 3779
866 caltinay 4829 /* check whether entry RAP[i_r,i_c] has been created.
867     If not yet created, create the entry and increase
868     the total number of elements in RAP_ext */
869     if (P_marker[i_c] < row_marker) {
870     P_marker[i_c] = sum;
871     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
872     RAP_ext_idx[sum] = i_c + offset;
873     sum++;
874     } else {
875     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
876     for (ib=0; ib<block_size; ib++) {
877     temp_val[ib] += RAP_val[ib];
878     }
879     }
880     }
881 lgao 3779
882 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
883     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
884     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
885     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
886 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
887 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
888     for (icb=0; icb<col_block_size; icb++)
889     RAP_val[irb+row_block_size*icb]=ZERO;
890 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
891 caltinay 4829 rtmp = temp_val[icb];
892 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
893     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
894     }
895     }
896    
897 caltinay 4829 if (P_marker[i_c] < row_marker) {
898     P_marker[i_c] = sum;
899     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
900     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
901     sum++;
902     } else {
903 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
904     for (ib=0; ib<block_size; ib++) {
905     temp_val[ib] += RAP_val[ib];
906     }
907 caltinay 4829 }
908     }
909 lgao 3779
910 caltinay 4829 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
911     Only the contributions are added. */
912     } else {
913     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
914     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
915     i_c = P->row_coupleBlock->pattern->index[j3];
916 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
917 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
918     for (icb=0; icb<col_block_size; icb++)
919     RAP_val[irb+row_block_size*icb]=ZERO;
920 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
921 caltinay 4829 rtmp = temp_val[icb];
922 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
923     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
924     }
925     }
926    
927     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
928     for (ib=0; ib<block_size; ib++) {
929     temp_val[ib] += RAP_val[ib];
930     }
931 caltinay 4829 }
932     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
933     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
934     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
935 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
936 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
937     for (icb=0; icb<col_block_size; icb++)
938     RAP_val[irb+row_block_size*icb]=ZERO;
939 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
940 caltinay 4829 rtmp = temp_val[icb];
941 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
942     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
943     }
944     }
945    
946     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
947     for (ib=0; ib<block_size; ib++) {
948     temp_val[ib] += RAP_val[ib];
949     }
950 caltinay 4829 }
951     }
952     }
953 lgao 3779
954 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
955     the process we have done to block A->col_coupleBlock */
956     j2_ub = A->mainBlock->pattern->ptr[i1+1];
957     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
958     i2 = A->mainBlock->pattern->index[j2];
959 lgao 3779 temp_val = &(A->mainBlock->val[j2*block_size]);
960 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
961     for (icb=0; icb<col_block_size; icb++)
962     RA_val[irb+row_block_size*icb]=ZERO;
963 lgao 3779 for (irb=0; irb<P_block_size; irb++) {
964 caltinay 4829 rtmp=R_val[irb];
965 lgao 3779 for (icb=0; icb<col_block_size; icb++) {
966     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
967     }
968     }
969    
970 caltinay 4829 if (A_marker[i2 + num_Acouple_cols] != i_r) {
971     A_marker[i2 + num_Acouple_cols] = i_r;
972     j3_ub = P->mainBlock->pattern->ptr[i2+1];
973     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
974     i_c = P->mainBlock->pattern->index[j3];
975 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
976 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
977     for (icb=0; icb<col_block_size; icb++)
978     RAP_val[irb+row_block_size*icb]=ZERO;
979 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
980 caltinay 4829 rtmp = temp_val[icb];
981 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
982     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
983     }
984     }
985    
986 caltinay 4829 if (P_marker[i_c] < row_marker) {
987     P_marker[i_c] = sum;
988     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
989     RAP_ext_idx[sum] = i_c + offset;
990     sum++;
991     } else {
992 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
993     for (ib=0; ib<block_size; ib++) {
994     temp_val[ib] += RAP_val[ib];
995     }
996 caltinay 4829 }
997     }
998     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
999     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1000     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1001 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1002 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1003 lgao 3779 for (icb=0; icb<col_block_size; icb++)
1004 caltinay 4829 RAP_val[irb+row_block_size*icb]=ZERO;
1005 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1006 caltinay 4829 rtmp=temp_val[icb];
1007 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1008     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1009     }
1010     }
1011    
1012 caltinay 4829 if (P_marker[i_c] < row_marker) {
1013     P_marker[i_c] = sum;
1014     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1015 lgao 3779 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1016 caltinay 4829 sum++;
1017     } else {
1018 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1019     for (ib=0; ib<block_size; ib++) {
1020     temp_val[ib] += RAP_val[ib];
1021     }
1022 caltinay 4829 }
1023     }
1024     } else {
1025     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1026     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1027     i_c = P->mainBlock->pattern->index[j3];
1028 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1029 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1030     for (icb=0; icb<col_block_size; icb++)
1031     RAP_val[irb+row_block_size*icb]=ZERO;
1032 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1033 caltinay 4829 rtmp=temp_val[icb];
1034 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1035     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1036     }
1037     }
1038    
1039     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1040     for (ib=0; ib<block_size; ib++) {
1041     temp_val[ib] += RAP_val[ib];
1042     }
1043 caltinay 4829 }
1044     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1045     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1046     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1047 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1048 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1049     for (icb=0; icb<col_block_size; icb++)
1050     RAP_val[irb+row_block_size*icb]=ZERO;
1051 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1052 caltinay 4829 rtmp=temp_val[icb];
1053 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1054     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1055     }
1056     }
1057    
1058     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1059     for (ib=0; ib<block_size; ib++) {
1060     temp_val[ib] += RAP_val[ib];
1061     }
1062 caltinay 4829 }
1063     }
1064     }
1065 lgao 3779 }
1066     RAP_ext_ptr[i_r+1] = sum;
1067     }
1068 jfenwick 4275 delete[] P_marker;
1069     delete[] Pcouple_to_Pext;
1070 lgao 3779
1071     /* Now we have part of RAP[r,c] where row "r" is the list of rows
1072     which is given by the column list of P->col_coupleBlock, and
1073 jfenwick 4345 column "c" is the list of columns which possibly covers the
1074     whole column range of system matrix P. This part of data is to
1075     be passed to neighbouring processors, and added to corresponding
1076     RAP[r,c] entries in the neighbouring processors */
1077 lgao 3779 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1078 caltinay 4829 &RAP_ext_val, global_id_P, block_size);
1079 lgao 3779
1080     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1081     sum = RAP_ext_ptr[num_RAPext_rows];
1082     num_RAPext_cols = 0;
1083     if (num_Pext_cols || sum > 0) {
1084 jfenwick 4275 temp = new index_t[num_Pext_cols+sum];
1085 lgao 3779 j1_ub = offset + num_Pmain_cols;
1086     for (i=0, iptr=0; i<sum; i++) {
1087 caltinay 4829 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1088     temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1089 lgao 3779 }
1090     for (j=0; j<num_Pext_cols; j++, iptr++){
1091 caltinay 4829 temp[iptr] = global_id_P[j];
1092 lgao 3779 }
1093    
1094     if (iptr) {
1095 caltinay 4829 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1096     num_RAPext_cols = 1;
1097     i = temp[0];
1098     for (j=1; j<iptr; j++) {
1099     if (temp[j] > i) {
1100     i = temp[j];
1101     temp[num_RAPext_cols++] = i;
1102     }
1103     }
1104 lgao 3779 }
1105     }
1106    
1107     /* resize the pattern of P_ext_couple */
1108     if(num_RAPext_cols){
1109 jfenwick 4275 global_id_RAP = new index_t[num_RAPext_cols];
1110 lgao 3821 #pragma omp parallel for private(i) schedule(static)
1111 lgao 3779 for (i=0; i<num_RAPext_cols; i++)
1112 caltinay 4829 global_id_RAP[i] = temp[i];
1113 lgao 3779 }
1114     if (num_Pext_cols || sum > 0)
1115 jfenwick 4275 delete[] temp;
1116 lgao 3779 j1_ub = offset + num_Pmain_cols;
1117 lgao 3821 #pragma omp parallel for private(i, where_p) schedule(static)
1118 lgao 3779 for (i=0; i<sum; i++) {
1119     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1120 caltinay 4829 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1121     /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex);
1122     RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1123 lgao 3779 } else
1124 caltinay 4829 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1125 lgao 3779 }
1126    
1127     /* build the mapping */
1128     if (num_Pcouple_cols) {
1129 jfenwick 4275 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1130 lgao 3779 iptr = 0;
1131     for (i=0; i<num_RAPext_cols; i++)
1132 caltinay 4829 if (global_id_RAP[i] == P->global_id[iptr]) {
1133     Pcouple_to_RAP[iptr++] = i;
1134     if (iptr == num_Pcouple_cols) break;
1135     }
1136 lgao 3779 }
1137    
1138     if (num_Pext_cols) {
1139 jfenwick 4275 Pext_to_RAP = new index_t[num_Pext_cols];
1140 lgao 3779 iptr = 0;
1141     for (i=0; i<num_RAPext_cols; i++)
1142 caltinay 4829 if (global_id_RAP[i] == global_id_P[iptr]) {
1143     Pext_to_RAP[iptr++] = i;
1144     if (iptr == num_Pext_cols) break;
1145     }
1146 lgao 3779 }
1147    
1148     if (global_id_P){
1149 jfenwick 4275 delete[] global_id_P;
1150 lgao 3779 global_id_P = NULL;
1151     }
1152    
1153 jfenwick 4345 /* alloc and initialise the makers */
1154 lgao 3779 sum = num_RAPext_cols + num_Pmain_cols;
1155 jfenwick 4275 P_marker = new index_t[sum];
1156 lgao 3779 #pragma omp parallel for private(i) schedule(static)
1157     for (i=0; i<sum; i++) P_marker[i] = -1;
1158     #pragma omp parallel for private(i) schedule(static)
1159     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1160    
1161     /* Now, count the size of RAP. Start with rows in R_main */
1162     num_neighbors = P->col_coupler->connector->send->numNeighbors;
1163     offsetInShared = P->col_coupler->connector->send->offsetInShared;
1164     shared = P->col_coupler->connector->send->shared;
1165     i = 0;
1166     j = 0;
1167     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1168     /* Mark the start of row for both main block and couple block */
1169     row_marker = i;
1170     row_marker_ext = j;
1171    
1172     /* Mark the diagonal element RAP[i_r, i_r], and other elements
1173 caltinay 4829 in RAP_ext */
1174 lgao 3779 P_marker[i_r] = i;
1175     i++;
1176     for (j1=0; j1<num_neighbors; j1++) {
1177 caltinay 4829 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1178     if (shared[j2] == i_r) {
1179     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1180     i_c = RAP_ext_idx[k];
1181     if (i_c < num_Pmain_cols) {
1182     if (P_marker[i_c] < row_marker) {
1183     P_marker[i_c] = i;
1184     i++;
1185     }
1186     } else {
1187     if (P_marker[i_c] < row_marker_ext) {
1188     P_marker[i_c] = j;
1189     j++;
1190     }
1191     }
1192     }
1193     break;
1194     }
1195     }
1196 lgao 3779 }
1197    
1198     /* then, loop over elements in row i_r of R_main */
1199     j1_ub = R_main->pattern->ptr[i_r+1];
1200     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1201 caltinay 4829 i1 = R_main->pattern->index[j1];
1202 lgao 3779
1203 caltinay 4829 /* then, loop over elements in row i1 of A->col_coupleBlock */
1204     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1205     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1206     i2 = A->col_coupleBlock->pattern->index[j2];
1207 lgao 3779
1208 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
1209     RAP new entry is possible only if entry RA[i_r, i2] has not
1210     been visited yet */
1211     if (A_marker[i2] != i_r) {
1212     /* first, mark entry RA[i_r,i2] as visited */
1213     A_marker[i2] = i_r;
1214 lgao 3779
1215 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
1216     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1217     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1218     i_c = P->row_coupleBlock->pattern->index[j3];
1219 lgao 3779
1220 caltinay 4829 /* check whether entry RAP[i_r,i_c] has been created.
1221     If not yet created, create the entry and increase
1222     the total number of elements in RAP_ext */
1223     if (P_marker[i_c] < row_marker) {
1224     P_marker[i_c] = i;
1225     i++;
1226     }
1227     }
1228 lgao 3779
1229 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
1230     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1231     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1233     if (P_marker[i_c] < row_marker_ext) {
1234     P_marker[i_c] = j;
1235     j++;
1236     }
1237     }
1238     }
1239     }
1240 lgao 3779
1241 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
1242     the process we have done to block A->col_coupleBlock */
1243     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1244     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1245     i2 = A->mainBlock->pattern->index[j2];
1246     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1247     A_marker[i2 + num_Acouple_cols] = i_r;
1248     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1249     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1250     i_c = P->mainBlock->pattern->index[j3];
1251     if (P_marker[i_c] < row_marker) {
1252     P_marker[i_c] = i;
1253     i++;
1254     }
1255     }
1256     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1257     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1258     /* note that we need to map the column index in
1259     P->col_coupleBlock back into the column index in
1260     P_ext_couple */
1261     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1262     if (P_marker[i_c] < row_marker_ext) {
1263     P_marker[i_c] = j;
1264     j++;
1265     }
1266     }
1267     }
1268     }
1269 lgao 3779 }
1270     }
1271    
1272     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1273     Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1274     and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1275     RAP_couple */
1276 jfenwick 4275 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1277     RAP_main_idx = new index_t[i];
1278     RAP_main_val = new double[i * block_size];
1279     RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1280     RAP_couple_idx = new index_t[j];
1281     RAP_couple_val = new double[j * block_size];
1282 lgao 3779
1283     RAP_main_ptr[num_Pmain_cols] = i;
1284     RAP_couple_ptr[num_Pmain_cols] = j;
1285    
1286     #pragma omp parallel for private(i) schedule(static)
1287     for (i=0; i<sum; i++) P_marker[i] = -1;
1288     #pragma omp parallel for private(i) schedule(static)
1289     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1290    
1291     /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1292     in R_main */
1293     i = 0;
1294     j = 0;
1295     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1296     /* Mark the start of row for both main block and couple block */
1297     row_marker = i;
1298     row_marker_ext = j;
1299     RAP_main_ptr[i_r] = row_marker;
1300     RAP_couple_ptr[i_r] = row_marker_ext;
1301    
1302     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1303 caltinay 4829 in row i_r of RAP_ext */
1304 lgao 3779 P_marker[i_r] = i;
1305     for (ib=0; ib<block_size; ib++)
1306     RAP_main_val[i*block_size+ib] = ZERO;
1307     RAP_main_idx[i] = i_r;
1308     i++;
1309    
1310     for (j1=0; j1<num_neighbors; j1++) {
1311 caltinay 4829 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1312     if (shared[j2] == i_r) {
1313     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1314     i_c = RAP_ext_idx[k];
1315     if (i_c < num_Pmain_cols) {
1316     if (P_marker[i_c] < row_marker) {
1317     P_marker[i_c] = i;
1318     memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1319     RAP_main_idx[i] = i_c;
1320     i++;
1321     } else {
1322     t1_val = &(RAP_ext_val[k*block_size]);
1323     t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1324     for (ib=0; ib<block_size; ib++)
1325     t2_val[ib] += t1_val[ib];
1326     }
1327     } else {
1328     if (P_marker[i_c] < row_marker_ext) {
1329     P_marker[i_c] = j;
1330     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1331     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1332     j++;
1333     } else {
1334     t1_val = &(RAP_ext_val[k*block_size]);
1335     t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1336     for (ib=0; ib<block_size; ib++)
1337     t2_val[ib] += t1_val[ib];
1338     }
1339     }
1340     }
1341     break;
1342     }
1343     }
1344 lgao 3779 }
1345    
1346     /* then, loop over elements in row i_r of R_main */
1347     j1_ub = R_main->pattern->ptr[i_r+1];
1348     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1349 caltinay 4829 i1 = R_main->pattern->index[j1];
1350     R_val = &(R_main->val[j1*P_block_size]);
1351 lgao 3779
1352 caltinay 4829 /* then, loop over elements in row i1 of A->col_coupleBlock */
1353     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1354     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1355     i2 = A->col_coupleBlock->pattern->index[j2];
1356 lgao 3779 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1357 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1358     for (icb=0; icb<col_block_size; icb++)
1359     RA_val[irb+row_block_size*icb]=ZERO;
1360 lgao 3779 for (irb=0; irb<P_block_size; irb++) {
1361 caltinay 4829 rtmp=R_val[irb];
1362 lgao 3779 for (icb=0; icb<col_block_size; icb++) {
1363     RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1364     }
1365     }
1366    
1367    
1368 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
1369     RAP new entry is possible only if entry RA[i_r, i2] has not
1370     been visited yet */
1371     if (A_marker[i2] != i_r) {
1372     /* first, mark entry RA[i_r,i2] as visited */
1373     A_marker[i2] = i_r;
1374 lgao 3779
1375 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
1376     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1377     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1378     i_c = P->row_coupleBlock->pattern->index[j3];
1379 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1380 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1381     for (icb=0; icb<col_block_size; icb++)
1382     RAP_val[irb+row_block_size*icb]=ZERO;
1383 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1384 caltinay 4829 rtmp=temp_val[icb];
1385 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1386     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1387     }
1388     }
1389    
1390    
1391 caltinay 4829 /* check whether entry RAP[i_r,i_c] has been created.
1392     If not yet created, create the entry and increase
1393     the total number of elements in RAP_ext */
1394     if (P_marker[i_c] < row_marker) {
1395     P_marker[i_c] = i;
1396     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1397     RAP_main_idx[i] = i_c;
1398     i++;
1399     } else {
1400 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1401     for (ib=0; ib<block_size; ib++) {
1402     temp_val[ib] += RAP_val[ib];
1403     }
1404 caltinay 4829 }
1405     }
1406 lgao 3779
1407 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
1408     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1409     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1410     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1411 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1412 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1413     for (icb=0; icb<col_block_size; icb++)
1414     RAP_val[irb+row_block_size*icb]=ZERO;
1415 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1416 caltinay 4829 rtmp=temp_val[icb];
1417 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1418     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1419     }
1420     }
1421    
1422 caltinay 4829 if (P_marker[i_c] < row_marker_ext) {
1423     P_marker[i_c] = j;
1424     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1425     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1426     j++;
1427     } else {
1428 lgao 3779 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1429     for (ib=0; ib<block_size; ib++) {
1430     temp_val[ib] += RAP_val[ib];
1431     }
1432 caltinay 4829 }
1433     }
1434 lgao 3779
1435 caltinay 4829 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1436     Only the contributions are added. */
1437     } else {
1438     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1439     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1440     i_c = P->row_coupleBlock->pattern->index[j3];
1441 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1442 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1443     for (icb=0; icb<col_block_size; icb++)
1444     RAP_val[irb+row_block_size*icb]=ZERO;
1445 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1446 caltinay 4829 rtmp=temp_val[icb];
1447 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1448     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1449     }
1450     }
1451    
1452     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1453     for (ib=0; ib<block_size; ib++) {
1454     temp_val[ib] += RAP_val[ib];
1455     }
1456 caltinay 4829 }
1457     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1458     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1459     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1460 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1461 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1462     for (icb=0; icb<col_block_size; icb++)
1463     RAP_val[irb+row_block_size*icb]=ZERO;
1464 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1465 caltinay 4829 rtmp=temp_val[icb];
1466 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1467     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1468     }
1469     }
1470    
1471     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1472     for (ib=0; ib<block_size; ib++) {
1473     temp_val[ib] += RAP_val[ib];
1474     }
1475 caltinay 4829 }
1476     }
1477     }
1478 lgao 3779
1479 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
1480     the process we have done to block A->col_coupleBlock */
1481     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1482     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1483     i2 = A->mainBlock->pattern->index[j2];
1484 lgao 3779 temp_val = &(A->mainBlock->val[j2*block_size]);
1485 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1486     for (icb=0; icb<col_block_size; icb++)
1487     RA_val[irb+row_block_size*icb]=ZERO;
1488 lgao 3779 for (irb=0; irb<P_block_size; irb++) {
1489 caltinay 4829 rtmp=R_val[irb];
1490 lgao 3779 for (icb=0; icb<col_block_size; icb++) {
1491     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1492     }
1493     }
1494    
1495 caltinay 4829 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1496     A_marker[i2 + num_Acouple_cols] = i_r;
1497     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1498     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1499     i_c = P->mainBlock->pattern->index[j3];
1500 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1501 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1502     for (icb=0; icb<col_block_size; icb++)
1503     RAP_val[irb+row_block_size*icb]=ZERO;
1504 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1505 caltinay 4829 rtmp=temp_val[icb];
1506 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1507     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1508     }
1509     }
1510 caltinay 4829 if (P_marker[i_c] < row_marker) {
1511     P_marker[i_c] = i;
1512     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1513     RAP_main_idx[i] = i_c;
1514     i++;
1515     } else {
1516 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1517     for (ib=0; ib<block_size; ib++) {
1518     temp_val[ib] += RAP_val[ib];
1519     }
1520 caltinay 4829 }
1521     }
1522     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1523     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1524     /* note that we need to map the column index in
1525     P->col_coupleBlock back into the column index in
1526     P_ext_couple */
1527     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1528 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1529 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1530     for (icb=0; icb<col_block_size; icb++)
1531     RAP_val[irb+row_block_size*icb]=ZERO;
1532 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1533 caltinay 4829 rtmp=temp_val[icb];
1534 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1535     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1536     }
1537     }
1538    
1539 caltinay 4829 if (P_marker[i_c] < row_marker_ext) {
1540     P_marker[i_c] = j;
1541     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1542     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1543     j++;
1544     } else {
1545 lgao 3779 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1546     for (ib=0; ib<block_size; ib++) {
1547     temp_val[ib] += RAP_val[ib];
1548     }
1549 caltinay 4829 }
1550     }
1551 lgao 3779
1552 caltinay 4829 } else {
1553     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1554     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1555     i_c = P->mainBlock->pattern->index[j3];
1556 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1557 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1558     for (icb=0; icb<col_block_size; icb++)
1559     RAP_val[irb+row_block_size*icb]=ZERO;
1560 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1561 caltinay 4829 rtmp=temp_val[icb];
1562 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1563     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1564     }
1565     }
1566    
1567     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1568     for (ib=0; ib<block_size; ib++) {
1569     temp_val[ib] += RAP_val[ib];
1570     }
1571 caltinay 4829 }
1572     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1573     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1574     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1575 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1576 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1577     for (icb=0; icb<col_block_size; icb++)
1578     RAP_val[irb+row_block_size*icb]=ZERO;
1579 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1580 caltinay 4829 rtmp = temp_val[icb];
1581 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1582     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1583     }
1584     }
1585    
1586     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1587     for (ib=0; ib<block_size; ib++) {
1588     temp_val[ib] += RAP_val[ib];
1589     }
1590 caltinay 4829 }
1591     }
1592     }
1593 lgao 3779 }
1594    
1595     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1596     if (i > row_marker) {
1597 caltinay 4829 offset = i - row_marker;
1598     temp = new index_t[offset];
1599     #pragma omp parallel for schedule(static) private(iptr)
1600     for (iptr=0; iptr<offset; iptr++)
1601     temp[iptr] = RAP_main_idx[row_marker+iptr];
1602     if (offset > 0) {
1603     qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1604     }
1605     temp_val = new double[offset * block_size];
1606     #pragma omp parallel for schedule(static) private(iptr,k)
1607     for (iptr=0; iptr<offset; iptr++){
1608     k = P_marker[temp[iptr]];
1609     memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1610     P_marker[temp[iptr]] = iptr + row_marker;
1611     }
1612     #pragma omp parallel for schedule(static) private(iptr)
1613     for (iptr=0; iptr<offset; iptr++){
1614     RAP_main_idx[row_marker+iptr] = temp[iptr];
1615     memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1616     }
1617     delete[] temp;
1618     delete[] temp_val;
1619 lgao 3779 }
1620     if (j > row_marker_ext) {
1621     offset = j - row_marker_ext;
1622 jfenwick 4275 temp = new index_t[offset];
1623 caltinay 4829 #pragma omp parallel for schedule(static) private(iptr)
1624 lgao 3779 for (iptr=0; iptr<offset; iptr++)
1625     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1626     if (offset > 0) {
1627 caltinay 4803 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1628 lgao 3779 }
1629 jfenwick 4275 temp_val = new double[offset * block_size];
1630 caltinay 4829 #pragma omp parallel for schedule(static) private(iptr, k)
1631 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1632     k = P_marker[temp[iptr] + num_Pmain_cols];
1633 caltinay 4829 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1634 lgao 3779 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1635     }
1636 caltinay 4829 #pragma omp parallel for schedule(static) private(iptr)
1637 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1638     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1639 caltinay 4829 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1640 lgao 3779 }
1641 jfenwick 4275 delete[] temp;
1642     delete[] temp_val;
1643 lgao 3779 }
1644     }
1645    
1646 jfenwick 4275 delete[] RA_val;
1647     delete[] RAP_val;
1648     delete[] A_marker;
1649     delete[] Pext_to_RAP;
1650     delete[] Pcouple_to_RAP;
1651     delete[] RAP_ext_ptr;
1652     delete[] RAP_ext_idx;
1653     delete[] RAP_ext_val;
1654 caltinay 4829 R_main.reset();
1655     R_couple.reset();
1656 lgao 3779
1657     /* Check whether there are empty columns in RAP_couple */
1658     #pragma omp parallel for schedule(static) private(i)
1659     for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1660     /* num of non-empty columns is stored in "k" */
1661     k = 0;
1662     j = RAP_couple_ptr[num_Pmain_cols];
1663     for (i=0; i<j; i++) {
1664     i1 = RAP_couple_idx[i];
1665     if (P_marker[i1]) {
1666 caltinay 4829 P_marker[i1] = 0;
1667     k++;
1668 lgao 3779 }
1669     }
1670    
1671     /* empty columns are found */
1672     if (k < num_RAPext_cols) {
1673 jfenwick 4275 temp = new index_t[k];
1674 lgao 3779 k = 0;
1675     for (i=0; i<num_RAPext_cols; i++)
1676 caltinay 4829 if (!P_marker[i]) {
1677     P_marker[i] = k;
1678     temp[k] = global_id_RAP[i];
1679     k++;
1680     }
1681 lgao 3821 #pragma omp parallel for schedule(static) private(i, i1)
1682 lgao 3779 for (i=0; i<j; i++) {
1683 caltinay 4829 i1 = RAP_couple_idx[i];
1684     RAP_couple_idx[i] = P_marker[i1];
1685 lgao 3779 }
1686     num_RAPext_cols = k;
1687 jfenwick 4275 delete[] global_id_RAP;
1688 lgao 3779 global_id_RAP = temp;
1689     }
1690 jfenwick 4275 delete[] P_marker;
1691 lgao 3779
1692     /******************************************************/
1693 jfenwick 4345 /* Start to create the coarse level System Matrix A_c */
1694 lgao 3779 /******************************************************/
1695     /* first, prepare the sender/receiver for the col_connector */
1696     dist = P->pattern->input_distribution->first_component;
1697 jfenwick 4275 recv_len = new dim_t[size];
1698     send_len = new dim_t[size];
1699     neighbor = new Esys_MPI_rank[size];
1700     offsetInShared = new index_t[size+1];
1701     shared = new index_t[num_RAPext_cols];
1702 lgao 3779 memset(recv_len, 0, sizeof(dim_t) * size);
1703     memset(send_len, 0, sizeof(dim_t) * size);
1704     num_neighbors = 0;
1705     offsetInShared[0] = 0;
1706     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1707     shared[i] = i + num_Pmain_cols;
1708     if (k <= global_id_RAP[i]) {
1709 caltinay 4829 if (recv_len[j] > 0) {
1710     neighbor[num_neighbors] = j;
1711     num_neighbors ++;
1712     offsetInShared[num_neighbors] = i;
1713     }
1714     while (k <= global_id_RAP[i]) {
1715     j++;
1716     k = dist[j+1];
1717     }
1718 lgao 3779 }
1719     recv_len[j] ++;
1720     }
1721     if (recv_len[j] > 0) {
1722     neighbor[num_neighbors] = j;
1723     num_neighbors ++;
1724     offsetInShared[num_neighbors] = i;
1725     }
1726    
1727 caltinay 4816 paso::SharedComponents_ptr recv(new paso::SharedComponents(
1728     num_Pmain_cols, num_neighbors, neighbor, shared,
1729     offsetInShared, 1, 0, mpi_info));
1730    
1731 lgao 3828 #ifdef ESYS_MPI
1732 lgao 3779 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1733 lgao 3828 #endif
1734 lgao 3779
1735     #ifdef ESYS_MPI
1736 jfenwick 4275 mpi_requests=new MPI_Request[size*2];
1737     mpi_stati=new MPI_Status[size*2];
1738 lgao 3779 #else
1739 jfenwick 4275 mpi_requests=new int[size*2];
1740     mpi_stati=new int[size*2];
1741 lgao 3779 #endif
1742     num_neighbors = 0;
1743     j = 0;
1744     offsetInShared[0] = 0;
1745     for (i=0; i<size; i++) {
1746     if (send_len[i] > 0) {
1747 caltinay 4829 neighbor[num_neighbors] = i;
1748     num_neighbors ++;
1749     j += send_len[i];
1750     offsetInShared[num_neighbors] = j;
1751 lgao 3779 }
1752     }
1753 jfenwick 4275 delete[] shared;
1754     shared = new index_t[j];
1755 lgao 3779 for (i=0, j=0; i<num_neighbors; i++) {
1756     k = neighbor[i];
1757 lgao 3828 #ifdef ESYS_MPI
1758 lgao 3779 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1759 caltinay 4829 mpi_info->msg_tag_counter+k,
1760     mpi_info->comm, &mpi_requests[i]);
1761 lgao 3828 #endif
1762 lgao 3779 j += send_len[k];
1763     }
1764     for (i=0, j=0; i<recv->numNeighbors; i++) {
1765     k = recv->neighbor[i];
1766 lgao 3828 #ifdef ESYS_MPI
1767 lgao 3779 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1768 caltinay 4829 mpi_info->msg_tag_counter+rank,
1769     mpi_info->comm, &mpi_requests[i+num_neighbors]);
1770 lgao 3828 #endif
1771 lgao 3779 j += recv_len[k];
1772     }
1773 lgao 3828 #ifdef ESYS_MPI
1774 lgao 3779 MPI_Waitall(num_neighbors + recv->numNeighbors,
1775 caltinay 4829 mpi_requests, mpi_stati);
1776 lgao 3828 #endif
1777 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1778 lgao 3779
1779     j = offsetInShared[num_neighbors];
1780     offset = dist[rank];
1781 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1782 lgao 3779 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1783    
1784 caltinay 4816 paso::SharedComponents_ptr send(new paso::SharedComponents(
1785     num_Pmain_cols, num_neighbors, neighbor, shared,
1786     offsetInShared, 1, 0, mpi_info));
1787    
1788 caltinay 4817 col_connector.reset(new paso::Connector(send, recv));
1789 jfenwick 4275 delete[] shared;
1790 lgao 3779
1791     /* now, create row distribution (output_distribution) and col
1792     distribution (input_distribution) */
1793 caltinay 4814 input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1794     output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1795 lgao 3779
1796     /* then, prepare the sender/receiver for the row_connector, first, prepare
1797     the information for sender */
1798     sum = RAP_couple_ptr[num_Pmain_cols];
1799 jfenwick 4275 len = new dim_t[size];
1800     send_ptr = new index_t*[size];
1801     send_idx = new index_t*[size];
1802 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1803 lgao 3779 for (i=0; i<size; i++) {
1804 jfenwick 4275 send_ptr[i] = new index_t[num_Pmain_cols];
1805     send_idx[i] = new index_t[sum];
1806 lgao 3779 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1807     }
1808     memset(len, 0, sizeof(dim_t) * size);
1809     recv = col_connector->recv;
1810     sum=0;
1811     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1812     i1 = RAP_couple_ptr[i_r];
1813     i2 = RAP_couple_ptr[i_r+1];
1814     if (i2 > i1) {
1815 caltinay 4829 /* then row i_r will be in the sender of row_connector, now check
1816     how many neighbours i_r needs to be sent to */
1817     for (j=i1; j<i2; j++) {
1818     i_c = RAP_couple_idx[j];
1819     /* find out the corresponding neighbor "p" of column i_c */
1820     for (p=0; p<recv->numNeighbors; p++) {
1821     if (i_c < recv->offsetInShared[p+1]) {
1822     k = recv->neighbor[p];
1823     if (send_ptr[k][i_r] == 0) sum++;
1824     send_ptr[k][i_r] ++;
1825     send_idx[k][len[k]] = global_id_RAP[i_c];
1826     len[k] ++;
1827     break;
1828     }
1829     }
1830     }
1831 lgao 3779 }
1832     }
1833     if (global_id_RAP) {
1834 jfenwick 4275 delete[] global_id_RAP;
1835 lgao 3779 global_id_RAP = NULL;
1836     }
1837    
1838     /* now allocate the sender */
1839 jfenwick 4275 shared = new index_t[sum];
1840 lgao 3779 memset(send_len, 0, sizeof(dim_t) * size);
1841     num_neighbors=0;
1842     offsetInShared[0] = 0;
1843     for (p=0, k=0; p<size; p++) {
1844     for (i=0; i<num_Pmain_cols; i++) {
1845 caltinay 4829 if (send_ptr[p][i] > 0) {
1846     shared[k] = i;
1847     k++;
1848     send_ptr[p][send_len[p]] = send_ptr[p][i];
1849     send_len[p]++;
1850     }
1851 lgao 3779 }
1852     if (k > offsetInShared[num_neighbors]) {
1853 caltinay 4829 neighbor[num_neighbors] = p;
1854     num_neighbors ++;
1855     offsetInShared[num_neighbors] = k;
1856 lgao 3779 }
1857     }
1858 caltinay 4816 send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1859     neighbor, shared, offsetInShared, 1, 0, mpi_info));
1860 lgao 3779
1861     /* send/recv the number of rows to be sent from the current proc;
1862     this recovers the info for the receiver of row_connector from the sender */
1863 lgao 3828 #ifdef ESYS_MPI
1864 lgao 3779 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1865 lgao 3828 #endif
1866 lgao 3779 num_neighbors = 0;
1867     offsetInShared[0] = 0;
1868     j = 0;
1869     for (i=0; i<size; i++) {
1870     if (i != rank && recv_len[i] > 0) {
1871 caltinay 4829 neighbor[num_neighbors] = i;
1872     num_neighbors ++;
1873     j += recv_len[i];
1874     offsetInShared[num_neighbors] = j;
1875 lgao 3779 }
1876     }
1877 jfenwick 4275 delete[] shared;
1878     delete[] recv_len;
1879     shared = new index_t[j];
1880 lgao 3779 k = offsetInShared[num_neighbors];
1881 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1882 lgao 3779 for (i=0; i<k; i++) {
1883     shared[i] = i + num_Pmain_cols;
1884     }
1885 caltinay 4816 recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1886     neighbor, shared, offsetInShared, 1, 0, mpi_info));
1887 caltinay 4817 row_connector.reset(new paso::Connector(send, recv));
1888 jfenwick 4275 delete[] shared;
1889 lgao 3779
1890     /* send/recv pattern->ptr for rowCoupleBlock */
1891     num_RAPext_rows = offsetInShared[num_neighbors];
1892 jfenwick 4275 row_couple_ptr = new index_t[num_RAPext_rows+1];
1893 lgao 3779 for (p=0; p<num_neighbors; p++) {
1894     j = offsetInShared[p];
1895     i = offsetInShared[p+1];
1896 lgao 3828 #ifdef ESYS_MPI
1897 lgao 3779 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1898 caltinay 4829 mpi_info->msg_tag_counter+neighbor[p],
1899     mpi_info->comm, &mpi_requests[p]);
1900 lgao 3828 #endif
1901 lgao 3779 }
1902     send = row_connector->send;
1903     for (p=0; p<send->numNeighbors; p++) {
1904 lgao 3828 #ifdef ESYS_MPI
1905 lgao 3779 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1906 caltinay 4829 MPI_INT, send->neighbor[p],
1907     mpi_info->msg_tag_counter+rank,
1908     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1909 lgao 3828 #endif
1910 lgao 3779 }
1911 lgao 3828 #ifdef ESYS_MPI
1912 lgao 3779 MPI_Waitall(num_neighbors + send->numNeighbors,
1913 caltinay 4829 mpi_requests, mpi_stati);
1914 lgao 3828 #endif
1915 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1916 jfenwick 4275 delete[] send_len;
1917 lgao 3779
1918     sum = 0;
1919     for (i=0; i<num_RAPext_rows; i++) {
1920     k = row_couple_ptr[i];
1921     row_couple_ptr[i] = sum;
1922     sum += k;
1923     }
1924     row_couple_ptr[num_RAPext_rows] = sum;
1925    
1926     /* send/recv pattern->index for rowCoupleBlock */
1927     k = row_couple_ptr[num_RAPext_rows];
1928 jfenwick 4275 row_couple_idx = new index_t[k];
1929 lgao 3779 for (p=0; p<num_neighbors; p++) {
1930     j1 = row_couple_ptr[offsetInShared[p]];
1931     j2 = row_couple_ptr[offsetInShared[p+1]];
1932 lgao 3828 #ifdef ESYS_MPI
1933 lgao 3779 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1934 caltinay 4829 mpi_info->msg_tag_counter+neighbor[p],
1935     mpi_info->comm, &mpi_requests[p]);
1936 lgao 3828 #endif
1937 lgao 3779 }
1938     for (p=0; p<send->numNeighbors; p++) {
1939 lgao 3828 #ifdef ESYS_MPI
1940 lgao 3779 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1941 caltinay 4829 MPI_INT, send->neighbor[p],
1942     mpi_info->msg_tag_counter+rank,
1943     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1944 lgao 3828 #endif
1945 lgao 3779 }
1946 lgao 3828 #ifdef ESYS_MPI
1947 lgao 3779 MPI_Waitall(num_neighbors + send->numNeighbors,
1948 caltinay 4829 mpi_requests, mpi_stati);
1949 lgao 3828 #endif
1950 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1951 lgao 3779
1952 caltinay 4836 offset = input_dist->first_component[rank];
1953     k = row_couple_ptr[num_RAPext_rows];
1954     #pragma omp parallel for schedule(static) private(i)
1955     for (i=0; i<k; i++) {
1956     row_couple_idx[i] -= offset;
1957     }
1958     #pragma omp parallel for schedule(static) private(i)
1959     for (i=0; i<size; i++) {
1960     delete[] send_ptr[i];
1961     delete[] send_idx[i];
1962     }
1963     delete[] send_ptr;
1964     delete[] send_idx;
1965     delete[] len;
1966     delete[] offsetInShared;
1967     delete[] neighbor;
1968     delete[] mpi_requests;
1969     delete[] mpi_stati;
1970     Esys_MPIInfo_free(mpi_info);
1971 lgao 3779
1972 caltinay 4836 /* Now, we can create pattern for mainBlock and coupleBlock */
1973     paso::Pattern_ptr main_pattern(new paso::Pattern(MATRIX_FORMAT_DEFAULT,
1974 caltinay 4819 num_Pmain_cols, num_Pmain_cols, RAP_main_ptr, RAP_main_idx));
1975 caltinay 4836 paso::Pattern_ptr col_couple_pattern(new paso::Pattern(
1976 caltinay 4819 MATRIX_FORMAT_DEFAULT, num_Pmain_cols, num_RAPext_cols,
1977     RAP_couple_ptr, RAP_couple_idx));
1978 caltinay 4836 paso::Pattern_ptr row_couple_pattern(new paso::Pattern(
1979 caltinay 4819 MATRIX_FORMAT_DEFAULT, num_RAPext_rows, num_Pmain_cols,
1980     row_couple_ptr, row_couple_idx));
1981 lgao 3779
1982 caltinay 4836 /* next, create the system matrix */
1983     pattern.reset(new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1984 lgao 3779 output_dist, input_dist, main_pattern, col_couple_pattern,
1985 caltinay 4818 row_couple_pattern,