/[escript]/trunk/paso/src/AMG_Interpolation.cpp
ViewVC logotype

Annotation of /trunk/paso/src/AMG_Interpolation.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4934 - (hide annotations)
Tue May 13 00:28:11 2014 UTC (5 years, 5 months ago) by jfenwick
File size: 132779 byte(s)
This commit is brought to you by the number 4934 and the tool "meld".
Merge of partially complete split world code from branch.



1 jfenwick 3981 /*****************************************************************************
2 lgao 3763 *
3 jfenwick 4657 * Copyright (c) 2003-2014 by University of Queensland
4 jfenwick 3981 * http://www.uq.edu.au
5 lgao 3763 *
6     * Primary Business: Queensland, Australia
7     * Licensed under the Open Software License version 3.0
8     * http://www.opensource.org/licenses/osl-3.0.php
9     *
10 jfenwick 3981 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 jfenwick 4657 * Development 2012-2013 by School of Earth Sciences
12     * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 jfenwick 3981 *
14     *****************************************************************************/
15 lgao 3763
16    
17 caltinay 4816 /****************************************************************************/
18 lgao 3763
19     /* Paso: defines AMG Interpolation */
20    
21 caltinay 4816 /****************************************************************************/
22 lgao 3763
23     /* Author: l.gao@uq.edu.au */
24    
25 caltinay 4816 /****************************************************************************/
26 lgao 3763
27     #include "Paso.h"
28     #include "SparseMatrix.h"
29     #include "PasoUtil.h"
30     #include "Preconditioner.h"
31    
32 caltinay 4869 #include <cstring> // memcpy
33    
34 caltinay 4843 namespace paso {
35    
36 caltinay 4816 /****************************************************************************
37 lgao 3763
38 jfenwick 4345 Methods necessary for AMG preconditioner
39 lgao 3763
40     construct n_C x n_C interpolation operator A_C from matrix A
41     and prolongation matrix P.
42    
43     The coarsening operator A_C is defined by RAP where R=P^T.
44    
45 caltinay 4816 *****************************************************************************/
46 lgao 3763
/* Extend system matrix B with extra two sparse matrices:
        B_ext_main and B_ext_couple
   The combination of these two sparse matrices represents
   the portion of B that is stored on neighbour procs and
   needed locally for the triple matrix product (B^T) A B.

   FOR NOW, we assume that the system matrix B has a NULL
   row_coupleBlock and a NULL remote_coupleBlock. Based on
   this assumption, we use link row_coupleBlock for sparse
   matrix B_ext_main, and link remote_coupleBlock for sparse
   matrix B_ext_couple.

   To be specific, B_ext (on processor k) are group of rows
   in B, where the list of rows from processor q is given by
        A->col_coupler->connector->send->shared[rPtr...]
        rPtr=A->col_coupler->connector->send->OffsetInShared[k]
   on q.
*/
void Preconditioner_AMG_extendB(SystemMatrix_ptr A, SystemMatrix_ptr B)
{
    // Nothing to exchange in a serial run.
    if (A->mpi_info->size == 1) return;

    // Precondition: remote_coupleBlock must still be free so it can be
    // repurposed to hold B_ext_couple (see comment above).
    if (B->remote_coupleBlock.get()) {
        Esys_setError(VALUE_ERROR, "Preconditioner_AMG_extendB: the link to "
                "remote_coupleBlock has already been set.");
        return;
    }
#ifdef ESYS_MPI
    // row_coupleBlock is about to be repurposed for B_ext_main; drop any
    // existing content first.
    B->row_coupleBlock.reset();

    Pattern_ptr pattern_main, pattern_couple;
    Coupler_ptr coupler;
    SharedComponents_ptr send, recv;
    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
    index_t *idx_m=NULL, *idx_c=NULL;
    index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
    index_t offset, len, block_size, block_size_size, max_num_cols;
    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
    dim_t *recv_degree=NULL, *send_degree=NULL;
    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;

    /* Phase 1: send/receive the unknowns' global column IDs so that the
       (local) indices of B->col_coupleBlock can be translated to global
       column numbers below. The IDs are shipped as doubles through the
       generic Coupler. */
    num_main_cols = B->mainBlock->numCols;
    cols = new double[num_main_cols];
    offset = B->col_distribution->first_component[rank]; // first global col owned by this rank
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
    if (B->global_id == NULL) {
        // global IDs not cached yet - start the exchange, completed further down
        coupler.reset(new Coupler(B->col_coupler->connector, 1));
        coupler->startCollect(cols);
    }

    // Allgatherv bookkeeping: one slot per rank (degree 1, offset i) for the
    // first gather of per-rank couple-column counts.
    recv_buf = new index_t[size];
    recv_degree = new dim_t[size];
    recv_offset = new index_t[size+1];
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<size; i++){
        recv_buf[i] = 0;
        recv_degree[i] = 1;
        recv_offset[i] = i;
    }

    block_size = B->block_size;
    block_size_size = block_size * sizeof(double); // bytes per block entry
    num_couple_cols = B->col_coupleBlock->numCols;
    send = A->col_coupler->connector->send;
    recv = A->col_coupler->connector->recv;
    num_neighbors = send->numNeighbors;
    p = send->offsetInShared[num_neighbors]; // total number of rows to be sent
    // NOTE(review): this sizes the send buffers as (#shared rows) x (total
    // number of global columns of B) - a very generous upper bound on the
    // number of nonzeros to ship. Looks like deliberate over-allocation
    // rather than a tight bound - TODO confirm.
    len = p * B->col_distribution->first_component[size];
    send_buf = new double[len * block_size];
    send_idx = new index_t[len];
    send_offset = new index_t[(p+1)*2];
    send_degree = new dim_t[num_neighbors];
    i = num_main_cols + num_couple_cols; // max nonzeros in any single row
    send_m = new double[i * block_size]; // staging: entries bound for B_ext_main
    send_c = new double[i * block_size]; // staging: entries bound for B_ext_couple
    idx_m = new index_t[i];
    idx_c = new index_t[i];

    /* waiting for receiving unknowns' global IDs (exchange started above) */
    if (B->global_id == NULL) {
        coupler->finishCollect();
        global_id = new index_t[num_couple_cols];
#pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_couple_cols; ++i)
            global_id[i] = coupler->recv_buffer[i];
        B->global_id = global_id; // cache on B; B owns this array from now on
    } else
        global_id = B->global_id;

    /* distribute the number of cols in current col_coupleBlock to all ranks */
    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);

    /* distribute global_ids of cols to be considered to all ranks */
    len = 0;
    max_num_cols = 0;
    for (i=0; i<size; i++){
        recv_degree[i] = recv_buf[i];
        recv_offset[i] = len;
        len += recv_buf[i];
        if (max_num_cols < recv_buf[i])
            max_num_cols = recv_buf[i];
    }
    recv_offset[size] = len;
    cols_array = new index_t[len];
    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);

    /* Phase 2: exchange row degrees. For every row we will receive, post a
       receive for a pair (main-degree, couple-degree) - hence the factor 2. */
    q = recv->numNeighbors;
    len = recv->offsetInShared[q]; // total number of rows to be received
    ptr_ptr = new index_t[(len+1) * 2];
    for (p=0; p<q; p++) {
        row = recv->offsetInShared[p];
        m = recv->offsetInShared[p + 1];
        MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
                A->mpi_info->msg_tag_counter+recv->neighbor[p],
                A->mpi_info->comm,
                &(A->col_coupler->mpi_requests[p]));
    }

    /* now prepare the rows to be sent (the degree, the offset and the data).
       For each shared row, entries whose global column falls inside the
       receiving neighbour's main range [m_lb, m_ub) go to B_ext_main (with
       the column re-based to the neighbour's local numbering); everything
       else goes to B_ext_couple with its global column id. */
    len = 0; // running offset into send_idx / send_buf
    i0 = 0;  // first row index of the current neighbour's batch
    for (p=0; p<num_neighbors; p++) {
        i = i0;
        neighbor = send->neighbor[p];
        m_lb = B->col_distribution->first_component[neighbor];
        m_ub = B->col_distribution->first_component[neighbor + 1];
        j_ub = send->offsetInShared[p + 1];
        for (j=send->offsetInShared[p]; j<j_ub; j++) {
            row = send->shared[j];
            l_m = 0; // entries collected for B_ext_main of this row
            l_c = 0; // entries collected for B_ext_couple of this row
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            k_lb = B->col_coupleBlock->pattern->ptr[row];

            /* check part of col_coupleBlock for data to be passed @row.
               col_coupleBlock columns (in global numbering) are scanned
               until they pass this rank's own range, so that mainBlock's
               entries can be spliced in at the right position to keep
               idx_c sorted by global column. */
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m > offset) break;
                if (m>= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }
            k_lb = k; // resume point for the second col_coupleBlock scan below

            /* check mainBlock for data to be passed @row to sparse
               matrix B_ext_couple (mainBlock columns are local to this
               rank, hence never inside the neighbour's main range; the
               values are copied in one contiguous memcpy). */
            k_ub = B->mainBlock->pattern->ptr[row + 1];
            k = B->mainBlock->pattern->ptr[row];
            memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
            for (; k<k_ub; k++) {
                m = B->mainBlock->pattern->index[k] + offset; // local -> global
                idx_c[l_c] = m;
                l_c++;
            }

            /* check the rest part of col_coupleBlock for data to
               be passed @row to sparse matrix B_ext_couple */
            k = k_lb;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m>= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }

            // Append this row's staged entries (main part first, then couple
            // part) to the flat send buffers and record the two degrees.
            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
            send_offset[2*i] = l_m;
            len += l_m;
            memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
            memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
            send_offset[2*i+1] = l_c;
            len += l_c;
            i++;
        }

        /* sending the (main,couple) degree pairs for this neighbour's rows */
        MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
                A->mpi_info->msg_tag_counter+rank,
                A->mpi_info->comm,
                &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
        send_degree[p] = len; // cumulative #entries sent up to and incl. neighbour p
        i0 = i;
    }
    delete[] send_m;
    delete[] send_c;
    delete[] idx_m;
    delete[] idx_c;

    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_main = new index_t[(len+1)];
    ptr_couple = new index_t[(len+1)];

    // Wait for all degree messages (sends and receives) of phase 2.
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors +
                A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    // Advance the message tag counter so the next phase uses fresh tags.
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

    /* Turn the received (main,couple) degree pairs into two CSR-style
       prefix-sum pointer arrays. */
    j = 0;
    k = 0;
    ptr_main[0] = 0;
    ptr_couple[0] = 0;
    for (i=0; i<len; i++) {
        j += ptr_ptr[2*i];
        k += ptr_ptr[2*i+1];
        ptr_main[i+1] = j;
        ptr_couple[i+1] = k;
    }

    delete[] ptr_ptr;
    idx_main = new index_t[j];
    idx_couple = new index_t[k];
    ptr_idx = new index_t[j+k];
    ptr_val = new double[(j+k) * block_size];

    /* Phase 3: send/receive the index arrays. Per received row the indices
       arrive as [main part | couple part], interleaved row by row. */
    j=0;
    k_ub = 0; // NOTE(review): counts active requests but is never read afterwards
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        if (i > 0) {
            k_ub ++;
            MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
                    A->mpi_info->msg_tag_counter+recv->neighbor[p],
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p]));
        }
        j += i;
    }

    j=0;
    k_ub = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j; // #entries for neighbour p (cumulative diff)
        if (i > 0){
            k_ub ++;
            MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
                    A->mpi_info->msg_tag_counter+rank,
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
        }
        j = send_degree[p];
    }

    // NOTE(review): requests for i==0 neighbours were not (re)posted above;
    // this Waitall relies on the request slots still being completed/inactive
    // from the previous phase - presumably safe, but worth confirming.
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors +
                A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

    /* De-interleave the received indices into idx_main / idx_couple.
       For row i the main part sits at ptr_idx[m+p] and the couple part at
       ptr_idx[k+p] (offsets shifted by the respective prefix sums). */
#pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            idx_main[p] = ptr_idx[m+p];
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            idx_couple[p] = ptr_idx[k+p];
        }
    }
    delete[] ptr_idx;

    /* allocate pattern and sparsematrix for B_ext_main
       (ownership of ptr_main/idx_main passes to the Pattern) */
    pattern_main.reset(new Pattern(B->col_coupleBlock->pattern->type,
                len, num_main_cols, ptr_main, idx_main));
    B->row_coupleBlock.reset(new SparseMatrix(B->col_coupleBlock->type,
                pattern_main, B->row_block_size, B->col_block_size,
                false));

    /* allocate pattern and sparsematrix for B_ext_couple
       (column dimension is the global number of columns of B) */
    pattern_couple.reset(new Pattern(B->col_coupleBlock->pattern->type,
                len, B->col_distribution->first_component[size],
                ptr_couple, idx_couple));
    B->remote_coupleBlock.reset(new SparseMatrix(B->col_coupleBlock->type,
                pattern_couple, B->row_block_size, B->col_block_size,
                false));

    /* Phase 4: send/receive the value arrays (same layout as the index
       exchange, scaled by block_size). */
    j=0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        if (i > 0)
            MPI_Irecv(&(ptr_val[j]), i * block_size,
                    MPI_DOUBLE, recv->neighbor[p],
                    A->mpi_info->msg_tag_counter+recv->neighbor[p],
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p]));
        j += (i * block_size);
    }

    j=0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
        if (i > 0)
            MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
                    A->mpi_info->msg_tag_counter+rank,
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
        j = send_degree[p] ;
    }

    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

    // De-interleave the received values into the two new blocks, mirroring
    // the index de-interleaving above.
#pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
        }
    }

    // Release all scratch buffers (global_id is intentionally kept - it is
    // now owned by B; ptr_main/ptr_couple/idx_* are owned by the patterns).
    delete[] ptr_val;
    delete[] cols;
    delete[] cols_array;
    delete[] recv_offset;
    delete[] recv_degree;
    delete[] recv_buf;
    delete[] send_buf;
    delete[] send_offset;
    delete[] send_degree;
    delete[] send_idx;
#endif // ESYS_MPI
}
411    
/* As defined, sparse matrix (let's call it T) defined by T(ptr, idx, val)
   has the same number of rows as P->col_coupleBlock->numCols. Now, we need
   to copy block of data in T to neighbour processors, defined by
        P->col_coupler->connector->recv->neighbor[k] where k is in
        [0, P->col_coupler->connector->recv->numNeighbors).
   Rows to be copied to neighbor processor k is in the list defined by
        P->col_coupler->connector->recv->offsetInShared[k] ...
        P->col_coupler->connector->recv->offsetInShared[k+1]

   On return *p_ptr/*p_idx/*p_val are replaced by the received CSR arrays;
   the arrays passed in are deleted here (ownership transfers in).
   NOTE(review): the global_id parameter is not used in this body -
   presumably kept for interface compatibility; verify against callers. */
void Preconditioner_AMG_CopyRemoteData(SystemMatrix_ptr P,
        index_t **p_ptr, index_t **p_idx, double **p_val,
        index_t *global_id, index_t block_size)
{
    SharedComponents_ptr send, recv;
    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
    index_t i, j, p, m, n, size;
    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
    index_t *ptr=*p_ptr, *idx=*p_idx;
    double *val=*p_val, *recv_val=NULL;
#ifdef ESYS_MPI
    index_t rank = P->mpi_info->rank;
#endif

    size = P->mpi_info->size;
    // Note the deliberate swap: data flows in the opposite direction of the
    // coupler's usual exchange, so the connector's recv side is used for
    // sending here and vice versa.
    send = P->col_coupler->connector->recv;
    recv = P->col_coupler->connector->send;
    send_neighbors = send->numNeighbors;
    recv_neighbors = recv->numNeighbors;
    send_rows = P->col_coupleBlock->numCols; // #rows of T (rows we own to send)
    recv_rows = recv->offsetInShared[recv_neighbors]; // total #rows to receive

    send_degree = new index_t[send_rows];
    recv_ptr = new index_t[recv_rows + 1];
    // Row degrees of T, derived from its CSR ptr array.
    // NOTE(review): this is "#pragma omp for" without an enclosing parallel
    // region - outside a parallel region it runs sequentially; possibly
    // "#pragma omp parallel for" was intended. Behavior is correct either way.
#pragma omp for schedule(static) private(i)
    for (i=0; i<send_rows; i++)
        send_degree[i] = ptr[i+1] - ptr[i];

    /* First, send/receive the degree */
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
                P->mpi_info->msg_tag_counter + recv->neighbor[p],
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p]));
#endif
    }
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
                P->mpi_info->msg_tag_counter + rank,
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    // Fresh tags for the next exchange round.
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    delete[] send_degree;
    // In-place prefix sum turns the received degrees into a CSR ptr array;
    // m is the total number of received entries.
    m = util::cumsum(recv_rows, recv_ptr);
    recv_ptr[recv_rows] = m;
    recv_idx = new index_t[m];
    recv_val = new double[m * block_size];

    /* Next, send/receive the index array */
    j = 0; // running offset into recv_idx
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m]; // #entries expected from neighbour p
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
                    P->mpi_info->msg_tag_counter + recv->neighbor[p],
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p]));
#endif
        }
        j += i;
    }

    j = 0; // running offset into idx
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m]; // #entries to ship to neighbour p
        if (i >0) {
#ifdef ESYS_MPI
            MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
                    P->mpi_info->msg_tag_counter + rank,
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    /* Last, send/receive the data array (same layout as the index
       exchange, scaled by block_size doubles per entry) */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
                    P->mpi_info->msg_tag_counter + recv->neighbor[p],
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p]));
#endif
        j += (i*block_size);
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i >0) {
#ifdef ESYS_MPI
            MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
                    P->mpi_info->msg_tag_counter + rank,
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i * block_size;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);

    /* Clean up and return with received ptr, index and data arrays:
       the caller's original arrays are freed and replaced. */
    delete[] ptr;
    delete[] idx;
    delete[] val;
    *p_ptr = recv_ptr;
    *p_idx = recv_idx;
    *p_val = recv_val;
}
567    
568 caltinay 4843 SystemMatrix_ptr Preconditioner_AMG_buildInterpolationOperator(
569     SystemMatrix_ptr A, SystemMatrix_ptr P,
570     SystemMatrix_ptr R)
571 lgao 3763 {
572 jfenwick 4934 esysUtils::JMPI& mpi_info=A->mpi_info;
573 caltinay 4843 SystemMatrix_ptr out;
574     SystemMatrixPattern_ptr pattern;
575     Distribution_ptr input_dist, output_dist;
576     Connector_ptr col_connector, row_connector;
577 lgao 3763 const dim_t row_block_size=A->row_block_size;
578     const dim_t col_block_size=A->col_block_size;
579 lgao 3779 const dim_t block_size = A->block_size;
580     const dim_t P_block_size = P->block_size;
581 lgao 3763 const double ZERO = 0.0;
582     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
583 lgao 3779 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
584 lgao 3763 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
585     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
586     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
587     index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
588     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
589     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
590     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
591 lgao 3779 index_t sum, i, j, k, iptr, irb, icb, ib;
592 lgao 3763 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
593     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
594     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
595     index_t row_marker_ext, *where_p=NULL;
596     index_t **send_ptr=NULL, **send_idx=NULL;
597 lgao 3828 dim_t p, num_neighbors;
598 lgao 3763 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
599     Esys_MPI_rank *neighbor=NULL;
600     #ifdef ESYS_MPI
601     MPI_Request* mpi_requests=NULL;
602     MPI_Status* mpi_stati=NULL;
603     #else
604     int *mpi_requests=NULL, *mpi_stati=NULL;
605     #endif
606    
607 caltinay 4843 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
608     return Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
609 lgao 3779
610 caltinay 4843 /* two sparse matrices R_main and R_couple will be generate, as the
611     transpose of P_main and P_col_couple, respectively. Note that,
612 lgao 3763 R_couple is actually the row_coupleBlock of R (R=P^T) */
613 caltinay 4843 SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
614     SparseMatrix_ptr R_couple;
615     if (size > 1)
616 caltinay 4829 R_couple = P->col_coupleBlock->getTranspose();
617 lgao 3763
618 jfenwick 4345 /* generate P_ext, i.e. portion of P that is stored on neighbor procs
619 caltinay 4843 and needed locally for triple matrix product RAP
620     to be specific, P_ext (on processor k) are group of rows in P, where
621     the list of rows from processor q is given by
622     A->col_coupler->connector->send->shared[rPtr...]
623 caltinay 4829 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
624 caltinay 4843 on q.
625     P_ext is represented by two sparse matrices P_ext_main and
626 lgao 3779 P_ext_couple */
627 caltinay 4843 Preconditioner_AMG_extendB(A, P);
628 lgao 3779
629 caltinay 4843 /* count the number of cols in P_ext_couple, resize the pattern of
630 lgao 3779 sparse matrix P_ext_couple with new compressed order, and then
631     build the col id mapping from P->col_coupleBlock to
632 caltinay 4843 P_ext_couple */
633 lgao 3779 num_Pmain_cols = P->mainBlock->numCols;
634 lgao 3817 if (size > 1) {
635     num_Pcouple_cols = P->col_coupleBlock->numCols;
636     num_Acouple_cols = A->col_coupleBlock->numCols;
637     sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
638     } else {
639     num_Pcouple_cols = 0;
640     num_Acouple_cols = 0;
641     sum = 0;
642     }
643 lgao 3779 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
644     offset = P->col_distribution->first_component[rank];
645     num_Pext_cols = 0;
646     if (P->global_id) {
647 caltinay 4843 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
648 caltinay 4829 and P->col_coupleBlock */
649 lgao 3779 iptr = 0;
650     if (num_Pcouple_cols || sum > 0) {
651 caltinay 4829 temp = new index_t[num_Pcouple_cols+sum];
652     #pragma omp parallel for lastprivate(iptr) schedule(static)
653 caltinay 4843 for (iptr=0; iptr<sum; iptr++){
654 caltinay 4829 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
655     }
656     for (j=0; j<num_Pcouple_cols; j++, iptr++){
657     temp[iptr] = P->global_id[j];
658     }
659 lgao 3779 }
660     if (iptr) {
661 caltinay 4852 qsort(temp, (size_t)iptr, sizeof(index_t), util::comparIndex);
662 caltinay 4829 num_Pext_cols = 1;
663     i = temp[0];
664     for (j=1; j<iptr; j++) {
665     if (temp[j] > i) {
666     i = temp[j];
667     temp[num_Pext_cols++] = i;
668     }
669     }
670 lgao 3779 }
671     /* resize the pattern of P_ext_couple */
672     if(num_Pext_cols){
673 caltinay 4829 global_id_P = new index_t[num_Pext_cols];
674     #pragma omp parallel for private(i) schedule(static)
675     for (i=0; i<num_Pext_cols; i++)
676     global_id_P[i] = temp[i];
677 lgao 3779 }
678 caltinay 4843 if (num_Pcouple_cols || sum > 0)
679 caltinay 4829 delete[] temp;
680 lgao 3821 #pragma omp parallel for private(i, where_p) schedule(static)
681 lgao 3779 for (i=0; i<sum; i++) {
682 caltinay 4829 where_p = (index_t *)bsearch(
683     &(P->remote_coupleBlock->pattern->index[i]),
684 caltinay 4843 global_id_P, num_Pext_cols,
685 caltinay 4852 sizeof(index_t), util::comparIndex);
686 caltinay 4843 P->remote_coupleBlock->pattern->index[i] =
687 caltinay 4829 (index_t)(where_p -global_id_P);
688 caltinay 4843 }
689 lgao 3779
690     /* build the mapping */
691     if (num_Pcouple_cols) {
692 caltinay 4829 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
693     iptr = 0;
694     for (i=0; i<num_Pext_cols; i++)
695     if (global_id_P[i] == P->global_id[iptr]) {
696     Pcouple_to_Pext[iptr++] = i;
697     if (iptr == num_Pcouple_cols) break;
698     }
699 lgao 3779 }
700     }
701    
702 jfenwick 4345 /* alloc and initialise the makers */
703 lgao 3779 sum = num_Pext_cols + num_Pmain_cols;
704 jfenwick 4275 P_marker = new index_t[sum];
705     A_marker = new index_t[num_A_cols];
706 lgao 3779 #pragma omp parallel for private(i) schedule(static)
707     for (i=0; i<sum; i++) P_marker[i] = -1;
708     #pragma omp parallel for private(i) schedule(static)
709     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
710    
711     /* Now, count the size of RAP_ext. Start with rows in R_couple */
712     sum = 0;
713     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
714     row_marker = sum;
715     /* then, loop over elements in row i_r of R_couple */
716     j1_ub = R_couple->pattern->ptr[i_r+1];
717     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
718 caltinay 4829 i1 = R_couple->pattern->index[j1];
719     /* then, loop over elements in row i1 of A->col_coupleBlock */
720     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
721     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
722     i2 = A->col_coupleBlock->pattern->index[j2];
723 lgao 3779
724 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
725     RAP new entry is possible only if entry RA[i_r, i2] has not
726     been visited yet */
727     if (A_marker[i2] != i_r) {
728     /* first, mark entry RA[i_r,i2] as visited */
729 caltinay 4843 A_marker[i2] = i_r;
730 lgao 3779
731 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
732     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
733     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
734     i_c = P->row_coupleBlock->pattern->index[j3];
735 lgao 3779
736 caltinay 4843 /* check whether entry RAP[i_r,i_c] has been created.
737     If not yet created, create the entry and increase
738 caltinay 4829 the total number of elements in RAP_ext */
739     if (P_marker[i_c] < row_marker) {
740     P_marker[i_c] = sum;
741     sum++;
742     }
743     }
744 lgao 3779
745 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
746     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
747     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
748     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
749     if (P_marker[i_c] < row_marker) {
750     P_marker[i_c] = sum;
751     sum++;
752     }
753     }
754     }
755     }
756 lgao 3779
757 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
758     the process we have done to block A->col_coupleBlock */
759     j2_ub = A->mainBlock->pattern->ptr[i1+1];
760     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
761     i2 = A->mainBlock->pattern->index[j2];
762     if (A_marker[i2 + num_Acouple_cols] != i_r) {
763     A_marker[i2 + num_Acouple_cols] = i_r;
764     j3_ub = P->mainBlock->pattern->ptr[i2+1];
765     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
766     i_c = P->mainBlock->pattern->index[j3];
767     if (P_marker[i_c] < row_marker) {
768     P_marker[i_c] = sum;
769     sum++;
770     }
771     }
772     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
773     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
774 caltinay 4843 /* note that we need to map the column index in
775     P->col_coupleBlock back into the column index in
776 caltinay 4829 P_ext_couple */
777     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
778     if (P_marker[i_c] < row_marker) {
779     P_marker[i_c] = sum;
780     sum++;
781     }
782     }
783     }
784     }
785 lgao 3779 }
786     }
787    
788     /* Now we have the number of non-zero elements in RAP_ext, allocate
789     PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
790 jfenwick 4275 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
791     RAP_ext_idx = new index_t[sum];
792     RAP_ext_val = new double[sum * block_size];
793     RA_val = new double[block_size];
794     RAP_val = new double[block_size];
795 lgao 3779
796     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
797     sum = num_Pext_cols + num_Pmain_cols;
798     #pragma omp parallel for private(i) schedule(static)
799     for (i=0; i<sum; i++) P_marker[i] = -1;
800     #pragma omp parallel for private(i) schedule(static)
801     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
802     sum = 0;
803     RAP_ext_ptr[0] = 0;
804     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
805     row_marker = sum;
806     /* then, loop over elements in row i_r of R_couple */
807     j1_ub = R_couple->pattern->ptr[i_r+1];
808     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
809 caltinay 4829 i1 = R_couple->pattern->index[j1];
810     R_val = &(R_couple->val[j1*P_block_size]);
811 lgao 3779
812 caltinay 4829 /* then, loop over elements in row i1 of A->col_coupleBlock */
813     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
814     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
815     i2 = A->col_coupleBlock->pattern->index[j2];
816     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
817 caltinay 4843 for (irb=0; irb<row_block_size; irb++)
818     for (icb=0; icb<col_block_size; icb++)
819 lgao 3779 RA_val[irb+row_block_size*icb] = ZERO;
820 caltinay 4829 for (irb=0; irb<P_block_size; irb++) {
821     rtmp=R_val[irb];
822     for (icb=0; icb<col_block_size; icb++) {
823     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
824     }
825     }
826 lgao 3779
827 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
828     RAP new entry is possible only if entry RA[i_r, i2] has not
829     been visited yet */
830     if (A_marker[i2] != i_r) {
831     /* first, mark entry RA[i_r,i2] as visited */
832 caltinay 4843 A_marker[i2] = i_r;
833 lgao 3779
834 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
835     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
836     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
837     i_c = P->row_coupleBlock->pattern->index[j3];
838     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
839     for (irb=0; irb<row_block_size; irb++)
840     for (icb=0; icb<col_block_size; icb++)
841     RAP_val[irb+row_block_size*icb] = ZERO;
842     for (icb=0; icb<P_block_size; icb++) {
843     rtmp = temp_val[icb];
844     for (irb=0; irb<row_block_size; irb++) {
845     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
846     }
847     }
848 lgao 3779
849 caltinay 4843 /* check whether entry RAP[i_r,i_c] has been created.
850     If not yet created, create the entry and increase
851 caltinay 4829 the total number of elements in RAP_ext */
852     if (P_marker[i_c] < row_marker) {
853     P_marker[i_c] = sum;
854     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
855 caltinay 4843 RAP_ext_idx[sum] = i_c + offset;
856 caltinay 4829 sum++;
857     } else {
858     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
859     for (ib=0; ib<block_size; ib++) {
860     temp_val[ib] += RAP_val[ib];
861     }
862     }
863     }
864 lgao 3779
865 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
866     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
867     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
868     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
869 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
870 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
871     for (icb=0; icb<col_block_size; icb++)
872     RAP_val[irb+row_block_size*icb]=ZERO;
873 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
874 caltinay 4829 rtmp = temp_val[icb];
875 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
876     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
877     }
878     }
879    
880 caltinay 4829 if (P_marker[i_c] < row_marker) {
881     P_marker[i_c] = sum;
882     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
883     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
884     sum++;
885     } else {
886 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
887     for (ib=0; ib<block_size; ib++) {
888     temp_val[ib] += RAP_val[ib];
889     }
890 caltinay 4829 }
891     }
892 lgao 3779
893 caltinay 4829 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
894     Only the contributions are added. */
895     } else {
896     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
897     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
898     i_c = P->row_coupleBlock->pattern->index[j3];
899 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
900 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
901     for (icb=0; icb<col_block_size; icb++)
902     RAP_val[irb+row_block_size*icb]=ZERO;
903 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
904 caltinay 4829 rtmp = temp_val[icb];
905 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
906     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
907     }
908     }
909    
910     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
911     for (ib=0; ib<block_size; ib++) {
912     temp_val[ib] += RAP_val[ib];
913     }
914 caltinay 4829 }
915     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
916     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
917     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
918 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
919 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
920     for (icb=0; icb<col_block_size; icb++)
921     RAP_val[irb+row_block_size*icb]=ZERO;
922 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
923 caltinay 4829 rtmp = temp_val[icb];
924 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
925     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
926     }
927     }
928    
929     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
930     for (ib=0; ib<block_size; ib++) {
931     temp_val[ib] += RAP_val[ib];
932     }
933 caltinay 4829 }
934     }
935     }
936 lgao 3779
937 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
938     the process we have done to block A->col_coupleBlock */
939     j2_ub = A->mainBlock->pattern->ptr[i1+1];
940     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
941     i2 = A->mainBlock->pattern->index[j2];
942 lgao 3779 temp_val = &(A->mainBlock->val[j2*block_size]);
943 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
944     for (icb=0; icb<col_block_size; icb++)
945     RA_val[irb+row_block_size*icb]=ZERO;
946 lgao 3779 for (irb=0; irb<P_block_size; irb++) {
947 caltinay 4829 rtmp=R_val[irb];
948 lgao 3779 for (icb=0; icb<col_block_size; icb++) {
949     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
950     }
951     }
952    
953 caltinay 4829 if (A_marker[i2 + num_Acouple_cols] != i_r) {
954     A_marker[i2 + num_Acouple_cols] = i_r;
955     j3_ub = P->mainBlock->pattern->ptr[i2+1];
956     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
957     i_c = P->mainBlock->pattern->index[j3];
958 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
959 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
960     for (icb=0; icb<col_block_size; icb++)
961     RAP_val[irb+row_block_size*icb]=ZERO;
962 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
963 caltinay 4829 rtmp = temp_val[icb];
964 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
965     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
966     }
967     }
968    
969 caltinay 4829 if (P_marker[i_c] < row_marker) {
970     P_marker[i_c] = sum;
971     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
972     RAP_ext_idx[sum] = i_c + offset;
973     sum++;
974     } else {
975 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
976     for (ib=0; ib<block_size; ib++) {
977     temp_val[ib] += RAP_val[ib];
978     }
979 caltinay 4829 }
980     }
981     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
982     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
983     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
984 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
985 caltinay 4843 for (irb=0; irb<row_block_size; irb++)
986     for (icb=0; icb<col_block_size; icb++)
987 caltinay 4829 RAP_val[irb+row_block_size*icb]=ZERO;
988 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
989 caltinay 4829 rtmp=temp_val[icb];
990 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
991     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
992     }
993     }
994    
995 caltinay 4829 if (P_marker[i_c] < row_marker) {
996     P_marker[i_c] = sum;
997     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
998 lgao 3779 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
999 caltinay 4829 sum++;
1000     } else {
1001 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1002     for (ib=0; ib<block_size; ib++) {
1003     temp_val[ib] += RAP_val[ib];
1004     }
1005 caltinay 4829 }
1006     }
1007     } else {
1008     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1009     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1010     i_c = P->mainBlock->pattern->index[j3];
1011 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1012 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1013     for (icb=0; icb<col_block_size; icb++)
1014     RAP_val[irb+row_block_size*icb]=ZERO;
1015 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1016 caltinay 4829 rtmp=temp_val[icb];
1017 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1018     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1019     }
1020     }
1021    
1022     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1023     for (ib=0; ib<block_size; ib++) {
1024     temp_val[ib] += RAP_val[ib];
1025     }
1026 caltinay 4829 }
1027     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1028     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1029     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1030 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1031 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1032     for (icb=0; icb<col_block_size; icb++)
1033     RAP_val[irb+row_block_size*icb]=ZERO;
1034 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1035 caltinay 4829 rtmp=temp_val[icb];
1036 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1037     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1038     }
1039     }
1040    
1041     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1042     for (ib=0; ib<block_size; ib++) {
1043     temp_val[ib] += RAP_val[ib];
1044     }
1045 caltinay 4829 }
1046     }
1047     }
1048 lgao 3779 }
1049     RAP_ext_ptr[i_r+1] = sum;
1050     }
1051 jfenwick 4275 delete[] P_marker;
1052     delete[] Pcouple_to_Pext;
1053 lgao 3779
1054 caltinay 4843 /* Now we have part of RAP[r,c] where row "r" is the list of rows
1055     which is given by the column list of P->col_coupleBlock, and
1056 jfenwick 4345 column "c" is the list of columns which possibly covers the
1057     whole column range of system matrix P. This part of data is to
1058     be passed to neighbouring processors, and added to corresponding
1059     RAP[r,c] entries in the neighbouring processors */
1060 caltinay 4843 Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1061 caltinay 4829 &RAP_ext_val, global_id_P, block_size);
1062 lgao 3779
1063     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1064     sum = RAP_ext_ptr[num_RAPext_rows];
1065     num_RAPext_cols = 0;
1066     if (num_Pext_cols || sum > 0) {
1067 jfenwick 4275 temp = new index_t[num_Pext_cols+sum];
1068 lgao 3779 j1_ub = offset + num_Pmain_cols;
1069     for (i=0, iptr=0; i<sum; i++) {
1070 caltinay 4843 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1071 caltinay 4829 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1072 lgao 3779 }
1073     for (j=0; j<num_Pext_cols; j++, iptr++){
1074 caltinay 4829 temp[iptr] = global_id_P[j];
1075 lgao 3779 }
1076    
1077     if (iptr) {
1078 caltinay 4852 qsort(temp, (size_t)iptr, sizeof(index_t), util::comparIndex);
1079 caltinay 4829 num_RAPext_cols = 1;
1080     i = temp[0];
1081     for (j=1; j<iptr; j++) {
1082     if (temp[j] > i) {
1083     i = temp[j];
1084     temp[num_RAPext_cols++] = i;
1085     }
1086     }
1087 lgao 3779 }
1088     }
1089    
1090     /* resize the pattern of P_ext_couple */
1091     if(num_RAPext_cols){
1092 jfenwick 4275 global_id_RAP = new index_t[num_RAPext_cols];
1093 lgao 3821 #pragma omp parallel for private(i) schedule(static)
1094 lgao 3779 for (i=0; i<num_RAPext_cols; i++)
1095 caltinay 4829 global_id_RAP[i] = temp[i];
1096 lgao 3779 }
1097     if (num_Pext_cols || sum > 0)
1098 jfenwick 4275 delete[] temp;
1099 lgao 3779 j1_ub = offset + num_Pmain_cols;
1100 lgao 3821 #pragma omp parallel for private(i, where_p) schedule(static)
1101 lgao 3779 for (i=0; i<sum; i++) {
1102     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1103 caltinay 4829 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1104 caltinay 4852 /*XXX*/ num_RAPext_cols, sizeof(index_t), util::comparIndex);
1105 caltinay 4829 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1106 caltinay 4843 } else
1107 caltinay 4829 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1108 lgao 3779 }
1109    
1110     /* build the mapping */
1111     if (num_Pcouple_cols) {
1112 jfenwick 4275 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1113 lgao 3779 iptr = 0;
1114 caltinay 4843 for (i=0; i<num_RAPext_cols; i++)
1115 caltinay 4829 if (global_id_RAP[i] == P->global_id[iptr]) {
1116     Pcouple_to_RAP[iptr++] = i;
1117     if (iptr == num_Pcouple_cols) break;
1118     }
1119 lgao 3779 }
1120    
1121     if (num_Pext_cols) {
1122 jfenwick 4275 Pext_to_RAP = new index_t[num_Pext_cols];
1123 lgao 3779 iptr = 0;
1124     for (i=0; i<num_RAPext_cols; i++)
1125 caltinay 4829 if (global_id_RAP[i] == global_id_P[iptr]) {
1126     Pext_to_RAP[iptr++] = i;
1127     if (iptr == num_Pext_cols) break;
1128     }
1129 lgao 3779 }
1130    
1131     if (global_id_P){
1132 jfenwick 4275 delete[] global_id_P;
1133 lgao 3779 global_id_P = NULL;
1134     }
1135    
1136 jfenwick 4345 /* alloc and initialise the makers */
1137 lgao 3779 sum = num_RAPext_cols + num_Pmain_cols;
1138 jfenwick 4275 P_marker = new index_t[sum];
1139 lgao 3779 #pragma omp parallel for private(i) schedule(static)
1140     for (i=0; i<sum; i++) P_marker[i] = -1;
1141     #pragma omp parallel for private(i) schedule(static)
1142     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1143    
1144     /* Now, count the size of RAP. Start with rows in R_main */
1145     num_neighbors = P->col_coupler->connector->send->numNeighbors;
1146     offsetInShared = P->col_coupler->connector->send->offsetInShared;
1147     shared = P->col_coupler->connector->send->shared;
1148     i = 0;
1149     j = 0;
1150     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1151     /* Mark the start of row for both main block and couple block */
1152     row_marker = i;
1153     row_marker_ext = j;
1154    
1155     /* Mark the diagonal element RAP[i_r, i_r], and other elements
1156 caltinay 4829 in RAP_ext */
1157 lgao 3779 P_marker[i_r] = i;
1158     i++;
1159     for (j1=0; j1<num_neighbors; j1++) {
1160 caltinay 4829 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1161     if (shared[j2] == i_r) {
1162     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1163     i_c = RAP_ext_idx[k];
1164     if (i_c < num_Pmain_cols) {
1165     if (P_marker[i_c] < row_marker) {
1166     P_marker[i_c] = i;
1167     i++;
1168     }
1169     } else {
1170     if (P_marker[i_c] < row_marker_ext) {
1171     P_marker[i_c] = j;
1172     j++;
1173     }
1174     }
1175     }
1176     break;
1177     }
1178     }
1179 lgao 3779 }
1180    
1181     /* then, loop over elements in row i_r of R_main */
1182     j1_ub = R_main->pattern->ptr[i_r+1];
1183     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1184 caltinay 4829 i1 = R_main->pattern->index[j1];
1185 lgao 3779
1186 caltinay 4829 /* then, loop over elements in row i1 of A->col_coupleBlock */
1187     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1188     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1189     i2 = A->col_coupleBlock->pattern->index[j2];
1190 lgao 3779
1191 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
1192     RAP new entry is possible only if entry RA[i_r, i2] has not
1193     been visited yet */
1194     if (A_marker[i2] != i_r) {
1195     /* first, mark entry RA[i_r,i2] as visited */
1196 caltinay 4843 A_marker[i2] = i_r;
1197 lgao 3779
1198 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
1199     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1200     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1201     i_c = P->row_coupleBlock->pattern->index[j3];
1202 lgao 3779
1203 caltinay 4843 /* check whether entry RAP[i_r,i_c] has been created.
1204     If not yet created, create the entry and increase
1205 caltinay 4829 the total number of elements in RAP_ext */
1206     if (P_marker[i_c] < row_marker) {
1207     P_marker[i_c] = i;
1208     i++;
1209     }
1210     }
1211 lgao 3779
1212 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
1213     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1214     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1215     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1216     if (P_marker[i_c] < row_marker_ext) {
1217     P_marker[i_c] = j;
1218     j++;
1219     }
1220     }
1221     }
1222     }
1223 lgao 3779
1224 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
1225     the process we have done to block A->col_coupleBlock */
1226     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1227     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1228     i2 = A->mainBlock->pattern->index[j2];
1229     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1230     A_marker[i2 + num_Acouple_cols] = i_r;
1231     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1232     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1233     i_c = P->mainBlock->pattern->index[j3];
1234     if (P_marker[i_c] < row_marker) {
1235     P_marker[i_c] = i;
1236     i++;
1237     }
1238     }
1239     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1240     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1241 caltinay 4843 /* note that we need to map the column index in
1242     P->col_coupleBlock back into the column index in
1243 caltinay 4829 P_ext_couple */
1244     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1245     if (P_marker[i_c] < row_marker_ext) {
1246     P_marker[i_c] = j;
1247     j++;
1248     }
1249     }
1250     }
1251     }
1252 lgao 3779 }
1253     }
1254    
1255     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1256 caltinay 4843 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1257 lgao 3779 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1258     RAP_couple */
1259 jfenwick 4275 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1260     RAP_main_idx = new index_t[i];
1261     RAP_main_val = new double[i * block_size];
1262     RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1263     RAP_couple_idx = new index_t[j];
1264     RAP_couple_val = new double[j * block_size];
1265 lgao 3779
1266     RAP_main_ptr[num_Pmain_cols] = i;
1267     RAP_couple_ptr[num_Pmain_cols] = j;
1268    
1269     #pragma omp parallel for private(i) schedule(static)
1270     for (i=0; i<sum; i++) P_marker[i] = -1;
1271     #pragma omp parallel for private(i) schedule(static)
1272     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1273    
1274 caltinay 4843 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1275 lgao 3779 in R_main */
1276     i = 0;
1277     j = 0;
1278     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1279     /* Mark the start of row for both main block and couple block */
1280     row_marker = i;
1281     row_marker_ext = j;
1282     RAP_main_ptr[i_r] = row_marker;
1283     RAP_couple_ptr[i_r] = row_marker_ext;
1284    
1285     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1286 caltinay 4829 in row i_r of RAP_ext */
1287 lgao 3779 P_marker[i_r] = i;
1288     for (ib=0; ib<block_size; ib++)
1289     RAP_main_val[i*block_size+ib] = ZERO;
1290     RAP_main_idx[i] = i_r;
1291     i++;
1292    
1293     for (j1=0; j1<num_neighbors; j1++) {
1294 caltinay 4829 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1295     if (shared[j2] == i_r) {
1296     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1297     i_c = RAP_ext_idx[k];
1298     if (i_c < num_Pmain_cols) {
1299     if (P_marker[i_c] < row_marker) {
1300     P_marker[i_c] = i;
1301     memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1302     RAP_main_idx[i] = i_c;
1303     i++;
1304     } else {
1305     t1_val = &(RAP_ext_val[k*block_size]);
1306     t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1307     for (ib=0; ib<block_size; ib++)
1308     t2_val[ib] += t1_val[ib];
1309     }
1310     } else {
1311     if (P_marker[i_c] < row_marker_ext) {
1312     P_marker[i_c] = j;
1313     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1314     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1315     j++;
1316     } else {
1317     t1_val = &(RAP_ext_val[k*block_size]);
1318     t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1319     for (ib=0; ib<block_size; ib++)
1320     t2_val[ib] += t1_val[ib];
1321     }
1322     }
1323     }
1324     break;
1325     }
1326     }
1327 lgao 3779 }
1328    
1329     /* then, loop over elements in row i_r of R_main */
1330     j1_ub = R_main->pattern->ptr[i_r+1];
1331     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1332 caltinay 4829 i1 = R_main->pattern->index[j1];
1333     R_val = &(R_main->val[j1*P_block_size]);
1334 lgao 3779
1335 caltinay 4829 /* then, loop over elements in row i1 of A->col_coupleBlock */
1336     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1337     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1338     i2 = A->col_coupleBlock->pattern->index[j2];
1339 lgao 3779 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1340 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1341     for (icb=0; icb<col_block_size; icb++)
1342     RA_val[irb+row_block_size*icb]=ZERO;
1343 lgao 3779 for (irb=0; irb<P_block_size; irb++) {
1344 caltinay 4829 rtmp=R_val[irb];
1345 lgao 3779 for (icb=0; icb<col_block_size; icb++) {
1346     RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1347     }
1348     }
1349    
1350    
1351 caltinay 4829 /* check whether entry RA[i_r, i2] has been previously visited.
1352     RAP new entry is possible only if entry RA[i_r, i2] has not
1353     been visited yet */
1354     if (A_marker[i2] != i_r) {
1355     /* first, mark entry RA[i_r,i2] as visited */
1356 caltinay 4843 A_marker[i2] = i_r;
1357 lgao 3779
1358 caltinay 4829 /* then loop over elements in row i2 of P_ext_main */
1359     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1360     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1361     i_c = P->row_coupleBlock->pattern->index[j3];
1362 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1363 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1364     for (icb=0; icb<col_block_size; icb++)
1365     RAP_val[irb+row_block_size*icb]=ZERO;
1366 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1367 caltinay 4829 rtmp=temp_val[icb];
1368 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1369     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1370     }
1371     }
1372    
1373    
1374 caltinay 4843 /* check whether entry RAP[i_r,i_c] has been created.
1375     If not yet created, create the entry and increase
1376 caltinay 4829 the total number of elements in RAP_ext */
1377     if (P_marker[i_c] < row_marker) {
1378     P_marker[i_c] = i;
1379     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1380     RAP_main_idx[i] = i_c;
1381     i++;
1382     } else {
1383 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1384     for (ib=0; ib<block_size; ib++) {
1385     temp_val[ib] += RAP_val[ib];
1386     }
1387 caltinay 4829 }
1388     }
1389 lgao 3779
1390 caltinay 4829 /* loop over elements in row i2 of P_ext_couple, do the same */
1391     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1392     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1393     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1394 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1395 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1396     for (icb=0; icb<col_block_size; icb++)
1397     RAP_val[irb+row_block_size*icb]=ZERO;
1398 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1399 caltinay 4829 rtmp=temp_val[icb];
1400 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1401     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1402     }
1403     }
1404    
1405 caltinay 4829 if (P_marker[i_c] < row_marker_ext) {
1406     P_marker[i_c] = j;
1407     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1408     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1409     j++;
1410     } else {
1411 lgao 3779 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1412     for (ib=0; ib<block_size; ib++) {
1413     temp_val[ib] += RAP_val[ib];
1414     }
1415 caltinay 4829 }
1416     }
1417 lgao 3779
1418 caltinay 4829 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1419     Only the contributions are added. */
1420     } else {
1421     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1422     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1423     i_c = P->row_coupleBlock->pattern->index[j3];
1424 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1425 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1426     for (icb=0; icb<col_block_size; icb++)
1427     RAP_val[irb+row_block_size*icb]=ZERO;
1428 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1429 caltinay 4829 rtmp=temp_val[icb];
1430 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1431     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1432     }
1433     }
1434    
1435     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1436     for (ib=0; ib<block_size; ib++) {
1437     temp_val[ib] += RAP_val[ib];
1438     }
1439 caltinay 4829 }
1440     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1441     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1442     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1443 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1444 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1445     for (icb=0; icb<col_block_size; icb++)
1446     RAP_val[irb+row_block_size*icb]=ZERO;
1447 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1448 caltinay 4829 rtmp=temp_val[icb];
1449 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1450     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1451     }
1452     }
1453    
1454     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1455     for (ib=0; ib<block_size; ib++) {
1456     temp_val[ib] += RAP_val[ib];
1457     }
1458 caltinay 4829 }
1459     }
1460     }
1461 lgao 3779
1462 caltinay 4829 /* now loop over elements in row i1 of A->mainBlock, repeat
1463     the process we have done to block A->col_coupleBlock */
1464     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1465     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1466     i2 = A->mainBlock->pattern->index[j2];
1467 lgao 3779 temp_val = &(A->mainBlock->val[j2*block_size]);
1468 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1469     for (icb=0; icb<col_block_size; icb++)
1470     RA_val[irb+row_block_size*icb]=ZERO;
1471 lgao 3779 for (irb=0; irb<P_block_size; irb++) {
1472 caltinay 4829 rtmp=R_val[irb];
1473 lgao 3779 for (icb=0; icb<col_block_size; icb++) {
1474     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1475     }
1476     }
1477    
1478 caltinay 4829 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1479     A_marker[i2 + num_Acouple_cols] = i_r;
1480     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1481     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1482     i_c = P->mainBlock->pattern->index[j3];
1483 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1484 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1485     for (icb=0; icb<col_block_size; icb++)
1486     RAP_val[irb+row_block_size*icb]=ZERO;
1487 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1488 caltinay 4829 rtmp=temp_val[icb];
1489 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1490     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1491     }
1492     }
1493 caltinay 4829 if (P_marker[i_c] < row_marker) {
1494     P_marker[i_c] = i;
1495     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1496     RAP_main_idx[i] = i_c;
1497     i++;
1498     } else {
1499 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1500     for (ib=0; ib<block_size; ib++) {
1501     temp_val[ib] += RAP_val[ib];
1502     }
1503 caltinay 4829 }
1504     }
1505     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1506     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1507 caltinay 4843 /* note that we need to map the column index in
1508     P->col_coupleBlock back into the column index in
1509 caltinay 4829 P_ext_couple */
1510     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1511 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1512 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1513     for (icb=0; icb<col_block_size; icb++)
1514     RAP_val[irb+row_block_size*icb]=ZERO;
1515 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1516 caltinay 4829 rtmp=temp_val[icb];
1517 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1518     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1519     }
1520     }
1521    
1522 caltinay 4829 if (P_marker[i_c] < row_marker_ext) {
1523     P_marker[i_c] = j;
1524     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1525     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1526     j++;
1527     } else {
1528 lgao 3779 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1529     for (ib=0; ib<block_size; ib++) {
1530     temp_val[ib] += RAP_val[ib];
1531     }
1532 caltinay 4829 }
1533     }
1534 lgao 3779
1535 caltinay 4829 } else {
1536     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1537     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1538     i_c = P->mainBlock->pattern->index[j3];
1539 lgao 3779 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1540 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1541     for (icb=0; icb<col_block_size; icb++)
1542     RAP_val[irb+row_block_size*icb]=ZERO;
1543 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1544 caltinay 4829 rtmp=temp_val[icb];
1545 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1546     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1547     }
1548     }
1549    
1550     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1551     for (ib=0; ib<block_size; ib++) {
1552     temp_val[ib] += RAP_val[ib];
1553     }
1554 caltinay 4829 }
1555     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1556     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1557     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1558 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1559 caltinay 4829 for (irb=0; irb<row_block_size; irb++)
1560     for (icb=0; icb<col_block_size; icb++)
1561     RAP_val[irb+row_block_size*icb]=ZERO;
1562 lgao 3779 for (icb=0; icb<P_block_size; icb++) {
1563 caltinay 4829 rtmp = temp_val[icb];
1564 lgao 3779 for (irb=0; irb<row_block_size; irb++) {
1565     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1566     }
1567     }
1568    
1569     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1570     for (ib=0; ib<block_size; ib++) {
1571     temp_val[ib] += RAP_val[ib];
1572     }
1573 caltinay 4829 }
1574     }
1575     }
1576 lgao 3779 }
1577    
1578     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1579     if (i > row_marker) {
1580 caltinay 4829 offset = i - row_marker;
1581     temp = new index_t[offset];
1582     #pragma omp parallel for schedule(static) private(iptr)
1583     for (iptr=0; iptr<offset; iptr++)
1584     temp[iptr] = RAP_main_idx[row_marker+iptr];
1585     if (offset > 0) {
1586 caltinay 4852 qsort(temp, (size_t)offset, sizeof(index_t), util::comparIndex);
1587 caltinay 4829 }
1588     temp_val = new double[offset * block_size];
1589     #pragma omp parallel for schedule(static) private(iptr,k)
1590     for (iptr=0; iptr<offset; iptr++){
1591     k = P_marker[temp[iptr]];
1592     memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1593     P_marker[temp[iptr]] = iptr + row_marker;
1594     }
1595     #pragma omp parallel for schedule(static) private(iptr)
1596     for (iptr=0; iptr<offset; iptr++){
1597     RAP_main_idx[row_marker+iptr] = temp[iptr];
1598     memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1599     }
1600     delete[] temp;
1601     delete[] temp_val;
1602 lgao 3779 }
1603     if (j > row_marker_ext) {
1604     offset = j - row_marker_ext;
1605 jfenwick 4275 temp = new index_t[offset];
1606 caltinay 4829 #pragma omp parallel for schedule(static) private(iptr)
1607 lgao 3779 for (iptr=0; iptr<offset; iptr++)
1608     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1609     if (offset > 0) {
1610 caltinay 4852 qsort(temp, (size_t)offset, sizeof(index_t), util::comparIndex);
1611 lgao 3779 }
1612 jfenwick 4275 temp_val = new double[offset * block_size];
1613 caltinay 4829 #pragma omp parallel for schedule(static) private(iptr, k)
1614 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1615     k = P_marker[temp[iptr] + num_Pmain_cols];
1616 caltinay 4829 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1617 lgao 3779 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1618     }
1619 caltinay 4829 #pragma omp parallel for schedule(static) private(iptr)
1620 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1621     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1622 caltinay 4829 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1623 lgao 3779 }
1624 jfenwick 4275 delete[] temp;
1625     delete[] temp_val;
1626 lgao 3779 }
1627     }
1628    
1629 jfenwick 4275 delete[] RA_val;
1630     delete[] RAP_val;
1631     delete[] A_marker;
1632     delete[] Pext_to_RAP;
1633     delete[] Pcouple_to_RAP;
1634     delete[] RAP_ext_ptr;
1635     delete[] RAP_ext_idx;
1636     delete[] RAP_ext_val;
1637 caltinay 4829 R_main.reset();
1638     R_couple.reset();
1639 lgao 3779
1640 caltinay 4843 /* Check whether there are empty columns in RAP_couple */
1641 lgao 3779 #pragma omp parallel for schedule(static) private(i)
1642     for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1643     /* num of non-empty columns is stored in "k" */
1644 caltinay 4843 k = 0;
1645 lgao 3779 j = RAP_couple_ptr[num_Pmain_cols];
1646     for (i=0; i<j; i++) {
1647     i1 = RAP_couple_idx[i];
1648     if (P_marker[i1]) {
1649 caltinay 4829 P_marker[i1] = 0;
1650     k++;
1651 lgao 3779 }
1652     }
1653    
1654     /* empty columns is found */
1655     if (k < num_RAPext_cols) {
1656 jfenwick 4275 temp = new index_t[k];
1657 lgao 3779 k = 0;
1658 caltinay 4843 for (i=0; i<num_RAPext_cols; i++)
1659 caltinay 4829 if (!P_marker[i]) {
1660     P_marker[i] = k;
1661     temp[k] = global_id_RAP[i];
1662     k++;
1663     }
1664 lgao 3821 #pragma omp parallel for schedule(static) private(i, i1)
1665 lgao 3779 for (i=0; i<j; i++) {
1666 caltinay 4829 i1 = RAP_couple_idx[i];
1667     RAP_couple_idx[i] = P_marker[i1];
1668 lgao 3779 }
1669     num_RAPext_cols = k;
1670 jfenwick 4275 delete[] global_id_RAP;
1671 lgao 3779 global_id_RAP = temp;
1672     }
1673 jfenwick 4275 delete[] P_marker;
1674 lgao 3779
1675     /******************************************************/
1676 jfenwick 4345 /* Start to create the coarse level System Matrix A_c */
1677 lgao 3779 /******************************************************/
1678     /* first, prepare the sender/receiver for the col_connector */
1679     dist = P->pattern->input_distribution->first_component;
1680 jfenwick 4275 recv_len = new dim_t[size];
1681     send_len = new dim_t[size];
1682     neighbor = new Esys_MPI_rank[size];
1683     offsetInShared = new index_t[size+1];
1684     shared = new index_t[num_RAPext_cols];
1685 lgao 3779 memset(recv_len, 0, sizeof(dim_t) * size);
1686     memset(send_len, 0, sizeof(dim_t) * size);
1687     num_neighbors = 0;
1688     offsetInShared[0] = 0;
1689     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1690     shared[i] = i + num_Pmain_cols;
1691     if (k <= global_id_RAP[i]) {
1692 caltinay 4829 if (recv_len[j] > 0) {
1693     neighbor[num_neighbors] = j;
1694     num_neighbors ++;
1695     offsetInShared[num_neighbors] = i;
1696     }
1697     while (k <= global_id_RAP[i]) {
1698     j++;
1699     k = dist[j+1];
1700     }
1701 lgao 3779 }
1702     recv_len[j] ++;
1703     }
1704     if (recv_len[j] > 0) {
1705     neighbor[num_neighbors] = j;
1706     num_neighbors ++;
1707     offsetInShared[num_neighbors] = i;
1708     }
1709    
1710 caltinay 4843 SharedComponents_ptr recv(new SharedComponents(
1711 caltinay 4816 num_Pmain_cols, num_neighbors, neighbor, shared,
1712     offsetInShared, 1, 0, mpi_info));
1713    
1714 lgao 3828 #ifdef ESYS_MPI
1715 lgao 3779 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1716 lgao 3828 #endif
1717 lgao 3779
1718     #ifdef ESYS_MPI
1719 jfenwick 4275 mpi_requests=new MPI_Request[size*2];
1720     mpi_stati=new MPI_Status[size*2];
1721 lgao 3779 #else
1722 jfenwick 4275 mpi_requests=new int[size*2];
1723     mpi_stati=new int[size*2];
1724 lgao 3779 #endif
1725     num_neighbors = 0;
1726     j = 0;
1727     offsetInShared[0] = 0;
1728     for (i=0; i<size; i++) {
1729     if (send_len[i] > 0) {
1730 caltinay 4829 neighbor[num_neighbors] = i;
1731     num_neighbors ++;
1732     j += send_len[i];
1733     offsetInShared[num_neighbors] = j;
1734 lgao 3779 }
1735     }
1736 jfenwick 4275 delete[] shared;
1737     shared = new index_t[j];
1738 lgao 3779 for (i=0, j=0; i<num_neighbors; i++) {
1739     k = neighbor[i];
1740 lgao 3828 #ifdef ESYS_MPI
1741 caltinay 4843 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1742 caltinay 4829 mpi_info->msg_tag_counter+k,
1743     mpi_info->comm, &mpi_requests[i]);
1744 lgao 3828 #endif
1745 lgao 3779 j += send_len[k];
1746     }
1747     for (i=0, j=0; i<recv->numNeighbors; i++) {
1748     k = recv->neighbor[i];
1749 lgao 3828 #ifdef ESYS_MPI
1750 lgao 3779 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1751 caltinay 4829 mpi_info->msg_tag_counter+rank,
1752     mpi_info->comm, &mpi_requests[i+num_neighbors]);
1753 lgao 3828 #endif
1754 lgao 3779 j += recv_len[k];
1755     }
1756 lgao 3828 #ifdef ESYS_MPI
1757 caltinay 4843 MPI_Waitall(num_neighbors + recv->numNeighbors,
1758     mpi_requests, mpi_stati);
1759 lgao 3828 #endif
1760 caltinay 4843 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1761 lgao 3779
1762     j = offsetInShared[num_neighbors];
1763     offset = dist[rank];
1764 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1765 lgao 3779 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1766    
1767 caltinay 4843 SharedComponents_ptr send(new SharedComponents(
1768 caltinay 4816 num_Pmain_cols, num_neighbors, neighbor, shared,
1769     offsetInShared, 1, 0, mpi_info));
1770    
1771 caltinay 4843 col_connector.reset(new Connector(send, recv));
1772 jfenwick 4275 delete[] shared;
1773 lgao 3779
1774     /* now, create row distribution (output_distri) and col
1775     distribution (input_distribution) */
1776 caltinay 4843 input_dist.reset(new Distribution(mpi_info, dist, 1, 0));
1777     output_dist.reset(new Distribution(mpi_info, dist, 1, 0));
1778 lgao 3779
1779     /* then, prepare the sender/receiver for the row_connector, first, prepare
1780     the information for sender */
1781     sum = RAP_couple_ptr[num_Pmain_cols];
1782 jfenwick 4275 len = new dim_t[size];
1783     send_ptr = new index_t*[size];
1784     send_idx = new index_t*[size];
1785 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1786 lgao 3779 for (i=0; i<size; i++) {
1787 jfenwick 4275 send_ptr[i] = new index_t[num_Pmain_cols];
1788     send_idx[i] = new index_t[sum];
1789 lgao 3779 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1790     }
1791     memset(len, 0, sizeof(dim_t) * size);
1792     recv = col_connector->recv;
1793     sum=0;
1794     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1795     i1 = RAP_couple_ptr[i_r];
1796     i2 = RAP_couple_ptr[i_r+1];
1797     if (i2 > i1) {
1798 caltinay 4829 /* then row i_r will be in the sender of row_connector, now check
1799     how many neighbours i_r needs to be send to */
1800     for (j=i1; j<i2; j++) {
1801     i_c = RAP_couple_idx[j];
1802     /* find out the corresponding neighbor "p" of column i_c */
1803     for (p=0; p<recv->numNeighbors; p++) {
1804     if (i_c < recv->offsetInShared[p+1]) {
1805     k = recv->neighbor[p];
1806     if (send_ptr[k][i_r] == 0) sum++;
1807     send_ptr[k][i_r] ++;
1808     send_idx[k][len[k]] = global_id_RAP[i_c];
1809     len[k] ++;
1810     break;
1811     }
1812     }
1813     }
1814 lgao 3779 }
1815     }
1816     if (global_id_RAP) {
1817 jfenwick 4275 delete[] global_id_RAP;
1818 lgao 3779 global_id_RAP = NULL;
1819     }
1820    
1821     /* now allocate the sender */
1822 jfenwick 4275 shared = new index_t[sum];
1823 lgao 3779 memset(send_len, 0, sizeof(dim_t) * size);
1824     num_neighbors=0;
1825     offsetInShared[0] = 0;
1826     for (p=0, k=0; p<size; p++) {
1827     for (i=0; i<num_Pmain_cols; i++) {
1828 caltinay 4829 if (send_ptr[p][i] > 0) {
1829     shared[k] = i;
1830     k++;
1831     send_ptr[p][send_len[p]] = send_ptr[p][i];
1832     send_len[p]++;
1833     }
1834 lgao 3779 }
1835     if (k > offsetInShared[num_neighbors]) {
1836 caltinay 4829 neighbor[num_neighbors] = p;
1837     num_neighbors ++;
1838     offsetInShared[num_neighbors] = k;
1839 lgao 3779 }
1840     }
1841 caltinay 4843 send.reset(new SharedComponents(num_Pmain_cols, num_neighbors,
1842 caltinay 4816 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1843 lgao 3779
1844 caltinay 4843 /* send/recv number of rows will be sent from current proc
1845 lgao 3779 recover info for the receiver of row_connector from the sender */
1846 lgao 3828 #ifdef ESYS_MPI
1847 lgao 3779 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1848 lgao 3828 #endif
1849 lgao 3779 num_neighbors = 0;
1850     offsetInShared[0] = 0;
1851     j = 0;
1852     for (i=0; i<size; i++) {
1853     if (i != rank && recv_len[i] > 0) {
1854 caltinay 4829 neighbor[num_neighbors] = i;
1855     num_neighbors ++;
1856     j += recv_len[i];
1857     offsetInShared[num_neighbors] = j;
1858 lgao 3779 }
1859     }
1860 jfenwick 4275 delete[] shared;
1861     delete[] recv_len;
1862     shared = new index_t[j];
1863 lgao 3779 k = offsetInShared[num_neighbors];
1864 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1865 lgao 3779 for (i=0; i<k; i++) {
1866 caltinay 4843 shared[i] = i + num_Pmain_cols;
1867 lgao 3779 }
1868 caltinay 4843 recv.reset(new SharedComponents(num_Pmain_cols, num_neighbors,
1869 caltinay 4816 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1870 caltinay 4843 row_connector.reset(new Connector(send, recv));
1871 jfenwick 4275 delete[] shared;
1872 lgao 3779
1873     /* send/recv pattern->ptr for rowCoupleBlock */
1874 caltinay 4843 num_RAPext_rows = offsetInShared[num_neighbors];
1875 jfenwick 4275 row_couple_ptr = new index_t[num_RAPext_rows+1];
1876 lgao 3779 for (p=0; p<num_neighbors; p++) {
1877     j = offsetInShared[p];
1878     i = offsetInShared[p+1];
1879 lgao 3828 #ifdef ESYS_MPI
1880 lgao 3779 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1881 caltinay 4829 mpi_info->msg_tag_counter+neighbor[p],
1882     mpi_info->comm, &mpi_requests[p]);
1883 lgao 3828 #endif
1884 lgao 3779 }
1885     send = row_connector->send;
1886     for (p=0; p<send->numNeighbors; p++) {
1887 lgao 3828 #ifdef ESYS_MPI
1888 caltinay 4843 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1889 caltinay 4829 MPI_INT, send->neighbor[p],
1890     mpi_info->msg_tag_counter+rank,
1891     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1892 lgao 3828 #endif
1893 lgao 3779 }
1894 lgao 3828 #ifdef ESYS_MPI
1895 lgao 3779 MPI_Waitall(num_neighbors + send->numNeighbors,
1896 caltinay 4843 mpi_requests, mpi_stati);
1897 lgao 3828 #endif
1898 caltinay 4843 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1899 jfenwick 4275 delete[] send_len;
1900 lgao 3779
1901     sum = 0;
1902     for (i=0; i<num_RAPext_rows; i++) {
1903     k = row_couple_ptr[i];
1904     row_couple_ptr[i] = sum;
1905     sum += k;
1906     }
1907     row_couple_ptr[num_RAPext_rows] = sum;
1908    
1909     /* send/recv pattern->index for rowCoupleBlock */
1910     k = row_couple_ptr[num_RAPext_rows];
1911 jfenwick 4275 row_couple_idx = new index_t[k];
1912 lgao 3779 for (p=0; p<num_neighbors; p++) {
1913     j1 = row_couple_ptr[offsetInShared[p]];
1914     j2 = row_couple_ptr[offsetInShared[p+1]];
1915 lgao 3828 #ifdef ESYS_MPI
1916 lgao 3779 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1917 caltinay 4829 mpi_info->msg_tag_counter+neighbor[p],
1918     mpi_info->comm, &mpi_requests[p]);
1919 lgao 3828 #endif
1920 lgao 3779 }
1921     for (p=0; p<send->numNeighbors; p++) {
1922 lgao 3828 #ifdef ESYS_MPI
1923 caltinay 4843 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1924 caltinay 4829 MPI_INT, send->neighbor[p],
1925     mpi_info->msg_tag_counter+rank,
1926     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1927 lgao 3828 #endif
1928 lgao 3779 }
1929 lgao 3828 #ifdef ESYS_MPI
1930 caltinay 4843 MPI_Waitall(num_neighbors + send->numNeighbors,
1931     mpi_requests, mpi_stati);
1932 lgao 3828 #endif
1933 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1934 lgao 3779
1935 caltinay 4836 offset = input_dist->first_component[rank];
1936     k = row_couple_ptr[num_RAPext_rows];
1937     #pragma omp parallel for schedule(static) private(i)
1938     for (i=0; i<k; i++) {
1939     row_couple_idx[i] -= offset;
1940     }
1941     #pragma omp parallel for schedule(static) private(i)
1942     for (i=0; i<size; i++) {
1943     delete[] send_ptr[i];
1944     delete[] send_idx[i];
1945     }
1946     delete[] send_ptr;
1947     delete[] send_idx;
1948     delete[] len;
1949     delete[] offsetInShared;
1950     delete[] neighbor;
1951     delete[] mpi_requests;
1952     delete[] mpi_stati;
1953 lgao 3779
1954 jfenwick 4934
1955 caltinay 4836 /* Now, we can create pattern for mainBlock and coupleBlock */
1956 caltinay 4843 Pattern_ptr main_pattern(new Pattern(MATRIX_FORMAT_DEFAULT,
1957 caltinay 4819 num_Pmain_cols, num_Pmain_cols, RAP_main_ptr, RAP_main_idx));
1958 caltinay 4843 Pattern_ptr col_couple_pattern(new Pattern(
1959 caltinay 4819 MATRIX_FORMAT_DEFAULT, num_Pmain_cols, num_RAPext_cols,
1960     RAP_couple_ptr, RAP_couple_idx));
1961 caltinay 4843 Pattern_ptr row_couple_pattern(new Pattern(
1962 caltinay 4819 MATRIX_FORMAT_DEFAULT, num_RAPext_rows, num_Pmain_cols,
1963     row_couple_ptr, row_couple_idx));
1964 lgao 3779
1965 caltinay 4836 /* next, create the system matrix */
1966 caltinay 4843 pattern.reset(new SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1967 lgao 3779 output_dist, input_dist, main_pattern, col_couple_pattern,
1968 caltinay 4818 row_couple_pattern, col_connector, row_connector));
1969 caltinay 4843 out.reset(new SystemMatrix(A->type<