Annotation of /trunk/paso/src/AMG_Interpolation.cpp

Revision 4817
Fri Mar 28 08:04:09 2014 UTC by caltinay
File size: 121916 byte(s)
Coupler/Connector shared ptrs.

1 jfenwick 3981 /*****************************************************************************
2 lgao 3763 *
3 jfenwick 4657 * Copyright (c) 2003-2014 by University of Queensland
4 jfenwick 3981 * http://www.uq.edu.au
5 lgao 3763 *
6     * Primary Business: Queensland, Australia
7     * Licensed under the Open Software License version 3.0
8     * http://www.opensource.org/licenses/osl-3.0.php
9     *
10 jfenwick 3981 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 jfenwick 4657 * Development 2012-2013 by School of Earth Sciences
12     * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 jfenwick 3981 *
14     *****************************************************************************/
15 lgao 3763
16    
17 caltinay 4816 /****************************************************************************/
18 lgao 3763
19     /* Paso: defines AMG Interpolation */
20    
21 caltinay 4816 /****************************************************************************/
22 lgao 3763
23     /* Author: l.gao@uq.edu.au */
24    
25 caltinay 4816 /****************************************************************************/
26 lgao 3763
27     #include "Paso.h"
28     #include "SparseMatrix.h"
29     #include "PasoUtil.h"
30     #include "Preconditioner.h"
31    
32 caltinay 4816 /****************************************************************************
33 lgao 3763
34 jfenwick 4345 Methods necessary for AMG preconditioner
35 lgao 3763
36     construct the n_C x n_C interpolation operator A_C from the matrix A
37     and the prolongation matrix P.
38    
39     The coarsening operator A_C is defined by RAP, where R=P^T.
40    
41 caltinay 4816 *****************************************************************************/
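
/* For orientation, a dense reference for the triple product built in this
   file: a minimal sketch (illustration only, not used by Paso) with fine
   dimension n, coarse dimension nC, and dense row-major matrices. The
   sparse routines below compute exactly these entries of A_C = R A P
   (R = P^T), block by block and in parallel. */
static void AMG_denseRAP_reference(dim_t n, dim_t nC, const double* A,
                                   const double* P, double* A_C)
{
    for (dim_t i = 0; i < nC; i++) {
        for (dim_t j = 0; j < nC; j++) {
            double s = 0.;
            /* A_C[i,j] = sum_{k,l} P[k,i] * A[k,l] * P[l,j] */
            for (dim_t k = 0; k < n; k++)
                for (dim_t l = 0; l < n; l++)
                    s += P[k*nC + i] * A[k*n + l] * P[l*nC + j];
            A_C[i*nC + j] = s;
        }
    }
}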
42 lgao 3763
43 jfenwick 4345 /* Extend the system matrix B with two extra sparse matrices:
44 lgao 3763 B_ext_main and B_ext_couple.
45 jfenwick 4345 The combination of these two sparse matrices represents
46     the portion of B that is stored on neighbour procs and
47 lgao 3763 needed locally for the triple matrix product (B^T) A B.
48    
49     FOR NOW, we assume that the system matrix B has a NULL
50     row_coupleBlock and a NULL remote_coupleBlock. Based on
51     this assumption, we use the link row_coupleBlock for the sparse
52     matrix B_ext_main, and the link remote_coupleBlock for the sparse
53     matrix B_ext_couple.
54    
55     To be specific, B_ext (on processor k) is a group of rows
56     of B, where the list of rows from processor q is given by
57     A->col_coupler->connector->send->shared[rPtr...] with
58     rPtr=A->col_coupler->connector->send->offsetInShared[k]
59     on q. */
60     void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
61     {
62 caltinay 4803 paso::Pattern *pattern_main=NULL, *pattern_couple=NULL;
63 caltinay 4817 paso::Coupler_ptr coupler;
64 caltinay 4816 paso::SharedComponents_ptr send, recv;
65 lgao 3763 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
66     index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
67     index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
68     index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
69     index_t *idx_m=NULL, *idx_c=NULL;
70 lgao 3828 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
71 lgao 3763 index_t offset, len, block_size, block_size_size, max_num_cols;
72     index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
73     dim_t *recv_degree=NULL, *send_degree=NULL;
74     dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
75    
76     if (size == 1) return;
77    
78     if (B->row_coupleBlock) {
79 caltinay 4797 paso::SparseMatrix_free(B->row_coupleBlock);
80 lgao 3763 B->row_coupleBlock = NULL;
81     }
82    
83     if (B->row_coupleBlock || B->remote_coupleBlock) {
84     Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
85     return;
86     }
87    
88     /* send/receive the unknowns' global IDs */
89     num_main_cols = B->mainBlock->numCols;
90 jfenwick 4275 cols = new double[num_main_cols];
91 lgao 3763 offset = B->col_distribution->first_component[rank];
92     #pragma omp parallel for private(i) schedule(static)
93     for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
94     if (B->global_id == NULL) {
95 caltinay 4817 coupler.reset(new paso::Coupler(B->col_coupler->connector, 1));
96     coupler->startCollect(cols);
97 lgao 3763 }
98    
99 jfenwick 4275 recv_buf = new index_t[size];
100     recv_degree = new dim_t[size];
101     recv_offset = new index_t[size+1];
102 lgao 3763 #pragma omp parallel for private(i) schedule(static)
103     for (i=0; i<size; i++){
104     recv_buf[i] = 0;
105     recv_degree[i] = 1;
106     recv_offset[i] = i;
107     }
108    
109 lgao 3779 block_size = B->block_size;
110 lgao 3763 block_size_size = block_size * sizeof(double);
111     num_couple_cols = B->col_coupleBlock->numCols;
112     send = A->col_coupler->connector->send;
113     recv = A->col_coupler->connector->recv;
114     num_neighbors = send->numNeighbors;
115     p = send->offsetInShared[num_neighbors];
116 lgao 3767 len = p * B->col_distribution->first_component[size];
117 jfenwick 4275 send_buf = new double[len * block_size];
118     send_idx = new index_t[len];
119     send_offset = new index_t[(p+1)*2];
120     send_degree = new dim_t[num_neighbors];
121 lgao 3763 i = num_main_cols + num_couple_cols;
122 jfenwick 4275 send_m = new double[i * block_size];
123     send_c = new double[i * block_size];
124     idx_m = new index_t[i];
125     idx_c = new index_t[i];
126 lgao 3763
127     /* wait until the unknowns' global IDs have been received */
128     if (B->global_id == NULL) {
129 caltinay 4817 coupler->finishCollect();
130 jfenwick 4275 global_id = new index_t[num_couple_cols];
131 lgao 3763 #pragma omp parallel for private(i) schedule(static)
132     for (i=0; i<num_couple_cols; ++i)
133     global_id[i] = coupler->recv_buffer[i];
134     B->global_id = global_id;
135     } else
136     global_id = B->global_id;
137    
138     /* distribute the number of cols in current col_coupleBlock to all ranks */
139 lgao 3828 #ifdef ESYS_MPI
140 lgao 3763 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
141 lgao 3828 #endif
142 lgao 3763
143     /* distribute the global ids of cols to be considered to all ranks */
144     len = 0;
145     max_num_cols = 0;
146     for (i=0; i<size; i++){
147     recv_degree[i] = recv_buf[i];
148     recv_offset[i] = len;
149     len += recv_buf[i];
150     if (max_num_cols < recv_buf[i])
151     max_num_cols = recv_buf[i];
152     }
153     recv_offset[size] = len;
154 jfenwick 4275 cols_array = new index_t[len];
155 lgao 3828 #ifdef ESYS_MPI
156 lgao 3763 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
157 lgao 3828 #endif
158 lgao 3763
159     /* first, prepare the ptr_ptr to be received */
160     q = recv->numNeighbors;
161     len = recv->offsetInShared[q];
162 jfenwick 4275 ptr_ptr = new index_t[(len+1) * 2];
163 lgao 3763 for (p=0; p<q; p++) {
164     row = recv->offsetInShared[p];
165     m = recv->offsetInShared[p + 1];
166 lgao 3828 #ifdef ESYS_MPI
167 lgao 3767 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
168 lgao 3763 A->mpi_info->msg_tag_counter+recv->neighbor[p],
169     A->mpi_info->comm,
170     &(A->col_coupler->mpi_requests[p]));
171 lgao 3828 #endif
172 lgao 3763 }
173    
174     /* now prepare the rows to be sent (the degree, the offset and the data) */
175     len = 0;
176 lgao 3817 i0 = 0;
177 lgao 3763 for (p=0; p<num_neighbors; p++) {
178 lgao 3817 i = i0;
179 lgao 3763 neighbor = send->neighbor[p];
180     m_lb = B->col_distribution->first_component[neighbor];
181     m_ub = B->col_distribution->first_component[neighbor + 1];
182     j_ub = send->offsetInShared[p + 1];
183     for (j=send->offsetInShared[p]; j<j_ub; j++) {
184     row = send->shared[j];
185     l_m = 0;
186     l_c = 0;
187     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
188     k_lb = B->col_coupleBlock->pattern->ptr[row];
189    
190     /* check part of col_coupleBlock for data to be passed @row */
191     for (k=k_lb; k<k_ub; k++) {
192     m = global_id[B->col_coupleBlock->pattern->index[k]];
193     if (m > offset) break;
194     if (m>= m_lb && m < m_ub) {
195     /* data to be passed to sparse matrix B_ext_main */
196     idx_m[l_m] = m - m_lb;
197     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
198     l_m++;
199     } else {
200     /* data to be passed to sparse matrix B_ext_couple */
201     idx_c[l_c] = m;
202     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
203     l_c++;
204     }
205     }
206     k_lb = k;
207    
208     /* check mainBlock for data to be passed @row to sparse
209     matrix B_ext_couple */
210     k_ub = B->mainBlock->pattern->ptr[row + 1];
211     k = B->mainBlock->pattern->ptr[row];
212     memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
213     for (; k<k_ub; k++) {
214     m = B->mainBlock->pattern->index[k] + offset;
215     idx_c[l_c] = m;
216     l_c++;
217     }
218    
219     /* check the remaining part of col_coupleBlock for data to
220     be passed @row to sparse matrix B_ext_couple */
221     k = k_lb;
222     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
223     for (k=k_lb; k<k_ub; k++) {
224     m = global_id[B->col_coupleBlock->pattern->index[k]];
225 lgao 3767 if (m>= m_lb && m < m_ub) {
226 lgao 3763 /* data to be passed to sparse matrix B_ext_main */
227     idx_m[l_m] = m - m_lb;
228     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
229     l_m++;
230     } else {
231     /* data to be passed to sparse matrix B_ext_couple */
232     idx_c[l_c] = m;
233     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
234     l_c++;
235     }
236     }
237    
238 lgao 3779 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
239 lgao 3767 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
240 lgao 3763 send_offset[2*i] = l_m;
241     len += l_m;
242 lgao 3779 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
243 lgao 3767 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
244 lgao 3763 send_offset[2*i+1] = l_c;
245     len += l_c;
246     i++;
247     }
248    
249     /* sending */
250 lgao 3828 #ifdef ESYS_MPI
251 lgao 3817 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
252 lgao 3763 A->mpi_info->msg_tag_counter+rank,
253     A->mpi_info->comm,
254     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
255 lgao 3828 #endif
256 lgao 3763 send_degree[p] = len;
257 lgao 3817 i0 = i;
258 lgao 3763 }
259 jfenwick 4275 delete[] send_m;
260     delete[] send_c;
261     delete[] idx_m;
262     delete[] idx_c;
263 lgao 3763
264    
265     q = recv->numNeighbors;
266     len = recv->offsetInShared[q];
267 jfenwick 4275 ptr_main = new index_t[(len+1)];
268     ptr_couple = new index_t[(len+1)];
269 lgao 3763
270 lgao 3828 #ifdef ESYS_MPI
271 lgao 3763 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
272     A->col_coupler->mpi_requests,
273     A->col_coupler->mpi_stati);
274 lgao 3828 #endif
275 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
276 lgao 3763
277     j = 0;
278     k = 0;
279     ptr_main[0] = 0;
280     ptr_couple[0] = 0;
281     for (i=0; i<len; i++) {
282     j += ptr_ptr[2*i];
283     k += ptr_ptr[2*i+1];
284     ptr_main[i+1] = j;
285     ptr_couple[i+1] = k;
286     }
287    
288 jfenwick 4275 delete[] ptr_ptr;
289     idx_main = new index_t[j];
290     idx_couple = new index_t[k];
291     ptr_idx = new index_t[j+k];
292     ptr_val = new double[(j+k) * block_size];
293 lgao 3763
294     /* send/receive index array */
295     j=0;
296     k_ub = 0;
297     for (p=0; p<recv->numNeighbors; p++) {
298     k = recv->offsetInShared[p];
299     m = recv->offsetInShared[p+1];
300     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
301     if (i > 0) {
302     k_ub ++;
303 lgao 3828 #ifdef ESYS_MPI
304 lgao 3763 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
305     A->mpi_info->msg_tag_counter+recv->neighbor[p],
306     A->mpi_info->comm,
307     &(A->col_coupler->mpi_requests[p]));
308 lgao 3828 #endif
309 lgao 3763 }
310     j += i;
311     }
312    
313     j=0;
314     k_ub = 0;
315     for (p=0; p<num_neighbors; p++) {
316     i = send_degree[p] - j;
317     if (i > 0){
318     k_ub ++;
319 lgao 3828 #ifdef ESYS_MPI
320 lgao 3763 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
321     A->mpi_info->msg_tag_counter+rank,
322     A->mpi_info->comm,
323     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
324 lgao 3828 #endif
325 lgao 3763 }
326     j = send_degree[p];
327     }
328    
329 lgao 3828 #ifdef ESYS_MPI
330 lgao 3763 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
331     A->col_coupler->mpi_requests,
332     A->col_coupler->mpi_stati);
333 lgao 3828 #endif
334 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
335 lgao 3763
336 lgao 3821 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
337 lgao 3763 for (i=0; i<len; i++) {
338     j = ptr_main[i];
339     k = ptr_main[i+1];
340     m = ptr_couple[i];
341     for (p=j; p<k; p++) {
342     idx_main[p] = ptr_idx[m+p];
343     }
344     j = ptr_couple[i+1];
345     for (p=m; p<j; p++) {
346     idx_couple[p] = ptr_idx[k+p];
347     }
348     }
349 jfenwick 4275 delete[] ptr_idx;
350 lgao 3763
351     /* allocate pattern and sparse matrix for B_ext_main */
352 caltinay 4803 pattern_main = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
353 lgao 3763 len, num_main_cols, ptr_main, idx_main);
354 caltinay 4797 B->row_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
355 lgao 3763 pattern_main, B->row_block_size, B->col_block_size,
356     FALSE);
357 caltinay 4803 paso::Pattern_free(pattern_main);
358 lgao 3763
359     /* allocate pattern and sparse matrix for B_ext_couple */
360 caltinay 4803 pattern_couple = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
361 lgao 3763 len, B->col_distribution->first_component[size],
362     ptr_couple, idx_couple);
363 caltinay 4797 B->remote_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
364 lgao 3763 pattern_couple, B->row_block_size, B->col_block_size,
365     FALSE);
366 caltinay 4803 paso::Pattern_free(pattern_couple);
367 lgao 3763
368     /* send/receive value array */
369     j=0;
370     for (p=0; p<recv->numNeighbors; p++) {
371     k = recv->offsetInShared[p];
372     m = recv->offsetInShared[p+1];
373     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
374 lgao 3828 #ifdef ESYS_MPI
375 lgao 3763 if (i > 0)
376     MPI_Irecv(&(ptr_val[j]), i * block_size,
377     MPI_DOUBLE, recv->neighbor[p],
378     A->mpi_info->msg_tag_counter+recv->neighbor[p],
379     A->mpi_info->comm,
380     &(A->col_coupler->mpi_requests[p]));
381 lgao 3828 #endif
382 lgao 3779 j += (i * block_size);
383 lgao 3763 }
384    
385     j=0;
386     for (p=0; p<num_neighbors; p++) {
387     i = send_degree[p] - j;
388 lgao 3828 #ifdef ESYS_MPI
389 lgao 3763 if (i > 0)
390 lgao 3779 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
391 lgao 3763 A->mpi_info->msg_tag_counter+rank,
392     A->mpi_info->comm,
393     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
394 lgao 3828 #endif
395 lgao 3779 j = send_degree[p] ;
396 lgao 3763 }
397    
398 lgao 3828 #ifdef ESYS_MPI
399 lgao 3763 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
400     A->col_coupler->mpi_requests,
401     A->col_coupler->mpi_stati);
402 lgao 3828 #endif
403 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
404 lgao 3763
405     #pragma omp parallel for private(i,j,k,m,p) schedule(static)
406     for (i=0; i<len; i++) {
407     j = ptr_main[i];
408     k = ptr_main[i+1];
409     m = ptr_couple[i];
410     for (p=j; p<k; p++) {
411 lgao 3817 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
412 lgao 3763 }
413     j = ptr_couple[i+1];
414     for (p=m; p<j; p++) {
415 lgao 3817 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
416 lgao 3763 }
417     }
418 jfenwick 4275 delete[] ptr_val;
419 lgao 3763
420 lgao 3817
421 lgao 3763 /* release all temporary memory allocations */
422 jfenwick 4275 delete[] cols;
423     delete[] cols_array;
424     delete[] recv_offset;
425     delete[] recv_degree;
426     delete[] recv_buf;
427     delete[] send_buf;
428     delete[] send_offset;
429     delete[] send_degree;
430     delete[] send_idx;
431 lgao 3763 }
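
/* A condensed sketch of the message exchange pattern used above and in
   the routines below: post one non-blocking receive per recv-neighbour,
   one non-blocking send per send-neighbour, then wait on all requests.
   Tags follow the file's convention (tag base plus the sender's rank).
   Illustration only; the counts and rank arrays are hypothetical
   stand-ins for the values derived from offsetInShared in the real code. */
#ifdef ESYS_MPI
static void AMG_exchange_sketch(MPI_Comm comm, int tag_base, int my_rank,
                                dim_t num_recv, const int* recv_rank,
                                const int* recv_count, index_t* recv_buf,
                                dim_t num_send, const int* send_rank,
                                const int* send_count, index_t* send_buf,
                                MPI_Request* requests, MPI_Status* stati)
{
    dim_t p;
    index_t jr = 0, js = 0;
    for (p = 0; p < num_recv; p++) {
        /* each receive is tagged with the sender's rank */
        MPI_Irecv(&recv_buf[jr], recv_count[p], MPI_INT, recv_rank[p],
                  tag_base + recv_rank[p], comm, &requests[p]);
        jr += recv_count[p];
    }
    for (p = 0; p < num_send; p++) {
        /* each send is tagged with our own rank */
        MPI_Issend(&send_buf[js], send_count[p], MPI_INT, send_rank[p],
                   tag_base + my_rank, comm, &requests[p + num_recv]);
        js += send_count[p];
    }
    /* all requests must complete before the buffers are reused */
    MPI_Waitall(num_recv + num_send, requests, stati);
}
#endif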
432    
433     /* As defined, the sparse matrix T given by T(ptr, idx, val)
434     has the same number of rows as P->col_coupleBlock->numCols. Now we need
435 jfenwick 4345 to copy blocks of data in T to the neighbour processors, given by
436 lgao 3763 P->col_coupler->connector->recv->neighbor[k] where k is in
437     [0, P->col_coupler->connector->recv->numNeighbors).
438     The rows to be copied to neighbour processor k are those in the list
439     P->col_coupler->connector->recv->offsetInShared[k] ...
440     P->col_coupler->connector->recv->offsetInShared[k+1] */
441     void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
442 lgao 3779 index_t **p_ptr, index_t **p_idx, double **p_val,
443     index_t *global_id, index_t block_size)
444 lgao 3763 {
445 caltinay 4816 paso::SharedComponents_ptr send, recv;
446 lgao 3763 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
447 gross 3834 index_t i, j, p, m, n, size;
448 lgao 3763 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
449     index_t *ptr=*p_ptr, *idx=*p_idx;
450     double *val=*p_val, *recv_val=NULL;
451 gross 3834 #ifdef ESYS_MPI
452     index_t rank = P->mpi_info->rank;
453     #endif
454 lgao 3763
455     size = P->mpi_info->size;
456     send = P->col_coupler->connector->recv;
457     recv = P->col_coupler->connector->send;
458     send_neighbors = send->numNeighbors;
459     recv_neighbors = recv->numNeighbors;
460     send_rows = P->col_coupleBlock->numCols;
461     recv_rows = recv->offsetInShared[recv_neighbors];
462    
463 jfenwick 4275 send_degree = new index_t[send_rows];
464     recv_ptr = new index_t[recv_rows + 1];
465 lgao 3763 #pragma omp parallel for schedule(static) private(i)
466     for (i=0; i<send_rows; i++)
467     send_degree[i] = ptr[i+1] - ptr[i];
468    
469     /* First, send/receive the degree */
470     for (p=0; p<recv_neighbors; p++) { /* Receiving */
471     m = recv->offsetInShared[p];
472     n = recv->offsetInShared[p+1];
473 lgao 3828 #ifdef ESYS_MPI
474 lgao 3763 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
475     P->mpi_info->msg_tag_counter + recv->neighbor[p],
476     P->mpi_info->comm,
477     &(P->col_coupler->mpi_requests[p]));
478 lgao 3828 #endif
479 lgao 3763 }
480     for (p=0; p<send_neighbors; p++) { /* Sending */
481     m = send->offsetInShared[p];
482     n = send->offsetInShared[p+1];
483 lgao 3828 #ifdef ESYS_MPI
484 lgao 3763 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
485     P->mpi_info->msg_tag_counter + rank,
486     P->mpi_info->comm,
487     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
488 lgao 3828 #endif
489 lgao 3763 }
490 lgao 3828 #ifdef ESYS_MPI
491 lgao 3763 MPI_Waitall(send_neighbors+recv_neighbors,
492     P->col_coupler->mpi_requests,
493     P->col_coupler->mpi_stati);
494 lgao 3828 #endif
495 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
496 lgao 3763
497 jfenwick 4275 delete[] send_degree;
498 lgao 3763 m = Paso_Util_cumsum(recv_rows, recv_ptr);
499     recv_ptr[recv_rows] = m;
500 jfenwick 4275 recv_idx = new index_t[m];
501     recv_val = new double[m * block_size];
502 lgao 3763
503     /* Next, send/receive the index array */
504     j = 0;
505     for (p=0; p<recv_neighbors; p++) { /* Receiving */
506     m = recv->offsetInShared[p];
507     n = recv->offsetInShared[p+1];
508     i = recv_ptr[n] - recv_ptr[m];
509     if (i > 0) {
510 lgao 3828 #ifdef ESYS_MPI
511 lgao 3763 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
512     P->mpi_info->msg_tag_counter + recv->neighbor[p],
513     P->mpi_info->comm,
514     &(P->col_coupler->mpi_requests[p]));
515 lgao 3828 #endif
516 lgao 3819 }
517 lgao 3763 j += i;
518     }
519    
520     j = 0;
521     for (p=0; p<send_neighbors; p++) { /* Sending */
522     m = send->offsetInShared[p];
523     n = send->offsetInShared[p+1];
524     i = ptr[n] - ptr[m];
525     if (i > 0) {
526 lgao 3828 #ifdef ESYS_MPI
527 lgao 3763 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
528     P->mpi_info->msg_tag_counter + rank,
529     P->mpi_info->comm,
530     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
531 lgao 3828 #endif
532 lgao 3763 j += i;
533 lgao 3819 }
534 lgao 3763 }
535 lgao 3828 #ifdef ESYS_MPI
536 lgao 3763 MPI_Waitall(send_neighbors+recv_neighbors,
537     P->col_coupler->mpi_requests,
538     P->col_coupler->mpi_stati);
539 lgao 3828 #endif
540 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
541 lgao 3763
542     /* Last, send/receive the data array */
543     j = 0;
544     for (p=0; p<recv_neighbors; p++) { /* Receiving */
545     m = recv->offsetInShared[p];
546     n = recv->offsetInShared[p+1];
547     i = recv_ptr[n] - recv_ptr[m];
548 lgao 3828 #ifdef ESYS_MPI
549 lgao 3763 if (i > 0)
550 lgao 3779 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
551 lgao 3763 P->mpi_info->msg_tag_counter + recv->neighbor[p],
552     P->mpi_info->comm,
553     &(P->col_coupler->mpi_requests[p]));
554 lgao 3828 #endif
555 lgao 3779 j += (i*block_size);
556 lgao 3763 }
557    
558     j = 0;
559     for (p=0; p<send_neighbors; p++) { /* Sending */
560     m = send->offsetInShared[p];
561     n = send->offsetInShared[p+1];
562     i = ptr[n] - ptr[m];
563     if (i > 0) {
564 lgao 3828 #ifdef ESYS_MPI
565 lgao 3779 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
566 lgao 3763 P->mpi_info->msg_tag_counter + rank,
567     P->mpi_info->comm,
568     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
569 lgao 3828 #endif
570 lgao 3779 j += i * block_size;
571 lgao 3763 }
572     }
573 lgao 3828 #ifdef ESYS_MPI
574 lgao 3763 MPI_Waitall(send_neighbors+recv_neighbors,
575     P->col_coupler->mpi_requests,
576     P->col_coupler->mpi_stati);
577 lgao 3828 #endif
578 jfenwick 4505 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
579 lgao 3763
580 jfenwick 4345 /* Clean up and return with received ptr, index and data arrays */
581 jfenwick 4275 delete[] ptr;
582     delete[] idx;
583     delete[] val;
584 lgao 3763 *p_ptr = recv_ptr;
585     *p_idx = recv_idx;
586     *p_val = recv_val;
587     }
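
/* The cumsum step above turns per-row degrees into CSR row offsets.
   A serial sketch of that conversion, assuming (as its use above
   suggests) that Paso_Util_cumsum is an in-place exclusive prefix sum
   which returns the total, stored by the caller as the final ptr entry: */
static index_t AMG_cumsum_sketch(dim_t n, index_t* ptr)
{
    index_t sum = 0;
    for (dim_t i = 0; i < n; i++) {
        const index_t degree = ptr[i]; /* on entry: #entries in row i */
        ptr[i] = sum;                  /* on exit: offset where row i starts */
        sum += degree;
    }
    return sum;                        /* total #entries; caller sets ptr[n] */
}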
588    
589     Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
590     Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
591     {
592     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
593 caltinay 4797 paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
594 lgao 3763 Paso_SystemMatrix *out=NULL;
595 caltinay 4800 paso::SystemMatrixPattern *pattern=NULL;
596 caltinay 4814 paso::Distribution_ptr input_dist, output_dist;
597 caltinay 4817 paso::Connector_ptr col_connector, row_connector;
598 caltinay 4803 paso::Pattern *main_pattern=NULL;
599     paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
600 lgao 3763 const dim_t row_block_size=A->row_block_size;
601     const dim_t col_block_size=A->col_block_size;
602 lgao 3779 const dim_t block_size = A->block_size;
603     const dim_t P_block_size = P->block_size;
604 lgao 3763 const double ZERO = 0.0;
605     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
606 lgao 3779 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
607 lgao 3763 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
608     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
609     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
610     index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
611     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
612     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
613     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
614 lgao 3779 index_t sum, i, j, k, iptr, irb, icb, ib;
615 lgao 3763 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
616     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
617     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
618     index_t row_marker_ext, *where_p=NULL;
619     index_t **send_ptr=NULL, **send_idx=NULL;
620 lgao 3828 dim_t p, num_neighbors;
621 lgao 3763 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
622     Esys_MPI_rank *neighbor=NULL;
623     #ifdef ESYS_MPI
624     MPI_Request* mpi_requests=NULL;
625     MPI_Status* mpi_stati=NULL;
626     #else
627     int *mpi_requests=NULL, *mpi_stati=NULL;
628     #endif
629    
630 lgao 3828 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
631     return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
632 lgao 3779
633 jfenwick 4345 /* two sparse matrices R_main and R_couple will be generated, as the
634 lgao 3763 transposes of P_main and P_col_couple, respectively. Note that
635     R_couple is actually the row_coupleBlock of R (R=P^T) */
636 caltinay 4797 R_main = paso::SparseMatrix_getTranspose(P->mainBlock);
637 lgao 3817 if (size > 1)
638 caltinay 4797 R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock);
639 lgao 3763
640 jfenwick 4345 /* generate P_ext, i.e. the portion of P that is stored on neighbour procs
641 lgao 3779 and needed locally for the triple matrix product RAP.
642     To be specific, P_ext (on processor k) is a group of rows of P, where
643     the list of rows from processor q is given by
644     A->col_coupler->connector->send->shared[rPtr...] with
645     rPtr=A->col_coupler->connector->send->offsetInShared[k]
646     on q.
647 jfenwick 4345 P_ext is represented by two sparse matrices P_ext_main and
648 lgao 3779 P_ext_couple */
649     Paso_Preconditioner_AMG_extendB(A, P);
650    
651     /* count the number of cols in P_ext_couple, resize the pattern of
652     sparse matrix P_ext_couple with new compressed order, and then
653     build the col id mapping from P->col_coupleBlock to
654     P_ext_couple */
655     num_Pmain_cols = P->mainBlock->numCols;
656 lgao 3817 if (size > 1) {
657     num_Pcouple_cols = P->col_coupleBlock->numCols;
658     num_Acouple_cols = A->col_coupleBlock->numCols;
659     sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
660     } else {
661     num_Pcouple_cols = 0;
662     num_Acouple_cols = 0;
663     sum = 0;
664     }
665 lgao 3779 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
666     offset = P->col_distribution->first_component[rank];
667     num_Pext_cols = 0;
668     if (P->global_id) {
669     /* first count the number of cols "num_Pext_cols" in both P_ext_couple
670     and P->col_coupleBlock */
671     iptr = 0;
672     if (num_Pcouple_cols || sum > 0) {
673 jfenwick 4275 temp = new index_t[num_Pcouple_cols+sum];
674 lgao 3821 #pragma omp parallel for lastprivate(iptr) schedule(static)
675     for (iptr=0; iptr<sum; iptr++){
676 lgao 3779 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
677     }
678     for (j=0; j<num_Pcouple_cols; j++, iptr++){
679     temp[iptr] = P->global_id[j];
680     }
681     }
682     if (iptr) {
683     #ifdef USE_QSORTG
684 caltinay 4803 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
685 lgao 3779 #else
686 caltinay 4803 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
687 lgao 3779 #endif
688     num_Pext_cols = 1;
689     i = temp[0];
690     for (j=1; j<iptr; j++) {
691     if (temp[j] > i) {
692     i = temp[j];
693     temp[num_Pext_cols++] = i;
694     }
695     }
696     }
697     /* resize the pattern of P_ext_couple */
698     if(num_Pext_cols){
699 jfenwick 4275 global_id_P = new index_t[num_Pext_cols];
700 lgao 3821 #pragma omp parallel for private(i) schedule(static)
701 lgao 3779 for (i=0; i<num_Pext_cols; i++)
702     global_id_P[i] = temp[i];
703     }
704     if (num_Pcouple_cols || sum > 0)
705 jfenwick 4275 delete[] temp;
706 lgao 3821 #pragma omp parallel for private(i, where_p) schedule(static)
707 lgao 3779 for (i=0; i<sum; i++) {
708     where_p = (index_t *)bsearch(
709     &(P->remote_coupleBlock->pattern->index[i]),
710     global_id_P, num_Pext_cols,
711 caltinay 4803 sizeof(index_t), paso::comparIndex);
712 lgao 3779 P->remote_coupleBlock->pattern->index[i] =
713     (index_t)(where_p -global_id_P);
714     }
715    
716     /* build the mapping */
717     if (num_Pcouple_cols) {
718 jfenwick 4275 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
719 lgao 3779 iptr = 0;
720     for (i=0; i<num_Pext_cols; i++)
721     if (global_id_P[i] == P->global_id[iptr]) {
722     Pcouple_to_Pext[iptr++] = i;
723     if (iptr == num_Pcouple_cols) break;
724     }
725     }
726     }
727    
728 jfenwick 4345 /* allocate and initialise the markers */
729 lgao 3779 sum = num_Pext_cols + num_Pmain_cols;
730 jfenwick 4275 P_marker = new index_t[sum];
731     A_marker = new index_t[num_A_cols];
732 lgao 3779 #pragma omp parallel for private(i) schedule(static)
733     for (i=0; i<sum; i++) P_marker[i] = -1;
734     #pragma omp parallel for private(i) schedule(static)
735     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
736    
737     /* Now, count the size of RAP_ext. Start with rows in R_couple */
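     /* Marker technique used in the symbolic (counting) and numeric (fill)
        passes below: for the current result row i_r, P_marker[c] >= row_marker
        exactly when column c is already present in that row (P_marker[c] then
        holds its position in the output arrays), and A_marker[i2] == i_r
        exactly when the intermediate entry RA[i_r,i2] has already been
        expanded. This avoids searching the output row for duplicates. */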
738     sum = 0;
739     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
740     row_marker = sum;
741     /* then, loop over elements in row i_r of R_couple */
742     j1_ub = R_couple->pattern->ptr[i_r+1];
743     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
744     i1 = R_couple->pattern->index[j1];
745     /* then, loop over elements in row i1 of A->col_coupleBlock */
746     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
747     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
748     i2 = A->col_coupleBlock->pattern->index[j2];
749    
750     /* check whether entry RA[i_r, i2] has been previously visited.
751     A new RAP entry is possible only if entry RA[i_r, i2] has not
752     been visited yet */
753     if (A_marker[i2] != i_r) {
754     /* first, mark entry RA[i_r,i2] as visited */
755     A_marker[i2] = i_r;
756    
757     /* then loop over elements in row i2 of P_ext_main */
758     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
759     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
760     i_c = P->row_coupleBlock->pattern->index[j3];
761    
762     /* check whether entry RAP[i_r,i_c] has been created.
763     If not yet created, create the entry and increase
764     the total number of elements in RAP_ext */
765     if (P_marker[i_c] < row_marker) {
766     P_marker[i_c] = sum;
767     sum++;
768     }
769     }
770    
771     /* loop over elements in row i2 of P_ext_couple, do the same */
772     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
773     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
774     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
775     if (P_marker[i_c] < row_marker) {
776     P_marker[i_c] = sum;
777     sum++;
778     }
779     }
780     }
781     }
782    
783     /* now loop over elements in row i1 of A->mainBlock, repeat
784     the process we applied to block A->col_coupleBlock */
785     j2_ub = A->mainBlock->pattern->ptr[i1+1];
786     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
787     i2 = A->mainBlock->pattern->index[j2];
788     if (A_marker[i2 + num_Acouple_cols] != i_r) {
789     A_marker[i2 + num_Acouple_cols] = i_r;
790     j3_ub = P->mainBlock->pattern->ptr[i2+1];
791     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
792     i_c = P->mainBlock->pattern->index[j3];
793     if (P_marker[i_c] < row_marker) {
794     P_marker[i_c] = sum;
795     sum++;
796     }
797     }
798     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
799     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
800     /* note that we need to map the column index in
801     P->col_coupleBlock back into the column index in
802     P_ext_couple */
803     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
804     if (P_marker[i_c] < row_marker) {
805     P_marker[i_c] = sum;
806     sum++;
807     }
808     }
809     }
810     }
811     }
812     }
813    
814     /* Now we have the number of non-zero elements in RAP_ext, allocate
815     RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
816 jfenwick 4275 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
817     RAP_ext_idx = new index_t[sum];
818     RAP_ext_val = new double[sum * block_size];
819     RA_val = new double[block_size];
820     RAP_val = new double[block_size];
821 lgao 3779
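     /* Block layout for the kernels below: each non-zero of A carries a
        dense row_block_size x col_block_size block (block_size values),
        while P, built with diagonal blocks, carries P_block_size values
        per non-zero. RA_val and RAP_val are scratch blocks, indexed with
        the row index running fastest, for R[i_r,i1]*A[i1,i2] and
        (RA)[i_r,i2]*P[i2,i_c] of the entry currently being accumulated. */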
822     /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
823     sum = num_Pext_cols + num_Pmain_cols;
824     #pragma omp parallel for private(i) schedule(static)
825     for (i=0; i<sum; i++) P_marker[i] = -1;
826     #pragma omp parallel for private(i) schedule(static)
827     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
828     sum = 0;
829     RAP_ext_ptr[0] = 0;
830     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
831     row_marker = sum;
832     /* then, loop over elements in row i_r of R_couple */
833     j1_ub = R_couple->pattern->ptr[i_r+1];
834     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
835     i1 = R_couple->pattern->index[j1];
836     R_val = &(R_couple->val[j1*P_block_size]);
837    
838     /* then, loop over elements in row i1 of A->col_coupleBlock */
839     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
840     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
841     i2 = A->col_coupleBlock->pattern->index[j2];
842     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
843     for (irb=0; irb<row_block_size; irb++)
844     for (icb=0; icb<col_block_size; icb++)
845     RA_val[irb+row_block_size*icb] = ZERO;
846     for (irb=0; irb<P_block_size; irb++) {
847     rtmp=R_val[irb];
848     for (icb=0; icb<col_block_size; icb++) {
849     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
850     }
851     }
852    
853     /* check whether entry RA[i_r, i2] has been previously visited.
854     A new RAP entry is possible only if entry RA[i_r, i2] has not
855     been visited yet */
856     if (A_marker[i2] != i_r) {
857     /* first, mark entry RA[i_r,i2] as visited */
858     A_marker[i2] = i_r;
859    
860     /* then loop over elements in row i2 of P_ext_main */
861     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
862     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
863     i_c = P->row_coupleBlock->pattern->index[j3];
864     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
865     for (irb=0; irb<row_block_size; irb++)
866     for (icb=0; icb<col_block_size; icb++)
867     RAP_val[irb+row_block_size*icb] = ZERO;
868     for (icb=0; icb<P_block_size; icb++) {
869     rtmp = temp_val[icb];
870     for (irb=0; irb<row_block_size; irb++) {
871     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
872     }
873     }
874    
875     /* check whether entry RAP[i_r,i_c] has been created.
876     If not yet created, create the entry and increase
877     the total number of elements in RAP_ext */
878     if (P_marker[i_c] < row_marker) {
879     P_marker[i_c] = sum;
880     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
881     RAP_ext_idx[sum] = i_c + offset;
882     sum++;
883     } else {
884     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
885     for (ib=0; ib<block_size; ib++) {
886     temp_val[ib] += RAP_val[ib];
887     }
888     }
889     }
890    
891     /* loop over elements in row i2 of P_ext_couple, do the same */
892     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
893     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
894     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
895     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
896     for (irb=0; irb<row_block_size; irb++)
897     for (icb=0; icb<col_block_size; icb++)
898     RAP_val[irb+row_block_size*icb]=ZERO;
899     for (icb=0; icb<P_block_size; icb++) {
900     rtmp = temp_val[icb];
901     for (irb=0; irb<row_block_size; irb++) {
902     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
903     }
904     }
905    
906     if (P_marker[i_c] < row_marker) {
907     P_marker[i_c] = sum;
908     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
909     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
910     sum++;
911     } else {
912     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
913     for (ib=0; ib<block_size; ib++) {
914     temp_val[ib] += RAP_val[ib];
915     }
916     }
917     }
918    
919     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
920     Only the contributions are added. */
921     } else {
922     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
923     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
924     i_c = P->row_coupleBlock->pattern->index[j3];
925     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
926     for (irb=0; irb<row_block_size; irb++)
927     for (icb=0; icb<col_block_size; icb++)
928     RAP_val[irb+row_block_size*icb]=ZERO;
929     for (icb=0; icb<P_block_size; icb++) {
930     rtmp = temp_val[icb];
931     for (irb=0; irb<row_block_size; irb++) {
932     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
933     }
934     }
935    
936     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
937     for (ib=0; ib<block_size; ib++) {
938     temp_val[ib] += RAP_val[ib];
939     }
940     }
941     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
942     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
943     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
944     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
945     for (irb=0; irb<row_block_size; irb++)
946     for (icb=0; icb<col_block_size; icb++)
947     RAP_val[irb+row_block_size*icb]=ZERO;
948     for (icb=0; icb<P_block_size; icb++) {
949     rtmp = temp_val[icb];
950     for (irb=0; irb<row_block_size; irb++) {
951     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
952     }
953     }
954    
955     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
956     for (ib=0; ib<block_size; ib++) {
957     temp_val[ib] += RAP_val[ib];
958     }
959     }
960     }
961     }
962    
963     /* now loop over elements in row i1 of A->mainBlock, repeat
964     the process we applied to block A->col_coupleBlock */
965     j2_ub = A->mainBlock->pattern->ptr[i1+1];
966     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
967     i2 = A->mainBlock->pattern->index[j2];
968     temp_val = &(A->mainBlock->val[j2*block_size]);
969     for (irb=0; irb<row_block_size; irb++)
970     for (icb=0; icb<col_block_size; icb++)
971     RA_val[irb+row_block_size*icb]=ZERO;
972     for (irb=0; irb<P_block_size; irb++) {
973     rtmp=R_val[irb];
974     for (icb=0; icb<col_block_size; icb++) {
975     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
976     }
977     }
978    
979     if (A_marker[i2 + num_Acouple_cols] != i_r) {
980     A_marker[i2 + num_Acouple_cols] = i_r;
981     j3_ub = P->mainBlock->pattern->ptr[i2+1];
982     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
983     i_c = P->mainBlock->pattern->index[j3];
984     temp_val = &(P->mainBlock->val[j3*P_block_size]);
985     for (irb=0; irb<row_block_size; irb++)
986     for (icb=0; icb<col_block_size; icb++)
987     RAP_val[irb+row_block_size*icb]=ZERO;
988     for (icb=0; icb<P_block_size; icb++) {
989     rtmp = temp_val[icb];
990     for (irb=0; irb<row_block_size; irb++) {
991     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
992     }
993     }
994    
995     if (P_marker[i_c] < row_marker) {
996     P_marker[i_c] = sum;
997     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
998     RAP_ext_idx[sum] = i_c + offset;
999     sum++;
1000     } else {
1001     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1002     for (ib=0; ib<block_size; ib++) {
1003     temp_val[ib] += RAP_val[ib];
1004     }
1005     }
1006     }
1007     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1008     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1009     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1010     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1011     for (irb=0; irb<row_block_size; irb++)
1012     for (icb=0; icb<col_block_size; icb++)
1013     RAP_val[irb+row_block_size*icb]=ZERO;
1014     for (icb=0; icb<P_block_size; icb++) {
1015     rtmp=temp_val[icb];
1016     for (irb=0; irb<row_block_size; irb++) {
1017     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1018     }
1019     }
1020    
1021     if (P_marker[i_c] < row_marker) {
1022     P_marker[i_c] = sum;
1023     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1024     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1025     sum++;
1026     } else {
1027     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1028     for (ib=0; ib<block_size; ib++) {
1029     temp_val[ib] += RAP_val[ib];
1030     }
1031     }
1032     }
1033     } else {
1034     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1035     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1036     i_c = P->mainBlock->pattern->index[j3];
1037     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1038     for (irb=0; irb<row_block_size; irb++)
1039     for (icb=0; icb<col_block_size; icb++)
1040     RAP_val[irb+row_block_size*icb]=ZERO;
1041     for (icb=0; icb<P_block_size; icb++) {
1042     rtmp=temp_val[icb];
1043     for (irb=0; irb<row_block_size; irb++) {
1044     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1045     }
1046     }
1047    
1048     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1049     for (ib=0; ib<block_size; ib++) {
1050     temp_val[ib] += RAP_val[ib];
1051     }
1052     }
1053     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1054     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1055     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1056     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1057     for (irb=0; irb<row_block_size; irb++)
1058     for (icb=0; icb<col_block_size; icb++)
1059     RAP_val[irb+row_block_size*icb]=ZERO;
1060     for (icb=0; icb<P_block_size; icb++) {
1061     rtmp=temp_val[icb];
1062     for (irb=0; irb<row_block_size; irb++) {
1063     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1064     }
1065     }
1066    
1067     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1068     for (ib=0; ib<block_size; ib++) {
1069     temp_val[ib] += RAP_val[ib];
1070     }
1071     }
1072     }
1073     }
1074     }
1075     RAP_ext_ptr[i_r+1] = sum;
1076     }
1077 jfenwick 4275 delete[] P_marker;
1078     delete[] Pcouple_to_Pext;
1079 lgao 3779
1080     /* Now we have the part of RAP[r,c] whose rows "r" are given by the
1081     column list of P->col_coupleBlock, and
1082 jfenwick 4345 whose columns "c" possibly cover the
1083     whole column range of the system matrix P. This part of the data is to
1084     be passed to the neighbouring processors, and added to the corresponding
1085     RAP[r,c] entries in the neighbouring processors */
1086 lgao 3779 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1087     &RAP_ext_val, global_id_P, block_size);
1088    
1089     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1090     sum = RAP_ext_ptr[num_RAPext_rows];
1091     num_RAPext_cols = 0;
1092     if (num_Pext_cols || sum > 0) {
1093 jfenwick 4275 temp = new index_t[num_Pext_cols+sum];
1094 lgao 3779 j1_ub = offset + num_Pmain_cols;
1095     for (i=0, iptr=0; i<sum; i++) {
1096 lgao 3819 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1097 lgao 3779 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1098     }
1099     for (j=0; j<num_Pext_cols; j++, iptr++){
1100     temp[iptr] = global_id_P[j];
1101     }
1102    
1103     if (iptr) {
1104     #ifdef USE_QSORTG
1105 caltinay 4803 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1106 lgao 3779 #else
1107 caltinay 4803 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1108 lgao 3779 #endif
1109     num_RAPext_cols = 1;
1110     i = temp[0];
1111     for (j=1; j<iptr; j++) {
1112     if (temp[j] > i) {
1113     i = temp[j];
1114     temp[num_RAPext_cols++] = i;
1115     }
1116     }
1117     }
1118     }
1119    
1120     /* build global_id_RAP and renumber the cols of RAP_ext accordingly */
1121     if(num_RAPext_cols){
1122 jfenwick 4275 global_id_RAP = new index_t[num_RAPext_cols];
1123 lgao 3821 #pragma omp parallel for private(i) schedule(static)
1124 lgao 3779 for (i=0; i<num_RAPext_cols; i++)
1125     global_id_RAP[i] = temp[i];
1126     }
1127     if (num_Pext_cols || sum > 0)
1128 jfenwick 4275 delete[] temp;
1129 lgao 3779 j1_ub = offset + num_Pmain_cols;
1130 lgao 3821 #pragma omp parallel for private(i, where_p) schedule(static)
1131 lgao 3779 for (i=0; i<sum; i++) {
1132     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1133     where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1134 caltinay 4803 /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex);
1135 lgao 3779 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1136     } else
1137     RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1138     }
1139    
1140     /* build the mapping */
1141     if (num_Pcouple_cols) {
1142 jfenwick 4275 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1143 lgao 3779 iptr = 0;
1144     for (i=0; i<num_RAPext_cols; i++)
1145     if (global_id_RAP[i] == P->global_id[iptr]) {
1146     Pcouple_to_RAP[iptr++] = i;
1147     if (iptr == num_Pcouple_cols) break;
1148     }
1149     }
1150    
1151     if (num_Pext_cols) {
1152 jfenwick 4275 Pext_to_RAP = new index_t[num_Pext_cols];
1153 lgao 3779 iptr = 0;
1154     for (i=0; i<num_RAPext_cols; i++)
1155     if (global_id_RAP[i] == global_id_P[iptr]) {
1156     Pext_to_RAP[iptr++] = i;
1157     if (iptr == num_Pext_cols) break;
1158     }
1159     }
1160    
1161     if (global_id_P){
1162 jfenwick 4275 delete[] global_id_P;
1163 lgao 3779 global_id_P = NULL;
1164     }
1165    
1166 jfenwick 4345 /* allocate and initialise the markers */
1167 lgao 3779 sum = num_RAPext_cols + num_Pmain_cols;
1168 jfenwick 4275 P_marker = new index_t[sum];
1169 lgao 3779 #pragma omp parallel for private(i) schedule(static)
1170     for (i=0; i<sum; i++) P_marker[i] = -1;
1171     #pragma omp parallel for private(i) schedule(static)
1172     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1173    
1174     /* Now, count the size of RAP. Start with rows in R_main */
1175     num_neighbors = P->col_coupler->connector->send->numNeighbors;
1176     offsetInShared = P->col_coupler->connector->send->offsetInShared;
1177     shared = P->col_coupler->connector->send->shared;
1178     i = 0;
1179     j = 0;
1180     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1181     /* Mark the start of row for both main block and couple block */
1182     row_marker = i;
1183     row_marker_ext = j;
1184    
1185     /* Mark the diagonal element RAP[i_r, i_r], and other elements
1186     in RAP_ext */
1187     P_marker[i_r] = i;
1188     i++;
1189     for (j1=0; j1<num_neighbors; j1++) {
1190     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1191     if (shared[j2] == i_r) {
1192     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1193     i_c = RAP_ext_idx[k];
1194     if (i_c < num_Pmain_cols) {
1195     if (P_marker[i_c] < row_marker) {
1196     P_marker[i_c] = i;
1197     i++;
1198     }
1199     } else {
1200     if (P_marker[i_c] < row_marker_ext) {
1201     P_marker[i_c] = j;
1202     j++;
1203     }
1204     }
1205     }
1206     break;
1207     }
1208     }
1209     }
1210    
1211     /* then, loop over elements in row i_r of R_main */
1212     j1_ub = R_main->pattern->ptr[i_r+1];
1213     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1214     i1 = R_main->pattern->index[j1];
1215    
1216     /* then, loop over elements in row i1 of A->col_coupleBlock */
1217     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1218     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1219     i2 = A->col_coupleBlock->pattern->index[j2];
1220    
1221     /* check whether entry RA[i_r, i2] has been previously visited.
1222     A new RAP entry is possible only if entry RA[i_r, i2] has not
1223     been visited yet */
1224     if (A_marker[i2] != i_r) {
1225     /* first, mark entry RA[i_r,i2] as visited */
1226     A_marker[i2] = i_r;
1227    
1228     /* then loop over elements in row i2 of P_ext_main */
1229     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1230     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1231     i_c = P->row_coupleBlock->pattern->index[j3];
1232    
1233     /* check whether entry RAP[i_r,i_c] has been created.
1234     If not yet created, create the entry and increase
1235     the total number of elements in RAP_ext */
1236     if (P_marker[i_c] < row_marker) {
1237     P_marker[i_c] = i;
1238     i++;
1239     }
1240     }
1241    
1242     /* loop over elements in row i2 of P_ext_couple, do the same */
1243     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1244     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1245     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1246     if (P_marker[i_c] < row_marker_ext) {
1247     P_marker[i_c] = j;
1248     j++;
1249     }
1250     }
1251     }
1252     }
1253    
1254     /* now loop over elements in row i1 of A->mainBlock, repeat
1255     the process we applied to block A->col_coupleBlock */
1256     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1257     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1258     i2 = A->mainBlock->pattern->index[j2];
1259     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1260     A_marker[i2 + num_Acouple_cols] = i_r;
1261     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1262     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1263     i_c = P->mainBlock->pattern->index[j3];
1264     if (P_marker[i_c] < row_marker) {
1265     P_marker[i_c] = i;
1266     i++;
1267     }
1268     }
1269     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1270     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1271     /* note that we need to map the column index in
1272     P->col_coupleBlock back into the column index in
1273     P_ext_couple */
1274     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1275     if (P_marker[i_c] < row_marker_ext) {
1276     P_marker[i_c] = j;
1277     j++;
1278     }
1279     }
1280     }
1281     }
1282     }
1283     }
1284    
1285     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1286     Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1287     and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1288     RAP_couple */
1289 jfenwick 4275 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1290     RAP_main_idx = new index_t[i];
1291     RAP_main_val = new double[i * block_size];
1292     RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1293     RAP_couple_idx = new index_t[j];
1294     RAP_couple_val = new double[j * block_size];
1295 lgao 3779
1296     RAP_main_ptr[num_Pmain_cols] = i;
1297     RAP_couple_ptr[num_Pmain_cols] = j;
1298    
1299     #pragma omp parallel for private(i) schedule(static)
1300     for (i=0; i<sum; i++) P_marker[i] = -1;
1301     #pragma omp parallel for private(i) schedule(static)
1302     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1303    
1304     /* Now, fill in the data for RAP_main and RAP_couple. Start with rows
1305     in R_main */
1306     i = 0;
1307     j = 0;
1308     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1309     /* Mark the start of row for both main block and couple block */
1310     row_marker = i;
1311     row_marker_ext = j;
1312     RAP_main_ptr[i_r] = row_marker;
1313     RAP_couple_ptr[i_r] = row_marker_ext;
1314    
1315     /* Mark and set up the diagonal element RAP[i_r, i_r], and elements
1316     in row i_r of RAP_ext */
1317     P_marker[i_r] = i;
1318     for (ib=0; ib<block_size; ib++)
1319     RAP_main_val[i*block_size+ib] = ZERO;
1320     RAP_main_idx[i] = i_r;
1321     i++;
1322    
1323     for (j1=0; j1<num_neighbors; j1++) {
1324     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1325     if (shared[j2] == i_r) {
1326     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1327     i_c = RAP_ext_idx[k];
1328     if (i_c < num_Pmain_cols) {
1329     if (P_marker[i_c] < row_marker) {
1330     P_marker[i_c] = i;
1331     memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1332     RAP_main_idx[i] = i_c;
1333     i++;
1334     } else {
1335     t1_val = &(RAP_ext_val[k*block_size]);
1336     t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1337     for (ib=0; ib<block_size; ib++)
1338     t2_val[ib] += t1_val[ib];
1339     }
1340     } else {
1341     if (P_marker[i_c] < row_marker_ext) {
1342     P_marker[i_c] = j;
1343     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1344     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1345     j++;
1346     } else {
1347     t1_val = &(RAP_ext_val[k*block_size]);
1348     t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1349     for (ib=0; ib<block_size; ib++)
1350     t2_val[ib] += t1_val[ib];
1351     }
1352     }
1353     }
1354     break;
1355     }
1356     }
1357     }
1358    
1359     /* then, loop over elements in row i_r of R_main */
1360     j1_ub = R_main->pattern->ptr[i_r+1];
1361     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1362     i1 = R_main->pattern->index[j1];
1363     R_val = &(R_main->val[j1*P_block_size]);
1364    
1365     /* then, loop over elements in row i1 of A->col_coupleBlock */
1366     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1367     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1368     i2 = A->col_coupleBlock->pattern->index[j2];
1369     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1370     for (irb=0; irb<row_block_size; irb++)
1371     for (icb=0; icb<col_block_size; icb++)
1372     RA_val[irb+row_block_size*icb]=ZERO;
1373     for (irb=0; irb<P_block_size; irb++) {
1374     rtmp=R_val[irb];
1375     for (icb=0; icb<col_block_size; icb++) {
1376     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1377     }
1378     }
1379    
1380    
1381     /* check whether entry RA[i_r, i2] has been previously visited.
1382     A new RAP entry is possible only if entry RA[i_r, i2] has not
1383     been visited yet */
1384     if (A_marker[i2] != i_r) {
1385     /* first, mark entry RA[i_r,i2] as visited */
1386     A_marker[i2] = i_r;
1387    
1388     /* then loop over elements in row i2 of P_ext_main */
1389     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1390     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1391     i_c = P->row_coupleBlock->pattern->index[j3];
1392     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1393     for (irb=0; irb<row_block_size; irb++)
1394     for (icb=0; icb<col_block_size; icb++)
1395     RAP_val[irb+row_block_size*icb]=ZERO;
1396     for (icb=0; icb<P_block_size; icb++) {
1397     rtmp=temp_val[icb];
1398     for (irb=0; irb<row_block_size; irb++) {
1399     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1400     }
1401     }
1402    
1403    
1404     /* check whether entry RAP[i_r,i_c] has been created.
1405     If not yet created, create the entry and increase
1406     the total number of elements in RAP_main */
1407     if (P_marker[i_c] < row_marker) {
1408     P_marker[i_c] = i;
1409     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1410     RAP_main_idx[i] = i_c;
1411     i++;
1412     } else {
1413     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1414     for (ib=0; ib<block_size; ib++) {
1415     temp_val[ib] += RAP_val[ib];
1416     }
1417     }
1418     }
1419    
1420     /* loop over elements in row i2 of P_ext_couple, do the same */
1421     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1422     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1423     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1424     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1425     for (irb=0; irb<row_block_size; irb++)
1426     for (icb=0; icb<col_block_size; icb++)
1427     RAP_val[irb+row_block_size*icb]=ZERO;
1428     for (icb=0; icb<P_block_size; icb++) {
1429     rtmp=temp_val[icb];
1430     for (irb=0; irb<row_block_size; irb++) {
1431     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1432     }
1433     }
1434    
1435     if (P_marker[i_c] < row_marker_ext) {
1436     P_marker[i_c] = j;
1437     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1438     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1439     j++;
1440     } else {
1441     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1442     for (ib=0; ib<block_size; ib++) {
1443     temp_val[ib] += RAP_val[ib];
1444     }
1445     }
1446     }
1447    
1448     /* If entry RA[i_r, i2] has already been visited, no new RAP entry
1449     is created; only the contributions are added. */
1450     } else {
1451     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1452     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1453     i_c = P->row_coupleBlock->pattern->index[j3];
1454     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1455     for (irb=0; irb<row_block_size; irb++)
1456     for (icb=0; icb<col_block_size; icb++)
1457     RAP_val[irb+row_block_size*icb]=ZERO;
1458     for (icb=0; icb<P_block_size; icb++) {
1459     rtmp=temp_val[icb];
1460     for (irb=0; irb<row_block_size; irb++) {
1461     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1462     }
1463     }
1464    
1465     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1466     for (ib=0; ib<block_size; ib++) {
1467     temp_val[ib] += RAP_val[ib];
1468     }
1469     }
1470     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1471     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1472     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1473     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1474     for (irb=0; irb<row_block_size; irb++)
1475     for (icb=0; icb<col_block_size; icb++)
1476     RAP_val[irb+row_block_size*icb]=ZERO;
1477     for (icb=0; icb<P_block_size; icb++) {
1478     rtmp=temp_val[icb];
1479     for (irb=0; irb<row_block_size; irb++) {
1480     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1481     }
1482     }
1483    
1484     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1485     for (ib=0; ib<block_size; ib++) {
1486     temp_val[ib] += RAP_val[ib];
1487     }
1488     }
1489     }
1490     }
1491    
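    /* Every nonzero here is a dense block stored column-major: entry
       (irb, icb) lives at irb + row_block_size*icb. The triple product is
       formed in two steps per index pair, RA = R_block * A_block followed by
       RAP += RA * P_block. A sketch of one such accumulating block multiply
       under that layout (hypothetical helper, not part of this file):

           // C += A * B for column-major nr-by-nk and nk-by-nc dense blocks
           void block_gemm_acc(dim_t nr, dim_t nk, dim_t nc,
                               double* C, const double* A, const double* B)
           {
               for (dim_t ic=0; ic<nc; ic++)
                   for (dim_t k=0; k<nk; k++) {
                       const double b = B[k + nk*ic];
                       for (dim_t ir=0; ir<nr; ir++)
                           C[ir + nr*ic] += A[ir + nr*k] * b;
                   }
           }
    */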
1492     /* now loop over elements in row i1 of A->mainBlock and repeat
1493     the process used above for A->col_coupleBlock */
1494     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1495     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1496     i2 = A->mainBlock->pattern->index[j2];
1497     temp_val = &(A->mainBlock->val[j2*block_size]);
1498     for (irb=0; irb<row_block_size; irb++)
1499     for (icb=0; icb<col_block_size; icb++)
1500     RA_val[irb+row_block_size*icb]=ZERO;
1501     for (irb=0; irb<P_block_size; irb++) {
1502     rtmp=R_val[irb];
1503     for (icb=0; icb<col_block_size; icb++) {
1504     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1505     }
1506     }
1507    
1508     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1509     A_marker[i2 + num_Acouple_cols] = i_r;
1510     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1511     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1512     i_c = P->mainBlock->pattern->index[j3];
1513     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1514     for (irb=0; irb<row_block_size; irb++)
1515     for (icb=0; icb<col_block_size; icb++)
1516     RAP_val[irb+row_block_size*icb]=ZERO;
1517     for (icb=0; icb<P_block_size; icb++) {
1518     rtmp=temp_val[icb];
1519     for (irb=0; irb<row_block_size; irb++) {
1520     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1521     }
1522     }
1523     if (P_marker[i_c] < row_marker) {
1524     P_marker[i_c] = i;
1525     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1526     RAP_main_idx[i] = i_c;
1527     i++;
1528     } else {
1529     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1530     for (ib=0; ib<block_size; ib++) {
1531     temp_val[ib] += RAP_val[ib];
1532     }
1533     }
1534     }
1535     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1536     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1537     /* note that we need to map the column index in
1538     P->col_coupleBlock back into the column index in
1539     P_ext_couple */
1540     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1541     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1542     for (irb=0; irb<row_block_size; irb++)
1543     for (icb=0; icb<col_block_size; icb++)
1544     RAP_val[irb+row_block_size*icb]=ZERO;
1545     for (icb=0; icb<P_block_size; icb++) {
1546     rtmp=temp_val[icb];
1547     for (irb=0; irb<row_block_size; irb++) {
1548     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1549     }
1550     }
1551    
1552     if (P_marker[i_c] < row_marker_ext) {
1553     P_marker[i_c] = j;
1554     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1555     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1556     j++;
1557     } else {
1558     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1559     for (ib=0; ib<block_size; ib++) {
1560     temp_val[ib] += RAP_val[ib];
1561     }
1562     }
1563     }
1564    
1565     } else {
1566     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1567     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1568     i_c = P->mainBlock->pattern->index[j3];
1569     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1570     for (irb=0; irb<row_block_size; irb++)
1571     for (icb=0; icb<col_block_size; icb++)
1572     RAP_val[irb+row_block_size*icb]=ZERO;
1573     for (icb=0; icb<P_block_size; icb++) {
1574     rtmp=temp_val[icb];
1575     for (irb=0; irb<row_block_size; irb++) {
1576     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1577     }
1578     }
1579    
1580     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1581     for (ib=0; ib<block_size; ib++) {
1582     temp_val[ib] += RAP_val[ib];
1583     }
1584     }
1585     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1586     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1587     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1588     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1589     for (irb=0; irb<row_block_size; irb++)
1590     for (icb=0; icb<col_block_size; icb++)
1591     RAP_val[irb+row_block_size*icb]=ZERO;
1592     for (icb=0; icb<P_block_size; icb++) {
1593     rtmp = temp_val[icb];
1594     for (irb=0; irb<row_block_size; irb++) {
1595     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1596     }
1597     }
1598    
1599     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1600     for (ib=0; ib<block_size; ib++) {
1601     temp_val[ib] += RAP_val[ib];
1602     }
1603     }
1604     }
1605     }
1606     }
1607    
1608     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1609     if (i > row_marker) {
1610     offset = i - row_marker;
1611 jfenwick 4275 temp = new index_t[offset];
1612 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1613 lgao 3779 for (iptr=0; iptr<offset; iptr++)
1614     temp[iptr] = RAP_main_idx[row_marker+iptr];
1615     if (offset > 0) {
1616     #ifdef USE_QSORTG
1617 caltinay 4803 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1618 lgao 3779 #else
1619 caltinay 4803 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1620 lgao 3779 #endif
1621     }
1622 jfenwick 4275 temp_val = new double[offset * block_size];
1623 lgao 3821 #pragma omp parallel for schedule(static) private(iptr,k)
1624 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1625     k = P_marker[temp[iptr]];
1626     memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1627     P_marker[temp[iptr]] = iptr + row_marker;
1628     }
1629 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1630 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1631     RAP_main_idx[row_marker+iptr] = temp[iptr];
1632     memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1633     }
1634 jfenwick 4275 delete[] temp;
1635     delete[] temp_val;
1636 lgao 3779 }
1637     if (j > row_marker_ext) {
1638     offset = j - row_marker_ext;
1639 jfenwick 4275 temp = new index_t[offset];
1640 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1641 lgao 3779 for (iptr=0; iptr<offset; iptr++)
1642     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1643     if (offset > 0) {
1644     #ifdef USE_QSORTG
1645 caltinay 4803 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1646 lgao 3779 #else
1647 caltinay 4803 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1648 lgao 3779 #endif
1649     }
1650 jfenwick 4275 temp_val = new double[offset * block_size];
1651 lgao 3821 #pragma omp parallel for schedule(static) private(iptr, k)
1652 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1653     k = P_marker[temp[iptr] + num_Pmain_cols];
1654     memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1655     P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1656     }
1657 lgao 3821 #pragma omp parallel for schedule(static) private(iptr)
1658 lgao 3779 for (iptr=0; iptr<offset; iptr++){
1659     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1660     memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1661     }
1662 jfenwick 4275 delete[] temp;
1663     delete[] temp_val;
1664 lgao 3779 }
1665     }
1666    
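    /* Each finished row was sorted by column index just above, with the value
       blocks permuted to match. The same reordering could be expressed with
       the standard library via an index permutation (sketch only, assuming
       <algorithm>, <numeric> and <vector>; the code above keeps the
       qsort/qsortG path):

           void sort_row(index_t* idx, double* val, dim_t n, dim_t bs)
           {
               std::vector<dim_t> perm(n);
               std::iota(perm.begin(), perm.end(), 0);
               std::sort(perm.begin(), perm.end(),
                         [&](dim_t a, dim_t b) { return idx[a] < idx[b]; });
               const std::vector<index_t> tmp_idx(idx, idx+n);
               const std::vector<double> tmp_val(val, val+n*bs);
               for (dim_t p=0; p<n; p++) {
                   idx[p] = tmp_idx[perm[p]];
                   std::copy(tmp_val.begin() + perm[p]*bs,
                             tmp_val.begin() + (perm[p]+1)*bs, &val[p*bs]);
               }
           }
    */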
1667 jfenwick 4275 delete[] RA_val;
1668     delete[] RAP_val;
1669     delete[] A_marker;
1670     delete[] Pext_to_RAP;
1671     delete[] Pcouple_to_RAP;
1672     delete[] RAP_ext_ptr;
1673     delete[] RAP_ext_idx;
1674     delete[] RAP_ext_val;
1675 caltinay 4797 paso::SparseMatrix_free(R_main);
1676     paso::SparseMatrix_free(R_couple);
1677 lgao 3779
1678     /* Check whether there are empty columns in RAP_couple */
1679     #pragma omp parallel for schedule(static) private(i)
1680     for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1681     /* the number of non-empty columns is stored in "k" */
1682     k = 0;
1683     j = RAP_couple_ptr[num_Pmain_cols];
1684     for (i=0; i<j; i++) {
1685     i1 = RAP_couple_idx[i];
1686     if (P_marker[i1]) {
1687     P_marker[i1] = 0;
1688     k++;
1689     }
1690     }
1691    
1692     /* empty columns were found, compress them out */
1693     if (k < num_RAPext_cols) {
1694 jfenwick 4275 temp = new index_t[k];
1695 lgao 3779 k = 0;
1696     for (i=0; i<num_RAPext_cols; i++)
1697     if (!P_marker[i]) {
1698     P_marker[i] = k;
1699     temp[k] = global_id_RAP[i];
1700     k++;
1701     }
1702 lgao 3821 #pragma omp parallel for schedule(static) private(i, i1)
1703 lgao 3779 for (i=0; i<j; i++) {
1704     i1 = RAP_couple_idx[i];
1705     RAP_couple_idx[i] = P_marker[i1];
1706     }
1707     num_RAPext_cols = k;
1708 jfenwick 4275 delete[] global_id_RAP;
1709 lgao 3779 global_id_RAP = temp;
1710     }
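    /* The compression above builds an old-to-new column map in P_marker
       (0 marked a used column during the scan, and is then overwritten with
       the new index) while compacting global_id_RAP in the same pass, so a
       single remap of RAP_couple_idx suffices. The same idea in isolation
       (hypothetical names):

           dim_t new_col = 0;
           for (dim_t old=0; old<num_cols; old++)
               if (col_used[old]) old2new[old] = new_col++;
           for (dim_t e=0; e<nnz; e++)
               couple_idx[e] = old2new[couple_idx[e]];
    */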
1711 jfenwick 4275 delete[] P_marker;
1712 lgao 3779
1713     /******************************************************/
1714 jfenwick 4345     /* Start creating the coarse-level system matrix A_c */
1715 lgao 3779 /******************************************************/
1716     /* first, prepare the sender/receiver for the col_connector */
1717     dist = P->pattern->input_distribution->first_component;
1718 jfenwick 4275 recv_len = new dim_t[size];
1719     send_len = new dim_t[size];
1720     neighbor = new Esys_MPI_rank[size];
1721     offsetInShared = new index_t[size+1];
1722     shared = new index_t[num_RAPext_cols];
1723 lgao 3779 memset(recv_len, 0, sizeof(dim_t) * size);
1724     memset(send_len, 0, sizeof(dim_t) * size);
1725     num_neighbors = 0;
1726     offsetInShared[0] = 0;
1727     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1728     shared[i] = i + num_Pmain_cols;
1729     if (k <= global_id_RAP[i]) {
1730     if (recv_len[j] > 0) {
1731     neighbor[num_neighbors] = j;
1732     num_neighbors ++;
1733     offsetInShared[num_neighbors] = i;
1734     }
1735     while (k <= global_id_RAP[i]) {
1736     j++;
1737     k = dist[j+1];
1738     }
1739     }
1740     recv_len[j] ++;
1741     }
1742     if (recv_len[j] > 0) {
1743     neighbor[num_neighbors] = j;
1744     num_neighbors ++;
1745     offsetInShared[num_neighbors] = i;
1746     }
1747    
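    /* global_id_RAP is sorted, so the sweep above finds the owner of every
       external column in one pass over the block distribution: rank q owns
       the global indices in [dist[q], dist[q+1]). The ownership rule in
       isolation (hypothetical helper):

           int owner_of(index_t g, const index_t* dist, int size)
           {
               int q = 0;
               while (q < size && dist[q+1] <= g) q++;
               return q;
           }
    */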
1748 caltinay 4816 paso::SharedComponents_ptr recv(new paso::SharedComponents(
1749     num_Pmain_cols, num_neighbors, neighbor, shared,
1750     offsetInShared, 1, 0, mpi_info));
1751    
1752 lgao 3828 #ifdef ESYS_MPI
1753 lgao 3779 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1754 lgao 3828 #endif
1755 lgao 3779
1756     #ifdef ESYS_MPI
1757 jfenwick 4275 mpi_requests=new MPI_Request[size*2];
1758     mpi_stati=new MPI_Status[size*2];
1759 lgao 3779 #else
1760 jfenwick 4275 mpi_requests=new int[size*2];
1761     mpi_stati=new int[size*2];
1762 lgao 3779 #endif
1763     num_neighbors = 0;
1764     j = 0;
1765     offsetInShared[0] = 0;
1766     for (i=0; i<size; i++) {
1767     if (send_len[i] > 0) {
1768     neighbor[num_neighbors] = i;
1769     num_neighbors ++;
1770     j += send_len[i];
1771     offsetInShared[num_neighbors] = j;
1772     }
1773     }
1774 jfenwick 4275 delete[] shared;
1775     shared = new index_t[j];
1776 lgao 3779 for (i=0, j=0; i<num_neighbors; i++) {
1777     k = neighbor[i];
1778 lgao 3828 #ifdef ESYS_MPI
1779 lgao 3779 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1780     mpi_info->msg_tag_counter+k,
1781     mpi_info->comm, &mpi_requests[i]);
1782 lgao 3828 #endif
1783 lgao 3779 j += send_len[k];
1784     }
1785     for (i=0, j=0; i<recv->numNeighbors; i++) {
1786     k = recv->neighbor[i];
1787 lgao 3828 #ifdef ESYS_MPI
1788 lgao 3779 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1789     mpi_info->msg_tag_counter+rank,
1790     mpi_info->comm, &mpi_requests[i+num_neighbors]);
1791 lgao 3828 #endif
1792 lgao 3779 j += recv_len[k];
1793     }
1794 lgao 3828 #ifdef ESYS_MPI
1795 lgao 3779 MPI_Waitall(num_neighbors + recv->numNeighbors,
1796     mpi_requests, mpi_stati);
1797 lgao 3828 #endif
1798 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1799 lgao 3779
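    /* The exchange above is the nonblocking handshake used throughout this
       file: post every MPI_Irecv first, then the matching MPI_Issend's, wait
       on all requests, and finally advance the shared message tag so the next
       round cannot be confused with this one. The bare shape of the pattern
       (sketch with hypothetical buffers and counts, error handling omitted):

           for (int p=0; p<nrecv; p++)
               MPI_Irecv(rbuf[p], rcount[p], MPI_INT, rneigh[p],
                         tag + rneigh[p], comm, &req[p]);
           for (int p=0; p<nsend; p++)
               MPI_Issend(sbuf[p], scount[p], MPI_INT, sneigh[p],
                          tag + rank, comm, &req[nrecv+p]);
           MPI_Waitall(nrecv + nsend, req, stat);
    */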
1800     j = offsetInShared[num_neighbors];
1801     offset = dist[rank];
1802 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1803 lgao 3779 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1804    
1805 caltinay 4816 paso::SharedComponents_ptr send(new paso::SharedComponents(
1806     num_Pmain_cols, num_neighbors, neighbor, shared,
1807     offsetInShared, 1, 0, mpi_info));
1808    
1809 caltinay 4817 col_connector.reset(new paso::Connector(send, recv));
1810 jfenwick 4275 delete[] shared;
1811 lgao 3779
1812     /* now, create the row distribution (output_dist) and the col
1813     distribution (input_dist) */
1814 caltinay 4814 input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1815     output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1816 lgao 3779
1817     /* then, prepare the sender/receiver for the row_connector; first, prepare
1818     the information for the sender */
1819     sum = RAP_couple_ptr[num_Pmain_cols];
1820 jfenwick 4275 len = new dim_t[size];
1821     send_ptr = new index_t*[size];
1822     send_idx = new index_t*[size];
1823 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1824 lgao 3779 for (i=0; i<size; i++) {
1825 jfenwick 4275 send_ptr[i] = new index_t[num_Pmain_cols];
1826     send_idx[i] = new index_t[sum];
1827 lgao 3779 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1828     }
1829     memset(len, 0, sizeof(dim_t) * size);
1830     recv = col_connector->recv;
1831     sum=0;
1832     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1833     i1 = RAP_couple_ptr[i_r];
1834     i2 = RAP_couple_ptr[i_r+1];
1835     if (i2 > i1) {
1836     /* then row i_r will be in the sender of row_connector, now check
1837 jfenwick 4345     how many neighbours row i_r needs to be sent to */
1838 lgao 3779 for (j=i1; j<i2; j++) {
1839     i_c = RAP_couple_idx[j];
1840     /* find out the corresponding neighbor "p" of column i_c */
1841     for (p=0; p<recv->numNeighbors; p++) {
1842     if (i_c < recv->offsetInShared[p+1]) {
1843     k = recv->neighbor[p];
1844     if (send_ptr[k][i_r] == 0) sum++;
1845     send_ptr[k][i_r] ++;
1846     send_idx[k][len[k]] = global_id_RAP[i_c];
1847     len[k] ++;
1848     break;
1849     }
1850     }
1851     }
1852     }
1853     }
1854     if (global_id_RAP) {
1855 jfenwick 4275 delete[] global_id_RAP;
1856 lgao 3779 global_id_RAP = NULL;
1857     }
1858    
1859     /* now allocate the sender */
1860 jfenwick 4275 shared = new index_t[sum];
1861 lgao 3779 memset(send_len, 0, sizeof(dim_t) * size);
1862     num_neighbors=0;
1863     offsetInShared[0] = 0;
1864     for (p=0, k=0; p<size; p++) {
1865     for (i=0; i<num_Pmain_cols; i++) {
1866     if (send_ptr[p][i] > 0) {
1867     shared[k] = i;
1868     k++;
1869     send_ptr[p][send_len[p]] = send_ptr[p][i];
1870     send_len[p]++;
1871     }
1872     }
1873     if (k > offsetInShared[num_neighbors]) {
1874     neighbor[num_neighbors] = p;
1875     num_neighbors ++;
1876     offsetInShared[num_neighbors] = k;
1877     }
1878     }
1879 caltinay 4816 send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1880     neighbor, shared, offsetInShared, 1, 0, mpi_info));
1881 lgao 3779
1882     /* send/recv the number of rows that will be sent from the current proc;
1883     this recovers the receiver info of row_connector from the sender */
1884 lgao 3828 #ifdef ESYS_MPI
1885 lgao 3779 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1886 lgao 3828 #endif
1887 lgao 3779 num_neighbors = 0;
1888     offsetInShared[0] = 0;
1889     j = 0;
1890     for (i=0; i<size; i++) {
1891     if (i != rank && recv_len[i] > 0) {
1892     neighbor[num_neighbors] = i;
1893     num_neighbors ++;
1894     j += recv_len[i];
1895     offsetInShared[num_neighbors] = j;
1896     }
1897     }
1898 jfenwick 4275 delete[] shared;
1899     delete[] recv_len;
1900     shared = new index_t[j];
1901 lgao 3779 k = offsetInShared[num_neighbors];
1902 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1903 lgao 3779 for (i=0; i<k; i++) {
1904     shared[i] = i + num_Pmain_cols;
1905     }
1906 caltinay 4816 recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1907     neighbor, shared, offsetInShared, 1, 0, mpi_info));
1908 caltinay 4817 row_connector.reset(new paso::Connector(send, recv));
1909 jfenwick 4275 delete[] shared;
1910 lgao 3779
1911     /* send/recv pattern->ptr for rowCoupleBlock */
1912     num_RAPext_rows = offsetInShared[num_neighbors];
1913 jfenwick 4275 row_couple_ptr = new index_t[num_RAPext_rows+1];
1914 lgao 3779 for (p=0; p<num_neighbors; p++) {
1915     j = offsetInShared[p];
1916     i = offsetInShared[p+1];
1917 lgao 3828 #ifdef ESYS_MPI
1918 lgao 3779 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1919     mpi_info->msg_tag_counter+neighbor[p],
1920     mpi_info->comm, &mpi_requests[p]);
1921 lgao 3828 #endif
1922 lgao 3779 }
1923     send = row_connector->send;
1924     for (p=0; p<send->numNeighbors; p++) {
1925 lgao 3828 #ifdef ESYS_MPI
1926 lgao 3779 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1927     MPI_INT, send->neighbor[p],
1928     mpi_info->msg_tag_counter+rank,
1929     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1930 lgao 3828 #endif
1931 lgao 3779 }
1932 lgao 3828 #ifdef ESYS_MPI
1933 lgao 3779 MPI_Waitall(num_neighbors + send->numNeighbors,
1934     mpi_requests, mpi_stati);
1935 lgao 3828 #endif
1936 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1937 jfenwick 4275 delete[] send_len;
1938 lgao 3779
1939     sum = 0;
1940     for (i=0; i<num_RAPext_rows; i++) {
1941     k = row_couple_ptr[i];
1942     row_couple_ptr[i] = sum;
1943     sum += k;
1944     }
1945     row_couple_ptr[num_RAPext_rows] = sum;
1946    
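    /* The loop above converts per-row entry counts into CSR offsets in place,
       i.e. an exclusive prefix sum: row_couple_ptr[i] becomes the sum of the
       counts of rows 0..i-1 and the final entry the total number of nonzero
       blocks. In isolation (hypothetical names):

           index_t running = 0;
           for (dim_t r=0; r<nrows; r++) {
               const index_t count = ptr[r];
               ptr[r] = running;
               running += count;
           }
           ptr[nrows] = running;   // total number of entries
    */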
1947     /* send/recv pattern->index for rowCoupleBlock */
1948     k = row_couple_ptr[num_RAPext_rows];
1949 jfenwick 4275 row_couple_idx = new index_t[k];
1950 lgao 3779 for (p=0; p<num_neighbors; p++) {
1951     j1 = row_couple_ptr[offsetInShared[p]];
1952     j2 = row_couple_ptr[offsetInShared[p+1]];
1953 lgao 3828 #ifdef ESYS_MPI
1954 lgao 3779 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1955     mpi_info->msg_tag_counter+neighbor[p],
1956     mpi_info->comm, &mpi_requests[p]);
1957 lgao 3828 #endif
1958 lgao 3779 }
1959     for (p=0; p<send->numNeighbors; p++) {
1960 lgao 3828 #ifdef ESYS_MPI
1961 lgao 3779 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1962     MPI_INT, send->neighbor[p],
1963     mpi_info->msg_tag_counter+rank,
1964     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1965 lgao 3828 #endif
1966 lgao 3779 }
1967 lgao 3828 #ifdef ESYS_MPI
1968 lgao 3779 MPI_Waitall(num_neighbors + send->numNeighbors,
1969     mpi_requests, mpi_stati);
1970 lgao 3828 #endif
1971 jfenwick 4505 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1972 lgao 3779
1973     offset = input_dist->first_component[rank];
1974     k = row_couple_ptr[num_RAPext_rows];
1975 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1976 lgao 3779 for (i=0; i<k; i++) {
1977     row_couple_idx[i] -= offset;
1978     }
1979 lgao 3821 #pragma omp parallel for schedule(static) private(i)
1980 lgao 3779 for (i=0; i<size; i++) {
1981 jfenwick 4275 delete[] send_ptr[i];
1982     delete[] send_idx[i];
1983 lgao 3779 }
1984 jfenwick 4275 delete[] send_ptr;
1985     delete[] send_idx;
1986     delete[] len;
1987     delete[] offsetInShared;
1988     delete[] neighbor;
1989     delete[] mpi_requests;
1990     delete[] mpi_stati;
1991 lgao 3779 Esys_MPIInfo_free(mpi_info);
1992    
1993     /* Now, we can create the patterns for mainBlock and the couple blocks */
1994 caltinay 4803 main_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1995 lgao 3779 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1996 caltinay 4803 col_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1997 lgao 3779 num_Pmain_cols, num_RAPext_cols,
1998     RAP_couple_ptr, RAP_couple_idx);
1999 caltinay 4803 row_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2000 lgao 3779 num_RAPext_rows, num_Pmain_cols,
2001     row_couple_ptr, row_couple_idx);
2002    
2003     /* next, create the system matrix */
2004 caltinay 4800 pattern = new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
2005 lgao 3779 output_dist, input_dist, main_pattern, col_couple_pattern,
2006     row_couple_pattern, col_connector, row_connector);
2007     out = Paso_SystemMatrix_alloc(A->type, pattern,
2008     row_block_size, col_block_size, FALSE);
2009    
2010     /* finally, fill in the data */
2011     memcpy(out->mainBlock->val, RAP_main_val,
2012     out->mainBlock->len* sizeof(double));
2013     memcpy(out->col_coupleBlock->val, RAP_couple_val,
2014     out->col_coupleBlock->len * sizeof(double));
2015    
2016     /* Clean up */
2017 caltinay 4800 paso::SystemMatrixPattern_free(pattern);
2018 caltinay 4803 paso::Pattern_free(main_pattern);
2019     paso::Pattern_free(col_couple_pattern);
2020     paso::Pattern_free(row_couple_pattern);
2021 jfenwick 4275 delete[] RAP_main_val;
2022     delete[] RAP_couple_val;
2023 lgao 3779 if (Esys_noError()) {
2024     return out;
2025     } else {
2026     Paso_SystemMatrix_free(out);
2027     return NULL;
2028     }
2029     }
2030    
2031    
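/* Usage sketch: both the routine above and the Block variant defined next
   realise the Galerkin coarse operator A_C = R * A * P from a fine-level
   matrix A, a prolongation P and the restriction R = P^T. Assuming only the
   signature visible below, a caller would do something like:

       Paso_SystemMatrix* A_C =
           Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);
       if (A_C == NULL) {
           // an error was raised via the Esys error mechanism
       }
*/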
2032     Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2033     Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2034     {
2035     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2036 caltinay 4797 paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
2037 lgao 3779 Paso_SystemMatrix *out=NULL;
2038 caltinay 4800 paso::SystemMatrixPattern *pattern=NULL;
2039 caltinay 4814 paso::Distribution_ptr input_dist, output_dist;
2040 caltinay 4816 paso::SharedComponents_ptr send, recv;
2041 caltinay 4817 paso::Connector_ptr col_connector, row_connector;
2042 caltinay 4803 paso::Pattern *main_pattern=NULL;
2043     paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2044 lgao 3779 const dim_t row_block_size=A->row_block_size;
2045     const dim_t col_block_size=A->col_block_size;
2046     const dim_t block_size = A->block_size;
2047     const double ZERO = 0.0;
2048     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2049     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2050     index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2051