/*****************************************************************************
*
* Copyright (c) 2003-2013 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development since 2012 by School of Earth Sciences
*
*****************************************************************************/


/************************************************************************************/

/* Paso: defines AMG Interpolation */

/************************************************************************************/

/* Author: l.gao@uq.edu.au */

/************************************************************************************/

#include "Paso.h"
#include "SparseMatrix.h"
#include "PasoUtil.h"
#include "Preconditioner.h"

/************************************************************************************

   Methods necessary for the AMG preconditioner:

   construct the n_C x n_C coarse-level operator A_C from the system matrix A
   and the prolongation matrix P.

   The coarsening operator A_C is defined by RAP, where R = P^T.

*************************************************************************************/
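/* A dimensional sketch (illustration only), assuming A is the n x n
   fine-level operator and P the n x n_C prolongation:

       R   = P^T        : n_C x n
       A_C = R * A * P  : (n_C x n)(n x n)(n x n_C) = n_C x n_C

   Entry-wise, A_C[i,j] = sum_{k,l} P[k,i] * A[k,l] * P[l,j], which is what
   the symbolic and numeric passes below accumulate block by block. */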

/* Extend the system matrix B with two extra sparse matrices:
       B_ext_main and B_ext_couple.
   The combination of these two sparse matrices represents
   the portion of B that is stored on neighbour procs and
   needed locally for the triple matrix product (B^T) A B.

   FOR NOW, we assume that the system matrix B has a NULL
   row_coupleBlock and a NULL remote_coupleBlock. Based on
   this assumption, we use the link row_coupleBlock for sparse
   matrix B_ext_main, and the link remote_coupleBlock for sparse
   matrix B_ext_couple.

   To be specific, B_ext (on processor k) is a group of rows
   in B, where the list of rows from processor q is given by
       A->col_coupler->connector->send->shared[rPtr...]
       rPtr=A->col_coupler->connector->send->offsetInShared[k]
   on q. */
void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
{
    Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
    Paso_Coupler *coupler=NULL;
    Paso_SharedComponents *send=NULL, *recv=NULL;
    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
    index_t *idx_m=NULL, *idx_c=NULL;
    index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
    index_t offset, len, block_size, block_size_size, max_num_cols;
    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
    dim_t *recv_degree=NULL, *send_degree=NULL;
    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;

    if (size == 1) return;

    if (B->row_coupleBlock) {
        Paso_SparseMatrix_free(B->row_coupleBlock);
        B->row_coupleBlock = NULL;
    }

    if (B->row_coupleBlock || B->remote_coupleBlock) {
        Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
        return;
    }

    /* send/receive the unknowns' global IDs */
    num_main_cols = B->mainBlock->numCols;
    cols = new double[num_main_cols];
    offset = B->col_distribution->first_component[rank];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
    if (B->global_id == NULL) {
        coupler = Paso_Coupler_alloc(B->col_coupler->connector, 1);
        Paso_Coupler_startCollect(coupler, cols);
    }

    recv_buf = new index_t[size];
    recv_degree = new dim_t[size];
    recv_offset = new index_t[size+1];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<size; i++) {
        recv_buf[i] = 0;
        recv_degree[i] = 1;
        recv_offset[i] = i;
    }

    block_size = B->block_size;
    block_size_size = block_size * sizeof(double);
    num_couple_cols = B->col_coupleBlock->numCols;
    send = A->col_coupler->connector->send;
    recv = A->col_coupler->connector->recv;
    num_neighbors = send->numNeighbors;
    p = send->offsetInShared[num_neighbors];
    len = p * B->col_distribution->first_component[size];
    send_buf = new double[len * block_size];
    send_idx = new index_t[len];
    send_offset = new index_t[(p+1)*2];
    send_degree = new dim_t[num_neighbors];
    i = num_main_cols + num_couple_cols;
    send_m = new double[i * block_size];
    send_c = new double[i * block_size];
    idx_m = new index_t[i];
    idx_c = new index_t[i];

    /* wait for the unknowns' global IDs to arrive */
    if (B->global_id == NULL) {
        Paso_Coupler_finishCollect(coupler);
        global_id = new index_t[num_couple_cols];
        #pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_couple_cols; ++i)
            global_id[i] = coupler->recv_buffer[i];
        B->global_id = global_id;
        Paso_Coupler_free(coupler);
    } else
        global_id = B->global_id;

    /* distribute the number of cols in the current col_coupleBlock to all ranks */
    #ifdef ESYS_MPI
    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
    #endif
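    /* Note: with recv_degree[i]=1 and recv_offset[i]=i as initialised above,
       this MPI_Allgatherv is effectively an MPI_Allgather of one int per
       rank; afterwards recv_buf[q] holds the col_coupleBlock column count
       of rank q. */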

    /* distribute the global ids of the columns to be considered to all ranks */
    len = 0;
    max_num_cols = 0;
    for (i=0; i<size; i++) {
        recv_degree[i] = recv_buf[i];
        recv_offset[i] = len;
        len += recv_buf[i];
        if (max_num_cols < recv_buf[i])
            max_num_cols = recv_buf[i];
    }
    recv_offset[size] = len;
    cols_array = new index_t[len];
    #ifdef ESYS_MPI
    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
    #endif

    /* first, prepare the ptr_ptr to be received */
    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_ptr = new index_t[(len+1) * 2];
    for (p=0; p<q; p++) {
        row = recv->offsetInShared[p];
        m = recv->offsetInShared[p + 1];
        #ifdef ESYS_MPI
        MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
                  A->mpi_info->comm,
                  &(A->col_coupler->mpi_requests[p]));
        #endif
    }
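    /* ptr_ptr receives, for every row to be fetched, an interleaved pair of
       degrees: ptr_ptr[2*i] counts the entries destined for B_ext_main and
       ptr_ptr[2*i+1] those destined for B_ext_couple, matching the
       send_offset[2*i]/send_offset[2*i+1] pairs filled in below. */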

    /* now prepare the rows to be sent (the degree, the offset and the data) */
    len = 0;
    i0 = 0;
    for (p=0; p<num_neighbors; p++) {
        i = i0;
        neighbor = send->neighbor[p];
        m_lb = B->col_distribution->first_component[neighbor];
        m_ub = B->col_distribution->first_component[neighbor + 1];
        j_ub = send->offsetInShared[p + 1];
        for (j=send->offsetInShared[p]; j<j_ub; j++) {
            row = send->shared[j];
            l_m = 0;
            l_c = 0;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            k_lb = B->col_coupleBlock->pattern->ptr[row];

            /* check part of col_coupleBlock for data to be passed @row */
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m > offset) break;
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }
            k_lb = k;

            /* check mainBlock for data to be passed @row to sparse
               matrix B_ext_couple */
            k_ub = B->mainBlock->pattern->ptr[row + 1];
            k = B->mainBlock->pattern->ptr[row];
            memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
            for (; k<k_ub; k++) {
                m = B->mainBlock->pattern->index[k] + offset;
                idx_c[l_c] = m;
                l_c++;
            }

            /* check the remaining part of col_coupleBlock for data to
               be passed @row to sparse matrix B_ext_couple */
            k = k_lb;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }

            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
            send_offset[2*i] = l_m;
            len += l_m;
            memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
            memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
            send_offset[2*i+1] = l_c;
            len += l_c;
            i++;
        }

        /* sending */
        #ifdef ESYS_MPI
        MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
                   A->mpi_info->msg_tag_counter+rank,
                   A->mpi_info->comm,
                   &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
        #endif
        send_degree[p] = len;
        i0 = i;
    }
    delete[] send_m;
    delete[] send_c;
    delete[] idx_m;
    delete[] idx_c;


    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_main = new index_t[(len+1)];
    ptr_couple = new index_t[(len+1)];

    #ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    #endif
    A->mpi_info->msg_tag_counter += size;

    j = 0;
    k = 0;
    ptr_main[0] = 0;
    ptr_couple[0] = 0;
    for (i=0; i<len; i++) {
        j += ptr_ptr[2*i];
        k += ptr_ptr[2*i+1];
        ptr_main[i+1] = j;
        ptr_couple[i+1] = k;
    }
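    /* Illustration only: if two rows arrive with the interleaved degrees
       ptr_ptr = {2,1, 3,0}, the loop above produces
           ptr_main   = {0, 2, 5}
           ptr_couple = {0, 1, 1}
       i.e. standard CSR offset arrays for the two target blocks. */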

    delete[] ptr_ptr;
    idx_main = new index_t[j];
    idx_couple = new index_t[k];
    ptr_idx = new index_t[j+k];
    ptr_val = new double[(j+k) * block_size];

    /* send/receive the index array */
    j = 0;
    k_ub = 0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        if (i > 0) {
            k_ub++;
            #ifdef ESYS_MPI
            MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
                      A->mpi_info->msg_tag_counter+recv->neighbor[p],
                      A->mpi_info->comm,
                      &(A->col_coupler->mpi_requests[p]));
            #endif
        }
        j += i;
    }

    j = 0;
    k_ub = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
        if (i > 0) {
            k_ub++;
            #ifdef ESYS_MPI
            MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
                       A->mpi_info->msg_tag_counter+rank,
                       A->mpi_info->comm,
                       &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
            #endif
        }
        j = send_degree[p];
    }

    #ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    #endif
    A->mpi_info->msg_tag_counter += size;

    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            idx_main[p] = ptr_idx[m+p];
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            idx_couple[p] = ptr_idx[k+p];
        }
    }
    delete[] ptr_idx;

    /* allocate pattern and sparse matrix for B_ext_main */
    pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, num_main_cols, ptr_main, idx_main);
    B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_main, B->row_block_size, B->col_block_size,
                FALSE);
    Paso_Pattern_free(pattern_main);

    /* allocate pattern and sparse matrix for B_ext_couple */
    pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, B->col_distribution->first_component[size],
                ptr_couple, idx_couple);
    B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_couple, B->row_block_size, B->col_block_size,
                FALSE);
    Paso_Pattern_free(pattern_couple);
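    /* Assumption (based on Paso's usual reference counting): the two
       Paso_SparseMatrix_alloc calls take their own references to the
       patterns, so the local references can be dropped immediately via
       Paso_Pattern_free without invalidating the new blocks. */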

    /* send/receive the value array */
    j = 0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        #ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(ptr_val[j]), i * block_size,
                      MPI_DOUBLE, recv->neighbor[p],
                      A->mpi_info->msg_tag_counter+recv->neighbor[p],
                      A->mpi_info->comm,
                      &(A->col_coupler->mpi_requests[p]));
        #endif
        j += (i * block_size);
    }

    j = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
        #ifdef ESYS_MPI
        if (i > 0)
            MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
                       A->mpi_info->msg_tag_counter+rank,
                       A->mpi_info->comm,
                       &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
        #endif
        j = send_degree[p];
    }

    #ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
    #endif
    A->mpi_info->msg_tag_counter += size;

    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
        }
    }
    delete[] ptr_val;


    /* release all temp memory allocation */
    delete[] cols;
    delete[] cols_array;
    delete[] recv_offset;
    delete[] recv_degree;
    delete[] recv_buf;
    delete[] send_buf;
    delete[] send_offset;
    delete[] send_degree;
    delete[] send_idx;
}
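/* (extendB is invoked below from Paso_Preconditioner_AMG_buildInterpolationOperator
   with B = P, so the two extra blocks describe the remote part of the
   prolongator.) */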

/* As defined, the sparse matrix (let's call it T) given by T(ptr, idx, val)
   has the same number of rows as P->col_coupleBlock->numCols. Now we need
   to copy blocks of data in T to the neighbour processors, defined by
       P->col_coupler->connector->recv->neighbor[k] where k is in
       [0, P->col_coupler->connector->recv->numNeighbors).
   The rows to be copied to neighbour processor k are in the list defined by
       P->col_coupler->connector->recv->offsetInShared[k] ...
       P->col_coupler->connector->recv->offsetInShared[k+1] */
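/* Note on direction: inside this routine the roles are deliberately swapped
   (send = connector->recv and recv = connector->send, see below), because
   the rows of T travel in the opposite direction to the usual coupler
   traffic. */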
void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
        index_t **p_ptr, index_t **p_idx, double **p_val,
        index_t *global_id, index_t block_size)
{
    Paso_SharedComponents *send=NULL, *recv=NULL;
    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
    index_t i, j, p, m, n, size;
    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
    index_t *ptr=*p_ptr, *idx=*p_idx;
    double *val=*p_val, *recv_val=NULL;
    #ifdef ESYS_MPI
    index_t rank = P->mpi_info->rank;
    #endif

    size = P->mpi_info->size;
    send = P->col_coupler->connector->recv;
    recv = P->col_coupler->connector->send;
    send_neighbors = send->numNeighbors;
    recv_neighbors = recv->numNeighbors;
    send_rows = P->col_coupleBlock->numCols;
    recv_rows = recv->offsetInShared[recv_neighbors];

    send_degree = new index_t[send_rows];
    recv_ptr = new index_t[recv_rows + 1];
    #pragma omp for schedule(static) private(i)
    for (i=0; i<send_rows; i++)
        send_degree[i] = ptr[i+1] - ptr[i];

    /* First, send/receive the degree */
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        #ifdef ESYS_MPI
        MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
                  P->mpi_info->comm,
                  &(P->col_coupler->mpi_requests[p]));
        #endif
    }
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        #ifdef ESYS_MPI
        MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
                   P->mpi_info->msg_tag_counter + rank,
                   P->mpi_info->comm,
                   &(P->col_coupler->mpi_requests[p+recv_neighbors]));
        #endif
    }
    #ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
    #endif
    P->mpi_info->msg_tag_counter += size;

    delete[] send_degree;
    m = Paso_Util_cumsum(recv_rows, recv_ptr);
    recv_ptr[recv_rows] = m;
    recv_idx = new index_t[m];
    recv_val = new double[m * block_size];
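    /* Paso_Util_cumsum turns the received per-row degrees in recv_ptr into
       CSR offsets in place and returns the total entry count m, so recv_idx
       and recv_val are sized for all incoming entries (times block_size for
       the values). */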

    /* Next, send/receive the index array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
        if (i > 0) {
            #ifdef ESYS_MPI
            MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
                      P->mpi_info->msg_tag_counter + recv->neighbor[p],
                      P->mpi_info->comm,
                      &(P->col_coupler->mpi_requests[p]));
            #endif
        }
        j += i;
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
            #ifdef ESYS_MPI
            MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
                       P->mpi_info->msg_tag_counter + rank,
                       P->mpi_info->comm,
                       &(P->col_coupler->mpi_requests[p+recv_neighbors]));
            #endif
            j += i;
        }
    }
    #ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
    #endif
    P->mpi_info->msg_tag_counter += size;

    /* Last, send/receive the data array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
        #ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
                      P->mpi_info->msg_tag_counter + recv->neighbor[p],
                      P->mpi_info->comm,
                      &(P->col_coupler->mpi_requests[p]));
        #endif
        j += (i*block_size);
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
            #ifdef ESYS_MPI
            MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
                       P->mpi_info->msg_tag_counter + rank,
                       P->mpi_info->comm,
                       &(P->col_coupler->mpi_requests[p+recv_neighbors]));
            #endif
            j += i * block_size;
        }
    }
    #ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
    #endif
    P->mpi_info->msg_tag_counter += size;

    /* Clean up and return with the received ptr, index and data arrays */
    delete[] ptr;
    delete[] idx;
    delete[] val;
    *p_ptr = recv_ptr;
    *p_idx = recv_idx;
    *p_val = recv_val;
}

Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
        Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
{
    Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
    Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
    Paso_SystemMatrix *out=NULL;
    Paso_SystemMatrixPattern *pattern=NULL;
    Paso_Distribution *input_dist=NULL, *output_dist=NULL;
    Paso_SharedComponents *send=NULL, *recv=NULL;
    Paso_Connector *col_connector=NULL, *row_connector=NULL;
    Paso_Pattern *main_pattern=NULL;
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern=NULL;
    const dim_t row_block_size=A->row_block_size;
    const dim_t col_block_size=A->col_block_size;
    const dim_t block_size = A->block_size;
    const dim_t P_block_size = P->block_size;
    const double ZERO = 0.0;
    double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
    double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
    index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
    index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
    index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
    index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
    index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
    index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
    index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
    index_t sum, i, j, k, iptr, irb, icb, ib;
    index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
    index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
    index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
    index_t row_marker_ext, *where_p=NULL;
    index_t **send_ptr=NULL, **send_idx=NULL;
    dim_t p, num_neighbors;
    dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
    Esys_MPI_rank *neighbor=NULL;
    #ifdef ESYS_MPI
    MPI_Request* mpi_requests=NULL;
    MPI_Status* mpi_stati=NULL;
    #else
    int *mpi_requests=NULL, *mpi_stati=NULL;
    #endif

    /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
          return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R); */

    /* two sparse matrices R_main and R_couple will be generated, as the
       transposes of P_main and P_col_couple, respectively. Note that
       R_couple is actually the row_coupleBlock of R (R=P^T) */
    R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
    if (size > 1)
        R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);

    /* generate P_ext, i.e. the portion of P that is stored on neighbour procs
       and needed locally for the triple matrix product RAP.
       To be specific, P_ext (on processor k) is the group of rows in P where
       the list of rows from processor q is given by
           A->col_coupler->connector->send->shared[rPtr...]
           rPtr=A->col_coupler->connector->send->offsetInShared[k]
       on q.
       P_ext is represented by the two sparse matrices P_ext_main and
       P_ext_couple */
    Paso_Preconditioner_AMG_extendB(A, P);
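    /* After this call, P->row_coupleBlock holds P_ext_main and
       P->remote_coupleBlock holds P_ext_couple (see extendB above). */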

    /* count the number of cols in P_ext_couple, resize the pattern of
       sparse matrix P_ext_couple with the new compressed order, and then
       build the col id mapping from P->col_coupleBlock to P_ext_couple */
    num_Pmain_cols = P->mainBlock->numCols;
    if (size > 1) {
        num_Pcouple_cols = P->col_coupleBlock->numCols;
        num_Acouple_cols = A->col_coupleBlock->numCols;
        sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
    } else {
        num_Pcouple_cols = 0;
        num_Acouple_cols = 0;
        sum = 0;
    }
    num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
    offset = P->col_distribution->first_component[rank];
    num_Pext_cols = 0;
    if (P->global_id) {
        /* first count the number of cols "num_Pext_cols" in both P_ext_couple
           and P->col_coupleBlock */
        iptr = 0;
        if (num_Pcouple_cols || sum > 0) {
            temp = new index_t[num_Pcouple_cols+sum];
            #pragma omp parallel for lastprivate(iptr) schedule(static)
            for (iptr=0; iptr<sum; iptr++) {
                temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
            }
            for (j=0; j<num_Pcouple_cols; j++, iptr++) {
                temp[iptr] = P->global_id[j];
            }
        }
        if (iptr) {
            #ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
            #else
            qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
            #endif
            num_Pext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_Pext_cols++] = i;
                }
            }
        }
        /* resize the pattern of P_ext_couple */
        if (num_Pext_cols) {
            global_id_P = new index_t[num_Pext_cols];
            #pragma omp parallel for private(i) schedule(static)
            for (i=0; i<num_Pext_cols; i++)
                global_id_P[i] = temp[i];
        }
        if (num_Pcouple_cols || sum > 0)
            delete[] temp;
        #pragma omp parallel for private(i, where_p) schedule(static)
        for (i=0; i<sum; i++) {
            where_p = (index_t *)bsearch(
                        &(P->remote_coupleBlock->pattern->index[i]),
                        global_id_P, num_Pext_cols,
                        sizeof(index_t), Paso_comparIndex);
            P->remote_coupleBlock->pattern->index[i] =
                        (index_t)(where_p - global_id_P);
        }

        /* build the mapping */
        if (num_Pcouple_cols) {
            Pcouple_to_Pext = new index_t[num_Pcouple_cols];
            iptr = 0;
            for (i=0; i<num_Pext_cols; i++)
                if (global_id_P[i] == P->global_id[iptr]) {
                    Pcouple_to_Pext[iptr++] = i;
                    if (iptr == num_Pcouple_cols) break;
                }
        }
    }
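    /* The linear scan above relies on both global_id_P (sorted and
       de-duplicated earlier) and P->global_id being in ascending order;
       the mapping is then a single merge pass over two sorted lists. */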

    /* alloc and initialise the markers */
    sum = num_Pext_cols + num_Pmain_cols;
    P_marker = new index_t[sum];
    A_marker = new index_t[num_A_cols];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
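    /* The marker arrays implement the sparse accumulator of a Gustavson-style
       sparse matrix product: P_marker[c] records the output position of
       column c within the current row (any value below row_marker means
       "not yet seen in this row"), while A_marker prevents revisiting a
       column i2 of R*A within the same row i_r. */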

    /* Now, count the size of RAP_ext. Start with the rows in R_couple */
    sum = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++) {
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++) {
            i1 = R_couple->pattern->index[j1];
            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeating
               the process done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index in
                           P_ext_couple */
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }
        }
    }

    /* Now we have the number of non-zero elements in RAP_ext; allocate
       RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
    RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
    RAP_ext_idx = new index_t[sum];
    RAP_ext_val = new double[sum * block_size];
    RA_val = new double[block_size];
    RAP_val = new double[block_size];
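    /* RA_val and RAP_val are per-entry scratch blocks: RA_val holds one
       block of R(i_r,:)*A(:,i2) and RAP_val one block of (RA*P)(i_r,i_c).
       The symbolic pass above only counted entries; the numeric pass below
       re-runs the same traversal and accumulates the block values. */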

    /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
    sum = num_Pext_cols + num_Pmain_cols;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
    sum = 0;
    RAP_ext_ptr[0] = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++) {
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++) {
            i1 = R_couple->pattern->index[j1];
            R_val = &(R_couple->val[j1*P_block_size]);

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];
                temp_val = &(A->col_coupleBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }
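                /* Blocks are stored column-major: entry (irb,icb) of a block
                   lives at offset irb + row_block_size*icb. Only the first
                   P_block_size components of R_val enter the product, since
                   P's blocks may be smaller than A's (cf. the commented-out
                   MATRIX_FORMAT_DIAGONAL_BLOCK check above). */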

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                /* If entry RA[i_r, i2] has been visited, no new RAP entry is
                   created; only the contributions are added. */
                } else {
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeating
               the process done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                temp_val = &(A->mainBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                } else {
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }
        }
        RAP_ext_ptr[i_r+1] = sum;
    }
    delete[] P_marker;
    delete[] Pcouple_to_Pext;

    /* Now we have the part of RAP[r,c] where row "r" is the list of rows
       given by the column list of P->col_coupleBlock, and column "c" is the
       list of columns that may cover the whole column range of the system
       matrix P. This part of the data is passed to the neighbouring
       processors and added to the corresponding RAP[r,c] entries there */
    Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
                &RAP_ext_val, global_id_P, block_size);
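    /* RAP_ext_ptr/RAP_ext_idx/RAP_ext_val now hold the rows received from
       the neighbouring processors; their column indices are still global
       ids, so they are re-mapped below against the merged list
       global_id_RAP. */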

    num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
    sum = RAP_ext_ptr[num_RAPext_rows];
    num_RAPext_cols = 0;
    if (num_Pext_cols || sum > 0) {
        temp = new index_t[num_Pext_cols+sum];
        j1_ub = offset + num_Pmain_cols;
        for (i=0, iptr=0; i<sum; i++) {
            if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
                temp[iptr++] = RAP_ext_idx[i];  /* XXX */
        }
        for (j=0; j<num_Pext_cols; j++, iptr++) {
            temp[iptr] = global_id_P[j];
        }

        if (iptr) {
            #ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
            #else
            qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
            #endif
            num_RAPext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_RAPext_cols++] = i;
                }
            }
        }
    }

    /* collect the global ids of the external RAP columns */
    if (num_RAPext_cols) {
        global_id_RAP = new index_t[num_RAPext_cols];
        #pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_RAPext_cols; i++)
            global_id_RAP[i] = temp[i];
    }
    if (num_Pext_cols || sum > 0)
        delete[] temp;
    j1_ub = offset + num_Pmain_cols;
    #pragma omp parallel for private(i, where_p) schedule(static)
    for (i=0; i<sum; i++) {
        if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) {
            where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
                        /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
            RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
        } else
            RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
    }

    /* build the mappings */
    if (num_Pcouple_cols) {
        Pcouple_to_RAP = new index_t[num_Pcouple_cols];
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == P->global_id[iptr]) {
                Pcouple_to_RAP[iptr++] = i;
                if (iptr == num_Pcouple_cols) break;
            }
    }

    if (num_Pext_cols) {
        Pext_to_RAP = new index_t[num_Pext_cols];
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == global_id_P[iptr]) {
                Pext_to_RAP[iptr++] = i;
                if (iptr == num_Pext_cols) break;
            }
    }
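    /* As before, these mappings rely on global_id_RAP, P->global_id and
       global_id_P all being sorted in ascending order, so a single linear
       merge pass suffices for each. */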

    if (global_id_P) {
        delete[] global_id_P;
        global_id_P = NULL;
    }

    /* alloc and initialise the markers */
    sum = num_RAPext_cols + num_Pmain_cols;
    P_marker = new index_t[sum];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;

    /* Now, count the size of RAP. Start with the rows in R_main */
    num_neighbors = P->col_coupler->connector->send->numNeighbors;
    offsetInShared = P->col_coupler->connector->send->offsetInShared;
    shared = P->col_coupler->connector->send->shared;
    i = 0;
    j = 0;
    for (i_r=0; i_r<num_Pmain_cols; i_r++) {
        /* Mark the start of the row for both the main block and the couple block */
        row_marker = i;
        row_marker_ext = j;

        /* Mark the diagonal element RAP[i_r, i_r], and other elements
           in RAP_ext */
        P_marker[i_r] = i;
        i++;
        for (j1=0; j1<num_neighbors; j1++) {
            for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
                if (shared[j2] == i_r) {
                    for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
                        i_c = RAP_ext_idx[k];
                        if (i_c < num_Pmain_cols) {
                            if (P_marker[i_c] < row_marker) {
                                P_marker[i_c] = i;
                                i++;
                            }
                        } else {
                            if (P_marker[i_c] < row_marker_ext) {
                                P_marker[i_c] = j;
                                j++;
                            }
                        }
                    }
                    break;
                }
            }
        }

        /* then, loop over elements in row i_r of R_main */
        j1_ub = R_main->pattern->ptr[i_r+1];
        for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++) {
            i1 = R_main->pattern->index[j1];

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            i++;
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            j++;
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeating
               the process done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            i++;
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index in
                           P_ext_couple */
                        i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            j++;
                        }
                    }
                }
            }
        }
    }

    /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
       Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
       and RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for RAP_couple */
    RAP_main_ptr = new index_t[num_Pmain_cols+1];
    RAP_main_idx = new index_t[i];
    RAP_main_val = new double[i * block_size];
    RAP_couple_ptr = new index_t[num_Pmain_cols+1];
    RAP_couple_idx = new index_t[j];
    RAP_couple_val = new double[j * block_size];

    RAP_main_ptr[num_Pmain_cols] = i;
    RAP_couple_ptr[num_Pmain_cols] = j;
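    /* i and j are now the total nnz of RAP_main and RAP_couple respectively;
       the closing ptr entries are written here, and the per-row offsets are
       filled in as the numeric pass below revisits each row. */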
1299    
1300     #pragma omp parallel for private(i) schedule(static)
1301     for (i=0; i<sum; i++) P_marker[i] = -1;
1302     #pragma omp parallel for private(i) schedule(static)
1303     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1304    
1305     /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1306     in R_main */
1307     i = 0;
1308     j = 0;
1309     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1310     /* Mark the start of row for both main block and couple block */
1311     row_marker = i;
1312     row_marker_ext = j;
1313     RAP_main_ptr[i_r] = row_marker;
1314     RAP_couple_ptr[i_r] = row_marker_ext;
1315    
1316     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1317     in row i_r of RAP_ext */
1318     P_marker[i_r] = i;
1319     for (ib=0; ib<block_size; ib++)
1320     RAP_main_val[i*block_size+ib] = ZERO;
1321     RAP_main_idx[i] = i_r;
1322     i++;
1323    
1324     for (j1=0; j1<num_neighbors; j1++) {
1325     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1326     if (shared[j2] == i_r) {
1327     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1328     i_c = RAP_ext_idx[k];
1329     if (i_c < num_Pmain_cols) {
1330     if (P_marker[i_c] < row_marker) {
1331     P_marker[i_c] = i;
1332     memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1333     RAP_main_idx[i] = i_c;
1334     i++;
1335     } else {
1336     t1_val = &(RAP_ext_val[k*block_size]);
1337     t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1338     for (ib=0; ib<block_size; ib++)
1339     t2_val[ib] += t1_val[ib];
1340     }
1341     } else {
1342     if (P_marker[i_c] < row_marker_ext) {
1343     P_marker[i_c] = j;
1344     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1345     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1346     j++;
1347     } else {
1348     t1_val = &(RAP_ext_val[k*block_size]);
1349     t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1350     for (ib=0; ib<block_size; ib++)
1351     t2_val[ib] += t1_val[ib];
1352     }
1353     }
1354     }
1355     break;
1356     }
1357     }
1358     }
1359    
1360     /* then, loop over elements in row i_r of R_main */
1361     j1_ub = R_main->pattern->ptr[i_r+1];
1362     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1363     i1 = R_main->pattern->index[j1];
1364     R_val = &(R_main->val[j1*P_block_size]);
1365    
1366     /* then, loop over elements in row i1 of A->col_coupleBlock */
1367     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1368     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1369     i2 = A->col_coupleBlock->pattern->index[j2];
1370     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1371     for (irb=0; irb<row_block_size; irb++)
1372     for (icb=0; icb<col_block_size; icb++)
1373     RA_val[irb+row_block_size*icb]=ZERO;
1374     for (irb=0; irb<P_block_size; irb++) {
1375     rtmp=R_val[irb];
1376     for (icb=0; icb<col_block_size; icb++) {
1377     RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1378     }
1379     }
1380    
1381    
                /* check whether entry RA[i_r, i2] has been previously
                   visited. A new RAP entry is possible only if
                   RA[i_r, i2] has not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        /* check whether entry RAP[i_r,i_c] has been
                           created. If not yet created, create the entry
                           and increase the total number of elements in
                           RAP_main */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_main_idx[i] = i_c;
                            i++;
                        } else {
                            temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_couple_idx[j] = i_c - num_Pmain_cols;
                            j++;
                        } else {
                            temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                /* If entry RA[i_r, i2] has already been visited, no new
                   RAP entries are created; the contributions are just
                   accumulated. */
                } else {
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock and repeat
               the process applied to block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                temp_val = &(A->mainBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

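                /* mainBlock columns are tracked in A_marker at offset
                   num_Acouple_cols so they cannot collide with the
                   col_coupleBlock columns marked above */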
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_main_idx[i] = i_c;
                            i++;
                        } else {
                            temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index
                           in P_ext_couple */
                        i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_couple_idx[j] = i_c - num_Pmain_cols;
                            j++;
                        } else {
                            temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                } else {
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }
        }

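        /* New entries were appended in visit order; the pattern is
           assumed to require ascending column indices within each row,
           so the entries created for this row are sorted and their value
           blocks permuted to match. */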
        /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
        if (i > row_marker) {
            offset = i - row_marker;
            temp = new index_t[offset];
            #pragma omp parallel for schedule(static) private(iptr)
            for (iptr=0; iptr<offset; iptr++)
                temp[iptr] = RAP_main_idx[row_marker+iptr];
            if (offset > 0) {
                #ifdef USE_QSORTG
                qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
                #else
                qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
                #endif
            }
            temp_val = new double[offset * block_size];
            #pragma omp parallel for schedule(static) private(iptr,k)
            for (iptr=0; iptr<offset; iptr++) {
                k = P_marker[temp[iptr]];
                memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
                P_marker[temp[iptr]] = iptr + row_marker;
            }
            #pragma omp parallel for schedule(static) private(iptr)
            for (iptr=0; iptr<offset; iptr++) {
                RAP_main_idx[row_marker+iptr] = temp[iptr];
                memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
            }
            delete[] temp;
            delete[] temp_val;
        }
        if (j > row_marker_ext) {
            offset = j - row_marker_ext;
            temp = new index_t[offset];
            #pragma omp parallel for schedule(static) private(iptr)
            for (iptr=0; iptr<offset; iptr++)
                temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
            if (offset > 0) {
                #ifdef USE_QSORTG
                qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
                #else
                qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
                #endif
            }
            temp_val = new double[offset * block_size];
            #pragma omp parallel for schedule(static) private(iptr, k)
            for (iptr=0; iptr<offset; iptr++) {
                k = P_marker[temp[iptr] + num_Pmain_cols];
                memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
                P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
            }
            #pragma omp parallel for schedule(static) private(iptr)
            for (iptr=0; iptr<offset; iptr++) {
                RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
                memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
            }
            delete[] temp;
            delete[] temp_val;
        }
    }

    delete[] RA_val;
    delete[] RAP_val;
    delete[] A_marker;
    delete[] Pext_to_RAP;
    delete[] Pcouple_to_RAP;
    delete[] RAP_ext_ptr;
    delete[] RAP_ext_idx;
    delete[] RAP_ext_val;
    Paso_SparseMatrix_free(R_main);
    Paso_SparseMatrix_free(R_couple);

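    /* The couple part of RAP was assembled against the full list of
       received candidate coarse columns; columns that received no entry
       are compressed out below and global_id_RAP is remapped, so that
       only referenced remote columns enter the coarse col_coupleBlock. */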
    /* Check whether there are empty columns in RAP_couple */
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
    /* the number of non-empty columns is stored in "k" */
    k = 0;
    j = RAP_couple_ptr[num_Pmain_cols];
    for (i=0; i<j; i++) {
        i1 = RAP_couple_idx[i];
        if (P_marker[i1]) {
            P_marker[i1] = 0;
            k++;
        }
    }

    /* empty columns were found */
    if (k < num_RAPext_cols) {
        temp = new index_t[k];
        k = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (!P_marker[i]) {
                P_marker[i] = k;
                temp[k] = global_id_RAP[i];
                k++;
            }
        #pragma omp parallel for schedule(static) private(i, i1)
        for (i=0; i<j; i++) {
            i1 = RAP_couple_idx[i];
            RAP_couple_idx[i] = P_marker[i1];
        }
        num_RAPext_cols = k;
        delete[] global_id_RAP;
        global_id_RAP = temp;
    }
    delete[] P_marker;

    /******************************************************/
    /* Start to create the coarse level System Matrix A_c */
    /******************************************************/
    /* first, prepare the sender/receiver for the col_connector */
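    /* global_id_RAP lists the global ids of the remote coarse columns in
       ascending order, so a single sweep against the column distribution
       "dist" assigns each column to its owner rank and accumulates the
       per-rank receive lengths */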
    dist = P->pattern->input_distribution->first_component;
    recv_len = new dim_t[size];
    send_len = new dim_t[size];
    neighbor = new Esys_MPI_rank[size];
    offsetInShared = new index_t[size+1];
    shared = new index_t[num_RAPext_cols];
    memset(recv_len, 0, sizeof(dim_t) * size);
    memset(send_len, 0, sizeof(dim_t) * size);
    num_neighbors = 0;
    offsetInShared[0] = 0;
    for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
        shared[i] = i + num_Pmain_cols;
        if (k <= global_id_RAP[i]) {
            if (recv_len[j] > 0) {
                neighbor[num_neighbors] = j;
                num_neighbors++;
                offsetInShared[num_neighbors] = i;
            }
            while (k <= global_id_RAP[i]) {
                j++;
                k = dist[j+1];
            }
        }
        recv_len[j]++;
    }
    if (recv_len[j] > 0) {
        neighbor[num_neighbors] = j;
        num_neighbors++;
        offsetInShared[num_neighbors] = i;
    }
    recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                neighbor, shared, offsetInShared, 1, 0, mpi_info);

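    /* the send side is recovered by exchanging the per-rank receive
       counts with an all-to-all; the actual lists of requested columns
       then travel point-to-point below */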
    #ifdef ESYS_MPI
    MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
    #endif

    #ifdef ESYS_MPI
    mpi_requests = new MPI_Request[size*2];
    mpi_stati = new MPI_Status[size*2];
    #else
    mpi_requests = new int[size*2];
    mpi_stati = new int[size*2];
    #endif
    num_neighbors = 0;
    j = 0;
    offsetInShared[0] = 0;
    for (i=0; i<size; i++) {
        if (send_len[i] > 0) {
            neighbor[num_neighbors] = i;
            num_neighbors++;
            j += send_len[i];
            offsetInShared[num_neighbors] = j;
        }
    }
    delete[] shared;
    shared = new index_t[j];
    for (i=0, j=0; i<num_neighbors; i++) {
        k = neighbor[i];
        #ifdef ESYS_MPI
        MPI_Irecv(&shared[j], send_len[k], MPI_INT, k,
                mpi_info->msg_tag_counter+k,
                mpi_info->comm, &mpi_requests[i]);
        #endif
        j += send_len[k];
    }
    for (i=0, j=0; i<recv->numNeighbors; i++) {
        k = recv->neighbor[i];
        #ifdef ESYS_MPI
        MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
                mpi_info->msg_tag_counter+rank,
                mpi_info->comm, &mpi_requests[i+num_neighbors]);
        #endif
        j += recv_len[k];
    }
    #ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + recv->numNeighbors,
                mpi_requests, mpi_stati);
    #endif
    mpi_info->msg_tag_counter += size;

    j = offsetInShared[num_neighbors];
    offset = dist[rank];
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<j; i++) shared[i] = shared[i] - offset;
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                neighbor, shared, offsetInShared, 1, 0, mpi_info);

    col_connector = Paso_Connector_alloc(send, recv);
    delete[] shared;
    Paso_SharedComponents_free(recv);
    Paso_SharedComponents_free(send);

    /* now, create the row distribution (output_distribution) and the
       col distribution (input_distribution) */
    input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
    output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);

    /* then prepare the sender/receiver for the row_connector; first,
       gather the information for the sender */
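    /* for every local row with couple-block entries, record per
       destination rank p how many of its entries reference columns owned
       by p (send_ptr[p][row]) and which global columns those are
       (send_idx[p]); "sum" counts the distinct (rank, row) pairs, i.e.
       the length of the sender's shared list */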
    sum = RAP_couple_ptr[num_Pmain_cols];
    len = new dim_t[size];
    send_ptr = new index_t*[size];
    send_idx = new index_t*[size];
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<size; i++) {
        send_ptr[i] = new index_t[num_Pmain_cols];
        send_idx[i] = new index_t[sum];
        memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
    }
    memset(len, 0, sizeof(dim_t) * size);
    recv = col_connector->recv;
    sum = 0;
    for (i_r=0; i_r<num_Pmain_cols; i_r++) {
        i1 = RAP_couple_ptr[i_r];
        i2 = RAP_couple_ptr[i_r+1];
        if (i2 > i1) {
            /* then row i_r will be in the sender of row_connector, now
               check how many neighbours row i_r needs to be sent to */
            for (j=i1; j<i2; j++) {
                i_c = RAP_couple_idx[j];
                /* find out the corresponding neighbor "p" of column i_c */
                for (p=0; p<recv->numNeighbors; p++) {
                    if (i_c < recv->offsetInShared[p+1]) {
                        k = recv->neighbor[p];
                        if (send_ptr[k][i_r] == 0) sum++;
                        send_ptr[k][i_r]++;
                        send_idx[k][len[k]] = global_id_RAP[i_c];
                        len[k]++;
                        break;
                    }
                }
            }
        }
    }
    if (global_id_RAP) {
        delete[] global_id_RAP;
        global_id_RAP = NULL;
    }

    /* now allocate the sender */
    shared = new index_t[sum];
    memset(send_len, 0, sizeof(dim_t) * size);
    num_neighbors = 0;
    offsetInShared[0] = 0;
    for (p=0, k=0; p<size; p++) {
        for (i=0; i<num_Pmain_cols; i++) {
            if (send_ptr[p][i] > 0) {
                shared[k] = i;
                k++;
                send_ptr[p][send_len[p]] = send_ptr[p][i];
                send_len[p]++;
            }
        }
        if (k > offsetInShared[num_neighbors]) {
            neighbor[num_neighbors] = p;
            num_neighbors++;
            offsetInShared[num_neighbors] = k;
        }
    }
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                neighbor, shared, offsetInShared, 1, 0, mpi_info);

    /* send/recv the number of rows to be sent from the current proc;
       this recovers the receiver info of the row_connector from the
       sender info */
    #ifdef ESYS_MPI
    MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
    #endif
    num_neighbors = 0;
    offsetInShared[0] = 0;
    j = 0;
    for (i=0; i<size; i++) {
        if (i != rank && recv_len[i] > 0) {
            neighbor[num_neighbors] = i;
            num_neighbors++;
            j += recv_len[i];
            offsetInShared[num_neighbors] = j;
        }
    }
    delete[] shared;
    delete[] recv_len;
    shared = new index_t[j];
    k = offsetInShared[num_neighbors];
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<k; i++) {
        shared[i] = i + num_Pmain_cols;
    }
    recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                neighbor, shared, offsetInShared, 1, 0, mpi_info);
    row_connector = Paso_Connector_alloc(send, recv);
    delete[] shared;
    Paso_SharedComponents_free(recv);
    Paso_SharedComponents_free(send);

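    /* the per-row entry counts collected in send_ptr are exchanged to
       fill row_couple_ptr, which is then turned from counts into offsets
       by a prefix sum; the matching column indices follow in a second
       exchange */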
    /* send/recv pattern->ptr for rowCoupleBlock */
    num_RAPext_rows = offsetInShared[num_neighbors];
    row_couple_ptr = new index_t[num_RAPext_rows+1];
    for (p=0; p<num_neighbors; p++) {
        j = offsetInShared[p];
        i = offsetInShared[p+1];
        #ifdef ESYS_MPI
        MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
                mpi_info->msg_tag_counter+neighbor[p],
                mpi_info->comm, &mpi_requests[p]);
        #endif
    }
    send = row_connector->send;
    for (p=0; p<send->numNeighbors; p++) {
        #ifdef ESYS_MPI
        MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
                MPI_INT, send->neighbor[p],
                mpi_info->msg_tag_counter+rank,
                mpi_info->comm, &mpi_requests[p+num_neighbors]);
        #endif
    }
    #ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + send->numNeighbors,
                mpi_requests, mpi_stati);
    #endif
    mpi_info->msg_tag_counter += size;
    delete[] send_len;

    sum = 0;
    for (i=0; i<num_RAPext_rows; i++) {
        k = row_couple_ptr[i];
        row_couple_ptr[i] = sum;
        sum += k;
    }
    row_couple_ptr[num_RAPext_rows] = sum;

    /* send/recv pattern->index for rowCoupleBlock */
    k = row_couple_ptr[num_RAPext_rows];
    row_couple_idx = new index_t[k];
    for (p=0; p<num_neighbors; p++) {
        j1 = row_couple_ptr[offsetInShared[p]];
        j2 = row_couple_ptr[offsetInShared[p+1]];
        #ifdef ESYS_MPI
        MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
                mpi_info->msg_tag_counter+neighbor[p],
                mpi_info->comm, &mpi_requests[p]);
        #endif
    }
    for (p=0; p<send->numNeighbors; p++) {
        #ifdef ESYS_MPI
        MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
                MPI_INT, send->neighbor[p],
                mpi_info->msg_tag_counter+rank,
                mpi_info->comm, &mpi_requests[p+num_neighbors]);
        #endif
    }
    #ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + send->numNeighbors,
                mpi_requests, mpi_stati);
    #endif
    mpi_info->msg_tag_counter += size;

    offset = input_dist->first_component[rank];
    k = row_couple_ptr[num_RAPext_rows];
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<k; i++) {
        row_couple_idx[i] -= offset;
    }
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<size; i++) {
        delete[] send_ptr[i];
        delete[] send_idx[i];
    }
    delete[] send_ptr;
    delete[] send_idx;
    delete[] len;
    delete[] offsetInShared;
    delete[] neighbor;
    delete[] mpi_requests;
    delete[] mpi_stati;
    Esys_MPIInfo_free(mpi_info);

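    /* the three patterns (main, column couple, row couple) together with
       the two connectors and the row/column distributions fully describe
       the parallel layout of the coarse operator */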
    /* Now, we can create the patterns for the main block and the couple
       blocks */
    main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
                num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
    col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                num_Pmain_cols, num_RAPext_cols,
                RAP_couple_ptr, RAP_couple_idx);
    row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                num_RAPext_rows, num_Pmain_cols,
                row_couple_ptr, row_couple_idx);

    /* next, create the system matrix */
    pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
                output_dist, input_dist, main_pattern, col_couple_pattern,
                row_couple_pattern, col_connector, row_connector);
    out = Paso_SystemMatrix_alloc(A->type, pattern,
                row_block_size, col_block_size, FALSE);

    /* finally, fill in the data */
    memcpy(out->mainBlock->val, RAP_main_val,
                out->mainBlock->len * sizeof(double));
    memcpy(out->col_coupleBlock->val, RAP_couple_val,
                out->col_coupleBlock->len * sizeof(double));

    /* Clean up */
    Paso_SystemMatrixPattern_free(pattern);
    Paso_Pattern_free(main_pattern);
    Paso_Pattern_free(col_couple_pattern);
    Paso_Pattern_free(row_couple_pattern);
    Paso_Connector_free(col_connector);
    Paso_Connector_free(row_connector);
    Paso_Distribution_free(output_dist);
    Paso_Distribution_free(input_dist);
    delete[] RAP_main_val;
    delete[] RAP_couple_val;
    if (Esys_noError()) {
        return out;
    } else {
        Paso_SystemMatrix_free(out);
        return NULL;
    }
}

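/* Usage sketch (assuming the usual AMG level setup around this routine):
   once the prolongation P and the restriction R = P^T of a level are
   available, the coarse operator is built by a single call such as

       Paso_SystemMatrix* A_c =
           Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);

   A NULL result signals an error (see the Esys_noError() check above);
   the matrix is released with Paso_SystemMatrix_free(A_c) when the level
   is destroyed. */
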
Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
        Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
{
    Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
    Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
    Paso_SystemMatrix *out=NULL;
    Paso_SystemMatrixPattern *pattern=NULL;
    Paso_Distribution *input_dist=NULL, *output_dist=NULL;
    Paso_SharedComponents *send=NULL, *recv=NULL;
    Paso_Connector *col_connector=NULL, *row_connector=NULL;
    Paso_Pattern *main_pattern=NULL;
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern=NULL;
    const dim_t row_block_size=A->row_block_size;
    const dim_t col_block_size=A->col_block_size;
    const dim_t block_size = A->block_size;
    const double ZERO = 0.0;
    double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
    double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
    index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
    index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
    index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
    index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
    index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
    index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
    index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
    index_t sum, i, j, k, iptr, irb, icb, ib;
    index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
    index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
    index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
    index_t row_marker_ext, *where_p=NULL;
    index_t **send_ptr=NULL, **send_idx=NULL;
    dim_t