/[escript]/trunk/paso/src/AMG_Interpolation.cpp
ViewVC logotype

Annotation of /trunk/paso/src/AMG_Interpolation.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3819 - (hide annotations)
Mon Feb 13 01:20:29 2012 UTC (7 years, 9 months ago) by lgao
Original Path: branches/amg_from_3530/paso/src/AMG_Interpolation.c
File MIME type: text/plain
File size: 121544 byte(s)
Clean up the debugging info.


1 lgao 3763 //
2     /*******************************************************
3     *
4     * Copyright (c) 2003-2011 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13    
14    
15     /**************************************************************/
16    
17     /* Paso: defines AMG Interpolation */
18    
19     /**************************************************************/
20    
21     /* Author: l.gao@uq.edu.au */
22    
23     /**************************************************************/
24    
25     #include "Paso.h"
26     #include "SparseMatrix.h"
27     #include "PasoUtil.h"
28     #include "Preconditioner.h"
29    
30     /**************************************************************
31    
32     Methods nessecary for AMG preconditioner
33    
34     construct n_C x n_C interpolation operator A_C from matrix A
35     and prolongation matrix P.
36    
37     The coarsening operator A_C is defined by RAP where R=P^T.
38    
39     ***************************************************************/
40    
41     #define MY_DEBUG 0
42 lgao 3817 #define MY_DEBUG1 1
43 lgao 3763
44     /* Extend system matrix B with extra two sparse matrixes:
45     B_ext_main and B_ext_couple
46     The combination of this two sparse matrixes represents
47     the portion of B that is stored on neighbor procs and
48     needed locally for triple matrix product (B^T) A B.
49    
50     FOR NOW, we assume that the system matrix B has a NULL
51     row_coupleBlock and a NULL remote_coupleBlock. Based on
52     this assumption, we use link row_coupleBlock for sparse
53     matrix B_ext_main, and link remote_coupleBlock for sparse
54     matrix B_ext_couple.
55    
56     To be specific, B_ext (on processor k) are group of rows
57     in B, where the list of rows from processor q is given by
58     A->col_coupler->connector->send->shared[rPtr...]
59     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
60     on q. */
61     void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
62     {
63     Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
64     Paso_Coupler *coupler=NULL;
65     Paso_SharedComponents *send=NULL, *recv=NULL;
66     double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
67     index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
68     index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
69     index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
70     index_t *idx_m=NULL, *idx_c=NULL;
71 lgao 3817 index_t i, j, k, m, n, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
72 lgao 3763 index_t offset, len, block_size, block_size_size, max_num_cols;
73     index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
74     dim_t *recv_degree=NULL, *send_degree=NULL;
75     dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
76    
77     if (size == 1) return;
78    
79     if (B->row_coupleBlock) {
80     Paso_SparseMatrix_free(B->row_coupleBlock);
81     B->row_coupleBlock = NULL;
82     }
83    
84     if (B->row_coupleBlock || B->remote_coupleBlock) {
85     Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
86     return;
87     }
88    
89     /* sending/receiving unknown's global ID */
90     num_main_cols = B->mainBlock->numCols;
91     cols = TMPMEMALLOC(num_main_cols, double);
92     offset = B->col_distribution->first_component[rank];
93     #pragma omp parallel for private(i) schedule(static)
94     for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
95     if (B->global_id == NULL) {
96     coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
97     Paso_Coupler_startCollect(coupler, cols);
98     }
99    
100     recv_buf = TMPMEMALLOC(size, index_t);
101     recv_degree = TMPMEMALLOC(size, dim_t);
102     recv_offset = TMPMEMALLOC(size+1, index_t);
103     #pragma omp parallel for private(i) schedule(static)
104     for (i=0; i<size; i++){
105     recv_buf[i] = 0;
106     recv_degree[i] = 1;
107     recv_offset[i] = i;
108     }
109    
110 lgao 3779 block_size = B->block_size;
111 lgao 3763 block_size_size = block_size * sizeof(double);
112     num_couple_cols = B->col_coupleBlock->numCols;
113     send = A->col_coupler->connector->send;
114     recv = A->col_coupler->connector->recv;
115     num_neighbors = send->numNeighbors;
116     p = send->offsetInShared[num_neighbors];
117 lgao 3767 len = p * B->col_distribution->first_component[size];
118 lgao 3763 send_buf = TMPMEMALLOC(len * block_size, double);
119     send_idx = TMPMEMALLOC(len, index_t);
120     send_offset = TMPMEMALLOC((p+1)*2, index_t);
121     send_degree = TMPMEMALLOC(num_neighbors, dim_t);
122     i = num_main_cols + num_couple_cols;
123     send_m = TMPMEMALLOC(i * block_size, double);
124     send_c = TMPMEMALLOC(i * block_size, double);
125     idx_m = TMPMEMALLOC(i, index_t);
126     idx_c = TMPMEMALLOC(i, index_t);
127    
128     /* waiting for receiving unknown's global ID */
129     if (B->global_id == NULL) {
130     Paso_Coupler_finishCollect(coupler);
131     global_id = MEMALLOC(num_couple_cols, index_t);
132     #pragma omp parallel for private(i) schedule(static)
133     for (i=0; i<num_couple_cols; ++i)
134     global_id[i] = coupler->recv_buffer[i];
135     B->global_id = global_id;
136     Paso_Coupler_free(coupler);
137     } else
138     global_id = B->global_id;
139    
140     /* distribute the number of cols in current col_coupleBlock to all ranks */
141     MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
142    
143     /* distribute global_ids of cols to be considered to all ranks*/
144     len = 0;
145     max_num_cols = 0;
146     for (i=0; i<size; i++){
147     recv_degree[i] = recv_buf[i];
148     recv_offset[i] = len;
149     len += recv_buf[i];
150     if (max_num_cols < recv_buf[i])
151     max_num_cols = recv_buf[i];
152     }
153     recv_offset[size] = len;
154     cols_array = TMPMEMALLOC(len, index_t);
155     MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
156    
157     /* first, prepare the ptr_ptr to be received */
158     q = recv->numNeighbors;
159     len = recv->offsetInShared[q];
160     ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
161     for (p=0; p<q; p++) {
162     row = recv->offsetInShared[p];
163     m = recv->offsetInShared[p + 1];
164 lgao 3767 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
165 lgao 3763 A->mpi_info->msg_tag_counter+recv->neighbor[p],
166     A->mpi_info->comm,
167     &(A->col_coupler->mpi_requests[p]));
168     }
169    
170     /* now prepare the rows to be sent (the degree, the offset and the data) */
171     len = 0;
172 lgao 3817 i0 = 0;
173 lgao 3763 for (p=0; p<num_neighbors; p++) {
174 lgao 3817 i = i0;
175 lgao 3763 neighbor = send->neighbor[p];
176     m_lb = B->col_distribution->first_component[neighbor];
177     m_ub = B->col_distribution->first_component[neighbor + 1];
178     j_ub = send->offsetInShared[p + 1];
179     for (j=send->offsetInShared[p]; j<j_ub; j++) {
180     row = send->shared[j];
181     l_m = 0;
182     l_c = 0;
183     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
184     k_lb = B->col_coupleBlock->pattern->ptr[row];
185    
186     /* check part of col_coupleBlock for data to be passed @row */
187     for (k=k_lb; k<k_ub; k++) {
188     m = global_id[B->col_coupleBlock->pattern->index[k]];
189     if (m > offset) break;
190     if (m>= m_lb && m < m_ub) {
191     /* data to be passed to sparse matrix B_ext_main */
192     idx_m[l_m] = m - m_lb;
193     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
194     l_m++;
195     } else {
196     /* data to be passed to sparse matrix B_ext_couple */
197     idx_c[l_c] = m;
198     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
199     l_c++;
200     }
201     }
202     k_lb = k;
203    
204     /* check mainBlock for data to be passed @row to sparse
205     matrix B_ext_couple */
206     k_ub = B->mainBlock->pattern->ptr[row + 1];
207     k = B->mainBlock->pattern->ptr[row];
208     memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
209     for (; k<k_ub; k++) {
210     m = B->mainBlock->pattern->index[k] + offset;
211     idx_c[l_c] = m;
212     l_c++;
213     }
214    
215     /* check the rest part of col_coupleBlock for data to
216     be passed @row to sparse matrix B_ext_couple */
217     k = k_lb;
218     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
219     for (k=k_lb; k<k_ub; k++) {
220     m = global_id[B->col_coupleBlock->pattern->index[k]];
221 lgao 3767 if (m>= m_lb && m < m_ub) {
222 lgao 3763 /* data to be passed to sparse matrix B_ext_main */
223     idx_m[l_m] = m - m_lb;
224     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
225     l_m++;
226     } else {
227     /* data to be passed to sparse matrix B_ext_couple */
228     idx_c[l_c] = m;
229     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
230     l_c++;
231     }
232     }
233    
234 lgao 3779 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
235 lgao 3767 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
236 lgao 3763 send_offset[2*i] = l_m;
237     len += l_m;
238 lgao 3779 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
239 lgao 3767 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
240 lgao 3763 send_offset[2*i+1] = l_c;
241     len += l_c;
242     i++;
243     }
244    
245     /* sending */
246 lgao 3817 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
247 lgao 3763 A->mpi_info->msg_tag_counter+rank,
248     A->mpi_info->comm,
249     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
250     send_degree[p] = len;
251 lgao 3817 i0 = i;
252 lgao 3763 }
253     TMPMEMFREE(send_m);
254     TMPMEMFREE(send_c);
255     TMPMEMFREE(idx_m);
256     TMPMEMFREE(idx_c);
257    
258    
259     q = recv->numNeighbors;
260     len = recv->offsetInShared[q];
261     ptr_main = MEMALLOC((len+1), index_t);
262     ptr_couple = MEMALLOC((len+1), index_t);
263    
264    
265     MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
266     A->col_coupler->mpi_requests,
267     A->col_coupler->mpi_stati);
268     A->mpi_info->msg_tag_counter += size;
269    
270     j = 0;
271     k = 0;
272     ptr_main[0] = 0;
273     ptr_couple[0] = 0;
274     // #pragma omp parallel for private(i,j,k) schedule(static)
275     for (i=0; i<len; i++) {
276     j += ptr_ptr[2*i];
277     k += ptr_ptr[2*i+1];
278     ptr_main[i+1] = j;
279     ptr_couple[i+1] = k;
280     }
281    
282     TMPMEMFREE(ptr_ptr);
283     idx_main = MEMALLOC(j, index_t);
284     idx_couple = MEMALLOC(k, index_t);
285     ptr_idx = TMPMEMALLOC(j+k, index_t);
286 lgao 3779 ptr_val = TMPMEMALLOC((j+k) * block_size, double);
287 lgao 3763
288     /* send/receive index array */
289     j=0;
290     k_ub = 0;
291     for (p=0; p<recv->numNeighbors; p++) {
292     k = recv->offsetInShared[p];
293     m = recv->offsetInShared[p+1];
294     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
295     if (i > 0) {
296     k_ub ++;
297     MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
298     A->mpi_info->msg_tag_counter+recv->neighbor[p],
299     A->mpi_info->comm,
300     &(A->col_coupler->mpi_requests[p]));
301     }
302     j += i;
303     }
304    
305     j=0;
306     k_ub = 0;
307     for (p=0; p<num_neighbors; p++) {
308     i = send_degree[p] - j;
309     if (i > 0){
310     k_ub ++;
311     MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
312     A->mpi_info->msg_tag_counter+rank,
313     A->mpi_info->comm,
314     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
315     }
316     j = send_degree[p];
317     }
318    
319     MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
320     A->col_coupler->mpi_requests,
321     A->col_coupler->mpi_stati);
322     A->mpi_info->msg_tag_counter += size;
323    
324 lgao 3767 // #pragma omp parallel for private(i,j,k,m,p) schedule(static)
325 lgao 3763 for (i=0; i<len; i++) {
326     j = ptr_main[i];
327     k = ptr_main[i+1];
328     m = ptr_couple[i];
329     for (p=j; p<k; p++) {
330     idx_main[p] = ptr_idx[m+p];
331     }
332     j = ptr_couple[i+1];
333     for (p=m; p<j; p++) {
334     idx_couple[p] = ptr_idx[k+p];
335     }
336     }
337     TMPMEMFREE(ptr_idx);
338    
339     /* allocate pattern and sparsematrix for B_ext_main */
340     pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
341     len, num_main_cols, ptr_main, idx_main);
342     B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
343     pattern_main, B->row_block_size, B->col_block_size,
344     FALSE);
345     Paso_Pattern_free(pattern_main);
346    
347     /* allocate pattern and sparsematrix for B_ext_couple */
348     pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
349     len, B->col_distribution->first_component[size],
350     ptr_couple, idx_couple);
351     B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
352     pattern_couple, B->row_block_size, B->col_block_size,
353     FALSE);
354     Paso_Pattern_free(pattern_couple);
355    
356     /* send/receive value array */
357     j=0;
358     for (p=0; p<recv->numNeighbors; p++) {
359     k = recv->offsetInShared[p];
360     m = recv->offsetInShared[p+1];
361     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
362     if (i > 0)
363     MPI_Irecv(&(ptr_val[j]), i * block_size,
364     MPI_DOUBLE, recv->neighbor[p],
365     A->mpi_info->msg_tag_counter+recv->neighbor[p],
366     A->mpi_info->comm,
367     &(A->col_coupler->mpi_requests[p]));
368 lgao 3779 j += (i * block_size);
369 lgao 3763 }
370    
371     j=0;
372     for (p=0; p<num_neighbors; p++) {
373     i = send_degree[p] - j;
374     if (i > 0)
375 lgao 3779 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
376 lgao 3763 A->mpi_info->msg_tag_counter+rank,
377     A->mpi_info->comm,
378     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
379 lgao 3779 j = send_degree[p] ;
380 lgao 3763 }
381    
382     MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
383     A->col_coupler->mpi_requests,
384     A->col_coupler->mpi_stati);
385     A->mpi_info->msg_tag_counter += size;
386    
387     #pragma omp parallel for private(i,j,k,m,p) schedule(static)
388     for (i=0; i<len; i++) {
389     j = ptr_main[i];
390     k = ptr_main[i+1];
391     m = ptr_couple[i];
392     for (p=j; p<k; p++) {
393 lgao 3817 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
394 lgao 3763 }
395     j = ptr_couple[i+1];
396     for (p=m; p<j; p++) {
397 lgao 3817 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
398 lgao 3763 }
399     }
400     TMPMEMFREE(ptr_val);
401    
402 lgao 3817
403 lgao 3763 /* release all temp memory allocation */
404     TMPMEMFREE(cols);
405     TMPMEMFREE(cols_array);
406     TMPMEMFREE(recv_offset);
407     TMPMEMFREE(recv_degree);
408     TMPMEMFREE(recv_buf);
409     TMPMEMFREE(send_buf);
410     TMPMEMFREE(send_offset);
411     TMPMEMFREE(send_degree);
412     TMPMEMFREE(send_idx);
413     }
414    
415     /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)
416     has the same number of rows as P->col_coupleBlock->numCols. Now, we need
417     to copy block of data in T to neighbor processors, defined by
418     P->col_coupler->connector->recv->neighbor[k] where k is in
419     [0, P->col_coupler->connector->recv->numNeighbors).
420     Rows to be copied to neighbor processor k is in the list defined by
421     P->col_coupler->connector->recv->offsetInShared[k] ...
422     P->col_coupler->connector->recv->offsetInShared[k+1] */
423     void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
424 lgao 3779 index_t **p_ptr, index_t **p_idx, double **p_val,
425     index_t *global_id, index_t block_size)
426 lgao 3763 {
427     Paso_SharedComponents *send=NULL, *recv=NULL;
428     index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
429     index_t i, j, p, m, n, rank, size, offset;
430     index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
431     index_t *ptr=*p_ptr, *idx=*p_idx;
432     double *val=*p_val, *recv_val=NULL;
433    
434     rank = P->mpi_info->rank;
435     size = P->mpi_info->size;
436     send = P->col_coupler->connector->recv;
437     recv = P->col_coupler->connector->send;
438     send_neighbors = send->numNeighbors;
439     recv_neighbors = recv->numNeighbors;
440     send_rows = P->col_coupleBlock->numCols;
441     recv_rows = recv->offsetInShared[recv_neighbors];
442     offset = P->col_distribution->first_component[rank];
443    
444     send_degree = TMPMEMALLOC(send_rows, index_t);
445     recv_ptr = MEMALLOC(recv_rows + 1, index_t);
446     #pragma omp for schedule(static) private(i)
447     for (i=0; i<send_rows; i++)
448     send_degree[i] = ptr[i+1] - ptr[i];
449    
450     /* First, send/receive the degree */
451     for (p=0; p<recv_neighbors; p++) { /* Receiving */
452     m = recv->offsetInShared[p];
453     n = recv->offsetInShared[p+1];
454     MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
455     P->mpi_info->msg_tag_counter + recv->neighbor[p],
456     P->mpi_info->comm,
457     &(P->col_coupler->mpi_requests[p]));
458     }
459     for (p=0; p<send_neighbors; p++) { /* Sending */
460     m = send->offsetInShared[p];
461     n = send->offsetInShared[p+1];
462     MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
463     P->mpi_info->msg_tag_counter + rank,
464     P->mpi_info->comm,
465     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
466     }
467     MPI_Waitall(send_neighbors+recv_neighbors,
468     P->col_coupler->mpi_requests,
469     P->col_coupler->mpi_stati);
470     P->mpi_info->msg_tag_counter += size;
471    
472     TMPMEMFREE(send_degree);
473     m = Paso_Util_cumsum(recv_rows, recv_ptr);
474     recv_ptr[recv_rows] = m;
475     recv_idx = MEMALLOC(m, index_t);
476 lgao 3779 recv_val = MEMALLOC(m * block_size, double);
477 lgao 3763
478     /* Next, send/receive the index array */
479     j = 0;
480     for (p=0; p<recv_neighbors; p++) { /* Receiving */
481     m = recv->offsetInShared[p];
482     n = recv->offsetInShared[p+1];
483     i = recv_ptr[n] - recv_ptr[m];
484     if (i > 0) {
485     MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
486     P->mpi_info->msg_tag_counter + recv->neighbor[p],
487     P->mpi_info->comm,
488     &(P->col_coupler->mpi_requests[p]));
489 lgao 3819 }
490 lgao 3763 j += i;
491     }
492    
493     j = 0;
494     for (p=0; p<send_neighbors; p++) { /* Sending */
495     m = send->offsetInShared[p];
496     n = send->offsetInShared[p+1];
497     i = ptr[n] - ptr[m];
498     if (i >0) {
499     MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
500     P->mpi_info->msg_tag_counter + rank,
501     P->mpi_info->comm,
502     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
503     j += i;
504 lgao 3819 }
505 lgao 3763 }
506     MPI_Waitall(send_neighbors+recv_neighbors,
507     P->col_coupler->mpi_requests,
508     P->col_coupler->mpi_stati);
509     P->mpi_info->msg_tag_counter += size;
510    
511     /* Last, send/receive the data array */
512     j = 0;
513     for (p=0; p<recv_neighbors; p++) { /* Receiving */
514     m = recv->offsetInShared[p];
515     n = recv->offsetInShared[p+1];
516     i = recv_ptr[n] - recv_ptr[m];
517     if (i > 0)
518 lgao 3779 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
519 lgao 3763 P->mpi_info->msg_tag_counter + recv->neighbor[p],
520     P->mpi_info->comm,
521     &(P->col_coupler->mpi_requests[p]));
522 lgao 3779 j += (i*block_size);
523 lgao 3763 }
524    
525     j = 0;
526     for (p=0; p<send_neighbors; p++) { /* Sending */
527     m = send->offsetInShared[p];
528     n = send->offsetInShared[p+1];
529     i = ptr[n] - ptr[m];
530     if (i >0) {
531 lgao 3779 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
532 lgao 3763 P->mpi_info->msg_tag_counter + rank,
533     P->mpi_info->comm,
534     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
535 lgao 3779 j += i * block_size;
536 lgao 3763 }
537     }
538     MPI_Waitall(send_neighbors+recv_neighbors,
539     P->col_coupler->mpi_requests,
540     P->col_coupler->mpi_stati);
541     P->mpi_info->msg_tag_counter += size;
542    
543     /* Clean up and return with recevied ptr, index and data arrays */
544     MEMFREE(ptr);
545     MEMFREE(idx);
546     MEMFREE(val);
547     *p_ptr = recv_ptr;
548     *p_idx = recv_idx;
549     *p_val = recv_val;
550     }
551    
552     Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
553     Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
554     {
555     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
556     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
557     Paso_SystemMatrix *out=NULL;
558     Paso_SystemMatrixPattern *pattern=NULL;
559     Paso_Distribution *input_dist=NULL, *output_dist=NULL;
560     Paso_SharedComponents *send =NULL, *recv=NULL;
561     Paso_Connector *col_connector=NULL, *row_connector=NULL;
562     Paso_Coupler *coupler=NULL;
563     Paso_Pattern *main_pattern=NULL;
564     Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
565     const dim_t row_block_size=A->row_block_size;
566     const dim_t col_block_size=A->col_block_size;
567 lgao 3779 const dim_t block_size = A->block_size;
568     const dim_t P_block_size = P->block_size;
569 lgao 3763 const dim_t num_threads=omp_get_max_threads();
570     const double ZERO = 0.0;
571     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
572 lgao 3779 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
573 lgao 3763 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
574     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
575     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
576     index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
577     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
578     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
579     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
580 lgao 3779 index_t sum, i, j, k, iptr, irb, icb, ib;
581 lgao 3763 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
582     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
583     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
584     index_t row_marker_ext, *where_p=NULL;
585     index_t **send_ptr=NULL, **send_idx=NULL;
586     dim_t l, p, q, global_label, num_neighbors;
587     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
588     Esys_MPI_rank *neighbor=NULL;
589     #ifdef ESYS_MPI
590     MPI_Request* mpi_requests=NULL;
591     MPI_Status* mpi_stati=NULL;
592     #else
593     int *mpi_requests=NULL, *mpi_stati=NULL;
594     #endif
595    
596 lgao 3779 // if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
597     // return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);
598    
599 lgao 3763 /* two sparse matrixes R_main and R_couple will be generate, as the
600     transpose of P_main and P_col_couple, respectively. Note that,
601     R_couple is actually the row_coupleBlock of R (R=P^T) */
602     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
603 lgao 3817 if (size > 1)
604     R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
605 lgao 3763
606 lgao 3779 /* generate P_ext, i.e. portion of P that is tored on neighbor procs
607     and needed locally for triple matrix product RAP
608     to be specific, P_ext (on processor k) are group of rows in P, where
609     the list of rows from processor q is given by
610     A->col_coupler->connector->send->shared[rPtr...]
611     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
612     on q.
613     P_ext is represented by two sparse matrixes P_ext_main and
614     P_ext_couple */
615     Paso_Preconditioner_AMG_extendB(A, P);
616    
617     /* count the number of cols in P_ext_couple, resize the pattern of
618     sparse matrix P_ext_couple with new compressed order, and then
619     build the col id mapping from P->col_coupleBlock to
620     P_ext_couple */
621     num_Pmain_cols = P->mainBlock->numCols;
622 lgao 3817 if (size > 1) {
623     num_Pcouple_cols = P->col_coupleBlock->numCols;
624     num_Acouple_cols = A->col_coupleBlock->numCols;
625     sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
626     } else {
627     num_Pcouple_cols = 0;
628     num_Acouple_cols = 0;
629     sum = 0;
630     }
631 lgao 3779 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
632     offset = P->col_distribution->first_component[rank];
633     num_Pext_cols = 0;
634     if (P->global_id) {
635     /* first count the number of cols "num_Pext_cols" in both P_ext_couple
636     and P->col_coupleBlock */
637     iptr = 0;
638     if (num_Pcouple_cols || sum > 0) {
639     temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
640     for (; iptr<sum; iptr++){
641     temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
642     }
643     for (j=0; j<num_Pcouple_cols; j++, iptr++){
644     temp[iptr] = P->global_id[j];
645     }
646     }
647     if (iptr) {
648     #ifdef USE_QSORTG
649     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
650     #else
651     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
652     #endif
653     num_Pext_cols = 1;
654     i = temp[0];
655     for (j=1; j<iptr; j++) {
656     if (temp[j] > i) {
657     i = temp[j];
658     temp[num_Pext_cols++] = i;
659     }
660     }
661     }
662     /* resize the pattern of P_ext_couple */
663     if(num_Pext_cols){
664     global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
665     for (i=0; i<num_Pext_cols; i++)
666     global_id_P[i] = temp[i];
667     }
668     if (num_Pcouple_cols || sum > 0)
669     TMPMEMFREE(temp);
670     for (i=0; i<sum; i++) {
671     where_p = (index_t *)bsearch(
672     &(P->remote_coupleBlock->pattern->index[i]),
673     global_id_P, num_Pext_cols,
674     sizeof(index_t), Paso_comparIndex);
675     P->remote_coupleBlock->pattern->index[i] =
676     (index_t)(where_p -global_id_P);
677     }
678    
679     /* build the mapping */
680     if (num_Pcouple_cols) {
681     Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
682     iptr = 0;
683     for (i=0; i<num_Pext_cols; i++)
684     if (global_id_P[i] == P->global_id[iptr]) {
685     Pcouple_to_Pext[iptr++] = i;
686     if (iptr == num_Pcouple_cols) break;
687     }
688     }
689     }
690    
691     /* alloc and initialize the makers */
692     sum = num_Pext_cols + num_Pmain_cols;
693     P_marker = TMPMEMALLOC(sum, index_t);
694     A_marker = TMPMEMALLOC(num_A_cols, index_t);
695     #pragma omp parallel for private(i) schedule(static)
696     for (i=0; i<sum; i++) P_marker[i] = -1;
697     #pragma omp parallel for private(i) schedule(static)
698     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
699    
700     /* Now, count the size of RAP_ext. Start with rows in R_couple */
701     sum = 0;
702     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
703     row_marker = sum;
704     /* then, loop over elements in row i_r of R_couple */
705     j1_ub = R_couple->pattern->ptr[i_r+1];
706     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
707     i1 = R_couple->pattern->index[j1];
708     /* then, loop over elements in row i1 of A->col_coupleBlock */
709     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
710     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
711     i2 = A->col_coupleBlock->pattern->index[j2];
712    
713     /* check whether entry RA[i_r, i2] has been previously visited.
714     RAP new entry is possible only if entry RA[i_r, i2] has not
715     been visited yet */
716     if (A_marker[i2] != i_r) {
717     /* first, mark entry RA[i_r,i2] as visited */
718     A_marker[i2] = i_r;
719    
720     /* then loop over elements in row i2 of P_ext_main */
721     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
722     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
723     i_c = P->row_coupleBlock->pattern->index[j3];
724    
725     /* check whether entry RAP[i_r,i_c] has been created.
726     If not yet created, create the entry and increase
727     the total number of elements in RAP_ext */
728     if (P_marker[i_c] < row_marker) {
729     P_marker[i_c] = sum;
730     sum++;
731     }
732     }
733    
734     /* loop over elements in row i2 of P_ext_couple, do the same */
735     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
736     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
737     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
738     if (P_marker[i_c] < row_marker) {
739     P_marker[i_c] = sum;
740     sum++;
741     }
742     }
743     }
744     }
745    
746     /* now loop over elements in row i1 of A->mainBlock, repeat
747     the process we have done to block A->col_coupleBlock */
748     j2_ub = A->mainBlock->pattern->ptr[i1+1];
749     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
750     i2 = A->mainBlock->pattern->index[j2];
751     if (A_marker[i2 + num_Acouple_cols] != i_r) {
752     A_marker[i2 + num_Acouple_cols] = i_r;
753     j3_ub = P->mainBlock->pattern->ptr[i2+1];
754     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
755     i_c = P->mainBlock->pattern->index[j3];
756     if (P_marker[i_c] < row_marker) {
757     P_marker[i_c] = sum;
758     sum++;
759     }
760     }
761     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
762     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
763     /* note that we need to map the column index in
764     P->col_coupleBlock back into the column index in
765     P_ext_couple */
766     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
767     if (P_marker[i_c] < row_marker) {
768     P_marker[i_c] = sum;
769     sum++;
770     }
771     }
772     }
773     }
774     }
775     }
776    
777     /* Now we have the number of non-zero elements in RAP_ext, allocate
778     PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
779     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
780     RAP_ext_idx = MEMALLOC(sum, index_t);
781     RAP_ext_val = MEMALLOC(sum * block_size, double);
782     RA_val = TMPMEMALLOC(block_size, double);
783     RAP_val = TMPMEMALLOC(block_size, double);
784    
785     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
786     sum = num_Pext_cols + num_Pmain_cols;
787     #pragma omp parallel for private(i) schedule(static)
788     for (i=0; i<sum; i++) P_marker[i] = -1;
789     #pragma omp parallel for private(i) schedule(static)
790     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
791     sum = 0;
792     RAP_ext_ptr[0] = 0;
793     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
794     row_marker = sum;
795     /* then, loop over elements in row i_r of R_couple */
796     j1_ub = R_couple->pattern->ptr[i_r+1];
797     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
798     i1 = R_couple->pattern->index[j1];
799     R_val = &(R_couple->val[j1*P_block_size]);
800    
801     /* then, loop over elements in row i1 of A->col_coupleBlock */
802     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
803     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
804     i2 = A->col_coupleBlock->pattern->index[j2];
805     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
806     for (irb=0; irb<row_block_size; irb++)
807     for (icb=0; icb<col_block_size; icb++)
808     RA_val[irb+row_block_size*icb] = ZERO;
809     for (irb=0; irb<P_block_size; irb++) {
810     rtmp=R_val[irb];
811     for (icb=0; icb<col_block_size; icb++) {
812     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
813     }
814     }
815    
816     /* check whether entry RA[i_r, i2] has been previously visited.
817     RAP new entry is possible only if entry RA[i_r, i2] has not
818     been visited yet */
819     if (A_marker[i2] != i_r) {
820     /* first, mark entry RA[i_r,i2] as visited */
821     A_marker[i2] = i_r;
822    
823     /* then loop over elements in row i2 of P_ext_main */
824     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
825     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
826     i_c = P->row_coupleBlock->pattern->index[j3];
827     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
828     for (irb=0; irb<row_block_size; irb++)
829     for (icb=0; icb<col_block_size; icb++)
830     RAP_val[irb+row_block_size*icb] = ZERO;
831     for (icb=0; icb<P_block_size; icb++) {
832     rtmp = temp_val[icb];
833     for (irb=0; irb<row_block_size; irb++) {
834     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
835     }
836     }
837    
838     /* check whether entry RAP[i_r,i_c] has been created.
839     If not yet created, create the entry and increase
840     the total number of elements in RAP_ext */
841     if (P_marker[i_c] < row_marker) {
842     P_marker[i_c] = sum;
843     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
844     RAP_ext_idx[sum] = i_c + offset;
845     sum++;
846     } else {
847     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
848     for (ib=0; ib<block_size; ib++) {
849     temp_val[ib] += RAP_val[ib];
850     }
851     }
852     }
853    
854     /* loop over elements in row i2 of P_ext_couple, do the same */
855     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
856     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
857     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
858     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
859     for (irb=0; irb<row_block_size; irb++)
860     for (icb=0; icb<col_block_size; icb++)
861     RAP_val[irb+row_block_size*icb]=ZERO;
862     for (icb=0; icb<P_block_size; icb++) {
863     rtmp = temp_val[icb];
864     for (irb=0; irb<row_block_size; irb++) {
865     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
866     }
867     }
868    
869     if (P_marker[i_c] < row_marker) {
870     P_marker[i_c] = sum;
871     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
872     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
873     sum++;
874     } else {
875     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
876     for (ib=0; ib<block_size; ib++) {
877     temp_val[ib] += RAP_val[ib];
878     }
879     }
880     }
881    
882     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
883     Only the contributions are added. */
884     } else {
885     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
886     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
887     i_c = P->row_coupleBlock->pattern->index[j3];
888     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
889     for (irb=0; irb<row_block_size; irb++)
890     for (icb=0; icb<col_block_size; icb++)
891     RAP_val[irb+row_block_size*icb]=ZERO;
892     for (icb=0; icb<P_block_size; icb++) {
893     rtmp = temp_val[icb];
894     for (irb=0; irb<row_block_size; irb++) {
895     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
896     }
897     }
898    
899     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
900     for (ib=0; ib<block_size; ib++) {
901     temp_val[ib] += RAP_val[ib];
902     }
903     }
904     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
905     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
906     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
907     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
908     for (irb=0; irb<row_block_size; irb++)
909     for (icb=0; icb<col_block_size; icb++)
910     RAP_val[irb+row_block_size*icb]=ZERO;
911     for (icb=0; icb<P_block_size; icb++) {
912     rtmp = temp_val[icb];
913     for (irb=0; irb<row_block_size; irb++) {
914     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
915     }
916     }
917    
918     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
919     for (ib=0; ib<block_size; ib++) {
920     temp_val[ib] += RAP_val[ib];
921     }
922     }
923     }
924     }
925    
926     /* now loop over elements in row i1 of A->mainBlock, repeat
927     the process we have done to block A->col_coupleBlock */
928     j2_ub = A->mainBlock->pattern->ptr[i1+1];
929     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
930     i2 = A->mainBlock->pattern->index[j2];
931     temp_val = &(A->mainBlock->val[j2*block_size]);
932     for (irb=0; irb<row_block_size; irb++)
933     for (icb=0; icb<col_block_size; icb++)
934     RA_val[irb+row_block_size*icb]=ZERO;
935     for (irb=0; irb<P_block_size; irb++) {
936     rtmp=R_val[irb];
937     for (icb=0; icb<col_block_size; icb++) {
938     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
939     }
940     }
941    
942     if (A_marker[i2 + num_Acouple_cols] != i_r) {
943     A_marker[i2 + num_Acouple_cols] = i_r;
944     j3_ub = P->mainBlock->pattern->ptr[i2+1];
945     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
946     i_c = P->mainBlock->pattern->index[j3];
947     temp_val = &(P->mainBlock->val[j3*P_block_size]);
948     for (irb=0; irb<row_block_size; irb++)
949     for (icb=0; icb<col_block_size; icb++)
950     RAP_val[irb+row_block_size*icb]=ZERO;
951     for (icb=0; icb<P_block_size; icb++) {
952     rtmp = temp_val[icb];
953     for (irb=0; irb<row_block_size; irb++) {
954     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
955     }
956     }
957    
958     if (P_marker[i_c] < row_marker) {
959     P_marker[i_c] = sum;
960     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
961     RAP_ext_idx[sum] = i_c + offset;
962     sum++;
963     } else {
964     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
965     for (ib=0; ib<block_size; ib++) {
966     temp_val[ib] += RAP_val[ib];
967     }
968     }
969     }
970     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
971     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
972     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
973     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
974     for (irb=0; irb<row_block_size; irb++)
975     for (icb=0; icb<col_block_size; icb++)
976     RAP_val[irb+row_block_size*icb]=ZERO;
977     for (icb=0; icb<P_block_size; icb++) {
978     rtmp=temp_val[icb];
979     for (irb=0; irb<row_block_size; irb++) {
980     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
981     }
982     }
983    
984     if (P_marker[i_c] < row_marker) {
985     P_marker[i_c] = sum;
986     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
987     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
988     sum++;
989     } else {
990     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
991     for (ib=0; ib<block_size; ib++) {
992     temp_val[ib] += RAP_val[ib];
993     }
994     }
995     }
996     } else {
997     j3_ub = P->mainBlock->pattern->ptr[i2+1];
998     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
999     i_c = P->mainBlock->pattern->index[j3];
1000     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1001     for (irb=0; irb<row_block_size; irb++)
1002     for (icb=0; icb<col_block_size; icb++)
1003     RAP_val[irb+row_block_size*icb]=ZERO;
1004     for (icb=0; icb<P_block_size; icb++) {
1005     rtmp=temp_val[icb];
1006     for (irb=0; irb<row_block_size; irb++) {
1007     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1008     }
1009     }
1010    
1011     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1012     for (ib=0; ib<block_size; ib++) {
1013     temp_val[ib] += RAP_val[ib];
1014     }
1015     }
1016     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1017     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1018     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1019     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1020     for (irb=0; irb<row_block_size; irb++)
1021     for (icb=0; icb<col_block_size; icb++)
1022     RAP_val[irb+row_block_size*icb]=ZERO;
1023     for (icb=0; icb<P_block_size; icb++) {
1024     rtmp=temp_val[icb];
1025     for (irb=0; irb<row_block_size; irb++) {
1026     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1027     }
1028     }
1029    
1030     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1031     for (ib=0; ib<block_size; ib++) {
1032     temp_val[ib] += RAP_val[ib];
1033     }
1034     }
1035     }
1036     }
1037     }
1038     RAP_ext_ptr[i_r+1] = sum;
1039     }
1040     TMPMEMFREE(P_marker);
1041     TMPMEMFREE(Pcouple_to_Pext);
1042    
1043     /* Now we have part of RAP[r,c] where row "r" is the list of rows
1044     which is given by the column list of P->col_coupleBlock, and
1045     column "c" is the list of columns which possiblely covers the
1046     whole column range of systme martris P. This part of data is to
1047     be passed to neighboring processors, and added to corresponding
1048     RAP[r,c] entries in the neighboring processors */
1049     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1050     &RAP_ext_val, global_id_P, block_size);
1051    
1052     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1053     sum = RAP_ext_ptr[num_RAPext_rows];
1054     num_RAPext_cols = 0;
1055     if (num_Pext_cols || sum > 0) {
1056     temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
1057     j1_ub = offset + num_Pmain_cols;
1058     for (i=0, iptr=0; i<sum; i++) {
1059 lgao 3819 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1060 lgao 3779 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1061     }
1062     for (j=0; j<num_Pext_cols; j++, iptr++){
1063     temp[iptr] = global_id_P[j];
1064     }
1065    
1066     if (iptr) {
1067     #ifdef USE_QSORTG
1068     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1069     #else
1070     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1071     #endif
1072     num_RAPext_cols = 1;
1073     i = temp[0];
1074     for (j=1; j<iptr; j++) {
1075     if (temp[j] > i) {
1076     i = temp[j];
1077     temp[num_RAPext_cols++] = i;
1078     }
1079     }
1080     }
1081     }
1082    
1083     /* resize the pattern of P_ext_couple */
1084     if(num_RAPext_cols){
1085     global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
1086     for (i=0; i<num_RAPext_cols; i++)
1087     global_id_RAP[i] = temp[i];
1088     }
1089     if (num_Pext_cols || sum > 0)
1090     TMPMEMFREE(temp);
1091     j1_ub = offset + num_Pmain_cols;
1092     for (i=0; i<sum; i++) {
1093     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1094     where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1095     /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1096     RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1097     } else
1098     RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1099     }
1100    
1101     /* build the mapping */
1102     if (num_Pcouple_cols) {
1103     Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
1104     iptr = 0;
1105     for (i=0; i<num_RAPext_cols; i++)
1106     if (global_id_RAP[i] == P->global_id[iptr]) {
1107     Pcouple_to_RAP[iptr++] = i;
1108     if (iptr == num_Pcouple_cols) break;
1109     }
1110     }
1111    
1112     if (num_Pext_cols) {
1113     Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
1114     iptr = 0;
1115     for (i=0; i<num_RAPext_cols; i++)
1116     if (global_id_RAP[i] == global_id_P[iptr]) {
1117     Pext_to_RAP[iptr++] = i;
1118     if (iptr == num_Pext_cols) break;
1119     }
1120     }
1121    
1122     if (global_id_P){
1123     TMPMEMFREE(global_id_P);
1124     global_id_P = NULL;
1125     }
1126    
1127     /* alloc and initialize the makers */
1128     sum = num_RAPext_cols + num_Pmain_cols;
1129     P_marker = TMPMEMALLOC(sum, index_t);
1130     #pragma omp parallel for private(i) schedule(static)
1131     for (i=0; i<sum; i++) P_marker[i] = -1;
1132     #pragma omp parallel for private(i) schedule(static)
1133     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1134    
1135     /* Now, count the size of RAP. Start with rows in R_main */
1136     num_neighbors = P->col_coupler->connector->send->numNeighbors;
1137     offsetInShared = P->col_coupler->connector->send->offsetInShared;
1138     shared = P->col_coupler->connector->send->shared;
1139     i = 0;
1140     j = 0;
1141     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1142     /* Mark the start of row for both main block and couple block */
1143     row_marker = i;
1144     row_marker_ext = j;
1145    
1146     /* Mark the diagonal element RAP[i_r, i_r], and other elements
1147     in RAP_ext */
1148     P_marker[i_r] = i;
1149     i++;
1150     for (j1=0; j1<num_neighbors; j1++) {
1151     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1152     if (shared[j2] == i_r) {
1153     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1154     i_c = RAP_ext_idx[k];
1155     if (i_c < num_Pmain_cols) {
1156     if (P_marker[i_c] < row_marker) {
1157     P_marker[i_c] = i;
1158     i++;
1159     }
1160     } else {
1161     if (P_marker[i_c] < row_marker_ext) {
1162     P_marker[i_c] = j;
1163     j++;
1164     }
1165     }
1166     }
1167     break;
1168     }
1169     }
1170     }
1171    
1172     /* then, loop over elements in row i_r of R_main */
1173     j1_ub = R_main->pattern->ptr[i_r+1];
1174     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1175     i1 = R_main->pattern->index[j1];
1176    
1177     /* then, loop over elements in row i1 of A->col_coupleBlock */
1178     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1179     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1180     i2 = A->col_coupleBlock->pattern->index[j2];
1181    
1182     /* check whether entry RA[i_r, i2] has been previously visited.
1183     RAP new entry is possible only if entry RA[i_r, i2] has not
1184     been visited yet */
1185     if (A_marker[i2] != i_r) {
1186     /* first, mark entry RA[i_r,i2] as visited */
1187     A_marker[i2] = i_r;
1188    
1189     /* then loop over elements in row i2 of P_ext_main */
1190     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1191     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1192     i_c = P->row_coupleBlock->pattern->index[j3];
1193    
1194     /* check whether entry RAP[i_r,i_c] has been created.
1195     If not yet created, create the entry and increase
1196     the total number of elements in RAP_ext */
1197     if (P_marker[i_c] < row_marker) {
1198     P_marker[i_c] = i;
1199     i++;
1200     }
1201     }
1202    
1203     /* loop over elements in row i2 of P_ext_couple, do the same */
1204     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1205     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1206     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1207     if (P_marker[i_c] < row_marker_ext) {
1208     P_marker[i_c] = j;
1209     j++;
1210     }
1211     }
1212     }
1213     }
1214    
1215     /* now loop over elements in row i1 of A->mainBlock, repeat
1216     the process we have done to block A->col_coupleBlock */
1217     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1218     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1219     i2 = A->mainBlock->pattern->index[j2];
1220     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1221     A_marker[i2 + num_Acouple_cols] = i_r;
1222     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1223     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1224     i_c = P->mainBlock->pattern->index[j3];
1225     if (P_marker[i_c] < row_marker) {
1226     P_marker[i_c] = i;
1227     i++;
1228     }
1229     }
1230     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1231     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232     /* note that we need to map the column index in
1233     P->col_coupleBlock back into the column index in
1234     P_ext_couple */
1235     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1236     if (P_marker[i_c] < row_marker_ext) {
1237     P_marker[i_c] = j;
1238     j++;
1239     }
1240     }
1241     }
1242     }
1243     }
1244     }
1245    
1246     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1247     Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1248     and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1249     RAP_couple */
1250     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1251     RAP_main_idx = MEMALLOC(i, index_t);
1252     RAP_main_val = TMPMEMALLOC(i * block_size, double);
1253     RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1254     RAP_couple_idx = MEMALLOC(j, index_t);
1255     RAP_couple_val = TMPMEMALLOC(j * block_size, double);
1256    
1257     RAP_main_ptr[num_Pmain_cols] = i;
1258     RAP_couple_ptr[num_Pmain_cols] = j;
1259    
1260     #pragma omp parallel for private(i) schedule(static)
1261     for (i=0; i<sum; i++) P_marker[i] = -1;
1262     #pragma omp parallel for private(i) schedule(static)
1263     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1264    
1265     /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1266     in R_main */
1267     i = 0;
1268     j = 0;
1269     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1270     /* Mark the start of row for both main block and couple block */
1271     row_marker = i;
1272     row_marker_ext = j;
1273     RAP_main_ptr[i_r] = row_marker;
1274     RAP_couple_ptr[i_r] = row_marker_ext;
1275    
1276     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1277     in row i_r of RAP_ext */
1278     P_marker[i_r] = i;
1279     for (ib=0; ib<block_size; ib++)
1280     RAP_main_val[i*block_size+ib] = ZERO;
1281     RAP_main_idx[i] = i_r;
1282     i++;
1283    
1284     for (j1=0; j1<num_neighbors; j1++) {
1285     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1286     if (shared[j2] == i_r) {
1287     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1288     i_c = RAP_ext_idx[k];
1289     if (i_c < num_Pmain_cols) {
1290     if (P_marker[i_c] < row_marker) {
1291     P_marker[i_c] = i;
1292     memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1293     RAP_main_idx[i] = i_c;
1294     i++;
1295     } else {
1296     t1_val = &(RAP_ext_val[k*block_size]);
1297     t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1298     for (ib=0; ib<block_size; ib++)
1299     t2_val[ib] += t1_val[ib];
1300     }
1301     } else {
1302     if (P_marker[i_c] < row_marker_ext) {
1303     P_marker[i_c] = j;
1304     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1305     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1306     j++;
1307     } else {
1308     t1_val = &(RAP_ext_val[k*block_size]);
1309     t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1310     for (ib=0; ib<block_size; ib++)
1311     t2_val[ib] += t1_val[ib];
1312     }
1313     }
1314     }
1315     break;
1316     }
1317     }
1318     }
1319    
1320     /* then, loop over elements in row i_r of R_main */
1321     j1_ub = R_main->pattern->ptr[i_r+1];
1322     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1323     i1 = R_main->pattern->index[j1];
1324     R_val = &(R_main->val[j1*P_block_size]);
1325    
1326     /* then, loop over elements in row i1 of A->col_coupleBlock */
1327     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1328     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1329     i2 = A->col_coupleBlock->pattern->index[j2];
1330     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1331     for (irb=0; irb<row_block_size; irb++)
1332     for (icb=0; icb<col_block_size; icb++)
1333     RA_val[irb+row_block_size*icb]=ZERO;
1334     for (irb=0; irb<P_block_size; irb++) {
1335     rtmp=R_val[irb];
1336     for (icb=0; icb<col_block_size; icb++) {
1337     RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1338     }
1339     }
1340    
1341    
1342     /* check whether entry RA[i_r, i2] has been previously visited.
1343     RAP new entry is possible only if entry RA[i_r, i2] has not
1344     been visited yet */
1345     if (A_marker[i2] != i_r) {
1346     /* first, mark entry RA[i_r,i2] as visited */
1347     A_marker[i2] = i_r;
1348    
1349     /* then loop over elements in row i2 of P_ext_main */
1350     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1351     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1352     i_c = P->row_coupleBlock->pattern->index[j3];
1353     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1354     for (irb=0; irb<row_block_size; irb++)
1355     for (icb=0; icb<col_block_size; icb++)
1356     RAP_val[irb+row_block_size*icb]=ZERO;
1357     for (icb=0; icb<P_block_size; icb++) {
1358     rtmp=temp_val[icb];
1359     for (irb=0; irb<row_block_size; irb++) {
1360     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1361     }
1362     }
1363    
1364    
1365     /* check whether entry RAP[i_r,i_c] has been created.
1366     If not yet created, create the entry and increase
1367     the total number of elements in RAP_ext */
1368     if (P_marker[i_c] < row_marker) {
1369     P_marker[i_c] = i;
1370     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1371     RAP_main_idx[i] = i_c;
1372     i++;
1373     } else {
1374     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1375     for (ib=0; ib<block_size; ib++) {
1376     temp_val[ib] += RAP_val[ib];
1377     }
1378     }
1379     }
1380    
1381     /* loop over elements in row i2 of P_ext_couple, do the same */
1382     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1383     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1384     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1385     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1386     for (irb=0; irb<row_block_size; irb++)
1387     for (icb=0; icb<col_block_size; icb++)
1388     RAP_val[irb+row_block_size*icb]=ZERO;
1389     for (icb=0; icb<P_block_size; icb++) {
1390     rtmp=temp_val[icb];
1391     for (irb=0; irb<row_block_size; irb++) {
1392     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1393     }
1394     }
1395    
1396     if (P_marker[i_c] < row_marker_ext) {
1397     P_marker[i_c] = j;
1398     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1399     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1400     j++;
1401     } else {
1402     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1403     for (ib=0; ib<block_size; ib++) {
1404     temp_val[ib] += RAP_val[ib];
1405     }
1406     }
1407     }
1408    
1409     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1410     Only the contributions are added. */
1411     } else {
1412     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1413     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1414     i_c = P->row_coupleBlock->pattern->index[j3];
1415     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1416     for (irb=0; irb<row_block_size; irb++)
1417     for (icb=0; icb<col_block_size; icb++)
1418     RAP_val[irb+row_block_size*icb]=ZERO;
1419     for (icb=0; icb<P_block_size; icb++) {
1420     rtmp=temp_val[icb];
1421     for (irb=0; irb<row_block_size; irb++) {
1422     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1423     }
1424     }
1425    
1426     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1427     for (ib=0; ib<block_size; ib++) {
1428     temp_val[ib] += RAP_val[ib];
1429     }
1430     }
1431     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1432     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1433     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1434     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1435     for (irb=0; irb<row_block_size; irb++)
1436     for (icb=0; icb<col_block_size; icb++)
1437     RAP_val[irb+row_block_size*icb]=ZERO;
1438     for (icb=0; icb<P_block_size; icb++) {
1439     rtmp=temp_val[icb];
1440     for (irb=0; irb<row_block_size; irb++) {
1441     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1442     }
1443     }
1444    
1445     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1446     for (ib=0; ib<block_size; ib++) {
1447     temp_val[ib] += RAP_val[ib];
1448     }
1449     }
1450     }
1451     }
1452    
1453     /* now loop over elements in row i1 of A->mainBlock, repeat
1454     the process we have done to block A->col_coupleBlock */
1455     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1456     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1457     i2 = A->mainBlock->pattern->index[j2];
1458     temp_val = &(A->mainBlock->val[j2*block_size]);
1459     for (irb=0; irb<row_block_size; irb++)
1460     for (icb=0; icb<col_block_size; icb++)
1461     RA_val[irb+row_block_size*icb]=ZERO;
1462     for (irb=0; irb<P_block_size; irb++) {
1463     rtmp=R_val[irb];
1464     for (icb=0; icb<col_block_size; icb++) {
1465     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1466     }
1467     }
1468    
1469     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1470     A_marker[i2 + num_Acouple_cols] = i_r;
1471     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1472     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1473     i_c = P->mainBlock->pattern->index[j3];
1474     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1475     for (irb=0; irb<row_block_size; irb++)
1476     for (icb=0; icb<col_block_size; icb++)
1477     RAP_val[irb+row_block_size*icb]=ZERO;
1478     for (icb=0; icb<P_block_size; icb++) {
1479     rtmp=temp_val[icb];
1480     for (irb=0; irb<row_block_size; irb++) {
1481     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1482     }
1483     }
1484     if (P_marker[i_c] < row_marker) {
1485     P_marker[i_c] = i;
1486     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1487     RAP_main_idx[i] = i_c;
1488     i++;
1489     } else {
1490     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1491     for (ib=0; ib<block_size; ib++) {
1492     temp_val[ib] += RAP_val[ib];
1493     }
1494     }
1495     }
1496     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1497     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1498     /* note that we need to map the column index in
1499     P->col_coupleBlock back into the column index in
1500     P_ext_couple */
1501     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1502     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1503     for (irb=0; irb<row_block_size; irb++)
1504     for (icb=0; icb<col_block_size; icb++)
1505     RAP_val[irb+row_block_size*icb]=ZERO;
1506     for (icb=0; icb<P_block_size; icb++) {
1507     rtmp=temp_val[icb];
1508     for (irb=0; irb<row_block_size; irb++) {
1509     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1510     }
1511     }
1512    
1513     if (P_marker[i_c] < row_marker_ext) {
1514     P_marker[i_c] = j;
1515     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1516     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1517     j++;
1518     } else {
1519     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1520     for (ib=0; ib<block_size; ib++) {
1521     temp_val[ib] += RAP_val[ib];
1522     }
1523     }
1524     }
1525    
1526     } else {
1527     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1528     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1529     i_c = P->mainBlock->pattern->index[j3];
1530     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1531     for (irb=0; irb<row_block_size; irb++)
1532     for (icb=0; icb<col_block_size; icb++)
1533     RAP_val[irb+row_block_size*icb]=ZERO;
1534     for (icb=0; icb<P_block_size; icb++) {
1535     rtmp=temp_val[icb];
1536     for (irb=0; irb<row_block_size; irb++) {
1537     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1538     }
1539     }
1540    
1541     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1542     for (ib=0; ib<block_size; ib++) {
1543     temp_val[ib] += RAP_val[ib];
1544     }
1545     }
1546     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1547     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1548     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1549     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1550     for (irb=0; irb<row_block_size; irb++)
1551     for (icb=0; icb<col_block_size; icb++)
1552     RAP_val[irb+row_block_size*icb]=ZERO;
1553     for (icb=0; icb<P_block_size; icb++) {
1554     rtmp = temp_val[icb];
1555     for (irb=0; irb<row_block_size; irb++) {
1556     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1557     }
1558     }
1559    
1560     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1561     for (ib=0; ib<block_size; ib++) {
1562     temp_val[ib] += RAP_val[ib];
1563     }
1564     }
1565     }
1566     }
1567     }
1568    
1569     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1570     if (i > row_marker) {
1571     offset = i - row_marker;
1572     temp = TMPMEMALLOC(offset, index_t);
1573     for (iptr=0; iptr<offset; iptr++)
1574     temp[iptr] = RAP_main_idx[row_marker+iptr];
1575     if (offset > 0) {
1576     #ifdef USE_QSORTG
1577     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1578     #else
1579     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1580     #endif
1581     }
1582     temp_val = TMPMEMALLOC(offset * block_size, double);
1583     for (iptr=0; iptr<offset; iptr++){
1584     k = P_marker[temp[iptr]];
1585     memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1586     P_marker[temp[iptr]] = iptr + row_marker;
1587     }
1588     for (iptr=0; iptr<offset; iptr++){
1589     RAP_main_idx[row_marker+iptr] = temp[iptr];
1590     memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1591     }
1592     TMPMEMFREE(temp);
1593     TMPMEMFREE(temp_val);
1594     }
1595     if (j > row_marker_ext) {
1596     offset = j - row_marker_ext;
1597     temp = TMPMEMALLOC(offset, index_t);
1598     for (iptr=0; iptr<offset; iptr++)
1599     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1600     if (offset > 0) {
1601     #ifdef USE_QSORTG
1602     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1603     #else
1604     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1605     #endif
1606     }
1607     temp_val = TMPMEMALLOC(offset * block_size, double);
1608     for (iptr=0; iptr<offset; iptr++){
1609     k = P_marker[temp[iptr] + num_Pmain_cols];
1610     memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1611     P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1612     }
1613     for (iptr=0; iptr<offset; iptr++){
1614     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1615     memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1616     }
1617     TMPMEMFREE(temp);
1618     TMPMEMFREE(temp_val);
1619     }
1620     }
1621    
1622     TMPMEMFREE(RA_val);
1623     TMPMEMFREE(RAP_val);
1624     TMPMEMFREE(A_marker);
1625     TMPMEMFREE(Pext_to_RAP);
1626     TMPMEMFREE(Pcouple_to_RAP);
1627     MEMFREE(RAP_ext_ptr);
1628     MEMFREE(RAP_ext_idx);
1629     MEMFREE(RAP_ext_val);
1630     Paso_SparseMatrix_free(R_main);
1631     Paso_SparseMatrix_free(R_couple);
1632    
1633     /* Check whether there are empty columns in RAP_couple */
1634     #pragma omp parallel for schedule(static) private(i)
1635     for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1636     /* num of non-empty columns is stored in "k" */
1637     k = 0;
1638     j = RAP_couple_ptr[num_Pmain_cols];
1639     for (i=0; i<j; i++) {
1640     i1 = RAP_couple_idx[i];
1641     if (P_marker[i1]) {
1642     P_marker[i1] = 0;
1643     k++;
1644     }
1645     }
1646    
1647     /* empty columns is found */
1648     if (k < num_RAPext_cols) {
1649     temp = TMPMEMALLOC(k, index_t);
1650     k = 0;
1651     for (i=0; i<num_RAPext_cols; i++)
1652     if (!P_marker[i]) {
1653     P_marker[i] = k;
1654     temp[k] = global_id_RAP[i];
1655     k++;
1656     }
1657     for (i=0; i<j; i++) {
1658     i1 = RAP_couple_idx[i];
1659     RAP_couple_idx[i] = P_marker[i1];
1660     }
1661     num_RAPext_cols = k;
1662     TMPMEMFREE(global_id_RAP);
1663     global_id_RAP = temp;
1664     }
1665     TMPMEMFREE(P_marker);
1666    
1667     /******************************************************/
1668     /* Start to create the coasre level System Matrix A_c */
1669     /******************************************************/
1670     /* first, prepare the sender/receiver for the col_connector */
1671     dist = P->pattern->input_distribution->first_component;
1672     recv_len = TMPMEMALLOC(size, dim_t);
1673     send_len = TMPMEMALLOC(size, dim_t);
1674     neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
1675     offsetInShared = TMPMEMALLOC(size+1, index_t);
1676     shared = TMPMEMALLOC(num_RAPext_cols, index_t);
1677     memset(recv_len, 0, sizeof(dim_t) * size);
1678     memset(send_len, 0, sizeof(dim_t) * size);
1679     num_neighbors = 0;
1680     offsetInShared[0] = 0;
1681     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1682     shared[i] = i + num_Pmain_cols;
1683     if (k <= global_id_RAP[i]) {
1684     if (recv_len[j] > 0) {
1685     neighbor[num_neighbors] = j;
1686     num_neighbors ++;
1687     offsetInShared[num_neighbors] = i;
1688     }
1689     while (k <= global_id_RAP[i]) {
1690     j++;
1691     k = dist[j+1];
1692     }
1693     }
1694     recv_len[j] ++;
1695     }
1696     if (recv_len[j] > 0) {
1697     neighbor[num_neighbors] = j;
1698     num_neighbors ++;
1699     offsetInShared[num_neighbors] = i;
1700     }
1701     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1702     neighbor, shared, offsetInShared, 1, 0, mpi_info);
1703    
1704     MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1705    
1706     #ifdef ESYS_MPI
1707     mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
1708     mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
1709     #else
1710     mpi_requests=TMPMEMALLOC(size*2,int);
1711     mpi_stati=TMPMEMALLOC(size*2,int);
1712     #endif
1713     num_neighbors = 0;
1714     j = 0;
1715     offsetInShared[0] = 0;
1716     for (i=0; i<size; i++) {
1717     if (send_len[i] > 0) {
1718     neighbor[num_neighbors] = i;
1719     num_neighbors ++;
1720     j += send_len[i];
1721     offsetInShared[num_neighbors] = j;
1722     }
1723     }
1724     TMPMEMFREE(shared);
1725     shared = TMPMEMALLOC(j, index_t);
1726     for (i=0, j=0; i<num_neighbors; i++) {
1727     k = neighbor[i];
1728     MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1729     mpi_info->msg_tag_counter+k,
1730     mpi_info->comm, &mpi_requests[i]);
1731     j += send_len[k];
1732     }
1733     for (i=0, j=0; i<recv->numNeighbors; i++) {
1734     k = recv->neighbor[i];
1735     MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1736     mpi_info->msg_tag_counter+rank,
1737     mpi_info->comm, &mpi_requests[i+num_neighbors]);
1738     j += recv_len[k];
1739     }
1740     MPI_Waitall(num_neighbors + recv->numNeighbors,
1741     mpi_requests, mpi_stati);
1742     mpi_info->msg_tag_counter += size;
1743    
1744     j = offsetInShared[num_neighbors];
1745     offset = dist[rank];
1746     for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1747     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1748     neighbor, shared, offsetInShared, 1, 0, mpi_info);
1749    
1750     col_connector = Paso_Connector_alloc(send, recv);
1751     TMPMEMFREE(shared);
1752     Paso_SharedComponents_free(recv);
1753     Paso_SharedComponents_free(send);
1754    
1755     /* now, create row distribution (output_distri) and col
1756     distribution (input_distribution) */
1757     input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1758     output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1759    
1760     /* then, prepare the sender/receiver for the row_connector, first, prepare
1761     the information for sender */
1762     sum = RAP_couple_ptr[num_Pmain_cols];
1763     len = TMPMEMALLOC(size, dim_t);
1764     send_ptr = TMPMEMALLOC(size, index_t*);
1765     send_idx = TMPMEMALLOC(size, index_t*);
1766     for (i=0; i<size; i++) {
1767     send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
1768     send_idx[i] = TMPMEMALLOC(sum, index_t);
1769     memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1770     }
1771     memset(len, 0, sizeof(dim_t) * size);
1772     recv = col_connector->recv;
1773     sum=0;
1774     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1775     i1 = RAP_couple_ptr[i_r];
1776     i2 = RAP_couple_ptr[i_r+1];
1777     if (i2 > i1) {
1778     /* then row i_r will be in the sender of row_connector, now check
1779     how many neighbors i_r needs to be send to */
1780     for (j=i1; j<i2; j++) {
1781     i_c = RAP_couple_idx[j];
1782     /* find out the corresponding neighbor "p" of column i_c */
1783     for (p=0; p<recv->numNeighbors; p++) {
1784     if (i_c < recv->offsetInShared[p+1]) {
1785     k = recv->neighbor[p];
1786     if (send_ptr[k][i_r] == 0) sum++;
1787     send_ptr[k][i_r] ++;
1788     send_idx[k][len[k]] = global_id_RAP[i_c];
1789     len[k] ++;
1790     break;
1791     }
1792     }
1793     }
1794     }
1795     }
1796     if (global_id_RAP) {
1797     TMPMEMFREE(global_id_RAP);
1798     global_id_RAP = NULL;
1799     }
1800    
1801     /* now allocate the sender */
1802     shared = TMPMEMALLOC(sum, index_t);
1803     memset(send_len, 0, sizeof(dim_t) * size);
1804     num_neighbors=0;
1805     offsetInShared[0] = 0;
1806     for (p=0, k=0; p<size; p++) {
1807     for (i=0; i<num_Pmain_cols; i++) {
1808     if (send_ptr[p][i] > 0) {
1809     shared[k] = i;
1810     k++;
1811     send_ptr[p][send_len[p]] = send_ptr[p][i];
1812     send_len[p]++;
1813     }
1814     }
1815     if (k > offsetInShared[num_neighbors]) {
1816     neighbor[num_neighbors] = p;
1817     num_neighbors ++;
1818     offsetInShared[num_neighbors] = k;
1819     }
1820     }
1821     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1822     neighbor, shared, offsetInShared, 1, 0, mpi_info);
1823    
1824     /* send/recv number of rows will be sent from current proc
1825     recover info for the receiver of row_connector from the sender */
1826     MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1827     num_neighbors = 0;
1828     offsetInShared[0] = 0;
1829     j = 0;
1830     for (i=0; i<size; i++) {
1831     if (i != rank && recv_len[i] > 0) {
1832     neighbor[num_neighbors] = i;
1833     num_neighbors ++;
1834     j += recv_len[i];
1835     offsetInShared[num_neighbors] = j;
1836     }
1837     }
1838     TMPMEMFREE(shared);
1839     TMPMEMFREE(recv_len);
1840     shared = TMPMEMALLOC(j, index_t);
1841     k = offsetInShared[num_neighbors];
1842     for (i=0; i<k; i++) {
1843     shared[i] = i + num_Pmain_cols;
1844     }
1845     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1846     neighbor, shared, offsetInShared, 1, 0, mpi_info);
1847     row_connector = Paso_Connector_alloc(send, recv);
1848     TMPMEMFREE(shared);
1849     Paso_SharedComponents_free(recv);
1850     Paso_SharedComponents_free(send);
1851    
1852     /* send/recv pattern->ptr for rowCoupleBlock */
1853     num_RAPext_rows = offsetInShared[num_neighbors];
1854     row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
1855     for (p=0; p<num_neighbors; p++) {
1856     j = offsetInShared[p];
1857     i = offsetInShared[p+1];
1858     MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1859     mpi_info->msg_tag_counter+neighbor[p],
1860     mpi_info->comm, &mpi_requests[p]);
1861     }
1862     send = row_connector->send;
1863     for (p=0; p<send->numNeighbors; p++) {
1864     MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1865     MPI_INT, send->neighbor[p],
1866     mpi_info->msg_tag_counter+rank,
1867     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1868     }
1869     MPI_Waitall(num_neighbors + send->numNeighbors,
1870     mpi_requests, mpi_stati);
1871     mpi_info->msg_tag_counter += size;
1872     TMPMEMFREE(send_len);
1873    
1874     sum = 0;
1875     for (i=0; i<num_RAPext_rows; i++) {
1876     k = row_couple_ptr[i];
1877     row_couple_ptr[i] = sum;
1878     sum += k;
1879     }
1880     row_couple_ptr[num_RAPext_rows] = sum;
1881    
1882     /* send/recv pattern->index for rowCoupleBlock */
1883     k = row_couple_ptr[num_RAPext_rows];
1884     row_couple_idx = MEMALLOC(k, index_t);
1885     for (p=0; p<num_neighbors; p++) {
1886     j1 = row_couple_ptr[offsetInShared[p]];
1887     j2 = row_couple_ptr[offsetInShared[p+1]];
1888     MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1889     mpi_info->msg_tag_counter+neighbor[p],
1890     mpi_info->comm, &mpi_requests[p]);
1891     }
1892     for (p=0; p<send->numNeighbors; p++) {
1893     MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1894     MPI_INT, send->neighbor[p],
1895     mpi_info->msg_tag_counter+rank,
1896     mpi_info->comm, &mpi_requests[p+num_neighbors]);
1897     }
1898     MPI_Waitall(num_neighbors + send->numNeighbors,
1899     mpi_requests, mpi_stati);
1900     mpi_info->msg_tag_counter += size;
1901    
1902     offset = input_dist->first_component[rank];
1903     k = row_couple_ptr[num_RAPext_rows];
1904     for (i=0; i<k; i++) {
1905     row_couple_idx[i] -= offset;
1906     }
1907    
1908     for (i=0; i<size; i++) {
1909     TMPMEMFREE(send_ptr[i]);
1910     TMPMEMFREE(send_idx[i]);
1911     }
1912     TMPMEMFREE(send_ptr);
1913     TMPMEMFREE(send_idx);
1914     TMPMEMFREE(len);
1915     TMPMEMFREE(offsetInShared);
1916     TMPMEMFREE(neighbor);
1917     TMPMEMFREE(mpi_requests);
1918     TMPMEMFREE(mpi_stati);
1919     Esys_MPIInfo_free(mpi_info);
1920    
1921     /* Now, we can create pattern for mainBlock and coupleBlock */
1922     main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1923     num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1924     col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1925     num_Pmain_cols, num_RAPext_cols,
1926     RAP_couple_ptr, RAP_couple_idx);
1927     row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1928     num_RAPext_rows, num_Pmain_cols,
1929     row_couple_ptr, row_couple_idx);
1930    
1931     /* next, create the system matrix */
1932     pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
1933     output_dist, input_dist, main_pattern, col_couple_pattern,
1934     row_couple_pattern, col_connector, row_connector);
1935     out = Paso_SystemMatrix_alloc(A->type, pattern,
1936     row_block_size, col_block_size, FALSE);
1937    
1938     /* finally, fill in the data*/
1939     memcpy(out->mainBlock->val, RAP_main_val,
1940     out->mainBlock->len* sizeof(double));
1941     memcpy(out->col_coupleBlock->val, RAP_couple_val,
1942     out->col_coupleBlock->len * sizeof(double));
1943    
1944     /* Clean up */
1945     Paso_SystemMatrixPattern_free(pattern);
1946     Paso_Pattern_free(main_pattern);
1947     Paso_Pattern_free(col_couple_pattern);
1948     Paso_Pattern_free(row_couple_pattern);
1949     Paso_Connector_free(col_connector);
1950     Paso_Connector_free(row_connector);
1951     Paso_Distribution_free(output_dist);
1952     Paso_Distribution_free(input_dist);
1953     TMPMEMFREE(RAP_main_val);
1954     TMPMEMFREE(RAP_couple_val);
1955     if (Esys_noError()) {
1956     return out;
1957     } else {
1958     Paso_SystemMatrix_free(out);
1959     return NULL;
1960     }
1961     }
1962    
1963    
1964     Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
1965     Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
1966     {
1967     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
1968     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
1969     Paso_SystemMatrix *out=NULL;
1970     Paso_SystemMatrixPattern *pattern=NULL;
1971     Paso_Distribution *input_dist=NULL, *output_dist=NULL;
1972     Paso_SharedComponents *send =NULL, *recv=NULL;
1973     Paso_Connector *col_connector=NULL, *row_connector=NULL;
1974     Paso_Coupler *coupler=NULL;
1975     Paso_Pattern *main_pattern=NULL;
1976     Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
1977     const dim_t row_block_size=A->row_block_size;
1978     const dim_t col_block_size=A->col_block_size;
1979     const dim_t block_size = A->block_size;
1980     const dim_t P_block_size = P->block_size;
1981     const dim_t num_threads=omp_get_max_threads();
1982     const double ZERO = 0.0;
1983     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
1984     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
1985     index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
1986     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
1987     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
1988     index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
1989     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
1990     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
1991     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
1992     index_t sum, i, j, k, iptr, irb, icb, ib;
1993     index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
1994     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
1995     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
1996     index_t row_marker_ext, *where_p=NULL;
1997     index_t **send_ptr=NULL, **send_idx=NULL;
1998     dim_t l, p, q, global_label, num_neighbors;
1999     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2000     Esys_MPI_rank *neighbor=NULL;
2001     #ifdef ESYS_MPI
2002     MPI_Request* mpi_requests=NULL;
2003     MPI_Status* mpi_stati=NULL;
2004     #else
2005     int *mpi_requests=NULL, *mpi_stati=NULL;
2006     #endif
2007    
2008     /* two sparse matrixes R_main and R_couple will be generate, as the
2009     transpose of P_main and P_col_couple, respectively. Note that,
2010     R_couple is actually the row_coupleBlock of R (R=P^T) */
2011     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2012     R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2013    
2014 lgao 3763 /* generate P_ext, i.e. portion of P that is tored on neighbor procs
2015     and needed locally for triple matrix product RAP
2016     to be specific, P_ext (on processor k) are group of rows in P, where
2017     the list of rows from processor q is given by
2018     A->col_coupler->connector->send->shared[rPtr...]
2019     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2020     on q.
2021     P_ext is represented by two sparse matrixes P_ext_main and
2022     P_ext_couple */
2023     Paso_Preconditioner_AMG_extendB(A, P);
2024    
2025     /* count the number of cols in P_ext_couple, resize the pattern of
2026     sparse matrix P_ext_couple with new compressed order, and then
2027     build the col id mapping from P->col_coupleBlock to
2028     P_ext_couple */
2029     num_Pmain_cols = P->mainBlock->numCols;
2030     num_Pcouple_cols = P->col_coupleBlock->numCols;
2031     num_Acouple_cols = A->col_coupleBlock->numCols;
2032     num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2033     sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2034     offset = P->col_distribution->first_component[rank];
2035     num_Pext_cols = 0;
2036     if (P->global_id) {
2037     /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2038     and P->col_coupleBlock */
2039     iptr = 0;
2040     if (num_Pcouple_cols || sum > 0) {
2041     temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
2042 lgao 3767 for (; iptr<sum; iptr++){
2043 lgao 3763 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2044 lgao 3767 }
2045     for (j=0; j<num_Pcouple_cols; j++, iptr++){
2046 lgao 3763 temp[iptr] = P->global_id[j];
2047 lgao 3767 }
2048 lgao 3763 }
2049     if (iptr) {
2050     #ifdef USE_QSORTG
2051     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2052     #else
2053     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2054     #endif
2055     num_Pext_cols = 1;
2056     i = temp[0];
2057     for (j=1; j<iptr; j++) {
2058     if (temp[j] > i) {
2059     i = temp[j];
2060     temp[num_Pext_cols++] = i;
2061     }
2062     }
2063     }
2064    
2065     /* resize the pattern of P_ext_couple */
2066     if(num_Pext_cols){
2067     global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
2068     for (i=0; i<num_Pext_cols; i++)
2069     global_id_P[i] = temp[i];
2070     }
2071     if (num_Pcouple_cols || sum > 0)
2072     TMPMEMFREE(temp);
2073     for (i=0; i<sum; i++) {
2074     where_p = (index_t *)bsearch(
2075     &(P->remote_coupleBlock->pattern->index[i]),
2076     global_id_P, num_Pext_cols,
2077     sizeof(index_t), Paso_comparIndex);
2078     P->remote_coupleBlock->pattern->index[i] =
2079     (index_t)(where_p -global_id_P);
2080     }
2081    
2082     /* build the mapping */
2083     if (num_Pcouple_cols) {
2084     Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
2085     iptr = 0;
2086     for (i=0; i<num_Pext_cols; i++)
2087     if (global_id_P[i] == P->global_id[iptr]) {
2088     Pcouple_to_Pext[iptr++] = i;
2089     if (iptr == num_Pcouple_cols) break;
2090     }
2091     }
2092     }
2093    
2094     /* alloc and initialize the makers */
2095     sum = num_Pext_cols + num_Pmain_cols;
2096     P_marker = TMPMEMALLOC(sum, index_t);
2097     A_marker = TMPMEMALLOC(num_A_cols, index_t);
2098     #pragma omp parallel for private(i) schedule(static)
2099     for (i=0; i<sum; i++) P_marker[i] = -1;
2100     #pragma omp parallel for private(i) schedule(static)
2101     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2102    
2103     /* Now, count the size of RAP_ext. Start with rows in R_couple */
2104     sum = 0;
2105     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2106     row_marker = sum;
2107     /* then, loop over elements in row i_r of R_couple */
2108     j1_ub = R_couple->pattern->ptr[i_r+1];
2109     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2110     i1 = R_couple->pattern->index[j1];
2111     /* then, loop over elements in row i1 of A->col_coupleBlock */
2112     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2113     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2114     i2 = A->col_coupleBlock->pattern->index[j2];
2115    
2116     /* check whether entry RA[i_r, i2] has been previously visited.
2117     RAP new entry is possible only if entry RA[i_r, i2] has not
2118     been visited yet */
2119     if (A_marker[i2] != i_r) {
2120     /* first, mark entry RA[i_r,i2] as visited */
2121     A_marker[i2] = i_r;
2122    
2123     /* then loop over elements in row i2 of P_ext_main */
2124     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2125     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2126     i_c = P->row_coupleBlock->pattern->index[j3];
2127    
2128     /* check whether entry RAP[i_r,i_c] has been created.
2129     If not yet created, create the entry and increase
2130     the total number of elements in RAP_ext */
2131     if (P_marker[i_c] < row_marker) {
2132     P_marker[i_c] = sum;
2133     sum++;
2134     }
2135     }
2136    
2137     /* loop over elements in row i2 of P_ext_couple, do the same */
2138     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2139     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2140     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2141     if (P_marker[i_c] < row_marker) {
2142     P_marker[i_c] = sum;
2143     sum++;
2144     }
2145     }
2146     }
2147     }
2148    
2149     /* now loop over elements in row i1 of A->mainBlock, repeat
2150     the process we have done to block A->col_coupleBlock */
2151     j2_ub = A->mainBlock->pattern->ptr[i1+1];
2152     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2153     i2 = A->mainBlock->pattern->index[j2];
2154     if (A_marker[i2 + num_Acouple_cols] != i_r) {
2155     A_marker[i2 + num_Acouple_cols] = i_r;
2156     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2157     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2158     i_c = P->mainBlock->pattern->index[j3];
2159     if (P_marker[i_c] < row_marker) {
2160     P_marker[i_c] = sum;
2161     sum++;
2162     }
2163     }
2164     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2165     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2166     /* note that we need to map the column index in
2167     P->col_coupleBlock back into the column index in
2168     P_ext_couple */
2169     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2170     if (P_marker[i_c] < row_marker) {
2171     P_marker[i_c] = sum;
2172     sum++;
2173     }
2174     }
2175     }
2176     }
2177     }
2178     }
2179    
2180     /* Now we have the number of non-zero elements in RAP_ext, allocate
2181     PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2182     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
2183     RAP_ext_idx = MEMALLOC(sum, index_t);
2184 lgao 3779 RAP_ext_val = MEMALLOC(sum * block_size, double);
2185     RA_val = TMPMEMALLOC(block_size, double);
2186     RAP_val = TMPMEMALLOC(block_size, double);
2187 lgao 3763
2188     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
2189     sum = num_Pext_cols + num_Pmain_cols;
2190     #pragma omp parallel for private(i) schedule(static)
2191     for (i=0; i<sum; i++) P_marker[i] = -1;
2192     #pragma omp parallel for private(i) schedule(static)
2193     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2194     sum = 0;
2195     RAP_ext_ptr[0] = 0;
2196     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2197     row_marker = sum;
2198     /* then, loop over elements in row i_r of R_couple */
2199     j1_ub = R_couple->pattern->ptr[i_r+1];
2200     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2201     i1 = R_couple->pattern->index[j1];
2202 lgao 3779 R_val = &(R_couple->val[j1*block_size]);
2203 lgao 3763
2204     /* then, loop over elements in row i1 of A->col_coupleBlock */
2205     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2206     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2207     i2 = A->col_coupleBlock->pattern->index[j2];
2208 lgao 3779 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2209     for (irb=0; irb<row_block_size; irb++) {
2210     for (icb=0; icb<col_block_size; icb++) {
2211     rtmp = ZERO;
2212     for (ib=0; ib<col_block_size; ib++) {
2213     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2214     }
2215     RA_val[irb+row_block_size*icb]=rtmp;
2216     }
2217     }
2218 lgao 3763
2219     /* check whether entry RA[i_r, i2] has been previously visited.
2220     RAP new entry is possible only if entry RA[i_r, i2] has not
2221     been visited yet */
2222     if (A_marker[i2] != i_r) {
2223     /* first, mark entry RA[i_r,i2] as visited */
2224     A_marker[i2] = i_r;
2225    
2226     /* then loop over elements in row i2 of P_ext_main */
2227     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2228     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2229     i_c = P->row_coupleBlock->pattern->index[j3];
2230 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2231     for (irb=0; irb<row_block_size; irb++) {
2232     for (icb=0; icb<col_block_size; icb++) {
2233     rtmp = ZERO;
2234     for (ib=0; ib<col_block_size; ib++) {
2235     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2236     }
2237     RAP_val[irb+row_block_size*icb]=rtmp;
2238     }
2239     }
2240 lgao 3763
2241 lgao 3779
2242 lgao 3763 /* check whether entry RAP[i_r,i_c] has been created.
2243     If not yet created, create the entry and increase
2244     the total number of elements in RAP_ext */
2245     if (P_marker[i_c] < row_marker) {
2246     P_marker[i_c] = sum;
2247 lgao 3779 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2248 lgao 3763 RAP_ext_idx[sum] = i_c + offset;
2249     sum++;
2250     } else {
2251 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2252     for (ib=0; ib<block_size; ib++) {
2253     temp_val[ib] += RAP_val[ib];
2254     }
2255 lgao 3763 }
2256     }
2257    
2258     /* loop over elements in row i2 of P_ext_couple, do the same */
2259     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2260     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2261     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2262 lgao 3779 //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
2263     temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2264     for (irb=0; irb<row_block_size; irb++) {
2265     for (icb=0; icb<col_block_size; icb++) {
2266     rtmp = ZERO;
2267     for (ib=0; ib<col_block_size; ib++) {
2268     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2269     }
2270     RAP_val[irb+row_block_size*icb]=rtmp;
2271     }
2272     }
2273 lgao 3763
2274     if (P_marker[i_c] < row_marker) {
2275     P_marker[i_c] = sum;
2276 lgao 3779 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2277 lgao 3763 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2278     sum++;
2279     } else {
2280 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2281     for (ib=0; ib<block_size; ib++) {
2282     temp_val[ib] += RAP_val[ib];
2283     }
2284 lgao 3763 }
2285     }
2286    
2287     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2288     Only the contributions are added. */
2289     } else {
2290     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2291     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2292     i_c = P->row_coupleBlock->pattern->index[j3];
2293 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2294     for (irb=0; irb<row_block_size; irb++) {
2295     for (icb=0; icb<col_block_size; icb++) {
2296     rtmp = ZERO;
2297     for (ib=0; ib<col_block_size; ib++) {
2298     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2299     }
2300     RAP_val[irb+row_block_size*icb]=rtmp;
2301     }
2302     }
2303    
2304     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2305     for (ib=0; ib<block_size; ib++) {
2306     temp_val[ib] += RAP_val[ib];
2307     }
2308 lgao 3763 }
2309     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2310     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2311     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2312 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2313     for (irb=0; irb<row_block_size; irb++) {
2314     for (icb=0; icb<col_block_size; icb++) {
2315     rtmp = ZERO;
2316     for (ib=0; ib<col_block_size; ib++) {
2317     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2318     }
2319     RAP_val[irb+row_block_size*icb]=rtmp;
2320     }
2321     }
2322    
2323     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2324     for (ib=0; ib<block_size; ib++) {
2325     temp_val[ib] += RAP_val[ib];
2326     }
2327 lgao 3763 }
2328     }
2329     }
2330    
2331     /* now loop over elements in row i1 of A->mainBlock, repeat
2332     the process we have done to block A->col_coupleBlock */
2333     j2_ub = A->mainBlock->pattern->ptr[i1+1];
2334     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2335     i2 = A->mainBlock->pattern->index[j2];
2336 lgao 3779 temp_val = &(A->mainBlock->val[j2*block_size]);
2337     for (irb=0; irb<row_block_size; irb++) {
2338     for (icb=0; icb<col_block_size; icb++) {
2339     rtmp = ZERO;
2340     for (ib=0; ib<col_block_size; ib++) {
2341     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2342     }
2343     RA_val[irb+row_block_size*icb]=rtmp;
2344     }
2345     }
2346    
2347 lgao 3763 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2348     A_marker[i2 + num_Acouple_cols] = i_r;
2349     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2350     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2351     i_c = P->mainBlock->pattern->index[j3];
2352 lgao 3779 temp_val = &(P->mainBlock->val[j3*block_size]);
2353     for (irb=0; irb<row_block_size; irb++) {
2354     for (icb=0; icb<col_block_size; icb++) {
2355     rtmp = ZERO;
2356     for (ib=0; ib<col_block_size; ib++) {
2357     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2358     }
2359     RAP_val[irb+row_block_size*icb]=rtmp;
2360     }
2361     }
2362    
2363 lgao 3763 if (P_marker[i_c] < row_marker) {
2364     P_marker[i_c] = sum;
2365 lgao 3779 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2366 lgao 3763 RAP_ext_idx[sum] = i_c + offset;
2367     sum++;
2368     } else {
2369 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2370     for (ib=0; ib<block_size; ib++) {
2371     temp_val[ib] += RAP_val[ib];
2372     }
2373 lgao 3763 }
2374     }
2375     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2376     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2377     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2378 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2379     for (irb=0; irb<row_block_size; irb++) {
2380     for (icb=0; icb<col_block_size; icb++) {
2381     rtmp = ZERO;
2382     for (ib=0; ib<col_block_size; ib++) {
2383     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2384     }
2385     RAP_val[irb+row_block_size*icb]=rtmp;
2386     }
2387     }
2388    
2389 lgao 3763 if (P_marker[i_c] < row_marker) {
2390     P_marker[i_c] = sum;
2391 lgao 3779 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2392 lgao 3763 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2393     sum++;
2394     } else {
2395 lgao 3779 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2396     for (ib=0; ib<block_size; ib++) {
2397     temp_val[ib] += RAP_val[ib];
2398     }
2399 lgao 3763 }
2400     }
2401     } else {
2402     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2403     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2404     i_c = P->mainBlock->pattern->index[j3];
2405 lgao 3779 temp_val = &(P->mainBlock->val[j3*block_size]);
2406     for (irb=0; irb<row_block_size; irb++) {
2407     for (icb=0; icb<col_block_size; icb++) {
2408     rtmp = ZERO;
2409     for (ib=0; ib<col_block_size; ib++) {
2410     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2411     }
2412     RAP_val[irb+row_block_size*icb]=rtmp;
2413     }
2414     }
2415    
2416     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2417     for (ib=0; ib<block_size; ib++) {
2418     temp_val[ib] += RAP_val[ib];
2419     }
2420 lgao 3763 }
2421     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2422     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2423     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2424 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2425     for (irb=0; irb<row_block_size; irb++) {
2426     for (icb=0; icb<col_block_size; icb++) {
2427     rtmp = ZERO;
2428     for (ib=0; ib<col_block_size; ib++) {
2429     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2430     }
2431     RAP_val[irb+row_block_size*icb]=rtmp;
2432     }
2433     }
2434    
2435     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2436     for (ib=0; ib<block_size; ib++) {
2437     temp_val[ib] += RAP_val[ib];
2438     }
2439 lgao 3763 }
2440     }
2441     }
2442     }
2443     RAP_ext_ptr[i_r+1] = sum;
2444     }
2445     TMPMEMFREE(P_marker);
2446     TMPMEMFREE(Pcouple_to_Pext);
2447    
2448     /* Now we have part of RAP[r,c] where row "r" is the list of rows
2449     which is given by the column list of P->col_coupleBlock, and
2450     column "c" is the list of columns which possiblely covers the
2451     whole column range of systme martris P. This part of data is to
2452     be passed to neighboring processors, and added to corresponding
2453     RAP[r,c] entries in the neighboring processors */
2454     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2455 lgao 3779 &RAP_ext_val, global_id_P, block_size);
2456 lgao 3763
2457     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2458     sum = RAP_ext_ptr[num_RAPext_rows];
2459     num_RAPext_cols = 0;
2460     if (num_Pext_cols || sum > 0) {
2461     temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
2462     j1_ub = offset + num_Pmain_cols;
2463     for (i=0, iptr=0; i<sum; i++) {
2464 lgao 3819 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2465 lgao 3763 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2466     }
2467 lgao 3767 for (j=0; j<num_Pext_cols; j++, iptr++){
2468 lgao 3763 temp[iptr] = global_id_P[j];
2469 lgao 3767 }
2470 lgao 3763
2471     if (iptr) {
2472     #ifdef USE_QSORTG
2473     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2474     #else
2475     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2476     #endif
2477     num_RAPext_cols = 1;
2478     i = temp[0];
2479     for (j=1; j<iptr; j++) {
2480     if (temp[j] > i) {
2481     i = temp[j];
2482     temp[num_RAPext_cols++] = i;
2483     }
2484     }
2485     }
2486     }
2487    
2488     /* resize the pattern of P_ext_couple */
2489     if(num_RAPext_cols){
2490     global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
2491     for (i=0; i<num_RAPext_cols; i++)
2492     global_id_RAP[i] = temp[i];
2493     }
2494     if (num_Pext_cols || sum > 0)
2495     TMPMEMFREE(temp);
2496     j1_ub = offset + num_Pmain_cols;
2497     for (i=0; i<sum; i++) {
2498     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2499     where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2500     /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2501     RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2502     } else
2503     RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2504     }
2505    
2506     /* build the mapping */
2507     if (num_Pcouple_cols) {
2508     Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
2509     iptr = 0;
2510     for (i=0; i<num_RAPext_cols; i++)
2511     if (global_id_RAP[i] == P->global_id[iptr]) {
2512     Pcouple_to_RAP[iptr++] = i;
2513     if (iptr == num_Pcouple_cols) break;
2514     }
2515     }
2516    
2517     if (num_Pext_cols) {
2518     Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
2519     iptr = 0;
2520     for (i=0; i<num_RAPext_cols; i++)
2521     if (global_id_RAP[i] == global_id_P[iptr]) {
2522     Pext_to_RAP[iptr++] = i;
2523     if (iptr == num_Pext_cols) break;
2524     }
2525     }
2526    
2527     if (global_id_P){
2528     TMPMEMFREE(global_id_P);
2529     global_id_P = NULL;
2530     }
2531    
2532     /* alloc and initialize the makers */
2533     sum = num_RAPext_cols + num_Pmain_cols;
2534     P_marker = TMPMEMALLOC(sum, index_t);
2535     #pragma omp parallel for private(i) schedule(static)
2536     for (i=0; i<sum; i++) P_marker[i] = -1;
2537     #pragma omp parallel for private(i) schedule(static)
2538     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2539    
2540     /* Now, count the size of RAP. Start with rows in R_main */
2541     num_neighbors = P->col_coupler->connector->send->numNeighbors;
2542     offsetInShared = P->col_coupler->connector->send->offsetInShared;
2543     shared = P->col_coupler->connector->send->shared;
2544     i = 0;
2545     j = 0;
2546     for (i_r=0; i_r<num_Pmain_cols; i_r++){
2547     /* Mark the start of row for both main block and couple block */
2548     row_marker = i;
2549     row_marker_ext = j;
2550    
2551     /* Mark the diagonal element RAP[i_r, i_r], and other elements
2552     in RAP_ext */
2553     P_marker[i_r] = i;
2554     i++;
2555     for (j1=0; j1<num_neighbors; j1++) {
2556     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2557     if (shared[j2] == i_r) {
2558     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2559     i_c = RAP_ext_idx[k];
2560     if (i_c < num_Pmain_cols) {
2561     if (P_marker[i_c] < row_marker) {
2562     P_marker[i_c] = i;
2563     i++;
2564     }
2565     } else {
2566     if (P_marker[i_c] < row_marker_ext) {
2567     P_marker[i_c] = j;
2568     j++;
2569     }
2570     }
2571     }
2572     break;
2573     }
2574     }
2575     }
2576    
2577     /* then, loop over elements in row i_r of R_main */
2578     j1_ub = R_main->pattern->ptr[i_r+1];
2579     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2580     i1 = R_main->pattern->index[j1];
2581    
2582     /* then, loop over elements in row i1 of A->col_coupleBlock */
2583     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2584     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2585     i2 = A->col_coupleBlock->pattern->index[j2];
2586    
2587     /* check whether entry RA[i_r, i2] has been previously visited.
2588     RAP new entry is possible only if entry RA[i_r, i2] has not
2589     been visited yet */
2590     if (A_marker[i2] != i_r) {
2591     /* first, mark entry RA[i_r,i2] as visited */
2592     A_marker[i2] = i_r;
2593    
2594     /* then loop over elements in row i2 of P_ext_main */
2595     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2596     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2597     i_c = P->row_coupleBlock->pattern->index[j3];
2598    
2599     /* check whether entry RAP[i_r,i_c] has been created.
2600     If not yet created, create the entry and increase
2601     the total number of elements in RAP_ext */
2602     if (P_marker[i_c] < row_marker) {
2603     P_marker[i_c] = i;
2604     i++;
2605     }
2606     }
2607    
2608     /* loop over elements in row i2 of P_ext_couple, do the same */
2609     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2610     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2611     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2612     if (P_marker[i_c] < row_marker_ext) {
2613     P_marker[i_c] = j;
2614     j++;
2615     }
2616     }
2617     }
2618     }
2619    
2620     /* now loop over elements in row i1 of A->mainBlock, repeat
2621     the process we have done to block A->col_coupleBlock */
2622     j2_ub = A->mainBlock->pattern->ptr[i1+1];
2623     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2624     i2 = A->mainBlock->pattern->index[j2];
2625     if (A_marker[i2 + num_Acouple_cols] != i_r) {
2626     A_marker[i2 + num_Acouple_cols] = i_r;
2627     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2628     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2629     i_c = P->mainBlock->pattern->index[j3];
2630     if (P_marker[i_c] < row_marker) {
2631     P_marker[i_c] = i;
2632     i++;
2633     }
2634     }
2635     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2636     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2637     /* note that we need to map the column index in
2638     P->col_coupleBlock back into the column index in
2639     P_ext_couple */
2640     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2641     if (P_marker[i_c] < row_marker_ext) {
2642     P_marker[i_c] = j;
2643     j++;
2644     }
2645     }
2646     }
2647     }
2648     }
2649     }
2650    
2651     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
2652     Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2653     and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2654     RAP_couple */
2655     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2656     RAP_main_idx = MEMALLOC(i, index_t);
2657 lgao 3779 RAP_main_val = TMPMEMALLOC(i * block_size, double);
2658 lgao 3763 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2659     RAP_couple_idx = MEMALLOC(j, index_t);
2660 lgao 3779 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
2661 lgao 3763
2662     RAP_main_ptr[num_Pmain_cols] = i;
2663     RAP_couple_ptr[num_Pmain_cols] = j;
2664    
2665     #pragma omp parallel for private(i) schedule(static)
2666     for (i=0; i<sum; i++) P_marker[i] = -1;
2667     #pragma omp parallel for private(i) schedule(static)
2668     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2669    
2670     /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
2671     in R_main */
2672     i = 0;
2673     j = 0;
2674     for (i_r=0; i_r<num_Pmain_cols; i_r++){
2675     /* Mark the start of row for both main block and couple block */
2676     row_marker = i;
2677     row_marker_ext = j;
2678     RAP_main_ptr[i_r] = row_marker;
2679     RAP_couple_ptr[i_r] = row_marker_ext;
2680    
2681     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
2682     in row i_r of RAP_ext */
2683     P_marker[i_r] = i;
2684     RAP_main_val[i] = ZERO;
2685     RAP_main_idx[i] = i_r;
2686     i++;
2687    
2688     for (j1=0; j1<num_neighbors; j1++) {
2689     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2690     if (shared[j2] == i_r) {
2691     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2692     i_c = RAP_ext_idx[k];
2693     if (i_c < num_Pmain_cols) {
2694     if (P_marker[i_c] < row_marker) {
2695     P_marker[i_c] = i;
2696 lgao 3779 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2697 lgao 3763 RAP_main_idx[i] = i_c;
2698     i++;
2699     } else {
2700 lgao 3779 temp_val = &(RAP_ext_val[k*block_size]);
2701     RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2702     for (ib=0; ib<block_size; ib++)
2703     RAP_val[ib] += temp_val[ib];
2704 lgao 3763 }
2705     } else {
2706     if (P_marker[i_c] < row_marker_ext) {
2707     P_marker[i_c] = j;
2708 lgao 3779 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2709 lgao 3763 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2710     j++;
2711     } else {
2712 lgao 3779 temp_val = &(RAP_ext_val[k*block_size]);
2713     RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2714     for (ib=0; ib<block_size; ib++)
2715     RAP_val[ib] += temp_val[ib];
2716 lgao 3763 }
2717     }
2718     }
2719     break;
2720     }
2721     }
2722     }
2723    
2724     /* then, loop over elements in row i_r of R_main */
2725     j1_ub = R_main->pattern->ptr[i_r+1];
2726     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2727     i1 = R_main->pattern->index[j1];
2728 lgao 3779 R_val = &(R_main->val[j1*block_size]);
2729 lgao 3763
2730     /* then, loop over elements in row i1 of A->col_coupleBlock */
2731     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2732     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2733     i2 = A->col_coupleBlock->pattern->index[j2];
2734 lgao 3779 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2735     for (irb=0; irb<row_block_size; irb++) {
2736     for (icb=0; icb<col_block_size; icb++) {
2737     rtmp = ZERO;
2738     for (ib=0; ib<col_block_size; ib++) {
2739     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2740     }
2741     RA_val[irb+row_block_size*icb]=rtmp;
2742     }
2743     }
2744 lgao 3763
2745 lgao 3779
2746 lgao 3763 /* check whether entry RA[i_r, i2] has been previously visited.
2747     RAP new entry is possible only if entry RA[i_r, i2] has not
2748     been visited yet */
2749     if (A_marker[i2] != i_r) {
2750     /* first, mark entry RA[i_r,i2] as visited */
2751     A_marker[i2] = i_r;
2752    
2753     /* then loop over elements in row i2 of P_ext_main */
2754     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2755     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2756     i_c = P->row_coupleBlock->pattern->index[j3];
2757 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2758     for (irb=0; irb<row_block_size; irb++) {
2759     for (icb=0; icb<col_block_size; icb++) {
2760     rtmp = ZERO;
2761     for (ib=0; ib<col_block_size; ib++) {
2762     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2763     }
2764     RAP_val[irb+row_block_size*icb]=rtmp;
2765     }
2766     }
2767 lgao 3763
2768 lgao 3779
2769 lgao 3763 /* check whether entry RAP[i_r,i_c] has been created.
2770     If not yet created, create the entry and increase
2771     the total number of elements in RAP_ext */
2772     if (P_marker[i_c] < row_marker) {
2773     P_marker[i_c] = i;
2774 lgao 3779 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2775 lgao 3763 RAP_main_idx[i] = i_c;
2776     i++;
2777     } else {
2778 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2779     for (ib=0; ib<block_size; ib++) {
2780     temp_val[ib] += RAP_val[ib];
2781     }
2782 lgao 3763 }
2783     }
2784    
2785     /* loop over elements in row i2 of P_ext_couple, do the same */
2786     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2787     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2788     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2789 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2790     for (irb=0; irb<row_block_size; irb++) {
2791     for (icb=0; icb<col_block_size; icb++) {
2792     rtmp = ZERO;
2793     for (ib=0; ib<col_block_size; ib++) {
2794     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2795     }
2796     RAP_val[irb+row_block_size*icb]=rtmp;
2797     }
2798     }
2799    
2800 lgao 3763 if (P_marker[i_c] < row_marker_ext) {
2801     P_marker[i_c] = j;
2802 lgao 3779 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2803 lgao 3763 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2804     j++;
2805     } else {
2806 lgao 3779 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2807     for (ib=0; ib<block_size; ib++) {
2808     temp_val[ib] += RAP_val[ib];
2809     }
2810 lgao 3763 }
2811     }
2812    
2813     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2814     Only the contributions are added. */
2815     } else {
2816     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2817     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2818     i_c = P->row_coupleBlock->pattern->index[j3];
2819 lgao 3779 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2820     for (irb=0; irb<row_block_size; irb++) {
2821     for (icb=0; icb<col_block_size; icb++) {
2822     rtmp = ZERO;
2823     for (ib=0; ib<col_block_size; ib++) {
2824     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2825     }
2826     RAP_val[irb+row_block_size*icb]=rtmp;
2827     }
2828     }
2829    
2830     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2831     for (ib=0; ib<block_size; ib++) {
2832     temp_val[ib] += RAP_val[ib];
2833     }
2834 lgao 3763 }
2835     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2836     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2837     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2838 lgao 3779 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2839     for (irb=0; irb<row_block_size; irb++) {
2840     for (icb=0; icb<col_block_size; icb++) {
2841     rtmp = ZERO;
2842     for (ib=0; ib<col_block_size; ib++) {
2843     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2844     }
2845     RAP_val[irb+row_block_size*icb]=rtmp;
2846     }
2847     }
2848    
2849     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2850     for (ib=0; ib<block_size; ib++) {
2851     temp_val[ib] += RAP_val[ib];
2852     }
2853 lgao 3763 }
2854     }
2855     }
2856    
2857     /* now loop over elements in row i1 of A->mainBlock, repeat
2858     the process we have done to block A->col_coupleBlock */
2859     j2_ub = A->mainBlock->pattern->ptr[i1+1];
2860     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2861     i2 = A->mainBlock->pattern->index[j2];
2862 lgao 3779 temp_val = &(A->mainBlock->val[j2*block_size]);
2863     for (irb=0; irb<row_block_size; irb++) {
2864     for (icb=0; icb<col_block_size; icb++) {
2865     rtmp = ZERO;
2866     for (ib=0; ib<col_block_size; ib++) {
2867     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2868     }
2869     RA_val[irb+row_block_size*icb]=rtmp;
2870     }
2871     }
2872    
2873 lgao 3763 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2874     A_marker[i2 + num_Acouple_cols] = i_r;
2875     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2876     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2877     i_c = P->mainBlock->pattern->index[j3];
2878 lgao 3779 temp_val = &(P->mainBlock->val[j3*block_size]);
2879     for (irb=0; irb<row_block_size; irb++) {
2880     for (icb=0; icb<col_block_size; icb++) {
2881     rtmp = ZERO;
2882     for (ib=0; ib<col_block_size; ib++) {
2883     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2884     }
2885     RAP_val[irb+row_block_size*icb]=rtmp;
2886     }
2887     }
2888    
2889 lgao 3763 if (P_marker[i_c] < row_marker) {
2890     P_marker[i_c] = i;
2891 lgao 3779 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2892 lgao 3763 RAP_main_idx[i] = i_c;
2893     i++;
2894     } else {
2895 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2896     for (ib=0; ib<block_size; ib++) {
2897     temp_val[ib] += RAP_val[ib];
2898     }
2899 lgao 3763 }
2900     }
2901     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2902     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2903     /* note that we need to map the column index in
2904     P->col_coupleBlock back into the column index in
2905     P_ext_couple */
2906     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2907 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2908     for (irb=0; irb<row_block_size; irb++) {
2909     for (icb=0; icb<col_block_size; icb++) {
2910     rtmp = ZERO;
2911     for (ib=0; ib<col_block_size; ib++) {
2912     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2913     }
2914     RAP_val[irb+row_block_size*icb]=rtmp;
2915     }
2916     }
2917    
2918 lgao 3763 if (P_marker[i_c] < row_marker_ext) {
2919     P_marker[i_c] = j;
2920 lgao 3779 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2921 lgao 3763 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2922     j++;
2923     } else {
2924 lgao 3779 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2925     for (ib=0; ib<block_size; ib++) {
2926     temp_val[ib] += RAP_val[ib];
2927     }
2928 lgao 3763 }
2929     }
2930    
2931     } else {
2932     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2933     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2934     i_c = P->mainBlock->pattern->index[j3];
2935 lgao 3779 temp_val = &(P->mainBlock->val[j3*block_size]);
2936     for (irb=0; irb<row_block_size; irb++) {
2937     for (icb=0; icb<col_block_size; icb++) {
2938     rtmp = ZERO;
2939     for (ib=0; ib<col_block_size; ib++) {
2940     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2941     }
2942     RAP_val[irb+row_block_size*icb]=rtmp;
2943     }
2944     }
2945    
2946     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2947     for (ib=0; ib<block_size; ib++) {
2948     temp_val[ib] += RAP_val[ib];
2949     }
2950 lgao 3763 }
2951     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2952     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2953     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2954 lgao 3779 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2955     for (irb=0; irb<row_block_size; irb++) {
2956     for (icb=0; icb<col_block_size; icb++) {
2957     rtmp = ZERO;
2958     for (ib=0; ib<col_block_size; ib++) {
2959     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2960     }
2961     RAP_val[irb+row_block_size*icb]=rtmp;
2962     }
2963     }
2964    
2965     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2966     for (ib=0; ib<block_size; ib++) {
2967     temp_val[ib] += RAP_val[ib];
2968     }
2969 lgao 3763 }
2970     }
2971     }
2972     }
2973    
2974     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
2975     if (i > row_marker) {
2976     offset = i - row_marker;
2977     temp = TMPMEMALLOC(offset, index_t);
2978     for (iptr=0; iptr<offset; iptr++)
2979     temp[iptr] = RAP_main_idx[row_marker+iptr];
2980     if (offset > 0) {
2981     #ifdef USE_QSORTG
2982     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
2983     #else
2984     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
2985     #endif
2986     }
2987 lgao 3779 temp_val = TMPMEMALLOC(offset * block_size, double);
2988 lgao 3763 for (iptr=0; iptr<offset; iptr++){
2989     k = P_marker[temp[iptr]];
2990 lgao 3779 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
2991 lgao 3763 P_marker[temp[iptr]] = iptr + row_marker;
2992     }
2993     for (iptr=0; iptr<offset; iptr++){
2994     RAP_main_idx[row_marker+iptr] = temp[iptr];
2995 lgao 3779 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
2996 lgao 3763 }
2997     TMPMEMFREE(temp);
2998     TMPMEMFREE(temp_val);
2999     }
3000     if (j > row_marker_ext) {
3001     offset = j - row_marker_ext;
3002     temp = TMPMEMALLOC(offset, index_t);
3003     for (iptr=0; iptr<offset; iptr++)
3004     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
3005     if (offset > 0) {
3006     #ifdef USE_QSORTG
3007     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3008     #else
3009     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3010     #endif
3011     }
3012 lgao 3779 temp_val = TMPMEMALLOC(offset * block_size, double);
3013 lgao 3763 for (iptr=0; iptr<offset; iptr++){
3014     k = P_marker[temp[iptr] + num_Pmain_cols];
3015 lgao 3779 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
3016 lgao 3763 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
3017     }
3018     for (iptr=0; iptr<offset; iptr++){
3019     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
3020 lgao 3779 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3021 lgao 3763 }
3022     TMPMEMFREE(temp);
3023     TMPMEMFREE(temp_val);
3024     }
3025     }
3026    
3027 lgao 3779 TMPMEMFREE(RA_val);
3028     TMPMEMFREE(RAP_val);
3029 lgao 3763 TMPMEMFREE(A_marker);
3030     TMPMEMFREE(Pext_to_RAP);
3031     TMPMEMFREE(Pcouple_to_RAP);
3032     MEMFREE(RAP_ext_ptr);
3033     MEMFREE(RAP_ext_idx);
3034     MEMFREE(RAP_ext_val);
3035     Paso_SparseMatrix_free(R_main);
3036     Paso_SparseMatrix_free(R_couple);
3037    
3038     /* Check whether there are empty columns in RAP_couple */
3039     #pragma omp parallel for schedule(static) private(i)
3040     for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
3041     /* num of non-empty columns is stored in "k" */
3042     k = 0;
3043     j = RAP_couple_ptr[num_Pmain_cols];
3044     for (i=0; i<j; i++) {
3045     i1 = RAP_couple_idx[i];
3046     if (P_marker[i1]) {
3047     P_marker[i1] = 0;
3048     k++;
3049     }
3050     }
3051    
3052     /* empty columns is found */
3053     if (k < num_RAPext_cols) {
3054     temp = TMPMEMALLOC(k, index_t);
3055     k = 0;
3056     for (i=0; i<num_RAPext_cols; i++)
3057     if (!P_marker[i]) {
3058     P_marker[i] = k;
3059     temp[k] = global_id_RAP[i];
3060     k++;
3061     }
3062     for (i=0; i<j; i++) {
3063     i1 = RAP_couple_idx[i];
3064     RAP_couple_idx[i] = P_marker[i1];
3065     }
3066     num_RAPext_cols = k;
3067     TMPMEMFREE(global_id_RAP);
3068     global_id_RAP = temp;
3069     }
3070     TMPMEMFREE(P_marker);
3071    
3072     /******************************************************/
3073     /* Start to create the coasre level System Matrix A_c */
3074     /******************************************************/
3075     /* first, prepare the sender/receiver for the col_connector */
3076     dist = P->pattern->input_distribution->first_component;
3077     recv_len = TMPMEMALLOC(size, dim_t);
3078     send_len = TMPMEMALLOC(size, dim_t);
3079     neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
3080     offsetInShared = TMPMEMALLOC(size+1, index_t);
3081     shared = TMPMEMALLOC(num_RAPext_cols, index_t);
3082     memset(recv_len, 0, sizeof(dim_t) * size);
3083     memset(send_len, 0, sizeof(dim_t) * size);
3084     num_neighbors = 0;
3085     offsetInShared[0] = 0;
3086     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
3087     shared[i] = i + num_Pmain_cols;
3088     if (k <= global_id_RAP[i]) {
3089     if (recv_len[j] > 0) {
3090     neighbor[num_neighbors] = j;
3091     num_neighbors ++;
3092     offsetInShared[num_neighbors] = i;
3093     }
3094     while (k <= global_id_RAP[i]) {
3095     j++;
3096     k = dist[j+1];
3097     }
3098     }
3099     recv_len[j] ++;
3100     }
3101     if (recv_len[j] > 0) {
3102     neighbor[num_neighbors] = j;
3103     num_neighbors ++;
3104     offsetInShared[num_neighbors] = i;
3105     }
3106     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3107     neighbor, shared, offsetInShared, 1, 0, mpi_info);
3108    
3109     MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
3110    
3111     #ifdef ESYS_MPI
3112     mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
3113     mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
3114     #else
3115     mpi_requests=TMPMEMALLOC(size*2,int);
3116     mpi_stati=TMPMEMALLOC(size*2,int);
3117     #endif
3118     num_neighbors = 0;
3119     j = 0;
3120     offsetInShared[0] = 0;
3121     for (i=0; i<size; i++) {
3122     if (send_len[i] > 0) {
3123     neighbor[num_neighbors] = i;
3124     num_neighbors ++;
3125     j += send_len[i];
3126     offsetInShared[num_neighbors] = j;
3127     }
3128     }
3129     TMPMEMFREE(shared);
3130     shared = TMPMEMALLOC(j, index_t);
3131     for (i=0, j=0; i<num_neighbors; i++) {
3132     k = neighbor[i];
3133     MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
3134     mpi_info->msg_tag_counter+k,
3135     mpi_info->comm, &mpi_requests[i]);
3136     j += send_len[k];
3137     }
3138     for (i=0, j=0; i<recv->numNeighbors; i++) {
3139     k = recv->neighbor[i];
3140     MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
3141     mpi_info->msg_tag_counter+rank,
3142     mpi_info->comm, &mpi_requests[i+num_neighbors]);
3143     j += recv_len[k];
3144     }
3145     MPI_Waitall(num_neighbors + recv->numNeighbors,
3146     mpi_requests, mpi_stati);
3147     mpi_info->msg_tag_counter += size;
3148    
3149     j = offsetInShared[num_neighbors];
3150     offset = dist[rank];
3151     for (i=0; i<j; i++) shared[i] = shared[i] - offset;
3152     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3153     neighbor, shared, offsetInShared, 1, 0, mpi_info);
3154    
3155     col_connector = Paso_Connector_alloc(send, recv);
3156     TMPMEMFREE(shared);
3157     Paso_SharedComponents_free(recv);
3158     Paso_SharedComponents_free(send);
3159    
3160     /* now, create row distribution (output_distri) and col
3161     distribution (input_distribution) */
3162     input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
3163     output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
3164    
3165     /* then, prepare the sender/receiver for the row_connector, first, prepare
3166     the information for sender */
3167     sum = RAP_couple_ptr[num_Pmain_cols];
3168     len = TMPMEMALLOC(size, dim_t);
3169     send_ptr = TMPMEMALLOC(size, index_t*);
3170     send_idx = TMPMEMALLOC(size, index_t*);
3171     for (i=0; i<size; i++) {
3172     send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
3173     send_idx[i] = TMPMEMALLOC(sum, index_t);
3174     memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
3175     }
3176     memset(len, 0, sizeof(dim_t) * size);
3177     recv = col_connector->recv;
3178     sum=0;
3179     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
3180     i1 = RAP_couple_ptr[i_r];
3181     i2 = RAP_couple_ptr[i_r+1];
3182     if (i2 > i1) {
3183     /* then row i_r will be in the sender of row_connector, now check
3184     how many neighbors i_r needs to be send to */
3185     for (j=i1; j<i2; j++) {
3186     i_c = RAP_couple_idx[j];
3187     /* find out the corresponding neighbor "p" of column i_c */
3188     for (p=0; p<recv->numNeighbors; p++) {
3189     if (i_c < recv->offsetInShared[p+1]) {
3190     k = recv->neighbor[p];
3191     if (send_ptr[k][i_r] == 0) sum++;
3192     send_ptr[k][i_r] ++;
3193     send_idx[k][len[k]] = global_id_RAP[i_c];
3194     len[k] ++;
3195     break;
3196     }
3197     }
3198     }
3199     }
3200     }
3201     if (global_id_RAP) {
3202     TMPMEMFREE(global_id_RAP);
3203     global_id_RAP = NULL;
3204     }
3205    
3206     /* now allocate the sender */
3207     shared = TMPMEMALLOC(sum, index_t);
3208     memset(send_len, 0, sizeof(dim_t) * size);
3209     num_neighbors=0;
3210     offsetInShared[0] = 0;
3211     for (p=0, k=0; p<size; p++) {
3212     for (i=0; i<num_Pmain_cols; i++) {
3213     if (send_ptr[p][i] > 0) {
3214     shared[k] = i;
3215     k++;
3216     send_ptr[p][send_len[p]] = send_ptr[p][i];
3217     send_len[p]++;
3218     }
3219     }
3220     if (k > offsetInShared[num_neighbors]) {
3221     neighbor[num_neighbors] = p;
3222     num_neighbors ++;
3223     offsetInShared[num_neighbors] = k;
3224     }
3225     }
3226     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3227     neighbor, shared, offsetInShared, 1, 0, mpi_info);
3228    
3229     /* send/recv number of rows will be sent from current proc
3230     recover info for the receiver of row_connector from the sender */
3231     MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
3232     num_neighbors = 0;
3233     offsetInShared[0] = 0;
3234     j = 0;
3235     for (i=0; i<size; i++) {
3236     if (i != rank && recv_len[i] > 0) {
3237     neighbor[num_neighbors] = i;
3238     num_neighbors ++;
3239     j += recv_len[i];
3240     offsetInShared[num_neighbors] = j;
3241     }
3242     }
3243     TMPMEMFREE(shared);
3244     TMPMEMFREE(recv_len);
3245     shared = TMPMEMALLOC(j, index_t);
3246     k = offsetInShared[num_neighbors];
3247     for (i=0; i<k; i++) {
3248     shared[i] = i + num_Pmain_cols;
3249     }
3250     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3251     neighbor, shared, offsetInShared, 1, 0, mpi_info);
3252     row_connector = Paso_Connector_alloc(send, recv);
3253     TMPMEMFREE(shared);
3254     Paso_SharedComponents_free(recv);
3255     Paso_SharedComponents_free(send);
3256    
3257     /* send/recv pattern->ptr for rowCoupleBlock */
3258     num_RAPext_rows = offsetInShared[num_neighbors];
3259     row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
3260     for (p=0; p<num_neighbors; p++) {
3261     j = offsetInShared[p];
3262     i = offsetInShared[p+1];
3263     MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
3264     mpi_info->msg_tag_counter+neighbor[p],
3265     mpi_info->comm, &mpi_requests[p]);
3266     }
3267     send = row_connector->send;
3268     for (p=0; p<send->numNeighbors; p++) {
3269     MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
3270     MPI_INT, send->neighbor[p],
3271     mpi_info->msg_tag_counter+rank,
3272     mpi_info->comm, &mpi_requests[p+num_neighbors]);
3273     }
3274     MPI_Waitall(num_neighbors + send->numNeighbors,
3275     mpi_requests, mpi_stati);
3276     mpi_info->msg_tag_counter += size;
3277     TMPMEMFREE(send_len);
3278    
3279     sum = 0;
3280     for (i=0; i<num_RAPext_rows; i++) {
3281     k = row_couple_ptr[i];
3282     row_couple_ptr[i] = sum;
3283     sum += k;
3284     }
3285     row_couple_ptr[num_RAPext_rows] = sum;
3286    
3287     /* send/recv pattern->index for rowCoupleBlock */
3288     k = row_couple_ptr[num_RAPext_rows];
3289     row_couple_idx = MEMALLOC(k, index_t);
3290     for (p=0; p<num_neighbors; p++) {
3291     j1 = row_couple_ptr[offsetInShared[p]];
3292     j2 = row_couple_ptr[offsetInShared[p+1]];
3293     MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
3294     mpi_info->msg_tag_counter+neighbor[p],
3295     mpi_info->comm, &mpi_requests[p]);
3296     }
3297     for (p=0; p<send->numNeighbors; p++) {
3298     MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
3299     MPI_INT, send->neighbor[p],
3300     mpi_info->msg_tag_counter+rank,
3301     mpi_info->comm, &mpi_requests[p+num_neighbors]);
3302     }
3303     MPI_Waitall(num_neighbors + send->numNeighbors,
3304     mpi_requests, mpi_stati);
3305     mpi_info->msg_tag_counter += size;
3306    
3307     offset = input_dist->first_component[rank];
3308     k = row_couple_ptr[num_RAPext_rows];
3309     for (i=0; i<k; i++) {
3310     row_couple_idx[i] -= offset;
3311     }
3312    
3313     for (i=0; i<size; i++) {
3314     TMPMEMFREE(send_ptr[i]);
3315     TMPMEMFREE(send_idx[i]);
3316     }
3317     TMPMEMFREE(send_ptr);
3318     TMPMEMFREE(send_idx);
3319     TMPMEMFREE(len);
3320     TMPMEMFREE(offsetInShared);
3321     TMPMEMFREE(neighbor);
3322     TMPMEMFREE(mpi_requests);
3323     TMPMEMFREE(mpi_stati);
3324     Esys_MPIInfo_free(mpi_info);
3325    
3326     /* Now, we can create pattern for mainBlock and coupleBlock */
3327     main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
3328     num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
3329     col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
3330     num_Pmain_cols, num_RAPext_cols,
3331     RAP_couple_ptr, RAP_couple_idx);
3332     row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
3333     num_RAPext_rows, num_Pmain_cols,
3334     row_couple_ptr, row_couple_idx);
3335    
3336     /* next, create the system matrix */
3337     pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
3338     output_dist, input_dist, main_pattern, col_couple_pattern,
3339     row_couple_pattern, col_connector, row_connector);
3340     out = Paso_SystemMatrix_alloc(A->type, pattern,
3341     row_block_size, col_block_size, FALSE);
3342    
3343     /* finally, fill in the data*/
3344     memcpy(out->mainBlock->val, RAP_main_val,
3345     out->mainBlock->len* sizeof(double));
3346     memcpy(out->col_coupleBlock->val, RAP_couple_val,
3347     out->col_coupleBlock->len * sizeof(double));
3348    
3349     /* Clean up */
3350     Paso_SystemMatrixPattern_free(pattern);
3351     Paso_Pattern_free(main_pattern);
3352     Paso_Pattern_free(col_couple_pattern);
3353     Paso_Pattern_free(row_couple_pattern);
3354     Paso_Connector_free(col_connector);
3355     Paso_Connector_free(row_connector);
3356     Paso_Distribution_free(output_dist);
3357     Paso_Distribution_free(input_dist);
3358     TMPMEMFREE(RAP_main_val);
3359     TMPMEMFREE(RAP_couple_val);
3360     if (Esys_noError()) {
3361     return out;
3362     } else {
3363     Paso_SystemMatrix_free(out);
3364     return NULL;
3365     }
3366     }

  ViewVC Help
Powered by ViewVC 1.1.26