/[escript]/trunk/paso/src/AMG_Interpolation.cpp
ViewVC logotype

Annotation of /trunk/paso/src/AMG_Interpolation.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3817 - (hide annotations)
Fri Feb 10 03:43:35 2012 UTC (7 years, 10 months ago) by lgao
Original Path: branches/amg_from_3530/paso/src/AMG_Interpolation.c
File MIME type: text/plain
File size: 155917 byte(s)
Now AMG is working. With this working version, a few changes have been
made to the code outside the range of AMG: 
1) in Distribution.c: the way used to generate random seed depends on
the problem size. Sometimes (when the problem size is power of 2), the 
random seed will ends up not working (not diverse). Modification has
been made to fix this.
2) in addAbsRow.c: Paso_SparseMatrix_maxAbsRow_CSR_OFFSET0() modified.

There are still problem with the smoother. The case handler for 
block_size > 3 is not right. 


1 lgao 3763 //
2     /*******************************************************
3     *
4     * Copyright (c) 2003-2011 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13    
14    
15     /**************************************************************/
16    
17     /* Paso: defines AMG Interpolation */
18    
19     /**************************************************************/
20    
21     /* Author: l.gao@uq.edu.au */
22    
23     /**************************************************************/
24    
25     #include "Paso.h"
26     #include "SparseMatrix.h"
27     #include "PasoUtil.h"
28     #include "Preconditioner.h"
29    
30     /**************************************************************
31    
32     Methods nessecary for AMG preconditioner
33    
34     construct n_C x n_C interpolation operator A_C from matrix A
35     and prolongation matrix P.
36    
37     The coarsening operator A_C is defined by RAP where R=P^T.
38    
39     ***************************************************************/
40    
41     #define MY_DEBUG 0
42 lgao 3817 #define MY_DEBUG1 1
43 lgao 3763
44     /* Extend system matrix B with extra two sparse matrixes:
45     B_ext_main and B_ext_couple
46     The combination of this two sparse matrixes represents
47     the portion of B that is stored on neighbor procs and
48     needed locally for triple matrix product (B^T) A B.
49    
50     FOR NOW, we assume that the system matrix B has a NULL
51     row_coupleBlock and a NULL remote_coupleBlock. Based on
52     this assumption, we use link row_coupleBlock for sparse
53     matrix B_ext_main, and link remote_coupleBlock for sparse
54     matrix B_ext_couple.
55    
56     To be specific, B_ext (on processor k) are group of rows
57     in B, where the list of rows from processor q is given by
58     A->col_coupler->connector->send->shared[rPtr...]
59     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
60     on q. */
61     void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
62     {
63     Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
64     Paso_Coupler *coupler=NULL;
65     Paso_SharedComponents *send=NULL, *recv=NULL;
66     double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
67     index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
68     index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
69     index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
70     index_t *idx_m=NULL, *idx_c=NULL;
71 lgao 3817 index_t i, j, k, m, n, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
72 lgao 3763 index_t offset, len, block_size, block_size_size, max_num_cols;
73     index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
74     dim_t *recv_degree=NULL, *send_degree=NULL;
75     dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
76    
77     if (size == 1) return;
78    
79     if (B->row_coupleBlock) {
80     Paso_SparseMatrix_free(B->row_coupleBlock);
81     B->row_coupleBlock = NULL;
82     }
83    
84     if (B->row_coupleBlock || B->remote_coupleBlock) {
85     Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
86     return;
87     }
88    
89 lgao 3817 if (MY_DEBUG) fprintf(stderr, "CP1_1\n");
90 lgao 3763
91     /* sending/receiving unknown's global ID */
92     num_main_cols = B->mainBlock->numCols;
93     cols = TMPMEMALLOC(num_main_cols, double);
94     offset = B->col_distribution->first_component[rank];
95     #pragma omp parallel for private(i) schedule(static)
96     for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
97     if (B->global_id == NULL) {
98     coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
99     Paso_Coupler_startCollect(coupler, cols);
100     }
101    
102     recv_buf = TMPMEMALLOC(size, index_t);
103     recv_degree = TMPMEMALLOC(size, dim_t);
104     recv_offset = TMPMEMALLOC(size+1, index_t);
105     #pragma omp parallel for private(i) schedule(static)
106     for (i=0; i<size; i++){
107     recv_buf[i] = 0;
108     recv_degree[i] = 1;
109     recv_offset[i] = i;
110     }
111    
112 lgao 3779 block_size = B->block_size;
113 lgao 3763 block_size_size = block_size * sizeof(double);
114     num_couple_cols = B->col_coupleBlock->numCols;
115     send = A->col_coupler->connector->send;
116     recv = A->col_coupler->connector->recv;
117     num_neighbors = send->numNeighbors;
118     p = send->offsetInShared[num_neighbors];
119 lgao 3767 len = p * B->col_distribution->first_component[size];
120 lgao 3763 send_buf = TMPMEMALLOC(len * block_size, double);
121     send_idx = TMPMEMALLOC(len, index_t);
122     send_offset = TMPMEMALLOC((p+1)*2, index_t);
123     send_degree = TMPMEMALLOC(num_neighbors, dim_t);
124     i = num_main_cols + num_couple_cols;
125     send_m = TMPMEMALLOC(i * block_size, double);
126     send_c = TMPMEMALLOC(i * block_size, double);
127     idx_m = TMPMEMALLOC(i, index_t);
128     idx_c = TMPMEMALLOC(i, index_t);
129    
130     /* waiting for receiving unknown's global ID */
131     if (B->global_id == NULL) {
132     Paso_Coupler_finishCollect(coupler);
133     global_id = MEMALLOC(num_couple_cols, index_t);
134     #pragma omp parallel for private(i) schedule(static)
135     for (i=0; i<num_couple_cols; ++i)
136     global_id[i] = coupler->recv_buffer[i];
137     B->global_id = global_id;
138     Paso_Coupler_free(coupler);
139     } else
140     global_id = B->global_id;
141    
142     /* distribute the number of cols in current col_coupleBlock to all ranks */
143     MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
144    
145 lgao 3817 if (MY_DEBUG) fprintf(stderr, "CP1_2\n");
146 lgao 3763
147     /* distribute global_ids of cols to be considered to all ranks*/
148     len = 0;
149     max_num_cols = 0;
150     for (i=0; i<size; i++){
151     recv_degree[i] = recv_buf[i];
152     recv_offset[i] = len;
153     len += recv_buf[i];
154     if (max_num_cols < recv_buf[i])
155     max_num_cols = recv_buf[i];
156     }
157     recv_offset[size] = len;
158     cols_array = TMPMEMALLOC(len, index_t);
159     MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
160    
161 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_3\n", rank);
162 lgao 3763 if (MY_DEBUG) {
163     int i=0;
164     char hostname[256];
165     gethostname(hostname, sizeof(hostname));
166     fprintf(stderr, "rank %d PID %d on %s ready for attach\n", A->mpi_info->rank, getpid(), hostname);
167     if (rank == 1) {
168     fflush(stdout);
169     while (0 == i)
170     sleep(5);
171     }
172     }
173    
174     /* first, prepare the ptr_ptr to be received */
175     q = recv->numNeighbors;
176     len = recv->offsetInShared[q];
177     ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
178     for (p=0; p<q; p++) {
179     row = recv->offsetInShared[p];
180     m = recv->offsetInShared[p + 1];
181 lgao 3767 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
182 lgao 3763 A->mpi_info->msg_tag_counter+recv->neighbor[p],
183     A->mpi_info->comm,
184     &(A->col_coupler->mpi_requests[p]));
185 lgao 3817 if(MY_DEBUG) fprintf(stderr, "rank %d receive %d from %d tag %d (numNeighbors %d)\n", rank, 2*(m-row), recv->neighbor[p], A->mpi_info->msg_tag_counter+recv->neighbor[p], q);
186 lgao 3763 }
187    
188 lgao 3817 if (MY_DEBUG && rank == 1){
189     //Paso_SystemMatrix_print(B);
190     char *str1, *str2;
191     int sum, rank, i, iPtr, ib, block_size;
192     sum = B->col_coupleBlock->numCols*2;
193     str1 = TMPMEMALLOC(sum*100+100, char);
194     str2 = TMPMEMALLOC(100, char);
195     sprintf(str1, "B-col-coupleB: %d rows\n-----------\n", sum);
196     for (i=23; i<24; i++) {
197     sprintf(str2, "Row %d: ",i);
198     strcat(str1, str2);
199     for (iPtr =B->col_coupleBlock->pattern->ptr[i]; iPtr<B->col_coupleBlock->pattern->ptr[i+1]; ++iPtr) {
200     sprintf(str1, "%s (%d %d %f %f", str1, B->global_id[B->col_coupleBlock->pattern->index[iPtr]], iPtr, B->col_coupleBlock->val[iPtr*2], B->col_coupleBlock->val[iPtr*2+1]);
201     for (ib = 0; ib<block_size; ib++) {
202     // sprintf(str1, "%s %f", str1, B->col_coupleBlock->val[iPtr*block_size+ib]);
203     }
204     sprintf(str1, "%s) ", str1);
205     }
206     sprintf(str1, "%s\n", str1);
207     }
208     fprintf(stderr, "%s\n", str1);
209     TMPMEMFREE(str1);
210     TMPMEMFREE(str2);
211     }
212 lgao 3763 /* now prepare the rows to be sent (the degree, the offset and the data) */
213     len = 0;
214 lgao 3817 i0 = 0;
215 lgao 3763 for (p=0; p<num_neighbors; p++) {
216 lgao 3817 i = i0;
217 lgao 3763 neighbor = send->neighbor[p];
218     m_lb = B->col_distribution->first_component[neighbor];
219     m_ub = B->col_distribution->first_component[neighbor + 1];
220     j_ub = send->offsetInShared[p + 1];
221 lgao 3817 if (MY_DEBUG1 && rank == 1 && p == 0) fprintf(stderr, "rank%d neighbor %d m_lb %d m_ub %d offset %d\n", rank, neighbor, m_lb, m_ub, offset);
222 lgao 3763 for (j=send->offsetInShared[p]; j<j_ub; j++) {
223     row = send->shared[j];
224     l_m = 0;
225     l_c = 0;
226     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
227     k_lb = B->col_coupleBlock->pattern->ptr[row];
228    
229     /* check part of col_coupleBlock for data to be passed @row */
230     // q = B->mainBlock->pattern->index[B->mainBlock->pattern->ptr[row]] + offset;
231     for (k=k_lb; k<k_ub; k++) {
232     m = global_id[B->col_coupleBlock->pattern->index[k]];
233     // if (m > q) break;
234     if (m > offset) break;
235 lgao 3767 if (MY_DEBUG) fprintf(stderr, "rank%d (1) row %d m %d k %d\n", rank, row, m, B->col_coupleBlock->pattern->index[k]);
236 lgao 3763 if (m>= m_lb && m < m_ub) {
237     /* data to be passed to sparse matrix B_ext_main */
238     idx_m[l_m] = m - m_lb;
239     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
240 lgao 3817 if (MY_DEBUG && rank == 1 && p == 0 && j == 23)
241     fprintf(stderr, "[1] l_m %d idx %d val %f %f k%d row%d\n", l_m, m - m_lb, B->col_coupleBlock->val[block_size*k], send_m[l_m*block_size+1], k, row);
242 lgao 3763 l_m++;
243     } else {
244     /* data to be passed to sparse matrix B_ext_couple */
245     idx_c[l_c] = m;
246     if (MY_DEBUG) fprintf(stderr, "rank%d (1) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);
247     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
248     l_c++;
249     }
250     }
251     k_lb = k;
252    
253     /* check mainBlock for data to be passed @row to sparse
254     matrix B_ext_couple */
255     k_ub = B->mainBlock->pattern->ptr[row + 1];
256     k = B->mainBlock->pattern->ptr[row];
257     memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
258     for (; k<k_ub; k++) {
259     m = B->mainBlock->pattern->index[k] + offset;
260 lgao 3767 if (MY_DEBUG) fprintf(stderr, "rank%d (2) row %d m %d k %d\n", rank, row, m, B->mainBlock->pattern->index[k]);
261 lgao 3763 if (MY_DEBUG) fprintf(stderr, "rank%d (2) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);
262     idx_c[l_c] = m;
263     l_c++;
264     }
265    
266     /* check the rest part of col_coupleBlock for data to
267     be passed @row to sparse matrix B_ext_couple */
268     k = k_lb;
269     k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
270     for (k=k_lb; k<k_ub; k++) {
271     m = global_id[B->col_coupleBlock->pattern->index[k]];
272 lgao 3767 if (MY_DEBUG) fprintf(stderr, "rank%d (3) row %d m %d k %d\n", rank, row, m, B->col_coupleBlock->pattern->index[k]);
273     if (m>= m_lb && m < m_ub) {
274 lgao 3763 /* data to be passed to sparse matrix B_ext_main */
275     idx_m[l_m] = m - m_lb;
276     memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
277 lgao 3817 if (MY_DEBUG && rank == 1 && p == 0 && j == 23)
278     fprintf(stderr, "l_m %d idx %d val %f\n", l_m, m - m_lb, B->col_coupleBlock->val[block_size*k]);
279 lgao 3763 l_m++;
280     } else {
281     /* data to be passed to sparse matrix B_ext_couple */
282     idx_c[l_c] = m;
283     if (MY_DEBUG) fprintf(stderr, "rank%d (3) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);
284     memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
285     l_c++;
286     }
287     }
288    
289 lgao 3767 if (MY_DEBUG && rank == 2) fprintf(stderr, "rank %d p %d i %d j %d l_c %d l_m %d ub %d\n", rank, p, i, j, l_c, l_m, num_couple_cols + num_main_cols);
290 lgao 3779 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
291 lgao 3767 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
292 lgao 3763 send_offset[2*i] = l_m;
293     len += l_m;
294 lgao 3779 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
295 lgao 3767 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
296 lgao 3763 send_offset[2*i+1] = l_c;
297     len += l_c;
298 lgao 3767 if (MY_DEBUG && rank == 0) {
299     int sum = l_m+l_c, my_i;
300     char *str1, *str2;
301     str1 = TMPMEMALLOC(sum*100+100, char);
302     str2 = TMPMEMALLOC(100, char);
303     sprintf(str1, "rank %d send_idx[%d] offset %d m%d c%d= (", rank, sum, len-sum, l_m, l_c);
304     for (my_i=len-sum; my_i<len; my_i++){
305     sprintf(str2, "%d ", send_idx[my_i]);
306     strcat(str1, str2);
307     }
308     fprintf(stderr, "%s)\n", str1);
309     TMPMEMFREE(str1);
310     TMPMEMFREE(str2);
311     }
312 lgao 3763 i++;
313     }
314    
315 lgao 3817 if (MY_DEBUG && rank == 1) {
316 lgao 3767 int my_i,sum = len;
317     char *str1, *str2;
318     str1 = TMPMEMALLOC(sum*100+100, char);
319     str2 = TMPMEMALLOC(100, char);
320     sprintf(str1, "rank %d send_idx[%d] = (", rank, sum);
321     for (my_i=0; my_i<sum; my_i++){
322     sprintf(str2, "%d ", send_idx[my_i]);
323     strcat(str1, str2);
324     }
325     fprintf(stderr, "%s)\n", str1);
326     TMPMEMFREE(str1);
327     TMPMEMFREE(str2);
328     }
329    
330 lgao 3763 /* sending */
331 lgao 3817 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
332 lgao 3763 A->mpi_info->msg_tag_counter+rank,
333     A->mpi_info->comm,
334     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
335 lgao 3817 if(MY_DEBUG) fprintf(stderr, "rank %d send %d to %d tag %d (offset %d numNeighbors %d)\n", rank, 2*(i-i0), send->neighbor[p], A->mpi_info->msg_tag_counter+rank, i0, send->numNeighbors);
336 lgao 3763 send_degree[p] = len;
337 lgao 3817 i0 = i;
338 lgao 3763 }
339     TMPMEMFREE(send_m);
340     TMPMEMFREE(send_c);
341     TMPMEMFREE(idx_m);
342     TMPMEMFREE(idx_c);
343    
344    
345     q = recv->numNeighbors;
346     len = recv->offsetInShared[q];
347     ptr_main = MEMALLOC((len+1), index_t);
348     ptr_couple = MEMALLOC((len+1), index_t);
349    
350    
351     MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
352     A->col_coupler->mpi_requests,
353     A->col_coupler->mpi_stati);
354     A->mpi_info->msg_tag_counter += size;
355    
356 lgao 3767 if (MY_DEBUG) {
357     int sum = recv->offsetInShared[recv->numNeighbors] * 2;
358     char *str1, *str2;
359     str1 = TMPMEMALLOC(sum*100+100, char);
360     str2 = TMPMEMALLOC(100, char);
361     sprintf(str1, "rank %d ptr_ptr[%d] = (", rank, sum);
362     for (i=0; i<sum; i++){
363     sprintf(str2, "%d ", ptr_ptr[i]);
364     strcat(str1, str2);
365     }
366     fprintf(stderr, "%s)\n", str1);
367     TMPMEMFREE(str1);
368     TMPMEMFREE(str2);
369     }
370    
371 lgao 3763 j = 0;
372     k = 0;
373     ptr_main[0] = 0;
374     ptr_couple[0] = 0;
375     // #pragma omp parallel for private(i,j,k) schedule(static)
376     for (i=0; i<len; i++) {
377     j += ptr_ptr[2*i];
378     k += ptr_ptr[2*i+1];
379     ptr_main[i+1] = j;
380     ptr_couple[i+1] = k;
381     }
382    
383     TMPMEMFREE(ptr_ptr);
384     idx_main = MEMALLOC(j, index_t);
385     idx_couple = MEMALLOC(k, index_t);
386     ptr_idx = TMPMEMALLOC(j+k, index_t);
387 lgao 3779 ptr_val = TMPMEMALLOC((j+k) * block_size, double);
388 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_4_2 %d %d %d\n", rank, j, k, len);
389 lgao 3763
390     /* send/receive index array */
391     j=0;
392     k_ub = 0;
393     for (p=0; p<recv->numNeighbors; p++) {
394     k = recv->offsetInShared[p];
395     m = recv->offsetInShared[p+1];
396     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
397     if (i > 0) {
398     k_ub ++;
399     MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
400     A->mpi_info->msg_tag_counter+recv->neighbor[p],
401     A->mpi_info->comm,
402     &(A->col_coupler->mpi_requests[p]));
403 lgao 3817 if(MY_DEBUG) fprintf(stderr, "rank %d (INDEX) recv %d from %d tag %d (%d %d %d %d %d %d)\n", rank, i, recv->neighbor[p], A->mpi_info->msg_tag_counter+recv->neighbor[p], m, k, ptr_main[m], ptr_main[k], ptr_couple[m], ptr_couple[k]);
404 lgao 3763 } else {
405 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank%d k %d m %d main(%d %d) couple(%d %d)\n", rank, k, m, ptr_main[m], ptr_main[k], ptr_couple[m], ptr_couple[k]);
406 lgao 3763 }
407     j += i;
408     }
409    
410 lgao 3767 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_4_3 %d %d\n", rank, recv->numNeighbors, k_ub);
411 lgao 3763 j=0;
412     k_ub = 0;
413     for (p=0; p<num_neighbors; p++) {
414     i = send_degree[p] - j;
415     if (i > 0){
416     k_ub ++;
417 lgao 3767 if (MY_DEBUG && rank == 0 && send->neighbor[p] == 1) {
418     int sum = i, my_i;
419     char *str1, *str2;
420     str1 = TMPMEMALLOC(sum*100+100, char);
421     str2 = TMPMEMALLOC(100, char);
422     sprintf(str1, "rank %d send_idx[%d] offset %d = (", rank, sum, j);
423     for (my_i=0; my_i<sum; my_i++){
424     sprintf(str2, "%d ", send_idx[j+my_i]);
425     strcat(str1, str2);
426     }
427     fprintf(stderr, "%s)\n", str1);
428     TMPMEMFREE(str1);
429     TMPMEMFREE(str2);
430     }
431 lgao 3763 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
432     A->mpi_info->msg_tag_counter+rank,
433     A->mpi_info->comm,
434     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
435 lgao 3817 if(MY_DEBUG) fprintf(stderr, "rank %d (INDEX) send %d to %d tag %d\n", rank, i, send->neighbor[p], A->mpi_info->msg_tag_counter+rank);
436 lgao 3763 } else {
437 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank%d send_degree %d, p %d, j %d\n", rank, send_degree[p], p, j);
438 lgao 3763 }
439     j = send_degree[p];
440     }
441 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_4_4 %d %d\n", rank,num_neighbors, k_ub);
442 lgao 3763
443     MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
444     A->col_coupler->mpi_requests,
445     A->col_coupler->mpi_stati);
446     A->mpi_info->msg_tag_counter += size;
447 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5 %d %d %d\n", rank, len, ptr_main[len], ptr_couple[len]);
448     if (MY_DEBUG && rank == 3) {
449 lgao 3767 int my_i, sum1 = recv->offsetInShared[recv->numNeighbors];
450     int sum = ptr_main[sum1] + ptr_couple[sum1];
451     char *str1, *str2;
452     str1 = TMPMEMALLOC(sum*100+100, char);
453     str2 = TMPMEMALLOC(100, char);
454     sprintf(str1, "rank %d ptr_idx[%d] = (", rank, sum);
455 lgao 3817 for (my_i=0; my_i<500; my_i++){
456 lgao 3767 sprintf(str2, "%d ", ptr_idx[my_i]);
457     strcat(str1, str2);
458     }
459     fprintf(stderr, "%s)\n", str1);
460     TMPMEMFREE(str1);
461     TMPMEMFREE(str2);
462     }
463 lgao 3763
464 lgao 3767 // #pragma omp parallel for private(i,j,k,m,p) schedule(static)
465 lgao 3763 for (i=0; i<len; i++) {
466     j = ptr_main[i];
467     k = ptr_main[i+1];
468     m = ptr_couple[i];
469     for (p=j; p<k; p++) {
470     idx_main[p] = ptr_idx[m+p];
471     }
472     j = ptr_couple[i+1];
473     for (p=m; p<j; p++) {
474     idx_couple[p] = ptr_idx[k+p];
475     }
476     }
477     TMPMEMFREE(ptr_idx);
478 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5_1 %d %d %d\n", rank, num_main_cols, len, B->mainBlock->numRows);
479     if (MY_DEBUG && rank == 3) {
480     int sum = len, sum1 = ptr_main[sum];
481 lgao 3767 char *str1, *str2;
482 lgao 3817 str1 = TMPMEMALLOC(sum1*100+100, char);
483 lgao 3767 str2 = TMPMEMALLOC(100, char);
484     sprintf(str1, "rank %d ptr_main[%d] = (", rank, sum);
485     for (i=0; i<sum+1; i++){
486     sprintf(str2, "%d ", ptr_main[i]);
487     strcat(str1, str2);
488     }
489 lgao 3817 //fprintf(stderr, "%s)\n", str1);
490 lgao 3767 sprintf(str1, "rank %d idx_main[%d] = (", rank, sum1);
491 lgao 3817 for (i=500; i<700; i++){
492     sprintf(str2, "%d-%d ", i, idx_main[i]);
493 lgao 3767 strcat(str1, str2);
494     }
495     fprintf(stderr, "%s)\n", str1);
496     TMPMEMFREE(str1);
497     TMPMEMFREE(str2);
498     }
499 lgao 3763
500     /* allocate pattern and sparsematrix for B_ext_main */
501     pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
502     len, num_main_cols, ptr_main, idx_main);
503     B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
504     pattern_main, B->row_block_size, B->col_block_size,
505     FALSE);
506     Paso_Pattern_free(pattern_main);
507     if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5_2\n", rank);
508    
509     /* allocate pattern and sparsematrix for B_ext_couple */
510     pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
511     len, B->col_distribution->first_component[size],
512     ptr_couple, idx_couple);
513     B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
514     pattern_couple, B->row_block_size, B->col_block_size,
515     FALSE);
516     Paso_Pattern_free(pattern_couple);
517    
518 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_6\n", rank);
519 lgao 3763
520     /* send/receive value array */
521     j=0;
522     for (p=0; p<recv->numNeighbors; p++) {
523     k = recv->offsetInShared[p];
524     m = recv->offsetInShared[p+1];
525     i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
526     if (i > 0)
527     MPI_Irecv(&(ptr_val[j]), i * block_size,
528     MPI_DOUBLE, recv->neighbor[p],
529     A->mpi_info->msg_tag_counter+recv->neighbor[p],
530     A->mpi_info->comm,
531     &(A->col_coupler->mpi_requests[p]));
532 lgao 3779 j += (i * block_size);
533 lgao 3763 }
534    
535     j=0;
536     for (p=0; p<num_neighbors; p++) {
537     i = send_degree[p] - j;
538     if (i > 0)
539 lgao 3779 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
540 lgao 3763 A->mpi_info->msg_tag_counter+rank,
541     A->mpi_info->comm,
542     &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
543 lgao 3779 j = send_degree[p] ;
544 lgao 3763 }
545    
546     MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
547     A->col_coupler->mpi_requests,
548     A->col_coupler->mpi_stati);
549     A->mpi_info->msg_tag_counter += size;
550 lgao 3817 if (MY_DEBUG) fprintf(stderr, "CP1_7\n");
551 lgao 3763
552     #pragma omp parallel for private(i,j,k,m,p) schedule(static)
553     for (i=0; i<len; i++) {
554     j = ptr_main[i];
555     k = ptr_main[i+1];
556     m = ptr_couple[i];
557     for (p=j; p<k; p++) {
558 lgao 3817 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
559     // B->row_coupleBlock->val[p*block_size] = ptr_val[(m+p)*block_size];
560 lgao 3763 }
561     j = ptr_couple[i+1];
562     for (p=m; p<j; p++) {
563 lgao 3817 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
564     // B->remote_coupleBlock->val[p*block_size] = ptr_val[(k+p)*block_size];
565 lgao 3763 }
566     }
567     TMPMEMFREE(ptr_val);
568    
569 lgao 3817
570 lgao 3763 /* release all temp memory allocation */
571     TMPMEMFREE(cols);
572     TMPMEMFREE(cols_array);
573     TMPMEMFREE(recv_offset);
574     TMPMEMFREE(recv_degree);
575     TMPMEMFREE(recv_buf);
576     TMPMEMFREE(send_buf);
577     TMPMEMFREE(send_offset);
578     TMPMEMFREE(send_degree);
579     TMPMEMFREE(send_idx);
580     }
581    
582     /* As defined, sparse matrix (let's called it T) defined by T(ptr, idx, val)
583     has the same number of rows as P->col_coupleBlock->numCols. Now, we need
584     to copy block of data in T to neighbor processors, defined by
585     P->col_coupler->connector->recv->neighbor[k] where k is in
586     [0, P->col_coupler->connector->recv->numNeighbors).
587     Rows to be copied to neighbor processor k is in the list defined by
588     P->col_coupler->connector->recv->offsetInShared[k] ...
589     P->col_coupler->connector->recv->offsetInShared[k+1] */
590     void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
591 lgao 3779 index_t **p_ptr, index_t **p_idx, double **p_val,
592     index_t *global_id, index_t block_size)
593 lgao 3763 {
594     Paso_SharedComponents *send=NULL, *recv=NULL;
595     index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
596     index_t i, j, p, m, n, rank, size, offset;
597     index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
598     index_t *ptr=*p_ptr, *idx=*p_idx;
599     double *val=*p_val, *recv_val=NULL;
600    
601     rank = P->mpi_info->rank;
602     size = P->mpi_info->size;
603     send = P->col_coupler->connector->recv;
604     recv = P->col_coupler->connector->send;
605     send_neighbors = send->numNeighbors;
606     recv_neighbors = recv->numNeighbors;
607     send_rows = P->col_coupleBlock->numCols;
608     recv_rows = recv->offsetInShared[recv_neighbors];
609     offset = P->col_distribution->first_component[rank];
610    
611     send_degree = TMPMEMALLOC(send_rows, index_t);
612     recv_ptr = MEMALLOC(recv_rows + 1, index_t);
613     #pragma omp for schedule(static) private(i)
614     for (i=0; i<send_rows; i++)
615     send_degree[i] = ptr[i+1] - ptr[i];
616     if (MY_DEBUG) fprintf(stderr, "rank %d CP5_1 srows %d rrows %d\n", rank, send_rows, recv_rows);
617    
618     /* First, send/receive the degree */
619     for (p=0; p<recv_neighbors; p++) { /* Receiving */
620     m = recv->offsetInShared[p];
621     n = recv->offsetInShared[p+1];
622     MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
623     P->mpi_info->msg_tag_counter + recv->neighbor[p],
624     P->mpi_info->comm,
625     &(P->col_coupler->mpi_requests[p]));
626     }
627     for (p=0; p<send_neighbors; p++) { /* Sending */
628     m = send->offsetInShared[p];
629     n = send->offsetInShared[p+1];
630     MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
631     P->mpi_info->msg_tag_counter + rank,
632     P->mpi_info->comm,
633     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
634     }
635     MPI_Waitall(send_neighbors+recv_neighbors,
636     P->col_coupler->mpi_requests,
637     P->col_coupler->mpi_stati);
638     P->mpi_info->msg_tag_counter += size;
639     if (MY_DEBUG) fprintf(stderr, "rank %d CP5_3 %d %d\n", rank, recv_neighbors, send_neighbors);
640    
641     TMPMEMFREE(send_degree);
642     m = Paso_Util_cumsum(recv_rows, recv_ptr);
643     recv_ptr[recv_rows] = m;
644     recv_idx = MEMALLOC(m, index_t);
645 lgao 3779 recv_val = MEMALLOC(m * block_size, double);
646 lgao 3763 if (MY_DEBUG) fprintf(stderr, "rank %d receive size %d\n", rank, m);
647    
648     /* Next, send/receive the index array */
649     j = 0;
650     for (p=0; p<recv_neighbors; p++) { /* Receiving */
651     m = recv->offsetInShared[p];
652     n = recv->offsetInShared[p+1];
653     i = recv_ptr[n] - recv_ptr[m];
654     if (i > 0) {
655     if (MY_DEBUG) fprintf(stderr, "rank %d recv message size %d from %d (%d, %d, %d, %d, %d) \n", rank, i, recv->neighbor[p], p, m, n, recv_ptr[n], recv_ptr[m]);
656     MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
657     P->mpi_info->msg_tag_counter + recv->neighbor[p],
658     P->mpi_info->comm,
659     &(P->col_coupler->mpi_requests[p]));
660     } else
661     if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty recv message %d %d %d %d %d %d\n", rank, recv_neighbors, p, m, n, recv_ptr[n], recv_ptr[m]);
662     j += i;
663     }
664    
665     j = 0;
666     for (p=0; p<send_neighbors; p++) { /* Sending */
667     m = send->offsetInShared[p];
668     n = send->offsetInShared[p+1];
669     i = ptr[n] - ptr[m];
670     if (i >0) {
671     if (MY_DEBUG) fprintf(stderr, "rank %d send message size %d to %d (%d, %d, %d, %d, %d) \n", rank, i, send->neighbor[p], p, m, n, ptr[n], ptr[m]);
672     MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
673     P->mpi_info->msg_tag_counter + rank,
674     P->mpi_info->comm,
675     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
676     j += i;
677     } else
678     if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty send message %d %d %d %d %d %d\n", rank, send_neighbors, p, m, n, ptr[n], ptr[m]);
679     }
680     MPI_Waitall(send_neighbors+recv_neighbors,
681     P->col_coupler->mpi_requests,
682     P->col_coupler->mpi_stati);
683     P->mpi_info->msg_tag_counter += size;
684     if (MY_DEBUG) fprintf(stderr, "rank %d CP5_5\n", rank);
685    
686     /* Last, send/receive the data array */
687     j = 0;
688     for (p=0; p<recv_neighbors; p++) { /* Receiving */
689     m = recv->offsetInShared[p];
690     n = recv->offsetInShared[p+1];
691     i = recv_ptr[n] - recv_ptr[m];
692     if (i > 0)
693 lgao 3779 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
694 lgao 3763 P->mpi_info->msg_tag_counter + recv->neighbor[p],
695     P->mpi_info->comm,
696     &(P->col_coupler->mpi_requests[p]));
697 lgao 3779 j += (i*block_size);
698 lgao 3763 }
699    
700     j = 0;
701     for (p=0; p<send_neighbors; p++) { /* Sending */
702     m = send->offsetInShared[p];
703     n = send->offsetInShared[p+1];
704     i = ptr[n] - ptr[m];
705     if (i >0) {
706 lgao 3779 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
707 lgao 3763 P->mpi_info->msg_tag_counter + rank,
708     P->mpi_info->comm,
709     &(P->col_coupler->mpi_requests[p+recv_neighbors]));
710 lgao 3779 j += i * block_size;
711 lgao 3763 }
712     }
713     MPI_Waitall(send_neighbors+recv_neighbors,
714     P->col_coupler->mpi_requests,
715     P->col_coupler->mpi_stati);
716     P->mpi_info->msg_tag_counter += size;
717     if (MY_DEBUG) fprintf(stderr, "rank %d CP5_7\n", rank);
718    
719     /* Clean up and return with recevied ptr, index and data arrays */
720     MEMFREE(ptr);
721     MEMFREE(idx);
722     MEMFREE(val);
723     if (MY_DEBUG) fprintf(stderr, "rank %d len %d\n", rank, recv_ptr[recv_rows]);
724     *p_ptr = recv_ptr;
725     *p_idx = recv_idx;
726     *p_val = recv_val;
727     }
728    
729     Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
730     Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
731     {
732     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
733     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
734     Paso_SystemMatrix *out=NULL;
735     Paso_SystemMatrixPattern *pattern=NULL;
736     Paso_Distribution *input_dist=NULL, *output_dist=NULL;
737     Paso_SharedComponents *send =NULL, *recv=NULL;
738     Paso_Connector *col_connector=NULL, *row_connector=NULL;
739     Paso_Coupler *coupler=NULL;
740     Paso_Pattern *main_pattern=NULL;
741     Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
742     const dim_t row_block_size=A->row_block_size;
743     const dim_t col_block_size=A->col_block_size;
744 lgao 3779 const dim_t block_size = A->block_size;
745     const dim_t P_block_size = P->block_size;
746 lgao 3763 const dim_t num_threads=omp_get_max_threads();
747     const double ZERO = 0.0;
748     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
749 lgao 3779 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
750 lgao 3763 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
751     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
752     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
753     index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
754     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
755     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
756     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
757 lgao 3779 index_t sum, i, j, k, iptr, irb, icb, ib;
758 lgao 3763 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
759     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
760     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
761     index_t row_marker_ext, *where_p=NULL;
762     index_t **send_ptr=NULL, **send_idx=NULL;
763     dim_t l, p, q, global_label, num_neighbors;
764     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
765     Esys_MPI_rank *neighbor=NULL;
766     #ifdef ESYS_MPI
767     MPI_Request* mpi_requests=NULL;
768     MPI_Status* mpi_stati=NULL;
769     #else
770     int *mpi_requests=NULL, *mpi_stati=NULL;
771     #endif
772    
773 lgao 3817 // if (size == 1) return NULL;
774 lgao 3779 // if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
775     // return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);
776    
777 lgao 3763 /* two sparse matrixes R_main and R_couple will be generate, as the
778     transpose of P_main and P_col_couple, respectively. Note that,
779     R_couple is actually the row_coupleBlock of R (R=P^T) */
780     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
781 lgao 3817 if (size > 1)
782     R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
783 lgao 3763
784 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP1\n", rank);
785 lgao 3779
786     /* generate P_ext, i.e. portion of P that is tored on neighbor procs
787     and needed locally for triple matrix product RAP
788     to be specific, P_ext (on processor k) are group of rows in P, where
789     the list of rows from processor q is given by
790     A->col_coupler->connector->send->shared[rPtr...]
791     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
792     on q.
793     P_ext is represented by two sparse matrixes P_ext_main and
794     P_ext_couple */
795     Paso_Preconditioner_AMG_extendB(A, P);
796 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP2\n", rank);
797 lgao 3779
798     /* count the number of cols in P_ext_couple, resize the pattern of
799     sparse matrix P_ext_couple with new compressed order, and then
800     build the col id mapping from P->col_coupleBlock to
801     P_ext_couple */
802     num_Pmain_cols = P->mainBlock->numCols;
803 lgao 3817 if (size > 1) {
804     num_Pcouple_cols = P->col_coupleBlock->numCols;
805     num_Acouple_cols = A->col_coupleBlock->numCols;
806     sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
807     } else {
808     num_Pcouple_cols = 0;
809     num_Acouple_cols = 0;
810     sum = 0;
811     }
812 lgao 3779 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
813     offset = P->col_distribution->first_component[rank];
814     num_Pext_cols = 0;
815     if (P->global_id) {
816     /* first count the number of cols "num_Pext_cols" in both P_ext_couple
817     and P->col_coupleBlock */
818     iptr = 0;
819     if (num_Pcouple_cols || sum > 0) {
820     temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
821     for (; iptr<sum; iptr++){
822     temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
823     if (MY_DEBUG && rank ==1)
824     fprintf(stderr, "rank %d remote_block[%d] = %d\n", rank, iptr, temp[iptr]);
825     }
826     for (j=0; j<num_Pcouple_cols; j++, iptr++){
827     temp[iptr] = P->global_id[j];
828     if (MY_DEBUG && rank ==1)
829     fprintf(stderr, "rank %d global_id[%d] = %d\n", rank, j, P->global_id[j]);
830     }
831     }
832     if (iptr) {
833     #ifdef USE_QSORTG
834     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
835     #else
836     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
837     #endif
838     num_Pext_cols = 1;
839     i = temp[0];
840     for (j=1; j<iptr; j++) {
841     if (temp[j] > i) {
842     i = temp[j];
843     temp[num_Pext_cols++] = i;
844     }
845     }
846     }
847 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank%d CP2_1\n", rank);
848 lgao 3779 /* resize the pattern of P_ext_couple */
849     if(num_Pext_cols){
850     global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
851     for (i=0; i<num_Pext_cols; i++)
852     global_id_P[i] = temp[i];
853     if (MY_DEBUG && rank == 1) {
854     int my_i, sum=num_Pext_cols;
855     char *str1, *str2;
856     str1 = TMPMEMALLOC(sum*100+100, char);
857     str2 = TMPMEMALLOC(100, char);
858     sprintf(str1, "rank %d global_id_P[%d] = (", rank, sum);
859     for (my_i=0; my_i<sum; my_i++) {
860     sprintf(str2, "%d ", global_id_P[my_i]);
861     strcat(str1, str2);
862     }
863     fprintf(stderr, "%s)\n", str1);
864     TMPMEMFREE(str1);
865     TMPMEMFREE(str2);
866     }
867    
868     }
869     if (num_Pcouple_cols || sum > 0)
870     TMPMEMFREE(temp);
871     for (i=0; i<sum; i++) {
872     where_p = (index_t *)bsearch(
873     &(P->remote_coupleBlock->pattern->index[i]),
874     global_id_P, num_Pext_cols,
875     sizeof(index_t), Paso_comparIndex);
876     P->remote_coupleBlock->pattern->index[i] =
877     (index_t)(where_p -global_id_P);
878     }
879    
880     /* build the mapping */
881     if (num_Pcouple_cols) {
882     Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
883     iptr = 0;
884     for (i=0; i<num_Pext_cols; i++)
885     if (global_id_P[i] == P->global_id[iptr]) {
886     Pcouple_to_Pext[iptr++] = i;
887     if (iptr == num_Pcouple_cols) break;
888     }
889     }
890     }
891    
892 lgao 3817 if (MY_DEBUG) fprintf(stderr, "CP3\n");
893 lgao 3779 if (MY_DEBUG && rank == 0) {
894     fprintf(stderr, "Rank %d Now Prolongation Matrix ************************\n", rank);
895     Paso_SystemMatrix_print(P);
896     fprintf(stderr, "Rank %d Now Restriction Matrix ************************\n", rank);
897     Paso_SystemMatrix_print(R);
898     }
899    
900     /* alloc and initialize the makers */
901     sum = num_Pext_cols + num_Pmain_cols;
902     P_marker = TMPMEMALLOC(sum, index_t);
903     A_marker = TMPMEMALLOC(num_A_cols, index_t);
904     #pragma omp parallel for private(i) schedule(static)
905     for (i=0; i<sum; i++) P_marker[i] = -1;
906     #pragma omp parallel for private(i) schedule(static)
907     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
908    
909     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_1 %d\n", rank, num_Pcouple_cols);
910     /* Now, count the size of RAP_ext. Start with rows in R_couple */
911     sum = 0;
912     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
913     row_marker = sum;
914     /* then, loop over elements in row i_r of R_couple */
915     j1_ub = R_couple->pattern->ptr[i_r+1];
916     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_2 %d (%d %d)\n", rank, i_r, R_couple->pattern->ptr[i_r], j1_ub);
917     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
918     i1 = R_couple->pattern->index[j1];
919     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_3 %d %d\n", rank, j1, i1);
920     /* then, loop over elements in row i1 of A->col_coupleBlock */
921     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
922     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_4 %d %d %d\n", rank, j1, i1, j2_ub);
923     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
924     i2 = A->col_coupleBlock->pattern->index[j2];
925    
926     /* check whether entry RA[i_r, i2] has been previously visited.
927     RAP new entry is possible only if entry RA[i_r, i2] has not
928     been visited yet */
929     if (A_marker[i2] != i_r) {
930     /* first, mark entry RA[i_r,i2] as visited */
931     A_marker[i2] = i_r;
932    
933     /* then loop over elements in row i2 of P_ext_main */
934     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
935     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
936     i_c = P->row_coupleBlock->pattern->index[j3];
937    
938     /* check whether entry RAP[i_r,i_c] has been created.
939     If not yet created, create the entry and increase
940     the total number of elements in RAP_ext */
941     if (P_marker[i_c] < row_marker) {
942     P_marker[i_c] = sum;
943     sum++;
944     }
945     }
946    
947     /* loop over elements in row i2 of P_ext_couple, do the same */
948     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
949     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
950     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
951     if (P_marker[i_c] < row_marker) {
952     P_marker[i_c] = sum;
953     sum++;
954     }
955     }
956     }
957     }
958    
959     /* now loop over elements in row i1 of A->mainBlock, repeat
960     the process we have done to block A->col_coupleBlock */
961     j2_ub = A->mainBlock->pattern->ptr[i1+1];
962     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
963     i2 = A->mainBlock->pattern->index[j2];
964     if (A_marker[i2 + num_Acouple_cols] != i_r) {
965     A_marker[i2 + num_Acouple_cols] = i_r;
966     j3_ub = P->mainBlock->pattern->ptr[i2+1];
967     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
968     i_c = P->mainBlock->pattern->index[j3];
969     if (P_marker[i_c] < row_marker) {
970     P_marker[i_c] = sum;
971     sum++;
972     }
973     }
974     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
975     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
976     /* note that we need to map the column index in
977     P->col_coupleBlock back into the column index in
978     P_ext_couple */
979     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
980 lgao 3817 if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d-%d out of %d+%d) 5\n", rank, i_c, P->col_coupleBlock->pattern->index[j3], num_Pext_cols, num_Pmain_cols);
981 lgao 3779 if (P_marker[i_c] < row_marker) {
982     P_marker[i_c] = sum;
983     sum++;
984     }
985     }
986     }
987     }
988     }
989     }
990 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP4 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);
991 lgao 3779
992     /* Now we have the number of non-zero elements in RAP_ext, allocate
993     PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
994     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
995     RAP_ext_idx = MEMALLOC(sum, index_t);
996     RAP_ext_val = MEMALLOC(sum * block_size, double);
997     RA_val = TMPMEMALLOC(block_size, double);
998     RAP_val = TMPMEMALLOC(block_size, double);
999    
1000     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
1001     sum = num_Pext_cols + num_Pmain_cols;
1002     #pragma omp parallel for private(i) schedule(static)
1003     for (i=0; i<sum; i++) P_marker[i] = -1;
1004     #pragma omp parallel for private(i) schedule(static)
1005     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1006     if (MY_DEBUG) fprintf(stderr, "rank %d CP4_1 sum %d\n", rank, sum);
1007     sum = 0;
1008     RAP_ext_ptr[0] = 0;
1009     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
1010     row_marker = sum;
1011     if (MY_DEBUG) fprintf(stderr, "rank %d CP4_2 i_r %d sum %d\n", rank, i_r, sum);
1012     /* then, loop over elements in row i_r of R_couple */
1013     j1_ub = R_couple->pattern->ptr[i_r+1];
1014     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
1015     i1 = R_couple->pattern->index[j1];
1016     R_val = &(R_couple->val[j1*P_block_size]);
1017    
1018     /* then, loop over elements in row i1 of A->col_coupleBlock */
1019     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1020     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1021     i2 = A->col_coupleBlock->pattern->index[j2];
1022     //RA_val = R_val * A->col_coupleBlock->val[j2];
1023     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1024     for (irb=0; irb<row_block_size; irb++)
1025     for (icb=0; icb<col_block_size; icb++)
1026     RA_val[irb+row_block_size*icb] = ZERO;
1027     for (irb=0; irb<P_block_size; irb++) {
1028     rtmp=R_val[irb];
1029     for (icb=0; icb<col_block_size; icb++) {
1030     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1031     }
1032     }
1033    
1034     /* check whether entry RA[i_r, i2] has been previously visited.
1035     RAP new entry is possible only if entry RA[i_r, i2] has not
1036     been visited yet */
1037     if (A_marker[i2] != i_r) {
1038     /* first, mark entry RA[i_r,i2] as visited */
1039     A_marker[i2] = i_r;
1040    
1041     /* then loop over elements in row i2 of P_ext_main */
1042     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1043     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1044     i_c = P->row_coupleBlock->pattern->index[j3];
1045     // RAP_val = RA_val * P->row_coupleBlock->val[j3];
1046     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1047     for (irb=0; irb<row_block_size; irb++)
1048     for (icb=0; icb<col_block_size; icb++)
1049     RAP_val[irb+row_block_size*icb] = ZERO;
1050     for (icb=0; icb<P_block_size; icb++) {
1051     rtmp = temp_val[icb];
1052     for (irb=0; irb<row_block_size; irb++) {
1053     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1054     }
1055     }
1056    
1057     /* check whether entry RAP[i_r,i_c] has been created.
1058     If not yet created, create the entry and increase
1059     the total number of elements in RAP_ext */
1060     if (P_marker[i_c] < row_marker) {
1061     P_marker[i_c] = sum;
1062     //RAP_ext_val[sum*block_size] = RAP_val;
1063     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1064     RAP_ext_idx[sum] = i_c + offset;
1065     if (MY_DEBUG){
1066     if (rank == 1) fprintf(stderr, "1-ext[%d (%d,%d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum*block_size]);
1067     }
1068     sum++;
1069     } else {
1070     //RAP_ext_val[P_marker[i_c] * block_size] += RAP_val;
1071     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1072     for (ib=0; ib<block_size; ib++) {
1073     temp_val[ib] += RAP_val[ib];
1074     }
1075    
1076     if (MY_DEBUG){
1077     if (rank == 1) fprintf(stderr, "1-ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);
1078     }
1079     }
1080     }
1081    
1082     /* loop over elements in row i2 of P_ext_couple, do the same */
1083     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1084     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1085     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
1086     //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1087     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1088     for (irb=0; irb<row_block_size; irb++)
1089     for (icb=0; icb<col_block_size; icb++)
1090     RAP_val[irb+row_block_size*icb]=ZERO;
1091     for (icb=0; icb<P_block_size; icb++) {
1092     rtmp = temp_val[icb];
1093     for (irb=0; irb<row_block_size; irb++) {
1094     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1095     }
1096     }
1097    
1098     if (P_marker[i_c] < row_marker) {
1099     P_marker[i_c] = sum;
1100     //RAP_ext_val[sum] = RAP_val;
1101     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1102     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1103     if (MY_DEBUG){
1104     if (rank == 1) fprintf(stderr, "ext[%d (%d,%d)]=%f (R%f, A(%d %d)%f, P%f)\n", sum, i_r, i_c, RAP_ext_val[sum], *R_val, i1, i2, A->col_coupleBlock->val[j2], P->remote_coupleBlock->val[j3]);
1105     }
1106     sum++;
1107     } else {
1108     //RAP_ext_val[P_marker[i_c]] += RAP_val;
1109     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1110     for (ib=0; ib<block_size; ib++) {
1111     temp_val[ib] += RAP_val[ib];
1112     }
1113    
1114     if (MY_DEBUG){
1115     if (rank == 1) fprintf(stderr, "ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);
1116     }
1117     }
1118     }
1119    
1120     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1121     Only the contributions are added. */
1122     } else {
1123     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1124     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1125     i_c = P->row_coupleBlock->pattern->index[j3];
1126     //RAP_val = RA_val * P->row_coupleBlock->val[j3];
1127     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1128     for (irb=0; irb<row_block_size; irb++)
1129     for (icb=0; icb<col_block_size; icb++)
1130     RAP_val[irb+row_block_size*icb]=ZERO;
1131     for (icb=0; icb<P_block_size; icb++) {
1132     rtmp = temp_val[icb];
1133     for (irb=0; irb<row_block_size; irb++) {
1134     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1135     }
1136     }
1137    
1138     //RAP_ext_val[P_marker[i_c]] += RAP_val;
1139     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1140     for (ib=0; ib<block_size; ib++) {
1141     temp_val[ib] += RAP_val[ib];
1142     }
1143    
1144     if (MY_DEBUG){
1145     if (rank == 1) fprintf(stderr, "Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
1146     }
1147     }
1148     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1149     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1150     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
1151     //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1152     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1153     for (irb=0; irb<row_block_size; irb++)
1154     for (icb=0; icb<col_block_size; icb++)
1155     RAP_val[irb+row_block_size*icb]=ZERO;
1156     for (icb=0; icb<P_block_size; icb++) {
1157     rtmp = temp_val[icb];
1158     for (irb=0; irb<row_block_size; irb++) {
1159     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1160     }
1161     }
1162    
1163     //RAP_ext_val[P_marker[i_c]] += RAP_val;
1164     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1165     for (ib=0; ib<block_size; ib++) {
1166     temp_val[ib] += RAP_val[ib];
1167     }
1168    
1169     if (MY_DEBUG){
1170     if (rank == 1) fprintf(stderr, "Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
1171     }
1172     }
1173     }
1174     }
1175    
1176     /* now loop over elements in row i1 of A->mainBlock, repeat
1177     the process we have done to block A->col_coupleBlock */
1178     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1179     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1180     i2 = A->mainBlock->pattern->index[j2];
1181     //RA_val = R_val * A->mainBlock->val[j2];
1182     temp_val = &(A->mainBlock->val[j2*block_size]);
1183     for (irb=0; irb<row_block_size; irb++)
1184     for (icb=0; icb<col_block_size; icb++)
1185     RA_val[irb+row_block_size*icb]=ZERO;
1186     for (irb=0; irb<P_block_size; irb++) {
1187     rtmp=R_val[irb];
1188     for (icb=0; icb<col_block_size; icb++) {
1189     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1190     }
1191     }
1192    
1193     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1194     A_marker[i2 + num_Acouple_cols] = i_r;
1195     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1196     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1197     i_c = P->mainBlock->pattern->index[j3];
1198     //RAP_val = RA_val * P->mainBlock->val[j3];
1199     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1200     for (irb=0; irb<row_block_size; irb++)
1201     for (icb=0; icb<col_block_size; icb++)
1202     RAP_val[irb+row_block_size*icb]=ZERO;
1203     for (icb=0; icb<P_block_size; icb++) {
1204     rtmp = temp_val[icb];
1205     for (irb=0; irb<row_block_size; irb++) {
1206     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1207     }
1208     }
1209    
1210     if (P_marker[i_c] < row_marker) {
1211     P_marker[i_c] = sum;
1212     //RAP_ext_val[sum] = RAP_val;
1213     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1214     RAP_ext_idx[sum] = i_c + offset;
1215     if (MY_DEBUG){
1216     if (rank == 1) fprintf(stderr, "Main 1-ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);
1217     }
1218     sum++;
1219     } else {
1220     //RAP_ext_val[P_marker[i_c]] += RAP_val;
1221     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1222     for (ib=0; ib<block_size; ib++) {
1223     temp_val[ib] += RAP_val[ib];
1224     }
1225    
1226     if (MY_DEBUG){
1227     if (rank == 1) fprintf(stderr, "Main 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
1228     }
1229     }
1230     }
1231     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1232     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1233     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1234 lgao 3817 if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 1\n", rank, i_c, num_Pext_cols + num_Pmain_cols);
1235 lgao 3779 //RAP_val = RA_val * P->col_coupleBlock->val[j3];
1236     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1237     for (irb=0; irb<row_block_size; irb++)
1238     for (icb=0; icb<col_block_size; icb++)
1239     RAP_val[irb+row_block_size*icb]=ZERO;
1240     for (icb=0; icb<P_block_size; icb++) {
1241     rtmp=temp_val[icb];
1242     for (irb=0; irb<row_block_size; irb++) {
1243     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1244     }
1245     }
1246    
1247     if (P_marker[i_c] < row_marker) {
1248     P_marker[i_c] = sum;
1249     //RAP_ext_val[sum] = RAP_val;
1250     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1251     RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1252     if (MY_DEBUG){
1253     if (rank == 1) fprintf(stderr, "Main ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);
1254     }
1255     sum++;
1256     } else {
1257     //RAP_ext_val[P_marker[i_c]] += RAP_val;
1258     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1259     for (ib=0; ib<block_size; ib++) {
1260     temp_val[ib] += RAP_val[ib];
1261     }
1262    
1263     if (MY_DEBUG){
1264     if (rank == 1) fprintf(stderr, "Main ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
1265     }
1266     }
1267     }
1268     } else {
1269     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1270     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1271     i_c = P->mainBlock->pattern->index[j3];
1272     //RAP_val = RA_val * P->mainBlock->val[j3];
1273     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1274     for (irb=0; irb<row_block_size; irb++)
1275     for (icb=0; icb<col_block_size; icb++)
1276     RAP_val[irb+row_block_size*icb]=ZERO;
1277     for (icb=0; icb<P_block_size; icb++) {
1278     rtmp=temp_val[icb];
1279     for (irb=0; irb<row_block_size; irb++) {
1280     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1281     }
1282     }
1283    
1284     //RAP_ext_val[P_marker[i_c]] += RAP_val;
1285     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1286     for (ib=0; ib<block_size; ib++) {
1287     temp_val[ib] += RAP_val[ib];
1288     }
1289    
1290     if (MY_DEBUG){
1291     if (rank == 1) fprintf(stderr, "Main Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
1292     }
1293     }
1294     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1295     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1296     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1297     //RAP_val = RA_val * P->col_coupleBlock->val[j3];
1298     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1299     for (irb=0; irb<row_block_size; irb++)
1300     for (icb=0; icb<col_block_size; icb++)
1301     RAP_val[irb+row_block_size*icb]=ZERO;
1302     for (icb=0; icb<P_block_size; icb++) {
1303     rtmp=temp_val[icb];
1304     for (irb=0; irb<row_block_size; irb++) {
1305     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1306     }
1307     }
1308    
1309     //RAP_ext_val[P_marker[i_c]] += RAP_val;
1310     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1311     for (ib=0; ib<block_size; ib++) {
1312     temp_val[ib] += RAP_val[ib];
1313     }
1314    
1315     if (MY_DEBUG){
1316     if (rank == 1) fprintf(stderr, "Main Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
1317     }
1318     }
1319     }
1320     }
1321     }
1322     RAP_ext_ptr[i_r+1] = sum;
1323     }
1324     TMPMEMFREE(P_marker);
1325     TMPMEMFREE(Pcouple_to_Pext);
1326     if (MY_DEBUG) fprintf(stderr, "rank %d CP5 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);
1327    
1328     if (MY_DEBUG) {
1329     char *str1, *str2;
1330     str1 = TMPMEMALLOC(sum*100+100, char);
1331     str2 = TMPMEMALLOC(15, char);
1332     sprintf(str1, "rank %d RAP_ext_idx[%d] = (", rank, sum);
1333     for (i=0; i<sum; i++) {
1334     sprintf(str2, "%d ", RAP_ext_idx[i]);
1335     strcat(str1, str2);
1336     }
1337     fprintf(stderr, "%s)\n", str1);
1338     TMPMEMFREE(str1);
1339     TMPMEMFREE(str2);
1340     }
1341    
1342     /* Now we have part of RAP[r,c] where row "r" is the list of rows
1343     which is given by the column list of P->col_coupleBlock, and
1344     column "c" is the list of columns which possiblely covers the
1345     whole column range of systme martris P. This part of data is to
1346     be passed to neighboring processors, and added to corresponding
1347     RAP[r,c] entries in the neighboring processors */
1348     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1349     &RAP_ext_val, global_id_P, block_size);
1350 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP6\n", rank);
1351 lgao 3779
1352     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1353     sum = RAP_ext_ptr[num_RAPext_rows];
1354     if (MY_DEBUG) fprintf(stderr, "rank %d CP6_0 %d %d (%d %d)\n", rank, sum, num_Pext_cols, num_RAPext_rows, num_Pcouple_cols);
1355     num_RAPext_cols = 0;
1356     if (num_Pext_cols || sum > 0) {
1357     temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
1358     j1_ub = offset + num_Pmain_cols;
1359     for (i=0, iptr=0; i<sum; i++) {
1360     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */ {
1361     temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1362     if (MY_DEBUG && rank ==1) fprintf(stderr, "RAP_ext_idx[%d]=%d\n", i, RAP_ext_idx[i]);
1363     }
1364     }
1365     for (j=0; j<num_Pext_cols; j++, iptr++){
1366     temp[iptr] = global_id_P[j];
1367     }
1368    
1369     if (iptr) {
1370     #ifdef USE_QSORTG
1371     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1372     #else
1373     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1374     #endif
1375     num_RAPext_cols = 1;
1376     i = temp[0];
1377     for (j=1; j<iptr; j++) {
1378     if (temp[j] > i) {
1379     i = temp[j];
1380     temp[num_RAPext_cols++] = i;
1381     }
1382     }
1383    
1384     if (MY_DEBUG) {
1385     char *str1, *str2;
1386     index_t my_sum=iptr, i;
1387     str1 = TMPMEMALLOC(my_sum*20+100, char);
1388     str2 = TMPMEMALLOC(15, char);
1389     sprintf(str1, "rank %d temp[%d] = (", rank, my_sum);
1390     for (i=0; i<my_sum; i++){
1391     sprintf(str2, "%d ", temp[i]);
1392     strcat(str1, str2);
1393     }
1394     fprintf(stderr, "%s)\n", str1);
1395     TMPMEMFREE(str1);
1396     TMPMEMFREE(str2);
1397     }
1398     }
1399     }
1400     if (MY_DEBUG) fprintf(stderr, "rank %d CP6_1 %d %d\n", rank, iptr, num_RAPext_cols);
1401    
1402     /* resize the pattern of P_ext_couple */
1403     if(num_RAPext_cols){
1404     global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
1405     for (i=0; i<num_RAPext_cols; i++)
1406     global_id_RAP[i] = temp[i];
1407     }
1408     if (MY_DEBUG) {
1409     char *str1, *str2;
1410     index_t my_sum=num_RAPext_cols, i;
1411     str1 = TMPMEMALLOC(my_sum*20+100, char);
1412     str2 = TMPMEMALLOC(15, char);
1413     sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, my_sum);
1414     for (i=0; i<my_sum; i++){
1415     sprintf(str2, "%d ", global_id_RAP[i]);
1416     strcat(str1, str2);
1417     }
1418     fprintf(stderr, "%s)\n", str1);
1419     TMPMEMFREE(str1);
1420     TMPMEMFREE(str2);
1421     }
1422     if (num_Pext_cols || sum > 0)
1423     TMPMEMFREE(temp);
1424     j1_ub = offset + num_Pmain_cols;
1425     for (i=0; i<sum; i++) {
1426     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1427     where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1428     /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1429     RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1430     } else
1431     RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1432     }
1433     if (MY_DEBUG) fprintf(stderr, "rank %d CP7 %d\n", rank, num_Pext_cols);
1434    
1435     /* build the mapping */
1436     if (num_Pcouple_cols) {
1437     Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
1438     iptr = 0;
1439     for (i=0; i<num_RAPext_cols; i++)
1440     if (global_id_RAP[i] == P->global_id[iptr]) {
1441     Pcouple_to_RAP[iptr++] = i;
1442     if (iptr == num_Pcouple_cols) break;
1443     }
1444     }
1445    
1446     if (num_Pext_cols) {
1447     Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
1448     iptr = 0;
1449     for (i=0; i<num_RAPext_cols; i++)
1450     if (global_id_RAP[i] == global_id_P[iptr]) {
1451     Pext_to_RAP[iptr++] = i;
1452     if (iptr == num_Pext_cols) break;
1453     }
1454     }
1455    
1456     if (global_id_P){
1457     TMPMEMFREE(global_id_P);
1458     global_id_P = NULL;
1459     }
1460    
1461     /* alloc and initialize the makers */
1462     sum = num_RAPext_cols + num_Pmain_cols;
1463     P_marker = TMPMEMALLOC(sum, index_t);
1464     #pragma omp parallel for private(i) schedule(static)
1465     for (i=0; i<sum; i++) P_marker[i] = -1;
1466     #pragma omp parallel for private(i) schedule(static)
1467     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1468 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP8 %d\n", rank, num_Pmain_cols);
1469 lgao 3779
1470     /* Now, count the size of RAP. Start with rows in R_main */
1471     num_neighbors = P->col_coupler->connector->send->numNeighbors;
1472     offsetInShared = P->col_coupler->connector->send->offsetInShared;
1473     shared = P->col_coupler->connector->send->shared;
1474     i = 0;
1475     j = 0;
1476     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1477     /* Mark the start of row for both main block and couple block */
1478     row_marker = i;
1479     row_marker_ext = j;
1480    
1481     /* Mark the diagonal element RAP[i_r, i_r], and other elements
1482     in RAP_ext */
1483     P_marker[i_r] = i;
1484     i++;
1485     for (j1=0; j1<num_neighbors; j1++) {
1486     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1487     if (shared[j2] == i_r) {
1488     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1489     i_c = RAP_ext_idx[k];
1490     if (i_c < num_Pmain_cols) {
1491     if (P_marker[i_c] < row_marker) {
1492     P_marker[i_c] = i;
1493     i++;
1494     }
1495     } else {
1496     if (P_marker[i_c] < row_marker_ext) {
1497     P_marker[i_c] = j;
1498     j++;
1499     }
1500     }
1501     }
1502     break;
1503     }
1504     }
1505     }
1506    
1507     /* then, loop over elements in row i_r of R_main */
1508     j1_ub = R_main->pattern->ptr[i_r+1];
1509     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1510     i1 = R_main->pattern->index[j1];
1511    
1512     /* then, loop over elements in row i1 of A->col_coupleBlock */
1513     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1514     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1515     i2 = A->col_coupleBlock->pattern->index[j2];
1516    
1517     /* check whether entry RA[i_r, i2] has been previously visited.
1518     RAP new entry is possible only if entry RA[i_r, i2] has not
1519     been visited yet */
1520     if (A_marker[i2] != i_r) {
1521     /* first, mark entry RA[i_r,i2] as visited */
1522     A_marker[i2] = i_r;
1523    
1524     /* then loop over elements in row i2 of P_ext_main */
1525     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1526     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1527     i_c = P->row_coupleBlock->pattern->index[j3];
1528    
1529     /* check whether entry RAP[i_r,i_c] has been created.
1530     If not yet created, create the entry and increase
1531     the total number of elements in RAP_ext */
1532     if (P_marker[i_c] < row_marker) {
1533     P_marker[i_c] = i;
1534     i++;
1535     }
1536     }
1537    
1538     /* loop over elements in row i2 of P_ext_couple, do the same */
1539     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1540     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1541     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1542     if (P_marker[i_c] < row_marker_ext) {
1543     P_marker[i_c] = j;
1544     j++;
1545     }
1546     }
1547     }
1548     }
1549    
1550     /* now loop over elements in row i1 of A->mainBlock, repeat
1551     the process we have done to block A->col_coupleBlock */
1552     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1553     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1554     i2 = A->mainBlock->pattern->index[j2];
1555     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1556     A_marker[i2 + num_Acouple_cols] = i_r;
1557     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1558     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1559     i_c = P->mainBlock->pattern->index[j3];
1560     if (P_marker[i_c] < row_marker) {
1561     P_marker[i_c] = i;
1562     i++;
1563     }
1564     }
1565     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1566     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1567     /* note that we need to map the column index in
1568     P->col_coupleBlock back into the column index in
1569     P_ext_couple */
1570     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1571 lgao 3817 if (MY_DEBUG && (i_c <0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 2\n", rank, i_c, sum);
1572 lgao 3779 if (P_marker[i_c] < row_marker_ext) {
1573     P_marker[i_c] = j;
1574     j++;
1575     }
1576     }
1577     }
1578     }
1579     }
1580     }
1581 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP9 main_size%d couple_size%d\n", rank, i, j);
1582 lgao 3779
1583     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1584     Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1585     and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1586     RAP_couple */
1587     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1588     RAP_main_idx = MEMALLOC(i, index_t);
1589     RAP_main_val = TMPMEMALLOC(i * block_size, double);
1590     RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1591     RAP_couple_idx = MEMALLOC(j, index_t);
1592     RAP_couple_val = TMPMEMALLOC(j * block_size, double);
1593    
1594     RAP_main_ptr[num_Pmain_cols] = i;
1595     RAP_couple_ptr[num_Pmain_cols] = j;
1596    
1597     #pragma omp parallel for private(i) schedule(static)
1598     for (i=0; i<sum; i++) P_marker[i] = -1;
1599     #pragma omp parallel for private(i) schedule(static)
1600     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1601    
1602     /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1603     in R_main */
1604     i = 0;
1605     j = 0;
1606     for (i_r=0; i_r<num_Pmain_cols; i_r++){
1607     /* Mark the start of row for both main block and couple block */
1608     row_marker = i;
1609     row_marker_ext = j;
1610     RAP_main_ptr[i_r] = row_marker;
1611     RAP_couple_ptr[i_r] = row_marker_ext;
1612    
1613     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1614     in row i_r of RAP_ext */
1615     P_marker[i_r] = i;
1616     for (ib=0; ib<block_size; ib++)
1617     RAP_main_val[i*block_size+ib] = ZERO;
1618     RAP_main_idx[i] = i_r;
1619     i++;
1620    
1621     for (j1=0; j1<num_neighbors; j1++) {
1622     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1623     if (shared[j2] == i_r) {
1624     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1625     i_c = RAP_ext_idx[k];
1626     if (i_c < num_Pmain_cols) {
1627 lgao 3817 if (MY_DEBUG){
1628     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: ext %f\n", RAP_ext_val[k*block_size]);
1629     }
1630 lgao 3779 if (P_marker[i_c] < row_marker) {
1631     P_marker[i_c] = i;
1632     //RAP_main_val[i] = RAP_ext_val[k];
1633     memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1634     if (MY_DEBUG){
1635 lgao 3817 if (rank == 0 && i == 2) fprintf(stderr, "ext %f\n", RAP_ext_val[k*block_size]);
1636 lgao 3779 }
1637     RAP_main_idx[i] = i_c;
1638     i++;
1639     } else {
1640     //RAP_main_val[P_marker[i_c]] += RAP_ext_val[k];
1641     t1_val = &(RAP_ext_val[k*block_size]);
1642     t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1643     for (ib=0; ib<block_size; ib++)
1644     t2_val[ib] += t1_val[ib];
1645     if (MY_DEBUG){
1646     if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "i_c %d ext %f\n", i_c, RAP_ext_val[k]);
1647     }
1648     }
1649     } else {
1650     if (P_marker[i_c] < row_marker_ext) {
1651     P_marker[i_c] = j;
1652     //RAP_couple_val[j] = RAP_ext_val[k];
1653     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1654     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1655     j++;
1656     } else {
1657     //RAP_couple_val[P_marker[i_c]] += RAP_ext_val[k];
1658     t1_val = &(RAP_ext_val[k*block_size]);
1659     t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1660     for (ib=0; ib<block_size; ib++)
1661     t2_val[ib] += t1_val[ib];
1662     }
1663     }
1664     }
1665     break;
1666     }
1667     }
1668     }
1669    
1670     /* then, loop over elements in row i_r of R_main */
1671     j1_ub = R_main->pattern->ptr[i_r+1];
1672     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1673     i1 = R_main->pattern->index[j1];
1674     R_val = &(R_main->val[j1*P_block_size]);
1675    
1676     /* then, loop over elements in row i1 of A->col_coupleBlock */
1677     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1678     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1679     i2 = A->col_coupleBlock->pattern->index[j2];
1680     //RA_val = R_val * A->col_coupleBlock->val[j2];
1681     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1682     for (irb=0; irb<row_block_size; irb++)
1683     for (icb=0; icb<col_block_size; icb++)
1684     RA_val[irb+row_block_size*icb]=ZERO;
1685     for (irb=0; irb<P_block_size; irb++) {
1686     rtmp=R_val[irb];
1687     for (icb=0; icb<col_block_size; icb++) {
1688     RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1689     }
1690     }
1691    
1692    
1693     /* check whether entry RA[i_r, i2] has been previously visited.
1694     RAP new entry is possible only if entry RA[i_r, i2] has not
1695     been visited yet */
1696     if (A_marker[i2] != i_r) {
1697     /* first, mark entry RA[i_r,i2] as visited */
1698     A_marker[i2] = i_r;
1699    
1700     /* then loop over elements in row i2 of P_ext_main */
1701     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1702     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1703     i_c = P->row_coupleBlock->pattern->index[j3];
1704     //RAP_val = RA_val * P->row_coupleBlock->val[j3];
1705     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1706     for (irb=0; irb<row_block_size; irb++)
1707     for (icb=0; icb<col_block_size; icb++)
1708     RAP_val[irb+row_block_size*icb]=ZERO;
1709     for (icb=0; icb<P_block_size; icb++) {
1710     rtmp=temp_val[icb];
1711     for (irb=0; irb<row_block_size; irb++) {
1712     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1713     }
1714     }
1715    
1716    
1717     /* check whether entry RAP[i_r,i_c] has been created.
1718     If not yet created, create the entry and increase
1719     the total number of elements in RAP_ext */
1720 lgao 3817 if (MY_DEBUG){
1721     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP %.20f R %f %f A %f P %f %f [%d %d]\n", RAP_val[0], R_val[0], R_val[1], A->col_coupleBlock->val[j2*block_size], temp_val[0], temp_val[1], i2, j3);
1722     }
1723 lgao 3779 if (P_marker[i_c] < row_marker) {
1724     P_marker[i_c] = i;
1725     //RAP_main_val[i] = RAP_val;
1726     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1727     if (MY_DEBUG){
1728     if (rank == 0 && i == 2) fprintf(stderr, "R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
1729     }
1730     RAP_main_idx[i] = i_c;
1731     i++;
1732     } else {
1733     //RAP_main_val[P_marker[i_c]] += RAP_val;
1734     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1735     for (ib=0; ib<block_size; ib++) {
1736     temp_val[ib] += RAP_val[ib];
1737     }
1738    
1739     if (MY_DEBUG){
1740     if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1 , *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
1741     }
1742     }
1743     }
1744    
1745     /* loop over elements in row i2 of P_ext_couple, do the same */
1746     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1747     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1748     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1749     //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1750     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1751     for (irb=0; irb<row_block_size; irb++)
1752     for (icb=0; icb<col_block_size; icb++)
1753     RAP_val[irb+row_block_size*icb]=ZERO;
1754     for (icb=0; icb<P_block_size; icb++) {
1755     rtmp=temp_val[icb];
1756     for (irb=0; irb<row_block_size; irb++) {
1757     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1758     }
1759     }
1760    
1761     if (P_marker[i_c] < row_marker_ext) {
1762     P_marker[i_c] = j;
1763     //RAP_couple_val[j] = RAP_val;
1764     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1765     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1766     j++;
1767     } else {
1768     //RAP_couple_val[P_marker[i_c]] += RAP_val;
1769     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1770     for (ib=0; ib<block_size; ib++) {
1771     temp_val[ib] += RAP_val[ib];
1772     }
1773     }
1774     }
1775    
1776     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1777     Only the contributions are added. */
1778     } else {
1779     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1780     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1781     i_c = P->row_coupleBlock->pattern->index[j3];
1782     //RAP_val = RA_val * P->row_coupleBlock->val[j3];
1783     temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1784     for (irb=0; irb<row_block_size; irb++)
1785     for (icb=0; icb<col_block_size; icb++)
1786     RAP_val[irb+row_block_size*icb]=ZERO;
1787     for (icb=0; icb<P_block_size; icb++) {
1788     rtmp=temp_val[icb];
1789     for (irb=0; irb<row_block_size; irb++) {
1790     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1791     }
1792     }
1793    
1794     //RAP_main_val[P_marker[i_c]] += RAP_val;
1795 lgao 3817 if (MY_DEBUG){
1796     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[2] %.20f\n", RAP_val[0]);
1797     }
1798 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1799     for (ib=0; ib<block_size; ib++) {
1800     temp_val[ib] += RAP_val[ib];
1801     }
1802    
1803     if (MY_DEBUG){
1804     if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
1805     }
1806     }
1807     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1808     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1809     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1810     //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1811     temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1812     for (irb=0; irb<row_block_size; irb++)
1813     for (icb=0; icb<col_block_size; icb++)
1814     RAP_val[irb+row_block_size*icb]=ZERO;
1815     for (icb=0; icb<P_block_size; icb++) {
1816     rtmp=temp_val[icb];
1817     for (irb=0; irb<row_block_size; irb++) {
1818     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1819     }
1820     }
1821    
1822     //RAP_couple_val[P_marker[i_c]] += RAP_val;
1823     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1824     for (ib=0; ib<block_size; ib++) {
1825     temp_val[ib] += RAP_val[ib];
1826     }
1827     }
1828     }
1829     }
1830    
1831     /* now loop over elements in row i1 of A->mainBlock, repeat
1832     the process we have done to block A->col_coupleBlock */
1833     j2_ub = A->mainBlock->pattern->ptr[i1+1];
1834     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1835     i2 = A->mainBlock->pattern->index[j2];
1836     //RA_val = R_val * A->mainBlock->val[j2];
1837     temp_val = &(A->mainBlock->val[j2*block_size]);
1838     for (irb=0; irb<row_block_size; irb++)
1839     for (icb=0; icb<col_block_size; icb++)
1840     RA_val[irb+row_block_size*icb]=ZERO;
1841     for (irb=0; irb<P_block_size; irb++) {
1842     rtmp=R_val[irb];
1843     for (icb=0; icb<col_block_size; icb++) {
1844     RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1845     }
1846     }
1847    
1848     if (A_marker[i2 + num_Acouple_cols] != i_r) {
1849     A_marker[i2 + num_Acouple_cols] = i_r;
1850     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1851     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1852     i_c = P->mainBlock->pattern->index[j3];
1853     //RAP_val = RA_val * P->mainBlock->val[j3];
1854     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1855     for (irb=0; irb<row_block_size; irb++)
1856     for (icb=0; icb<col_block_size; icb++)
1857     RAP_val[irb+row_block_size*icb]=ZERO;
1858     for (icb=0; icb<P_block_size; icb++) {
1859     rtmp=temp_val[icb];
1860     for (irb=0; irb<row_block_size; irb++) {
1861     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1862     }
1863     }
1864    
1865 lgao 3817 if (MY_DEBUG){
1866     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[3] %.20f R %f %f A %f P %f %f\n [%d %d]", RAP_val[0], R_val[0], R_val[1], A->mainBlock->val[j2*block_size], temp_val[0], temp_val[1], i2, j3);
1867     }
1868 lgao 3779 if (P_marker[i_c] < row_marker) {
1869     P_marker[i_c] = i;
1870     //RAP_main_val[i] = RAP_val;
1871     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1872     if (MY_DEBUG){
1873     if (rank == 0 && i == 2) fprintf(stderr, "Main R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
1874     }
1875     RAP_main_idx[i] = i_c;
1876     i++;
1877     } else {
1878     //RAP_main_val[P_marker[i_c]] += RAP_val;
1879     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1880     for (ib=0; ib<block_size; ib++) {
1881     temp_val[ib] += RAP_val[ib];
1882     }
1883    
1884     if (MY_DEBUG){
1885     if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
1886     }
1887     }
1888     }
1889     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1890     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1891     /* note that we need to map the column index in
1892     P->col_coupleBlock back into the column index in
1893     P_ext_couple */
1894     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1895 lgao 3817 if (MY_DEBUG && (i_c < 0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d--%d,%d) 3\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);
1896 lgao 3779 //RAP_val = RA_val * P->col_coupleBlock->val[j3];
1897     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1898     for (irb=0; irb<row_block_size; irb++)
1899     for (icb=0; icb<col_block_size; icb++)
1900     RAP_val[irb+row_block_size*icb]=ZERO;
1901     for (icb=0; icb<P_block_size; icb++) {
1902     rtmp=temp_val[icb];
1903     for (irb=0; irb<row_block_size; irb++) {
1904     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1905     }
1906     }
1907    
1908     if (P_marker[i_c] < row_marker_ext) {
1909     P_marker[i_c] = j;
1910     //RAP_couple_val[j] = RAP_val;
1911     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1912     RAP_couple_idx[j] = i_c - num_Pmain_cols;
1913     j++;
1914     } else {
1915     //RAP_couple_val[P_marker[i_c]] += RAP_val;
1916     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1917     for (ib=0; ib<block_size; ib++) {
1918     temp_val[ib] += RAP_val[ib];
1919     }
1920     }
1921     }
1922    
1923     } else {
1924     j3_ub = P->mainBlock->pattern->ptr[i2+1];
1925     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1926     i_c = P->mainBlock->pattern->index[j3];
1927     //RAP_val = RA_val * P->mainBlock->val[j3];
1928     temp_val = &(P->mainBlock->val[j3*P_block_size]);
1929     for (irb=0; irb<row_block_size; irb++)
1930     for (icb=0; icb<col_block_size; icb++)
1931     RAP_val[irb+row_block_size*icb]=ZERO;
1932     for (icb=0; icb<P_block_size; icb++) {
1933     rtmp=temp_val[icb];
1934     for (irb=0; irb<row_block_size; irb++) {
1935     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1936     }
1937     }
1938    
1939     //RAP_main_val[P_marker[i_c]] += RAP_val;
1940 lgao 3817 if (MY_DEBUG){
1941     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[4] %f\n", RAP_val[0]);
1942     }
1943 lgao 3779 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1944     for (ib=0; ib<block_size; ib++) {
1945     temp_val[ib] += RAP_val[ib];
1946     }
1947    
1948     if (MY_DEBUG){
1949     if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main Visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
1950     }
1951     }
1952     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1953     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1954     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1955 lgao 3817 if (MY_DEBUG && i_c >= sum) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d -- %d,%d) 4\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);
1956 lgao 3779 //RAP_val = RA_val * P->col_coupleBlock->val[j3];
1957     temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1958     for (irb=0; irb<row_block_size; irb++)
1959     for (icb=0; icb<col_block_size; icb++)
1960     RAP_val[irb+row_block_size*icb]=ZERO;
1961     for (icb=0; icb<P_block_size; icb++) {
1962     rtmp = temp_val[icb];
1963     for (irb=0; irb<row_block_size; irb++) {
1964     RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1965     }
1966     }
1967    
1968     //RAP_couple_val[P_marker[i_c]] += RAP_val;
1969     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1970     for (ib=0; ib<block_size; ib++) {
1971     temp_val[ib] += RAP_val[ib];
1972     }
1973     }
1974     }
1975     }
1976     }
1977    
1978     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1979     if (i > row_marker) {
1980     offset = i - row_marker;
1981     temp = TMPMEMALLOC(offset, index_t);
1982     for (iptr=0; iptr<offset; iptr++)
1983     temp[iptr] = RAP_main_idx[row_marker+iptr];
1984     if (offset > 0) {
1985     #ifdef USE_QSORTG
1986     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1987     #else
1988     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1989     #endif
1990     }
1991     temp_val = TMPMEMALLOC(offset * block_size, double);
1992     for (iptr=0; iptr<offset; iptr++){
1993     k = P_marker[temp[iptr]];
1994     //temp_val[iptr] = RAP_main_val[k];
1995     memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1996     P_marker[temp[iptr]] = iptr + row_marker;
1997     }
1998     for (iptr=0; iptr<offset; iptr++){
1999     RAP_main_idx[row_marker+iptr] = temp[iptr];
2000     //RAP_main_val[row_marker+iptr] = temp_val[iptr];
2001     memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
2002     }
2003     TMPMEMFREE(temp);
2004     TMPMEMFREE(temp_val);
2005     }
2006     if (j > row_marker_ext) {
2007     offset = j - row_marker_ext;
2008     temp = TMPMEMALLOC(offset, index_t);
2009     for (iptr=0; iptr<offset; iptr++)
2010     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
2011     if (offset > 0) {
2012     #ifdef USE_QSORTG
2013     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
2014     #else
2015     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
2016     #endif
2017     }
2018     temp_val = TMPMEMALLOC(offset * block_size, double);
2019     for (iptr=0; iptr<offset; iptr++){
2020 lgao 3817 if (MY_DEBUG && temp[iptr] >= num_RAPext_cols)
2021 lgao 3779 fprintf(stderr, "rank%d index %d exceed %d\n", rank, temp[iptr], num_RAPext_cols);
2022     k = P_marker[temp[iptr] + num_Pmain_cols];
2023     //temp_val[iptr] = RAP_couple_val[k];
2024     memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
2025     P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
2026     }
2027     for (iptr=0; iptr<offset; iptr++){
2028     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
2029     //RAP_couple_val[row_marker_ext+iptr] = temp_val[iptr];
2030     memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
2031     }
2032     TMPMEMFREE(temp);
2033     TMPMEMFREE(temp_val);
2034     }
2035     }
2036    
2037     TMPMEMFREE(RA_val);
2038     TMPMEMFREE(RAP_val);
2039     TMPMEMFREE(A_marker);
2040     TMPMEMFREE(Pext_to_RAP);
2041     TMPMEMFREE(Pcouple_to_RAP);
2042     MEMFREE(RAP_ext_ptr);
2043     MEMFREE(RAP_ext_idx);
2044     MEMFREE(RAP_ext_val);
2045     Paso_SparseMatrix_free(R_main);
2046     Paso_SparseMatrix_free(R_couple);
2047    
2048     if (MY_DEBUG){
2049     if (rank == 0) fprintf(stderr, "A[%d]=%f A[%d]=%f A[%d]=%f A[%d]=%f \n", RAP_main_idx[0], RAP_main_val[0], RAP_main_idx[1], RAP_main_val[1], RAP_main_idx[2], RAP_main_val[2], RAP_main_idx[3], RAP_main_val[3]);
2050     }
2051 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP10\n", rank);
2052     if (MY_DEBUG && rank == 1) {
2053 lgao 3779 int sum = num_RAPext_cols, my_i;
2054     char *str1, *str2;
2055     str1 = TMPMEMALLOC(sum*100+100, char);
2056     str2 = TMPMEMALLOC(100, char);
2057     sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, sum);
2058     for (my_i=0; my_i<sum; my_i++){
2059     sprintf(str2, "%d ", global_id_RAP[my_i]);
2060     strcat(str1, str2);
2061     }
2062     fprintf(stderr, "%s)\n", str1);
2063     sprintf(str1, "rank %d distribution = (", rank);
2064     for (my_i=0; my_i<size; my_i++){
2065     sprintf(str2, "%d ", P->pattern->input_distribution->first_component[my_i+1]);
2066     strcat(str1, str2);
2067     }
2068     fprintf(stderr, "%s)\n", str1);
2069     TMPMEMFREE(str1);
2070     TMPMEMFREE(str2);
2071     }
2072    
2073    
2074     /* Check whether there are empty columns in RAP_couple */
2075     #pragma omp parallel for schedule(static) private(i)
2076     for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
2077     /* num of non-empty columns is stored in "k" */
2078     k = 0;
2079     j = RAP_couple_ptr[num_Pmain_cols];
2080     if (MY_DEBUG) fprintf(stderr, "rank %d CP10_1 %d %d %d\n", rank, num_RAPext_cols, num_Pmain_cols, j);
2081     for (i=0; i<j; i++) {
2082     i1 = RAP_couple_idx[i];
2083     if (P_marker[i1]) {
2084     P_marker[i1] = 0;
2085     k++;
2086     }
2087     }
2088    
2089     if (MY_DEBUG) fprintf(stderr, "rank %d CP10_2 %d\n", rank, k);
2090     /* empty columns is found */
2091     if (k < num_RAPext_cols) {
2092     temp = TMPMEMALLOC(k, index_t);
2093     k = 0;
2094     for (i=0; i<num_RAPext_cols; i++)
2095     if (!P_marker[i]) {
2096     P_marker[i] = k;
2097     temp[k] = global_id_RAP[i];
2098     k++;
2099     }
2100     for (i=0; i<j; i++) {
2101     i1 = RAP_couple_idx[i];
2102     RAP_couple_idx[i] = P_marker[i1];
2103     }
2104     num_RAPext_cols = k;
2105     TMPMEMFREE(global_id_RAP);
2106     global_id_RAP = temp;
2107     }
2108     TMPMEMFREE(P_marker);
2109     if (MY_DEBUG) fprintf(stderr, "rank %d CP11\n", rank);
2110    
2111     /******************************************************/
2112     /* Start to create the coasre level System Matrix A_c */
2113     /******************************************************/
2114     /* first, prepare the sender/receiver for the col_connector */
2115     dist = P->pattern->input_distribution->first_component;
2116     recv_len = TMPMEMALLOC(size, dim_t);
2117     send_len = TMPMEMALLOC(size, dim_t);
2118     neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
2119     offsetInShared = TMPMEMALLOC(size+1, index_t);
2120     shared = TMPMEMALLOC(num_RAPext_cols, index_t);
2121     memset(recv_len, 0, sizeof(dim_t) * size);
2122     memset(send_len, 0, sizeof(dim_t) * size);
2123     num_neighbors = 0;
2124     offsetInShared[0] = 0;
2125     for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
2126     if (MY_DEBUG && rank == 1) fprintf(stderr, "i%d j%d k%d %d %d\n", i, j, k, global_id_RAP[i], recv_len[j]);
2127     shared[i] = i + num_Pmain_cols;
2128     if (k <= global_id_RAP[i]) {
2129     if (recv_len[j] > 0) {
2130     neighbor[num_neighbors] = j;
2131     num_neighbors ++;
2132     offsetInShared[num_neighbors] = i;
2133     }
2134     while (k <= global_id_RAP[i]) {
2135     j++;
2136     k = dist[j+1];
2137     }
2138     }
2139     recv_len[j] ++;
2140     }
2141     if (recv_len[j] > 0) {
2142     neighbor[num_neighbors] = j;
2143     num_neighbors ++;
2144     offsetInShared[num_neighbors] = i;
2145     }
2146     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
2147     neighbor, shared, offsetInShared, 1, 0, mpi_info);
2148    
2149     MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
2150 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP12 %d %d\n", rank, num_neighbors, num_RAPext_cols);
2151 lgao 3779
2152     #ifdef ESYS_MPI
2153     mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
2154     mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
2155     #else
2156     mpi_requests=TMPMEMALLOC(size*2,int);
2157     mpi_stati=TMPMEMALLOC(size*2,int);
2158     #endif
2159     num_neighbors = 0;
2160     j = 0;
2161     offsetInShared[0] = 0;
2162     for (i=0; i<size; i++) {
2163     if (send_len[i] > 0) {
2164     neighbor[num_neighbors] = i;
2165     num_neighbors ++;
2166     j += send_len[i];
2167     offsetInShared[num_neighbors] = j;
2168     }
2169     }
2170     TMPMEMFREE(shared);
2171     shared = TMPMEMALLOC(j, index_t);
2172     for (i=0, j=0; i<num_neighbors; i++) {
2173     k = neighbor[i];
2174     MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
2175     mpi_info->msg_tag_counter+k,
2176     mpi_info->comm, &mpi_requests[i]);
2177     j += send_len[k];
2178     }
2179     for (i=0, j=0; i<recv->numNeighbors; i++) {
2180     k = recv->neighbor[i];
2181     MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
2182     mpi_info->msg_tag_counter+rank,
2183     mpi_info->comm, &mpi_requests[i+num_neighbors]);
2184     j += recv_len[k];
2185     }
2186     MPI_Waitall(num_neighbors + recv->numNeighbors,
2187     mpi_requests, mpi_stati);
2188     mpi_info->msg_tag_counter += size;
2189     if (MY_DEBUG) fprintf(stderr, "rank %d CP13\n", rank);
2190    
2191     j = offsetInShared[num_neighbors];
2192     offset = dist[rank];
2193     for (i=0; i<j; i++) shared[i] = shared[i] - offset;
2194     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
2195     neighbor, shared, offsetInShared, 1, 0, mpi_info);
2196    
2197     col_connector = Paso_Connector_alloc(send, recv);
2198     TMPMEMFREE(shared);
2199     Paso_SharedComponents_free(recv);
2200     Paso_SharedComponents_free(send);
2201 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP14\n", rank);
2202 lgao 3779
2203     /* now, create row distribution (output_distri) and col
2204     distribution (input_distribution) */
2205     input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
2206     output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
2207     if (MY_DEBUG) fprintf(stderr, "rank %d CP14_1\n", rank);
2208    
2209     /* then, prepare the sender/receiver for the row_connector, first, prepare
2210     the information for sender */
2211     sum = RAP_couple_ptr[num_Pmain_cols];
2212     len = TMPMEMALLOC(size, dim_t);
2213     send_ptr = TMPMEMALLOC(size, index_t*);
2214     send_idx = TMPMEMALLOC(size, index_t*);
2215     for (i=0; i<size; i++) {
2216     send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
2217     send_idx[i] = TMPMEMALLOC(sum, index_t);
2218     memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
2219     }
2220     memset(len, 0, sizeof(dim_t) * size);
2221     if (MY_DEBUG) fprintf(stderr, "rank %d CP14_2 %d\n", rank, sum);
2222     recv = col_connector->recv;
2223     sum=0;
2224     for (i_r=0; i_r<num_Pmain_cols; i_r++) {
2225     i1 = RAP_couple_ptr[i_r];
2226     i2 = RAP_couple_ptr[i_r+1];
2227     if (i2 > i1) {
2228     /* then row i_r will be in the sender of row_connector, now check
2229     how many neighbors i_r needs to be send to */
2230     for (j=i1; j<i2; j++) {
2231     i_c = RAP_couple_idx[j];
2232     /* find out the corresponding neighbor "p" of column i_c */
2233     for (p=0; p<recv->numNeighbors; p++) {
2234     if (i_c < recv->offsetInShared[p+1]) {
2235     k = recv->neighbor[p];
2236 lgao 3817 if (MY_DEBUG && rank == 1 && k == 1)
2237 lgao 3779 fprintf(stderr, "******* i_r %d i_c %d p %d ub %d numNeighbors %d\n", i_r, i_c, p, recv->offsetInShared[p+1], recv->numNeighbors);
2238     if (send_ptr[k][i_r] == 0) sum++;
2239     send_ptr[k][i_r] ++;
2240     send_idx[k][len[k]] = global_id_RAP[i_c];
2241     len[k] ++;
2242     break;
2243     }
2244     }
2245     }
2246     }
2247     }
2248     if (MY_DEBUG) fprintf(stderr, "rank %d CP14_3 %d\n", rank, send_ptr[1][0]);
2249     if (global_id_RAP) {
2250     TMPMEMFREE(global_id_RAP);
2251     global_id_RAP = NULL;
2252     }
2253     if (MY_DEBUG) fprintf(stderr, "rank %d CP14_4 %d sum%d cols%d\n", rank, len[0], sum, num_RAPext_cols);
2254    
2255     /* now allocate the sender */
2256     shared = TMPMEMALLOC(sum, index_t);
2257     memset(send_len, 0, sizeof(dim_t) * size);
2258     num_neighbors=0;
2259     offsetInShared[0] = 0;
2260     if (MY_DEBUG) fprintf(stderr, "rank %d CP14_5 %d\n", rank, size);
2261     for (p=0, k=0; p<size; p++) {
2262     if (MY_DEBUG && rank == 1) fprintf(stderr, "rank %d CP14_6 %d %d %d\n", rank, p, k, num_neighbors);
2263     for (i=0; i<num_Pmain_cols; i++) {
2264     if (send_ptr[p][i] > 0) {
2265     shared[k] = i;
2266     k++;
2267     send_ptr[p][send_len[p]] = send_ptr[p][i];
2268     send_len[p]++;
2269     }
2270     }
2271     if (k > offsetInShared[num_neighbors]) {
2272     neighbor[num_neighbors] = p;
2273     num_neighbors ++;
2274     offsetInShared[num_neighbors] = k;
2275     }
2276     }
2277     if (MY_DEBUG) fprintf(stderr, "rank %d CP14_8 %d %d %d %d %d\n", rank, k, send_len[0], send_len[1], offsetInShared[1], k);
2278     send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
2279     neighbor, shared, offsetInShared, 1, 0, mpi_info);
2280    
2281     /* send/recv number of rows will be sent from current proc
2282     recover info for the receiver of row_connector from the sender */
2283     MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
2284     if (MY_DEBUG) fprintf(stderr, "rank %d CP15 %d %d\n", rank, recv_len[0], recv_len[1]);
2285     num_neighbors = 0;
2286     offsetInShared[0] = 0;
2287     j = 0;
2288     for (i=0; i<size; i++) {
2289     if (i != rank && recv_len[i] > 0) {
2290     neighbor[num_neighbors] = i;
2291     num_neighbors ++;
2292     j += recv_len[i];
2293     offsetInShared[num_neighbors] = j;
2294     }
2295     }
2296     if (MY_DEBUG) fprintf(stderr, "rank %d CP15_1\n", rank);
2297     TMPMEMFREE(shared);
2298     TMPMEMFREE(recv_len);
2299     shared = TMPMEMALLOC(j, index_t);
2300     k = offsetInShared[num_neighbors];
2301     for (i=0; i<k; i++) {
2302     shared[i] = i + num_Pmain_cols;
2303     }
2304     if (MY_DEBUG) fprintf(stderr, "rank %d CP15_2\n", rank);
2305     recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
2306     neighbor, shared, offsetInShared, 1, 0, mpi_info);
2307     if (MY_DEBUG) fprintf(stderr, "rank %d CP15_3\n", rank);
2308     row_connector = Paso_Connector_alloc(send, recv);
2309     TMPMEMFREE(shared);
2310     Paso_SharedComponents_free(recv);
2311     Paso_SharedComponents_free(send);
2312 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP16 %d R%d S%d\n", rank, k, num_neighbors, send->numNeighbors);
2313 lgao 3779
2314     /* send/recv pattern->ptr for rowCoupleBlock */
2315     num_RAPext_rows = offsetInShared[num_neighbors];
2316     row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
2317     for (p=0; p<num_neighbors; p++) {
2318     j = offsetInShared[p];
2319     i = offsetInShared[p+1];
2320     MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
2321     mpi_info->msg_tag_counter+neighbor[p],
2322     mpi_info->comm, &mpi_requests[p]);
2323     if (MY_DEBUG) fprintf(stderr, "rank %d RECV %d from %d tag %d\n", rank, i-j, neighbor[p], mpi_info->msg_tag_counter+neighbor[p]);
2324     }
2325     send = row_connector->send;
2326     for (p=0; p<send->numNeighbors; p++) {
2327     MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
2328     MPI_INT, send->neighbor[p],
2329     mpi_info->msg_tag_counter+rank,
2330     mpi_info->comm, &mpi_requests[p+num_neighbors]);
2331     if (MY_DEBUG) fprintf(stderr, "rank %d SEND %d TO %d tag %d\n", rank, send_len[send->neighbor[p]], send->neighbor[p], mpi_info->msg_tag_counter+rank);
2332     }
2333     MPI_Waitall(num_neighbors + send->numNeighbors,
2334     mpi_requests, mpi_stati);
2335     mpi_info->msg_tag_counter += size;
2336     TMPMEMFREE(send_len);
2337 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP17\n", rank);
2338 lgao 3779
2339     sum = 0;
2340     for (i=0; i<num_RAPext_rows; i++) {
2341     k = row_couple_ptr[i];
2342     row_couple_ptr[i] = sum;
2343     sum += k;
2344     }
2345     row_couple_ptr[num_RAPext_rows] = sum;
2346    
2347     /* send/recv pattern->index for rowCoupleBlock */
2348     k = row_couple_ptr[num_RAPext_rows];
2349     row_couple_idx = MEMALLOC(k, index_t);
2350     for (p=0; p<num_neighbors; p++) {
2351     j1 = row_couple_ptr[offsetInShared[p]];
2352     j2 = row_couple_ptr[offsetInShared[p+1]];
2353     MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
2354     mpi_info->msg_tag_counter+neighbor[p],
2355     mpi_info->comm, &mpi_requests[p]);
2356     }
2357     for (p=0; p<send->numNeighbors; p++) {
2358     MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
2359     MPI_INT, send->neighbor[p],
2360     mpi_info->msg_tag_counter+rank,
2361     mpi_info->comm, &mpi_requests[p+num_neighbors]);
2362     }
2363     MPI_Waitall(num_neighbors + send->numNeighbors,
2364     mpi_requests, mpi_stati);
2365     mpi_info->msg_tag_counter += size;
2366    
2367     offset = input_dist->first_component[rank];
2368     k = row_couple_ptr[num_RAPext_rows];
2369     for (i=0; i<k; i++) {
2370     row_couple_idx[i] -= offset;
2371     }
2372    
2373     for (i=0; i<size; i++) {
2374     TMPMEMFREE(send_ptr[i]);
2375     TMPMEMFREE(send_idx[i]);
2376     }
2377     TMPMEMFREE(send_ptr);
2378     TMPMEMFREE(send_idx);
2379     TMPMEMFREE(len);
2380     TMPMEMFREE(offsetInShared);
2381     TMPMEMFREE(neighbor);
2382     TMPMEMFREE(mpi_requests);
2383     TMPMEMFREE(mpi_stati);
2384     Esys_MPIInfo_free(mpi_info);
2385 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP18\n", rank);
2386 lgao 3779
2387     /* Now, we can create pattern for mainBlock and coupleBlock */
2388     main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
2389     num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
2390     col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2391     num_Pmain_cols, num_RAPext_cols,
2392     RAP_couple_ptr, RAP_couple_idx);
2393     row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2394     num_RAPext_rows, num_Pmain_cols,
2395     row_couple_ptr, row_couple_idx);
2396    
2397     /* next, create the system matrix */
2398     pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2399     output_dist, input_dist, main_pattern, col_couple_pattern,
2400     row_couple_pattern, col_connector, row_connector);
2401     out = Paso_SystemMatrix_alloc(A->type, pattern,
2402     row_block_size, col_block_size, FALSE);
2403     if (MY_DEBUG) fprintf(stderr, "rank %d CP19\n", rank);
2404    
2405     /* finally, fill in the data*/
2406     memcpy(out->mainBlock->val, RAP_main_val,
2407     out->mainBlock->len* sizeof(double));
2408     memcpy(out->col_coupleBlock->val, RAP_couple_val,
2409     out->col_coupleBlock->len * sizeof(double));
2410    
2411     /* Clean up */
2412     Paso_SystemMatrixPattern_free(pattern);
2413     Paso_Pattern_free(main_pattern);
2414     Paso_Pattern_free(col_couple_pattern);
2415     Paso_Pattern_free(row_couple_pattern);
2416     Paso_Connector_free(col_connector);
2417     Paso_Connector_free(row_connector);
2418     Paso_Distribution_free(output_dist);
2419     Paso_Distribution_free(input_dist);
2420     TMPMEMFREE(RAP_main_val);
2421     TMPMEMFREE(RAP_couple_val);
2422     if (Esys_noError()) {
2423     return out;
2424     } else {
2425     Paso_SystemMatrix_free(out);
2426     return NULL;
2427     }
2428     }
2429    
2430    
2431     Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2432     Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2433     {
2434     Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2435     Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
2436     Paso_SystemMatrix *out=NULL;
2437     Paso_SystemMatrixPattern *pattern=NULL;
2438     Paso_Distribution *input_dist=NULL, *output_dist=NULL;
2439     Paso_SharedComponents *send =NULL, *recv=NULL;
2440     Paso_Connector *col_connector=NULL, *row_connector=NULL;
2441     Paso_Coupler *coupler=NULL;
2442     Paso_Pattern *main_pattern=NULL;
2443     Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2444     const dim_t row_block_size=A->row_block_size;
2445     const dim_t col_block_size=A->col_block_size;
2446     const dim_t block_size = A->block_size;
2447     const dim_t P_block_size = P->block_size;
2448     const dim_t num_threads=omp_get_max_threads();
2449     const double ZERO = 0.0;
2450     double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2451     double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2452     index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2453     index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2454     index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2455     index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2456     index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2457     index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2458     index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2459     index_t sum, i, j, k, iptr, irb, icb, ib;
2460     index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2461     index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2462     index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2463     index_t row_marker_ext, *where_p=NULL;
2464     index_t **send_ptr=NULL, **send_idx=NULL;
2465     dim_t l, p, q, global_label, num_neighbors;
2466     dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2467     Esys_MPI_rank *neighbor=NULL;
2468     #ifdef ESYS_MPI
2469     MPI_Request* mpi_requests=NULL;
2470     MPI_Status* mpi_stati=NULL;
2471     #else
2472     int *mpi_requests=NULL, *mpi_stati=NULL;
2473     #endif
2474    
2475     /* two sparse matrixes R_main and R_couple will be generate, as the
2476     transpose of P_main and P_col_couple, respectively. Note that,
2477     R_couple is actually the row_coupleBlock of R (R=P^T) */
2478     R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2479     R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2480    
2481 lgao 3763 if (MY_DEBUG) fprintf(stderr, "CP1\n");
2482    
2483     /* generate P_ext, i.e. portion of P that is tored on neighbor procs
2484     and needed locally for triple matrix product RAP
2485     to be specific, P_ext (on processor k) are group of rows in P, where
2486     the list of rows from processor q is given by
2487     A->col_coupler->connector->send->shared[rPtr...]
2488     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2489     on q.
2490     P_ext is represented by two sparse matrixes P_ext_main and
2491     P_ext_couple */
2492     Paso_Preconditioner_AMG_extendB(A, P);
2493 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP2\n", rank);
2494 lgao 3763
2495     /* count the number of cols in P_ext_couple, resize the pattern of
2496     sparse matrix P_ext_couple with new compressed order, and then
2497     build the col id mapping from P->col_coupleBlock to
2498     P_ext_couple */
2499     num_Pmain_cols = P->mainBlock->numCols;
2500     num_Pcouple_cols = P->col_coupleBlock->numCols;
2501     num_Acouple_cols = A->col_coupleBlock->numCols;
2502     num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2503     sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2504     offset = P->col_distribution->first_component[rank];
2505     num_Pext_cols = 0;
2506     if (P->global_id) {
2507     /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2508     and P->col_coupleBlock */
2509     iptr = 0;
2510     if (num_Pcouple_cols || sum > 0) {
2511     temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
2512 lgao 3767 for (; iptr<sum; iptr++){
2513 lgao 3763 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2514 lgao 3767 if (MY_DEBUG && rank ==1)
2515     fprintf(stderr, "rank %d remote_block[%d] = %d\n", rank, iptr, temp[iptr]);
2516     }
2517     for (j=0; j<num_Pcouple_cols; j++, iptr++){
2518 lgao 3763 temp[iptr] = P->global_id[j];
2519 lgao 3767 if (MY_DEBUG && rank ==1)
2520     fprintf(stderr, "rank %d global_id[%d] = %d\n", rank, j, P->global_id[j]);
2521     }
2522 lgao 3763 }
2523     if (iptr) {
2524     #ifdef USE_QSORTG
2525     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2526     #else
2527     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2528     #endif
2529     num_Pext_cols = 1;
2530     i = temp[0];
2531     for (j=1; j<iptr; j++) {
2532     if (temp[j] > i) {
2533     i = temp[j];
2534     temp[num_Pext_cols++] = i;
2535     }
2536     }
2537     }
2538    
2539     /* resize the pattern of P_ext_couple */
2540     if(num_Pext_cols){
2541     global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
2542     for (i=0; i<num_Pext_cols; i++)
2543     global_id_P[i] = temp[i];
2544 lgao 3767 if (MY_DEBUG && rank == 1) {
2545     int my_i, sum=num_Pext_cols;
2546     char *str1, *str2;
2547     str1 = TMPMEMALLOC(sum*100+100, char);
2548     str2 = TMPMEMALLOC(100, char);
2549     sprintf(str1, "rank %d global_id_P[%d] = (", rank, sum);
2550     for (my_i=0; my_i<sum; my_i++) {
2551     sprintf(str2, "%d ", global_id_P[my_i]);
2552     strcat(str1, str2);
2553     }
2554     fprintf(stderr, "%s)\n", str1);
2555     TMPMEMFREE(str1);
2556     TMPMEMFREE(str2);
2557     }
2558    
2559 lgao 3763 }
2560     if (num_Pcouple_cols || sum > 0)
2561     TMPMEMFREE(temp);
2562     for (i=0; i<sum; i++) {
2563     where_p = (index_t *)bsearch(
2564     &(P->remote_coupleBlock->pattern->index[i]),
2565     global_id_P, num_Pext_cols,
2566     sizeof(index_t), Paso_comparIndex);
2567     P->remote_coupleBlock->pattern->index[i] =
2568     (index_t)(where_p -global_id_P);
2569     }
2570    
2571     /* build the mapping */
2572     if (num_Pcouple_cols) {
2573     Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
2574     iptr = 0;
2575     for (i=0; i<num_Pext_cols; i++)
2576     if (global_id_P[i] == P->global_id[iptr]) {
2577     Pcouple_to_Pext[iptr++] = i;
2578     if (iptr == num_Pcouple_cols) break;
2579     }
2580     }
2581     }
2582    
2583 lgao 3817 if (MY_DEBUG) fprintf(stderr, "CP3\n");
2584 lgao 3763 if (MY_DEBUG && rank == 0) {
2585     fprintf(stderr, "Rank %d Now Prolongation Matrix ************************\n", rank);
2586     Paso_SystemMatrix_print(P);
2587     fprintf(stderr, "Rank %d Now Restriction Matrix ************************\n", rank);
2588     Paso_SystemMatrix_print(R);
2589     }
2590    
2591     /* alloc and initialize the makers */
2592     sum = num_Pext_cols + num_Pmain_cols;
2593     P_marker = TMPMEMALLOC(sum, index_t);
2594     A_marker = TMPMEMALLOC(num_A_cols, index_t);
2595     #pragma omp parallel for private(i) schedule(static)
2596     for (i=0; i<sum; i++) P_marker[i] = -1;
2597     #pragma omp parallel for private(i) schedule(static)
2598     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2599    
2600     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_1 %d\n", rank, num_Pcouple_cols);
2601     /* Now, count the size of RAP_ext. Start with rows in R_couple */
2602     sum = 0;
2603     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2604     row_marker = sum;
2605     /* then, loop over elements in row i_r of R_couple */
2606     j1_ub = R_couple->pattern->ptr[i_r+1];
2607     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_2 %d (%d %d)\n", rank, i_r, R_couple->pattern->ptr[i_r], j1_ub);
2608     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2609     i1 = R_couple->pattern->index[j1];
2610     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_3 %d %d\n", rank, j1, i1);
2611     /* then, loop over elements in row i1 of A->col_coupleBlock */
2612     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2613     if (MY_DEBUG) fprintf(stderr, "rank %d CP3_4 %d %d %d\n", rank, j1, i1, j2_ub);
2614     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2615     i2 = A->col_coupleBlock->pattern->index[j2];
2616    
2617     /* check whether entry RA[i_r, i2] has been previously visited.
2618     RAP new entry is possible only if entry RA[i_r, i2] has not
2619     been visited yet */
2620     if (A_marker[i2] != i_r) {
2621     /* first, mark entry RA[i_r,i2] as visited */
2622     A_marker[i2] = i_r;
2623    
2624     /* then loop over elements in row i2 of P_ext_main */
2625     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2626     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2627     i_c = P->row_coupleBlock->pattern->index[j3];
2628    
2629     /* check whether entry RAP[i_r,i_c] has been created.
2630     If not yet created, create the entry and increase
2631     the total number of elements in RAP_ext */
2632     if (P_marker[i_c] < row_marker) {
2633     P_marker[i_c] = sum;
2634     sum++;
2635     }
2636     }
2637    
2638     /* loop over elements in row i2 of P_ext_couple, do the same */
2639     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2640     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2641     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2642     if (P_marker[i_c] < row_marker) {
2643     P_marker[i_c] = sum;
2644     sum++;
2645     }
2646     }
2647     }
2648     }
2649    
2650     /* now loop over elements in row i1 of A->mainBlock, repeat
2651     the process we have done to block A->col_coupleBlock */
2652     j2_ub = A->mainBlock->pattern->ptr[i1+1];
2653     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2654     i2 = A->mainBlock->pattern->index[j2];
2655     if (A_marker[i2 + num_Acouple_cols] != i_r) {
2656     A_marker[i2 + num_Acouple_cols] = i_r;
2657     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2658     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2659     i_c = P->mainBlock->pattern->index[j3];
2660     if (P_marker[i_c] < row_marker) {
2661     P_marker[i_c] = sum;
2662     sum++;
2663     }
2664     }
2665     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2666     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2667     /* note that we need to map the column index in
2668     P->col_coupleBlock back into the column index in
2669     P_ext_couple */
2670     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2671 lgao 3779 if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d-%d out of %d+%d) 5\n", rank, i_c, P->col_coupleBlock->pattern->index[j3], num_Pext_cols, num_Pmain_cols);
2672 lgao 3763 if (P_marker[i_c] < row_marker) {
2673     P_marker[i_c] = sum;
2674     sum++;
2675     }
2676     }
2677     }
2678     }
2679     }
2680     }
2681 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP4 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);
2682 lgao 3763
2683     /* Now we have the number of non-zero elements in RAP_ext, allocate
2684     PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2685     RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
2686     RAP_ext_idx = MEMALLOC(sum, index_t);
2687 lgao 3779 RAP_ext_val = MEMALLOC(sum * block_size, double);
2688     RA_val = TMPMEMALLOC(block_size, double);
2689     RAP_val = TMPMEMALLOC(block_size, double);
2690 lgao 3763
2691     /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
2692     sum = num_Pext_cols + num_Pmain_cols;
2693     #pragma omp parallel for private(i) schedule(static)
2694     for (i=0; i<sum; i++) P_marker[i] = -1;
2695     #pragma omp parallel for private(i) schedule(static)
2696     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2697     if (MY_DEBUG) fprintf(stderr, "rank %d CP4_1 sum %d\n", rank, sum);
2698     sum = 0;
2699     RAP_ext_ptr[0] = 0;
2700     for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2701     row_marker = sum;
2702     if (MY_DEBUG) fprintf(stderr, "rank %d CP4_2 i_r %d sum %d\n", rank, i_r, sum);
2703     /* then, loop over elements in row i_r of R_couple */
2704     j1_ub = R_couple->pattern->ptr[i_r+1];
2705     for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2706     i1 = R_couple->pattern->index[j1];
2707 lgao 3779 R_val = &(R_couple->val[j1*block_size]);
2708 lgao 3763
2709     /* then, loop over elements in row i1 of A->col_coupleBlock */
2710     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2711     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2712     i2 = A->col_coupleBlock->pattern->index[j2];
2713 lgao 3779 //RA_val = R_val * A->col_coupleBlock->val[j2];
2714     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2715     for (irb=0; irb<row_block_size; irb++) {
2716     for (icb=0; icb<col_block_size; icb++) {
2717     rtmp = ZERO;
2718     for (ib=0; ib<col_block_size; ib++) {
2719     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2720     }
2721     RA_val[irb+row_block_size*icb]=rtmp;
2722     }
2723     }
2724 lgao 3763
2725     /* check whether entry RA[i_r, i2] has been previously visited.
2726     RAP new entry is possible only if entry RA[i_r, i2] has not
2727     been visited yet */
2728     if (A_marker[i2] != i_r) {
2729     /* first, mark entry RA[i_r,i2] as visited */
2730     A_marker[i2] = i_r;
2731    
2732     /* then loop over elements in row i2 of P_ext_main */
2733     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2734     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2735     i_c = P->row_coupleBlock->pattern->index[j3];
2736 lgao 3779 // RAP_val = RA_val * P->row_coupleBlock->val[j3];
2737     temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2738     for (irb=0; irb<row_block_size; irb++) {
2739     for (icb=0; icb<col_block_size; icb++) {
2740     rtmp = ZERO;
2741     for (ib=0; ib<col_block_size; ib++) {
2742     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2743     }
2744     RAP_val[irb+row_block_size*icb]=rtmp;
2745     }
2746     }
2747 lgao 3763
2748 lgao 3779
2749 lgao 3763 /* check whether entry RAP[i_r,i_c] has been created.
2750     If not yet created, create the entry and increase
2751     the total number of elements in RAP_ext */
2752     if (P_marker[i_c] < row_marker) {
2753     P_marker[i_c] = sum;
2754 lgao 3779 //RAP_ext_val[sum*block_size] = RAP_val;
2755     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2756 lgao 3763 RAP_ext_idx[sum] = i_c + offset;
2757     if (MY_DEBUG){
2758 lgao 3779 if (rank == 1) fprintf(stderr, "1-ext[%d (%d,%d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum*block_size]);
2759 lgao 3763 }
2760     sum++;
2761     } else {
2762 lgao 3779 //RAP_ext_val[P_marker[i_c] * block_size] += RAP_val;
2763     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2764     for (ib=0; ib<block_size; ib++) {
2765     temp_val[ib] += RAP_val[ib];
2766     }
2767    
2768 lgao 3763 if (MY_DEBUG){
2769 lgao 3779 if (rank == 1) fprintf(stderr, "1-ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);
2770 lgao 3763 }
2771     }
2772     }
2773    
2774     /* loop over elements in row i2 of P_ext_couple, do the same */
2775     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2776     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2777     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2778 lgao 3779 //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
2779     temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2780     for (irb=0; irb<row_block_size; irb++) {
2781     for (icb=0; icb<col_block_size; icb++) {
2782     rtmp = ZERO;
2783     for (ib=0; ib<col_block_size; ib++) {
2784     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2785     }
2786     RAP_val[irb+row_block_size*icb]=rtmp;
2787     }
2788     }
2789 lgao 3763
2790     if (P_marker[i_c] < row_marker) {
2791     P_marker[i_c] = sum;
2792 lgao 3779 //RAP_ext_val[sum] = RAP_val;
2793     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2794 lgao 3763 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2795     if (MY_DEBUG){
2796 lgao 3779 if (rank == 1) fprintf(stderr, "ext[%d (%d,%d)]=%f (R%f, A(%d %d)%f, P%f)\n", sum, i_r, i_c, RAP_ext_val[sum], *R_val, i1, i2, A->col_coupleBlock->val[j2], P->remote_coupleBlock->val[j3]);
2797 lgao 3763 }
2798     sum++;
2799     } else {
2800 lgao 3779 //RAP_ext_val[P_marker[i_c]] += RAP_val;
2801     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2802     for (ib=0; ib<block_size; ib++) {
2803     temp_val[ib] += RAP_val[ib];
2804     }
2805    
2806 lgao 3763 if (MY_DEBUG){
2807 lgao 3779 if (rank == 1) fprintf(stderr, "ext[(%d,%d)]+=%f\n", i_r, i_c, *RAP_val);
2808 lgao 3763 }
2809     }
2810     }
2811    
2812     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2813     Only the contributions are added. */
2814     } else {
2815     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2816     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2817     i_c = P->row_coupleBlock->pattern->index[j3];
2818 lgao 3779 //RAP_val = RA_val * P->row_coupleBlock->val[j3];
2819     temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2820     for (irb=0; irb<row_block_size; irb++) {
2821     for (icb=0; icb<col_block_size; icb++) {
2822     rtmp = ZERO;
2823     for (ib=0; ib<col_block_size; ib++) {
2824     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2825     }
2826     RAP_val[irb+row_block_size*icb]=rtmp;
2827     }
2828     }
2829    
2830     //RAP_ext_val[P_marker[i_c]] += RAP_val;
2831     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2832     for (ib=0; ib<block_size; ib++) {
2833     temp_val[ib] += RAP_val[ib];
2834     }
2835    
2836 lgao 3763 if (MY_DEBUG){
2837 lgao 3779 if (rank == 1) fprintf(stderr, "Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
2838 lgao 3763 }
2839     }
2840     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2841     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2842     i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2843 lgao 3779 //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
2844     temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2845     for (irb=0; irb<row_block_size; irb++) {
2846     for (icb=0; icb<col_block_size; icb++) {
2847     rtmp = ZERO;
2848     for (ib=0; ib<col_block_size; ib++) {
2849     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2850     }
2851     RAP_val[irb+row_block_size*icb]=rtmp;
2852     }
2853     }
2854    
2855     //RAP_ext_val[P_marker[i_c]] += RAP_val;
2856     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2857     for (ib=0; ib<block_size; ib++) {
2858     temp_val[ib] += RAP_val[ib];
2859     }
2860    
2861 lgao 3763 if (MY_DEBUG){
2862 lgao 3779 if (rank == 1) fprintf(stderr, "Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
2863 lgao 3763 }
2864     }
2865     }
2866     }
2867    
2868     /* now loop over elements in row i1 of A->mainBlock, repeat
2869     the process we have done to block A->col_coupleBlock */
2870     j2_ub = A->mainBlock->pattern->ptr[i1+1];
2871     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2872     i2 = A->mainBlock->pattern->index[j2];
2873 lgao 3779 //RA_val = R_val * A->mainBlock->val[j2];
2874     temp_val = &(A->mainBlock->val[j2*block_size]);
2875     for (irb=0; irb<row_block_size; irb++) {
2876     for (icb=0; icb<col_block_size; icb++) {
2877     rtmp = ZERO;
2878     for (ib=0; ib<col_block_size; ib++) {
2879     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2880     }
2881     RA_val[irb+row_block_size*icb]=rtmp;
2882     }
2883     }
2884    
2885 lgao 3763 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2886     A_marker[i2 + num_Acouple_cols] = i_r;
2887     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2888     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2889     i_c = P->mainBlock->pattern->index[j3];
2890 lgao 3779 //RAP_val = RA_val * P->mainBlock->val[j3];
2891     temp_val = &(P->mainBlock->val[j3*block_size]);
2892     for (irb=0; irb<row_block_size; irb++) {
2893     for (icb=0; icb<col_block_size; icb++) {
2894     rtmp = ZERO;
2895     for (ib=0; ib<col_block_size; ib++) {
2896     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2897     }
2898     RAP_val[irb+row_block_size*icb]=rtmp;
2899     }
2900     }
2901    
2902 lgao 3763 if (P_marker[i_c] < row_marker) {
2903     P_marker[i_c] = sum;
2904 lgao 3779 //RAP_ext_val[sum] = RAP_val;
2905     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2906 lgao 3763 RAP_ext_idx[sum] = i_c + offset;
2907     if (MY_DEBUG){
2908     if (rank == 1) fprintf(stderr, "Main 1-ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);
2909     }
2910     sum++;
2911     } else {
2912 lgao 3779 //RAP_ext_val[P_marker[i_c]] += RAP_val;
2913     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2914     for (ib=0; ib<block_size; ib++) {
2915     temp_val[ib] += RAP_val[ib];
2916     }
2917    
2918 lgao 3763 if (MY_DEBUG){
2919 lgao 3779 if (rank == 1) fprintf(stderr, "Main 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
2920 lgao 3763 }
2921     }
2922     }
2923     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2924     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2925     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2926 lgao 3817 if (MY_DEBUG && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 1\n", rank, i_c, num_Pext_cols + num_Pmain_cols);
2927 lgao 3779 //RAP_val = RA_val * P->col_coupleBlock->val[j3];
2928     temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2929     for (irb=0; irb<row_block_size; irb++) {
2930     for (icb=0; icb<col_block_size; icb++) {
2931     rtmp = ZERO;
2932     for (ib=0; ib<col_block_size; ib++) {
2933     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2934     }
2935     RAP_val[irb+row_block_size*icb]=rtmp;
2936     }
2937     }
2938    
2939 lgao 3763 if (P_marker[i_c] < row_marker) {
2940     P_marker[i_c] = sum;
2941 lgao 3779 //RAP_ext_val[sum] = RAP_val;
2942     memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2943 lgao 3763 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2944     if (MY_DEBUG){
2945     if (rank == 1) fprintf(stderr, "Main ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);
2946     }
2947     sum++;
2948     } else {
2949 lgao 3779 //RAP_ext_val[P_marker[i_c]] += RAP_val;
2950     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2951     for (ib=0; ib<block_size; ib++) {
2952     temp_val[ib] += RAP_val[ib];
2953     }
2954    
2955 lgao 3763 if (MY_DEBUG){
2956 lgao 3779 if (rank == 1) fprintf(stderr, "Main ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
2957 lgao 3763 }
2958     }
2959     }
2960     } else {
2961     j3_ub = P->mainBlock->pattern->ptr[i2+1];
2962     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2963     i_c = P->mainBlock->pattern->index[j3];
2964 lgao 3779 //RAP_val = RA_val * P->mainBlock->val[j3];
2965     temp_val = &(P->mainBlock->val[j3*block_size]);
2966     for (irb=0; irb<row_block_size; irb++) {
2967     for (icb=0; icb<col_block_size; icb++) {
2968     rtmp = ZERO;
2969     for (ib=0; ib<col_block_size; ib++) {
2970     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2971     }
2972     RAP_val[irb+row_block_size*icb]=rtmp;
2973     }
2974     }
2975    
2976     //RAP_ext_val[P_marker[i_c]] += RAP_val;
2977     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2978     for (ib=0; ib<block_size; ib++) {
2979     temp_val[ib] += RAP_val[ib];
2980     }
2981    
2982 lgao 3763 if (MY_DEBUG){
2983 lgao 3779 if (rank == 1) fprintf(stderr, "Main Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
2984 lgao 3763 }
2985     }
2986     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2987     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2988     i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2989 lgao 3779 //RAP_val = RA_val * P->col_coupleBlock->val[j3];
2990     temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2991     for (irb=0; irb<row_block_size; irb++) {
2992     for (icb=0; icb<col_block_size; icb++) {
2993     rtmp = ZERO;
2994     for (ib=0; ib<col_block_size; ib++) {
2995     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2996     }
2997     RAP_val[irb+row_block_size*icb]=rtmp;
2998     }
2999     }
3000    
3001     //RAP_ext_val[P_marker[i_c]] += RAP_val;
3002     temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
3003     for (ib=0; ib<block_size; ib++) {
3004     temp_val[ib] += RAP_val[ib];
3005     }
3006    
3007 lgao 3763 if (MY_DEBUG){
3008 lgao 3779 if (rank == 1) fprintf(stderr, "Main Visited ext[%d, %d]+=%f\n", i_r, i_c, *RAP_val);
3009 lgao 3763 }
3010     }
3011     }
3012     }
3013     }
3014     RAP_ext_ptr[i_r+1] = sum;
3015     }
3016     TMPMEMFREE(P_marker);
3017     TMPMEMFREE(Pcouple_to_Pext);
3018     if (MY_DEBUG) fprintf(stderr, "rank %d CP5 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);
3019    
3020     if (MY_DEBUG) {
3021     char *str1, *str2;
3022     str1 = TMPMEMALLOC(sum*100+100, char);
3023     str2 = TMPMEMALLOC(15, char);
3024     sprintf(str1, "rank %d RAP_ext_idx[%d] = (", rank, sum);
3025     for (i=0; i<sum; i++) {
3026     sprintf(str2, "%d ", RAP_ext_idx[i]);
3027     strcat(str1, str2);
3028     }
3029     fprintf(stderr, "%s)\n", str1);
3030     TMPMEMFREE(str1);
3031     TMPMEMFREE(str2);
3032     }
3033    
3034     /* Now we have part of RAP[r,c] where row "r" is the list of rows
3035     which is given by the column list of P->col_coupleBlock, and
3036     column "c" is the list of columns which possiblely covers the
3037     whole column range of systme martris P. This part of data is to
3038     be passed to neighboring processors, and added to corresponding
3039     RAP[r,c] entries in the neighboring processors */
3040     Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
3041 lgao 3779 &RAP_ext_val, global_id_P, block_size);
3042 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP6\n", rank);
3043 lgao 3763
3044     num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
3045     sum = RAP_ext_ptr[num_RAPext_rows];
3046     if (MY_DEBUG) fprintf(stderr, "rank %d CP6_0 %d %d (%d %d)\n", rank, sum, num_Pext_cols, num_RAPext_rows, num_Pcouple_cols);
3047     num_RAPext_cols = 0;
3048     if (num_Pext_cols || sum > 0) {
3049     temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
3050     j1_ub = offset + num_Pmain_cols;
3051     for (i=0, iptr=0; i<sum; i++) {
3052     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */ {
3053     temp[iptr++] = RAP_ext_idx[i]; /* XXX */
3054 lgao 3767 if (MY_DEBUG && rank ==1) fprintf(stderr, "RAP_ext_idx[%d]=%d\n", i, RAP_ext_idx[i]);
3055 lgao 3763 }
3056     }
3057 lgao 3767 for (j=0; j<num_Pext_cols; j++, iptr++){
3058 lgao 3763 temp[iptr] = global_id_P[j];
3059 lgao 3767 }
3060 lgao 3763
3061     if (iptr) {
3062     #ifdef USE_QSORTG
3063     qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
3064     #else
3065     qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
3066     #endif
3067     num_RAPext_cols = 1;
3068     i = temp[0];
3069     for (j=1; j<iptr; j++) {
3070     if (temp[j] > i) {
3071     i = temp[j];
3072     temp[num_RAPext_cols++] = i;
3073     }
3074     }
3075    
3076     if (MY_DEBUG) {
3077     char *str1, *str2;
3078     index_t my_sum=iptr, i;
3079     str1 = TMPMEMALLOC(my_sum*20+100, char);
3080     str2 = TMPMEMALLOC(15, char);
3081     sprintf(str1, "rank %d temp[%d] = (", rank, my_sum);
3082     for (i=0; i<my_sum; i++){
3083     sprintf(str2, "%d ", temp[i]);
3084     strcat(str1, str2);
3085     }
3086     fprintf(stderr, "%s)\n", str1);
3087     TMPMEMFREE(str1);
3088     TMPMEMFREE(str2);
3089     }
3090     }
3091     }
3092     if (MY_DEBUG) fprintf(stderr, "rank %d CP6_1 %d %d\n", rank, iptr, num_RAPext_cols);
3093    
3094     /* resize the pattern of P_ext_couple */
3095     if(num_RAPext_cols){
3096     global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
3097     for (i=0; i<num_RAPext_cols; i++)
3098     global_id_RAP[i] = temp[i];
3099     }
3100     if (MY_DEBUG) {
3101     char *str1, *str2;
3102     index_t my_sum=num_RAPext_cols, i;
3103     str1 = TMPMEMALLOC(my_sum*20+100, char);
3104     str2 = TMPMEMALLOC(15, char);
3105     sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, my_sum);
3106     for (i=0; i<my_sum; i++){
3107     sprintf(str2, "%d ", global_id_RAP[i]);
3108     strcat(str1, str2);
3109     }
3110     fprintf(stderr, "%s)\n", str1);
3111     TMPMEMFREE(str1);
3112     TMPMEMFREE(str2);
3113     }
3114     if (num_Pext_cols || sum > 0)
3115     TMPMEMFREE(temp);
3116     j1_ub = offset + num_Pmain_cols;
3117     for (i=0; i<sum; i++) {
3118     if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
3119     where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
3120     /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
3121     RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
3122     } else
3123     RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
3124     }
3125     if (MY_DEBUG) fprintf(stderr, "rank %d CP7 %d\n", rank, num_Pext_cols);
3126    
3127     /* build the mapping */
3128     if (num_Pcouple_cols) {
3129     Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
3130     iptr = 0;
3131     for (i=0; i<num_RAPext_cols; i++)
3132     if (global_id_RAP[i] == P->global_id[iptr]) {
3133     Pcouple_to_RAP[iptr++] = i;
3134     if (iptr == num_Pcouple_cols) break;
3135     }
3136     }
3137    
3138     if (num_Pext_cols) {
3139     Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
3140     iptr = 0;
3141     for (i=0; i<num_RAPext_cols; i++)
3142     if (global_id_RAP[i] == global_id_P[iptr]) {
3143     Pext_to_RAP[iptr++] = i;
3144     if (iptr == num_Pext_cols) break;
3145     }
3146     }
3147    
3148     if (global_id_P){
3149     TMPMEMFREE(global_id_P);
3150     global_id_P = NULL;
3151     }
3152    
3153     /* alloc and initialize the makers */
3154     sum = num_RAPext_cols + num_Pmain_cols;
3155     P_marker = TMPMEMALLOC(sum, index_t);
3156     #pragma omp parallel for private(i) schedule(static)
3157     for (i=0; i<sum; i++) P_marker[i] = -1;
3158     #pragma omp parallel for private(i) schedule(static)
3159     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
3160 lgao 3817 if (MY_DEBUG) fprintf(stderr, "rank %d CP8 %d\n", rank, num_Pmain_cols);
3161 lgao 3763
3162     /* Now, count the size of RAP. Start with rows in R_main */
3163     num_neighbors = P->col_coupler->connector->send->numNeighbors;
3164     offsetInShared = P->col_coupler->connector->send->offsetInShared;
3165     shared = P->col_coupler->connector->send->shared;
3166     i = 0;
3167     j = 0;
3168     for (i_r=0; i_r<num_Pmain_cols; i_r++){
3169     /* Mark the start of row for both main block and couple block */
3170     row_marker = i;
3171     row_marker_ext = j;
3172    
3173     /* Mark the diagonal element RAP[i_r, i_r], and other elements
3174     in RAP_ext */
3175     P_marker[i_r] = i;
3176     i++;
3177     for (j1=0; j1<num_neighbors; j1++) {
3178     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
3179     if (shared[j2] == i_r) {
3180     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
3181     i_c = RAP_ext_idx[k];
3182     if (i_c < num_Pmain_cols) {
3183     if (P_marker[i_c] < row_marker) {
3184     P_marker[i_c] = i;
3185     i++;
3186     }
3187     } else {
3188     if (P_marker[i_c] < row_marker_ext) {
3189     P_marker[i_c] = j;
3190     j++;
3191     }
3192     }
3193     }
3194     break;
3195     }
3196     }
3197     }
3198    
3199     /* then, loop over elements in row i_r of R_main */
3200     j1_ub = R_main->pattern->ptr[i_r+1];
3201     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
3202     i1 = R_main->pattern->index[j1];
3203    
3204     /* then, loop over elements in row i1 of A->col_coupleBlock */
3205     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
3206     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
3207     i2 = A->col_coupleBlock->pattern->index[j2];
3208    
3209     /* check whether entry RA[i_r, i2] has been previously visited.
3210     RAP new entry is possible only if entry RA[i_r, i2] has not
3211     been visited yet */
3212     if (A_marker[i2] != i_r) {
3213     /* first, mark entry RA[i_r,i2] as visited */
3214     A_marker[i2] = i_r;
3215    
3216     /* then loop over elements in row i2 of P_ext_main */
3217     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
3218     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3219     i_c = P->row_coupleBlock->pattern->index[j3];
3220    
3221     /* check whether entry RAP[i_r,i_c] has been created.
3222     If not yet created, create the entry and increase
3223     the total number of elements in RAP_ext */
3224     if (P_marker[i_c] < row_marker) {
3225     P_marker[i_c] = i;
3226     i++;
3227     }
3228     }
3229    
3230     /* loop over elements in row i2 of P_ext_couple, do the same */
3231     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
3232     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3233     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3234     if (P_marker[i_c] < row_marker_ext) {
3235     P_marker[i_c] = j;
3236     j++;
3237     }
3238     }
3239     }
3240     }
3241    
3242     /* now loop over elements in row i1 of A->mainBlock, repeat
3243     the process we have done to block A->col_coupleBlock */
3244     j2_ub = A->mainBlock->pattern->ptr[i1+1];
3245     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
3246     i2 = A->mainBlock->pattern->index[j2];
3247     if (A_marker[i2 + num_Acouple_cols] != i_r) {
3248     A_marker[i2 + num_Acouple_cols] = i_r;
3249     j3_ub = P->mainBlock->pattern->ptr[i2+1];
3250     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3251     i_c = P->mainBlock->pattern->index[j3];
3252     if (P_marker[i_c] < row_marker) {
3253     P_marker[i_c] = i;
3254     i++;
3255     }
3256     }
3257     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
3258     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3259     /* note that we need to map the column index in
3260     P->col_coupleBlock back into the column index in
3261     P_ext_couple */
3262     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3263 lgao 3817 if (MY_DEBUG && (i_c <0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d) 2\n", rank, i_c, sum);
3264 lgao 3763 if (P_marker[i_c] < row_marker_ext) {
3265     P_marker[i_c] = j;
3266     j++;
3267     }
3268     }
3269     }
3270     }
3271     }
3272     }
3273     if (MY_DEBUG) fprintf(stderr, "rank %d CP9 main_size%d couple_size%d\n", rank, i, j);
3274    
3275     /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
3276     Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
3277     and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
3278     RAP_couple */
3279     RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
3280     RAP_main_idx = MEMALLOC(i, index_t);
3281 lgao 3779 RAP_main_val = TMPMEMALLOC(i * block_size, double);
3282 lgao 3763 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
3283     RAP_couple_idx = MEMALLOC(j, index_t);
3284 lgao 3779 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
3285 lgao 3763
3286     RAP_main_ptr[num_Pmain_cols] = i;
3287     RAP_couple_ptr[num_Pmain_cols] = j;
3288    
3289     #pragma omp parallel for private(i) schedule(static)
3290     for (i=0; i<sum; i++) P_marker[i] = -1;
3291     #pragma omp parallel for private(i) schedule(static)
3292     for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
3293    
3294     /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
3295     in R_main */
3296     i = 0;
3297     j = 0;
3298     for (i_r=0; i_r<num_Pmain_cols; i_r++){
3299     /* Mark the start of row for both main block and couple block */
3300     row_marker = i;
3301     row_marker_ext = j;
3302     RAP_main_ptr[i_r] = row_marker;
3303     RAP_couple_ptr[i_r] = row_marker_ext;
3304    
3305     /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
3306     in row i_r of RAP_ext */
3307     P_marker[i_r] = i;
3308     RAP_main_val[i] = ZERO;
3309     RAP_main_idx[i] = i_r;
3310     i++;
3311    
3312     for (j1=0; j1<num_neighbors; j1++) {
3313     for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
3314     if (shared[j2] == i_r) {
3315     for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
3316     i_c = RAP_ext_idx[k];
3317     if (i_c < num_Pmain_cols) {
3318 lgao 3817 if (MY_DEBUG){
3319     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: ext %f\n", RAP_ext_val[k*block_size]);
3320     }
3321 lgao 3763 if (P_marker[i_c] < row_marker) {
3322     P_marker[i_c] = i;
3323 lgao 3779 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
3324     //RAP_main_val[i] = RAP_ext_val[k];
3325 lgao 3763 if (MY_DEBUG){
3326     if (rank == 0 && i == 2) fprintf(stderr, "ext %f\n", RAP_ext_val[k]);
3327     }
3328     RAP_main_idx[i] = i_c;
3329     i++;
3330     } else {
3331 lgao 3779 //RAP_main_val[P_marker[i_c]] += RAP_ext_val[k];
3332     temp_val = &(RAP_ext_val[k*block_size]);
3333     RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
3334     for (ib=0; ib<block_size; ib++)
3335     RAP_val[ib] += temp_val[ib];
3336 lgao 3763 if (MY_DEBUG){
3337     if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "i_c %d ext %f\n", i_c, RAP_ext_val[k]);
3338     }
3339     }
3340     } else {
3341     if (P_marker[i_c] < row_marker_ext) {
3342     P_marker[i_c] = j;
3343 lgao 3779 //RAP_couple_val[j] = RAP_ext_val[k];
3344     memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
3345 lgao 3763 RAP_couple_idx[j] = i_c - num_Pmain_cols;
3346     j++;
3347     } else {
3348 lgao 3779 //RAP_couple_val[P_marker[i_c]] += RAP_ext_val[k];
3349     temp_val = &(RAP_ext_val[k*block_size]);
3350     RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
3351     for (ib=0; ib<block_size; ib++)
3352     RAP_val[ib] += temp_val[ib];
3353 lgao 3763 }
3354     }
3355     }
3356     break;
3357     }
3358     }
3359     }
3360    
3361     /* then, loop over elements in row i_r of R_main */
3362     j1_ub = R_main->pattern->ptr[i_r+1];
3363     for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
3364     i1 = R_main->pattern->index[j1];
3365 lgao 3779 R_val = &(R_main->val[j1*block_size]);
3366 lgao 3763
3367     /* then, loop over elements in row i1 of A->col_coupleBlock */
3368     j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
3369     for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
3370     i2 = A->col_coupleBlock->pattern->index[j2];
3371 lgao 3779 //RA_val = R_val * A->col_coupleBlock->val[j2];
3372     temp_val = &(A->col_coupleBlock->val[j2*block_size]);
3373     for (irb=0; irb<row_block_size; irb++) {
3374     for (icb=0; icb<col_block_size; icb++) {
3375     rtmp = ZERO;
3376     for (ib=0; ib<col_block_size; ib++) {
3377     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3378     }
3379     RA_val[irb+row_block_size*icb]=rtmp;
3380     }
3381     }
3382 lgao 3763
3383 lgao 3779
3384 lgao 3763 /* check whether entry RA[i_r, i2] has been previously visited.
3385     RAP new entry is possible only if entry RA[i_r, i2] has not
3386     been visited yet */
3387     if (A_marker[i2] != i_r) {
3388     /* first, mark entry RA[i_r,i2] as visited */
3389     A_marker[i2] = i_r;
3390    
3391     /* then loop over elements in row i2 of P_ext_main */
3392     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
3393     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3394     i_c = P->row_coupleBlock->pattern->index[j3];
3395 lgao 3779 //RAP_val = RA_val * P->row_coupleBlock->val[j3];
3396     temp_val = &(P->row_coupleBlock->val[j3*block_size]);
3397     for (irb=0; irb<row_block_size; irb++) {
3398     for (icb=0; icb<col_block_size; icb++) {
3399     rtmp = ZERO;
3400     for (ib=0; ib<col_block_size; ib++) {
3401     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3402     }
3403     RAP_val[irb+row_block_size*icb]=rtmp;
3404     }
3405     }
3406 lgao 3763
3407 lgao 3779
3408 lgao 3763 /* check whether entry RAP[i_r,i_c] has been created.
3409     If not yet created, create the entry and increase
3410     the total number of elements in RAP_ext */
3411 lgao 3817 if (MY_DEBUG){
3412     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP %f\n", RAP_val[0]);
3413     }
3414 lgao 3763 if (P_marker[i_c] < row_marker) {
3415     P_marker[i_c] = i;
3416 lgao 3779 //RAP_main_val[i] = RAP_val;
3417     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
3418 lgao 3763 if (MY_DEBUG){
3419 lgao 3779 if (rank == 0 && i == 2) fprintf(stderr, "R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
3420 lgao 3763 }
3421     RAP_main_idx[i] = i_c;
3422     i++;
3423     } else {
3424 lgao 3779 //RAP_main_val[P_marker[i_c]] += RAP_val;
3425     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3426     for (ib=0; ib<block_size; ib++) {
3427     temp_val[ib] += RAP_val[ib];
3428     }
3429    
3430 lgao 3763 if (MY_DEBUG){
3431 lgao 3779 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1 , *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
3432 lgao 3763 }
3433     }
3434     }
3435    
3436     /* loop over elements in row i2 of P_ext_couple, do the same */
3437     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
3438     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3439     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3440 lgao 3779 //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
3441     temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
3442     for (irb=0; irb<row_block_size; irb++) {
3443     for (icb=0; icb<col_block_size; icb++) {
3444     rtmp = ZERO;
3445     for (ib=0; ib<col_block_size; ib++) {
3446     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3447     }
3448     RAP_val[irb+row_block_size*icb]=rtmp;
3449     }
3450     }
3451    
3452 lgao 3763 if (P_marker[i_c] < row_marker_ext) {
3453     P_marker[i_c] = j;
3454 lgao 3779 //RAP_couple_val[j] = RAP_val;
3455     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
3456 lgao 3763 RAP_couple_idx[j] = i_c - num_Pmain_cols;
3457     j++;
3458     } else {
3459 lgao 3779 //RAP_couple_val[P_marker[i_c]] += RAP_val;
3460     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3461     for (ib=0; ib<block_size; ib++) {
3462     temp_val[ib] += RAP_val[ib];
3463     }
3464 lgao 3763 }
3465     }
3466    
3467     /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
3468     Only the contributions are added. */
3469     } else {
3470     j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
3471     for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3472     i_c = P->row_coupleBlock->pattern->index[j3];
3473 lgao 3779 //RAP_val = RA_val * P->row_coupleBlock->val[j3];
3474     temp_val = &(P->row_coupleBlock->val[j3*block_size]);
3475     for (irb=0; irb<row_block_size; irb++) {
3476     for (icb=0; icb<col_block_size; icb++) {
3477     rtmp = ZERO;
3478     for (ib=0; ib<col_block_size; ib++) {
3479     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3480     }
3481     RAP_val[irb+row_block_size*icb]=rtmp;
3482     }
3483     }
3484    
3485 lgao 3817 if (MY_DEBUG){
3486     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[2] %f\n", RAP_val[0]);
3487     }
3488 lgao 3779 //RAP_main_val[P_marker[i_c]] += RAP_val;
3489     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3490     for (ib=0; ib<block_size; ib++) {
3491     temp_val[ib] += RAP_val[ib];
3492     }
3493    
3494 lgao 3763 if (MY_DEBUG){
3495 lgao 3779 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
3496 lgao 3763 }
3497     }
3498     j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
3499     for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3500     i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3501 lgao 3779 //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
3502     temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
3503     for (irb=0; irb<row_block_size; irb++) {
3504     for (icb=0; icb<col_block_size; icb++) {
3505     rtmp = ZERO;
3506     for (ib=0; ib<col_block_size; ib++) {
3507     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3508     }
3509     RAP_val[irb+row_block_size*icb]=rtmp;
3510     }
3511     }
3512    
3513     //RAP_couple_val[P_marker[i_c]] += RAP_val;
3514     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3515     for (ib=0; ib<block_size; ib++) {
3516     temp_val[ib] += RAP_val[ib];
3517     }
3518 lgao 3763 }
3519     }
3520     }
3521    
3522     /* now loop over elements in row i1 of A->mainBlock, repeat
3523     the process we have done to block A->col_coupleBlock */
3524     j2_ub = A->mainBlock->pattern->ptr[i1+1];
3525     for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
3526     i2 = A->mainBlock->pattern->index[j2];
3527 lgao 3779 //RA_val = R_val * A->mainBlock->val[j2];
3528     temp_val = &(A->mainBlock->val[j2*block_size]);
3529     for (irb=0; irb<row_block_size; irb++) {
3530     for (icb=0; icb<col_block_size; icb++) {
3531     rtmp = ZERO;
3532     for (ib=0; ib<col_block_size; ib++) {
3533     rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3534     }
3535     RA_val[irb+row_block_size*icb]=rtmp;
3536     }
3537     }
3538    
3539 lgao 3763 if (A_marker[i2 + num_Acouple_cols] != i_r) {
3540     A_marker[i2 + num_Acouple_cols] = i_r;
3541     j3_ub = P->mainBlock->pattern->ptr[i2+1];
3542     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3543     i_c = P->mainBlock->pattern->index[j3];
3544 lgao 3779 //RAP_val = RA_val * P->mainBlock->val[j3];
3545     temp_val = &(P->mainBlock->val[j3*block_size]);
3546     for (irb=0; irb<row_block_size; irb++) {
3547     for (icb=0; icb<col_block_size; icb++) {
3548     rtmp = ZERO;
3549     for (ib=0; ib<col_block_size; ib++) {
3550     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3551     }
3552     RAP_val[irb+row_block_size*icb]=rtmp;
3553     }
3554     }
3555    
3556 lgao 3817 if (MY_DEBUG){
3557     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[3] %f\n", RAP_val[0]);
3558     }
3559    
3560 lgao 3763 if (P_marker[i_c] < row_marker) {
3561     P_marker[i_c] = i;
3562 lgao 3779 //RAP_main_val[i] = RAP_val;
3563     memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
3564 lgao 3763 if (MY_DEBUG){
3565 lgao 3779 if (rank == 0 && i == 2) fprintf(stderr, "Main R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
3566 lgao 3763 }
3567     RAP_main_idx[i] = i_c;
3568     i++;
3569     } else {
3570 lgao 3779 //RAP_main_val[P_marker[i_c]] += RAP_val;
3571     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3572     for (ib=0; ib<block_size; ib++) {
3573     temp_val[ib] += RAP_val[ib];
3574     }
3575    
3576 lgao 3763 if (MY_DEBUG){
3577 lgao 3779 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
3578 lgao 3763 }
3579     }
3580     }
3581     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
3582     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3583     /* note that we need to map the column index in
3584     P->col_coupleBlock back into the column index in
3585     P_ext_couple */
3586     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3587 lgao 3817 if (MY_DEBUG && (i_c < 0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d--%d,%d) 3\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);
3588 lgao 3779 //RAP_val = RA_val * P->col_coupleBlock->val[j3];
3589     temp_val = &(P->col_coupleBlock->val[j3*block_size]);
3590     for (irb=0; irb<row_block_size; irb++) {
3591     for (icb=0; icb<col_block_size; icb++) {
3592     rtmp = ZERO;
3593     for (ib=0; ib<col_block_size; ib++) {
3594     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3595     }
3596     RAP_val[irb+row_block_size*icb]=rtmp;
3597     }
3598     }
3599    
3600 lgao 3763 if (P_marker[i_c] < row_marker_ext) {
3601     P_marker[i_c] = j;
3602 lgao 3779 //RAP_couple_val[j] = RAP_val;
3603     memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
3604 lgao 3763 RAP_couple_idx[j] = i_c - num_Pmain_cols;
3605     j++;
3606     } else {
3607 lgao 3779 //RAP_couple_val[P_marker[i_c]] += RAP_val;
3608     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3609     for (ib=0; ib<block_size; ib++) {
3610     temp_val[ib] += RAP_val[ib];
3611     }
3612 lgao 3763 }
3613     }
3614    
3615     } else {
3616     j3_ub = P->mainBlock->pattern->ptr[i2+1];
3617     for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3618     i_c = P->mainBlock->pattern->index[j3];
3619 lgao 3779 //RAP_val = RA_val * P->mainBlock->val[j3];
3620     temp_val = &(P->mainBlock->val[j3*block_size]);
3621     for (irb=0; irb<row_block_size; irb++) {
3622     for (icb=0; icb<col_block_size; icb++) {
3623     rtmp = ZERO;
3624     for (ib=0; ib<col_block_size; ib++) {
3625     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3626     }
3627     RAP_val[irb+row_block_size*icb]=rtmp;
3628     }
3629     }
3630    
3631 lgao 3817 if (MY_DEBUG){
3632     if (rank == 0 && i_r == 5 && i_c == 58) fprintf(stderr, "From: RAP[4] %.20f %d\n", RAP_val[0], P_marker[i_c]);
3633     }
3634 lgao 3779 //RAP_main_val[P_marker[i_c]] += RAP_val;
3635     temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3636     for (ib=0; ib<block_size; ib++) {
3637     temp_val[ib] += RAP_val[ib];
3638     }
3639    
3640 lgao 3763 if (MY_DEBUG){
3641 lgao 3779 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main Visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, *R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
3642 lgao 3763 }
3643     }
3644     j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
3645     for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3646     i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3647 lgao 3817 if (MY_DEBUG && i_c >= sum) fprintf(stderr, "rank %d access to P_marker excceeded( %d out of %d -- %d,%d) 4\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);
3648 lgao 3779 //RAP_val = RA_val * P->col_coupleBlock->val[j3];
3649     temp_val = &(P->col_coupleBlock->val[j3*block_size]);
3650     for (irb=0; irb<row_block_size; irb++) {
3651     for (icb=0; icb<col_block_size; icb++) {
3652     rtmp = ZERO;
3653     for (ib=0; ib<col_block_size; ib++) {
3654     rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3655     }
3656     RAP_val[irb+row_block_size*icb]=rtmp;
3657     }
3658     }
3659    
3660     //RAP_couple_val[P_marker[i_c]] += RAP_val;
3661     temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3662     for (ib=0; ib<block_size; ib++) {
3663     temp_val[ib] += RAP_val[ib];
3664     }
3665 lgao 3763 }
3666     }
3667     }
3668     }
3669    
3670     /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
3671     if (i > row_marker) {
3672     offset = i - row_marker;
3673     temp = TMPMEMALLOC(offset, index_t);
3674     for (iptr=0; iptr<offset; iptr++)
3675     temp[iptr] = RAP_main_idx[row_marker+iptr];
3676     if (offset > 0) {
3677     #ifdef USE_QSORTG
3678     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3679     #else
3680     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3681     #endif
3682     }
3683 lgao 3779 temp_val = TMPMEMALLOC(offset * block_size, double);
3684 lgao 3763 for (iptr=0; iptr<offset; iptr++){
3685     k = P_marker[temp[iptr]];
3686 lgao 3779 //temp_val[iptr] = RAP_main_val[k];
3687     memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
3688 lgao 3763 P_marker[temp[iptr]] = iptr + row_marker;
3689     }
3690     for (iptr=0; iptr<offset; iptr++){
3691     RAP_main_idx[row_marker+iptr] = temp[iptr];
3692 lgao 3779 //RAP_main_val[row_marker+iptr] = temp_val[iptr];
3693     memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3694 lgao 3763 }
3695     TMPMEMFREE(temp);
3696     TMPMEMFREE(temp_val);
3697     }
3698     if (j > row_marker_ext) {
3699     offset = j - row_marker_ext;
3700     temp = TMPMEMALLOC(offset, index_t);
3701     for (iptr=0; iptr<offset; iptr++)
3702     temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
3703     if (offset > 0) {
3704     #ifdef USE_QSORTG
3705     qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3706     #else
3707     qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3708     #endif
3709     }
3710 lgao 3779 temp_val = TMPMEMALLOC(offset * block_size, double);
3711 lgao 3763 for (iptr=0; iptr<offset; iptr++){
3712 lgao 3817 if (MY_DEBUG && temp[iptr] >= num_RAPext_cols)
3713 lgao 3763 fprintf(stderr, "rank%d index %d exceed %d\n", rank, temp[iptr], num_RAPext_cols);
3714     k = P_marker[temp[iptr] + num_Pmain_cols];
3715 lgao 3779 //temp_val[iptr] = RAP_couple_val[k];
3716     memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
3717 lgao 3763 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
3718     }
3719     for (iptr=0; iptr<offset; iptr++){
3720     RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
3721 lgao 3779 //RAP_couple_val[row_marker_ext+iptr] = temp_val[iptr];
3722     memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3723 lgao 3763 }
3724     TMPMEMFREE(temp);
3725     TMPMEMFREE(temp_val);
3726     }
3727     }
3728    
3729 lgao 3779 TMPMEMFREE(RA_val);
3730     TMPMEMFREE(RAP_val);
3731 lgao 3763 TMPMEMFREE(A_marker);
3732     TMPMEMFREE(Pext_to_RAP);
3733     TMPMEMFREE(Pcouple_to_RAP);
3734     MEMFREE(RAP_ext_ptr);
3735     MEMFREE(RAP_ext_idx);
3736     MEMFREE(RAP_ext_val);
3737     Paso_SparseMatrix_free(R_main);
3738     Paso_SparseMatrix_free(R_couple);
3739    
3740     if (MY_DEBUG){
3741     if (rank == 0) fprintf(stderr, "A[%d]=%f A[%d]=%f A[%d]=%f A[%d]=%f \n", RAP_main_idx[0], RAP_main_val[0], RAP_main_idx[1], RAP_main_val[1], RAP_main_