/[escript]/branches/amg_from_3530/paso/src/AMG_Interpolation.c




Revision 3767
Fri Jan 13 01:52:46 2012 UTC by lgao
File MIME type: text/plain
File size: 74268 byte(s)
A version which passed all my small test cases. Ready for the test cases 
in Unittest. 


1 //
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2011 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: defines AMG Interpolation */
18
19 /**************************************************************/
20
21 /* Author: l.gao@uq.edu.au */
22
23 /**************************************************************/
24
25 #include "Paso.h"
26 #include "SparseMatrix.h"
27 #include "PasoUtil.h"
28 #include "Preconditioner.h"
29
30 /**************************************************************
31
32 Methods necessary for the AMG preconditioner
33
34 construct the n_C x n_C coarse-grid operator A_C from the matrix A
35 and the prolongation matrix P.
36
37 The coarsening operator A_C is defined by RAP where R=P^T.
38
39 ***************************************************************/
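/* For orientation only: a minimal dense sketch of the Galerkin product
   A_C = R A P with R = P^T, ignoring the distributed sparse machinery
   implemented below. The names n, n_C, a, p, a_c are illustrative and
   are not part of Paso. */
#if 0
static void dense_RAP_sketch(dim_t n, dim_t n_C,
                             const double *a,   /* A,   n   x n   */
                             const double *p,   /* P,   n   x n_C */
                             double *a_c)       /* A_C, n_C x n_C */
{
   dim_t i, j, k, l;
   for (i=0; i<n_C; i++)
      for (j=0; j<n_C; j++) {
         double s = 0.;
         for (k=0; k<n; k++)
            for (l=0; l<n; l++) /* (P^T A P)[i,j] = sum_k sum_l P[k,i]*A[k,l]*P[l,j] */
               s += p[k*n_C+i] * a[k*n+l] * p[l*n_C+j];
         a_c[i*n_C+j] = s;
      }
}
#endif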
40
41 #define MY_DEBUG 0
42 #define MY_DEBUG1 0
43
44 /* Extend the system matrix B with two extra sparse matrices:
45 B_ext_main and B_ext_couple
46 The combination of these two sparse matrices represents
47 the portion of B that is stored on neighbor procs and
48 needed locally for the triple matrix product (B^T) A B.
49
50 FOR NOW, we assume that the system matrix B has a NULL
51 row_coupleBlock and a NULL remote_coupleBlock. Based on
52 this assumption, we link row_coupleBlock to the sparse
53 matrix B_ext_main, and remote_coupleBlock to the sparse
54 matrix B_ext_couple.
55
56 To be specific, B_ext (on processor k) is the group of rows
57 in B, where the list of rows from processor q is given by
58 A->col_coupler->connector->send->shared[rPtr...]
59 rPtr=A->col_coupler->connector->send->offsetInShared[k]
60 on q. */
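/* Illustration (a reading of the loops below, not extra functionality):
   suppose rank q sends one of its rows of B to rank k. On the way out,
   each entry of that row is split by its global column id m: entries
   with m inside rank k's column range [m_lb, m_ub) land in B_ext_main
   (stored with local column ids), while all other entries land in
   B_ext_couple (stored with global column ids). */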
61 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
62 {
63 Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
64 Paso_Coupler *coupler=NULL;
65 Paso_SharedComponents *send=NULL, *recv=NULL;
66 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
67 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
68 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
69 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
70 index_t *idx_m=NULL, *idx_c=NULL;
71 index_t i, j, k, m, n, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c;
72 index_t offset, len, block_size, block_size_size, max_num_cols;
73 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
74 dim_t *recv_degree=NULL, *send_degree=NULL;
75 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
76
77 if (size == 1) return;
78
79 if (B->row_coupleBlock) {
80 Paso_SparseMatrix_free(B->row_coupleBlock);
81 B->row_coupleBlock = NULL;
82 }
83
84 if (B->remote_coupleBlock) { /* row_coupleBlock was just cleared above */
85 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
86 return;
87 }
88
89 if (MY_DEBUG1) fprintf(stderr, "CP1_1\n");
90
91 /* send/receive the global IDs of the unknowns */
92 num_main_cols = B->mainBlock->numCols;
93 cols = TMPMEMALLOC(num_main_cols, double);
94 offset = B->col_distribution->first_component[rank];
95 #pragma omp parallel for private(i) schedule(static)
96 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
97 if (B->global_id == NULL) {
98 coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
99 Paso_Coupler_startCollect(coupler, cols);
100 }
101
102 recv_buf = TMPMEMALLOC(size, index_t);
103 recv_degree = TMPMEMALLOC(size, dim_t);
104 recv_offset = TMPMEMALLOC(size+1, index_t);
105 #pragma omp parallel for private(i) schedule(static)
106 for (i=0; i<size; i++){
107 recv_buf[i] = 0;
108 recv_degree[i] = 1;
109 recv_offset[i] = i;
110 }
111
112 block_size = B->col_coupler->block_size;
113 block_size_size = block_size * sizeof(double);
114 num_couple_cols = B->col_coupleBlock->numCols;
115 send = A->col_coupler->connector->send;
116 recv = A->col_coupler->connector->recv;
117 num_neighbors = send->numNeighbors;
118 p = send->offsetInShared[num_neighbors];
119 len = p * B->col_distribution->first_component[size];
120 send_buf = TMPMEMALLOC(len * block_size, double);
121 send_idx = TMPMEMALLOC(len, index_t);
122 send_offset = TMPMEMALLOC((p+1)*2, index_t);
123 send_degree = TMPMEMALLOC(num_neighbors, dim_t);
124 i = num_main_cols + num_couple_cols;
125 send_m = TMPMEMALLOC(i * block_size, double);
126 send_c = TMPMEMALLOC(i * block_size, double);
127 idx_m = TMPMEMALLOC(i, index_t);
128 idx_c = TMPMEMALLOC(i, index_t);
129
130 /* wait until the global IDs of the unknowns have been received */
131 if (B->global_id == NULL) {
132 Paso_Coupler_finishCollect(coupler);
133 global_id = MEMALLOC(num_couple_cols, index_t);
134 #pragma omp parallel for private(i) schedule(static)
135 for (i=0; i<num_couple_cols; ++i)
136 global_id[i] = coupler->recv_buffer[i];
137 B->global_id = global_id;
138 Paso_Coupler_free(coupler);
139 } else
140 global_id = B->global_id;
141
142 /* distribute the number of cols in current col_coupleBlock to all ranks */
143 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
144
145 if (MY_DEBUG1) fprintf(stderr, "CP1_2\n");
146
147 /* distribute global_ids of cols to be considered to all ranks*/
148 len = 0;
149 max_num_cols = 0;
150 for (i=0; i<size; i++){
151 recv_degree[i] = recv_buf[i];
152 recv_offset[i] = len;
153 len += recv_buf[i];
154 if (max_num_cols < recv_buf[i])
155 max_num_cols = recv_buf[i];
156 }
157 recv_offset[size] = len;
158 cols_array = TMPMEMALLOC(len, index_t);
159 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
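/* cols_array now holds the concatenation, ordered by rank, of every
   rank's global_id list; recv_offset[r] gives the start of rank r's
   block inside it. */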
160
161 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_3\n", rank);
162 if (MY_DEBUG) {
163 int i=0;
164 char hostname[256];
165 gethostname(hostname, sizeof(hostname));
166 fprintf(stderr, "rank %d PID %d on %s ready for attach\n", A->mpi_info->rank, getpid(), hostname);
167 if (rank == 1) {
168 fflush(stdout);
169 while (0 == i)
170 sleep(5);
171 }
172 }
173
174 /* first, prepare the ptr_ptr to be received */
175 q = recv->numNeighbors;
176 len = recv->offsetInShared[q];
177 ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
178 for (p=0; p<q; p++) {
179 row = recv->offsetInShared[p];
180 m = recv->offsetInShared[p + 1];
181 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
182 A->mpi_info->msg_tag_counter+recv->neighbor[p],
183 A->mpi_info->comm,
184 &(A->col_coupler->mpi_requests[p]));
185 if(MY_DEBUG1) fprintf(stderr, "rank %d receive %d from %d tag %d (numNeighbors %d)\n", rank, 2*(m-row), recv->neighbor[p], A->mpi_info->msg_tag_counter+recv->neighbor[p], q);
186 }
187
188 /* now prepare the rows to be sent (the degree, the offset and the data) */
189 len = 0;
190 for (p=0; p<num_neighbors; p++) {
191 i = 0;
192 neighbor = send->neighbor[p];
193 m_lb = B->col_distribution->first_component[neighbor];
194 m_ub = B->col_distribution->first_component[neighbor + 1];
195 j_ub = send->offsetInShared[p + 1];
196 if (MY_DEBUG1) fprintf(stderr, "rank%d neighbor %d m_lb %d m_ub %d offset %d\n", rank, neighbor, m_lb, m_ub, offset);
197 for (j=send->offsetInShared[p]; j<j_ub; j++) {
198 row = send->shared[j];
199 l_m = 0;
200 l_c = 0;
201 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
202 k_lb = B->col_coupleBlock->pattern->ptr[row];
203
204 /* check part of col_coupleBlock for data to be passed @row */
205 // q = B->mainBlock->pattern->index[B->mainBlock->pattern->ptr[row]] + offset;
206 for (k=k_lb; k<k_ub; k++) {
207 m = global_id[B->col_coupleBlock->pattern->index[k]];
208 // if (m > q) break;
209 if (m > offset) break;
210 if (MY_DEBUG) fprintf(stderr, "rank%d (1) row %d m %d k %d\n", rank, row, m, B->col_coupleBlock->pattern->index[k]);
211 if (m>= m_lb && m < m_ub) {
212 /* data to be passed to sparse matrix B_ext_main */
213 idx_m[l_m] = m - m_lb;
214 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
215 l_m++;
216 } else {
217 /* data to be passed to sparse matrix B_ext_couple */
218 idx_c[l_c] = m;
219 if (MY_DEBUG) fprintf(stderr, "rank%d (1) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);
220 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
221 l_c++;
222 }
223 }
224 k_lb = k;
225
226 /* check mainBlock for data to be passed @row to sparse
227 matrix B_ext_couple */
228 k_ub = B->mainBlock->pattern->ptr[row + 1];
229 k = B->mainBlock->pattern->ptr[row];
230 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
231 for (; k<k_ub; k++) {
232 m = B->mainBlock->pattern->index[k] + offset;
233 if (MY_DEBUG) fprintf(stderr, "rank%d (2) row %d m %d k %d\n", rank, row, m, B->mainBlock->pattern->index[k]);
234 if (MY_DEBUG) fprintf(stderr, "rank%d (2) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);
235 idx_c[l_c] = m;
236 l_c++;
237 }
238
239 /* check the rest part of col_coupleBlock for data to
240 be passed @row to sparse matrix B_ext_couple */
241 k = k_lb;
242 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
243 for (k=k_lb; k<k_ub; k++) {
244 m = global_id[B->col_coupleBlock->pattern->index[k]];
245 if (MY_DEBUG) fprintf(stderr, "rank%d (3) row %d m %d k %d\n", rank, row, m, B->col_coupleBlock->pattern->index[k]);
246 if (m>= m_lb && m < m_ub) {
247 /* data to be passed to sparse matrix B_ext_main */
248 idx_m[l_m] = m - m_lb;
249 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
250 l_m++;
251 } else {
252 /* data to be passed to sparse matrix B_ext_couple */
253 idx_c[l_c] = m;
254 if (MY_DEBUG) fprintf(stderr, "rank%d (3) m %d m_lb %d m_ub %d\n", rank, m, m_lb, m_ub);
255 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
256 l_c++;
257 }
258 }
259
260 if (MY_DEBUG && rank == 2) fprintf(stderr, "rank %d p %d i %d j %d l_c %d l_m %d ub %d\n", rank, p, i, j, l_c, l_m, num_couple_cols + num_main_cols);
261 memcpy(&(send_buf[len]), send_m, block_size_size*l_m);
262 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
263 send_offset[2*i] = l_m;
264 len += l_m;
265 memcpy(&(send_buf[len]), send_c, block_size_size*l_c);
266 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
267 send_offset[2*i+1] = l_c;
268 len += l_c;
269 if (MY_DEBUG && rank == 0) {
270 int sum = l_m+l_c, my_i;
271 char *str1, *str2;
272 str1 = TMPMEMALLOC(sum*100+100, char);
273 str2 = TMPMEMALLOC(100, char);
274 sprintf(str1, "rank %d send_idx[%d] offset %d m%d c%d= (", rank, sum, len-sum, l_m, l_c);
275 for (my_i=len-sum; my_i<len; my_i++){
276 sprintf(str2, "%d ", send_idx[my_i]);
277 strcat(str1, str2);
278 }
279 fprintf(stderr, "%s)\n", str1);
280 TMPMEMFREE(str1);
281 TMPMEMFREE(str2);
282 }
283 i++;
284 }
285
286 if (MY_DEBUG && rank == 0) {
287 int my_i,sum = len;
288 char *str1, *str2;
289 str1 = TMPMEMALLOC(sum*100+100, char);
290 str2 = TMPMEMALLOC(100, char);
291 sprintf(str1, "rank %d send_idx[%d] = (", rank, sum);
292 for (my_i=0; my_i<sum; my_i++){
293 sprintf(str2, "%d ", send_idx[my_i]);
294 strcat(str1, str2);
295 }
296 fprintf(stderr, "%s)\n", str1);
297 TMPMEMFREE(str1);
298 TMPMEMFREE(str2);
299 }
300
301 /* sending */
302 MPI_Issend(send_offset, 2*i, MPI_INT, send->neighbor[p],
303 A->mpi_info->msg_tag_counter+rank,
304 A->mpi_info->comm,
305 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
306 if(MY_DEBUG) fprintf(stderr, "rank %d send %d to %d tag %d (numNeighbors %d)\n", rank, 2*i, send->neighbor[p], A->mpi_info->msg_tag_counter+rank, send->numNeighbors);
307 send_degree[p] = len;
308 }
309 TMPMEMFREE(send_m);
310 TMPMEMFREE(send_c);
311 TMPMEMFREE(idx_m);
312 TMPMEMFREE(idx_c);
313
314
315 q = recv->numNeighbors;
316 len = recv->offsetInShared[q];
317 ptr_main = MEMALLOC((len+1), index_t);
318 ptr_couple = MEMALLOC((len+1), index_t);
319
320
321 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
322 A->col_coupler->mpi_requests,
323 A->col_coupler->mpi_stati);
324 A->mpi_info->msg_tag_counter += size;
325
326 if (MY_DEBUG) {
327 int sum = recv->offsetInShared[recv->numNeighbors] * 2;
328 char *str1, *str2;
329 str1 = TMPMEMALLOC(sum*100+100, char);
330 str2 = TMPMEMALLOC(100, char);
331 sprintf(str1, "rank %d ptr_ptr[%d] = (", rank, sum);
332 for (i=0; i<sum; i++){
333 sprintf(str2, "%d ", ptr_ptr[i]);
334 strcat(str1, str2);
335 }
336 fprintf(stderr, "%s)\n", str1);
337 TMPMEMFREE(str1);
338 TMPMEMFREE(str2);
339 }
340
341 j = 0;
342 k = 0;
343 ptr_main[0] = 0;
344 ptr_couple[0] = 0;
345 // #pragma omp parallel for private(i,j,k) schedule(static)
346 for (i=0; i<len; i++) {
347 j += ptr_ptr[2*i];
348 k += ptr_ptr[2*i+1];
349 ptr_main[i+1] = j;
350 ptr_couple[i+1] = k;
351 }
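/* ptr_main and ptr_couple are now CSR-style offset arrays over the
   received rows; j and k hold their total entry counts and size the
   index/value buffers allocated next. */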
352
353 TMPMEMFREE(ptr_ptr);
354 idx_main = MEMALLOC(j, index_t);
355 idx_couple = MEMALLOC(k, index_t);
356 ptr_idx = TMPMEMALLOC(j+k, index_t);
357 ptr_val = TMPMEMALLOC(j+k, double);
358 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_4_2 %d %d %d\n", rank, j, k, len);
359
360 /* send/receive index array */
361 j=0;
362 k_ub = 0;
363 for (p=0; p<recv->numNeighbors; p++) {
364 k = recv->offsetInShared[p];
365 m = recv->offsetInShared[p+1];
366 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
367 if (i > 0) {
368 k_ub ++;
369 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
370 A->mpi_info->msg_tag_counter+recv->neighbor[p],
371 A->mpi_info->comm,
372 &(A->col_coupler->mpi_requests[p]));
373 if(MY_DEBUG) fprintf(stderr, "rank %d (INDEX) recv %d from %d tag %d\n", rank, i, recv->neighbor[p], A->mpi_info->msg_tag_counter+recv->neighbor[p]);
374 } else {
375 if (MY_DEBUG1) fprintf(stderr, "rank%d k %d m %d main(%d %d) couple(%d %d)\n", rank, k, m, ptr_main[m], ptr_main[k], ptr_couple[m], ptr_couple[k]);
376 }
377 j += i;
378 }
379
380 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_4_3 %d %d\n", rank, recv->numNeighbors, k_ub);
381 j=0;
382 k_ub = 0;
383 for (p=0; p<num_neighbors; p++) {
384 i = send_degree[p] - j;
385 if (i > 0){
386 k_ub ++;
387 if (MY_DEBUG && rank == 0 && send->neighbor[p] == 1) {
388 int sum = i, my_i;
389 char *str1, *str2;
390 str1 = TMPMEMALLOC(sum*100+100, char);
391 str2 = TMPMEMALLOC(100, char);
392 sprintf(str1, "rank %d send_idx[%d] offset %d = (", rank, sum, j);
393 for (my_i=0; my_i<sum; my_i++){
394 sprintf(str2, "%d ", send_idx[j+my_i]);
395 strcat(str1, str2);
396 }
397 fprintf(stderr, "%s)\n", str1);
398 TMPMEMFREE(str1);
399 TMPMEMFREE(str2);
400 }
401 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
402 A->mpi_info->msg_tag_counter+rank,
403 A->mpi_info->comm,
404 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
405 if(MY_DEBUG1) fprintf(stderr, "rank %d (INDEX) send %d to %d tag %d\n", rank, i, send->neighbor[p], A->mpi_info->msg_tag_counter+rank);
406 } else {
407 if (MY_DEBUG1) fprintf(stderr, "rank%d send_degree %d, p %d, j %d\n", rank, send_degree[p], p, j);
408 }
409 j = send_degree[p];
410 }
411 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_4_4 %d %d\n", rank,num_neighbors, k_ub);
412
413 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
414 A->col_coupler->mpi_requests,
415 A->col_coupler->mpi_stati);
416 A->mpi_info->msg_tag_counter += size;
417 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_5 %d %d %d\n", rank, len, ptr_main[len], ptr_couple[len]);
418 if (MY_DEBUG && rank == 1) {
419 int my_i, sum1 = recv->offsetInShared[recv->numNeighbors];
420 int sum = ptr_main[sum1] + ptr_couple[sum1];
421 char *str1, *str2;
422 str1 = TMPMEMALLOC(sum*100+100, char);
423 str2 = TMPMEMALLOC(100, char);
424 sprintf(str1, "rank %d ptr_idx[%d] = (", rank, sum);
425 for (my_i=0; my_i<sum; my_i++){
426 sprintf(str2, "%d ", ptr_idx[my_i]);
427 strcat(str1, str2);
428 }
429 fprintf(stderr, "%s)\n", str1);
430 TMPMEMFREE(str1);
431 TMPMEMFREE(str2);
432 }
433
434
435 // #pragma omp parallel for private(i,j,k,m,p) schedule(static)
436 for (i=0; i<len; i++) {
437 j = ptr_main[i];
438 k = ptr_main[i+1];
439 m = ptr_couple[i];
440 for (p=j; p<k; p++) {
441 idx_main[p] = ptr_idx[m+p];
442 }
443 j = ptr_couple[i+1];
444 for (p=m; p<j; p++) {
445 idx_couple[p] = ptr_idx[k+p];
446 }
447 }
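/* Each received row i arrives in ptr_idx as its main-part indices
   followed by its couple-part indices, starting at offset
   ptr_main[i]+ptr_couple[i]; the loop above unpicks this layout, and
   the value array received later is unpacked the same way. */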
448 TMPMEMFREE(ptr_idx);
449 if (MY_DEBUG1) fprintf(stderr, "rank %d CP1_5_1 %d %d\n", rank, num_main_cols, len);
450 if (MY_DEBUG) {
451 int sum = num_main_cols, sum1 = ptr_main[sum];
452 char *str1, *str2;
453 str1 = TMPMEMALLOC(sum1*30+100, char);
454 str2 = TMPMEMALLOC(100, char);
455 sprintf(str1, "rank %d ptr_main[%d] = (", rank, sum);
456 for (i=0; i<sum+1; i++){
457 sprintf(str2, "%d ", ptr_main[i]);
458 strcat(str1, str2);
459 }
460 fprintf(stderr, "%s)\n", str1);
461 sprintf(str1, "rank %d idx_main[%d] = (", rank, sum1);
462 for (i=0; i<sum1; i++){
463 sprintf(str2, "%d ", idx_main[i]);
464 strcat(str1, str2);
465 }
466 fprintf(stderr, "%s)\n", str1);
467 TMPMEMFREE(str1);
468 TMPMEMFREE(str2);
469 }
470
471 /* allocate pattern and sparsematrix for B_ext_main */
472 pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
473 len, num_main_cols, ptr_main, idx_main);
474 B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
475 pattern_main, B->row_block_size, B->col_block_size,
476 FALSE);
477 Paso_Pattern_free(pattern_main);
478 if (MY_DEBUG) fprintf(stderr, "rank %d CP1_5_2\n", rank);
479
480 /* allocate pattern and sparsematrix for B_ext_couple */
481 pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
482 len, B->col_distribution->first_component[size],
483 ptr_couple, idx_couple);
484 B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
485 pattern_couple, B->row_block_size, B->col_block_size,
486 FALSE);
487 Paso_Pattern_free(pattern_couple);
488
489 if (MY_DEBUG1) fprintf(stderr, "CP1_6\n");
490
491 /* send/receive value array */
492 j=0;
493 for (p=0; p<recv->numNeighbors; p++) {
494 k = recv->offsetInShared[p];
495 m = recv->offsetInShared[p+1];
496 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
497 if (i > 0)
498 MPI_Irecv(&(ptr_val[j]), i * block_size,
499 MPI_DOUBLE, recv->neighbor[p],
500 A->mpi_info->msg_tag_counter+recv->neighbor[p],
501 A->mpi_info->comm,
502 &(A->col_coupler->mpi_requests[p]));
503 j += i;
504 }
505
506 j=0;
507 for (p=0; p<num_neighbors; p++) {
508 i = send_degree[p] - j;
509 if (i > 0)
510 MPI_Issend(&(send_buf[j]), i*block_size, MPI_DOUBLE, send->neighbor[p],
511 A->mpi_info->msg_tag_counter+rank,
512 A->mpi_info->comm,
513 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
514 j = send_degree[p];
515 }
516
517 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
518 A->col_coupler->mpi_requests,
519 A->col_coupler->mpi_stati);
520 A->mpi_info->msg_tag_counter += size;
521 if (MY_DEBUG1) fprintf(stderr, "CP1_7\n");
522
523 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
524 for (i=0; i<len; i++) {
525 j = ptr_main[i];
526 k = ptr_main[i+1];
527 m = ptr_couple[i];
528 for (p=j; p<k; p++) {
529 B->row_coupleBlock->val[p] = ptr_val[m+p];
530 }
531 j = ptr_couple[i+1];
532 for (p=m; p<j; p++) {
533 B->remote_coupleBlock->val[p] = ptr_val[k+p];
534 }
535 }
536 TMPMEMFREE(ptr_val);
537
538 /* release all temp memory allocation */
539 TMPMEMFREE(cols);
540 TMPMEMFREE(cols_array);
541 TMPMEMFREE(recv_offset);
542 TMPMEMFREE(recv_degree);
543 TMPMEMFREE(recv_buf);
544 TMPMEMFREE(send_buf);
545 TMPMEMFREE(send_offset);
546 TMPMEMFREE(send_degree);
547 TMPMEMFREE(send_idx);
548 }
549
550 /* The sparse matrix (let's call it T) defined by T(ptr, idx, val)
551 has the same number of rows as P->col_coupleBlock->numCols. Now, we need
552 to copy blocks of data in T to the neighbor processors, given by
553 P->col_coupler->connector->recv->neighbor[k] where k is in
554 [0, P->col_coupler->connector->recv->numNeighbors).
555 The rows to be copied to neighbor processor k are in the list defined by
556 P->col_coupler->connector->recv->offsetInShared[k] ...
557 P->col_coupler->connector->recv->offsetInShared[k+1] */
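/* Note the role reversal relative to Paso_Preconditioner_AMG_extendB
   above: the rows of T travel in the opposite direction, so "send"
   below is wired to the coupler's recv connector and "recv" to its
   send connector. */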
558 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
559 index_t **p_ptr, index_t **p_idx, double **p_val, index_t *global_id)
560 {
561 Paso_SharedComponents *send=NULL, *recv=NULL;
562 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
563 index_t i, j, p, m, n, rank, size, offset;
564 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
565 index_t *ptr=*p_ptr, *idx=*p_idx;
566 double *val=*p_val, *recv_val=NULL;
567
568 rank = P->mpi_info->rank;
569 size = P->mpi_info->size;
570 send = P->col_coupler->connector->recv;
571 recv = P->col_coupler->connector->send;
572 send_neighbors = send->numNeighbors;
573 recv_neighbors = recv->numNeighbors;
574 send_rows = P->col_coupleBlock->numCols;
575 recv_rows = recv->offsetInShared[recv_neighbors];
576 offset = P->col_distribution->first_component[rank];
577
578 send_degree = TMPMEMALLOC(send_rows, index_t);
579 recv_ptr = MEMALLOC(recv_rows + 1, index_t);
580 #pragma omp parallel for schedule(static) private(i)
581 for (i=0; i<send_rows; i++)
582 send_degree[i] = ptr[i+1] - ptr[i];
583 if (MY_DEBUG) fprintf(stderr, "rank %d CP5_1 srows %d rrows %d\n", rank, send_rows, recv_rows);
584
585 /* First, send/receive the degree */
586 for (p=0; p<recv_neighbors; p++) { /* Receiving */
587 m = recv->offsetInShared[p];
588 n = recv->offsetInShared[p+1];
589 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
590 P->mpi_info->msg_tag_counter + recv->neighbor[p],
591 P->mpi_info->comm,
592 &(P->col_coupler->mpi_requests[p]));
593 }
594 for (p=0; p<send_neighbors; p++) { /* Sending */
595 m = send->offsetInShared[p];
596 n = send->offsetInShared[p+1];
597 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
598 P->mpi_info->msg_tag_counter + rank,
599 P->mpi_info->comm,
600 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
601 }
602 MPI_Waitall(send_neighbors+recv_neighbors,
603 P->col_coupler->mpi_requests,
604 P->col_coupler->mpi_stati);
605 P->mpi_info->msg_tag_counter += size;
606 if (MY_DEBUG) fprintf(stderr, "rank %d CP5_3 %d %d\n", rank, recv_neighbors, send_neighbors);
607
608 TMPMEMFREE(send_degree);
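/* Turn the received per-row degrees into CSR-style offsets; assuming
   (from its use here) that Paso_Util_cumsum overwrites recv_ptr with
   the exclusive prefix sums and returns the total, which is the length
   of the incoming index and value streams. */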
609 m = Paso_Util_cumsum(recv_rows, recv_ptr);
610 recv_ptr[recv_rows] = m;
611 recv_idx = MEMALLOC(m, index_t);
612 recv_val = MEMALLOC(m, double);
613 if (MY_DEBUG) fprintf(stderr, "rank %d receive size %d\n", rank, m);
614
615 /* Next, send/receive the index array */
616 j = 0;
617 for (p=0; p<recv_neighbors; p++) { /* Receiving */
618 m = recv->offsetInShared[p];
619 n = recv->offsetInShared[p+1];
620 i = recv_ptr[n] - recv_ptr[m];
621 if (i > 0) {
622 if (MY_DEBUG) fprintf(stderr, "rank %d recv message size %d from %d (%d, %d, %d, %d, %d) \n", rank, i, recv->neighbor[p], p, m, n, recv_ptr[n], recv_ptr[m]);
623 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
624 P->mpi_info->msg_tag_counter + recv->neighbor[p],
625 P->mpi_info->comm,
626 &(P->col_coupler->mpi_requests[p]));
627 } else
628 if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty recv message %d %d %d %d %d %d\n", rank, recv_neighbors, p, m, n, recv_ptr[n], recv_ptr[m]);
629 j += i;
630 }
631
632 j = 0;
633 for (p=0; p<send_neighbors; p++) { /* Sending */
634 m = send->offsetInShared[p];
635 n = send->offsetInShared[p+1];
636 i = ptr[n] - ptr[m];
637 if (i >0) {
638 if (MY_DEBUG) fprintf(stderr, "rank %d send message size %d to %d (%d, %d, %d, %d, %d) \n", rank, i, send->neighbor[p], p, m, n, ptr[n], ptr[m]);
639 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
640 P->mpi_info->msg_tag_counter + rank,
641 P->mpi_info->comm,
642 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
643 j += i;
644 } else
645 if (MY_DEBUG) fprintf(stderr, "rank %d WARNING empty send message %d %d %d %d %d %d\n", rank, send_neighbors, p, m, n, ptr[n], ptr[m]);
646 }
647 MPI_Waitall(send_neighbors+recv_neighbors,
648 P->col_coupler->mpi_requests,
649 P->col_coupler->mpi_stati);
650 P->mpi_info->msg_tag_counter += size;
651 if (MY_DEBUG) fprintf(stderr, "rank %d CP5_5\n", rank);
652
653 /* Last, send/receive the data array */
654 j = 0;
655 for (p=0; p<recv_neighbors; p++) { /* Receiving */
656 m = recv->offsetInShared[p];
657 n = recv->offsetInShared[p+1];
658 i = recv_ptr[n] - recv_ptr[m];
659 if (i > 0)
660 MPI_Irecv(&(recv_val[j]), i, MPI_DOUBLE, recv->neighbor[p],
661 P->mpi_info->msg_tag_counter + recv->neighbor[p],
662 P->mpi_info->comm,
663 &(P->col_coupler->mpi_requests[p]));
664 j += i;
665 }
666
667 j = 0;
668 for (p=0; p<send_neighbors; p++) { /* Sending */
669 m = send->offsetInShared[p];
670 n = send->offsetInShared[p+1];
671 i = ptr[n] - ptr[m];
672 if (i >0) {
673 MPI_Issend(&(val[j]), i, MPI_DOUBLE, send->neighbor[p],
674 P->mpi_info->msg_tag_counter + rank,
675 P->mpi_info->comm,
676 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
677 j += i;
678 }
679 }
680 MPI_Waitall(send_neighbors+recv_neighbors,
681 P->col_coupler->mpi_requests,
682 P->col_coupler->mpi_stati);
683 P->mpi_info->msg_tag_counter += size;
684 if (MY_DEBUG) fprintf(stderr, "rank %d CP5_7\n", rank);
685
686 /* Clean up and return with received ptr, index and data arrays */
687 MEMFREE(ptr);
688 MEMFREE(idx);
689 MEMFREE(val);
690 if (MY_DEBUG) fprintf(stderr, "rank %d len %d\n", rank, recv_ptr[recv_rows]);
691 *p_ptr = recv_ptr;
692 *p_idx = recv_idx;
693 *p_val = recv_val;
694 }
695
696 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
697 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
698 {
699 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
700 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
701 Paso_SystemMatrix *out=NULL;
702 Paso_SystemMatrixPattern *pattern=NULL;
703 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
704 Paso_SharedComponents *send =NULL, *recv=NULL;
705 Paso_Connector *col_connector=NULL, *row_connector=NULL;
706 Paso_Coupler *coupler=NULL;
707 Paso_Pattern *main_pattern=NULL;
708 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
709 const dim_t row_block_size=A->row_block_size;
710 const dim_t col_block_size=A->col_block_size;
711 const dim_t num_threads=omp_get_max_threads();
712 const double ZERO = 0.0;
713 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
714 double RAP_val, RA_val, R_val, *temp_val=NULL;
715 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
716 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
717 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
718 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
719 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
720 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
721 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
722 index_t sum, i, j, k, iptr;
723 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
724 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
725 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
726 index_t row_marker_ext, *where_p=NULL;
727 index_t **send_ptr=NULL, **send_idx=NULL;
728 dim_t l, p, q, global_label, num_neighbors;
729 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
730 Esys_MPI_rank *neighbor=NULL;
731 #ifdef ESYS_MPI
732 MPI_Request* mpi_requests=NULL;
733 MPI_Status* mpi_stati=NULL;
734 #else
735 int *mpi_requests=NULL, *mpi_stati=NULL;
736 #endif
737
738 if (size == 1) return NULL;
739 /* two sparse matrices R_main and R_couple will be generated, as the
740 transposes of P_main and P_col_couple, respectively. Note that
741 R_couple is actually the row_coupleBlock of R (R=P^T) */
742 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
743 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
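/* A CSR transpose a la Paso_SparseMatrix_getTranspose can be done in two
   counting passes -- a sketch of the standard technique, not necessarily
   the actual implementation:
       for (i=0; i<nnz; i++) cnt[idx[i]]++;       count entries per column
       ptr_T = exclusive prefix sums of cnt;      row offsets of the transpose
       for each row i, each k in [ptr[i], ptr[i+1]):
           j = idx[k]; q = ptr_T[j] + fill[j]++;
           idx_T[q] = i; val_T[q] = val[k];       scatter entry (i,j) into row j */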
744
745 if (MY_DEBUG) fprintf(stderr, "CP1\n");
746
747 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
748 and needed locally for the triple matrix product RAP.
749 To be specific, P_ext (on processor k) is the group of rows in P, where
750 the list of rows from processor q is given by
751 A->col_coupler->connector->send->shared[rPtr...]
752 rPtr=A->col_coupler->connector->send->offsetInShared[k]
753 on q.
754 P_ext is represented by two sparse matrices, P_ext_main and
755 P_ext_couple */
756 Paso_Preconditioner_AMG_extendB(A, P);
757 if (MY_DEBUG1) fprintf(stderr, "rank %d CP2\n", rank);
758
759 /* count the number of cols in P_ext_couple, resize the pattern of
760 sparse matrix P_ext_couple with new compressed order, and then
761 build the col id mapping from P->col_coupleBlock to
762 P_ext_couple */
763 num_Pmain_cols = P->mainBlock->numCols;
764 num_Pcouple_cols = P->col_coupleBlock->numCols;
765 num_Acouple_cols = A->col_coupleBlock->numCols;
766 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
767 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
768 offset = P->col_distribution->first_component[rank];
769 num_Pext_cols = 0;
770 if (P->global_id) {
771 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
772 and P->col_coupleBlock */
773 iptr = 0;
774 if (num_Pcouple_cols || sum > 0) {
775 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
776 for (; iptr<sum; iptr++){
777 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
778 if (MY_DEBUG && rank ==1)
779 fprintf(stderr, "rank %d remote_block[%d] = %d\n", rank, iptr, temp[iptr]);
780 }
781 for (j=0; j<num_Pcouple_cols; j++, iptr++){
782 temp[iptr] = P->global_id[j];
783 if (MY_DEBUG && rank ==1)
784 fprintf(stderr, "rank %d global_id[%d] = %d\n", rank, j, P->global_id[j]);
785 }
786 }
787 if (iptr) {
788 #ifdef USE_QSORTG
789 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
790 #else
791 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
792 #endif
793 num_Pext_cols = 1;
794 i = temp[0];
795 for (j=1; j<iptr; j++) {
796 if (temp[j] > i) {
797 i = temp[j];
798 temp[num_Pext_cols++] = i;
799 }
800 }
801 }
802
803 /* resize the pattern of P_ext_couple */
804 if(num_Pext_cols){
805 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
806 for (i=0; i<num_Pext_cols; i++)
807 global_id_P[i] = temp[i];
808 if (MY_DEBUG && rank == 1) {
809 int my_i, sum=num_Pext_cols;
810 char *str1, *str2;
811 str1 = TMPMEMALLOC(sum*100+100, char);
812 str2 = TMPMEMALLOC(100, char);
813 sprintf(str1, "rank %d global_id_P[%d] = (", rank, sum);
814 for (my_i=0; my_i<sum; my_i++) {
815 sprintf(str2, "%d ", global_id_P[my_i]);
816 strcat(str1, str2);
817 }
818 fprintf(stderr, "%s)\n", str1);
819 TMPMEMFREE(str1);
820 TMPMEMFREE(str2);
821 }
822
823 }
824 if (num_Pcouple_cols || sum > 0)
825 TMPMEMFREE(temp);
826 for (i=0; i<sum; i++) {
827 where_p = (index_t *)bsearch(
828 &(P->remote_coupleBlock->pattern->index[i]),
829 global_id_P, num_Pext_cols,
830 sizeof(index_t), Paso_comparIndex);
831 P->remote_coupleBlock->pattern->index[i] =
832 (index_t)(where_p -global_id_P);
833 }
834
835 /* build the mapping */
836 if (num_Pcouple_cols) {
837 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
838 iptr = 0;
839 for (i=0; i<num_Pext_cols; i++)
840 if (global_id_P[i] == P->global_id[iptr]) {
841 Pcouple_to_Pext[iptr++] = i;
842 if (iptr == num_Pcouple_cols) break;
843 }
844 }
845 }
846
847 if (MY_DEBUG) fprintf(stderr, "CP3\n");
848 if (MY_DEBUG && rank == 0) {
849 fprintf(stderr, "Rank %d Now Prolongation Matrix ************************\n", rank);
850 Paso_SystemMatrix_print(P);
851 fprintf(stderr, "Rank %d Now Restriction Matrix ************************\n", rank);
852 Paso_SystemMatrix_print(R);
853 }
854
855 /* alloc and initialize the markers */
856 sum = num_Pext_cols + num_Pmain_cols;
857 P_marker = TMPMEMALLOC(sum, index_t);
858 A_marker = TMPMEMALLOC(num_A_cols, index_t);
859 #pragma omp parallel for private(i) schedule(static)
860 for (i=0; i<sum; i++) P_marker[i] = -1;
861 #pragma omp parallel for private(i) schedule(static)
862 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
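/* The two marker arrays implement the usual symbolic-product trick:
   A_marker[i2] == i_r records that the RA pair (i_r, i2) has already
   been expanded for the current row i_r, and P_marker[i_c] >= row_marker
   records that column i_c already has an entry in row i_r of the
   product, so every output entry is counted (and later summed) once. */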
863
864 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_1 %d\n", rank, num_Pcouple_cols);
865 /* Now, count the size of RAP_ext. Start with rows in R_couple */
866 sum = 0;
867 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
868 row_marker = sum;
869 /* then, loop over elements in row i_r of R_couple */
870 j1_ub = R_couple->pattern->ptr[i_r+1];
871 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_2 %d (%d %d)\n", rank, i_r, R_couple->pattern->ptr[i_r], j1_ub);
872 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
873 i1 = R_couple->pattern->index[j1];
874 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_3 %d %d\n", rank, j1, i1);
875 /* then, loop over elements in row i1 of A->col_coupleBlock */
876 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
877 if (MY_DEBUG) fprintf(stderr, "rank %d CP3_4 %d %d %d\n", rank, j1, i1, j2_ub);
878 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
879 i2 = A->col_coupleBlock->pattern->index[j2];
880
881 /* check whether entry RA[i_r, i2] has been previously visited.
882 RAP new entry is possible only if entry RA[i_r, i2] has not
883 been visited yet */
884 if (A_marker[i2] != i_r) {
885 /* first, mark entry RA[i_r,i2] as visited */
886 A_marker[i2] = i_r;
887
888 /* then loop over elements in row i2 of P_ext_main */
889 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
890 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
891 i_c = P->row_coupleBlock->pattern->index[j3];
892
893 /* check whether entry RAP[i_r,i_c] has been created.
894 If not yet created, create the entry and increase
895 the total number of elements in RAP_ext */
896 if (P_marker[i_c] < row_marker) {
897 P_marker[i_c] = sum;
898 sum++;
899 }
900 }
901
902 /* loop over elements in row i2 of P_ext_couple, do the same */
903 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
904 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
905 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
906 if (P_marker[i_c] < row_marker) {
907 P_marker[i_c] = sum;
908 sum++;
909 }
910 }
911 }
912 }
913
914 /* now loop over elements in row i1 of A->mainBlock, repeat
915 the process we have done to block A->col_coupleBlock */
916 j2_ub = A->mainBlock->pattern->ptr[i1+1];
917 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
918 i2 = A->mainBlock->pattern->index[j2];
919 if (A_marker[i2 + num_Acouple_cols] != i_r) {
920 A_marker[i2 + num_Acouple_cols] = i_r;
921 j3_ub = P->mainBlock->pattern->ptr[i2+1];
922 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
923 i_c = P->mainBlock->pattern->index[j3];
924 if (P_marker[i_c] < row_marker) {
925 P_marker[i_c] = sum;
926 sum++;
927 }
928 }
929 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
930 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
931 /* note that we need to map the column index in
932 P->col_coupleBlock back into the column index in
933 P_ext_couple */
934 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
935 if (MY_DEBUG1 && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker exceeded (%d-%d out of %d+%d) 5\n", rank, i_c, P->col_coupleBlock->pattern->index[j3], num_Pext_cols, num_Pmain_cols);
936 if (P_marker[i_c] < row_marker) {
937 P_marker[i_c] = sum;
938 sum++;
939 }
940 }
941 }
942 }
943 }
944 }
945 if (MY_DEBUG1) fprintf(stderr, "rank %d CP4 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);
946
947 /* Now we have the number of non-zero elements in RAP_ext, allocate
948 PAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
949 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
950 RAP_ext_idx = MEMALLOC(sum, index_t);
951 RAP_ext_val = MEMALLOC(sum, double);
952
953 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
954 sum = num_Pext_cols + num_Pmain_cols;
955 #pragma omp parallel for private(i) schedule(static)
956 for (i=0; i<sum; i++) P_marker[i] = -1;
957 #pragma omp parallel for private(i) schedule(static)
958 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
959 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_1 sum %d\n", rank, sum);
960 sum = 0;
961 RAP_ext_ptr[0] = 0;
962 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
963 row_marker = sum;
964 if (MY_DEBUG) fprintf(stderr, "rank %d CP4_2 i_r %d sum %d\n", rank, i_r, sum);
965 /* then, loop over elements in row i_r of R_couple */
966 j1_ub = R_couple->pattern->ptr[i_r+1];
967 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
968 i1 = R_couple->pattern->index[j1];
969 R_val = R_couple->val[j1];
970
971 /* then, loop over elements in row i1 of A->col_coupleBlock */
972 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
973 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
974 i2 = A->col_coupleBlock->pattern->index[j2];
975 RA_val = R_val * A->col_coupleBlock->val[j2];
976
977 /* check whether entry RA[i_r, i2] has been previously visited.
978 RAP new entry is possible only if entry RA[i_r, i2] has not
979 been visited yet */
980 if (A_marker[i2] != i_r) {
981 /* first, mark entry RA[i_r,i2] as visited */
982 A_marker[i2] = i_r;
983
984 /* then loop over elements in row i2 of P_ext_main */
985 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
986 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
987 i_c = P->row_coupleBlock->pattern->index[j3];
988 RAP_val = RA_val * P->row_coupleBlock->val[j3];
989
990 /* check whether entry RAP[i_r,i_c] has been created.
991 If not yet created, create the entry and increase
992 the total number of elements in RAP_ext */
993 if (P_marker[i_c] < row_marker) {
994 P_marker[i_c] = sum;
995 RAP_ext_val[sum] = RAP_val;
996 RAP_ext_idx[sum] = i_c + offset;
997 if (MY_DEBUG){
998 if (rank == 1) fprintf(stderr, "1-ext[%d (%d,%d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);
999 }
1000 sum++;
1001 } else {
1002 RAP_ext_val[P_marker[i_c]] += RAP_val;
1003 if (MY_DEBUG){
1004 if (rank == 1) fprintf(stderr, "1-ext[(%d,%d)]+=%f\n", i_r, i_c, RAP_val);
1005 }
1006 }
1007 }
1008
1009 /* loop over elements in row i2 of P_ext_couple, do the same */
1010 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1011 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1012 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
1013 RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1014
1015 if (P_marker[i_c] < row_marker) {
1016 P_marker[i_c] = sum;
1017 RAP_ext_val[sum] = RAP_val;
1018 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1019 if (MY_DEBUG){
1020 if (rank == 1) fprintf(stderr, "ext[%d (%d,%d)]=%f (R%f, A(%d %d)%f, P%f)\n", sum, i_r, i_c, RAP_ext_val[sum], R_val, i1, i2, A->col_coupleBlock->val[j2], P->remote_coupleBlock->val[j3]);
1021 }
1022 sum++;
1023 } else {
1024 RAP_ext_val[P_marker[i_c]] += RAP_val;
1025 if (MY_DEBUG){
1026 if (rank == 1) fprintf(stderr, "ext[(%d,%d)]+=%f\n", i_r, i_c, RAP_val);
1027 }
1028 }
1029 }
1030
1031 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1032 Only the contributions are added. */
1033 } else {
1034 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1035 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1036 i_c = P->row_coupleBlock->pattern->index[j3];
1037 RAP_val = RA_val * P->row_coupleBlock->val[j3];
1038 RAP_ext_val[P_marker[i_c]] += RAP_val;
1039 if (MY_DEBUG){
1040 if (rank == 1) fprintf(stderr, "Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);
1041 }
1042 }
1043 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1044 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1045 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
1046 RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1047 RAP_ext_val[P_marker[i_c]] += RAP_val;
1048 if (MY_DEBUG){
1049 if (rank == 1) fprintf(stderr, "Visited ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);
1050 }
1051 }
1052 }
1053 }
1054
1055 /* now loop over elements in row i1 of A->mainBlock, repeat
1056 the process we have done to block A->col_coupleBlock */
1057 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1058 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1059 i2 = A->mainBlock->pattern->index[j2];
1060 RA_val = R_val * A->mainBlock->val[j2];
1061 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1062 A_marker[i2 + num_Acouple_cols] = i_r;
1063 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1064 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1065 i_c = P->mainBlock->pattern->index[j3];
1066 RAP_val = RA_val * P->mainBlock->val[j3];
1067 if (P_marker[i_c] < row_marker) {
1068 P_marker[i_c] = sum;
1069 RAP_ext_val[sum] = RAP_val;
1070 RAP_ext_idx[sum] = i_c + offset;
1071 if (MY_DEBUG){
1072 if (rank == 1) fprintf(stderr, "Main 1-ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);
1073 }
1074 sum++;
1075 } else {
1076 RAP_ext_val[P_marker[i_c]] += RAP_val;
1077 if (MY_DEBUG){
1078 if (rank == 1) fprintf(stderr, "Main 1-ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);
1079 }
1080 }
1081 }
1082 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1083 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1084 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1085 if (MY_DEBUG1 && (i_c < 0 || i_c >= num_Pext_cols + num_Pmain_cols)) fprintf(stderr, "rank %d access to P_marker exceeded (%d out of %d) 1\n", rank, i_c, num_Pext_cols + num_Pmain_cols);
1086 RAP_val = RA_val * P->col_coupleBlock->val[j3];
1087 if (P_marker[i_c] < row_marker) {
1088 P_marker[i_c] = sum;
1089 RAP_ext_val[sum] = RAP_val;
1090 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1091 if (MY_DEBUG){
1092 if (rank == 1) fprintf(stderr, "Main ext[%d (%d, %d)]=%f\n", sum, i_r, i_c, RAP_ext_val[sum]);
1093 }
1094 sum++;
1095 } else {
1096 RAP_ext_val[P_marker[i_c]] += RAP_val;
1097 if (MY_DEBUG){
1098 if (rank == 1) fprintf(stderr, "Main ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);
1099 }
1100 }
1101 }
1102 } else {
1103 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1104 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1105 i_c = P->mainBlock->pattern->index[j3];
1106 RAP_val = RA_val * P->mainBlock->val[j3];
1107 RAP_ext_val[P_marker[i_c]] += RAP_val;
1108 if (MY_DEBUG){
1109 if (rank == 1) fprintf(stderr, "Main Visited 1-ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);
1110 }
1111 }
1112 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1113 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1114 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1115 RAP_val = RA_val * P->col_coupleBlock->val[j3];
1116 RAP_ext_val[P_marker[i_c]] += RAP_val;
1117 if (MY_DEBUG){
1118 if (rank == 1) fprintf(stderr, "Main Visited ext[%d, %d]+=%f\n", i_r, i_c, RAP_val);
1119 }
1120 }
1121 }
1122 }
1123 }
1124 RAP_ext_ptr[i_r+1] = sum;
1125 }
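/* RAP_ext is now complete in CSR form (RAP_ext_ptr, RAP_ext_idx,
   RAP_ext_val), with one row for each column of P->col_coupleBlock. */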
1126 TMPMEMFREE(P_marker);
1127 TMPMEMFREE(Pcouple_to_Pext);
1128 if (MY_DEBUG) fprintf(stderr, "rank %d CP5 num_Pcouple_cols %d sum %d\n", rank, num_Pcouple_cols, sum);
1129
1130 if (MY_DEBUG) {
1131 char *str1, *str2;
1132 str1 = TMPMEMALLOC(sum*100+100, char);
1133 str2 = TMPMEMALLOC(15, char);
1134 sprintf(str1, "rank %d RAP_ext_idx[%d] = (", rank, sum);
1135 for (i=0; i<sum; i++) {
1136 sprintf(str2, "%d ", RAP_ext_idx[i]);
1137 strcat(str1, str2);
1138 }
1139 fprintf(stderr, "%s)\n", str1);
1140 TMPMEMFREE(str1);
1141 TMPMEMFREE(str2);
1142 }
1143
1144 /* Now we have the part of RAP[r,c] where row "r" runs over the list
1145 of rows given by the column list of P->col_coupleBlock, and
1146 column "c" runs over a list of columns which possibly covers the
1147 whole column range of the system matrix P. This part of the data is
1148 to be passed to the neighboring processors, and added to the
1149 corresponding RAP[r,c] entries on those processors */
1150 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1151 &RAP_ext_val, global_id_P);
1152 if (MY_DEBUG1) fprintf(stderr, "rank %d CP6\n", rank);
1153
1154 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1155 sum = RAP_ext_ptr[num_RAPext_rows];
1156 if (MY_DEBUG) fprintf(stderr, "rank %d CP6_0 %d %d (%d %d)\n", rank, sum, num_Pext_cols, num_RAPext_rows, num_Pcouple_cols);
1157 num_RAPext_cols = 0;
1158 if (num_Pext_cols || sum > 0) {
1159 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
1160 j1_ub = offset + num_Pmain_cols;
1161 for (i=0, iptr=0; i<sum; i++) {
1162 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */ {
1163 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1164 if (MY_DEBUG && rank ==1) fprintf(stderr, "RAP_ext_idx[%d]=%d\n", i, RAP_ext_idx[i]);
1165 }
1166 }
1167 for (j=0; j<num_Pext_cols; j++, iptr++){
1168 temp[iptr] = global_id_P[j];
1169 }
1170
1171 if (iptr) {
1172 #ifdef USE_QSORTG
1173 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1174 #else
1175 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1176 #endif
1177 num_RAPext_cols = 1;
1178 i = temp[0];
1179 for (j=1; j<iptr; j++) {
1180 if (temp[j] > i) {
1181 i = temp[j];
1182 temp[num_RAPext_cols++] = i;
1183 }
1184 }
1185
1186 if (MY_DEBUG) {
1187 char *str1, *str2;
1188 index_t my_sum=iptr, i;
1189 str1 = TMPMEMALLOC(my_sum*20+100, char);
1190 str2 = TMPMEMALLOC(15, char);
1191 sprintf(str1, "rank %d temp[%d] = (", rank, my_sum);
1192 for (i=0; i<my_sum; i++){
1193 sprintf(str2, "%d ", temp[i]);
1194 strcat(str1, str2);
1195 }
1196 fprintf(stderr, "%s)\n", str1);
1197 TMPMEMFREE(str1);
1198 TMPMEMFREE(str2);
1199 }
1200 }
1201 }
1202 if (MY_DEBUG) fprintf(stderr, "rank %d CP6_1 %d %d\n", rank, iptr, num_RAPext_cols);
1203
1204 /* record the global ids of the cols in RAP_ext */
1205 if(num_RAPext_cols){
1206 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
1207 for (i=0; i<num_RAPext_cols; i++)
1208 global_id_RAP[i] = temp[i];
1209 }
1210 if (MY_DEBUG) {
1211 char *str1, *str2;
1212 index_t my_sum=num_RAPext_cols, i;
1213 str1 = TMPMEMALLOC(my_sum*20+100, char);
1214 str2 = TMPMEMALLOC(15, char);
1215 sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, my_sum);
1216 for (i=0; i<my_sum; i++){
1217 sprintf(str2, "%d ", global_id_RAP[i]);
1218 strcat(str1, str2);
1219 }
1220 fprintf(stderr, "%s)\n", str1);
1221 TMPMEMFREE(str1);
1222 TMPMEMFREE(str2);
1223 }
1224 if (num_Pext_cols || sum > 0)
1225 TMPMEMFREE(temp);
1226 j1_ub = offset + num_Pmain_cols;
1227 for (i=0; i<sum; i++) {
1228 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1229 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1230 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1231 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1232 } else
1233 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1234 }
1235 if (MY_DEBUG) fprintf(stderr, "rank %d CP7 %d\n", rank, num_Pext_cols);
1236
1237 /* build the mapping */
1238 if (num_Pcouple_cols) {
1239 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
1240 iptr = 0;
1241 for (i=0; i<num_RAPext_cols; i++)
1242 if (global_id_RAP[i] == P->global_id[iptr]) {
1243 Pcouple_to_RAP[iptr++] = i;
1244 if (iptr == num_Pcouple_cols) break;
1245 }
1246 }
1247
1248 if (num_Pext_cols) {
1249 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
1250 iptr = 0;
1251 for (i=0; i<num_RAPext_cols; i++)
1252 if (global_id_RAP[i] == global_id_P[iptr]) {
1253 Pext_to_RAP[iptr++] = i;
1254 if (iptr == num_Pext_cols) break;
1255 }
1256 }
1257
1258 if (global_id_P){
1259 TMPMEMFREE(global_id_P);
1260 global_id_P = NULL;
1261 }
1262
1263 /* alloc and initialize the markers */
1264 sum = num_RAPext_cols + num_Pmain_cols;
1265 P_marker = TMPMEMALLOC(sum, index_t);
1266 #pragma omp parallel for private(i) schedule(static)
1267 for (i=0; i<sum; i++) P_marker[i] = -1;
1268 #pragma omp parallel for private(i) schedule(static)
1269 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1270 if (MY_DEBUG1) fprintf(stderr, "rank %d CP8 %d\n", rank, num_Pmain_cols);
1271
1272 /* Now, count the size of RAP. Start with rows in R_main */
1273 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1274 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1275 shared = P->col_coupler->connector->send->shared;
1276 i = 0;
1277 j = 0;
1278 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1279 /* Mark the start of row for both main block and couple block */
1280 row_marker = i;
1281 row_marker_ext = j;
1282
1283 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1284 in RAP_ext */
1285 P_marker[i_r] = i;
1286 i++;
1287 for (j1=0; j1<num_neighbors; j1++) {
1288 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1289 if (shared[j2] == i_r) {
1290 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1291 i_c = RAP_ext_idx[k];
1292 if (i_c < num_Pmain_cols) {
1293 if (P_marker[i_c] < row_marker) {
1294 P_marker[i_c] = i;
1295 i++;
1296 }
1297 } else {
1298 if (P_marker[i_c] < row_marker_ext) {
1299 P_marker[i_c] = j;
1300 j++;
1301 }
1302 }
1303 }
1304 break;
1305 }
1306 }
1307 }
1308
1309 /* then, loop over elements in row i_r of R_main */
1310 j1_ub = R_main->pattern->ptr[i_r+1];
1311 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1312 i1 = R_main->pattern->index[j1];
1313
1314 /* then, loop over elements in row i1 of A->col_coupleBlock */
1315 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1316 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1317 i2 = A->col_coupleBlock->pattern->index[j2];
1318
1319 /* check whether entry RA[i_r, i2] has been previously visited.
1320 RAP new entry is possible only if entry RA[i_r, i2] has not
1321 been visited yet */
1322 if (A_marker[i2] != i_r) {
1323 /* first, mark entry RA[i_r,i2] as visited */
1324 A_marker[i2] = i_r;
1325
1326 /* then loop over elements in row i2 of P_ext_main */
1327 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1328 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1329 i_c = P->row_coupleBlock->pattern->index[j3];
1330
1331 /* check whether entry RAP[i_r,i_c] has been created.
1332 If not yet created, create the entry and increase
1333 the total number of elements in RAP_ext */
1334 if (P_marker[i_c] < row_marker) {
1335 P_marker[i_c] = i;
1336 i++;
1337 }
1338 }
1339
1340 /* loop over elements in row i2 of P_ext_couple, do the same */
1341 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1342 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1343 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1344 if (P_marker[i_c] < row_marker_ext) {
1345 P_marker[i_c] = j;
1346 j++;
1347 }
1348 }
1349 }
1350 }
1351
1352 /* now loop over elements in row i1 of A->mainBlock, repeat
1353 the process we have done to block A->col_coupleBlock */
1354 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1355 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1356 i2 = A->mainBlock->pattern->index[j2];
1357 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1358 A_marker[i2 + num_Acouple_cols] = i_r;
1359 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1360 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1361 i_c = P->mainBlock->pattern->index[j3];
1362 if (P_marker[i_c] < row_marker) {
1363 P_marker[i_c] = i;
1364 i++;
1365 }
1366 }
1367 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1368 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1369 /* note that we need to map the column index in
1370 P->col_coupleBlock back into the column index in
1371 P_ext_couple */
1372 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1373 if (MY_DEBUG1 && (i_c < 0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker exceeded (%d out of %d) 2\n", rank, i_c, sum);
1374 if (P_marker[i_c] < row_marker_ext) {
1375 P_marker[i_c] = j;
1376 j++;
1377 }
1378 }
1379 }
1380 }
1381 }
1382 }
1383 if (MY_DEBUG) fprintf(stderr, "rank %d CP9 main_size%d couple_size%d\n", rank, i, j);
1384
1385 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1386 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1387 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1388 RAP_couple */
1389 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1390 RAP_main_idx = MEMALLOC(i, index_t);
1391 RAP_main_val = TMPMEMALLOC(i, double);
1392 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1393 RAP_couple_idx = MEMALLOC(j, index_t);
1394 RAP_couple_val = TMPMEMALLOC(j, double);
1395
1396 RAP_main_ptr[num_Pmain_cols] = i;
1397 RAP_couple_ptr[num_Pmain_cols] = j;
1398
1399 #pragma omp parallel for private(i) schedule(static)
1400 for (i=0; i<sum; i++) P_marker[i] = -1;
1401 #pragma omp parallel for private(i) schedule(static)
1402 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1403
1404 /* Now, fill in the data for RAP_main and RAP_couple. Start with rows
1405 in R_main */
1406 i = 0;
1407 j = 0;
1408 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1409 /* Mark the start of row for both main block and couple block */
1410 row_marker = i;
1411 row_marker_ext = j;
1412 RAP_main_ptr[i_r] = row_marker;
1413 RAP_couple_ptr[i_r] = row_marker_ext;
1414
1415 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1416 in row i_r of RAP_ext */
1417 P_marker[i_r] = i;
1418 RAP_main_val[i] = ZERO;
1419 RAP_main_idx[i] = i_r;
1420 i++;
1421
1422 for (j1=0; j1<num_neighbors; j1++) {
1423 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1424 if (shared[j2] == i_r) {
1425 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1426 i_c = RAP_ext_idx[k];
1427 if (i_c < num_Pmain_cols) {
1428 if (P_marker[i_c] < row_marker) {
1429 P_marker[i_c] = i;
1430 RAP_main_val[i] = RAP_ext_val[k];
1431 if (MY_DEBUG){
1432 if (rank == 0 && i == 2) fprintf(stderr, "ext %f\n", RAP_ext_val[k]);
1433 }
1434 RAP_main_idx[i] = i_c;
1435 i++;
1436 } else {
1437 RAP_main_val[P_marker[i_c]] += RAP_ext_val[k];
1438 if (MY_DEBUG){
1439 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "i_c %d ext %f\n", i_c, RAP_ext_val[k]);
1440 }
1441 }
1442 } else {
1443 if (P_marker[i_c] < row_marker_ext) {
1444 P_marker[i_c] = j;
1445 RAP_couple_val[j] = RAP_ext_val[k];
1446 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1447 j++;
1448 } else {
1449 RAP_couple_val[P_marker[i_c]] += RAP_ext_val[k];
1450 }
1451 }
1452 }
1453 break;
1454 }
1455 }
1456 }
1457
1458 /* then, loop over elements in row i_r of R_main */
1459 j1_ub = R_main->pattern->ptr[i_r+1];
1460 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1461 i1 = R_main->pattern->index[j1];
1462 R_val = R_main->val[j1];
1463
1464 /* then, loop over elements in row i1 of A->col_coupleBlock */
1465 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1466 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1467 i2 = A->col_coupleBlock->pattern->index[j2];
1468 RA_val = R_val * A->col_coupleBlock->val[j2];
1469
1470 /* check whether entry RA[i_r, i2] has been previously visited.
1471 RAP new entry is possible only if entry RA[i_r, i2] has not
1472 been visited yet */
1473 if (A_marker[i2] != i_r) {
1474 /* first, mark entry RA[i_r,i2] as visited */
1475 A_marker[i2] = i_r;
1476
1477 /* then loop over elements in row i2 of P_ext_main */
1478 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1479 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1480 i_c = P->row_coupleBlock->pattern->index[j3];
1481 RAP_val = RA_val * P->row_coupleBlock->val[j3];
1482
1483 /* check whether entry RAP[i_r,i_c] has been created.
1484 If not yet created, create the entry and increase
1485 the total number of elements in RAP_ext */
1486 if (P_marker[i_c] < row_marker) {
1487 P_marker[i_c] = i;
1488 RAP_main_val[i] = RAP_val;
1489 if (MY_DEBUG){
1490 if (rank == 0 && i == 2) fprintf(stderr, "R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
1491 }
1492 RAP_main_idx[i] = i_c;
1493 i++;
1494 } else {
1495 RAP_main_val[P_marker[i_c]] += RAP_val;
1496 if (MY_DEBUG){
1497 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1 , R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
1498 }
1499 }
1500 }
1501
1502 /* loop over elements in row i2 of P_ext_couple, do the same */
1503 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1504 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1505 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1506 RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1507 if (P_marker[i_c] < row_marker_ext) {
1508 P_marker[i_c] = j;
1509 RAP_couple_val[j] = RAP_val;
1510 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1511 j++;
1512 } else {
1513 RAP_couple_val[P_marker[i_c]] += RAP_val;
1514 }
1515 }
1516
1517 /* If entry RA[i_r, i2] has been visited, no new RAP entry is
1518 created; only the contributions are added. */
1519 } else {
1520 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1521 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1522 i_c = P->row_coupleBlock->pattern->index[j3];
1523 RAP_val = RA_val * P->row_coupleBlock->val[j3];
1524 RAP_main_val[P_marker[i_c]] += RAP_val;
1525 if (MY_DEBUG){
1526 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->col_coupleBlock->val[j2], i2, i_c, P->row_coupleBlock->val[j3]);
1527 }
1528 }
1529 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1530 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1531 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1532 RAP_val = RA_val * P->remote_coupleBlock->val[j3];
1533 RAP_couple_val[P_marker[i_c]] += RAP_val;
1534 }
1535 }
1536 }
1537
1538 /* now loop over elements in row i1 of A->mainBlock and repeat
1539 the process applied to block A->col_coupleBlock */
1540 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1541 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1542 i2 = A->mainBlock->pattern->index[j2];
1543 RA_val = R_val * A->mainBlock->val[j2];
1544 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1545 A_marker[i2 + num_Acouple_cols] = i_r;
1546 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1547 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1548 i_c = P->mainBlock->pattern->index[j3];
1549 RAP_val = RA_val * P->mainBlock->val[j3];
1550 if (P_marker[i_c] < row_marker) {
1551 P_marker[i_c] = i;
1552 RAP_main_val[i] = RAP_val;
1553 if (MY_DEBUG){
1554 if (rank == 0 && i == 2) fprintf(stderr, "Main R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
1555 }
1556 RAP_main_idx[i] = i_c;
1557 i++;
1558 } else {
1559 RAP_main_val[P_marker[i_c]] += RAP_val;
1560 if (MY_DEBUG){
1561 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main row_marker R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
1562 }
1563 }
1564 }
1565 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1566 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1567 /* note that we need to map the column index in
1568 P->col_coupleBlock back into the column index in
1569 P_ext_couple */
1570 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1571 if (MY_DEBUG1 && (i_c < 0 || i_c >= sum)) fprintf(stderr, "rank %d access to P_marker exceeded (%d out of %d--%d,%d) 3\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);
1572 RAP_val = RA_val * P->col_coupleBlock->val[j3];
1573 if (P_marker[i_c] < row_marker_ext) {
1574 P_marker[i_c] = j;
1575 RAP_couple_val[j] = RAP_val;
1576 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1577 j++;
1578 } else {
1579 RAP_couple_val[P_marker[i_c]] += RAP_val;
1580 }
1581 }
1582
1583 } else {
1584 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1585 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1586 i_c = P->mainBlock->pattern->index[j3];
1587 RAP_val = RA_val * P->mainBlock->val[j3];
1588 RAP_main_val[P_marker[i_c]] += RAP_val;
1589 if (MY_DEBUG){
1590 if (rank == 0 && P_marker[i_c] == 2) fprintf(stderr, "Main Visited R[%d %d] %f A[%d %d] %f P[%d %d] %f \n", i_r, i1, R_val, i1, i2, A->mainBlock->val[j2], i2, i_c, P->mainBlock->val[j3]);
1591 }
1592 }
1593 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1594 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1595 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1596 if (MY_DEBUG1 && i_c >= sum) fprintf(stderr, "rank %d access to P_marker exceeded (%d out of %d -- %d,%d) 4\n", rank, i_c, sum, num_RAPext_cols, num_Pmain_cols);
1597 RAP_val = RA_val * P->col_coupleBlock->val[j3];
1598 RAP_couple_val[P_marker[i_c]] += RAP_val;
1599 }
1600 }
1601 }
1602 }
1603
1604 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1605 if (i > row_marker) {
1606 offset = i - row_marker;
1607 temp = TMPMEMALLOC(offset, index_t);
1608 for (iptr=0; iptr<offset; iptr++)
1609 temp[iptr] = RAP_main_idx[row_marker+iptr];
1610 if (offset > 0) {
1611 #ifdef USE_QSORTG
1612 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1613 #else
1614 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1615 #endif
1616 }
1617 temp_val = TMPMEMALLOC(offset, double);
1618 for (iptr=0; iptr<offset; iptr++){
1619 k = P_marker[temp[iptr]];
1620 temp_val[iptr] = RAP_main_val[k];
1621 P_marker[temp[iptr]] = iptr + row_marker;
1622 }
1623 for (iptr=0; iptr<offset; iptr++){
1624 RAP_main_idx[row_marker+iptr] = temp[iptr];
1625 RAP_main_val[row_marker+iptr] = temp_val[iptr];
1626 }
1627 TMPMEMFREE(temp);
1628 TMPMEMFREE(temp_val);
1629 }
1630 if (j > row_marker_ext) {
1631 offset = j - row_marker_ext;
1632 temp = TMPMEMALLOC(offset, index_t);
1633 for (iptr=0; iptr<offset; iptr++)
1634 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1635 if (offset > 0) {
1636 #ifdef USE_QSORTG
1637 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1638 #else
1639 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1640 #endif
1641 }
1642 temp_val = TMPMEMALLOC(offset, double);
1643 for (iptr=0; iptr<offset; iptr++){
1644 if (MY_DEBUG1 && temp[iptr] >= num_RAPext_cols)
1645 fprintf(stderr, "rank %d index %d exceeds %d\n", rank, temp[iptr], num_RAPext_cols);
1646 k = P_marker[temp[iptr] + num_Pmain_cols];
1647 temp_val[iptr] = RAP_couple_val[k];
1648 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1649 }
1650 for (iptr=0; iptr<offset; iptr++){
1651 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1652 RAP_couple_val[row_marker_ext+iptr] = temp_val[iptr];
1653 }
1654 TMPMEMFREE(temp);
1655 TMPMEMFREE(temp_val);
1656 }
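/* Each completed row is post-processed above by copying its column indices
   into temp, qsort-ing them, and then gathering every value from its old
   slot via P_marker. A compact sketch of that permute-by-marker step
   (hypothetical names, illustration only; 'start' is the row's first slot
   and 'n' its entry count):

       for (p = 0; p < n; p++) {
           old_slot = marker[sorted_idx[p]];    // pre-sort position of value
           gathered_val[p] = val[old_slot];     // collect in sorted order
           marker[sorted_idx[p]] = start + p;   // marker now names new slot
       }
       // finally write sorted_idx/gathered_val back into idx/val at 'start'
*/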
1657 }
1658
1659 TMPMEMFREE(A_marker);
1660 TMPMEMFREE(Pext_to_RAP);
1661 TMPMEMFREE(Pcouple_to_RAP);
1662 MEMFREE(RAP_ext_ptr);
1663 MEMFREE(RAP_ext_idx);
1664 MEMFREE(RAP_ext_val);
1665 Paso_SparseMatrix_free(R_main);
1666 Paso_SparseMatrix_free(R_couple);
1667
1668 if (MY_DEBUG){
1669 if (rank == 0) fprintf(stderr, "A[%d]=%f A[%d]=%f A[%d]=%f A[%d]=%f \n", RAP_main_idx[0], RAP_main_val[0], RAP_main_idx[1], RAP_main_val[1], RAP_main_idx[2], RAP_main_val[2], RAP_main_idx[3], RAP_main_val[3]);
1670 }
1671 if (MY_DEBUG1) fprintf(stderr, "rank %d CP10\n", rank);
1672 if (MY_DEBUG1 && rank == 1) {
1673 int sum = num_RAPext_cols, my_i;
1674 char *str1, *str2;
1675 str1 = TMPMEMALLOC(sum*100+100, char);
1676 str2 = TMPMEMALLOC(100, char);
1677 sprintf(str1, "rank %d global_id_RAP[%d] = (", rank, sum);
1678 for (my_i=0; my_i<sum; my_i++){
1679 sprintf(str2, "%d ", global_id_RAP[my_i]);
1680 strcat(str1, str2);
1681 }
1682 fprintf(stderr, "%s)\n", str1);
1683 sprintf(str1, "rank %d distribution = (", rank);
1684 for (my_i=0; my_i<size; my_i++){
1685 sprintf(str2, "%d ", P->pattern->input_distribution->first_component[my_i+1]);
1686 strcat(str1, str2);
1687 }
1688 fprintf(stderr, "%s)\n", str1);
1689 TMPMEMFREE(str1);
1690 TMPMEMFREE(str2);
1691 }
1692
1693
1694 /* Check whether there are empty columns in RAP_couple */
1695 #pragma omp parallel for schedule(static) private(i)
1696 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1697 /* the number of non-empty columns is stored in "k" */
1698 k = 0;
1699 j = RAP_couple_ptr[num_Pmain_cols];
1700 if (MY_DEBUG) fprintf(stderr, "rank %d CP10_1 %d %d %d\n", rank, num_RAPext_cols, num_Pmain_cols, j);
1701 for (i=0; i<j; i++) {
1702 i1 = RAP_couple_idx[i];
1703 if (P_marker[i1]) {
1704 P_marker[i1] = 0;
1705 k++;
1706 }
1707 }
1708
1709 if (MY_DEBUG) fprintf(stderr, "rank %d CP10_2 %d\n", rank, k);
1710 /* empty columns were found, so renumber the remaining columns */
1711 if (k < num_RAPext_cols) {
1712 temp = TMPMEMALLOC(k, index_t);
1713 k = 0;
1714 for (i=0; i<num_RAPext_cols; i++)
1715 if (!P_marker[i]) {
1716 P_marker[i] = k;
1717 temp[k] = global_id_RAP[i];
1718 k++;
1719 }
1720 for (i=0; i<j; i++) {
1721 i1 = RAP_couple_idx[i];
1722 RAP_couple_idx[i] = P_marker[i1];
1723 }
1724 num_RAPext_cols = k;
1725 TMPMEMFREE(global_id_RAP);
1726 global_id_RAP = temp;
1727 }
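/* The compression above marks every referenced coupler column (the code
   clears P_marker from 1 to 0 on first use), then assigns dense new ids and
   keeps the matching global ids. The same relabelling in isolation, using
   the opposite flag convention for readability (a sketch with hypothetical
   names; 'idx' is a CSR index array of length 'nnz' over 'ncol' columns):

       used = 0;
       for (i = 0; i < ncol; i++) used_flag[i] = 0;
       for (i = 0; i < nnz; i++) used_flag[idx[i]] = 1;    // mark live columns
       for (i = 0; i < ncol; i++)
           new_id[i] = used_flag[i] ? used++ : -1;         // dense renumbering
       for (i = 0; i < nnz; i++) idx[i] = new_id[idx[i]];  // rewrite indices
*/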
1728 TMPMEMFREE(P_marker);
1729 if (MY_DEBUG) fprintf(stderr, "rank %d CP11\n", rank);
1730
1731 /******************************************************/
1732 /* Start to create the coarse level System Matrix A_c */
1733 /******************************************************/
1734 /* first, prepare the sender/receiver for the col_connector */
1735 dist = P->pattern->input_distribution->first_component;
1736 recv_len = TMPMEMALLOC(size, dim_t);
1737 send_len = TMPMEMALLOC(size, dim_t);
1738 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
1739 offsetInShared = TMPMEMALLOC(size+1, index_t);
1740 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
1741 memset(recv_len, 0, sizeof(dim_t) * size);
1742 memset(send_len, 0, sizeof(dim_t) * size);
1743 num_neighbors = 0;
1744 offsetInShared[0] = 0;
1745 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1746 if (MY_DEBUG && rank == 1) fprintf(stderr, "i%d j%d k%d %d %d\n", i, j, k, global_id_RAP[i], recv_len[j]);
1747 shared[i] = i + num_Pmain_cols;
1748 if (k <= global_id_RAP[i]) {
1749 if (recv_len[j] > 0) {
1750 neighbor[num_neighbors] = j;
1751 num_neighbors ++;
1752 offsetInShared[num_neighbors] = i;
1753 }
1754 while (k <= global_id_RAP[i]) {
1755 j++;
1756 k = dist[j+1];
1757 }
1758 }
1759 recv_len[j] ++;
1760 }
1761 if (recv_len[j] > 0) {
1762 neighbor[num_neighbors] = j;
1763 num_neighbors ++;
1764 offsetInShared[num_neighbors] = i;
1765 }
1766 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1767 neighbor, shared, offsetInShared, 1, 0, mpi_info);
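/* The sweep above relies on global_id_RAP being sorted: a single pass against
   the distribution boundaries dist[j+1] splits the coupler columns into
   per-owner runs. The core of that sweep, stripped of the neighbor
   bookkeeping (sketch, illustrative names; dist[j]..dist[j+1]-1 is the id
   range owned by rank j):

       j = 0;
       for (i = 0; i < n; i++) {
           while (ids[i] >= dist[j+1]) j++;  // advance to the owning rank
           count[j]++;                       // one more column from rank j
       }
*/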
1768
1769 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1770 if (MY_DEBUG1) fprintf(stderr, "rank %d CP12 %d %d\n", rank, num_neighbors, num_RAPext_cols);
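/* A single MPI_Alltoall with one int per rank converts "how much I expect
   from each rank" (recv_len) into "how much each rank expects from me"
   (send_len): slot q of my send buffer arrives in slot 'rank' of q's receive
   buffer. Standalone sketch (assumes an initialized communicator; names are
   illustrative):

       int counts_in[NRANKS];   // counts_in[q]: entries I want from rank q
       int counts_out[NRANKS];  // counts_out[q]: entries rank q wants from me
       MPI_Alltoall(counts_in, 1, MPI_INT, counts_out, 1, MPI_INT, comm);
*/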
1771
1772 #ifdef ESYS_MPI
1773 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
1774 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
1775 #else
1776 mpi_requests=TMPMEMALLOC(size*2,int);
1777 mpi_stati=TMPMEMALLOC(size*2,int);
1778 #endif
1779 num_neighbors = 0;
1780 j = 0;
1781 offsetInShared[0] = 0;
1782 for (i=0; i<size; i++) {
1783 if (send_len[i] > 0) {
1784 neighbor[num_neighbors] = i;
1785 num_neighbors ++;
1786 j += send_len[i];
1787 offsetInShared[num_neighbors] = j;
1788 }
1789 }
1790 TMPMEMFREE(shared);
1791 shared = TMPMEMALLOC(j, index_t);
1792 for (i=0, j=0; i<num_neighbors; i++) {
1793 k = neighbor[i];
1794 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1795 mpi_info->msg_tag_counter+k,
1796 mpi_info->comm, &mpi_requests[i]);
1797 j += send_len[k];
1798 }
1799 for (i=0, j=0; i<recv->numNeighbors; i++) {
1800 k = recv->neighbor[i];
1801 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1802 mpi_info->msg_tag_counter+rank,
1803 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1804 j += recv_len[k];
1805 }
1806 MPI_Waitall(num_neighbors + recv->numNeighbors,
1807 mpi_requests, mpi_stati);
1808 mpi_info->msg_tag_counter += size;
1809 if (MY_DEBUG) fprintf(stderr, "rank %d CP13\n", rank);
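/* The exchange above follows this file's standard pattern: post every
   MPI_Irecv first, then the matching MPI_Issend's, complete both sides with
   one MPI_Waitall, and advance msg_tag_counter by 'size' so the next round
   cannot match a stale message. Receivers tag with counter+source and
   senders with counter+own rank, so the tags agree. Skeleton of the pattern
   (illustrative names; counts on both sides are assumed consistent):

       for (p = 0; p < nrecv; p++)
           MPI_Irecv(rbuf[p], rcount[p], MPI_INT, rpeer[p],
                     tag0 + rpeer[p], comm, &req[p]);
       for (p = 0; p < nsend; p++)
           MPI_Issend(sbuf[p], scount[p], MPI_INT, speer[p],
                      tag0 + rank, comm, &req[nrecv + p]);
       MPI_Waitall(nrecv + nsend, req, stati);
       tag0 += size;   // retire this round's tags
*/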
1810
1811 j = offsetInShared[num_neighbors];
1812 offset = dist[rank];
1813 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1814 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1815 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1816
1817 col_connector = Paso_Connector_alloc(send, recv);
1818 TMPMEMFREE(shared);
1819 Paso_SharedComponents_free(recv);
1820 Paso_SharedComponents_free(send);
1821 if (MY_DEBUG1) fprintf(stderr, "rank %d CP14\n", rank);
1822
1823 /* now, create the row distribution (output_dist) and col
1824 distribution (input_dist) */
1825 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1826 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1827 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_1\n", rank);
1828
1829 /* then, prepare the sender/receiver for the row_connector. First,
1830 prepare the information for the sender */
1831 sum = RAP_couple_ptr[num_Pmain_cols];
1832 len = TMPMEMALLOC(size, dim_t);
1833 send_ptr = TMPMEMALLOC(size, index_t*);
1834 send_idx = TMPMEMALLOC(size, index_t*);
1835 for (i=0; i<size; i++) {
1836 send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
1837 send_idx[i] = TMPMEMALLOC(sum, index_t);
1838 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1839 }
1840 memset(len, 0, sizeof(dim_t) * size);
1841 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_2 %d\n", rank, sum);
1842 recv = col_connector->recv;
1843 sum=0;
1844 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1845 i1 = RAP_couple_ptr[i_r];
1846 i2 = RAP_couple_ptr[i_r+1];
1847 if (i2 > i1) {
1848 /* then row i_r will be in the sender of row_connector; now check
1849 how many neighbors i_r needs to be sent to */
1850 for (j=i1; j<i2; j++) {
1851 i_c = RAP_couple_idx[j];
1852 /* find out the corresponding neighbor "p" of column i_c */
1853 for (p=0; p<recv->numNeighbors; p++) {
1854 if (i_c < recv->offsetInShared[p+1]) {
1855 k = recv->neighbor[p];
1856 if (MY_DEBUG1 && rank == 1 && k == 1)
1857 fprintf(stderr, "******* i_r %d i_c %d p %d ub %d numNeighbors %d\n", i_r, i_c, p, recv->offsetInShared[p+1], recv->numNeighbors);
1858 if (send_ptr[k][i_r] == 0) sum++;
1859 send_ptr[k][i_r] ++;
1860 send_idx[k][len[k]] = global_id_RAP[i_c];
1861 len[k] ++;
1862 break;
1863 }
1864 }
1865 }
1866 }
1867 }
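/* The owner lookup above works because coupler columns are stored grouped by
   owning rank: the first p with i_c < recv->offsetInShared[p+1] identifies
   the neighbor interval containing column i_c. The lookup in isolation
   (sketch, illustrative names; a linear scan, as in the code):

       // offs[p] .. offs[p+1]-1 are the coupler columns owned by peer[p]
       for (p = 0; p < npeers; p++) {
           if (col < offs[p+1]) {
               owner = peer[p];
               break;
           }
       }
*/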
1868 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_3 %d\n", rank, send_ptr[1][0]);
1869 if (global_id_RAP) {
1870 TMPMEMFREE(global_id_RAP);
1871 global_id_RAP = NULL;
1872 }
1873 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_4 %d sum%d cols%d\n", rank, len[0], sum, num_RAPext_cols);
1874
1875 /* now allocate the sender */
1876 shared = TMPMEMALLOC(sum, index_t);
1877 memset(send_len, 0, sizeof(dim_t) * size);
1878 num_neighbors=0;
1879 offsetInShared[0] = 0;
1880 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_5 %d\n", rank, size);
1881 for (p=0, k=0; p<size; p++) {
1882 if (MY_DEBUG && rank == 1) fprintf(stderr, "rank %d CP14_6 %d %d %d\n", rank, p, k, num_neighbors);
1883 for (i=0; i<num_Pmain_cols; i++) {
1884 if (send_ptr[p][i] > 0) {
1885 shared[k] = i;
1886 k++;
1887 send_ptr[p][send_len[p]] = send_ptr[p][i];
1888 send_len[p]++;
1889 }
1890 }
1891 if (k > offsetInShared[num_neighbors]) {
1892 neighbor[num_neighbors] = p;
1893 num_neighbors ++;
1894 offsetInShared[num_neighbors] = k;
1895 }
1896 }
1897 if (MY_DEBUG) fprintf(stderr, "rank %d CP14_8 %d %d %d %d %d\n", rank, k, send_len[0], send_len[1], offsetInShared[1], k);
1898 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1899 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1900
1901 /* send/recv the number of rows to be sent from the current proc,
1902 recovering the receiver info of row_connector from the sender */
1903 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1904 if (MY_DEBUG) fprintf(stderr, "rank %d CP15 %d %d\n", rank, recv_len[0], recv_len[1]);
1905 num_neighbors = 0;
1906 offsetInShared[0] = 0;
1907 j = 0;
1908 for (i=0; i<size; i++) {
1909 if (i != rank && recv_len[i] > 0) {
1910 neighbor[num_neighbors] = i;
1911 num_neighbors ++;
1912 j += recv_len[i];
1913 offsetInShared[num_neighbors] = j;
1914 }
1915 }
1916 if (MY_DEBUG) fprintf(stderr, "rank %d CP15_1\n", rank);
1917 TMPMEMFREE(shared);
1918 TMPMEMFREE(recv_len);
1919 shared = TMPMEMALLOC(j, index_t);
1920 k = offsetInShared[num_neighbors];
1921 for (i=0; i<k; i++) {
1922 shared[i] = i + num_Pmain_cols;
1923 }
1924 if (MY_DEBUG) fprintf(stderr, "rank %d CP15_2\n", rank);
1925 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1926 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1927 if (MY_DEBUG) fprintf(stderr, "rank %d CP15_3\n", rank);
1928 row_connector = Paso_Connector_alloc(send, recv);
1929 TMPMEMFREE(shared);
1930 Paso_SharedComponents_free(recv);
1931 Paso_SharedComponents_free(send);
1932 if (MY_DEBUG1) fprintf(stderr, "rank %d CP16 %d R%d S%d\n", rank, k, num_neighbors, send->numNeighbors);
1933
1934 /* send/recv pattern->ptr for rowCoupleBlock */
1935 num_RAPext_rows = offsetInShared[num_neighbors];
1936 row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
1937 for (p=0; p<num_neighbors; p++) {
1938 j = offsetInShared[p];
1939 i = offsetInShared[p+1];
1940 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1941 mpi_info->msg_tag_counter+neighbor[p],
1942 mpi_info->comm, &mpi_requests[p]);
1943 if (MY_DEBUG) fprintf(stderr, "rank %d RECV %d from %d tag %d\n", rank, i-j, neighbor[p], mpi_info->msg_tag_counter+neighbor[p]);
1944 }
1945 send = row_connector->send;
1946 for (p=0; p<send->numNeighbors; p++) {
1947 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1948 MPI_INT, send->neighbor[p],
1949 mpi_info->msg_tag_counter+rank,
1950 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1951 if (MY_DEBUG) fprintf(stderr, "rank %d SEND %d TO %d tag %d\n", rank, send_len[send->neighbor[p]], send->neighbor[p], mpi_info->msg_tag_counter+rank);
1952 }
1953 MPI_Waitall(num_neighbors + send->numNeighbors,
1954 mpi_requests, mpi_stati);
1955 mpi_info->msg_tag_counter += size;
1956 TMPMEMFREE(send_len);
1957 if (MY_DEBUG1) fprintf(stderr, "rank %d CP17\n", rank);
1958
1959 sum = 0;
1960 for (i=0; i<num_RAPext_rows; i++) {
1961 k = row_couple_ptr[i];
1962 row_couple_ptr[i] = sum;
1963 sum += k;
1964 }
1965 row_couple_ptr[num_RAPext_rows] = sum;
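/* The loop above is an exclusive prefix sum: row_couple_ptr arrives holding
   per-row entry counts and leaves holding CSR offsets, with the grand total
   stored in the final slot. The same transform in isolation (illustrative
   names):

       sum = 0;
       for (i = 0; i < nrows; i++) {
           count  = ptr[i];   // number of entries in row i
           ptr[i] = sum;      // offset where row i starts
           sum   += count;
       }
       ptr[nrows] = sum;      // total number of entries
*/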
1966
1967 /* send/recv pattern->index for rowCoupleBlock */
1968 k = row_couple_ptr[num_RAPext_rows];
1969 row_couple_idx = MEMALLOC(k, index_t);
1970 for (p=0; p<num_neighbors; p++) {
1971 j1 = row_couple_ptr[offsetInShared[p]];
1972 j2 = row_couple_ptr[offsetInShared[p+1]];
1973 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1974 mpi_info->msg_tag_counter+neighbor[p],
1975 mpi_info->comm, &mpi_requests[p]);
1976 }
1977 for (p=0; p<send->numNeighbors; p++) {
1978 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1979 MPI_INT, send->neighbor[p],
1980 mpi_info->msg_tag_counter+rank,
1981 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1982 }
1983 MPI_Waitall(num_neighbors + send->numNeighbors,
1984 mpi_requests, mpi_stati);
1985 mpi_info->msg_tag_counter += size;
1986
1987 offset = input_dist->first_component[rank];
1988 k = row_couple_ptr[num_RAPext_rows];
1989 for (i=0; i<k; i++) {
1990 row_couple_idx[i] -= offset;
1991 }
1992
1993 for (i=0; i<size; i++) {
1994 TMPMEMFREE(send_ptr[i]);
1995 TMPMEMFREE(send_idx[i]);
1996 }
1997 TMPMEMFREE(send_ptr);
1998 TMPMEMFREE(send_idx);
1999 TMPMEMFREE(len);
2000 TMPMEMFREE(offsetInShared);
2001 TMPMEMFREE(neighbor);
2002 TMPMEMFREE(mpi_requests);
2003 TMPMEMFREE(mpi_stati);
2004 Esys_MPIInfo_free(mpi_info);
2005 if (MY_DEBUG1) fprintf(stderr, "rank %d CP18\n", rank);
2006
2007 /* Now, we can create pattern for mainBlock and coupleBlock */
2008 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
2009 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
2010 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2011 num_Pmain_cols, num_RAPext_cols,
2012 RAP_couple_ptr, RAP_couple_idx);
2013 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2014 num_RAPext_rows, num_Pmain_cols,
2015 row_couple_ptr, row_couple_idx);
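/* As used throughout this file, Paso_Pattern_alloc takes the matrix-format
   type, the output (row) and input (column) dimensions, and the CSR
   ptr/index arrays; note that RAP_main_ptr/RAP_main_idx are not freed below,
   so the arrays appear to be handed over rather than copied. A toy call for
   a 2x2 identity pattern under the same calling convention (sketch only,
   hypothetical variables):

       index_t *ptr = MEMALLOC(3, index_t);   // 2 rows -> 3 offsets
       index_t *idx = MEMALLOC(2, index_t);   // one entry per row
       ptr[0] = 0; ptr[1] = 1; ptr[2] = 2;
       idx[0] = 0; idx[1] = 1;
       eye = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, 2, 2, ptr, idx);
*/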
2016
2017 /* next, create the system matrix */
2018 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2019 output_dist, input_dist, main_pattern, col_couple_pattern,
2020 row_couple_pattern, col_connector, row_connector);
2021 out = Paso_SystemMatrix_alloc(A->type, pattern,
2022 row_block_size, col_block_size, FALSE);
2023 if (MY_DEBUG) fprintf(stderr, "rank %d CP19\n", rank);
2024
2025 /* finally, fill in the data */
2026 memcpy(out->mainBlock->val, RAP_main_val,
2027 out->mainBlock->len* sizeof(double));
2028 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2029 out->col_coupleBlock->len * sizeof(double));
2030
2031 /* Clean up */
2032 Paso_SystemMatrixPattern_free(pattern);
2033 Paso_Pattern_free(main_pattern);
2034 Paso_Pattern_free(col_couple_pattern);
2035 Paso_Pattern_free(row_couple_pattern);
2036 Paso_Connector_free(col_connector);
2037 Paso_Connector_free(row_connector);
2038 Paso_Distribution_free(output_dist);
2039 Paso_Distribution_free(input_dist);
2040 TMPMEMFREE(RAP_main_val);
2041 TMPMEMFREE(RAP_couple_val);
2042 if (Esys_noError()) {
2043 return out;
2044 } else {
2045 Paso_SystemMatrix_free(out);
2046 return NULL;
2047 }
2048 }
