/[escript]/trunk/paso/src/AMG_Interpolation.c

Contents of /trunk/paso/src/AMG_Interpolation.c

Revision 3829
Wed Feb 15 03:33:08 2012 UTC by lgao
File MIME type: text/plain
File size: 123322 byte(s)
Fix warnings from another machine.

1 /*******************************************************
2 *
3 * Copyright (c) 2003-2011 by University of Queensland
4 * Earth Systems Science Computational Center (ESSCC)
5 * http://www.uq.edu.au/esscc
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 *******************************************************/
12
13
14 /**************************************************************/
15
16 /* Paso: defines AMG Interpolation */
17
18 /**************************************************************/
19
20 /* Author: l.gao@uq.edu.au */
21
22 /**************************************************************/
23
24 #include "Paso.h"
25 #include "SparseMatrix.h"
26 #include "PasoUtil.h"
27 #include "Preconditioner.h"
28
29 /**************************************************************
30
31 Methods necessary for the AMG preconditioner:
32
33 construct the n_C x n_C coarse-grid operator A_C from the matrix A
34 and the prolongation matrix P.
35
36 The coarsening operator A_C is defined by A_C = R A P where R = P^T.
37
38 ***************************************************************/
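/* Dimension check: if A is n x n and P is n x n_C, then R = P^T is
   n_C x n, so A_C = R*A*P is n_C x n_C, i.e. the coarse-level
   operator assembled by the routines below. */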
39
40 /* Extend the system matrix B with two extra sparse matrices:
41 B_ext_main and B_ext_couple.
42 The combination of these two sparse matrices represents
43 the portion of B that is stored on neighbor procs and
44 needed locally for the triple matrix product (B^T) A B.
45
46 FOR NOW, we assume that the system matrix B has a NULL
47 row_coupleBlock and a NULL remote_coupleBlock. Based on
48 this assumption, we link row_coupleBlock to the sparse
49 matrix B_ext_main, and remote_coupleBlock to the sparse
50 matrix B_ext_couple.
51
52 To be specific, B_ext (on processor k) is the group of rows
53 of B where the list of rows coming from processor q is given by
54 A->col_coupler->connector->send->shared[rPtr...]
55 rPtr=A->col_coupler->connector->send->offsetInShared[k]
56 on q. */
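/* In other words: when a row is shipped to neighbor p, every entry whose
   global column id m lies inside p's owned column range [m_lb, m_ub)
   goes into B_ext_main (stored with the local column index m - m_lb),
   while all other entries go into B_ext_couple (stored with their
   global column ids). See the packing loops below. */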
57 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
58 {
59 Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
60 Paso_Coupler *coupler=NULL;
61 Paso_SharedComponents *send=NULL, *recv=NULL;
62 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
63 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
64 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
65 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
66 index_t *idx_m=NULL, *idx_c=NULL;
67 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
68 index_t offset, len, block_size, block_size_size, max_num_cols;
69 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
70 dim_t *recv_degree=NULL, *send_degree=NULL;
71 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
72
73 if (size == 1) return;
74
75 if (B->row_coupleBlock) {
76 Paso_SparseMatrix_free(B->row_coupleBlock);
77 B->row_coupleBlock = NULL;
78 }
79
80 if (B->row_coupleBlock || B->remote_coupleBlock) {
81 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
82 return;
83 }
84
85 /* send/receive the unknowns' global IDs */
86 num_main_cols = B->mainBlock->numCols;
87 cols = TMPMEMALLOC(num_main_cols, double);
88 offset = B->col_distribution->first_component[rank];
89 #pragma omp parallel for private(i) schedule(static)
90 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
91 if (B->global_id == NULL) {
92 coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
93 Paso_Coupler_startCollect(coupler, cols);
94 }
95
96 recv_buf = TMPMEMALLOC(size, index_t);
97 recv_degree = TMPMEMALLOC(size, dim_t);
98 recv_offset = TMPMEMALLOC(size+1, index_t);
99 #pragma omp parallel for private(i) schedule(static)
100 for (i=0; i<size; i++){
101 recv_buf[i] = 0;
102 recv_degree[i] = 1;
103 recv_offset[i] = i;
104 }
105
106 block_size = B->block_size;
107 block_size_size = block_size * sizeof(double);
108 num_couple_cols = B->col_coupleBlock->numCols;
109 send = A->col_coupler->connector->send;
110 recv = A->col_coupler->connector->recv;
111 num_neighbors = send->numNeighbors;
112 p = send->offsetInShared[num_neighbors];
113 len = p * B->col_distribution->first_component[size];
114 send_buf = TMPMEMALLOC(len * block_size, double);
115 send_idx = TMPMEMALLOC(len, index_t);
116 send_offset = TMPMEMALLOC((p+1)*2, index_t);
117 send_degree = TMPMEMALLOC(num_neighbors, dim_t);
118 i = num_main_cols + num_couple_cols;
119 send_m = TMPMEMALLOC(i * block_size, double);
120 send_c = TMPMEMALLOC(i * block_size, double);
121 idx_m = TMPMEMALLOC(i, index_t);
122 idx_c = TMPMEMALLOC(i, index_t);
123
124 /* wait for the unknowns' global IDs to arrive */
125 if (B->global_id == NULL) {
126 Paso_Coupler_finishCollect(coupler);
127 global_id = MEMALLOC(num_couple_cols, index_t);
128 #pragma omp parallel for private(i) schedule(static)
129 for (i=0; i<num_couple_cols; ++i)
130 global_id[i] = coupler->recv_buffer[i];
131 B->global_id = global_id;
132 Paso_Coupler_free(coupler);
133 } else
134 global_id = B->global_id;
135
136 /* distribute the number of cols in current col_coupleBlock to all ranks */
137 #ifdef ESYS_MPI
138 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
139 #endif
140
141 /* distribute the global ids of the cols to be considered to all ranks */
142 len = 0;
143 max_num_cols = 0;
144 for (i=0; i<size; i++){
145 recv_degree[i] = recv_buf[i];
146 recv_offset[i] = len;
147 len += recv_buf[i];
148 if (max_num_cols < recv_buf[i])
149 max_num_cols = recv_buf[i];
150 }
151 recv_offset[size] = len;
152 cols_array = TMPMEMALLOC(len, index_t);
153 #ifdef ESYS_MPI
154 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
155 #endif
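/* At this point cols_array concatenates every rank's col_coupleBlock
   column ids, with rank q's section starting at recv_offset[q]. For
   example, with 3 ranks owning 2, 1 and 3 couple columns:
   recv_degree = {2,1,3} and recv_offset = {0,2,3,6}. */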
156
157 /* first, prepare the ptr_ptr to be received */
158 q = recv->numNeighbors;
159 len = recv->offsetInShared[q];
160 ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
161 for (p=0; p<q; p++) {
162 row = recv->offsetInShared[p];
163 m = recv->offsetInShared[p + 1];
164 #ifdef ESYS_MPI
165 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
166 A->mpi_info->msg_tag_counter+recv->neighbor[p],
167 A->mpi_info->comm,
168 &(A->col_coupler->mpi_requests[p]));
169 #endif
170 }
171
172 /* now prepare the rows to be sent (the degree, the offset and the data) */
173 len = 0;
174 i0 = 0;
175 for (p=0; p<num_neighbors; p++) {
176 i = i0;
177 neighbor = send->neighbor[p];
178 m_lb = B->col_distribution->first_component[neighbor];
179 m_ub = B->col_distribution->first_component[neighbor + 1];
180 j_ub = send->offsetInShared[p + 1];
181 for (j=send->offsetInShared[p]; j<j_ub; j++) {
182 row = send->shared[j];
183 l_m = 0;
184 l_c = 0;
185 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
186 k_lb = B->col_coupleBlock->pattern->ptr[row];
187
188 /* check part of col_coupleBlock for data to be passed @row */
189 for (k=k_lb; k<k_ub; k++) {
190 m = global_id[B->col_coupleBlock->pattern->index[k]];
191 if (m > offset) break;
192 if (m>= m_lb && m < m_ub) {
193 /* data to be passed to sparse matrix B_ext_main */
194 idx_m[l_m] = m - m_lb;
195 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
196 l_m++;
197 } else {
198 /* data to be passed to sparse matrix B_ext_couple */
199 idx_c[l_c] = m;
200 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
201 l_c++;
202 }
203 }
204 k_lb = k;
205
206 /* check mainBlock for data to be passed @row to sparse
207 matrix B_ext_couple */
208 k_ub = B->mainBlock->pattern->ptr[row + 1];
209 k = B->mainBlock->pattern->ptr[row];
210 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
211 for (; k<k_ub; k++) {
212 m = B->mainBlock->pattern->index[k] + offset;
213 idx_c[l_c] = m;
214 l_c++;
215 }
216
217 /* check the remaining part of col_coupleBlock for data to
218 be passed @row to sparse matrix B_ext_couple */
219 k = k_lb;
220 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
221 for (k=k_lb; k<k_ub; k++) {
222 m = global_id[B->col_coupleBlock->pattern->index[k]];
223 if (m>= m_lb && m < m_ub) {
224 /* data to be passed to sparse matrix B_ext_main */
225 idx_m[l_m] = m - m_lb;
226 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
227 l_m++;
228 } else {
229 /* data to be passed to sparse matrix B_ext_couple */
230 idx_c[l_c] = m;
231 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
232 l_c++;
233 }
234 }
235
236 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
237 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
238 send_offset[2*i] = l_m;
239 len += l_m;
240 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
241 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
242 send_offset[2*i+1] = l_c;
243 len += l_c;
244 i++;
245 }
246
247 /* sending */
248 #ifdef ESYS_MPI
249 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
250 A->mpi_info->msg_tag_counter+rank,
251 A->mpi_info->comm,
252 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
253 #endif
254 send_degree[p] = len;
255 i0 = i;
256 }
257 TMPMEMFREE(send_m);
258 TMPMEMFREE(send_c);
259 TMPMEMFREE(idx_m);
260 TMPMEMFREE(idx_c);
261
262
263 q = recv->numNeighbors;
264 len = recv->offsetInShared[q];
265 ptr_main = MEMALLOC((len+1), index_t);
266 ptr_couple = MEMALLOC((len+1), index_t);
267
268 #ifdef ESYS_MPI
269 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
270 A->col_coupler->mpi_requests,
271 A->col_coupler->mpi_stati);
272 #endif
273 A->mpi_info->msg_tag_counter += size;
274
275 j = 0;
276 k = 0;
277 ptr_main[0] = 0;
278 ptr_couple[0] = 0;
279 for (i=0; i<len; i++) {
280 j += ptr_ptr[2*i];
281 k += ptr_ptr[2*i+1];
282 ptr_main[i+1] = j;
283 ptr_couple[i+1] = k;
284 }
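/* ptr_ptr holds the interleaved (main, couple) row degrees received from
   the neighbors; the running sums above turn them into the two CSR
   row-pointer arrays, e.g. main degrees {2,0,1} give ptr_main = {0,2,2,3}. */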
285
286 TMPMEMFREE(ptr_ptr);
287 idx_main = MEMALLOC(j, index_t);
288 idx_couple = MEMALLOC(k, index_t);
289 ptr_idx = TMPMEMALLOC(j+k, index_t);
290 ptr_val = TMPMEMALLOC((j+k) * block_size, double);
291
292 /* send/receive index array */
293 j=0;
294 k_ub = 0;
295 for (p=0; p<recv->numNeighbors; p++) {
296 k = recv->offsetInShared[p];
297 m = recv->offsetInShared[p+1];
298 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
299 if (i > 0) {
300 k_ub ++;
301 #ifdef ESYS_MPI
302 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
303 A->mpi_info->msg_tag_counter+recv->neighbor[p],
304 A->mpi_info->comm,
305 &(A->col_coupler->mpi_requests[p]));
306 #endif
307 }
308 j += i;
309 }
310
311 j=0;
312 k_ub = 0;
313 for (p=0; p<num_neighbors; p++) {
314 i = send_degree[p] - j;
315 if (i > 0){
316 k_ub ++;
317 #ifdef ESYS_MPI
318 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
319 A->mpi_info->msg_tag_counter+rank,
320 A->mpi_info->comm,
321 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
322 #endif
323 }
324 j = send_degree[p];
325 }
326
327 #ifdef ESYS_MPI
328 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
329 A->col_coupler->mpi_requests,
330 A->col_coupler->mpi_stati);
331 #endif
332 A->mpi_info->msg_tag_counter += size;
333
334 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
335 for (i=0; i<len; i++) {
336 j = ptr_main[i];
337 k = ptr_main[i+1];
338 m = ptr_couple[i];
339 for (p=j; p<k; p++) {
340 idx_main[p] = ptr_idx[m+p];
341 }
342 j = ptr_couple[i+1];
343 for (p=m; p<j; p++) {
344 idx_couple[p] = ptr_idx[k+p];
345 }
346 }
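/* The unpacking relies on the layout of ptr_idx: for row i the main
   entries come first, then the couple entries, the row starting at
   ptr_main[i]+ptr_couple[i] (= j+m). Hence the main part is read at
   ptr_idx[m+p] for p in [j,k) and the couple part at ptr_idx[k+p]
   for p in [m, ptr_couple[i+1]). */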
347 TMPMEMFREE(ptr_idx);
348
349 /* allocate pattern and sparsematrix for B_ext_main */
350 pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
351 len, num_main_cols, ptr_main, idx_main);
352 B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
353 pattern_main, B->row_block_size, B->col_block_size,
354 FALSE);
355 Paso_Pattern_free(pattern_main);
356
357 /* allocate pattern and sparsematrix for B_ext_couple */
358 pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
359 len, B->col_distribution->first_component[size],
360 ptr_couple, idx_couple);
361 B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
362 pattern_couple, B->row_block_size, B->col_block_size,
363 FALSE);
364 Paso_Pattern_free(pattern_couple);
365
366 /* send/receive value array */
367 j=0;
368 for (p=0; p<recv->numNeighbors; p++) {
369 k = recv->offsetInShared[p];
370 m = recv->offsetInShared[p+1];
371 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
372 #ifdef ESYS_MPI
373 if (i > 0)
374 MPI_Irecv(&(ptr_val[j]), i * block_size,
375 MPI_DOUBLE, recv->neighbor[p],
376 A->mpi_info->msg_tag_counter+recv->neighbor[p],
377 A->mpi_info->comm,
378 &(A->col_coupler->mpi_requests[p]));
379 #endif
380 j += (i * block_size);
381 }
382
383 j=0;
384 for (p=0; p<num_neighbors; p++) {
385 i = send_degree[p] - j;
386 #ifdef ESYS_MPI
387 if (i > 0)
388 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
389 A->mpi_info->msg_tag_counter+rank,
390 A->mpi_info->comm,
391 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
392 #endif
393 j = send_degree[p] ;
394 }
395
396 #ifdef ESYS_MPI
397 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
398 A->col_coupler->mpi_requests,
399 A->col_coupler->mpi_stati);
400 #endif
401 A->mpi_info->msg_tag_counter += size;
402
403 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
404 for (i=0; i<len; i++) {
405 j = ptr_main[i];
406 k = ptr_main[i+1];
407 m = ptr_couple[i];
408 for (p=j; p<k; p++) {
409 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
410 }
411 j = ptr_couple[i+1];
412 for (p=m; p<j; p++) {
413 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
414 }
415 }
416 TMPMEMFREE(ptr_val);
417
418
419 /* release all temporary memory allocations */
420 TMPMEMFREE(cols);
421 TMPMEMFREE(cols_array);
422 TMPMEMFREE(recv_offset);
423 TMPMEMFREE(recv_degree);
424 TMPMEMFREE(recv_buf);
425 TMPMEMFREE(send_buf);
426 TMPMEMFREE(send_offset);
427 TMPMEMFREE(send_degree);
428 TMPMEMFREE(send_idx);
429 }
430
431 /* The sparse matrix T, defined by T(ptr, idx, val),
432 has the same number of rows as P->col_coupleBlock->numCols. Now we need
433 to copy blocks of data in T to the neighbor processors given by
434 P->col_coupler->connector->recv->neighbor[k], where k is in
435 [0, P->col_coupler->connector->recv->numNeighbors).
436 The rows to be copied to neighbor processor k are given by the list
437 P->col_coupler->connector->recv->offsetInShared[k] ...
438 P->col_coupler->connector->recv->offsetInShared[k+1] */
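/* Note the role reversal below: the data travels against the usual
   coupler direction, so the local "send" side is built from
   connector->recv and the "recv" side from connector->send. */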
439 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
440 index_t **p_ptr, index_t **p_idx, double **p_val,
441 index_t *global_id, index_t block_size)
442 {
443 Paso_SharedComponents *send=NULL, *recv=NULL;
444 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
445 index_t i, j, p, m, n, rank, size;
446 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
447 index_t *ptr=*p_ptr, *idx=*p_idx;
448 double *val=*p_val, *recv_val=NULL;
449
450 rank = P->mpi_info->rank;
451 size = P->mpi_info->size;
452 send = P->col_coupler->connector->recv;
453 recv = P->col_coupler->connector->send;
454 send_neighbors = send->numNeighbors;
455 recv_neighbors = recv->numNeighbors;
456 send_rows = P->col_coupleBlock->numCols;
457 recv_rows = recv->offsetInShared[recv_neighbors];
458
459 send_degree = TMPMEMALLOC(send_rows, index_t);
460 recv_ptr = MEMALLOC(recv_rows + 1, index_t);
461 #pragma omp for schedule(static) private(i)
462 for (i=0; i<send_rows; i++)
463 send_degree[i] = ptr[i+1] - ptr[i];
464
465 /* First, send/receive the degree */
466 for (p=0; p<recv_neighbors; p++) { /* Receiving */
467 m = recv->offsetInShared[p];
468 n = recv->offsetInShared[p+1];
469 #ifdef ESYS_MPI
470 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
471 P->mpi_info->msg_tag_counter + recv->neighbor[p],
472 P->mpi_info->comm,
473 &(P->col_coupler->mpi_requests[p]));
474 #endif
475 }
476 for (p=0; p<send_neighbors; p++) { /* Sending */
477 m = send->offsetInShared[p];
478 n = send->offsetInShared[p+1];
479 #ifdef ESYS_MPI
480 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
481 P->mpi_info->msg_tag_counter + rank,
482 P->mpi_info->comm,
483 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
484 #endif
485 }
486 #ifdef ESYS_MPI
487 MPI_Waitall(send_neighbors+recv_neighbors,
488 P->col_coupler->mpi_requests,
489 P->col_coupler->mpi_stati);
490 #endif
491 P->mpi_info->msg_tag_counter += size;
492
493 TMPMEMFREE(send_degree);
494 m = Paso_Util_cumsum(recv_rows, recv_ptr);
495 recv_ptr[recv_rows] = m;
496 recv_idx = MEMALLOC(m, index_t);
497 recv_val = MEMALLOC(m * block_size, double);
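/* Here Paso_Util_cumsum turns the received row degrees stored in
   recv_ptr into exclusive prefix sums and returns their total, which
   becomes the CSR sentinel recv_ptr[recv_rows]; e.g. degrees {2,1,3}
   become {0,2,3} with total 6. */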
498
499 /* Next, send/receive the index array */
500 j = 0;
501 for (p=0; p<recv_neighbors; p++) { /* Receiving */
502 m = recv->offsetInShared[p];
503 n = recv->offsetInShared[p+1];
504 i = recv_ptr[n] - recv_ptr[m];
505 if (i > 0) {
506 #ifdef ESYS_MPI
507 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
508 P->mpi_info->msg_tag_counter + recv->neighbor[p],
509 P->mpi_info->comm,
510 &(P->col_coupler->mpi_requests[p]));
511 #endif
512 }
513 j += i;
514 }
515
516 j = 0;
517 for (p=0; p<send_neighbors; p++) { /* Sending */
518 m = send->offsetInShared[p];
519 n = send->offsetInShared[p+1];
520 i = ptr[n] - ptr[m];
521 if (i >0) {
522 #ifdef ESYS_MPI
523 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
524 P->mpi_info->msg_tag_counter + rank,
525 P->mpi_info->comm,
526 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
527 #endif
528 j += i;
529 }
530 }
531 #ifdef ESYS_MPI
532 MPI_Waitall(send_neighbors+recv_neighbors,
533 P->col_coupler->mpi_requests,
534 P->col_coupler->mpi_stati);
535 #endif
536 P->mpi_info->msg_tag_counter += size;
537
538 /* Last, send/receive the data array */
539 j = 0;
540 for (p=0; p<recv_neighbors; p++) { /* Receiving */
541 m = recv->offsetInShared[p];
542 n = recv->offsetInShared[p+1];
543 i = recv_ptr[n] - recv_ptr[m];
544 #ifdef ESYS_MPI
545 if (i > 0)
546 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
547 P->mpi_info->msg_tag_counter + recv->neighbor[p],
548 P->mpi_info->comm,
549 &(P->col_coupler->mpi_requests[p]));
550 #endif
551 j += (i*block_size);
552 }
553
554 j = 0;
555 for (p=0; p<send_neighbors; p++) { /* Sending */
556 m = send->offsetInShared[p];
557 n = send->offsetInShared[p+1];
558 i = ptr[n] - ptr[m];
559 if (i >0) {
560 #ifdef ESYS_MPI
561 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
562 P->mpi_info->msg_tag_counter + rank,
563 P->mpi_info->comm,
564 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
565 #endif
566 j += i * block_size;
567 }
568 }
569 #ifdef ESYS_MPI
570 MPI_Waitall(send_neighbors+recv_neighbors,
571 P->col_coupler->mpi_requests,
572 P->col_coupler->mpi_stati);
573 #endif
574 P->mpi_info->msg_tag_counter += size;
575
576 /* Clean up and return with the received ptr, index and data arrays */
577 MEMFREE(ptr);
578 MEMFREE(idx);
579 MEMFREE(val);
580 *p_ptr = recv_ptr;
581 *p_idx = recv_idx;
582 *p_val = recv_val;
583 }
584
585 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
586 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
587 {
588 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
589 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
590 Paso_SystemMatrix *out=NULL;
591 Paso_SystemMatrixPattern *pattern=NULL;
592 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
593 Paso_SharedComponents *send =NULL, *recv=NULL;
594 Paso_Connector *col_connector=NULL, *row_connector=NULL;
595 Paso_Pattern *main_pattern=NULL;
596 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
597 const dim_t row_block_size=A->row_block_size;
598 const dim_t col_block_size=A->col_block_size;
599 const dim_t block_size = A->block_size;
600 const dim_t P_block_size = P->block_size;
601 const double ZERO = 0.0;
602 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
603 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
604 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
605 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
606 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
607 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
608 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
609 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
610 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
611 index_t sum, i, j, k, iptr, irb, icb, ib;
612 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
613 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
614 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
615 index_t row_marker_ext, *where_p=NULL;
616 index_t **send_ptr=NULL, **send_idx=NULL;
617 dim_t p, num_neighbors;
618 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
619 Esys_MPI_rank *neighbor=NULL;
620 #ifdef ESYS_MPI
621 MPI_Request* mpi_requests=NULL;
622 MPI_Status* mpi_stati=NULL;
623 #else
624 int *mpi_requests=NULL, *mpi_stati=NULL;
625 #endif
626
627 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
628 return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
629
630 /* two sparse matrices, R_main and R_couple, will be generated as the
631 transposes of P_main and P_col_couple, respectively. Note that
632 R_couple is actually the row_coupleBlock of R (R=P^T) */
633 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
634 if (size > 1)
635 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
636
637 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
638 and needed locally for the triple matrix product RAP.
639 To be specific, P_ext (on processor k) is the group of rows of P where
640 the list of rows coming from processor q is given by
641 A->col_coupler->connector->send->shared[rPtr...]
642 rPtr=A->col_coupler->connector->send->offsetInShared[k]
643 on q.
644 P_ext is represented by the two sparse matrices P_ext_main and
645 P_ext_couple */
646 Paso_Preconditioner_AMG_extendB(A, P);
647
648 /* count the number of cols in P_ext_couple, resize the pattern of
649 sparse matrix P_ext_couple with new compressed order, and then
650 build the col id mapping from P->col_coupleBlock to
651 P_ext_couple */
652 num_Pmain_cols = P->mainBlock->numCols;
653 if (size > 1) {
654 num_Pcouple_cols = P->col_coupleBlock->numCols;
655 num_Acouple_cols = A->col_coupleBlock->numCols;
656 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
657 } else {
658 num_Pcouple_cols = 0;
659 num_Acouple_cols = 0;
660 sum = 0;
661 }
662 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
663 offset = P->col_distribution->first_component[rank];
664 num_Pext_cols = 0;
665 if (P->global_id) {
666 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
667 and P->col_coupleBlock */
668 iptr = 0;
669 if (num_Pcouple_cols || sum > 0) {
670 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
671 #pragma omp parallel for lastprivate(iptr) schedule(static)
672 for (iptr=0; iptr<sum; iptr++){
673 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
674 }
675 for (j=0; j<num_Pcouple_cols; j++, iptr++){
676 temp[iptr] = P->global_id[j];
677 }
678 }
679 if (iptr) {
680 #ifdef USE_QSORTG
681 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
682 #else
683 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
684 #endif
685 num_Pext_cols = 1;
686 i = temp[0];
687 for (j=1; j<iptr; j++) {
688 if (temp[j] > i) {
689 i = temp[j];
690 temp[num_Pext_cols++] = i;
691 }
692 }
693 }
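/* temp is now sorted and the pass above squeezes out duplicates in
   place, e.g. {3,3,5,9,9} -> {3,5,9} with num_Pext_cols = 3; the
   surviving ids are the distinct global columns of P_ext_couple. */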
694 /* resize the pattern of P_ext_couple */
695 if(num_Pext_cols){
696 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
697 #pragma omp parallel for private(i) schedule(static)
698 for (i=0; i<num_Pext_cols; i++)
699 global_id_P[i] = temp[i];
700 }
701 if (num_Pcouple_cols || sum > 0)
702 TMPMEMFREE(temp);
703 #pragma omp parallel for private(i, where_p) schedule(static)
704 for (i=0; i<sum; i++) {
705 where_p = (index_t *)bsearch(
706 &(P->remote_coupleBlock->pattern->index[i]),
707 global_id_P, num_Pext_cols,
708 sizeof(index_t), Paso_comparIndex);
709 P->remote_coupleBlock->pattern->index[i] =
710 (index_t)(where_p -global_id_P);
711 }
712
713 /* build the mapping */
714 if (num_Pcouple_cols) {
715 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
716 iptr = 0;
717 for (i=0; i<num_Pext_cols; i++)
718 if (global_id_P[i] == P->global_id[iptr]) {
719 Pcouple_to_Pext[iptr++] = i;
720 if (iptr == num_Pcouple_cols) break;
721 }
722 }
723 }
724
725 /* allocate and initialize the markers */
726 sum = num_Pext_cols + num_Pmain_cols;
727 P_marker = TMPMEMALLOC(sum, index_t);
728 A_marker = TMPMEMALLOC(num_A_cols, index_t);
729 #pragma omp parallel for private(i) schedule(static)
730 for (i=0; i<sum; i++) P_marker[i] = -1;
731 #pragma omp parallel for private(i) schedule(static)
732 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
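/* P_marker and A_marker implement the usual sparse-accumulator trick
   for symbolic products: P_marker[i_c] < row_marker (the nnz count at
   the start of the current row) means column i_c is new for this row,
   and A_marker records the last row i_r that touched a column of A,
   so neither array needs to be reset between rows. */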
733
734 /* Now, count the size of RAP_ext. Start with rows in R_couple */
735 sum = 0;
736 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
737 row_marker = sum;
738 /* then, loop over elements in row i_r of R_couple */
739 j1_ub = R_couple->pattern->ptr[i_r+1];
740 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
741 i1 = R_couple->pattern->index[j1];
742 /* then, loop over elements in row i1 of A->col_coupleBlock */
743 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
744 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
745 i2 = A->col_coupleBlock->pattern->index[j2];
746
747 /* check whether entry RA[i_r, i2] has been previously visited.
748 A new RAP entry is possible only if entry RA[i_r, i2] has not
749 been visited yet */
750 if (A_marker[i2] != i_r) {
751 /* first, mark entry RA[i_r,i2] as visited */
752 A_marker[i2] = i_r;
753
754 /* then loop over elements in row i2 of P_ext_main */
755 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
756 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
757 i_c = P->row_coupleBlock->pattern->index[j3];
758
759 /* check whether entry RAP[i_r,i_c] has been created.
760 If not yet created, create the entry and increase
761 the total number of elements in RAP_ext */
762 if (P_marker[i_c] < row_marker) {
763 P_marker[i_c] = sum;
764 sum++;
765 }
766 }
767
768 /* loop over elements in row i2 of P_ext_couple, do the same */
769 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
770 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
771 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
772 if (P_marker[i_c] < row_marker) {
773 P_marker[i_c] = sum;
774 sum++;
775 }
776 }
777 }
778 }
779
780 /* now loop over elements in row i1 of A->mainBlock, repeat
781 the process we have done to block A->col_coupleBlock */
782 j2_ub = A->mainBlock->pattern->ptr[i1+1];
783 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
784 i2 = A->mainBlock->pattern->index[j2];
785 if (A_marker[i2 + num_Acouple_cols] != i_r) {
786 A_marker[i2 + num_Acouple_cols] = i_r;
787 j3_ub = P->mainBlock->pattern->ptr[i2+1];
788 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
789 i_c = P->mainBlock->pattern->index[j3];
790 if (P_marker[i_c] < row_marker) {
791 P_marker[i_c] = sum;
792 sum++;
793 }
794 }
795 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
796 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
797 /* note that we need to map the column index in
798 P->col_coupleBlock back into the column index in
799 P_ext_couple */
800 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
801 if (P_marker[i_c] < row_marker) {
802 P_marker[i_c] = sum;
803 sum++;
804 }
805 }
806 }
807 }
808 }
809 }
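/* End of the symbolic pass for RAP_ext: only the nonzero count "sum"
   has been computed so far. The numeric pass below repeats the same
   traversal and fills in the actual indices and block values. */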
810
811 /* Now we have the number of non-zero elements in RAP_ext. Allocate
812 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
813 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
814 RAP_ext_idx = MEMALLOC(sum, index_t);
815 RAP_ext_val = MEMALLOC(sum * block_size, double);
816 RA_val = TMPMEMALLOC(block_size, double);
817 RAP_val = TMPMEMALLOC(block_size, double);
818
819 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
820 sum = num_Pext_cols + num_Pmain_cols;
821 #pragma omp parallel for private(i) schedule(static)
822 for (i=0; i<sum; i++) P_marker[i] = -1;
823 #pragma omp parallel for private(i) schedule(static)
824 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
825 sum = 0;
826 RAP_ext_ptr[0] = 0;
827 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
828 row_marker = sum;
829 /* then, loop over elements in row i_r of R_couple */
830 j1_ub = R_couple->pattern->ptr[i_r+1];
831 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
832 i1 = R_couple->pattern->index[j1];
833 R_val = &(R_couple->val[j1*P_block_size]);
834
835 /* then, loop over elements in row i1 of A->col_coupleBlock */
836 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
837 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
838 i2 = A->col_coupleBlock->pattern->index[j2];
839 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
840 for (irb=0; irb<row_block_size; irb++)
841 for (icb=0; icb<col_block_size; icb++)
842 RA_val[irb+row_block_size*icb] = ZERO;
843 for (irb=0; irb<P_block_size; irb++) {
844 rtmp=R_val[irb];
845 for (icb=0; icb<col_block_size; icb++) {
846 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
847 }
848 }
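/* Block values are laid out column-major: entry (irb, icb) of a block
   sits at irb + row_block_size*icb. The R/P blocks carry P_block_size
   values per entry, applied as per-row scale factors, so the loop
   above forms RA by scaling row irb of the A block by R_val[irb]. */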
849
850 /* check whether entry RA[i_r, i2] has been previously visited.
851 A new RAP entry is possible only if entry RA[i_r, i2] has not
852 been visited yet */
853 if (A_marker[i2] != i_r) {
854 /* first, mark entry RA[i_r,i2] as visited */
855 A_marker[i2] = i_r;
856
857 /* then loop over elements in row i2 of P_ext_main */
858 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
859 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
860 i_c = P->row_coupleBlock->pattern->index[j3];
861 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
862 for (irb=0; irb<row_block_size; irb++)
863 for (icb=0; icb<col_block_size; icb++)
864 RAP_val[irb+row_block_size*icb] = ZERO;
865 for (icb=0; icb<P_block_size; icb++) {
866 rtmp = temp_val[icb];
867 for (irb=0; irb<row_block_size; irb++) {
868 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
869 }
870 }
871
872 /* check whether entry RAP[i_r,i_c] has been created.
873 If not yet created, create the entry and increase
874 the total number of elements in RAP_ext */
875 if (P_marker[i_c] < row_marker) {
876 P_marker[i_c] = sum;
877 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
878 RAP_ext_idx[sum] = i_c + offset;
879 sum++;
880 } else {
881 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
882 for (ib=0; ib<block_size; ib++) {
883 temp_val[ib] += RAP_val[ib];
884 }
885 }
886 }
887
888 /* loop over elements in row i2 of P_ext_couple, do the same */
889 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
890 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
891 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
892 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
893 for (irb=0; irb<row_block_size; irb++)
894 for (icb=0; icb<col_block_size; icb++)
895 RAP_val[irb+row_block_size*icb]=ZERO;
896 for (icb=0; icb<P_block_size; icb++) {
897 rtmp = temp_val[icb];
898 for (irb=0; irb<row_block_size; irb++) {
899 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
900 }
901 }
902
903 if (P_marker[i_c] < row_marker) {
904 P_marker[i_c] = sum;
905 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
906 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
907 sum++;
908 } else {
909 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
910 for (ib=0; ib<block_size; ib++) {
911 temp_val[ib] += RAP_val[ib];
912 }
913 }
914 }
915
916 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
917 Only the contributions are added. */
918 } else {
919 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
920 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
921 i_c = P->row_coupleBlock->pattern->index[j3];
922 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
923 for (irb=0; irb<row_block_size; irb++)
924 for (icb=0; icb<col_block_size; icb++)
925 RAP_val[irb+row_block_size*icb]=ZERO;
926 for (icb=0; icb<P_block_size; icb++) {
927 rtmp = temp_val[icb];
928 for (irb=0; irb<row_block_size; irb++) {
929 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
930 }
931 }
932
933 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
934 for (ib=0; ib<block_size; ib++) {
935 temp_val[ib] += RAP_val[ib];
936 }
937 }
938 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
939 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
940 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
941 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
942 for (irb=0; irb<row_block_size; irb++)
943 for (icb=0; icb<col_block_size; icb++)
944 RAP_val[irb+row_block_size*icb]=ZERO;
945 for (icb=0; icb<P_block_size; icb++) {
946 rtmp = temp_val[icb];
947 for (irb=0; irb<row_block_size; irb++) {
948 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
949 }
950 }
951
952 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
953 for (ib=0; ib<block_size; ib++) {
954 temp_val[ib] += RAP_val[ib];
955 }
956 }
957 }
958 }
959
960 /* now loop over elements in row i1 of A->mainBlock, repeat
961 the process we have done to block A->col_coupleBlock */
962 j2_ub = A->mainBlock->pattern->ptr[i1+1];
963 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
964 i2 = A->mainBlock->pattern->index[j2];
965 temp_val = &(A->mainBlock->val[j2*block_size]);
966 for (irb=0; irb<row_block_size; irb++)
967 for (icb=0; icb<col_block_size; icb++)
968 RA_val[irb+row_block_size*icb]=ZERO;
969 for (irb=0; irb<P_block_size; irb++) {
970 rtmp=R_val[irb];
971 for (icb=0; icb<col_block_size; icb++) {
972 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
973 }
974 }
975
976 if (A_marker[i2 + num_Acouple_cols] != i_r) {
977 A_marker[i2 + num_Acouple_cols] = i_r;
978 j3_ub = P->mainBlock->pattern->ptr[i2+1];
979 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
980 i_c = P->mainBlock->pattern->index[j3];
981 temp_val = &(P->mainBlock->val[j3*P_block_size]);
982 for (irb=0; irb<row_block_size; irb++)
983 for (icb=0; icb<col_block_size; icb++)
984 RAP_val[irb+row_block_size*icb]=ZERO;
985 for (icb=0; icb<P_block_size; icb++) {
986 rtmp = temp_val[icb];
987 for (irb=0; irb<row_block_size; irb++) {
988 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
989 }
990 }
991
992 if (P_marker[i_c] < row_marker) {
993 P_marker[i_c] = sum;
994 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
995 RAP_ext_idx[sum] = i_c + offset;
996 sum++;
997 } else {
998 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
999 for (ib=0; ib<block_size; ib++) {
1000 temp_val[ib] += RAP_val[ib];
1001 }
1002 }
1003 }
1004 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1005 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1006 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1007 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1008 for (irb=0; irb<row_block_size; irb++)
1009 for (icb=0; icb<col_block_size; icb++)
1010 RAP_val[irb+row_block_size*icb]=ZERO;
1011 for (icb=0; icb<P_block_size; icb++) {
1012 rtmp=temp_val[icb];
1013 for (irb=0; irb<row_block_size; irb++) {
1014 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1015 }
1016 }
1017
1018 if (P_marker[i_c] < row_marker) {
1019 P_marker[i_c] = sum;
1020 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1021 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1022 sum++;
1023 } else {
1024 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1025 for (ib=0; ib<block_size; ib++) {
1026 temp_val[ib] += RAP_val[ib];
1027 }
1028 }
1029 }
1030 } else {
1031 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1032 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1033 i_c = P->mainBlock->pattern->index[j3];
1034 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1035 for (irb=0; irb<row_block_size; irb++)
1036 for (icb=0; icb<col_block_size; icb++)
1037 RAP_val[irb+row_block_size*icb]=ZERO;
1038 for (icb=0; icb<P_block_size; icb++) {
1039 rtmp=temp_val[icb];
1040 for (irb=0; irb<row_block_size; irb++) {
1041 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1042 }
1043 }
1044
1045 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1046 for (ib=0; ib<block_size; ib++) {
1047 temp_val[ib] += RAP_val[ib];
1048 }
1049 }
1050 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1051 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1052 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1053 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1054 for (irb=0; irb<row_block_size; irb++)
1055 for (icb=0; icb<col_block_size; icb++)
1056 RAP_val[irb+row_block_size*icb]=ZERO;
1057 for (icb=0; icb<P_block_size; icb++) {
1058 rtmp=temp_val[icb];
1059 for (irb=0; irb<row_block_size; irb++) {
1060 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1061 }
1062 }
1063
1064 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1065 for (ib=0; ib<block_size; ib++) {
1066 temp_val[ib] += RAP_val[ib];
1067 }
1068 }
1069 }
1070 }
1071 }
1072 RAP_ext_ptr[i_r+1] = sum;
1073 }
1074 TMPMEMFREE(P_marker);
1075 TMPMEMFREE(Pcouple_to_Pext);
1076
1077 /* Now we have the part of RAP[r,c] where row "r" runs over the rows
1078 given by the column list of P->col_coupleBlock, and
1079 column "c" runs over a list of columns which possibly covers the
1080 whole column range of the system matrix P. This part of the data is to
1081 be passed to the neighboring processors, and added to the corresponding
1082 RAP[r,c] entries on those processors */
1083 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1084 &RAP_ext_val, global_id_P, block_size);
1085
1086 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1087 sum = RAP_ext_ptr[num_RAPext_rows];
1088 num_RAPext_cols = 0;
1089 if (num_Pext_cols || sum > 0) {
1090 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
1091 j1_ub = offset + num_Pmain_cols;
1092 for (i=0, iptr=0; i<sum; i++) {
1093 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1094 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1095 }
1096 for (j=0; j<num_Pext_cols; j++, iptr++){
1097 temp[iptr] = global_id_P[j];
1098 }
1099
1100 if (iptr) {
1101 #ifdef USE_QSORTG
1102 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1103 #else
1104 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1105 #endif
1106 num_RAPext_cols = 1;
1107 i = temp[0];
1108 for (j=1; j<iptr; j++) {
1109 if (temp[j] > i) {
1110 i = temp[j];
1111 temp[num_RAPext_cols++] = i;
1112 }
1113 }
1114 }
1115 }
1116
1117 /* store the global ids of the cols in RAP_ext */
1118 if(num_RAPext_cols){
1119 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
1120 #pragma omp parallel for private(i) schedule(static)
1121 for (i=0; i<num_RAPext_cols; i++)
1122 global_id_RAP[i] = temp[i];
1123 }
1124 if (num_Pext_cols || sum > 0)
1125 TMPMEMFREE(temp);
1126 j1_ub = offset + num_Pmain_cols;
1127 #pragma omp parallel for private(i, where_p) schedule(static)
1128 for (i=0; i<sum; i++) {
1129 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1130 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1131 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1132 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1133 } else
1134 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1135 }
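/* After this loop every RAP_ext column id is local: ids in the owned
   range [offset, offset+num_Pmain_cols) are shifted down to
   0..num_Pmain_cols-1, and all remaining ids become num_Pmain_cols
   plus their position in global_id_RAP (located via bsearch). */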
1136
1137 /* build the mapping */
1138 if (num_Pcouple_cols) {
1139 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
1140 iptr = 0;
1141 for (i=0; i<num_RAPext_cols; i++)
1142 if (global_id_RAP[i] == P->global_id[iptr]) {
1143 Pcouple_to_RAP[iptr++] = i;
1144 if (iptr == num_Pcouple_cols) break;
1145 }
1146 }
1147
1148 if (num_Pext_cols) {
1149 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
1150 iptr = 0;
1151 for (i=0; i<num_RAPext_cols; i++)
1152 if (global_id_RAP[i] == global_id_P[iptr]) {
1153 Pext_to_RAP[iptr++] = i;
1154 if (iptr == num_Pext_cols) break;
1155 }
1156 }
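/* Both mappings above are built in a single linear sweep, which
   assumes that global_id_RAP, P->global_id and global_id_P are all
   sorted in ascending order. */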
1157
1158 if (global_id_P){
1159 TMPMEMFREE(global_id_P);
1160 global_id_P = NULL;
1161 }
1162
1163 /* allocate and initialize the markers */
1164 sum = num_RAPext_cols + num_Pmain_cols;
1165 P_marker = TMPMEMALLOC(sum, index_t);
1166 #pragma omp parallel for private(i) schedule(static)
1167 for (i=0; i<sum; i++) P_marker[i] = -1;
1168 #pragma omp parallel for private(i) schedule(static)
1169 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1170
1171 /* Now, count the size of RAP. Start with rows in R_main */
1172 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1173 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1174 shared = P->col_coupler->connector->send->shared;
1175 i = 0;
1176 j = 0;
1177 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1178 /* Mark the start of row for both main block and couple block */
1179 row_marker = i;
1180 row_marker_ext = j;
1181
1182 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1183 in RAP_ext */
1184 P_marker[i_r] = i;
1185 i++;
1186 for (j1=0; j1<num_neighbors; j1++) {
1187 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1188 if (shared[j2] == i_r) {
1189 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1190 i_c = RAP_ext_idx[k];
1191 if (i_c < num_Pmain_cols) {
1192 if (P_marker[i_c] < row_marker) {
1193 P_marker[i_c] = i;
1194 i++;
1195 }
1196 } else {
1197 if (P_marker[i_c] < row_marker_ext) {
1198 P_marker[i_c] = j;
1199 j++;
1200 }
1201 }
1202 }
1203 break;
1204 }
1205 }
1206 }
1207
1208 /* then, loop over elements in row i_r of R_main */
1209 j1_ub = R_main->pattern->ptr[i_r+1];
1210 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1211 i1 = R_main->pattern->index[j1];
1212
1213 /* then, loop over elements in row i1 of A->col_coupleBlock */
1214 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1215 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1216 i2 = A->col_coupleBlock->pattern->index[j2];
1217
1218 /* check whether entry RA[i_r, i2] has been previously visited.
1219 A new RAP entry is possible only if entry RA[i_r, i2] has not
1220 been visited yet */
1221 if (A_marker[i2] != i_r) {
1222 /* first, mark entry RA[i_r,i2] as visited */
1223 A_marker[i2] = i_r;
1224
1225 /* then loop over elements in row i2 of P_ext_main */
1226 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1227 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1228 i_c = P->row_coupleBlock->pattern->index[j3];
1229
1230 /* check whether entry RAP[i_r,i_c] has been created.
1231 If not yet created, create the entry and increase
1232 the total number of elements in RAP_ext */
1233 if (P_marker[i_c] < row_marker) {
1234 P_marker[i_c] = i;
1235 i++;
1236 }
1237 }
1238
1239 /* loop over elements in row i2 of P_ext_couple, do the same */
1240 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1241 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1242 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1243 if (P_marker[i_c] < row_marker_ext) {
1244 P_marker[i_c] = j;
1245 j++;
1246 }
1247 }
1248 }
1249 }
1250
1251 /* now loop over elements in row i1 of A->mainBlock, repeat
1252 the process we have done to block A->col_coupleBlock */
1253 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1254 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1255 i2 = A->mainBlock->pattern->index[j2];
1256 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1257 A_marker[i2 + num_Acouple_cols] = i_r;
1258 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1259 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1260 i_c = P->mainBlock->pattern->index[j3];
1261 if (P_marker[i_c] < row_marker) {
1262 P_marker[i_c] = i;
1263 i++;
1264 }
1265 }
1266 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1267 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1268 /* note that we need to map the column index in
1269 P->col_coupleBlock back into the column index in
1270 P_ext_couple */
1271 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1272 if (P_marker[i_c] < row_marker_ext) {
1273 P_marker[i_c] = j;
1274 j++;
1275 }
1276 }
1277 }
1278 }
1279 }
1280 }
1281
1282 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1283 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1284 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1285 RAP_couple */
1286 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1287 RAP_main_idx = MEMALLOC(i, index_t);
1288 RAP_main_val = TMPMEMALLOC(i * block_size, double);
1289 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1290 RAP_couple_idx = MEMALLOC(j, index_t);
1291 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
1292
1293 RAP_main_ptr[num_Pmain_cols] = i;
1294 RAP_couple_ptr[num_Pmain_cols] = j;
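/* The totals i and j from the symbolic pass above become the CSR
   sentinels; the per-row offsets RAP_main_ptr[i_r] and
   RAP_couple_ptr[i_r] are written at the start of each row during the
   numeric pass below. */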
1295
1296 #pragma omp parallel for private(i) schedule(static)
1297 for (i=0; i<sum; i++) P_marker[i] = -1;
1298 #pragma omp parallel for private(i) schedule(static)
1299 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1300
1301 /* Now, fill in the data for RAP_main and RAP_couple. Start with rows
1302 in R_main */
1303 i = 0;
1304 j = 0;
1305 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1306 /* Mark the start of row for both main block and couple block */
1307 row_marker = i;
1308 row_marker_ext = j;
1309 RAP_main_ptr[i_r] = row_marker;
1310 RAP_couple_ptr[i_r] = row_marker_ext;
1311
1312 /* Mark and set up the diagonal element RAP[i_r, i_r], and elements
1313 in row i_r of RAP_ext */
1314 P_marker[i_r] = i;
1315 for (ib=0; ib<block_size; ib++)
1316 RAP_main_val[i*block_size+ib] = ZERO;
1317 RAP_main_idx[i] = i_r;
1318 i++;
1319
1320 for (j1=0; j1<num_neighbors; j1++) {
1321 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1322 if (shared[j2] == i_r) {
1323 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1324 i_c = RAP_ext_idx[k];
1325 if (i_c < num_Pmain_cols) {
1326 if (P_marker[i_c] < row_marker) {
1327 P_marker[i_c] = i;
1328 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1329 RAP_main_idx[i] = i_c;
1330 i++;
1331 } else {
1332 t1_val = &(RAP_ext_val[k*block_size]);
1333 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1334 for (ib=0; ib<block_size; ib++)
1335 t2_val[ib] += t1_val[ib];
1336 }
1337 } else {
1338 if (P_marker[i_c] < row_marker_ext) {
1339 P_marker[i_c] = j;
1340 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1341 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1342 j++;
1343 } else {
1344 t1_val = &(RAP_ext_val[k*block_size]);
1345 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1346 for (ib=0; ib<block_size; ib++)
1347 t2_val[ib] += t1_val[ib];
1348 }
1349 }
1350 }
1351 break;
1352 }
1353 }
1354 }
1355
1356 /* then, loop over elements in row i_r of R_main */
1357 j1_ub = R_main->pattern->ptr[i_r+1];
1358 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1359 i1 = R_main->pattern->index[j1];
1360 R_val = &(R_main->val[j1*P_block_size]);
1361
1362 /* then, loop over elements in row i1 of A->col_coupleBlock */
1363 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1364 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1365 i2 = A->col_coupleBlock->pattern->index[j2];
1366 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1367 for (irb=0; irb<row_block_size; irb++)
1368 for (icb=0; icb<col_block_size; icb++)
1369 RA_val[irb+row_block_size*icb]=ZERO;
1370 for (irb=0; irb<P_block_size; irb++) {
1371 rtmp=R_val[irb];
1372 for (icb=0; icb<col_block_size; icb++) {
1373 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1374 }
1375 }
1376
1377
1378 /* check whether entry RA[i_r, i2] has been previously visited.
1379 A new RAP entry is possible only if entry RA[i_r, i2] has not
1380 been visited yet */
1381 if (A_marker[i2] != i_r) {
1382 /* first, mark entry RA[i_r,i2] as visited */
1383 A_marker[i2] = i_r;
1384
1385 /* then loop over elements in row i2 of P_ext_main */
1386 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1387 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1388 i_c = P->row_coupleBlock->pattern->index[j3];
1389 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1390 for (irb=0; irb<row_block_size; irb++)
1391 for (icb=0; icb<col_block_size; icb++)
1392 RAP_val[irb+row_block_size*icb]=ZERO;
1393 for (icb=0; icb<P_block_size; icb++) {
1394 rtmp=temp_val[icb];
1395 for (irb=0; irb<row_block_size; irb++) {
1396 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1397 }
1398 }
1399
1400
1401 /* check whether entry RAP[i_r,i_c] has been created.
1402 If not yet created, create the entry and increase
1403 the total number of elements in RAP_ext */
1404 if (P_marker[i_c] < row_marker) {
1405 P_marker[i_c] = i;
1406 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1407 RAP_main_idx[i] = i_c;
1408 i++;
1409 } else {
1410 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1411 for (ib=0; ib<block_size; ib++) {
1412 temp_val[ib] += RAP_val[ib];
1413 }
1414 }
1415 }
1416
1417 /* loop over elements in row i2 of P_ext_couple, do the same */
1418 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1419 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1420 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1421 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1422 for (irb=0; irb<row_block_size; irb++)
1423 for (icb=0; icb<col_block_size; icb++)
1424 RAP_val[irb+row_block_size*icb]=ZERO;
1425 for (icb=0; icb<P_block_size; icb++) {
1426 rtmp=temp_val[icb];
1427 for (irb=0; irb<row_block_size; irb++) {
1428 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1429 }
1430 }
1431
1432 if (P_marker[i_c] < row_marker_ext) {
1433 P_marker[i_c] = j;
1434 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1435 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1436 j++;
1437 } else {
1438 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1439 for (ib=0; ib<block_size; ib++) {
1440 temp_val[ib] += RAP_val[ib];
1441 }
1442 }
1443 }
1444
1445 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1446 Only the contributions are added. */
1447 } else {
1448 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1449 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1450 i_c = P->row_coupleBlock->pattern->index[j3];
1451 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1452 for (irb=0; irb<row_block_size; irb++)
1453 for (icb=0; icb<col_block_size; icb++)
1454 RAP_val[irb+row_block_size*icb]=ZERO;
1455 for (icb=0; icb<P_block_size; icb++) {
1456 rtmp=temp_val[icb];
1457 for (irb=0; irb<row_block_size; irb++) {
1458 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1459 }
1460 }
1461
1462 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1463 for (ib=0; ib<block_size; ib++) {
1464 temp_val[ib] += RAP_val[ib];
1465 }
1466 }
1467 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1468 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1469 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1470 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1471 for (irb=0; irb<row_block_size; irb++)
1472 for (icb=0; icb<col_block_size; icb++)
1473 RAP_val[irb+row_block_size*icb]=ZERO;
1474 for (icb=0; icb<P_block_size; icb++) {
1475 rtmp=temp_val[icb];
1476 for (irb=0; irb<row_block_size; irb++) {
1477 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1478 }
1479 }
1480
1481 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1482 for (ib=0; ib<block_size; ib++) {
1483 temp_val[ib] += RAP_val[ib];
1484 }
1485 }
1486 }
1487 }
1488
1489 /* now loop over elements in row i1 of A->mainBlock, repeat
1490 the process we have done to block A->col_coupleBlock */
1491 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1492 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1493 i2 = A->mainBlock->pattern->index[j2];
1494 temp_val = &(A->mainBlock->val[j2*block_size]);
1495 for (irb=0; irb<row_block_size; irb++)
1496 for (icb=0; icb<col_block_size; icb++)
1497 RA_val[irb+row_block_size*icb]=ZERO;
1498 for (irb=0; irb<P_block_size; irb++) {
1499 rtmp=R_val[irb];
1500 for (icb=0; icb<col_block_size; icb++) {
1501 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1502 }
1503 }
1504
1505 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1506 A_marker[i2 + num_Acouple_cols] = i_r;
1507 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1508 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1509 i_c = P->mainBlock->pattern->index[j3];
1510 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1511 for (irb=0; irb<row_block_size; irb++)
1512 for (icb=0; icb<col_block_size; icb++)
1513 RAP_val[irb+row_block_size*icb]=ZERO;
1514 for (icb=0; icb<P_block_size; icb++) {
1515 rtmp=temp_val[icb];
1516 for (irb=0; irb<row_block_size; irb++) {
1517 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1518 }
1519 }
1520 if (P_marker[i_c] < row_marker) {
1521 P_marker[i_c] = i;
1522 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1523 RAP_main_idx[i] = i_c;
1524 i++;
1525 } else {
1526 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1527 for (ib=0; ib<block_size; ib++) {
1528 temp_val[ib] += RAP_val[ib];
1529 }
1530 }
1531 }
1532 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1533 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1534 /* note that we need to map the column index in
1535 P->col_coupleBlock back into the column index in
1536 P_ext_couple */
1537 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1538 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1539 for (irb=0; irb<row_block_size; irb++)
1540 for (icb=0; icb<col_block_size; icb++)
1541 RAP_val[irb+row_block_size*icb]=ZERO;
1542 for (icb=0; icb<P_block_size; icb++) {
1543 rtmp=temp_val[icb];
1544 for (irb=0; irb<row_block_size; irb++) {
1545 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1546 }
1547 }
1548
1549 if (P_marker[i_c] < row_marker_ext) {
1550 P_marker[i_c] = j;
1551 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1552 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1553 j++;
1554 } else {
1555 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1556 for (ib=0; ib<block_size; ib++) {
1557 temp_val[ib] += RAP_val[ib];
1558 }
1559 }
1560 }
1561
1562 } else {
1563 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1564 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1565 i_c = P->mainBlock->pattern->index[j3];
1566 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1567 for (irb=0; irb<row_block_size; irb++)
1568 for (icb=0; icb<col_block_size; icb++)
1569 RAP_val[irb+row_block_size*icb]=ZERO;
1570 for (icb=0; icb<P_block_size; icb++) {
1571 rtmp=temp_val[icb];
1572 for (irb=0; irb<row_block_size; irb++) {
1573 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1574 }
1575 }
1576
1577 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1578 for (ib=0; ib<block_size; ib++) {
1579 temp_val[ib] += RAP_val[ib];
1580 }
1581 }
1582 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1583 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1584 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1585 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1586 for (irb=0; irb<row_block_size; irb++)
1587 for (icb=0; icb<col_block_size; icb++)
1588 RAP_val[irb+row_block_size*icb]=ZERO;
1589 for (icb=0; icb<P_block_size; icb++) {
1590 rtmp = temp_val[icb];
1591 for (irb=0; irb<row_block_size; irb++) {
1592 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1593 }
1594 }
1595
1596 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1597 for (ib=0; ib<block_size; ib++) {
1598 temp_val[ib] += RAP_val[ib];
1599 }
1600 }
1601 }
1602 }
1603 }
1604
1605 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
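          /* how the reorder works: the freshly added column indices of this
             row are sorted into "temp"; P_marker still maps each column index
             to the slot where its value block currently sits, so the values
             are first gathered into "temp_val" in sorted order and then
             copied back, while P_marker is updated to the final slot
             positions. */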
1606 if (i > row_marker) {
1607 offset = i - row_marker;
1608 temp = TMPMEMALLOC(offset, index_t);
1609 #pragma omp parallel for schedule(static) private(iptr)
1610 for (iptr=0; iptr<offset; iptr++)
1611 temp[iptr] = RAP_main_idx[row_marker+iptr];
1612 if (offset > 0) {
1613 #ifdef USE_QSORTG
1614 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1615 #else
1616 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1617 #endif
1618 }
1619 temp_val = TMPMEMALLOC(offset * block_size, double);
1620 #pragma omp parallel for schedule(static) private(iptr,k)
1621 for (iptr=0; iptr<offset; iptr++){
1622 k = P_marker[temp[iptr]];
1623 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1624 P_marker[temp[iptr]] = iptr + row_marker;
1625 }
1626 #pragma omp parallel for schedule(static) private(iptr)
1627 for (iptr=0; iptr<offset; iptr++){
1628 RAP_main_idx[row_marker+iptr] = temp[iptr];
1629 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1630 }
1631 TMPMEMFREE(temp);
1632 TMPMEMFREE(temp_val);
1633 }
1634 if (j > row_marker_ext) {
1635 offset = j - row_marker_ext;
1636 temp = TMPMEMALLOC(offset, index_t);
1637 #pragma omp parallel for schedule(static) private(iptr)
1638 for (iptr=0; iptr<offset; iptr++)
1639 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1640 if (offset > 0) {
1641 #ifdef USE_QSORTG
1642 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1643 #else
1644 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1645 #endif
1646 }
1647 temp_val = TMPMEMALLOC(offset * block_size, double);
1648 #pragma omp parallel for schedule(static) private(iptr, k)
1649 for (iptr=0; iptr<offset; iptr++){
1650 k = P_marker[temp[iptr] + num_Pmain_cols];
1651 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1652 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1653 }
1654 #pragma omp parallel for schedule(static) private(iptr)
1655 for (iptr=0; iptr<offset; iptr++){
1656 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1657 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1658 }
1659 TMPMEMFREE(temp);
1660 TMPMEMFREE(temp_val);
1661 }
1662 }
1663
1664 TMPMEMFREE(RA_val);
1665 TMPMEMFREE(RAP_val);
1666 TMPMEMFREE(A_marker);
1667 TMPMEMFREE(Pext_to_RAP);
1668 TMPMEMFREE(Pcouple_to_RAP);
1669 MEMFREE(RAP_ext_ptr);
1670 MEMFREE(RAP_ext_idx);
1671 MEMFREE(RAP_ext_val);
1672 Paso_SparseMatrix_free(R_main);
1673 Paso_SparseMatrix_free(R_couple);
1674
1675 /* Check whether there are empty columns in RAP_couple */
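    /* the compression below renumbers the referenced columns consecutively
       and shrinks global_id_RAP to match. For example (illustrative values
       only): with num_RAPext_cols=3 and global_id_RAP={7,9,12}, if column 1
       (global id 9) is never referenced, the surviving columns are
       renumbered 0 and 1 and global_id_RAP becomes {7,12}. */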
1676 #pragma omp parallel for schedule(static) private(i)
1677 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1678 /* num of non-empty columns is stored in "k" */
1679 k = 0;
1680 j = RAP_couple_ptr[num_Pmain_cols];
1681 for (i=0; i<j; i++) {
1682 i1 = RAP_couple_idx[i];
1683 if (P_marker[i1]) {
1684 P_marker[i1] = 0;
1685 k++;
1686 }
1687 }
1688
    1689 /* empty columns were found */
1690 if (k < num_RAPext_cols) {
1691 temp = TMPMEMALLOC(k, index_t);
1692 k = 0;
1693 for (i=0; i<num_RAPext_cols; i++)
1694 if (!P_marker[i]) {
1695 P_marker[i] = k;
1696 temp[k] = global_id_RAP[i];
1697 k++;
1698 }
1699 #pragma omp parallel for schedule(static) private(i, i1)
1700 for (i=0; i<j; i++) {
1701 i1 = RAP_couple_idx[i];
1702 RAP_couple_idx[i] = P_marker[i1];
1703 }
1704 num_RAPext_cols = k;
1705 TMPMEMFREE(global_id_RAP);
1706 global_id_RAP = temp;
1707 }
1708 TMPMEMFREE(P_marker);
1709
1710 /******************************************************/
    1711 /* Start to create the coarse level System Matrix A_c */
1712 /******************************************************/
1713 /* first, prepare the sender/receiver for the col_connector */
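    /* global_id_RAP is sorted in ascending order, so the sweep below can
       bucket the external columns by owning rank: "j" walks through the
       ranks while "k"=dist[j+1] is the first global id beyond rank j. For
       example (illustrative values only), with dist={0,4,8,12} and global
       ids {1,2,9}, ranks 0 and 2 become receive neighbors contributing 2
       and 1 shared columns, respectively. */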
1714 dist = P->pattern->input_distribution->first_component;
1715 recv_len = TMPMEMALLOC(size, dim_t);
1716 send_len = TMPMEMALLOC(size, dim_t);
1717 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
1718 offsetInShared = TMPMEMALLOC(size+1, index_t);
1719 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
1720 memset(recv_len, 0, sizeof(dim_t) * size);
1721 memset(send_len, 0, sizeof(dim_t) * size);
1722 num_neighbors = 0;
1723 offsetInShared[0] = 0;
1724 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1725 shared[i] = i + num_Pmain_cols;
1726 if (k <= global_id_RAP[i]) {
1727 if (recv_len[j] > 0) {
1728 neighbor[num_neighbors] = j;
1729 num_neighbors ++;
1730 offsetInShared[num_neighbors] = i;
1731 }
1732 while (k <= global_id_RAP[i]) {
1733 j++;
1734 k = dist[j+1];
1735 }
1736 }
1737 recv_len[j] ++;
1738 }
1739 if (recv_len[j] > 0) {
1740 neighbor[num_neighbors] = j;
1741 num_neighbors ++;
1742 offsetInShared[num_neighbors] = i;
1743 }
1744 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1745 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1746
1747 #ifdef ESYS_MPI
1748 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1749 #endif
1750
1751 #ifdef ESYS_MPI
1752 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
1753 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
1754 #else
1755 mpi_requests=TMPMEMALLOC(size*2,int);
1756 mpi_stati=TMPMEMALLOC(size*2,int);
1757 #endif
1758 num_neighbors = 0;
1759 j = 0;
1760 offsetInShared[0] = 0;
1761 for (i=0; i<size; i++) {
1762 if (send_len[i] > 0) {
1763 neighbor[num_neighbors] = i;
1764 num_neighbors ++;
1765 j += send_len[i];
1766 offsetInShared[num_neighbors] = j;
1767 }
1768 }
1769 TMPMEMFREE(shared);
1770 shared = TMPMEMALLOC(j, index_t);
1771 for (i=0, j=0; i<num_neighbors; i++) {
1772 k = neighbor[i];
1773 #ifdef ESYS_MPI
1774 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1775 mpi_info->msg_tag_counter+k,
1776 mpi_info->comm, &mpi_requests[i]);
1777 #endif
1778 j += send_len[k];
1779 }
1780 for (i=0, j=0; i<recv->numNeighbors; i++) {
1781 k = recv->neighbor[i];
1782 #ifdef ESYS_MPI
1783 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1784 mpi_info->msg_tag_counter+rank,
1785 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1786 #endif
1787 j += recv_len[k];
1788 }
1789 #ifdef ESYS_MPI
1790 MPI_Waitall(num_neighbors + recv->numNeighbors,
1791 mpi_requests, mpi_stati);
1792 #endif
1793 mpi_info->msg_tag_counter += size;
1794
1795 j = offsetInShared[num_neighbors];
1796 offset = dist[rank];
1797 #pragma omp parallel for schedule(static) private(i)
1798 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1799 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1800 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1801
1802 col_connector = Paso_Connector_alloc(send, recv);
1803 TMPMEMFREE(shared);
1804 Paso_SharedComponents_free(recv);
1805 Paso_SharedComponents_free(send);
1806
    1807 /* now, create the row distribution (output_distribution) and the col
    1808 distribution (input_distribution) */
1809 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1810 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1811
1812 /* then, prepare the sender/receiver for the row_connector, first, prepare
1813 the information for sender */
1814 sum = RAP_couple_ptr[num_Pmain_cols];
1815 len = TMPMEMALLOC(size, dim_t);
1816 send_ptr = TMPMEMALLOC(size, index_t*);
1817 send_idx = TMPMEMALLOC(size, index_t*);
1818 #pragma omp parallel for schedule(static) private(i)
1819 for (i=0; i<size; i++) {
1820 send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
1821 send_idx[i] = TMPMEMALLOC(sum, index_t);
1822 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1823 }
1824 memset(len, 0, sizeof(dim_t) * size);
1825 recv = col_connector->recv;
1826 sum=0;
1827 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1828 i1 = RAP_couple_ptr[i_r];
1829 i2 = RAP_couple_ptr[i_r+1];
1830 if (i2 > i1) {
    1831 /* then row i_r will be in the sender of row_connector, now check
    1832 how many neighbors i_r needs to be sent to */
1833 for (j=i1; j<i2; j++) {
1834 i_c = RAP_couple_idx[j];
1835 /* find out the corresponding neighbor "p" of column i_c */
1836 for (p=0; p<recv->numNeighbors; p++) {
1837 if (i_c < recv->offsetInShared[p+1]) {
1838 k = recv->neighbor[p];
1839 if (send_ptr[k][i_r] == 0) sum++;
1840 send_ptr[k][i_r] ++;
1841 send_idx[k][len[k]] = global_id_RAP[i_c];
1842 len[k] ++;
1843 break;
1844 }
1845 }
1846 }
1847 }
1848 }
1849 if (global_id_RAP) {
1850 TMPMEMFREE(global_id_RAP);
1851 global_id_RAP = NULL;
1852 }
1853
1854 /* now allocate the sender */
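    /* send_ptr[p] currently holds, for every local row, the number of
       entries destined for rank p; the loop below compacts the non-zero
       counts to the front of send_ptr[p], so that entry q is the length of
       the q-th row shared with rank p, while "shared" collects the local
       row ids. */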
1855 shared = TMPMEMALLOC(sum, index_t);
1856 memset(send_len, 0, sizeof(dim_t) * size);
1857 num_neighbors=0;
1858 offsetInShared[0] = 0;
1859 for (p=0, k=0; p<size; p++) {
1860 for (i=0; i<num_Pmain_cols; i++) {
1861 if (send_ptr[p][i] > 0) {
1862 shared[k] = i;
1863 k++;
1864 send_ptr[p][send_len[p]] = send_ptr[p][i];
1865 send_len[p]++;
1866 }
1867 }
1868 if (k > offsetInShared[num_neighbors]) {
1869 neighbor[num_neighbors] = p;
1870 num_neighbors ++;
1871 offsetInShared[num_neighbors] = k;
1872 }
1873 }
1874 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1875 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1876
    1877 /* send/recv the number of rows to be sent from the current proc;
    1878 this recovers the receiver info of row_connector from the sender */
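    /* the MPI_Alltoall below completes the handshake: each rank contributes
       send_len[q], the number of its rows that rank q will receive, and
       learns recv_len[q], the number of rows it will receive from rank q. */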
1879 #ifdef ESYS_MPI
1880 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1881 #endif
1882 num_neighbors = 0;
1883 offsetInShared[0] = 0;
1884 j = 0;
1885 for (i=0; i<size; i++) {
1886 if (i != rank && recv_len[i] > 0) {
1887 neighbor[num_neighbors] = i;
1888 num_neighbors ++;
1889 j += recv_len[i];
1890 offsetInShared[num_neighbors] = j;
1891 }
1892 }
1893 TMPMEMFREE(shared);
1894 TMPMEMFREE(recv_len);
1895 shared = TMPMEMALLOC(j, index_t);
1896 k = offsetInShared[num_neighbors];
1897 #pragma omp parallel for schedule(static) private(i)
1898 for (i=0; i<k; i++) {
1899 shared[i] = i + num_Pmain_cols;
1900 }
1901 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1902 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1903 row_connector = Paso_Connector_alloc(send, recv);
1904 TMPMEMFREE(shared);
1905 Paso_SharedComponents_free(recv);
1906 Paso_SharedComponents_free(send);
1907
1908 /* send/recv pattern->ptr for rowCoupleBlock */
1909 num_RAPext_rows = offsetInShared[num_neighbors];
1910 row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
1911 for (p=0; p<num_neighbors; p++) {
1912 j = offsetInShared[p];
1913 i = offsetInShared[p+1];
1914 #ifdef ESYS_MPI
1915 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1916 mpi_info->msg_tag_counter+neighbor[p],
1917 mpi_info->comm, &mpi_requests[p]);
1918 #endif
1919 }
1920 send = row_connector->send;
1921 for (p=0; p<send->numNeighbors; p++) {
1922 #ifdef ESYS_MPI
1923 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1924 MPI_INT, send->neighbor[p],
1925 mpi_info->msg_tag_counter+rank,
1926 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1927 #endif
1928 }
1929 #ifdef ESYS_MPI
1930 MPI_Waitall(num_neighbors + send->numNeighbors,
1931 mpi_requests, mpi_stati);
1932 #endif
1933 mpi_info->msg_tag_counter += size;
1934 TMPMEMFREE(send_len);
1935
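    /* convert the received per-row lengths into CSR offsets by an in-place
       exclusive prefix sum; e.g. (illustrative values only) lengths {2,3,1}
       become offsets {0,2,5,6}. */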
1936 sum = 0;
1937 for (i=0; i<num_RAPext_rows; i++) {
1938 k = row_couple_ptr[i];
1939 row_couple_ptr[i] = sum;
1940 sum += k;
1941 }
1942 row_couple_ptr[num_RAPext_rows] = sum;
1943
1944 /* send/recv pattern->index for rowCoupleBlock */
1945 k = row_couple_ptr[num_RAPext_rows];
1946 row_couple_idx = MEMALLOC(k, index_t);
1947 for (p=0; p<num_neighbors; p++) {
1948 j1 = row_couple_ptr[offsetInShared[p]];
1949 j2 = row_couple_ptr[offsetInShared[p+1]];
1950 #ifdef ESYS_MPI
1951 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1952 mpi_info->msg_tag_counter+neighbor[p],
1953 mpi_info->comm, &mpi_requests[p]);
1954 #endif
1955 }
1956 for (p=0; p<send->numNeighbors; p++) {
1957 #ifdef ESYS_MPI
1958 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1959 MPI_INT, send->neighbor[p],
1960 mpi_info->msg_tag_counter+rank,
1961 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1962 #endif
1963 }
1964 #ifdef ESYS_MPI
1965 MPI_Waitall(num_neighbors + send->numNeighbors,
1966 mpi_requests, mpi_stati);
1967 #endif
1968 mpi_info->msg_tag_counter += size;
1969
1970 offset = input_dist->first_component[rank];
1971 k = row_couple_ptr[num_RAPext_rows];
1972 #pragma omp parallel for schedule(static) private(i)
1973 for (i=0; i<k; i++) {
1974 row_couple_idx[i] -= offset;
1975 }
1976 #pragma omp parallel for schedule(static) private(i)
1977 for (i=0; i<size; i++) {
1978 TMPMEMFREE(send_ptr[i]);
1979 TMPMEMFREE(send_idx[i]);
1980 }
1981 TMPMEMFREE(send_ptr);
1982 TMPMEMFREE(send_idx);
1983 TMPMEMFREE(len);
1984 TMPMEMFREE(offsetInShared);
1985 TMPMEMFREE(neighbor);
1986 TMPMEMFREE(mpi_requests);
1987 TMPMEMFREE(mpi_stati);
1988 Esys_MPIInfo_free(mpi_info);
1989
1990 /* Now, we can create pattern for mainBlock and coupleBlock */
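    /* main_pattern couples local rows with local columns
       (num_Pmain_cols x num_Pmain_cols), col_couple_pattern couples local
       rows with external columns (num_Pmain_cols x num_RAPext_cols), and
       row_couple_pattern couples external rows with local columns
       (num_RAPext_rows x num_Pmain_cols). */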
1991 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1992 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1993 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1994 num_Pmain_cols, num_RAPext_cols,
1995 RAP_couple_ptr, RAP_couple_idx);
1996 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1997 num_RAPext_rows, num_Pmain_cols,
1998 row_couple_ptr, row_couple_idx);
1999
2000 /* next, create the system matrix */
2001 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2002 output_dist, input_dist, main_pattern, col_couple_pattern,
2003 row_couple_pattern, col_connector, row_connector);
2004 out = Paso_SystemMatrix_alloc(A->type, pattern,
2005 row_block_size, col_block_size, FALSE);
2006
    2007 /* finally, fill in the data */
2008 memcpy(out->mainBlock->val, RAP_main_val,
2009 out->mainBlock->len* sizeof(double));
2010 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2011 out->col_coupleBlock->len * sizeof(double));
2012
2013 /* Clean up */
2014 Paso_SystemMatrixPattern_free(pattern);
2015 Paso_Pattern_free(main_pattern);
2016 Paso_Pattern_free(col_couple_pattern);
2017 Paso_Pattern_free(row_couple_pattern);
2018 Paso_Connector_free(col_connector);
2019 Paso_Connector_free(row_connector);
2020 Paso_Distribution_free(output_dist);
2021 Paso_Distribution_free(input_dist);
2022 TMPMEMFREE(RAP_main_val);
2023 TMPMEMFREE(RAP_couple_val);
2024 if (Esys_noError()) {
2025 return out;
2026 } else {
2027 Paso_SystemMatrix_free(out);
2028 return NULL;
2029 }
2030 }
2031
2032
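/* Background for the routine below: R_main and R_couple are obtained as
   transposes of P's blocks. For reference, a CSR transpose can be built
   with the classic two-pass count-and-fill scheme. The helper below is a
   minimal illustrative sketch only, not the Paso implementation behind
   Paso_SparseMatrix_getTranspose: it assumes plain int/double storage,
   ignores block sizes, and its name is hypothetical. The same
   count/prefix-sum/scatter pattern also reappears further down, when
   row_couple_ptr is rebuilt from the received row lengths. */
static void sketch_csr_transpose(int n_rows, int n_cols,
                                 const int *ptr, const int *idx,
                                 const double *val,
                                 int *t_ptr, int *t_idx, double *t_val)
{
    int i, j, k;
    /* pass 1: count the entries in each column, i.e. each row of the
       transpose; the count of column j is accumulated at t_ptr[j+1] */
    for (j = 0; j <= n_cols; j++) t_ptr[j] = 0;
    for (i = 0; i < n_rows; i++)
        for (k = ptr[i]; k < ptr[i+1]; k++)
            t_ptr[idx[k] + 1]++;
    /* exclusive prefix sum turns the counts into row offsets */
    for (j = 0; j < n_cols; j++) t_ptr[j+1] += t_ptr[j];
    /* pass 2: scatter the entries; t_ptr[j] serves as a moving cursor
       for row j of the transpose */
    for (i = 0; i < n_rows; i++)
        for (k = ptr[i]; k < ptr[i+1]; k++) {
            j = idx[k];
            t_idx[t_ptr[j]] = i;
            t_val[t_ptr[j]] = val[k];
            t_ptr[j]++;
        }
    /* the cursors have each advanced by one row; shift the offsets back */
    for (j = n_cols; j > 0; j--) t_ptr[j] = t_ptr[j-1];
    t_ptr[0] = 0;
}
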
2033 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2034 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2035 {
2036 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2037 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
2038 Paso_SystemMatrix *out=NULL;
2039 Paso_SystemMatrixPattern *pattern=NULL;
2040 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
2041 Paso_SharedComponents *send =NULL, *recv=NULL;
2042 Paso_Connector *col_connector=NULL, *row_connector=NULL;
2043 Paso_Pattern *main_pattern=NULL;
2044 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2045 const dim_t row_block_size=A->row_block_size;
2046 const dim_t col_block_size=A->col_block_size;
2047 const dim_t block_size = A->block_size;
2048 const double ZERO = 0.0;
2049 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2050 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2051 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2052 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2053 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2054 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2055 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2056 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2057 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2058 index_t sum, i, j, k, iptr, irb, icb, ib;
2059 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2060 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2061 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2062 index_t row_marker_ext, *where_p=NULL;
2063 index_t **send_ptr=NULL, **send_idx=NULL;
2064 dim_t p, num_neighbors;
2065 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2066 Esys_MPI_rank *neighbor=NULL;
2067 #ifdef ESYS_MPI
2068 MPI_Request* mpi_requests=NULL;
2069 MPI_Status* mpi_stati=NULL;
2070 #else
2071 int *mpi_requests=NULL, *mpi_stati=NULL;
2072 #endif
2073
    2074 /* two sparse matrices R_main and R_couple will be generated, as the
    2075 transposes of P_main and P_col_couple, respectively. Note that
    2076 R_couple is actually the row_coupleBlock of R (R=P^T) */
2077 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2078 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2079
    2080 /* generate P_ext, i.e. the portion of P that is stored on neighbor
    2081 procs and needed locally for the triple matrix product RAP.
    2082 To be specific, P_ext (on processor k) is the group of rows in P where
    2083 the list of rows from processor q is given by
    2084 A->col_coupler->connector->send->shared[rPtr...]
    2085 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
    2086 on q.
    2087 P_ext is represented by the two sparse matrices P_ext_main and
    2088 P_ext_couple */
2089 Paso_Preconditioner_AMG_extendB(A, P);
2090
    2091 /* count the number of cols in P_ext_couple, renumber the pattern of
    2092 sparse matrix P_ext_couple into the new compressed column order, and
    2093 then build the col id mapping from P->col_coupleBlock to
    2094 P_ext_couple */
2095 num_Pmain_cols = P->mainBlock->numCols;
2096 num_Pcouple_cols = P->col_coupleBlock->numCols;
2097 num_Acouple_cols = A->col_coupleBlock->numCols;
2098 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2099 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2100 offset = P->col_distribution->first_component[rank];
2101 num_Pext_cols = 0;
2102 if (P->global_id) {
2103 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2104 and P->col_coupleBlock */
2105 iptr = 0;
2106 if (num_Pcouple_cols || sum > 0) {
2107 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
2108 for (; iptr<sum; iptr++){
2109 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2110 }
2111 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2112 temp[iptr] = P->global_id[j];
2113 }
2114 }
2115 if (iptr) {
2116 #ifdef USE_QSORTG
2117 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2118 #else
2119 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2120 #endif
2121 num_Pext_cols = 1;
2122 i = temp[0];
2123 for (j=1; j<iptr; j++) {
2124 if (temp[j] > i) {
2125 i = temp[j];
2126 temp[num_Pext_cols++] = i;
2127 }
2128 }
2129 }
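       /* standard sort-then-scan deduplication: after qsort equal ids are
          adjacent, so a single pass keeping only strictly increasing values
          compacts the unique ids to the front of "temp"; e.g. (illustrative
          values only) {3,3,7,9,9} -> {3,7,9} with num_Pext_cols=3. */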
2130
2131 /* resize the pattern of P_ext_couple */
2132 if(num_Pext_cols){
2133 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
2134 for (i=0; i<num_Pext_cols; i++)
2135 global_id_P[i] = temp[i];
2136 }
2137 if (num_Pcouple_cols || sum > 0)
2138 TMPMEMFREE(temp);
2139 for (i=0; i<sum; i++) {
2140 where_p = (index_t *)bsearch(
2141 &(P->remote_coupleBlock->pattern->index[i]),
2142 global_id_P, num_Pext_cols,
2143 sizeof(index_t), Paso_comparIndex);
2144 P->remote_coupleBlock->pattern->index[i] =
2145 (index_t)(where_p -global_id_P);
2146 }
2147
2148 /* build the mapping */
2149 if (num_Pcouple_cols) {
2150 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
2151 iptr = 0;
2152 for (i=0; i<num_Pext_cols; i++)
2153 if (global_id_P[i] == P->global_id[iptr]) {
2154 Pcouple_to_Pext[iptr++] = i;
2155 if (iptr == num_Pcouple_cols) break;
2156 }
2157 }
2158 }
2159
    2160 /* alloc and initialize the markers */
2161 sum = num_Pext_cols + num_Pmain_cols;
2162 P_marker = TMPMEMALLOC(sum, index_t);
2163 A_marker = TMPMEMALLOC(num_A_cols, index_t);
2164 #pragma omp parallel for private(i) schedule(static)
2165 for (i=0; i<sum; i++) P_marker[i] = -1;
2166 #pragma omp parallel for private(i) schedule(static)
2167 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2168
2169 /* Now, count the size of RAP_ext. Start with rows in R_couple */
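    /* this symbolic phase uses the classic marker trick for sparse row
       merges: A_marker[i2]==i_r records that column i2 of RA has already
       been reached within row i_r, and P_marker[i_c]>=row_marker records
       that column i_c of RAP already exists in the current row. Each test
       replaces a search through the row under construction by a single
       array lookup. */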
2170 sum = 0;
2171 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2172 row_marker = sum;
2173 /* then, loop over elements in row i_r of R_couple */
2174 j1_ub = R_couple->pattern->ptr[i_r+1];
2175 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2176 i1 = R_couple->pattern->index[j1];
2177 /* then, loop over elements in row i1 of A->col_coupleBlock */
2178 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2179 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2180 i2 = A->col_coupleBlock->pattern->index[j2];
2181
    2182 /* check whether entry RA[i_r, i2] has been previously visited.
    2183 A new RAP entry is possible only if entry RA[i_r, i2] has not
    2184 been visited yet */
2185 if (A_marker[i2] != i_r) {
2186 /* first, mark entry RA[i_r,i2] as visited */
2187 A_marker[i2] = i_r;
2188
2189 /* then loop over elements in row i2 of P_ext_main */
2190 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2191 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2192 i_c = P->row_coupleBlock->pattern->index[j3];
2193
2194 /* check whether entry RAP[i_r,i_c] has been created.
2195 If not yet created, create the entry and increase
2196 the total number of elements in RAP_ext */
2197 if (P_marker[i_c] < row_marker) {
2198 P_marker[i_c] = sum;
2199 sum++;
2200 }
2201 }
2202
2203 /* loop over elements in row i2 of P_ext_couple, do the same */
2204 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2205 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2206 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2207 if (P_marker[i_c] < row_marker) {
2208 P_marker[i_c] = sum;
2209 sum++;
2210 }
2211 }
2212 }
2213 }
2214
2215 /* now loop over elements in row i1 of A->mainBlock, repeat
2216 the process we have done to block A->col_coupleBlock */
2217 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2218 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2219 i2 = A->mainBlock->pattern->index[j2];
2220 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2221 A_marker[i2 + num_Acouple_cols] = i_r;
2222 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2223 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2224 i_c = P->mainBlock->pattern->index[j3];
2225 if (P_marker[i_c] < row_marker) {
2226 P_marker[i_c] = sum;
2227 sum++;
2228 }
2229 }
2230 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2231 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2232 /* note that we need to map the column index in
2233 P->col_coupleBlock back into the column index in
2234 P_ext_couple */
2235 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2236 if (P_marker[i_c] < row_marker) {
2237 P_marker[i_c] = sum;
2238 sum++;
2239 }
2240 }
2241 }
2242 }
2243 }
2244 }
2245
    2246 /* Now we have the number of non-zero elements in RAP_ext, allocate
    2247 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2248 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
2249 RAP_ext_idx = MEMALLOC(sum, index_t);
2250 RAP_ext_val = MEMALLOC(sum * block_size, double);
2251 RA_val = TMPMEMALLOC(block_size, double);
2252 RAP_val = TMPMEMALLOC(block_size, double);
2253
2254 /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
2255 sum = num_Pext_cols + num_Pmain_cols;
2256 #pragma omp parallel for private(i) schedule(static)
2257 for (i=0; i<sum; i++) P_marker[i] = -1;
2258 #pragma omp parallel for private(i) schedule(static)
2259 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2260 sum = 0;
2261 RAP_ext_ptr[0] = 0;
2262 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2263 row_marker = sum;
2264 /* then, loop over elements in row i_r of R_couple */
2265 j1_ub = R_couple->pattern->ptr[i_r+1];
2266 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2267 i1 = R_couple->pattern->index[j1];
2268 R_val = &(R_couple->val[j1*block_size]);
2269
2270 /* then, loop over elements in row i1 of A->col_coupleBlock */
2271 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2272 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2273 i2 = A->col_coupleBlock->pattern->index[j2];
2274 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2275 for (irb=0; irb<row_block_size; irb++) {
2276 for (icb=0; icb<col_block_size; icb++) {
2277 rtmp = ZERO;
2278 for (ib=0; ib<col_block_size; ib++) {
2279 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2280 }
2281 RA_val[irb+row_block_size*icb]=rtmp;
2282 }
2283 }
2284
    2285 /* check whether entry RA[i_r, i2] has been previously visited.
    2286 A new RAP entry is possible only if entry RA[i_r, i2] has not
    2287 been visited yet */
2288 if (A_marker[i2] != i_r) {
2289 /* first, mark entry RA[i_r,i2] as visited */
2290 A_marker[i2] = i_r;
2291
2292 /* then loop over elements in row i2 of P_ext_main */
2293 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2294 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2295 i_c = P->row_coupleBlock->pattern->index[j3];
2296 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2297 for (irb=0; irb<row_block_size; irb++) {
2298 for (icb=0; icb<col_block_size; icb++) {
2299 rtmp = ZERO;
2300 for (ib=0; ib<col_block_size; ib++) {
2301 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2302 }
2303 RAP_val[irb+row_block_size*icb]=rtmp;
2304 }
2305 }
2306
2307
2308 /* check whether entry RAP[i_r,i_c] has been created.
2309 If not yet created, create the entry and increase
2310 the total number of elements in RAP_ext */
2311 if (P_marker[i_c] < row_marker) {
2312 P_marker[i_c] = sum;
2313 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2314 RAP_ext_idx[sum] = i_c + offset;
2315 sum++;
2316 } else {
2317 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2318 for (ib=0; ib<block_size; ib++) {
2319 temp_val[ib] += RAP_val[ib];
2320 }
2321 }
2322 }
2323
2324 /* loop over elements in row i2 of P_ext_couple, do the same */
2325 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2326 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2327 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2328 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2329 for (irb=0; irb<row_block_size; irb++) {
2330 for (icb=0; icb<col_block_size; icb++) {
2331 rtmp = ZERO;
2332 for (ib=0; ib<col_block_size; ib++) {
2333 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2334 }
2335 RAP_val[irb+row_block_size*icb]=rtmp;
2336 }
2337 }
2338
2339 if (P_marker[i_c] < row_marker) {
2340 P_marker[i_c] = sum;
2341 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2342 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2343 sum++;
2344 } else {
2345 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2346 for (ib=0; ib<block_size; ib++) {
2347 temp_val[ib] += RAP_val[ib];
2348 }
2349 }
2350 }
2351
    2352 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
    2353 is created; only the contributions are added. */
2354 } else {
2355 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2356 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2357 i_c = P->row_coupleBlock->pattern->index[j3];
2358 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2359 for (irb=0; irb<row_block_size; irb++) {
2360 for (icb=0; icb<col_block_size; icb++) {
2361 rtmp = ZERO;
2362 for (ib=0; ib<col_block_size; ib++) {
2363 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2364 }
2365 RAP_val[irb+row_block_size*icb]=rtmp;
2366 }
2367 }
2368
2369 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2370 for (ib=0; ib<block_size; ib++) {
2371 temp_val[ib] += RAP_val[ib];
2372 }
2373 }
2374 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2375 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2376 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2377 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2378 for (irb=0; irb<row_block_size; irb++) {
2379 for (icb=0; icb<col_block_size; icb++) {
2380 rtmp = ZERO;
2381 for (ib=0; ib<col_block_size; ib++) {
2382 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2383 }
2384 RAP_val[irb+row_block_size*icb]=rtmp;
2385 }
2386 }
2387
2388 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2389 for (ib=0; ib<block_size; ib++) {
2390 temp_val[ib] += RAP_val[ib];
2391 }
2392 }
2393 }
2394 }
2395
2396 /* now loop over elements in row i1 of A->mainBlock, repeat
2397 the process we have done to block A->col_coupleBlock */
2398 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2399 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2400 i2 = A->mainBlock->pattern->index[j2];
2401 temp_val = &(A->mainBlock->val[j2*block_size]);
2402 for (irb=0; irb<row_block_size; irb++) {
2403 for (icb=0; icb<col_block_size; icb++) {
2404 rtmp = ZERO;
2405 for (ib=0; ib<col_block_size; ib++) {
2406 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2407 }
2408 RA_val[irb+row_block_size*icb]=rtmp;
2409 }
2410 }
2411
2412 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2413 A_marker[i2 + num_Acouple_cols] = i_r;
2414 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2415 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2416 i_c = P->mainBlock->pattern->index[j3];
2417 temp_val = &(P->mainBlock->val[j3*block_size]);
2418 for (irb=0; irb<row_block_size; irb++) {
2419 for (icb=0; icb<col_block_size; icb++) {
2420 rtmp = ZERO;
2421 for (ib=0; ib<col_block_size; ib++) {
2422 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2423 }
2424 RAP_val[irb+row_block_size*icb]=rtmp;
2425 }
2426 }
2427
2428 if (P_marker[i_c] < row_marker) {
2429 P_marker[i_c] = sum;
2430 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2431 RAP_ext_idx[sum] = i_c + offset;
2432 sum++;
2433 } else {
2434 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2435 for (ib=0; ib<block_size; ib++) {
2436 temp_val[ib] += RAP_val[ib];
2437 }
2438 }
2439 }
2440 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2441 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2442 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2443 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2444 for (irb=0; irb<row_block_size; irb++) {
2445 for (icb=0; icb<col_block_size; icb++) {
2446 rtmp = ZERO;
2447 for (ib=0; ib<col_block_size; ib++) {
2448 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2449 }
2450 RAP_val[irb+row_block_size*icb]=rtmp;
2451 }
2452 }
2453
2454 if (P_marker[i_c] < row_marker) {
2455 P_marker[i_c] = sum;
2456 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2457 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2458 sum++;
2459 } else {
2460 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2461 for (ib=0; ib<block_size; ib++) {
2462 temp_val[ib] += RAP_val[ib];
2463 }
2464 }
2465 }
2466 } else {
2467 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2468 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2469 i_c = P->mainBlock->pattern->index[j3];
2470 temp_val = &(P->mainBlock->val[j3*block_size]);
2471 for (irb=0; irb<row_block_size; irb++) {
2472 for (icb=0; icb<col_block_size; icb++) {
2473 rtmp = ZERO;
2474 for (ib=0; ib<col_block_size; ib++) {
2475 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2476 }
2477 RAP_val[irb+row_block_size*icb]=rtmp;
2478 }
2479 }
2480
2481 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2482 for (ib=0; ib<block_size; ib++) {
2483 temp_val[ib] += RAP_val[ib];
2484 }
2485 }
2486 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2487 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2488 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2489 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2490 for (irb=0; irb<row_block_size; irb++) {
2491 for (icb=0; icb<col_block_size; icb++) {
2492 rtmp = ZERO;
2493 for (ib=0; ib<col_block_size; ib++) {
2494 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2495 }
2496 RAP_val[irb+row_block_size*icb]=rtmp;
2497 }
2498 }
2499
2500 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2501 for (ib=0; ib<block_size; ib++) {
2502 temp_val[ib] += RAP_val[ib];
2503 }
2504 }
2505 }
2506 }
2507 }
2508 RAP_ext_ptr[i_r+1] = sum;
2509 }
2510 TMPMEMFREE(P_marker);
2511 TMPMEMFREE(Pcouple_to_Pext);
2512
    2513 /* Now we have the part of RAP[r,c] where row "r" is the list of rows
    2514 given by the column list of P->col_coupleBlock, and column "c" is
    2515 the list of columns which possibly covers the whole column range of
    2516 the system matrix P. This part of the data is to be passed to
    2517 neighboring processors, and added to the corresponding RAP[r,c]
    2518 entries on the neighboring processors */
2519 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2520 &RAP_ext_val, global_id_P, block_size);
2521
2522 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2523 sum = RAP_ext_ptr[num_RAPext_rows];
2524 num_RAPext_cols = 0;
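    /* after Paso_Preconditioner_AMG_CopyRemoteData the indices in
       RAP_ext_idx are global; entries outside the local range
       [offset, offset+num_Pmain_cols) belong to columns owned by other
       ranks and are collected, together with global_id_P, to form the
       external column list of RAP. */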
2525 if (num_Pext_cols || sum > 0) {
2526 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
2527 j1_ub = offset + num_Pmain_cols;
2528 for (i=0, iptr=0; i<sum; i++) {
2529 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2530 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2531 }
2532 for (j=0; j<num_Pext_cols; j++, iptr++){
2533 temp[iptr] = global_id_P[j];
2534 }
2535
2536 if (iptr) {
2537 #ifdef USE_QSORTG
2538 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2539 #else
2540 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2541 #endif
2542 num_RAPext_cols = 1;
2543 i = temp[0];
2544 for (j=1; j<iptr; j++) {
2545 if (temp[j] > i) {
2546 i = temp[j];
2547 temp[num_RAPext_cols++] = i;
2548 }
2549 }
2550 }
2551 }
2552
    2553 /* renumber the external columns of RAP_ext and store their global ids */
2554 if(num_RAPext_cols){
2555 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
2556 for (i=0; i<num_RAPext_cols; i++)
2557 global_id_RAP[i] = temp[i];
2558 }
2559 if (num_Pext_cols || sum > 0)
2560 TMPMEMFREE(temp);
2561 j1_ub = offset + num_Pmain_cols;
2562 for (i=0; i<sum; i++) {
2563 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2564 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2565 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2566 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2567 } else
2568 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2569 }
2570
2571 /* build the mapping */
2572 if (num_Pcouple_cols) {
2573 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
2574 iptr = 0;
2575 for (i=0; i<num_RAPext_cols; i++)
2576 if (global_id_RAP[i] == P->global_id[iptr]) {
2577 Pcouple_to_RAP[iptr++] = i;
2578 if (iptr == num_Pcouple_cols) break;
2579 }
2580 }
2581
2582 if (num_Pext_cols) {
2583 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
2584 iptr = 0;
2585 for (i=0; i<num_RAPext_cols; i++)
2586 if (global_id_RAP[i] == global_id_P[iptr]) {
2587 Pext_to_RAP[iptr++] = i;
2588 if (iptr == num_Pext_cols) break;
2589 }
2590 }
2591
2592 if (global_id_P){
2593 TMPMEMFREE(global_id_P);
2594 global_id_P = NULL;
2595 }
2596
    2597 /* alloc and initialize the markers */
2598 sum = num_RAPext_cols + num_Pmain_cols;
2599 P_marker = TMPMEMALLOC(sum, index_t);
2600 #pragma omp parallel for private(i) schedule(static)
2601 for (i=0; i<sum; i++) P_marker[i] = -1;
2602 #pragma omp parallel for private(i) schedule(static)
2603 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2604
2605 /* Now, count the size of RAP. Start with rows in R_main */
2606 num_neighbors = P->col_coupler->connector->send->numNeighbors;
2607 offsetInShared = P->col_coupler->connector->send->offsetInShared;
2608 shared = P->col_coupler->connector->send->shared;
2609 i = 0;
2610 j = 0;
2611 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2612 /* Mark the start of row for both main block and couple block */
2613 row_marker = i;
2614 row_marker_ext = j;
2615
2616 /* Mark the diagonal element RAP[i_r, i_r], and other elements
2617 in RAP_ext */
2618 P_marker[i_r] = i;
2619 i++;
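          /* the diagonal entry RAP[i_r,i_r] is included unconditionally, so
             every row of the coarse operator carries its diagonal block in
             the pattern even if its numerical value remains zero. */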
2620 for (j1=0; j1<num_neighbors; j1++) {
2621 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2622 if (shared[j2] == i_r) {
2623 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2624 i_c = RAP_ext_idx[k];
2625 if (i_c < num_Pmain_cols) {
2626 if (P_marker[i_c] < row_marker) {
2627 P_marker[i_c] = i;
2628 i++;
2629 }
2630 } else {
2631 if (P_marker[i_c] < row_marker_ext) {
2632 P_marker[i_c] = j;
2633 j++;
2634 }
2635 }
2636 }
2637 break;
2638 }
2639 }
2640 }
2641
2642 /* then, loop over elements in row i_r of R_main */
2643 j1_ub = R_main->pattern->ptr[i_r+1];
2644 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2645 i1 = R_main->pattern->index[j1];
2646
2647 /* then, loop over elements in row i1 of A->col_coupleBlock */
2648 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2649 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2650 i2 = A->col_coupleBlock->pattern->index[j2];
2651
    2652 /* check whether entry RA[i_r, i2] has been previously visited.
    2653 A new RAP entry is possible only if entry RA[i_r, i2] has not
    2654 been visited yet */
2655 if (A_marker[i2] != i_r) {
2656 /* first, mark entry RA[i_r,i2] as visited */
2657 A_marker[i2] = i_r;
2658
2659 /* then loop over elements in row i2 of P_ext_main */
2660 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2661 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2662 i_c = P->row_coupleBlock->pattern->index[j3];
2663
    2664 /* check whether entry RAP[i_r,i_c] has been created.
    2665 If not yet created, create the entry and increase
    2666 the total number of elements in RAP_main */
2667 if (P_marker[i_c] < row_marker) {
2668 P_marker[i_c] = i;
2669 i++;
2670 }
2671 }
2672
2673 /* loop over elements in row i2 of P_ext_couple, do the same */
2674 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2675 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2676 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2677 if (P_marker[i_c] < row_marker_ext) {
2678 P_marker[i_c] = j;
2679 j++;
2680 }
2681 }
2682 }
2683 }
2684
2685 /* now loop over elements in row i1 of A->mainBlock, repeat
2686 the process we have done to block A->col_coupleBlock */
2687 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2688 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2689 i2 = A->mainBlock->pattern->index[j2];
2690 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2691 A_marker[i2 + num_Acouple_cols] = i_r;
2692 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2693 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2694 i_c = P->mainBlock->pattern->index[j3];
2695 if (P_marker[i_c] < row_marker) {
2696 P_marker[i_c] = i;
2697 i++;
2698 }
2699 }
2700 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2701 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2702 /* note that we need to map the column index in
2703 P->col_coupleBlock back into the column index in
2704 P_ext_couple */
2705 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2706 if (P_marker[i_c] < row_marker_ext) {
2707 P_marker[i_c] = j;
2708 j++;
2709 }
2710 }
2711 }
2712 }
2713 }
2714 }
2715
2716 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
2717 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2718 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2719 RAP_couple */
2720 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2721 RAP_main_idx = MEMALLOC(i, index_t);
2722 RAP_main_val = TMPMEMALLOC(i * block_size, double);
2723 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2724 RAP_couple_idx = MEMALLOC(j, index_t);
2725 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
2726
2727 RAP_main_ptr[num_Pmain_cols] = i;
2728 RAP_couple_ptr[num_Pmain_cols] = j;
2729
2730 #pragma omp parallel for private(i) schedule(static)
2731 for (i=0; i<sum; i++) P_marker[i] = -1;
2732 #pragma omp parallel for private(i) schedule(static)
2733 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2734
2735 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
2736 in R_main */
2737 i = 0;
2738 j = 0;
2739 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2740 /* Mark the start of row for both main block and couple block */
2741 row_marker = i;
2742 row_marker_ext = j;
2743 RAP_main_ptr[i_r] = row_marker;
2744 RAP_couple_ptr[i_r] = row_marker_ext;
2745
    2746 /* Mark and set up the diagonal element RAP[i_r, i_r], and elements
    2747 in row i_r of RAP_ext */
2748 P_marker[i_r] = i;
2749 RAP_main_val[i] = ZERO;
2750 RAP_main_idx[i] = i_r;
2751 i++;
2752
2753 for (j1=0; j1<num_neighbors; j1++) {
2754 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2755 if (shared[j2] == i_r) {
2756 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2757 i_c = RAP_ext_idx[k];
2758 if (i_c < num_Pmain_cols) {
2759 if (P_marker[i_c] < row_marker) {
2760 P_marker[i_c] = i;
2761 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2762 RAP_main_idx[i] = i_c;
2763 i++;
2764 } else {
2765 temp_val = &(RAP_ext_val[k*block_size]);
2766 RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2767 for (ib=0; ib<block_size; ib++)
2768 RAP_val[ib] += temp_val[ib];
2769 }
2770 } else {
2771 if (P_marker[i_c] < row_marker_ext) {
2772 P_marker[i_c] = j;
2773 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2774 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2775 j++;
2776 } else {
2777 temp_val = &(RAP_ext_val[k*block_size]);
2778 RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2779 for (ib=0; ib<block_size; ib++)
2780 RAP_val[ib] += temp_val[ib];
2781 }
2782 }
2783 }
2784 break;
2785 }
2786 }
2787 }
2788
2789 /* then, loop over elements in row i_r of R_main */
2790 j1_ub = R_main->pattern->ptr[i_r+1];
2791 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2792 i1 = R_main->pattern->index[j1];
2793 R_val = &(R_main->val[j1*block_size]);
2794
2795 /* then, loop over elements in row i1 of A->col_coupleBlock */
2796 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2797 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2798 i2 = A->col_coupleBlock->pattern->index[j2];
2799 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2800 for (irb=0; irb<row_block_size; irb++) {
2801 for (icb=0; icb<col_block_size; icb++) {
2802 rtmp = ZERO;
2803 for (ib=0; ib<col_block_size; ib++) {
2804 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2805 }
2806 RA_val[irb+row_block_size*icb]=rtmp;
2807 }
2808 }
2809
2810
    2811 /* check whether entry RA[i_r, i2] has been previously visited.
    2812 A new RAP entry is possible only if entry RA[i_r, i2] has not
    2813 been visited yet */
2814 if (A_marker[i2] != i_r) {
2815 /* first, mark entry RA[i_r,i2] as visited */
2816 A_marker[i2] = i_r;
2817
2818 /* then loop over elements in row i2 of P_ext_main */
2819 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2820 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2821 i_c = P->row_coupleBlock->pattern->index[j3];
2822 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2823 for (irb=0; irb<row_block_size; irb++) {
2824 for (icb=0; icb<col_block_size; icb++) {
2825 rtmp = ZERO;
2826 for (ib=0; ib<col_block_size; ib++) {
2827 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2828 }
2829 RAP_val[irb+row_block_size*icb]=rtmp;
2830 }
2831 }
2832
2833
    2834 /* check whether entry RAP[i_r,i_c] has been created.
    2835 If not yet created, create the entry and increase
    2836 the total number of elements in RAP_main */
2837 if (P_marker[i_c] < row_marker) {
2838 P_marker[i_c] = i;
2839 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2840 RAP_main_idx[i] = i_c;
2841 i++;
2842 } else {
2843 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2844 for (ib=0; ib<block_size; ib++) {
2845 temp_val[ib] += RAP_val[ib];
2846 }
2847 }
2848 }
2849
2850 /* loop over elements in row i2 of P_ext_couple, do the same */
2851 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2852 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2853 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2854 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2855 for (irb=0; irb<row_block_size; irb++) {
2856 for (icb=0; icb<col_block_size; icb++) {
2857 rtmp = ZERO;
2858 for (ib=0; ib<col_block_size; ib++) {
2859 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2860 }
2861 RAP_val[irb+row_block_size*icb]=rtmp;
2862 }
2863 }
2864
2865 if (P_marker[i_c] < row_marker_ext) {
2866 P_marker[i_c] = j;
2867 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2868 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2869 j++;
2870 } else {
2871 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2872 for (ib=0; ib<block_size; ib++) {
2873 temp_val[ib] += RAP_val[ib];
2874 }
2875 }
2876 }
2877
    2878 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
    2879 is created; only the contributions are added. */
2880 } else {
2881 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2882 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2883 i_c = P->row_coupleBlock->pattern->index[j3];
2884 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2885 for (irb=0; irb<row_block_size; irb++) {
2886 for (icb=0; icb<col_block_size; icb++) {
2887 rtmp = ZERO;
2888 for (ib=0; ib<col_block_size; ib++) {
2889 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2890 }
2891 RAP_val[irb+row_block_size*icb]=rtmp;
2892 }
2893 }
2894
2895 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2896 for (ib=0; ib<block_size; ib++) {
2897 temp_val[ib] += RAP_val[ib];
2898 }
2899 }
2900 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2901 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2902 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2903 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2904 for (irb=0; irb<row_block_size; irb++) {
2905 for (icb=0; icb<col_block_size; icb++) {
2906 rtmp = ZERO;
2907 for (ib=0; ib<col_block_size; ib++) {
2908 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2909 }
2910 RAP_val[irb+row_block_size*icb]=rtmp;
2911 }
2912 }
2913
2914 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2915 for (ib=0; ib<block_size; ib++) {
2916 temp_val[ib] += RAP_val[ib];
2917 }
2918 }
2919 }
2920 }
2921
2922 /* now loop over elements in row i1 of A->mainBlock, repeat
2923 the process we have done to block A->col_coupleBlock */
2924 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2925 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2926 i2 = A->mainBlock->pattern->index[j2];
2927 temp_val = &(A->mainBlock->val[j2*block_size]);
2928 for (irb=0; irb<row_block_size; irb++) {
2929 for (icb=0; icb<col_block_size; icb++) {
2930 rtmp = ZERO;
2931 for (ib=0; ib<col_block_size; ib++) {
2932 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2933 }
2934 RA_val[irb+row_block_size*icb]=rtmp;
2935 }
2936 }
2937
2938 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2939 A_marker[i2 + num_Acouple_cols] = i_r;
2940 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2941 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2942 i_c = P->mainBlock->pattern->index[j3];
2943 temp_val = &(P->mainBlock->val[j3*block_size]);
2944 for (irb=0; irb<row_block_size; irb++) {
2945 for (icb=0; icb<col_block_size; icb++) {
2946 rtmp = ZERO;
2947 for (ib=0; ib<col_block_size; ib++) {
2948 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2949 }
2950 RAP_val[irb+row_block_size*icb]=rtmp;
2951 }
2952 }
2953
2954 if (P_marker[i_c] < row_marker) {
2955 P_marker[i_c] = i;
2956 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2957 RAP_main_idx[i] = i_c;
2958 i++;
2959 } else {
2960 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2961 for (ib=0; ib<block_size; ib++) {
2962 temp_val[ib] += RAP_val[ib];
2963 }
2964 }
2965 }
2966 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2967 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2968 /* note that we need to map the column index in
2969 P->col_coupleBlock back into the column index in
2970 P_ext_couple */
2971 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2972 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2973 for (irb=0; irb<row_block_size; irb++) {
2974 for (icb=0; icb<col_block_size; icb++) {
2975 rtmp = ZERO;
2976 for (ib=0; ib<col_block_size; ib++) {
2977 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2978 }
2979 RAP_val[irb+row_block_size*icb]=rtmp;
2980 }
2981 }
2982
2983 if (P_marker[i_c] < row_marker_ext) {
2984 P_marker[i_c] = j;
2985 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2986 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2987 j++;
2988 } else {
2989 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2990 for (ib=0; ib<block_size; ib++) {
2991 temp_val[ib] += RAP_val[ib];
2992 }
2993 }
2994 }
2995
2996 } else {
2997 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2998 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2999 i_c = P->mainBlock->pattern->index[j3];
3000 temp_val = &(P->mainBlock->val[j3*block_size]);
3001 for (irb=0; irb<row_block_size; irb++) {
3002 for (icb=0; icb<col_block_size; icb++) {
3003 rtmp = ZERO;
3004 for (ib=0; ib<col_block_size; ib++) {
3005 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3006 }
3007 RAP_val[irb+row_block_size*icb]=rtmp;
3008 }
3009 }
3010
3011 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3012 for (ib=0; ib<block_size; ib++) {
3013 temp_val[ib] += RAP_val[ib];
3014 }
3015 }
3016 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
3017 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3018 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3019 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
3020 for (irb=0; irb<row_block_size; irb++) {
3021 for (icb=0; icb<col_block_size; icb++) {
3022 rtmp = ZERO;
3023 for (ib=0; ib<col_block_size; ib++) {
3024 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3025 }
3026 RAP_val[irb+row_block_size*icb]=rtmp;
3027 }
3028 }
3029
3030 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3031 for (ib=0; ib<block_size; ib++) {
3032 temp_val[ib] += RAP_val[ib];
3033 }
3034 }
3035 }
3036 }
3037 }
3038
3039 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
3040 if (i > row_marker) {
3041 offset = i - row_marker;
3042 temp = TMPMEMALLOC(offset, index_t);
3043 for (iptr=0; iptr<offset; iptr++)
3044 temp[iptr] = RAP_main_idx[row_marker+iptr];
3045 if (offset > 0) {
3046 #ifdef USE_QSORTG
3047 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3048 #else
3049 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3050 #endif
3051 }
3052 temp_val = TMPMEMALLOC(offset * block_size, double);
3053 for (iptr=0; iptr<offset; iptr++){
3054 k = P_marker[temp[iptr]];
3055 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
3056 P_marker[temp[iptr]] = iptr + row_marker;
3057 }
3058 for (iptr=0; iptr<offset; iptr++){
3059 RAP_main_idx[row_marker+iptr] = temp[iptr];
3060 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3061 }
3062 TMPMEMFREE(temp);
3063 TMPMEMFREE(temp_val);
3064 }
3065 if (j > row_marker_ext) {
3066 offset = j - row_marker_ext;
3067 temp = TMPMEMALLOC(offset, index_t);
3068 for (iptr=0; iptr<offset; iptr++)
3069 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
3070 if (offset > 0) {
3071 #ifdef USE_QSORTG
3072 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3073 #else
3074 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3075 #endif
3076 }
3077 temp_val = TMPMEMALLOC(offset * block_size, double);
3078 for (iptr=0; iptr<offset; iptr++){
3079 k = P_marker[temp[iptr] + num_Pmain_cols];
3080 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
3081 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
3082 }
3083 for (iptr=0; iptr<offset; iptr++){
3084 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
3085 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3086 }
3087 TMPMEMFREE(temp);
3088 TMPMEMFREE(temp_val);
3089 }
3090 }
3091
3092 TMPMEMFREE(RA_val);
3093 TMPMEMFREE(RAP_val);
3094 TMPMEMFREE(A_marker);
3095 TMPMEMFREE(Pext_to_RAP);
3096 TMPMEMFREE(Pcouple_to_RAP);
3097 MEMFREE(RAP_ext_ptr);
3098 MEMFREE(RAP_ext_idx);
3099 MEMFREE(RAP_ext_val);
3100 Paso_SparseMatrix_free(R_main);
3101 Paso_SparseMatrix_free(R_couple);
3102
3103 /* Check whether there are empty columns in RAP_couple */
3104 #pragma omp parallel for schedule(static) private(i)
3105 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
3106 /* num of non-empty columns is stored in "k" */
3107 k = 0;
3108 j = RAP_couple_ptr[num_Pmain_cols];
3109 for (i=0; i<j; i++) {
3110 i1 = RAP_couple_idx[i];
3111 if (P_marker[i1]) {
3112 P_marker[i1] = 0;
3113 k++;
3114 }
3115 }
3116
    3117 /* empty columns were found */
3118 if (k < num_RAPext_cols) {
3119 temp = TMPMEMALLOC(k, index_t);
3120 k = 0;
3121 for (i=0; i<num_RAPext_cols; i++)
3122 if (!P_marker[i]) {
3123 P_marker[i] = k;
3124 temp[k] = global_id_RAP[i];
3125 k++;
3126 }
3127 for (i=0; i<j; i++) {
3128 i1 = RAP_couple_idx[i];
3129 RAP_couple_idx[i] = P_marker[i1];
3130 }
3131 num_RAPext_cols = k;
3132 TMPMEMFREE(global_id_RAP);
3133 global_id_RAP = temp;
3134 }
3135 TMPMEMFREE(P_marker);
3136
3137 /******************************************************/
    3138 /* Start to create the coarse level System Matrix A_c */
3139 /******************************************************/
3140 /* first, prepare the sender/receiver for the col_connector */
3141 dist = P->pattern->input_distribution->first_component;
3142 recv_len = TMPMEMALLOC(size, dim_t);
3143 send_len = TMPMEMALLOC(size, dim_t);
3144 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
3145 offsetInShared = TMPMEMALLOC(size+1, index_t);
3146 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
3147 memset(recv_len, 0, sizeof(dim_t) * size);
3148 memset(send_len, 0, sizeof(dim_t) * size);
3149 num_neighbors = 0;
3150 offsetInShared[0] = 0;
3151 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
3152 shared[i] = i + num_Pmain_cols;
3153 if (k <= global_id_RAP[i]) {
3154 if (recv_len[j] > 0) {
3155 neighbor[num_neighbors] = j;
3156 num_neighbors ++;
3157 offsetInShared[num_neighbors] = i;
3158 }
3159 while (k <= global_id_RAP[i]) {
3160 j++;
3161 k = dist[j+1];
3162 }
3163 }
3164 recv_len[j] ++;
3165 }
3166 if (recv_len[j] > 0) {
3167 neighbor[num_neighbors] = j;
3168 num_neighbors ++;
3169 offsetInShared[num_neighbors] = i;
3170 }
3171 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3172 neighbor, shared, offsetInShared, 1, 0, mpi_info);
3173
3174 #ifdef ESYS_MPI
3175 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
3176 #endif
3177
3178 #ifdef ESYS_MPI
3179 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
3180 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
3181 #else
3182 mpi_requests=TMPMEMALLOC(size*2,int);
3183 mpi_stati=TMPMEMALLOC(size*2,int);
3184 #endif
  num_neighbors = 0;
  j = 0;
  offsetInShared[0] = 0;
  for (i=0; i<size; i++) {
    if (send_len[i] > 0) {
      neighbor[num_neighbors] = i;
      num_neighbors++;
      j += send_len[i];
      offsetInShared[num_neighbors] = j;
    }
  }
  TMPMEMFREE(shared);
  shared = TMPMEMALLOC(j, index_t);
  for (i=0, j=0; i<num_neighbors; i++) {
    k = neighbor[i];
#ifdef ESYS_MPI
    MPI_Irecv(&shared[j], send_len[k], MPI_INT, k,
              mpi_info->msg_tag_counter+k,
              mpi_info->comm, &mpi_requests[i]);
#endif
    j += send_len[k];
  }
  for (i=0, j=0; i<recv->numNeighbors; i++) {
    k = recv->neighbor[i];
#ifdef ESYS_MPI
    MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[i+num_neighbors]);
#endif
    j += recv_len[k];
  }
#ifdef ESYS_MPI
  MPI_Waitall(num_neighbors + recv->numNeighbors,
              mpi_requests, mpi_stati);
#endif
  mpi_info->msg_tag_counter += size;

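  /* the received ids are global; subtracting dist[rank] turns them into
     row indices local to this rank, as the sender's shared list expects */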
  j = offsetInShared[num_neighbors];
  offset = dist[rank];
  for (i=0; i<j; i++) shared[i] = shared[i] - offset;
  send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
            neighbor, shared, offsetInShared, 1, 0, mpi_info);

  col_connector = Paso_Connector_alloc(send, recv);
  TMPMEMFREE(shared);
  Paso_SharedComponents_free(recv);
  Paso_SharedComponents_free(send);

  /* now, create the row distribution (output_distribution) and the
     column distribution (input_distribution) */
  input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
  output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);

  /* then, prepare the sender/receiver for the row_connector; first,
     prepare the information for the sender */
  sum = RAP_couple_ptr[num_Pmain_cols];
  len = TMPMEMALLOC(size, dim_t);
  send_ptr = TMPMEMALLOC(size, index_t*);
  send_idx = TMPMEMALLOC(size, index_t*);
  for (i=0; i<size; i++) {
    send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
    send_idx[i] = TMPMEMALLOC(sum, index_t);
    memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
  }
  memset(len, 0, sizeof(dim_t) * size);
  recv = col_connector->recv;
  sum = 0;
  for (i_r=0; i_r<num_Pmain_cols; i_r++) {
    i1 = RAP_couple_ptr[i_r];
    i2 = RAP_couple_ptr[i_r+1];
    if (i2 > i1) {
      /* then row i_r will be in the sender of row_connector; now check
         how many neighbors i_r needs to be sent to */
      for (j=i1; j<i2; j++) {
        i_c = RAP_couple_idx[j];
        /* find out the corresponding neighbor "p" of column i_c */
        for (p=0; p<recv->numNeighbors; p++) {
          if (i_c < recv->offsetInShared[p+1]) {
            k = recv->neighbor[p];
            if (send_ptr[k][i_r] == 0) sum++;
            send_ptr[k][i_r]++;
            send_idx[k][len[k]] = global_id_RAP[i_c];
            len[k]++;
            break;
          }
        }
      }
    }
  }
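  /* at this point send_ptr[p][i] holds the number of entries of row i
     that go to rank p, send_idx[p] lists their global column ids, and
     "sum" counts the distinct (row, neighbor) pairs to be shared */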
  if (global_id_RAP) {
    TMPMEMFREE(global_id_RAP);
    global_id_RAP = NULL;
  }

  /* now allocate the sender */
  shared = TMPMEMALLOC(sum, index_t);
  memset(send_len, 0, sizeof(dim_t) * size);
  num_neighbors = 0;
  offsetInShared[0] = 0;
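  /* the nested loop below also compacts each send_ptr[p] in place: the
     per-row counts of the rows actually sent to rank p are moved to the
     front (send_ptr[p][0..send_len[p]-1]) so they can be shipped later
     as one contiguous message */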
  for (p=0, k=0; p<size; p++) {
    for (i=0; i<num_Pmain_cols; i++) {
      if (send_ptr[p][i] > 0) {
        shared[k] = i;
        k++;
        send_ptr[p][send_len[p]] = send_ptr[p][i];
        send_len[p]++;
      }
    }
    if (k > offsetInShared[num_neighbors]) {
      neighbor[num_neighbors] = p;
      num_neighbors++;
      offsetInShared[num_neighbors] = k;
    }
  }
  send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
            neighbor, shared, offsetInShared, 1, 0, mpi_info);

  /* send/recv the number of rows to be sent from the current proc;
     this recovers the receiver info of row_connector from the senders */
#ifdef ESYS_MPI
  MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
#endif
  num_neighbors = 0;
  offsetInShared[0] = 0;
  j = 0;
  for (i=0; i<size; i++) {
    if (i != rank && recv_len[i] > 0) {
      neighbor[num_neighbors] = i;
      num_neighbors++;
      j += recv_len[i];
      offsetInShared[num_neighbors] = j;
    }
  }
  TMPMEMFREE(shared);
  TMPMEMFREE(recv_len);
  shared = TMPMEMALLOC(j, index_t);
  k = offsetInShared[num_neighbors];
  for (i=0; i<k; i++) {
    shared[i] = i + num_Pmain_cols;
  }
  recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
            neighbor, shared, offsetInShared, 1, 0, mpi_info);
  row_connector = Paso_Connector_alloc(send, recv);
  TMPMEMFREE(shared);
  Paso_SharedComponents_free(recv);
  Paso_SharedComponents_free(send);

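  /* next, transfer the CSR pattern of the rowCoupleBlock (rows held by
     neighbor ranks that couple into this rank's columns) in two rounds:
     first the per-row counts, then the column indices */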
  /* send/recv pattern->ptr for rowCoupleBlock */
  num_RAPext_rows = offsetInShared[num_neighbors];
  row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
  for (p=0; p<num_neighbors; p++) {
    j = offsetInShared[p];
    i = offsetInShared[p+1];
#ifdef ESYS_MPI
    MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
              mpi_info->msg_tag_counter+neighbor[p],
              mpi_info->comm, &mpi_requests[p]);
#endif
  }
  send = row_connector->send;
  for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
    MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
               MPI_INT, send->neighbor[p],
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
  }
#ifdef ESYS_MPI
  MPI_Waitall(num_neighbors + send->numNeighbors,
              mpi_requests, mpi_stati);
#endif
  mpi_info->msg_tag_counter += size;
  TMPMEMFREE(send_len);

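  /* turn the received per-row counts into CSR offsets by an exclusive
     prefix sum, e.g. counts (2, 0, 3) become offsets (0, 2, 2, 5) */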
  sum = 0;
  for (i=0; i<num_RAPext_rows; i++) {
    k = row_couple_ptr[i];
    row_couple_ptr[i] = sum;
    sum += k;
  }
  row_couple_ptr[num_RAPext_rows] = sum;

  /* send/recv pattern->index for rowCoupleBlock */
  k = row_couple_ptr[num_RAPext_rows];
  row_couple_idx = MEMALLOC(k, index_t);
  for (p=0; p<num_neighbors; p++) {
    j1 = row_couple_ptr[offsetInShared[p]];
    j2 = row_couple_ptr[offsetInShared[p+1]];
#ifdef ESYS_MPI
    MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
              mpi_info->msg_tag_counter+neighbor[p],
              mpi_info->comm, &mpi_requests[p]);
#endif
  }
  for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
    MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
               MPI_INT, send->neighbor[p],
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
  }
#ifdef ESYS_MPI
  MPI_Waitall(num_neighbors + send->numNeighbors,
              mpi_requests, mpi_stati);
#endif
  mpi_info->msg_tag_counter += size;

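  /* the received column indices are global; subtract this rank's first
     global column to map them into the local mainBlock numbering */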
  offset = input_dist->first_component[rank];
  k = row_couple_ptr[num_RAPext_rows];
  for (i=0; i<k; i++) {
    row_couple_idx[i] -= offset;
  }

  for (i=0; i<size; i++) {
    TMPMEMFREE(send_ptr[i]);
    TMPMEMFREE(send_idx[i]);
  }
  TMPMEMFREE(send_ptr);
  TMPMEMFREE(send_idx);
  TMPMEMFREE(len);
  TMPMEMFREE(offsetInShared);
  TMPMEMFREE(neighbor);
  TMPMEMFREE(mpi_requests);
  TMPMEMFREE(mpi_stati);
  Esys_MPIInfo_free(mpi_info);

  /* Now, we can create the patterns for mainBlock and the coupleBlocks */
  main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
            num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
  col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
            num_Pmain_cols, num_RAPext_cols,
            RAP_couple_ptr, RAP_couple_idx);
  row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
            num_RAPext_rows, num_Pmain_cols,
            row_couple_ptr, row_couple_idx);

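  /* main_pattern covers the local rows x local columns block,
     col_couple_pattern the local rows x external columns block, and
     row_couple_pattern the external rows x local columns block */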
  /* next, create the system matrix */
  pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
            output_dist, input_dist, main_pattern, col_couple_pattern,
            row_couple_pattern, col_connector, row_connector);
  out = Paso_SystemMatrix_alloc(A->type, pattern,
            row_block_size, col_block_size, FALSE);

  /* finally, fill in the data */
  memcpy(out->mainBlock->val, RAP_main_val,
         out->mainBlock->len * sizeof(double));
  memcpy(out->col_coupleBlock->val, RAP_couple_val,
         out->col_coupleBlock->len * sizeof(double));

  /* Clean up */
  Paso_SystemMatrixPattern_free(pattern);
  Paso_Pattern_free(main_pattern);
  Paso_Pattern_free(col_couple_pattern);
  Paso_Pattern_free(row_couple_pattern);
  Paso_Connector_free(col_connector);
  Paso_Connector_free(row_connector);
  Paso_Distribution_free(output_dist);
  Paso_Distribution_free(input_dist);
  TMPMEMFREE(RAP_main_val);
  TMPMEMFREE(RAP_couple_val);
  if (Esys_noError()) {
    return out;
  } else {
    Paso_SystemMatrix_free(out);
    return NULL;
  }
}
