/[escript]/trunk/paso/src/AMG_Interpolation.cpp

Contents of /trunk/paso/src/AMG_Interpolation.cpp



Revision 4843
Tue Apr 8 05:32:07 2014 UTC (5 years, 8 months ago) by caltinay
File size: 132793 byte(s)
checkpointing paso::Preconditioner.

1 /*****************************************************************************
2 *
3 * Copyright (c) 2003-2014 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development 2012-2013 by School of Earth Sciences
12 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 *
14 *****************************************************************************/
15
16
17 /****************************************************************************/
18
19 /* Paso: defines AMG Interpolation */
20
21 /****************************************************************************/
22
23 /* Author: l.gao@uq.edu.au */
24
25 /****************************************************************************/
26
27 #include "Paso.h"
28 #include "SparseMatrix.h"
29 #include "PasoUtil.h"
30 #include "Preconditioner.h"
31
32 namespace paso {
33
34 /****************************************************************************
35
36 Methods necessary for the AMG preconditioner
37
38 construct the n_C x n_C coarse level operator A_C from the matrix A
39 and the prolongation matrix P.
40
41 The coarse operator A_C is defined by A_C = R A P, where R = P^T.
42
43 *****************************************************************************/
44
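/* For reference, a minimal dense sketch of the Galerkin triple product
   A_C = R A P with R = P^T that the routines below compute for the
   distributed sparse block case. It is illustrative only and is not called
   anywhere in Paso; the name dense_galerkin_sketch, the row-major layout
   and the plain int/double types are assumptions made for this example. */
static void dense_galerkin_sketch(int n, int n_C,
                                  const double* A,   /* n   x n,   row-major */
                                  const double* P,   /* n   x n_C, row-major */
                                  double* A_C)       /* n_C x n_C, row-major */
{
    for (int i=0; i<n_C; i++) {
        for (int j=0; j<n_C; j++) {
            double s = 0.;
            for (int k=0; k<n; k++)
                for (int m=0; m<n; m++)
                    s += P[k*n_C+i] * A[k*n+m] * P[m*n_C+j]; /* (P^T A P)(i,j) */
            A_C[i*n_C+j] = s;
        }
    }
}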
45 /* Extend the system matrix B with two extra sparse matrices:
46 B_ext_main and B_ext_couple.
47 Together these two sparse matrices represent the portion
48 of B that is stored on neighbouring processes and is
49 needed locally for the triple matrix product (B^T) A B.
50
51 FOR NOW, we assume that the system matrix B has a NULL
52 row_coupleBlock and a NULL remote_coupleBlock. Based on
53 this assumption, we reuse the link row_coupleBlock for the
54 sparse matrix B_ext_main, and the link remote_coupleBlock
55 for the sparse matrix B_ext_couple.
56
57 To be specific, B_ext (on processor k) is the group of rows
58 in B where the list of rows from processor q is given by
59 A->col_coupler->connector->send->shared[rPtr...]
60 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
61 on q.
62 */
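/* A small illustration (assumed layout, two ranks only): suppose rank 1 owns
   rows 4..7 of B and rank 1's send->shared slice for neighbour 0 lists rows
   4 and 6. Then B_ext on rank 0 consists of rows 4 and 6 of B. Within those
   rows, columns that fall inside rank 0's own column range are stored in
   B_ext_main (via row_coupleBlock, with column indices local to rank 0),
   while all remaining columns keep their global ids and are stored in
   B_ext_couple (via remote_coupleBlock). */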
63 void Preconditioner_AMG_extendB(SystemMatrix_ptr A, SystemMatrix_ptr B)
64 {
65 if (A->mpi_info->size == 1) return;
66
67 if (B->remote_coupleBlock.get()) {
68 Esys_setError(VALUE_ERROR, "Preconditioner_AMG_extendB: the link to "
69 "remote_coupleBlock has already been set.");
70 return;
71 }
72 #ifdef ESYS_MPI
73 B->row_coupleBlock.reset();
74
75 Pattern_ptr pattern_main, pattern_couple;
76 Coupler_ptr coupler;
77 SharedComponents_ptr send, recv;
78 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
79 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
80 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
81 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
82 index_t *idx_m=NULL, *idx_c=NULL;
83 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
84 index_t offset, len, block_size, block_size_size, max_num_cols;
85 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
86 dim_t *recv_degree=NULL, *send_degree=NULL;
87 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
88
89 // send/receive the unknowns' global IDs
90 num_main_cols = B->mainBlock->numCols;
91 cols = new double[num_main_cols];
92 offset = B->col_distribution->first_component[rank];
93 #pragma omp parallel for private(i) schedule(static)
94 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
95 if (B->global_id == NULL) {
96 coupler.reset(new Coupler(B->col_coupler->connector, 1));
97 coupler->startCollect(cols);
98 }
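/* Note on the exchange started above (descriptive sketch of the intent):
   each rank hands its own global column ids, offset..offset+num_main_cols-1,
   to the Coupler. After finishCollect() below, coupler->recv_buffer is
   expected to hold, for every column of B->col_coupleBlock, the global id
   assigned by the rank that owns that column. */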
99
100 recv_buf = new index_t[size];
101 recv_degree = new dim_t[size];
102 recv_offset = new index_t[size+1];
103 #pragma omp parallel for private(i) schedule(static)
104 for (i=0; i<size; i++){
105 recv_buf[i] = 0;
106 recv_degree[i] = 1;
107 recv_offset[i] = i;
108 }
109
110 block_size = B->block_size;
111 block_size_size = block_size * sizeof(double);
112 num_couple_cols = B->col_coupleBlock->numCols;
113 send = A->col_coupler->connector->send;
114 recv = A->col_coupler->connector->recv;
115 num_neighbors = send->numNeighbors;
116 p = send->offsetInShared[num_neighbors];
117 len = p * B->col_distribution->first_component[size];
118 send_buf = new double[len * block_size];
119 send_idx = new index_t[len];
120 send_offset = new index_t[(p+1)*2];
121 send_degree = new dim_t[num_neighbors];
122 i = num_main_cols + num_couple_cols;
123 send_m = new double[i * block_size];
124 send_c = new double[i * block_size];
125 idx_m = new index_t[i];
126 idx_c = new index_t[i];
127
128 /* wait until the unknowns' global IDs have been received */
129 if (B->global_id == NULL) {
130 coupler->finishCollect();
131 global_id = new index_t[num_couple_cols];
132 #pragma omp parallel for private(i) schedule(static)
133 for (i=0; i<num_couple_cols; ++i)
134 global_id[i] = coupler->recv_buffer[i];
135 B->global_id = global_id;
136 } else
137 global_id = B->global_id;
138
139 /* distribute the number of cols in current col_coupleBlock to all ranks */
140 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
141
142 /* distribute global_ids of cols to be considered to all ranks*/
143 len = 0;
144 max_num_cols = 0;
145 for (i=0; i<size; i++){
146 recv_degree[i] = recv_buf[i];
147 recv_offset[i] = len;
148 len += recv_buf[i];
149 if (max_num_cols < recv_buf[i])
150 max_num_cols = recv_buf[i];
151 }
152 recv_offset[size] = len;
153 cols_array = new index_t[len];
154 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
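/* After the two MPI_Allgatherv calls above (descriptive note): recv_buf[q]
   holds the number of coupled columns on rank q, recv_offset[q] marks the
   start of rank q's slice, cols_array is the concatenation of every rank's
   global_id list, and max_num_cols is the largest per-rank count. */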
155
156 // first, prepare the ptr_ptr to be received
157 q = recv->numNeighbors;
158 len = recv->offsetInShared[q];
159 ptr_ptr = new index_t[(len+1) * 2];
160 for (p=0; p<q; p++) {
161 row = recv->offsetInShared[p];
162 m = recv->offsetInShared[p + 1];
163 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
164 A->mpi_info->msg_tag_counter+recv->neighbor[p],
165 A->mpi_info->comm,
166 &(A->col_coupler->mpi_requests[p]));
167 }
168
169 // now prepare the rows to be sent (the degree, the offset and the data)
170 len = 0;
171 i0 = 0;
172 for (p=0; p<num_neighbors; p++) {
173 i = i0;
174 neighbor = send->neighbor[p];
175 m_lb = B->col_distribution->first_component[neighbor];
176 m_ub = B->col_distribution->first_component[neighbor + 1];
177 j_ub = send->offsetInShared[p + 1];
178 for (j=send->offsetInShared[p]; j<j_ub; j++) {
179 row = send->shared[j];
180 l_m = 0;
181 l_c = 0;
182 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
183 k_lb = B->col_coupleBlock->pattern->ptr[row];
184
185 /* check part of col_coupleBlock for data to be passed @row */
186 for (k=k_lb; k<k_ub; k++) {
187 m = global_id[B->col_coupleBlock->pattern->index[k]];
188 if (m > offset) break;
189 if (m>= m_lb && m < m_ub) {
190 /* data to be passed to sparse matrix B_ext_main */
191 idx_m[l_m] = m - m_lb;
192 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
193 l_m++;
194 } else {
195 /* data to be passed to sparse matrix B_ext_couple */
196 idx_c[l_c] = m;
197 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
198 l_c++;
199 }
200 }
201 k_lb = k;
202
203 /* check mainBlock for data to be passed @row to sparse
204 matrix B_ext_couple */
205 k_ub = B->mainBlock->pattern->ptr[row + 1];
206 k = B->mainBlock->pattern->ptr[row];
207 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
208 for (; k<k_ub; k++) {
209 m = B->mainBlock->pattern->index[k] + offset;
210 idx_c[l_c] = m;
211 l_c++;
212 }
213
214 /* check the remaining part of col_coupleBlock for data to
215 be passed @row to sparse matrix B_ext_couple */
216 k = k_lb;
217 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
218 for (k=k_lb; k<k_ub; k++) {
219 m = global_id[B->col_coupleBlock->pattern->index[k]];
220 if (m>= m_lb && m < m_ub) {
221 /* data to be passed to sparse matrix B_ext_main */
222 idx_m[l_m] = m - m_lb;
223 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
224 l_m++;
225 } else {
226 /* data to be passed to sparse matrix B_ext_couple */
227 idx_c[l_c] = m;
228 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
229 l_c++;
230 }
231 }
232
233 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
234 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
235 send_offset[2*i] = l_m;
236 len += l_m;
237 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
238 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
239 send_offset[2*i+1] = l_c;
240 len += l_c;
241 i++;
242 }
243
244 /* sending */
245 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
246 A->mpi_info->msg_tag_counter+rank,
247 A->mpi_info->comm,
248 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
249 send_degree[p] = len;
250 i0 = i;
251 }
252 delete[] send_m;
253 delete[] send_c;
254 delete[] idx_m;
255 delete[] idx_c;
256
257 q = recv->numNeighbors;
258 len = recv->offsetInShared[q];
259 ptr_main = new index_t[(len+1)];
260 ptr_couple = new index_t[(len+1)];
261
262 MPI_Waitall(A->col_coupler->connector->send->numNeighbors +
263 A->col_coupler->connector->recv->numNeighbors,
264 A->col_coupler->mpi_requests,
265 A->col_coupler->mpi_stati);
266 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
267
268 j = 0;
269 k = 0;
270 ptr_main[0] = 0;
271 ptr_couple[0] = 0;
272 for (i=0; i<len; i++) {
273 j += ptr_ptr[2*i];
274 k += ptr_ptr[2*i+1];
275 ptr_main[i+1] = j;
276 ptr_couple[i+1] = k;
277 }
278
279 delete[] ptr_ptr;
280 idx_main = new index_t[j];
281 idx_couple = new index_t[k];
282 ptr_idx = new index_t[j+k];
283 ptr_val = new double[(j+k) * block_size];
284
285 /* send/receive index array */
286 j=0;
287 k_ub = 0;
288 for (p=0; p<recv->numNeighbors; p++) {
289 k = recv->offsetInShared[p];
290 m = recv->offsetInShared[p+1];
291 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
292 if (i > 0) {
293 k_ub ++;
294 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
295 A->mpi_info->msg_tag_counter+recv->neighbor[p],
296 A->mpi_info->comm,
297 &(A->col_coupler->mpi_requests[p]));
298 }
299 j += i;
300 }
301
302 j=0;
303 k_ub = 0;
304 for (p=0; p<num_neighbors; p++) {
305 i = send_degree[p] - j;
306 if (i > 0){
307 k_ub ++;
308 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
309 A->mpi_info->msg_tag_counter+rank,
310 A->mpi_info->comm,
311 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
312 }
313 j = send_degree[p];
314 }
315
316 MPI_Waitall(A->col_coupler->connector->send->numNeighbors +
317 A->col_coupler->connector->recv->numNeighbors,
318 A->col_coupler->mpi_requests,
319 A->col_coupler->mpi_stati);
320 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
321
322 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
323 for (i=0; i<len; i++) {
324 j = ptr_main[i];
325 k = ptr_main[i+1];
326 m = ptr_couple[i];
327 for (p=j; p<k; p++) {
328 idx_main[p] = ptr_idx[m+p];
329 }
330 j = ptr_couple[i+1];
331 for (p=m; p<j; p++) {
332 idx_couple[p] = ptr_idx[k+p];
333 }
334 }
335 delete[] ptr_idx;
336
337 /* allocate pattern and sparse matrix for B_ext_main */
338 pattern_main.reset(new Pattern(B->col_coupleBlock->pattern->type,
339 len, num_main_cols, ptr_main, idx_main));
340 B->row_coupleBlock.reset(new SparseMatrix(B->col_coupleBlock->type,
341 pattern_main, B->row_block_size, B->col_block_size,
342 false));
343
344 /* allocate pattern and sparse matrix for B_ext_couple */
345 pattern_couple.reset(new Pattern(B->col_coupleBlock->pattern->type,
346 len, B->col_distribution->first_component[size],
347 ptr_couple, idx_couple));
348 B->remote_coupleBlock.reset(new SparseMatrix(B->col_coupleBlock->type,
349 pattern_couple, B->row_block_size, B->col_block_size,
350 false));
351
352 /* send/receive value array */
353 j=0;
354 for (p=0; p<recv->numNeighbors; p++) {
355 k = recv->offsetInShared[p];
356 m = recv->offsetInShared[p+1];
357 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
358 if (i > 0)
359 MPI_Irecv(&(ptr_val[j]), i * block_size,
360 MPI_DOUBLE, recv->neighbor[p],
361 A->mpi_info->msg_tag_counter+recv->neighbor[p],
362 A->mpi_info->comm,
363 &(A->col_coupler->mpi_requests[p]));
364 j += (i * block_size);
365 }
366
367 j=0;
368 for (p=0; p<num_neighbors; p++) {
369 i = send_degree[p] - j;
370 if (i > 0)
371 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
372 A->mpi_info->msg_tag_counter+rank,
373 A->mpi_info->comm,
374 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
375 j = send_degree[p] ;
376 }
377
378 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
379 A->col_coupler->mpi_requests,
380 A->col_coupler->mpi_stati);
381 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
382
383 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
384 for (i=0; i<len; i++) {
385 j = ptr_main[i];
386 k = ptr_main[i+1];
387 m = ptr_couple[i];
388 for (p=j; p<k; p++) {
389 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
390 }
391 j = ptr_couple[i+1];
392 for (p=m; p<j; p++) {
393 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
394 }
395 }
396
397 delete[] ptr_val;
398 delete[] cols;
399 delete[] cols_array;
400 delete[] recv_offset;
401 delete[] recv_degree;
402 delete[] recv_buf;
403 delete[] send_buf;
404 delete[] send_offset;
405 delete[] send_degree;
406 delete[] send_idx;
407 #endif // ESYS_MPI
408 }
409
410 /* The sparse matrix (let's call it T) defined by T(ptr, idx, val)
411 has the same number of rows as P->col_coupleBlock->numCols. We now need
412 to copy blocks of data in T to the neighbouring processors given by
413 P->col_coupler->connector->recv->neighbor[k] where k is in
414 [0, P->col_coupler->connector->recv->numNeighbors).
415 The rows to be copied to neighbour processor k are given by the list
416 P->col_coupler->connector->recv->offsetInShared[k] ...
417 P->col_coupler->connector->recv->offsetInShared[k+1] */
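/* Note the role reversal below: the rows of T correspond to columns coupled
   via P->col_coupler, so this routine sends along the coupler's 'recv'
   connector and receives along its 'send' connector (see the send/recv
   assignments just below). A toy picture, assuming two ranks: if rank 0's
   col_coupler receives columns owned by rank 1, then the rows of T that
   belong to those columns are shipped from rank 0 back to rank 1, where
   they correspond to locally owned rows. */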
418 void Preconditioner_AMG_CopyRemoteData(SystemMatrix_ptr P,
419 index_t **p_ptr, index_t **p_idx, double **p_val,
420 index_t *global_id, index_t block_size)
421 {
422 SharedComponents_ptr send, recv;
423 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
424 index_t i, j, p, m, n, size;
425 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
426 index_t *ptr=*p_ptr, *idx=*p_idx;
427 double *val=*p_val, *recv_val=NULL;
428 #ifdef ESYS_MPI
429 index_t rank = P->mpi_info->rank;
430 #endif
431
432 size = P->mpi_info->size;
433 send = P->col_coupler->connector->recv;
434 recv = P->col_coupler->connector->send;
435 send_neighbors = send->numNeighbors;
436 recv_neighbors = recv->numNeighbors;
437 send_rows = P->col_coupleBlock->numCols;
438 recv_rows = recv->offsetInShared[recv_neighbors];
439
440 send_degree = new index_t[send_rows];
441 recv_ptr = new index_t[recv_rows + 1];
442 #pragma omp parallel for schedule(static) private(i)
443 for (i=0; i<send_rows; i++)
444 send_degree[i] = ptr[i+1] - ptr[i];
445
446 /* First, send/receive the degree */
447 for (p=0; p<recv_neighbors; p++) { /* Receiving */
448 m = recv->offsetInShared[p];
449 n = recv->offsetInShared[p+1];
450 #ifdef ESYS_MPI
451 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
452 P->mpi_info->msg_tag_counter + recv->neighbor[p],
453 P->mpi_info->comm,
454 &(P->col_coupler->mpi_requests[p]));
455 #endif
456 }
457 for (p=0; p<send_neighbors; p++) { /* Sending */
458 m = send->offsetInShared[p];
459 n = send->offsetInShared[p+1];
460 #ifdef ESYS_MPI
461 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
462 P->mpi_info->msg_tag_counter + rank,
463 P->mpi_info->comm,
464 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
465 #endif
466 }
467 #ifdef ESYS_MPI
468 MPI_Waitall(send_neighbors+recv_neighbors,
469 P->col_coupler->mpi_requests,
470 P->col_coupler->mpi_stati);
471 #endif
472 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
473
474 delete[] send_degree;
475 m = Paso_Util_cumsum(recv_rows, recv_ptr);
476 recv_ptr[recv_rows] = m;
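/* Paso_Util_cumsum appears to turn the received row degrees into CSR-style
   row offsets in place and to return the total, e.g. degrees {2,1,3} become
   offsets {0,2,3} with m == 6; recv_ptr[recv_rows] = m then closes the
   offset array. (Descriptive note inferred from the usage here.) */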
477 recv_idx = new index_t[m];
478 recv_val = new double[m * block_size];
479
480 /* Next, send/receive the index array */
481 j = 0;
482 for (p=0; p<recv_neighbors; p++) { /* Receiving */
483 m = recv->offsetInShared[p];
484 n = recv->offsetInShared[p+1];
485 i = recv_ptr[n] - recv_ptr[m];
486 if (i > 0) {
487 #ifdef ESYS_MPI
488 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
489 P->mpi_info->msg_tag_counter + recv->neighbor[p],
490 P->mpi_info->comm,
491 &(P->col_coupler->mpi_requests[p]));
492 #endif
493 }
494 j += i;
495 }
496
497 j = 0;
498 for (p=0; p<send_neighbors; p++) { /* Sending */
499 m = send->offsetInShared[p];
500 n = send->offsetInShared[p+1];
501 i = ptr[n] - ptr[m];
502 if (i >0) {
503 #ifdef ESYS_MPI
504 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
505 P->mpi_info->msg_tag_counter + rank,
506 P->mpi_info->comm,
507 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
508 #endif
509 j += i;
510 }
511 }
512 #ifdef ESYS_MPI
513 MPI_Waitall(send_neighbors+recv_neighbors,
514 P->col_coupler->mpi_requests,
515 P->col_coupler->mpi_stati);
516 #endif
517 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
518
519 /* Last, send/receive the data array */
520 j = 0;
521 for (p=0; p<recv_neighbors; p++) { /* Receiving */
522 m = recv->offsetInShared[p];
523 n = recv->offsetInShared[p+1];
524 i = recv_ptr[n] - recv_ptr[m];
525 #ifdef ESYS_MPI
526 if (i > 0)
527 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
528 P->mpi_info->msg_tag_counter + recv->neighbor[p],
529 P->mpi_info->comm,
530 &(P->col_coupler->mpi_requests[p]));
531 #endif
532 j += (i*block_size);
533 }
534
535 j = 0;
536 for (p=0; p<send_neighbors; p++) { /* Sending */
537 m = send->offsetInShared[p];
538 n = send->offsetInShared[p+1];
539 i = ptr[n] - ptr[m];
540 if (i >0) {
541 #ifdef ESYS_MPI
542 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
543 P->mpi_info->msg_tag_counter + rank,
544 P->mpi_info->comm,
545 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
546 #endif
547 j += i * block_size;
548 }
549 }
550 #ifdef ESYS_MPI
551 MPI_Waitall(send_neighbors+recv_neighbors,
552 P->col_coupler->mpi_requests,
553 P->col_coupler->mpi_stati);
554 #endif
555 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
556
557 /* Clean up and return with received ptr, index and data arrays */
558 delete[] ptr;
559 delete[] idx;
560 delete[] val;
561 *p_ptr = recv_ptr;
562 *p_idx = recv_idx;
563 *p_val = recv_val;
564 }
565
566 SystemMatrix_ptr Preconditioner_AMG_buildInterpolationOperator(
567 SystemMatrix_ptr A, SystemMatrix_ptr P,
568 SystemMatrix_ptr R)
569 {
570 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
571 SystemMatrix_ptr out;
572 SystemMatrixPattern_ptr pattern;
573 Distribution_ptr input_dist, output_dist;
574 Connector_ptr col_connector, row_connector;
575 const dim_t row_block_size=A->row_block_size;
576 const dim_t col_block_size=A->col_block_size;
577 const dim_t block_size = A->block_size;
578 const dim_t P_block_size = P->block_size;
579 const double ZERO = 0.0;
580 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
581 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
582 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
583 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
584 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
585 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
586 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
587 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
588 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
589 index_t sum, i, j, k, iptr, irb, icb, ib;
590 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
591 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
592 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
593 index_t row_marker_ext, *where_p=NULL;
594 index_t **send_ptr=NULL, **send_idx=NULL;
595 dim_t p, num_neighbors;
596 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
597 Esys_MPI_rank *neighbor=NULL;
598 #ifdef ESYS_MPI
599 MPI_Request* mpi_requests=NULL;
600 MPI_Status* mpi_stati=NULL;
601 #else
602 int *mpi_requests=NULL, *mpi_stati=NULL;
603 #endif
604
605 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
606 return Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
607
608 /* two sparse matrices R_main and R_couple will be generated as the
609 transposes of P_main and P_col_couple, respectively. Note that
610 R_couple is actually the row_coupleBlock of R (R=P^T) */
611 SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
612 SparseMatrix_ptr R_couple;
613 if (size > 1)
614 R_couple = P->col_coupleBlock->getTranspose();
615
616 /* generate P_ext, i.e. the portion of P that is stored on neighbouring
617 processes and needed locally for the triple matrix product RAP.
618 To be specific, P_ext (on processor k) is the group of rows in P where
619 the list of rows from processor q is given by
620 A->col_coupler->connector->send->shared[rPtr...]
621 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
622 on q.
623 P_ext is represented by two sparse matrices, P_ext_main and
624 P_ext_couple */
625 Preconditioner_AMG_extendB(A, P);
626
627 /* count the number of cols in P_ext_couple, resize the pattern of
628 sparse matrix P_ext_couple with new compressed order, and then
629 build the col id mapping from P->col_coupleBlock to
630 P_ext_couple */
631 num_Pmain_cols = P->mainBlock->numCols;
632 if (size > 1) {
633 num_Pcouple_cols = P->col_coupleBlock->numCols;
634 num_Acouple_cols = A->col_coupleBlock->numCols;
635 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
636 } else {
637 num_Pcouple_cols = 0;
638 num_Acouple_cols = 0;
639 sum = 0;
640 }
641 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
642 offset = P->col_distribution->first_component[rank];
643 num_Pext_cols = 0;
644 if (P->global_id) {
645 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
646 and P->col_coupleBlock */
647 iptr = 0;
648 if (num_Pcouple_cols || sum > 0) {
649 temp = new index_t[num_Pcouple_cols+sum];
650 #pragma omp parallel for lastprivate(iptr) schedule(static)
651 for (iptr=0; iptr<sum; iptr++){
652 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
653 }
654 for (j=0; j<num_Pcouple_cols; j++, iptr++){
655 temp[iptr] = P->global_id[j];
656 }
657 }
658 if (iptr) {
659 qsort(temp, (size_t)iptr, sizeof(index_t), comparIndex);
660 num_Pext_cols = 1;
661 i = temp[0];
662 for (j=1; j<iptr; j++) {
663 if (temp[j] > i) {
664 i = temp[j];
665 temp[num_Pext_cols++] = i;
666 }
667 }
668 }
669 /* resize the pattern of P_ext_couple */
670 if(num_Pext_cols){
671 global_id_P = new index_t[num_Pext_cols];
672 #pragma omp parallel for private(i) schedule(static)
673 for (i=0; i<num_Pext_cols; i++)
674 global_id_P[i] = temp[i];
675 }
676 if (num_Pcouple_cols || sum > 0)
677 delete[] temp;
678 #pragma omp parallel for private(i, where_p) schedule(static)
679 for (i=0; i<sum; i++) {
680 where_p = (index_t *)bsearch(
681 &(P->remote_coupleBlock->pattern->index[i]),
682 global_id_P, num_Pext_cols,
683 sizeof(index_t), comparIndex);
684 P->remote_coupleBlock->pattern->index[i] =
685 (index_t)(where_p -global_id_P);
686 }
687
688 /* build the mapping */
689 if (num_Pcouple_cols) {
690 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
691 iptr = 0;
692 for (i=0; i<num_Pext_cols; i++)
693 if (global_id_P[i] == P->global_id[iptr]) {
694 Pcouple_to_Pext[iptr++] = i;
695 if (iptr == num_Pcouple_cols) break;
696 }
697 }
698 }
699
700 /* allocate and initialise the markers */
701 sum = num_Pext_cols + num_Pmain_cols;
702 P_marker = new index_t[sum];
703 A_marker = new index_t[num_A_cols];
704 #pragma omp parallel for private(i) schedule(static)
705 for (i=0; i<sum; i++) P_marker[i] = -1;
706 #pragma omp parallel for private(i) schedule(static)
707 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
708
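/* The counting pass below follows the usual marker scheme for symbolic
   sparse products (a sketch of the idea, not specific to Paso): while
   building output row i_r, A_marker[i2] == i_r records that the partial
   product R(i_r,:)*A(:,i2) has already been expanded, and
   P_marker[i_c] >= row_marker records that column i_c already has an entry
   in the current output row. Roughly:

       for each i1 with R(i_r,i1) != 0:
           for each i2 with A(i1,i2) != 0 and A_marker[i2] != i_r:
               A_marker[i2] = i_r
               for each i_c with P(i2,i_c) != 0:
                   if P_marker[i_c] < row_marker:   // first time in this row
                       P_marker[i_c] = sum; sum++   // new entry RAP[i_r,i_c]
*/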
709 /* Now, count the size of RAP_ext. Start with rows in R_couple */
710 sum = 0;
711 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
712 row_marker = sum;
713 /* then, loop over elements in row i_r of R_couple */
714 j1_ub = R_couple->pattern->ptr[i_r+1];
715 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
716 i1 = R_couple->pattern->index[j1];
717 /* then, loop over elements in row i1 of A->col_coupleBlock */
718 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
719 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
720 i2 = A->col_coupleBlock->pattern->index[j2];
721
722 /* check whether entry RA[i_r, i2] has been visited before.
723 A new RAP entry is possible only if entry RA[i_r, i2] has
724 not been visited yet */
725 if (A_marker[i2] != i_r) {
726 /* first, mark entry RA[i_r,i2] as visited */
727 A_marker[i2] = i_r;
728
729 /* then loop over elements in row i2 of P_ext_main */
730 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
731 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
732 i_c = P->row_coupleBlock->pattern->index[j3];
733
734 /* check whether entry RAP[i_r,i_c] has been created.
735 If not yet created, create the entry and increase
736 the total number of elements in RAP_ext */
737 if (P_marker[i_c] < row_marker) {
738 P_marker[i_c] = sum;
739 sum++;
740 }
741 }
742
743 /* loop over elements in row i2 of P_ext_couple, do the same */
744 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
745 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
746 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
747 if (P_marker[i_c] < row_marker) {
748 P_marker[i_c] = sum;
749 sum++;
750 }
751 }
752 }
753 }
754
755 /* now loop over elements in row i1 of A->mainBlock and repeat
756 the process applied to block A->col_coupleBlock above */
757 j2_ub = A->mainBlock->pattern->ptr[i1+1];
758 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
759 i2 = A->mainBlock->pattern->index[j2];
760 if (A_marker[i2 + num_Acouple_cols] != i_r) {
761 A_marker[i2 + num_Acouple_cols] = i_r;
762 j3_ub = P->mainBlock->pattern->ptr[i2+1];
763 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
764 i_c = P->mainBlock->pattern->index[j3];
765 if (P_marker[i_c] < row_marker) {
766 P_marker[i_c] = sum;
767 sum++;
768 }
769 }
770 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
771 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
772 /* note that we need to map the column index in
773 P->col_coupleBlock back into the column index in
774 P_ext_couple */
775 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
776 if (P_marker[i_c] < row_marker) {
777 P_marker[i_c] = sum;
778 sum++;
779 }
780 }
781 }
782 }
783 }
784 }
785
786 /* Now we have the number of non-zero elements in RAP_ext, allocate
787 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
788 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
789 RAP_ext_idx = new index_t[sum];
790 RAP_ext_val = new double[sum * block_size];
791 RA_val = new double[block_size];
792 RAP_val = new double[block_size];
793
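/* The fill pass below repeats the traversal used for counting, but now also
   forms the dense block products. Within each block of "block_size" doubles,
   entry (irb,icb) is stored at position irb + row_block_size*icb (column-
   major within the block), so RA_val and RAP_val are small scratch blocks
   holding R(i_r,i1)*A(i1,i2) and R(i_r,i1)*A(i1,i2)*P(i2,i_c) respectively.
   (Descriptive note; the layout is inferred from the indexing below.) */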
794 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
795 sum = num_Pext_cols + num_Pmain_cols;
796 #pragma omp parallel for private(i) schedule(static)
797 for (i=0; i<sum; i++) P_marker[i] = -1;
798 #pragma omp parallel for private(i) schedule(static)
799 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
800 sum = 0;
801 RAP_ext_ptr[0] = 0;
802 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
803 row_marker = sum;
804 /* then, loop over elements in row i_r of R_couple */
805 j1_ub = R_couple->pattern->ptr[i_r+1];
806 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
807 i1 = R_couple->pattern->index[j1];
808 R_val = &(R_couple->val[j1*P_block_size]);
809
810 /* then, loop over elements in row i1 of A->col_coupleBlock */
811 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
812 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
813 i2 = A->col_coupleBlock->pattern->index[j2];
814 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
815 for (irb=0; irb<row_block_size; irb++)
816 for (icb=0; icb<col_block_size; icb++)
817 RA_val[irb+row_block_size*icb] = ZERO;
818 for (irb=0; irb<P_block_size; irb++) {
819 rtmp=R_val[irb];
820 for (icb=0; icb<col_block_size; icb++) {
821 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
822 }
823 }
824
825 /* check whether entry RA[i_r, i2] has been visited before.
826 A new RAP entry is possible only if entry RA[i_r, i2] has
827 not been visited yet */
828 if (A_marker[i2] != i_r) {
829 /* first, mark entry RA[i_r,i2] as visited */
830 A_marker[i2] = i_r;
831
832 /* then loop over elements in row i2 of P_ext_main */
833 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
834 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
835 i_c = P->row_coupleBlock->pattern->index[j3];
836 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
837 for (irb=0; irb<row_block_size; irb++)
838 for (icb=0; icb<col_block_size; icb++)
839 RAP_val[irb+row_block_size*icb] = ZERO;
840 for (icb=0; icb<P_block_size; icb++) {
841 rtmp = temp_val[icb];
842 for (irb=0; irb<row_block_size; irb++) {
843 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
844 }
845 }
846
847 /* check whether entry RAP[i_r,i_c] has been created.
848 If not yet created, create the entry and increase
849 the total number of elements in RAP_ext */
850 if (P_marker[i_c] < row_marker) {
851 P_marker[i_c] = sum;
852 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
853 RAP_ext_idx[sum] = i_c + offset;
854 sum++;
855 } else {
856 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
857 for (ib=0; ib<block_size; ib++) {
858 temp_val[ib] += RAP_val[ib];
859 }
860 }
861 }
862
863 /* loop over elements in row i2 of P_ext_couple, do the same */
864 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
865 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
866 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
867 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
868 for (irb=0; irb<row_block_size; irb++)
869 for (icb=0; icb<col_block_size; icb++)
870 RAP_val[irb+row_block_size*icb]=ZERO;
871 for (icb=0; icb<P_block_size; icb++) {
872 rtmp = temp_val[icb];
873 for (irb=0; irb<row_block_size; irb++) {
874 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
875 }
876 }
877
878 if (P_marker[i_c] < row_marker) {
879 P_marker[i_c] = sum;
880 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
881 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
882 sum++;
883 } else {
884 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
885 for (ib=0; ib<block_size; ib++) {
886 temp_val[ib] += RAP_val[ib];
887 }
888 }
889 }
890
891 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
892 Only the contributions are added. */
893 } else {
894 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
895 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
896 i_c = P->row_coupleBlock->pattern->index[j3];
897 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
898 for (irb=0; irb<row_block_size; irb++)
899 for (icb=0; icb<col_block_size; icb++)
900 RAP_val[irb+row_block_size*icb]=ZERO;
901 for (icb=0; icb<P_block_size; icb++) {
902 rtmp = temp_val[icb];
903 for (irb=0; irb<row_block_size; irb++) {
904 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
905 }
906 }
907
908 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
909 for (ib=0; ib<block_size; ib++) {
910 temp_val[ib] += RAP_val[ib];
911 }
912 }
913 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
914 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
915 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
916 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
917 for (irb=0; irb<row_block_size; irb++)
918 for (icb=0; icb<col_block_size; icb++)
919 RAP_val[irb+row_block_size*icb]=ZERO;
920 for (icb=0; icb<P_block_size; icb++) {
921 rtmp = temp_val[icb];
922 for (irb=0; irb<row_block_size; irb++) {
923 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
924 }
925 }
926
927 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
928 for (ib=0; ib<block_size; ib++) {
929 temp_val[ib] += RAP_val[ib];
930 }
931 }
932 }
933 }
934
935 /* now loop over elements in row i1 of A->mainBlock and repeat
936 the process applied to block A->col_coupleBlock above */
937 j2_ub = A->mainBlock->pattern->ptr[i1+1];
938 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
939 i2 = A->mainBlock->pattern->index[j2];
940 temp_val = &(A->mainBlock->val[j2*block_size]);
941 for (irb=0; irb<row_block_size; irb++)
942 for (icb=0; icb<col_block_size; icb++)
943 RA_val[irb+row_block_size*icb]=ZERO;
944 for (irb=0; irb<P_block_size; irb++) {
945 rtmp=R_val[irb];
946 for (icb=0; icb<col_block_size; icb++) {
947 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
948 }
949 }
950
951 if (A_marker[i2 + num_Acouple_cols] != i_r) {
952 A_marker[i2 + num_Acouple_cols] = i_r;
953 j3_ub = P->mainBlock->pattern->ptr[i2+1];
954 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
955 i_c = P->mainBlock->pattern->index[j3];
956 temp_val = &(P->mainBlock->val[j3*P_block_size]);
957 for (irb=0; irb<row_block_size; irb++)
958 for (icb=0; icb<col_block_size; icb++)
959 RAP_val[irb+row_block_size*icb]=ZERO;
960 for (icb=0; icb<P_block_size; icb++) {
961 rtmp = temp_val[icb];
962 for (irb=0; irb<row_block_size; irb++) {
963 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
964 }
965 }
966
967 if (P_marker[i_c] < row_marker) {
968 P_marker[i_c] = sum;
969 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
970 RAP_ext_idx[sum] = i_c + offset;
971 sum++;
972 } else {
973 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
974 for (ib=0; ib<block_size; ib++) {
975 temp_val[ib] += RAP_val[ib];
976 }
977 }
978 }
979 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
980 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
981 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
982 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
983 for (irb=0; irb<row_block_size; irb++)
984 for (icb=0; icb<col_block_size; icb++)
985 RAP_val[irb+row_block_size*icb]=ZERO;
986 for (icb=0; icb<P_block_size; icb++) {
987 rtmp=temp_val[icb];
988 for (irb=0; irb<row_block_size; irb++) {
989 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
990 }
991 }
992
993 if (P_marker[i_c] < row_marker) {
994 P_marker[i_c] = sum;
995 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
996 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
997 sum++;
998 } else {
999 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1000 for (ib=0; ib<block_size; ib++) {
1001 temp_val[ib] += RAP_val[ib];
1002 }
1003 }
1004 }
1005 } else {
1006 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1007 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1008 i_c = P->mainBlock->pattern->index[j3];
1009 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1010 for (irb=0; irb<row_block_size; irb++)
1011 for (icb=0; icb<col_block_size; icb++)
1012 RAP_val[irb+row_block_size*icb]=ZERO;
1013 for (icb=0; icb<P_block_size; icb++) {
1014 rtmp=temp_val[icb];
1015 for (irb=0; irb<row_block_size; irb++) {
1016 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1017 }
1018 }
1019
1020 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1021 for (ib=0; ib<block_size; ib++) {
1022 temp_val[ib] += RAP_val[ib];
1023 }
1024 }
1025 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1026 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1027 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1028 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1029 for (irb=0; irb<row_block_size; irb++)
1030 for (icb=0; icb<col_block_size; icb++)
1031 RAP_val[irb+row_block_size*icb]=ZERO;
1032 for (icb=0; icb<P_block_size; icb++) {
1033 rtmp=temp_val[icb];
1034 for (irb=0; irb<row_block_size; irb++) {
1035 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1036 }
1037 }
1038
1039 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1040 for (ib=0; ib<block_size; ib++) {
1041 temp_val[ib] += RAP_val[ib];
1042 }
1043 }
1044 }
1045 }
1046 }
1047 RAP_ext_ptr[i_r+1] = sum;
1048 }
1049 delete[] P_marker;
1050 delete[] Pcouple_to_Pext;
1051
1052 /* Now we have the part of RAP[r,c] where the rows "r" are given by
1053 the column list of P->col_coupleBlock, and the columns "c" may
1054 cover the whole column range of the system matrix P. This part
1055 of the data is to be passed to the neighbouring processors and
1056 added to the corresponding RAP[r,c] entries held on those
1057 neighbouring processors */
1058 Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1059 &RAP_ext_val, global_id_P, block_size);
1060
1061 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1062 sum = RAP_ext_ptr[num_RAPext_rows];
1063 num_RAPext_cols = 0;
1064 if (num_Pext_cols || sum > 0) {
1065 temp = new index_t[num_Pext_cols+sum];
1066 j1_ub = offset + num_Pmain_cols;
1067 for (i=0, iptr=0; i<sum; i++) {
1068 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1069 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1070 }
1071 for (j=0; j<num_Pext_cols; j++, iptr++){
1072 temp[iptr] = global_id_P[j];
1073 }
1074
1075 if (iptr) {
1076 qsort(temp, (size_t)iptr, sizeof(index_t), comparIndex);
1077 num_RAPext_cols = 1;
1078 i = temp[0];
1079 for (j=1; j<iptr; j++) {
1080 if (temp[j] > i) {
1081 i = temp[j];
1082 temp[num_RAPext_cols++] = i;
1083 }
1084 }
1085 }
1086 }
1087
1089 /* store the global ids of the columns of RAP_ext */
1089 if(num_RAPext_cols){
1090 global_id_RAP = new index_t[num_RAPext_cols];
1091 #pragma omp parallel for private(i) schedule(static)
1092 for (i=0; i<num_RAPext_cols; i++)
1093 global_id_RAP[i] = temp[i];
1094 }
1095 if (num_Pext_cols || sum > 0)
1096 delete[] temp;
1097 j1_ub = offset + num_Pmain_cols;
1098 #pragma omp parallel for private(i, where_p) schedule(static)
1099 for (i=0; i<sum; i++) {
1100 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1101 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1102 /*XXX*/ num_RAPext_cols, sizeof(index_t), comparIndex);
1103 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1104 } else
1105 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1106 }
1107
1108 /* build the mapping */
1109 if (num_Pcouple_cols) {
1110 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1111 iptr = 0;
1112 for (i=0; i<num_RAPext_cols; i++)
1113 if (global_id_RAP[i] == P->global_id[iptr]) {
1114 Pcouple_to_RAP[iptr++] = i;
1115 if (iptr == num_Pcouple_cols) break;
1116 }
1117 }
1118
1119 if (num_Pext_cols) {
1120 Pext_to_RAP = new index_t[num_Pext_cols];
1121 iptr = 0;
1122 for (i=0; i<num_RAPext_cols; i++)
1123 if (global_id_RAP[i] == global_id_P[iptr]) {
1124 Pext_to_RAP[iptr++] = i;
1125 if (iptr == num_Pext_cols) break;
1126 }
1127 }
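/* At this point two translation tables exist (note for the reader):
   Pcouple_to_RAP[i] maps column i of P->col_coupleBlock, and Pext_to_RAP[i]
   maps column i of P_ext_couple, into the compressed column numbering of
   RAP_ext given by global_id_RAP. The lookup loops above assume that
   P->global_id, global_id_P and global_id_RAP are all in ascending
   global-id order. */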
1128
1129 if (global_id_P){
1130 delete[] global_id_P;
1131 global_id_P = NULL;
1132 }
1133
1134 /* allocate and initialise the markers */
1135 sum = num_RAPext_cols + num_Pmain_cols;
1136 P_marker = new index_t[sum];
1137 #pragma omp parallel for private(i) schedule(static)
1138 for (i=0; i<sum; i++) P_marker[i] = -1;
1139 #pragma omp parallel for private(i) schedule(static)
1140 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1141
1142 /* Now, count the size of RAP. Start with rows in R_main */
1143 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1144 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1145 shared = P->col_coupler->connector->send->shared;
1146 i = 0;
1147 j = 0;
1148 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1149 /* Mark the start of row for both main block and couple block */
1150 row_marker = i;
1151 row_marker_ext = j;
1152
1153 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1154 in RAP_ext */
1155 P_marker[i_r] = i;
1156 i++;
1157 for (j1=0; j1<num_neighbors; j1++) {
1158 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1159 if (shared[j2] == i_r) {
1160 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1161 i_c = RAP_ext_idx[k];
1162 if (i_c < num_Pmain_cols) {
1163 if (P_marker[i_c] < row_marker) {
1164 P_marker[i_c] = i;
1165 i++;
1166 }
1167 } else {
1168 if (P_marker[i_c] < row_marker_ext) {
1169 P_marker[i_c] = j;
1170 j++;
1171 }
1172 }
1173 }
1174 break;
1175 }
1176 }
1177 }
1178
1179 /* then, loop over elements in row i_r of R_main */
1180 j1_ub = R_main->pattern->ptr[i_r+1];
1181 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1182 i1 = R_main->pattern->index[j1];
1183
1184 /* then, loop over elements in row i1 of A->col_coupleBlock */
1185 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1186 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1187 i2 = A->col_coupleBlock->pattern->index[j2];
1188
1189 /* check whether entry RA[i_r, i2] has been visited before.
1190 A new RAP entry is possible only if entry RA[i_r, i2] has
1191 not been visited yet */
1192 if (A_marker[i2] != i_r) {
1193 /* first, mark entry RA[i_r,i2] as visited */
1194 A_marker[i2] = i_r;
1195
1196 /* then loop over elements in row i2 of P_ext_main */
1197 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1198 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1199 i_c = P->row_coupleBlock->pattern->index[j3];
1200
1201 /* check whether entry RAP[i_r,i_c] has been created.
1202 If not yet created, create the entry and increase
1203 the total number of elements in RAP_main */
1204 if (P_marker[i_c] < row_marker) {
1205 P_marker[i_c] = i;
1206 i++;
1207 }
1208 }
1209
1210 /* loop over elements in row i2 of P_ext_couple, do the same */
1211 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1212 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1213 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1214 if (P_marker[i_c] < row_marker_ext) {
1215 P_marker[i_c] = j;
1216 j++;
1217 }
1218 }
1219 }
1220 }
1221
1222 /* now loop over elements in row i1 of A->mainBlock and repeat
1223 the process applied to block A->col_coupleBlock above */
1224 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1225 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1226 i2 = A->mainBlock->pattern->index[j2];
1227 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1228 A_marker[i2 + num_Acouple_cols] = i_r;
1229 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1230 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1231 i_c = P->mainBlock->pattern->index[j3];
1232 if (P_marker[i_c] < row_marker) {
1233 P_marker[i_c] = i;
1234 i++;
1235 }
1236 }
1237 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1238 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1239 /* note that we need to map the column index in
1240 P->col_coupleBlock into the column index space
1241 of RAP_ext */
1242 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1243 if (P_marker[i_c] < row_marker_ext) {
1244 P_marker[i_c] = j;
1245 j++;
1246 }
1247 }
1248 }
1249 }
1250 }
1251 }
1252
1253 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1254 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1255 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1256 RAP_couple */
1257 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1258 RAP_main_idx = new index_t[i];
1259 RAP_main_val = new double[i * block_size];
1260 RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1261 RAP_couple_idx = new index_t[j];
1262 RAP_couple_val = new double[j * block_size];
1263
1264 RAP_main_ptr[num_Pmain_cols] = i;
1265 RAP_couple_ptr[num_Pmain_cols] = j;
1266
1267 #pragma omp parallel for private(i) schedule(static)
1268 for (i=0; i<sum; i++) P_marker[i] = -1;
1269 #pragma omp parallel for private(i) schedule(static)
1270 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1271
1272 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1273 in R_main */
1274 i = 0;
1275 j = 0;
1276 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1277 /* Mark the start of row for both main block and couple block */
1278 row_marker = i;
1279 row_marker_ext = j;
1280 RAP_main_ptr[i_r] = row_marker;
1281 RAP_couple_ptr[i_r] = row_marker_ext;
1282
1283 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1284 in row i_r of RAP_ext */
1285 P_marker[i_r] = i;
1286 for (ib=0; ib<block_size; ib++)
1287 RAP_main_val[i*block_size+ib] = ZERO;
1288 RAP_main_idx[i] = i_r;
1289 i++;
1290
1291 for (j1=0; j1<num_neighbors; j1++) {
1292 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1293 if (shared[j2] == i_r) {
1294 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1295 i_c = RAP_ext_idx[k];
1296 if (i_c < num_Pmain_cols) {
1297 if (P_marker[i_c] < row_marker) {
1298 P_marker[i_c] = i;
1299 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1300 RAP_main_idx[i] = i_c;
1301 i++;
1302 } else {
1303 t1_val = &(RAP_ext_val[k*block_size]);
1304 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1305 for (ib=0; ib<block_size; ib++)
1306 t2_val[ib] += t1_val[ib];
1307 }
1308 } else {
1309 if (P_marker[i_c] < row_marker_ext) {
1310 P_marker[i_c] = j;
1311 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1312 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1313 j++;
1314 } else {
1315 t1_val = &(RAP_ext_val[k*block_size]);
1316 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1317 for (ib=0; ib<block_size; ib++)
1318 t2_val[ib] += t1_val[ib];
1319 }
1320 }
1321 }
1322 break;
1323 }
1324 }
1325 }
1326
1327 /* then, loop over elements in row i_r of R_main */
1328 j1_ub = R_main->pattern->ptr[i_r+1];
1329 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1330 i1 = R_main->pattern->index[j1];
1331 R_val = &(R_main->val[j1*P_block_size]);
1332
1333 /* then, loop over elements in row i1 of A->col_coupleBlock */
1334 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1335 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1336 i2 = A->col_coupleBlock->pattern->index[j2];
1337 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1338 for (irb=0; irb<row_block_size; irb++)
1339 for (icb=0; icb<col_block_size; icb++)
1340 RA_val[irb+row_block_size*icb]=ZERO;
1341 for (irb=0; irb<P_block_size; irb++) {
1342 rtmp=R_val[irb];
1343 for (icb=0; icb<col_block_size; icb++) {
1344 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1345 }
1346 }
1347
1348
1349 /* check whether entry RA[i_r, i2] has been visited before.
1350 A new RAP entry is possible only if entry RA[i_r, i2] has
1351 not been visited yet */
1352 if (A_marker[i2] != i_r) {
1353 /* first, mark entry RA[i_r,i2] as visited */
1354 A_marker[i2] = i_r;
1355
1356 /* then loop over elements in row i2 of P_ext_main */
1357 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1358 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1359 i_c = P->row_coupleBlock->pattern->index[j3];
1360 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1361 for (irb=0; irb<row_block_size; irb++)
1362 for (icb=0; icb<col_block_size; icb++)
1363 RAP_val[irb+row_block_size*icb]=ZERO;
1364 for (icb=0; icb<P_block_size; icb++) {
1365 rtmp=temp_val[icb];
1366 for (irb=0; irb<row_block_size; irb++) {
1367 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1368 }
1369 }
1370
1371
1372 /* check whether entry RAP[i_r,i_c] has been created.
1373 If not yet created, create the entry and increase
1374 the total number of elements in RAP_main */
1375 if (P_marker[i_c] < row_marker) {
1376 P_marker[i_c] = i;
1377 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1378 RAP_main_idx[i] = i_c;
1379 i++;
1380 } else {
1381 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1382 for (ib=0; ib<block_size; ib++) {
1383 temp_val[ib] += RAP_val[ib];
1384 }
1385 }
1386 }
1387
1388 /* loop over elements in row i2 of P_ext_couple, do the same */
1389 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1390 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1391 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1392 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1393 for (irb=0; irb<row_block_size; irb++)
1394 for (icb=0; icb<col_block_size; icb++)
1395 RAP_val[irb+row_block_size*icb]=ZERO;
1396 for (icb=0; icb<P_block_size; icb++) {
1397 rtmp=temp_val[icb];
1398 for (irb=0; irb<row_block_size; irb++) {
1399 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1400 }
1401 }
1402
1403 if (P_marker[i_c] < row_marker_ext) {
1404 P_marker[i_c] = j;
1405 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1406 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1407 j++;
1408 } else {
1409 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1410 for (ib=0; ib<block_size; ib++) {
1411 temp_val[ib] += RAP_val[ib];
1412 }
1413 }
1414 }
1415
1416 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1417 Only the contributions are added. */
1418 } else {
1419 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1420 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1421 i_c = P->row_coupleBlock->pattern->index[j3];
1422 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1423 for (irb=0; irb<row_block_size; irb++)
1424 for (icb=0; icb<col_block_size; icb++)
1425 RAP_val[irb+row_block_size*icb]=ZERO;
1426 for (icb=0; icb<P_block_size; icb++) {
1427 rtmp=temp_val[icb];
1428 for (irb=0; irb<row_block_size; irb++) {
1429 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1430 }
1431 }
1432
1433 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1434 for (ib=0; ib<block_size; ib++) {
1435 temp_val[ib] += RAP_val[ib];
1436 }
1437 }
1438 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1439 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1440 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1441 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1442 for (irb=0; irb<row_block_size; irb++)
1443 for (icb=0; icb<col_block_size; icb++)
1444 RAP_val[irb+row_block_size*icb]=ZERO;
1445 for (icb=0; icb<P_block_size; icb++) {
1446 rtmp=temp_val[icb];
1447 for (irb=0; irb<row_block_size; irb++) {
1448 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1449 }
1450 }
1451
1452 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1453 for (ib=0; ib<block_size; ib++) {
1454 temp_val[ib] += RAP_val[ib];
1455 }
1456 }
1457 }
1458 }
1459
1460 /* now loop over elements in row i1 of A->mainBlock and repeat
1461 the process applied to block A->col_coupleBlock above */
1462 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1463 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1464 i2 = A->mainBlock->pattern->index[j2];
1465 temp_val = &(A->mainBlock->val[j2*block_size]);
1466 for (irb=0; irb<row_block_size; irb++)
1467 for (icb=0; icb<col_block_size; icb++)
1468 RA_val[irb+row_block_size*icb]=ZERO;
1469 for (irb=0; irb<P_block_size; irb++) {
1470 rtmp=R_val[irb];
1471 for (icb=0; icb<col_block_size; icb++) {
1472 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1473 }
1474 }
1475
1476 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1477 A_marker[i2 + num_Acouple_cols] = i_r;
1478 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1479 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1480 i_c = P->mainBlock->pattern->index[j3];
1481 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1482 for (irb=0; irb<row_block_size; irb++)
1483 for (icb=0; icb<col_block_size; icb++)
1484 RAP_val[irb+row_block_size*icb]=ZERO;
1485 for (icb=0; icb<P_block_size; icb++) {
1486 rtmp=temp_val[icb];
1487 for (irb=0; irb<row_block_size; irb++) {
1488 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1489 }
1490 }
1491 if (P_marker[i_c] < row_marker) {
1492 P_marker[i_c] = i;
1493 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1494 RAP_main_idx[i] = i_c;
1495 i++;
1496 } else {
1497 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1498 for (ib=0; ib<block_size; ib++) {
1499 temp_val[ib] += RAP_val[ib];
1500 }
1501 }
1502 }
1503 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1504 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1505 /* note that we need to map the column index in
1506 P->col_coupleBlock into the column index space
1507 of RAP_ext */
1508 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1509 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1510 for (irb=0; irb<row_block_size; irb++)
1511 for (icb=0; icb<col_block_size; icb++)
1512 RAP_val[irb+row_block_size*icb]=ZERO;
1513 for (icb=0; icb<P_block_size; icb++) {
1514 rtmp=temp_val[icb];
1515 for (irb=0; irb<row_block_size; irb++) {
1516 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1517 }
1518 }
1519
1520 if (P_marker[i_c] < row_marker_ext) {
1521 P_marker[i_c] = j;
1522 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1523 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1524 j++;
1525 } else {
1526 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1527 for (ib=0; ib<block_size; ib++) {
1528 temp_val[ib] += RAP_val[ib];
1529 }
1530 }
1531 }
1532
1533 } else {
1534 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1535 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1536 i_c = P->mainBlock->pattern->index[j3];
1537 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1538 for (irb=0; irb<row_block_size; irb++)
1539 for (icb=0; icb<col_block_size; icb++)
1540 RAP_val[irb+row_block_size*icb]=ZERO;
1541 for (icb=0; icb<P_block_size; icb++) {
1542 rtmp=temp_val[icb];
1543 for (irb=0; irb<row_block_size; irb++) {
1544 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1545 }
1546 }
1547
1548 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1549 for (ib=0; ib<block_size; ib++) {
1550 temp_val[ib] += RAP_val[ib];
1551 }
1552 }
1553 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1554 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1555 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1556 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1557 for (irb=0; irb<row_block_size; irb++)
1558 for (icb=0; icb<col_block_size; icb++)
1559 RAP_val[irb+row_block_size*icb]=ZERO;
1560 for (icb=0; icb<P_block_size; icb++) {
1561 rtmp = temp_val[icb];
1562 for (irb=0; irb<row_block_size; irb++) {
1563 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1564 }
1565 }
1566
1567 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1568 for (ib=0; ib<block_size; ib++) {
1569 temp_val[ib] += RAP_val[ib];
1570 }
1571 }
1572 }
1573 }
1574 }
1575
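/* Each finished row of RAP_main and RAP_couple is now sorted by column
   index, presumably because the downstream Pattern/SparseMatrix code
   expects ascending column indices within a row. P_marker still maps a
   column id to its pre-sort position in the row, which is how the value
   blocks are permuted along with the indices below. */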
1576 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1577 if (i > row_marker) {
1578 offset = i - row_marker;
1579 temp = new index_t[offset];
1580 #pragma omp parallel for schedule(static) private(iptr)
1581 for (iptr=0; iptr<offset; iptr++)
1582 temp[iptr] = RAP_main_idx[row_marker+iptr];
1583 if (offset > 0) {
1584 qsort(temp, (size_t)offset, sizeof(index_t), comparIndex);
1585 }
1586 temp_val = new double[offset * block_size];
1587 #pragma omp parallel for schedule(static) private(iptr,k)
1588 for (iptr=0; iptr<offset; iptr++){
1589 k = P_marker[temp[iptr]];
1590 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1591 P_marker[temp[iptr]] = iptr + row_marker;
1592 }
1593 #pragma omp parallel for schedule(static) private(iptr)
1594 for (iptr=0; iptr<offset; iptr++){
1595 RAP_main_idx[row_marker+iptr] = temp[iptr];
1596 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1597 }
1598 delete[] temp;
1599 delete[] temp_val;
1600 }
1601 if (j > row_marker_ext) {
1602 offset = j - row_marker_ext;
1603 temp = new index_t[offset];
1604 #pragma omp parallel for schedule(static) private(iptr)
1605 for (iptr=0; iptr<offset; iptr++)
1606 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1607 if (offset > 0) {
1608 qsort(temp, (size_t)offset, sizeof(index_t), comparIndex);
1609 }
1610 temp_val = new double[offset * block_size];
1611 #pragma omp parallel for schedule(static) private(iptr, k)
1612 for (iptr=0; iptr<offset; iptr++){
1613 k = P_marker[temp[iptr] + num_Pmain_cols];
1614 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1615 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1616 }
1617 #pragma omp parallel for schedule(static) private(iptr)
1618 for (iptr=0; iptr<offset; iptr++){
1619 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1620 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1621 }
1622 delete[] temp;
1623 delete[] temp_val;
1624 }
1625 }
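   /* Note: within each row the RAP entries were appended in traversal
      order, so the loop above finishes each row by qsort-ing the newly
      added column indices and permuting the corresponding value blocks
      through a temporary buffer; P_marker still maps a column index to
      the slot holding its accumulated block, which is how the old
      position of each block is recovered before it is moved. */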
1626
1627 delete[] RA_val;
1628 delete[] RAP_val;
1629 delete[] A_marker;
1630 delete[] Pext_to_RAP;
1631 delete[] Pcouple_to_RAP;
1632 delete[] RAP_ext_ptr;
1633 delete[] RAP_ext_idx;
1634 delete[] RAP_ext_val;
1635 R_main.reset();
1636 R_couple.reset();
1637
1638 /* Check whether there are empty columns in RAP_couple */
1639 #pragma omp parallel for schedule(static) private(i)
1640 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1641 /* num of non-empty columns is stored in "k" */
1642 k = 0;
1643 j = RAP_couple_ptr[num_Pmain_cols];
1644 for (i=0; i<j; i++) {
1645 i1 = RAP_couple_idx[i];
1646 if (P_marker[i1]) {
1647 P_marker[i1] = 0;
1648 k++;
1649 }
1650 }
1651
1652 /* empty columns were found */
1653 if (k < num_RAPext_cols) {
1654 temp = new index_t[k];
1655 k = 0;
1656 for (i=0; i<num_RAPext_cols; i++)
1657 if (!P_marker[i]) {
1658 P_marker[i] = k;
1659 temp[k] = global_id_RAP[i];
1660 k++;
1661 }
1662 #pragma omp parallel for schedule(static) private(i, i1)
1663 for (i=0; i<j; i++) {
1664 i1 = RAP_couple_idx[i];
1665 RAP_couple_idx[i] = P_marker[i1];
1666 }
1667 num_RAPext_cols = k;
1668 delete[] global_id_RAP;
1669 global_id_RAP = temp;
1670 }
1671 delete[] P_marker;
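   /* At this point the columns of RAP_couple that are never referenced
      have been removed: P_marker was reused to assign a compressed new
      index to every non-empty column, global_id_RAP was compacted to
      match, and RAP_couple_idx was renumbered accordingly, so
      num_RAPext_cols now counts only the non-empty external columns. */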
1672
1673 /******************************************************/
1674 /* Start to create the coarse level System Matrix A_c */
1675 /******************************************************/
1676 /* first, prepare the sender/receiver for the col_connector */
1677 dist = P->pattern->input_distribution->first_component;
1678 recv_len = new dim_t[size];
1679 send_len = new dim_t[size];
1680 neighbor = new Esys_MPI_rank[size];
1681 offsetInShared = new index_t[size+1];
1682 shared = new index_t[num_RAPext_cols];
1683 memset(recv_len, 0, sizeof(dim_t) * size);
1684 memset(send_len, 0, sizeof(dim_t) * size);
1685 num_neighbors = 0;
1686 offsetInShared[0] = 0;
1687 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1688 shared[i] = i + num_Pmain_cols;
1689 if (k <= global_id_RAP[i]) {
1690 if (recv_len[j] > 0) {
1691 neighbor[num_neighbors] = j;
1692 num_neighbors ++;
1693 offsetInShared[num_neighbors] = i;
1694 }
1695 while (k <= global_id_RAP[i]) {
1696 j++;
1697 k = dist[j+1];
1698 }
1699 }
1700 recv_len[j] ++;
1701 }
1702 if (recv_len[j] > 0) {
1703 neighbor[num_neighbors] = j;
1704 num_neighbors ++;
1705 offsetInShared[num_neighbors] = i;
1706 }
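   /* The external columns in global_id_RAP are sorted by global id, and
      dist[] holds the first global column owned by each rank, so the
      single sweep above assigns every external column to its owner:
      whenever an id reaches dist[j+1] the group of rank j is closed
      (neighbor/offsetInShared are updated) and j advances. recv_len[j]
      ends up with the number of external columns owned by rank j. */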
1707
1708 SharedComponents_ptr recv(new SharedComponents(
1709 num_Pmain_cols, num_neighbors, neighbor, shared,
1710 offsetInShared, 1, 0, mpi_info));
1711
1712 #ifdef ESYS_MPI
1713 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1714 #endif
1715
1716 #ifdef ESYS_MPI
1717 mpi_requests=new MPI_Request[size*2];
1718 mpi_stati=new MPI_Status[size*2];
1719 #else
1720 mpi_requests=new int[size*2];
1721 mpi_stati=new int[size*2];
1722 #endif
1723 num_neighbors = 0;
1724 j = 0;
1725 offsetInShared[0] = 0;
1726 for (i=0; i<size; i++) {
1727 if (send_len[i] > 0) {
1728 neighbor[num_neighbors] = i;
1729 num_neighbors ++;
1730 j += send_len[i];
1731 offsetInShared[num_neighbors] = j;
1732 }
1733 }
1734 delete[] shared;
1735 shared = new index_t[j];
1736 for (i=0, j=0; i<num_neighbors; i++) {
1737 k = neighbor[i];
1738 #ifdef ESYS_MPI
1739 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1740 mpi_info->msg_tag_counter+k,
1741 mpi_info->comm, &mpi_requests[i]);
1742 #endif
1743 j += send_len[k];
1744 }
1745 for (i=0, j=0; i<recv->numNeighbors; i++) {
1746 k = recv->neighbor[i];
1747 #ifdef ESYS_MPI
1748 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1749 mpi_info->msg_tag_counter+rank,
1750 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1751 #endif
1752 j += recv_len[k];
1753 }
1754 #ifdef ESYS_MPI
1755 MPI_Waitall(num_neighbors + recv->numNeighbors,
1756 mpi_requests, mpi_stati);
1757 #endif
1758 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1759
1760 j = offsetInShared[num_neighbors];
1761 offset = dist[rank];
1762 #pragma omp parallel for schedule(static) private(i)
1763 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
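   /* The MPI_Alltoall above turned recv_len into send_len, and the
      Irecv/Issend exchange sent each neighbour the global ids of the
      columns this rank needs from it. The ids received into shared[]
      are columns owned locally, so subtracting offset = dist[rank]
      converts them to local indices for the sender side of the
      col_connector built below. */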
1764
1765 SharedComponents_ptr send(new SharedComponents(
1766 num_Pmain_cols, num_neighbors, neighbor, shared,
1767 offsetInShared, 1, 0, mpi_info));
1768
1769 col_connector.reset(new Connector(send, recv));
1770 delete[] shared;
1771
1772 /* now, create the row distribution (output_distribution) and the col
1773 distribution (input_distribution) */
1774 input_dist.reset(new Distribution(mpi_info, dist, 1, 0));
1775 output_dist.reset(new Distribution(mpi_info, dist, 1, 0));
1776
1777 /* then, prepare the sender/receiver for the row_connector. First, prepare
1778 the information for the sender */
1779 sum = RAP_couple_ptr[num_Pmain_cols];
1780 len = new dim_t[size];
1781 send_ptr = new index_t*[size];
1782 send_idx = new index_t*[size];
1783 #pragma omp parallel for schedule(static) private(i)
1784 for (i=0; i<size; i++) {
1785 send_ptr[i] = new index_t[num_Pmain_cols];
1786 send_idx[i] = new index_t[sum];
1787 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1788 }
1789 memset(len, 0, sizeof(dim_t) * size);
1790 recv = col_connector->recv;
1791 sum=0;
1792 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1793 i1 = RAP_couple_ptr[i_r];
1794 i2 = RAP_couple_ptr[i_r+1];
1795 if (i2 > i1) {
1796 /* then row i_r will be in the sender of row_connector; now check
1797 how many neighbours i_r needs to be sent to */
1798 for (j=i1; j<i2; j++) {
1799 i_c = RAP_couple_idx[j];
1800 /* find out the corresponding neighbor "p" of column i_c */
1801 for (p=0; p<recv->numNeighbors; p++) {
1802 if (i_c < recv->offsetInShared[p+1]) {
1803 k = recv->neighbor[p];
1804 if (send_ptr[k][i_r] == 0) sum++;
1805 send_ptr[k][i_r] ++;
1806 send_idx[k][len[k]] = global_id_RAP[i_c];
1807 len[k] ++;
1808 break;
1809 }
1810 }
1811 }
1812 }
1813 }
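   /* For the row_connector sender: each nonzero column i_c of RAP_couple
      is owned by exactly one neighbour p (located via recv->offsetInShared),
      so the local row i_r has to be sent to that neighbour. send_ptr[k][i_r]
      counts the entries of row i_r destined for rank k, send_idx[k] collects
      their global column ids (len[k] in total), and sum counts the
      (row, neighbour) pairs, i.e. the length of the sender's shared list. */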
1814 if (global_id_RAP) {
1815 delete[] global_id_RAP;
1816 global_id_RAP = NULL;
1817 }
1818
1819 /* now allocate the sender */
1820 shared = new index_t[sum];
1821 memset(send_len, 0, sizeof(dim_t) * size);
1822 num_neighbors=0;
1823 offsetInShared[0] = 0;
1824 for (p=0, k=0; p<size; p++) {
1825 for (i=0; i<num_Pmain_cols; i++) {
1826 if (send_ptr[p][i] > 0) {
1827 shared[k] = i;
1828 k++;
1829 send_ptr[p][send_len[p]] = send_ptr[p][i];
1830 send_len[p]++;
1831 }
1832 }
1833 if (k > offsetInShared[num_neighbors]) {
1834 neighbor[num_neighbors] = p;
1835 num_neighbors ++;
1836 offsetInShared[num_neighbors] = k;
1837 }
1838 }
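   /* The loop above compacts the per-neighbour information: rows with at
      least one entry for rank p are appended to shared[], send_ptr[p] is
      squeezed so that its first send_len[p] entries are the per-row entry
      counts of exactly those rows, and a neighbour is recorded only if it
      receives something (k advanced past offsetInShared[num_neighbors]). */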
1839 send.reset(new SharedComponents(num_Pmain_cols, num_neighbors,
1840 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1841
1842 /* send/recv the number of rows to be sent from the current proc, i.e.
1843 recover the info for the receiver of row_connector from the sender */
1844 #ifdef ESYS_MPI
1845 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1846 #endif
1847 num_neighbors = 0;
1848 offsetInShared[0] = 0;
1849 j = 0;
1850 for (i=0; i<size; i++) {
1851 if (i != rank && recv_len[i] > 0) {
1852 neighbor[num_neighbors] = i;
1853 num_neighbors ++;
1854 j += recv_len[i];
1855 offsetInShared[num_neighbors] = j;
1856 }
1857 }
1858 delete[] shared;
1859 delete[] recv_len;
1860 shared = new index_t[j];
1861 k = offsetInShared[num_neighbors];
1862 #pragma omp parallel for schedule(static) private(i)
1863 for (i=0; i<k; i++) {
1864 shared[i] = i + num_Pmain_cols;
1865 }
1866 recv.reset(new SharedComponents(num_Pmain_cols, num_neighbors,
1867 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1868 row_connector.reset(new Connector(send, recv));
1869 delete[] shared;
1870
1871 /* send/recv pattern->ptr for rowCoupleBlock */
1872 num_RAPext_rows = offsetInShared[num_neighbors];
1873 row_couple_ptr = new index_t[num_RAPext_rows+1];
1874 for (p=0; p<num_neighbors; p++) {
1875 j = offsetInShared[p];
1876 i = offsetInShared[p+1];
1877 #ifdef ESYS_MPI
1878 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1879 mpi_info->msg_tag_counter+neighbor[p],
1880 mpi_info->comm, &mpi_requests[p]);
1881 #endif
1882 }
1883 send = row_connector->send;
1884 for (p=0; p<send->numNeighbors; p++) {
1885 #ifdef ESYS_MPI
1886 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1887 MPI_INT, send->neighbor[p],
1888 mpi_info->msg_tag_counter+rank,
1889 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1890 #endif
1891 }
1892 #ifdef ESYS_MPI
1893 MPI_Waitall(num_neighbors + send->numNeighbors,
1894 mpi_requests, mpi_stati);
1895 #endif
1896 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1897 delete[] send_len;
1898
1899 sum = 0;
1900 for (i=0; i<num_RAPext_rows; i++) {
1901 k = row_couple_ptr[i];
1902 row_couple_ptr[i] = sum;
1903 sum += k;
1904 }
1905 row_couple_ptr[num_RAPext_rows] = sum;
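   /* The received values are per-row entry counts; the loop above turns
      them into CSR row pointers by an exclusive prefix sum, e.g. counts
      {2, 0, 3} become row_couple_ptr = {0, 2, 2, 5}. */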
1906
1907 /* send/recv pattern->index for rowCoupleBlock */
1908 k = row_couple_ptr[num_RAPext_rows];
1909 row_couple_idx = new index_t[k];
1910 for (p=0; p<num_neighbors; p++) {
1911 j1 = row_couple_ptr[offsetInShared[p]];
1912 j2 = row_couple_ptr[offsetInShared[p+1]];
1913 #ifdef ESYS_MPI
1914 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1915 mpi_info->msg_tag_counter+neighbor[p],
1916 mpi_info->comm, &mpi_requests[p]);
1917 #endif
1918 }
1919 for (p=0; p<send->numNeighbors; p++) {
1920 #ifdef ESYS_MPI
1921 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1922 MPI_INT, send->neighbor[p],
1923 mpi_info->msg_tag_counter+rank,
1924 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1925 #endif
1926 }
1927 #ifdef ESYS_MPI
1928 MPI_Waitall(num_neighbors + send->numNeighbors,
1929 mpi_requests, mpi_stati);
1930 #endif
1931 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1932
1933 offset = input_dist->first_component[rank];
1934 k = row_couple_ptr[num_RAPext_rows];
1935 #pragma omp parallel for schedule(static) private(i)
1936 for (i=0; i<k; i++) {
1937 row_couple_idx[i] -= offset;
1938 }
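   /* The received column indices of rowCoupleBlock are global ids that
      refer to columns owned by this rank, so subtracting the first local
      column (input_dist->first_component[rank]) maps them onto local
      mainBlock column indices. */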
1939 #pragma omp parallel for schedule(static) private(i)
1940 for (i=0; i<size; i++) {
1941 delete[] send_ptr[i];
1942 delete[] send_idx[i];
1943 }
1944 delete[] send_ptr;
1945 delete[] send_idx;
1946 delete[] len;
1947 delete[] offsetInShared;
1948 delete[] neighbor;
1949 delete[] mpi_requests;
1950 delete[] mpi_stati;
1951 Esys_MPIInfo_free(mpi_info);
1952
1953 /* Now, we can create the patterns for mainBlock and the coupleBlocks */
1954 Pattern_ptr main_pattern(new Pattern(MATRIX_FORMAT_DEFAULT,
1955 num_Pmain_cols, num_Pmain_cols, RAP_main_ptr, RAP_main_idx));
1956 Pattern_ptr col_couple_pattern(new Pattern(
1957 MATRIX_FORMAT_DEFAULT, num_Pmain_cols, num_RAPext_cols,
1958 RAP_couple_ptr, RAP_couple_idx));
1959 Pattern_ptr row_couple_pattern(new Pattern(
1960 MATRIX_FORMAT_DEFAULT, num_RAPext_rows, num_Pmain_cols,
1961 row_couple_ptr, row_couple_idx));
1962
1963 /* next, create the system matrix */
1964 pattern.reset(new SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1965 output_dist, input_dist, main_pattern, col_couple_pattern,
1966 row_couple_pattern, col_connector, row_connector));
1967 out.reset(new SystemMatrix(A->type, pattern, row_block_size,
1968 col_block_size, false));
1969
1970 /* finally, fill in the data */
1971 memcpy(out->mainBlock->val, RAP_main_val,
1972 out->mainBlock->len* sizeof(double));
1973 memcpy(out->col_coupleBlock->val, RAP_couple_val,
1974 out->col_coupleBlock->len * sizeof(double));
1975
1976 delete[] RAP_main_val;
1977 delete[] RAP_couple_val;
1978 if (!Esys_noError()) {
1979 out.reset();
1980 }
1981 return out;
1982 }
1983
1984
1985 SystemMatrix_ptr Preconditioner_AMG_buildInterpolationOperatorBlock(
1986 SystemMatrix_ptr A, SystemMatrix_ptr P,
1987 SystemMatrix_ptr R)
1988 {
1989 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
1990 SystemMatrix_ptr out;
1991 SystemMatrixPattern_ptr pattern;
1992 Distribution_ptr input_dist, output_dist;
1993 SharedComponents_ptr send, recv;
1994 Connector_ptr col_connector, row_connector;
1995 const dim_t row_block_size=A->row_block_size;
1996 const dim_t col_block_size=A->col_block_size;
1997 const dim_t block_size = A->block_size;
1998 const double ZERO = 0.0;
1999 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2000 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2001 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2002 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2003 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2004 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2005 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2006 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2007 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2008 index_t sum, i, j, k, iptr, irb, icb, ib;
2009 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2010 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2011 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2012 index_t row_marker_ext, *where_p=NULL;
2013 index_t **send_ptr=NULL, **send_idx=NULL;
2014 dim_t p, num_neighbors;
2015 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2016 Esys_MPI_rank *neighbor=NULL;
2017 #ifdef ESYS_MPI
2018 MPI_Request* mpi_requests=NULL;
2019 MPI_Status* mpi_stati=NULL;
2020 #else
2021 int *mpi_requests=NULL, *mpi_stati=NULL;
2022 #endif
2023
2024 /* two sparse matrices R_main and R_couple will be generated, as the
2025 transposes of P_main and P_col_couple, respectively. Note that
2026 R_couple is actually the row_coupleBlock of R (R=P^T) */
2027 SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
2028 SparseMatrix_ptr R_couple(P->col_coupleBlock->getTranspose());
2029
2030 /* generate P_ext, i.e. portion of P that is stored on neighbor procs
2031 and needed locally for triple matrix product RAP
2032 to be specific, P_ext (on processor k) is the group of rows in P, where
2033 the list of rows from processor q is given by
2034 A->col_coupler->connector->send->shared[rPtr...]
2035 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2036 on q.
2037 P_ext is represented by two sparse matrices P_ext_main and
2038 P_ext_couple */
2039 Preconditioner_AMG_extendB(A, P);
2040
2041 /* count the number of cols in P_ext_couple, resize the pattern of
2042 sparse matrix P_ext_couple with new compressed order, and then
2043 build the col id mapping from P->col_coupleBlock to
2044 P_ext_couple */
2045 num_Pmain_cols = P->mainBlock->numCols;
2046 num_Pcouple_cols = P->col_coupleBlock->numCols;
2047 num_Acouple_cols = A->col_coupleBlock->numCols;
2048 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2049 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2050 offset = P->col_distribution->first_component[rank];
2051 num_Pext_cols = 0;
2052 if (P->global_id) {
2053 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2054 and P->col_coupleBlock */
2055 iptr = 0;
2056 if (num_Pcouple_cols || sum > 0) {
2057 temp = new index_t[num_Pcouple_cols+sum];
2058 for (; iptr<sum; iptr++){
2059 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2060 }
2061 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2062 temp[iptr] = P->global_id[j];
2063 }
2064 }
2065 if (iptr) {
2066 qsort(temp, (size_t)iptr, sizeof(index_t), comparIndex);
2067 num_Pext_cols = 1;
2068 i = temp[0];
2069 for (j=1; j<iptr; j++) {
2070 if (temp[j] > i) {
2071 i = temp[j];
2072 temp[num_Pext_cols++] = i;
2073 }
2074 }
2075 }
2076
2077 /* resize the pattern of P_ext_couple */
2078 if(num_Pext_cols){
2079 global_id_P = new index_t[num_Pext_cols];
2080 for (i=0; i<num_Pext_cols; i++)
2081 global_id_P[i] = temp[i];
2082 }
2083 if (num_Pcouple_cols || sum > 0)
2084 delete[] temp;
2085 for (i=0; i<sum; i++) {
2086 where_p = (index_t *)bsearch(
2087 &(P->remote_coupleBlock->pattern->index[i]),
2088 global_id_P, num_Pext_cols,
2089 sizeof(index_t), comparIndex);
2090 P->remote_coupleBlock->pattern->index[i] =
2091 (index_t)(where_p -global_id_P);
2092 }
2093
2094 /* build the mapping */
2095 if (num_Pcouple_cols) {
2096 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2097 iptr = 0;
2098 for (i=0; i<num_Pext_cols; i++)
2099 if (global_id_P[i] == P->global_id[iptr]) {
2100 Pcouple_to_Pext[iptr++] = i;
2101 if (iptr == num_Pcouple_cols) break;
2102 }
2103 }
2104 }
2105
2106 /* alloc and initialise the markers */
2107 sum = num_Pext_cols + num_Pmain_cols;
2108 P_marker = new index_t[sum];
2109 A_marker = new index_t[num_A_cols];
2110 #pragma omp parallel for private(i) schedule(static)
2111 for (i=0; i<sum; i++) P_marker[i] = -1;
2112 #pragma omp parallel for private(i) schedule(static)
2113 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
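   /* Marker technique: A_marker[i2] stores the last coarse row i_r for
      which column i2 of RA was expanded, so each A column is expanded at
      most once per row; P_marker[i_c] stores the position assigned to
      column i_c in the current RAP row, and comparing it with the row
      start (row_marker) tells whether the column already exists in this
      row. Initialising both to -1 avoids clearing them for every row. */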
2114
2115 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2116 sum = 0;
2117 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2118 row_marker = sum;
2119 /* then, loop over elements in row i_r of R_couple */
2120 j1_ub = R_couple->pattern->ptr[i_r+1];
2121 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2122 i1 = R_couple->pattern->index[j1];
2123 /* then, loop over elements in row i1 of A->col_coupleBlock */
2124 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2125 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2126 i2 = A->col_coupleBlock->pattern->index[j2];
2127
2128 /* check whether entry RA[i_r, i2] has been previously visited.
2129 RAP new entry is possible only if entry RA[i_r, i2] has not
2130 been visited yet */
2131 if (A_marker[i2] != i_r) {
2132 /* first, mark entry RA[i_r,i2] as visited */
2133 A_marker[i2] = i_r;
2134
2135 /* then loop over elements in row i2 of P_ext_main */
2136 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2137 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2138 i_c = P->row_coupleBlock->pattern->index[j3];
2139
2140 /* check whether entry RAP[i_r,i_c] has been created.
2141 If not yet created, create the entry and increase
2142 the total number of elements in RAP_ext */
2143 if (P_marker[i_c] < row_marker) {
2144 P_marker[i_c] = sum;
2145 sum++;
2146 }
2147 }
2148
2149 /* loop over elements in row i2 of P_ext_couple, do the same */
2150 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2151 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2152 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2153 if (P_marker[i_c] < row_marker) {
2154 P_marker[i_c] = sum;
2155 sum++;
2156 }
2157 }
2158 }
2159 }
2160
2161 /* now loop over elements in row i1 of A->mainBlock, repeat
2162 the process we have done to block A->col_coupleBlock */
2163 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2164 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2165 i2 = A->mainBlock->pattern->index[j2];
2166 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2167 A_marker[i2 + num_Acouple_cols] = i_r;
2168 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2169 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2170 i_c = P->mainBlock->pattern->index[j3];
2171 if (P_marker[i_c] < row_marker) {
2172 P_marker[i_c] = sum;
2173 sum++;
2174 }
2175 }
2176 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2177 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2178 /* note that we need to map the column index in
2179 P->col_coupleBlock back into the column index in
2180 P_ext_couple */
2181 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2182 if (P_marker[i_c] < row_marker) {
2183 P_marker[i_c] = sum;
2184 sum++;
2185 }
2186 }
2187 }
2188 }
2189 }
2190 }
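   /* The sweep above is purely symbolic: it only counts the nonzero
      blocks ("sum") that each row of RAP_ext will contain. The loop
      below repeats the same traversal numerically, forming the block
      products RA = R*A and RAP = RA*P and storing indices and values,
      i.e. the usual two-pass assembly of a sparse triple product. */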
2191
2192 /* Now we have the number of non-zero elements in RAP_ext, allocate
2193 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2194 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2195 RAP_ext_idx = new index_t[sum];
2196 RAP_ext_val = new double[sum * block_size];
2197 RA_val = new double[block_size];
2198 RAP_val = new double[block_size];
2199
2200 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2201 sum = num_Pext_cols + num_Pmain_cols;
2202 #pragma omp parallel for private(i) schedule(static)
2203 for (i=0; i<sum; i++) P_marker[i] = -1;
2204 #pragma omp parallel for private(i) schedule(static)
2205 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2206 sum = 0;
2207 RAP_ext_ptr[0] = 0;
2208 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2209 row_marker = sum;
2210 /* then, loop over elements in row i_r of R_couple */
2211 j1_ub = R_couple->pattern->ptr[i_r+1];
2212 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2213 i1 = R_couple->pattern->index[j1];
2214 R_val = &(R_couple->val[j1*block_size]);
2215
2216 /* then, loop over elements in row i1 of A->col_coupleBlock */
2217 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2218 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2219 i2 = A->col_coupleBlock->pattern->index[j2];
2220 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2221 for (irb=0; irb<row_block_size; irb++) {
2222 for (icb=0; icb<col_block_size; icb++) {
2223 rtmp = ZERO;
2224 for (ib=0; ib<col_block_size; ib++) {
2225 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2226 }
2227 RA_val[irb+row_block_size*icb]=rtmp;
2228 }
2229 }
2230
2231 /* check whether entry RA[i_r, i2] has been previously visited.
2232 RAP new entry is possible only if entry RA[i_r, i2] has not
2233 been visited yet */
2234 if (A_marker[i2] != i_r) {
2235 /* first, mark entry RA[i_r,i2] as visited */
2236 A_marker[i2] = i_r;
2237
2238 /* then loop over elements in row i2 of P_ext_main */
2239 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2240 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2241 i_c = P->row_coupleBlock->pattern->index[j3];
2242 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2243 for (irb=0; irb<row_block_size; irb++) {
2244 for (icb=0; icb<col_block_size; icb++) {
2245 rtmp = ZERO;
2246 for (ib=0; ib<col_block_size; ib++) {
2247 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2248 }
2249 RAP_val[irb+row_block_size*icb]=rtmp;
2250 }
2251 }
2252
2253
2254 /* check whether entry RAP[i_r,i_c] has been created.
2255 If not yet created, create the entry and increase
2256 the total number of elements in RAP_ext */
2257 if (P_marker[i_c] < row_marker) {
2258 P_marker[i_c] = sum;
2259 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2260 RAP_ext_idx[sum] = i_c + offset;
2261 sum++;
2262 } else {
2263 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2264 for (ib=0; ib<block_size; ib++) {
2265 temp_val[ib] += RAP_val[ib];
2266 }
2267 }
2268 }
2269
2270 /* loop over elements in row i2 of P_ext_couple, do the same */
2271 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2272 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2273 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2274 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2275 for (irb=0; irb<row_block_size; irb++) {
2276 for (icb=0; icb<col_block_size; icb++) {
2277 rtmp = ZERO;
2278 for (ib=0; ib<col_block_size; ib++) {
2279 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2280 }
2281 RAP_val[irb+row_block_size*icb]=rtmp;
2282 }
2283 }
2284
2285 if (P_marker[i_c] < row_marker) {
2286 P_marker[i_c] = sum;
2287 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2288 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2289 sum++;
2290 } else {
2291 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2292 for (ib=0; ib<block_size; ib++) {
2293 temp_val[ib] += RAP_val[ib];
2294 }
2295 }
2296 }
2297
2298 /* If entry RA[i_r, i2] has already been visited, no new RAP entry is
2299 created; only the contributions are added. */
2300 } else {
2301 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2302 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2303 i_c = P->row_coupleBlock->pattern->index[j3];
2304 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2305 for (irb=0; irb<row_block_size; irb++) {
2306 for (icb=0; icb<col_block_size; icb++) {
2307 rtmp = ZERO;
2308 for (ib=0; ib<col_block_size; ib++) {
2309 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2310 }
2311 RAP_val[irb+row_block_size*icb]=rtmp;
2312 }
2313 }
2314
2315 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2316 for (ib=0; ib<block_size; ib++) {
2317 temp_val[ib] += RAP_val[ib];
2318 }
2319 }
2320 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2321 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2322 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2323 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2324 for (irb=0; irb<row_block_size; irb++) {
2325 for (icb=0; icb<col_block_size; icb++) {
2326 rtmp = ZERO;
2327 for (ib=0; ib<col_block_size; ib++) {
2328 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2329 }
2330 RAP_val[irb+row_block_size*icb]=rtmp;
2331 }
2332 }
2333
2334 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2335 for (ib=0; ib<block_size; ib++) {
2336 temp_val[ib] += RAP_val[ib];
2337 }
2338 }
2339 }
2340 }
2341
2342 /* now loop over elements in row i1 of A->mainBlock, repeat
2343 the process we have done to block A->col_coupleBlock */
2344 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2345 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2346 i2 = A->mainBlock->pattern->index[j2];
2347 temp_val = &(A->mainBlock->val[j2*block_size]);
2348 for (irb=0; irb<row_block_size; irb++) {
2349 for (icb=0; icb<col_block_size; icb++) {
2350 rtmp = ZERO;
2351 for (ib=0; ib<col_block_size; ib++) {
2352 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2353 }
2354 RA_val[irb+row_block_size*icb]=rtmp;
2355 }
2356 }
2357
2358 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2359 A_marker[i2 + num_Acouple_cols] = i_r;
2360 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2361 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2362 i_c = P->mainBlock->pattern->index[j3];
2363 temp_val = &(P->mainBlock->val[j3*block_size]);
2364 for (irb=0; irb<row_block_size; irb++) {
2365 for (icb=0; icb<col_block_size; icb++) {
2366 rtmp = ZERO;
2367 for (ib=0; ib<col_block_size; ib++) {
2368 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2369 }
2370 RAP_val[irb+row_block_size*icb]=rtmp;
2371 }
2372 }
2373
2374 if (P_marker[i_c] < row_marker) {
2375 P_marker[i_c] = sum;
2376 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2377 RAP_ext_idx[sum] = i_c + offset;
2378 sum++;
2379 } else {
2380 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2381 for (ib=0; ib<block_size; ib++) {
2382 temp_val[ib] += RAP_val[ib];
2383 }
2384 }
2385 }
2386 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2387 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2388 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2389 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2390 for (irb=0; irb<row_block_size; irb++) {
2391 for (icb=0; icb<col_block_size; icb++) {
2392 rtmp = ZERO;
2393 for (ib=0; ib<col_block_size; ib++) {
2394 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2395 }
2396 RAP_val[irb+row_block_size*icb]=rtmp;
2397 }
2398 }
2399
2400 if (P_marker[i_c] < row_marker) {
2401 P_marker[i_c] = sum;
2402 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2403 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2404 sum++;
2405 } else {
2406 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2407 for (ib=0; ib<block_size; ib++) {
2408 temp_val[ib] += RAP_val[ib];
2409 }
2410 }
2411 }
2412 } else {
2413 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2414 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2415 i_c = P->mainBlock->pattern->index[j3];
2416 temp_val = &(P->mainBlock->val[j3*block_size]);
2417 for (irb=0; irb<row_block_size; irb++) {
2418 for (icb=0; icb<col_block_size; icb++) {
2419 rtmp = ZERO;
2420 for (ib=0; ib<col_block_size; ib++) {
2421 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2422 }
2423 RAP_val[irb+row_block_size*icb]=rtmp;
2424 }
2425 }
2426
2427 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2428 for (ib=0; ib<block_size; ib++) {
2429 temp_val[ib] += RAP_val[ib];
2430 }
2431 }
2432 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2433 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2434 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2435 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2436 for (irb=0; irb<row_block_size; irb++) {
2437 for (icb=0; icb<col_block_size; icb++) {
2438 rtmp = ZERO;
2439 for (ib=0; ib<col_block_size; ib++) {
2440 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2441 }
2442 RAP_val[irb+row_block_size*icb]=rtmp;
2443 }
2444 }
2445
2446 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2447 for (ib=0; ib<block_size; ib++) {
2448 temp_val[ib] += RAP_val[ib];
2449 }
2450 }
2451 }
2452 }
2453 }
2454 RAP_ext_ptr[i_r+1] = sum;
2455 }
2456 delete[] P_marker;
2457 delete[] Pcouple_to_Pext;
2458
2459 /* Now we have the part of RAP[r,c] where the rows "r" are the rows
2460 given by the column list of P->col_coupleBlock, and the columns
2461 "c" possibly cover the whole column range of the system matrix
2462 P. This part of the data is to be passed to the neighbouring
2463 processors and added to the corresponding RAP[r,c] entries on
2464 those processors */
2465 Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2466 &RAP_ext_val, global_id_P, block_size);
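   /* After Preconditioner_AMG_CopyRemoteData the arrays RAP_ext_ptr,
      RAP_ext_idx and RAP_ext_val hold the rows received from the
      neighbouring ranks (one row per entry of the col_coupler send
      list), with column indices still given as global ids; the code
      below collects those ids into global_id_RAP and renumbers the
      indices into a compressed local ordering. */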
2467
2468 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2469 sum = RAP_ext_ptr[num_RAPext_rows];
2470 num_RAPext_cols = 0;
2471 if (num_Pext_cols || sum > 0) {
2472 temp = new index_t[num_Pext_cols+sum];
2473 j1_ub = offset + num_Pmain_cols;
2474 for (i=0, iptr=0; i<sum; i++) {
2475 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2476 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2477 }
2478 for (j=0; j<num_Pext_cols; j++, iptr++){
2479 temp[iptr] = global_id_P[j];
2480 }
2481
2482 if (iptr) {
2483 qsort(temp, (size_t)iptr, sizeof(index_t), comparIndex);
2484 num_RAPext_cols = 1;
2485 i = temp[0];
2486 for (j=1; j<iptr; j++) {
2487 if (temp[j] > i) {
2488 i = temp[j];
2489 temp[num_RAPext_cols++] = i;
2490 }
2491 }
2492 }
2493 }
2494
2495 /* store the global ids of the external columns of RAP_ext in global_id_RAP */
2496 if(num_RAPext_cols){
2497 global_id_RAP = new index_t[num_RAPext_cols];
2498 for (i=0; i<num_RAPext_cols; i++)
2499 global_id_RAP[i] = temp[i];
2500 }
2501 if (num_Pext_cols || sum > 0)
2502 delete[] temp;
2503 j1_ub = offset + num_Pmain_cols;
2504 for (i=0; i<sum; i++) {
2505 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2506 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2507 /*XXX*/ num_RAPext_cols, sizeof(index_t), comparIndex);
2508 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2509 } else
2510 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2511 }
2512
2513 /* build the mapping */
2514 if (num_Pcouple_cols) {
2515 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
2516 iptr = 0;
2517 for (i=0; i<num_RAPext_cols; i++)
2518 if (global_id_RAP[i] == P->global_id[iptr]) {
2519 Pcouple_to_RAP[iptr++] = i;
2520 if (iptr == num_Pcouple_cols) break;
2521 }
2522 }
2523
2524 if (num_Pext_cols) {
2525 Pext_to_RAP = new index_t[num_Pext_cols];
2526 iptr = 0;
2527 for (i=0; i<num_RAPext_cols; i++)
2528 if (global_id_RAP[i] == global_id_P[iptr]) {
2529 Pext_to_RAP[iptr++] = i;
2530 if (iptr == num_Pext_cols) break;
2531 }
2532 }
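   /* The matching loops above rely on global_id_RAP and the P-side id
      lists (P->global_id, global_id_P) being sorted in ascending order:
      a single forward scan then locates each P column in the RAP
      numbering to fill Pcouple_to_RAP and Pext_to_RAP. */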
2533
2534 if (global_id_P){
2535 delete[] global_id_P;
2536 global_id_P = NULL;
2537 }
2538
2539 /* alloc and initialise the markers */
2540 sum = num_RAPext_cols + num_Pmain_cols;
2541 P_marker = new index_t[sum];
2542 #pragma omp parallel for private(i) schedule(static)
2543 for (i=0; i<sum; i++) P_marker[i] = -1;
2544 #pragma omp parallel for private(i) schedule(static)
2545 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2546
2547 /* Now, count the size of RAP. Start with rows in R_main */
2548 num_neighbors = P->col_coupler->connector->send->numNeighbors;
2549 offsetInShared = P->col_coupler->connector->send->offsetInShared;
2550 shared = P->col_coupler->connector->send->shared;
2551 i = 0;
2552 j = 0;
2553 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2554 /* Mark the start of row for both main block and couple block */
2555 row_marker = i;
2556 row_marker_ext = j;