
Contents of /trunk/paso/src/AMG_Interpolation.cpp



Revision 4934
Tue May 13 00:28:11 2014 UTC by jfenwick
File size: 132779 byte(s)
This commit is brought to you by the number 4934 and the tool "meld".
Merge of partially complete split world code from branch.



1 /*****************************************************************************
2 *
3 * Copyright (c) 2003-2014 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development 2012-2013 by School of Earth Sciences
12 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 *
14 *****************************************************************************/
15
16
17 /****************************************************************************/
18
19 /* Paso: defines AMG Interpolation */
20
21 /****************************************************************************/
22
23 /* Author: l.gao@uq.edu.au */
24
25 /****************************************************************************/
26
27 #include "Paso.h"
28 #include "SparseMatrix.h"
29 #include "PasoUtil.h"
30 #include "Preconditioner.h"
31
32 #include <cstring> // memcpy
33
34 namespace paso {
35
36 /****************************************************************************
37
38 Methods necessary for AMG preconditioner
39
40 construct n_C x n_C interpolation operator A_C from matrix A
41 and prolongation matrix P.
42
43 The coarsening operator A_C is defined by RAP where R=P^T.
44
45 *****************************************************************************/
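/* For orientation only (a sketch, not code used below): with a
   hypothetical dense prolongation P of size n x n_C, the entries of the
   coarse operator are
       A_C[p][q] = sum_{i,j} P[i][p] * A[i][j] * P[j][q],
   i.e. A_C = R A P with R = P^T. The routines in this file evaluate the
   same product for paso's distributed block-sparse storage, one block
   row at a time, without ever forming A_C densely. */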
46
47 /* Extend system matrix B with two extra sparse matrices:
48 B_ext_main and B_ext_couple
49 The combination of these two sparse matrices represents
50 the portion of B that is stored on neighbour procs and
51 needed locally for triple matrix product (B^T) A B.
52
53 FOR NOW, we assume that the system matrix B has a NULL
54 row_coupleBlock and a NULL remote_coupleBlock. Based on
55 this assumption, we use the link row_coupleBlock for the sparse
56 matrix B_ext_main, and the link remote_coupleBlock for the sparse
57 matrix B_ext_couple.
58
59 To be specific, B_ext (on processor k) is the group of rows
60 in B, where the list of rows from processor q is given by
61 A->col_coupler->connector->send->shared[rPtr...]
62 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
63 on q.
64 */
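/* Illustrative note on the split performed below (using this routine's
   local variable names): when a row is packed for a neighbour rank that
   owns the global columns [m_lb, m_ub) of B, entries whose global column
   id m lies in [m_lb, m_ub) are destined for B_ext_main on that rank and
   are stored with the local index m - m_lb, while all other entries are
   destined for B_ext_couple and keep their global column id m until they
   are compressed later. */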
65 void Preconditioner_AMG_extendB(SystemMatrix_ptr A, SystemMatrix_ptr B)
66 {
67 if (A->mpi_info->size == 1) return;
68
69 if (B->remote_coupleBlock.get()) {
70 Esys_setError(VALUE_ERROR, "Preconditioner_AMG_extendB: the link to "
71 "remote_coupleBlock has already been set.");
72 return;
73 }
74 #ifdef ESYS_MPI
75 B->row_coupleBlock.reset();
76
77 Pattern_ptr pattern_main, pattern_couple;
78 Coupler_ptr coupler;
79 SharedComponents_ptr send, recv;
80 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
81 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
82 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
83 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
84 index_t *idx_m=NULL, *idx_c=NULL;
85 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
86 index_t offset, len, block_size, block_size_size, max_num_cols;
87 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
88 dim_t *recv_degree=NULL, *send_degree=NULL;
89 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
90
91 // send/receive the unknowns' global IDs
92 num_main_cols = B->mainBlock->numCols;
93 cols = new double[num_main_cols];
94 offset = B->col_distribution->first_component[rank];
95 #pragma omp parallel for private(i) schedule(static)
96 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
97 if (B->global_id == NULL) {
98 coupler.reset(new Coupler(B->col_coupler->connector, 1));
99 coupler->startCollect(cols);
100 }
101
102 recv_buf = new index_t[size];
103 recv_degree = new dim_t[size];
104 recv_offset = new index_t[size+1];
105 #pragma omp parallel for private(i) schedule(static)
106 for (i=0; i<size; i++){
107 recv_buf[i] = 0;
108 recv_degree[i] = 1;
109 recv_offset[i] = i;
110 }
111
112 block_size = B->block_size;
113 block_size_size = block_size * sizeof(double);
114 num_couple_cols = B->col_coupleBlock->numCols;
115 send = A->col_coupler->connector->send;
116 recv = A->col_coupler->connector->recv;
117 num_neighbors = send->numNeighbors;
118 p = send->offsetInShared[num_neighbors];
119 len = p * B->col_distribution->first_component[size];
120 send_buf = new double[len * block_size];
121 send_idx = new index_t[len];
122 send_offset = new index_t[(p+1)*2];
123 send_degree = new dim_t[num_neighbors];
124 i = num_main_cols + num_couple_cols;
125 send_m = new double[i * block_size];
126 send_c = new double[i * block_size];
127 idx_m = new index_t[i];
128 idx_c = new index_t[i];
129
130 /* wait for the unknowns' global IDs to arrive */
131 if (B->global_id == NULL) {
132 coupler->finishCollect();
133 global_id = new index_t[num_couple_cols];
134 #pragma omp parallel for private(i) schedule(static)
135 for (i=0; i<num_couple_cols; ++i)
136 global_id[i] = coupler->recv_buffer[i];
137 B->global_id = global_id;
138 } else
139 global_id = B->global_id;
140
141 /* distribute the number of cols in current col_coupleBlock to all ranks */
142 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
143
144 /* distribute global_ids of cols to be considered to all ranks*/
145 len = 0;
146 max_num_cols = 0;
147 for (i=0; i<size; i++){
148 recv_degree[i] = recv_buf[i];
149 recv_offset[i] = len;
150 len += recv_buf[i];
151 if (max_num_cols < recv_buf[i])
152 max_num_cols = recv_buf[i];
153 }
154 recv_offset[size] = len;
155 cols_array = new index_t[len];
156 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
157
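/* After the two MPI_Allgatherv calls above, recv_degree[q] holds the
   number of coupling columns owned by rank q, recv_offset[q] its offset,
   and cols_array[recv_offset[q] .. recv_offset[q]+recv_degree[q]) holds
   the global ids of those columns, gathered from every rank. */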
158 // first, prepare the ptr_ptr to be received
159 q = recv->numNeighbors;
160 len = recv->offsetInShared[q];
161 ptr_ptr = new index_t[(len+1) * 2];
162 for (p=0; p<q; p++) {
163 row = recv->offsetInShared[p];
164 m = recv->offsetInShared[p + 1];
165 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
166 A->mpi_info->msg_tag_counter+recv->neighbor[p],
167 A->mpi_info->comm,
168 &(A->col_coupler->mpi_requests[p]));
169 }
170
171 // now prepare the rows to be sent (the degree, the offset and the data)
172 len = 0;
173 i0 = 0;
174 for (p=0; p<num_neighbors; p++) {
175 i = i0;
176 neighbor = send->neighbor[p];
177 m_lb = B->col_distribution->first_component[neighbor];
178 m_ub = B->col_distribution->first_component[neighbor + 1];
179 j_ub = send->offsetInShared[p + 1];
180 for (j=send->offsetInShared[p]; j<j_ub; j++) {
181 row = send->shared[j];
182 l_m = 0;
183 l_c = 0;
184 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
185 k_lb = B->col_coupleBlock->pattern->ptr[row];
186
187 /* check part of col_coupleBlock for data to be passed @row */
188 for (k=k_lb; k<k_ub; k++) {
189 m = global_id[B->col_coupleBlock->pattern->index[k]];
190 if (m > offset) break;
191 if (m>= m_lb && m < m_ub) {
192 /* data to be passed to sparse matrix B_ext_main */
193 idx_m[l_m] = m - m_lb;
194 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
195 l_m++;
196 } else {
197 /* data to be passed to sparse matrix B_ext_couple */
198 idx_c[l_c] = m;
199 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
200 l_c++;
201 }
202 }
203 k_lb = k;
204
205 /* check mainBlock for data to be passed @row to sparse
206 matrix B_ext_couple */
207 k_ub = B->mainBlock->pattern->ptr[row + 1];
208 k = B->mainBlock->pattern->ptr[row];
209 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
210 for (; k<k_ub; k++) {
211 m = B->mainBlock->pattern->index[k] + offset;
212 idx_c[l_c] = m;
213 l_c++;
214 }
215
216 /* check the rest of col_coupleBlock for data to
217 be passed @row to sparse matrix B_ext_couple */
218 k = k_lb;
219 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
220 for (k=k_lb; k<k_ub; k++) {
221 m = global_id[B->col_coupleBlock->pattern->index[k]];
222 if (m>= m_lb && m < m_ub) {
223 /* data to be passed to sparse matrix B_ext_main */
224 idx_m[l_m] = m - m_lb;
225 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
226 l_m++;
227 } else {
228 /* data to be passed to sparse matrix B_ext_couple */
229 idx_c[l_c] = m;
230 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
231 l_c++;
232 }
233 }
234
235 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
236 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
237 send_offset[2*i] = l_m;
238 len += l_m;
239 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
240 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
241 send_offset[2*i+1] = l_c;
242 len += l_c;
243 i++;
244 }
245
246 /* sending */
247 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
248 A->mpi_info->msg_tag_counter+rank,
249 A->mpi_info->comm,
250 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
251 send_degree[p] = len;
252 i0 = i;
253 }
254 delete[] send_m;
255 delete[] send_c;
256 delete[] idx_m;
257 delete[] idx_c;
258
259 q = recv->numNeighbors;
260 len = recv->offsetInShared[q];
261 ptr_main = new index_t[(len+1)];
262 ptr_couple = new index_t[(len+1)];
263
264 MPI_Waitall(A->col_coupler->connector->send->numNeighbors +
265 A->col_coupler->connector->recv->numNeighbors,
266 A->col_coupler->mpi_requests,
267 A->col_coupler->mpi_stati);
268 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
269
270 j = 0;
271 k = 0;
272 ptr_main[0] = 0;
273 ptr_couple[0] = 0;
274 for (i=0; i<len; i++) {
275 j += ptr_ptr[2*i];
276 k += ptr_ptr[2*i+1];
277 ptr_main[i+1] = j;
278 ptr_couple[i+1] = k;
279 }
280
281 delete[] ptr_ptr;
282 idx_main = new index_t[j];
283 idx_couple = new index_t[k];
284 ptr_idx = new index_t[j+k];
285 ptr_val = new double[(j+k) * block_size];
286
287 /* send/receive index array */
288 j=0;
289 k_ub = 0;
290 for (p=0; p<recv->numNeighbors; p++) {
291 k = recv->offsetInShared[p];
292 m = recv->offsetInShared[p+1];
293 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
294 if (i > 0) {
295 k_ub ++;
296 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
297 A->mpi_info->msg_tag_counter+recv->neighbor[p],
298 A->mpi_info->comm,
299 &(A->col_coupler->mpi_requests[p]));
300 }
301 j += i;
302 }
303
304 j=0;
305 k_ub = 0;
306 for (p=0; p<num_neighbors; p++) {
307 i = send_degree[p] - j;
308 if (i > 0){
309 k_ub ++;
310 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
311 A->mpi_info->msg_tag_counter+rank,
312 A->mpi_info->comm,
313 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
314 }
315 j = send_degree[p];
316 }
317
318 MPI_Waitall(A->col_coupler->connector->send->numNeighbors +
319 A->col_coupler->connector->recv->numNeighbors,
320 A->col_coupler->mpi_requests,
321 A->col_coupler->mpi_stati);
322 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
323
324 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
325 for (i=0; i<len; i++) {
326 j = ptr_main[i];
327 k = ptr_main[i+1];
328 m = ptr_couple[i];
329 for (p=j; p<k; p++) {
330 idx_main[p] = ptr_idx[m+p];
331 }
332 j = ptr_couple[i+1];
333 for (p=m; p<j; p++) {
334 idx_couple[p] = ptr_idx[k+p];
335 }
336 }
337 delete[] ptr_idx;
338
339 /* allocate pattern and sparsematrix for B_ext_main */
340 pattern_main.reset(new Pattern(B->col_coupleBlock->pattern->type,
341 len, num_main_cols, ptr_main, idx_main));
342 B->row_coupleBlock.reset(new SparseMatrix(B->col_coupleBlock->type,
343 pattern_main, B->row_block_size, B->col_block_size,
344 false));
345
346 /* allocate pattern and sparsematrix for B_ext_couple */
347 pattern_couple.reset(new Pattern(B->col_coupleBlock->pattern->type,
348 len, B->col_distribution->first_component[size],
349 ptr_couple, idx_couple));
350 B->remote_coupleBlock.reset(new SparseMatrix(B->col_coupleBlock->type,
351 pattern_couple, B->row_block_size, B->col_block_size,
352 false));
353
354 /* send/receive value array */
355 j=0;
356 for (p=0; p<recv->numNeighbors; p++) {
357 k = recv->offsetInShared[p];
358 m = recv->offsetInShared[p+1];
359 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
360 if (i > 0)
361 MPI_Irecv(&(ptr_val[j]), i * block_size,
362 MPI_DOUBLE, recv->neighbor[p],
363 A->mpi_info->msg_tag_counter+recv->neighbor[p],
364 A->mpi_info->comm,
365 &(A->col_coupler->mpi_requests[p]));
366 j += (i * block_size);
367 }
368
369 j=0;
370 for (p=0; p<num_neighbors; p++) {
371 i = send_degree[p] - j;
372 if (i > 0)
373 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
374 A->mpi_info->msg_tag_counter+rank,
375 A->mpi_info->comm,
376 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
377 j = send_degree[p] ;
378 }
379
380 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
381 A->col_coupler->mpi_requests,
382 A->col_coupler->mpi_stati);
383 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
384
385 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
386 for (i=0; i<len; i++) {
387 j = ptr_main[i];
388 k = ptr_main[i+1];
389 m = ptr_couple[i];
390 for (p=j; p<k; p++) {
391 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
392 }
393 j = ptr_couple[i+1];
394 for (p=m; p<j; p++) {
395 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
396 }
397 }
398
399 delete[] ptr_val;
400 delete[] cols;
401 delete[] cols_array;
402 delete[] recv_offset;
403 delete[] recv_degree;
404 delete[] recv_buf;
405 delete[] send_buf;
406 delete[] send_offset;
407 delete[] send_degree;
408 delete[] send_idx;
409 #endif // ESYS_MPI
410 }
411
412 /* The sparse matrix (let's call it T) defined by T(ptr, idx, val)
413 has the same number of rows as P->col_coupleBlock->numCols. Now we need
414 to copy blocks of data in T to the neighbour processors given by
415 P->col_coupler->connector->recv->neighbor[k] where k is in
416 [0, P->col_coupler->connector->recv->numNeighbors).
417 The rows to be copied to neighbour processor k are in the list defined by
418 P->col_coupler->connector->recv->offsetInShared[k] ...
419 P->col_coupler->connector->recv->offsetInShared[k+1] */
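/* Usage sketch: the caller hands over its local rows of T and receives
   the remote rows in their place (the original ptr/idx/val arrays are
   freed inside), e.g. as done below in
   Preconditioner_AMG_buildInterpolationOperator():
       Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
                                         &RAP_ext_val, global_id_P, block_size);
*/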
420 void Preconditioner_AMG_CopyRemoteData(SystemMatrix_ptr P,
421 index_t **p_ptr, index_t **p_idx, double **p_val,
422 index_t *global_id, index_t block_size)
423 {
424 SharedComponents_ptr send, recv;
425 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
426 index_t i, j, p, m, n, size;
427 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
428 index_t *ptr=*p_ptr, *idx=*p_idx;
429 double *val=*p_val, *recv_val=NULL;
430 #ifdef ESYS_MPI
431 index_t rank = P->mpi_info->rank;
432 #endif
433
434 size = P->mpi_info->size;
435 send = P->col_coupler->connector->recv;
436 recv = P->col_coupler->connector->send;
437 send_neighbors = send->numNeighbors;
438 recv_neighbors = recv->numNeighbors;
439 send_rows = P->col_coupleBlock->numCols;
440 recv_rows = recv->offsetInShared[recv_neighbors];
441
442 send_degree = new index_t[send_rows];
443 recv_ptr = new index_t[recv_rows + 1];
444 #pragma omp parallel for schedule(static) private(i)
445 for (i=0; i<send_rows; i++)
446 send_degree[i] = ptr[i+1] - ptr[i];
447
448 /* First, send/receive the degree */
449 for (p=0; p<recv_neighbors; p++) { /* Receiving */
450 m = recv->offsetInShared[p];
451 n = recv->offsetInShared[p+1];
452 #ifdef ESYS_MPI
453 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
454 P->mpi_info->msg_tag_counter + recv->neighbor[p],
455 P->mpi_info->comm,
456 &(P->col_coupler->mpi_requests[p]));
457 #endif
458 }
459 for (p=0; p<send_neighbors; p++) { /* Sending */
460 m = send->offsetInShared[p];
461 n = send->offsetInShared[p+1];
462 #ifdef ESYS_MPI
463 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
464 P->mpi_info->msg_tag_counter + rank,
465 P->mpi_info->comm,
466 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
467 #endif
468 }
469 #ifdef ESYS_MPI
470 MPI_Waitall(send_neighbors+recv_neighbors,
471 P->col_coupler->mpi_requests,
472 P->col_coupler->mpi_stati);
473 #endif
474 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
475
476 delete[] send_degree;
477 m = util::cumsum(recv_rows, recv_ptr);
478 recv_ptr[recv_rows] = m;
479 recv_idx = new index_t[m];
480 recv_val = new double[m * block_size];
481
482 /* Next, send/receive the index array */
483 j = 0;
484 for (p=0; p<recv_neighbors; p++) { /* Receiving */
485 m = recv->offsetInShared[p];
486 n = recv->offsetInShared[p+1];
487 i = recv_ptr[n] - recv_ptr[m];
488 if (i > 0) {
489 #ifdef ESYS_MPI
490 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
491 P->mpi_info->msg_tag_counter + recv->neighbor[p],
492 P->mpi_info->comm,
493 &(P->col_coupler->mpi_requests[p]));
494 #endif
495 }
496 j += i;
497 }
498
499 j = 0;
500 for (p=0; p<send_neighbors; p++) { /* Sending */
501 m = send->offsetInShared[p];
502 n = send->offsetInShared[p+1];
503 i = ptr[n] - ptr[m];
504 if (i >0) {
505 #ifdef ESYS_MPI
506 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
507 P->mpi_info->msg_tag_counter + rank,
508 P->mpi_info->comm,
509 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
510 #endif
511 j += i;
512 }
513 }
514 #ifdef ESYS_MPI
515 MPI_Waitall(send_neighbors+recv_neighbors,
516 P->col_coupler->mpi_requests,
517 P->col_coupler->mpi_stati);
518 #endif
519 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
520
521 /* Last, send/receive the data array */
522 j = 0;
523 for (p=0; p<recv_neighbors; p++) { /* Receiving */
524 m = recv->offsetInShared[p];
525 n = recv->offsetInShared[p+1];
526 i = recv_ptr[n] - recv_ptr[m];
527 #ifdef ESYS_MPI
528 if (i > 0)
529 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
530 P->mpi_info->msg_tag_counter + recv->neighbor[p],
531 P->mpi_info->comm,
532 &(P->col_coupler->mpi_requests[p]));
533 #endif
534 j += (i*block_size);
535 }
536
537 j = 0;
538 for (p=0; p<send_neighbors; p++) { /* Sending */
539 m = send->offsetInShared[p];
540 n = send->offsetInShared[p+1];
541 i = ptr[n] - ptr[m];
542 if (i >0) {
543 #ifdef ESYS_MPI
544 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
545 P->mpi_info->msg_tag_counter + rank,
546 P->mpi_info->comm,
547 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
548 #endif
549 j += i * block_size;
550 }
551 }
552 #ifdef ESYS_MPI
553 MPI_Waitall(send_neighbors+recv_neighbors,
554 P->col_coupler->mpi_requests,
555 P->col_coupler->mpi_stati);
556 #endif
557 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
558
559 /* Clean up and return with received ptr, index and data arrays */
560 delete[] ptr;
561 delete[] idx;
562 delete[] val;
563 *p_ptr = recv_ptr;
564 *p_idx = recv_idx;
565 *p_val = recv_val;
566 }
567
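/* Overview of the steps implemented below: the coarse operator
   A_C = R A P is assembled in three stages:
     1. form the rows of RAP that belong to other ranks (RAP_ext) by
        looping over R_couple = (P->col_coupleBlock)^T;
     2. exchange RAP_ext with the neighbouring ranks via
        Preconditioner_AMG_CopyRemoteData();
     3. form RAP_main/RAP_couple from R_main = (P->mainBlock)^T, merge in
        the received RAP_ext contributions and assemble the coarse
        SystemMatrix together with its distribution and connectors. */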
568 SystemMatrix_ptr Preconditioner_AMG_buildInterpolationOperator(
569 SystemMatrix_ptr A, SystemMatrix_ptr P,
570 SystemMatrix_ptr R)
571 {
572 esysUtils::JMPI& mpi_info=A->mpi_info;
573 SystemMatrix_ptr out;
574 SystemMatrixPattern_ptr pattern;
575 Distribution_ptr input_dist, output_dist;
576 Connector_ptr col_connector, row_connector;
577 const dim_t row_block_size=A->row_block_size;
578 const dim_t col_block_size=A->col_block_size;
579 const dim_t block_size = A->block_size;
580 const dim_t P_block_size = P->block_size;
581 const double ZERO = 0.0;
582 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
583 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
584 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
585 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
586 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
587 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
588 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
589 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
590 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
591 index_t sum, i, j, k, iptr, irb, icb, ib;
592 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
593 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
594 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
595 index_t row_marker_ext, *where_p=NULL;
596 index_t **send_ptr=NULL, **send_idx=NULL;
597 dim_t p, num_neighbors;
598 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
599 Esys_MPI_rank *neighbor=NULL;
600 #ifdef ESYS_MPI
601 MPI_Request* mpi_requests=NULL;
602 MPI_Status* mpi_stati=NULL;
603 #else
604 int *mpi_requests=NULL, *mpi_stati=NULL;
605 #endif
606
607 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
608 return Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
609
610 /* two sparse matrices R_main and R_couple will be generated as the
611 transposes of P_main and P_col_couple, respectively. Note that
612 R_couple is actually the row_coupleBlock of R (R=P^T) */
613 SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
614 SparseMatrix_ptr R_couple;
615 if (size > 1)
616 R_couple = P->col_coupleBlock->getTranspose();
617
618 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
619 and needed locally for the triple matrix product RAP.
620 To be specific, P_ext (on processor k) is the group of rows in P, where
621 the list of rows from processor q is given by
622 A->col_coupler->connector->send->shared[rPtr...]
623 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
624 on q.
625 P_ext is represented by two sparse matrices P_ext_main and
626 P_ext_couple */
627 Preconditioner_AMG_extendB(A, P);
628
629 /* count the number of cols in P_ext_couple, resize the pattern of
630 sparse matrix P_ext_couple with new compressed order, and then
631 build the col id mapping from P->col_coupleBlock to
632 P_ext_couple */
633 num_Pmain_cols = P->mainBlock->numCols;
634 if (size > 1) {
635 num_Pcouple_cols = P->col_coupleBlock->numCols;
636 num_Acouple_cols = A->col_coupleBlock->numCols;
637 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
638 } else {
639 num_Pcouple_cols = 0;
640 num_Acouple_cols = 0;
641 sum = 0;
642 }
643 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
644 offset = P->col_distribution->first_component[rank];
645 num_Pext_cols = 0;
646 if (P->global_id) {
647 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
648 and P->col_coupleBlock */
649 iptr = 0;
650 if (num_Pcouple_cols || sum > 0) {
651 temp = new index_t[num_Pcouple_cols+sum];
652 #pragma omp parallel for lastprivate(iptr) schedule(static)
653 for (iptr=0; iptr<sum; iptr++){
654 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
655 }
656 for (j=0; j<num_Pcouple_cols; j++, iptr++){
657 temp[iptr] = P->global_id[j];
658 }
659 }
660 if (iptr) {
661 qsort(temp, (size_t)iptr, sizeof(index_t), util::comparIndex);
662 num_Pext_cols = 1;
663 i = temp[0];
664 for (j=1; j<iptr; j++) {
665 if (temp[j] > i) {
666 i = temp[j];
667 temp[num_Pext_cols++] = i;
668 }
669 }
670 }
671 /* resize the pattern of P_ext_couple */
672 if(num_Pext_cols){
673 global_id_P = new index_t[num_Pext_cols];
674 #pragma omp parallel for private(i) schedule(static)
675 for (i=0; i<num_Pext_cols; i++)
676 global_id_P[i] = temp[i];
677 }
678 if (num_Pcouple_cols || sum > 0)
679 delete[] temp;
680 #pragma omp parallel for private(i, where_p) schedule(static)
681 for (i=0; i<sum; i++) {
682 where_p = (index_t *)bsearch(
683 &(P->remote_coupleBlock->pattern->index[i]),
684 global_id_P, num_Pext_cols,
685 sizeof(index_t), util::comparIndex);
686 P->remote_coupleBlock->pattern->index[i] =
687 (index_t)(where_p -global_id_P);
688 }
689
690 /* build the mapping */
691 if (num_Pcouple_cols) {
692 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
693 iptr = 0;
694 for (i=0; i<num_Pext_cols; i++)
695 if (global_id_P[i] == P->global_id[iptr]) {
696 Pcouple_to_Pext[iptr++] = i;
697 if (iptr == num_Pcouple_cols) break;
698 }
699 }
700 }
701
702 /* allocate and initialise the markers */
703 sum = num_Pext_cols + num_Pmain_cols;
704 P_marker = new index_t[sum];
705 A_marker = new index_t[num_A_cols];
706 #pragma omp parallel for private(i) schedule(static)
707 for (i=0; i<sum; i++) P_marker[i] = -1;
708 #pragma omp parallel for private(i) schedule(static)
709 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
710
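/* Note on the marker arrays used below: A_marker[i2] == i_r records
   that column i2 of R*A has already been visited for the current row
   i_r, and P_marker[i_c] >= row_marker records that entry RAP[i_r, i_c]
   already exists in the current row, so every non-zero of the product is
   counted (and, in the fill-in pass, accumulated) exactly once. */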
711 /* Now, count the size of RAP_ext. Start with rows in R_couple */
712 sum = 0;
713 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
714 row_marker = sum;
715 /* then, loop over elements in row i_r of R_couple */
716 j1_ub = R_couple->pattern->ptr[i_r+1];
717 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
718 i1 = R_couple->pattern->index[j1];
719 /* then, loop over elements in row i1 of A->col_coupleBlock */
720 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
721 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
722 i2 = A->col_coupleBlock->pattern->index[j2];
723
724 /* check whether entry RA[i_r, i2] has been previously visited.
725 A new RAP entry is possible only if entry RA[i_r, i2] has not
726 been visited yet */
727 if (A_marker[i2] != i_r) {
728 /* first, mark entry RA[i_r,i2] as visited */
729 A_marker[i2] = i_r;
730
731 /* then loop over elements in row i2 of P_ext_main */
732 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
733 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
734 i_c = P->row_coupleBlock->pattern->index[j3];
735
736 /* check whether entry RAP[i_r,i_c] has been created.
737 If not yet created, create the entry and increase
738 the total number of elements in RAP_ext */
739 if (P_marker[i_c] < row_marker) {
740 P_marker[i_c] = sum;
741 sum++;
742 }
743 }
744
745 /* loop over elements in row i2 of P_ext_couple, do the same */
746 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
747 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
748 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
749 if (P_marker[i_c] < row_marker) {
750 P_marker[i_c] = sum;
751 sum++;
752 }
753 }
754 }
755 }
756
757 /* now loop over elements in row i1 of A->mainBlock, repeat
758 the process we have done to block A->col_coupleBlock */
759 j2_ub = A->mainBlock->pattern->ptr[i1+1];
760 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
761 i2 = A->mainBlock->pattern->index[j2];
762 if (A_marker[i2 + num_Acouple_cols] != i_r) {
763 A_marker[i2 + num_Acouple_cols] = i_r;
764 j3_ub = P->mainBlock->pattern->ptr[i2+1];
765 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
766 i_c = P->mainBlock->pattern->index[j3];
767 if (P_marker[i_c] < row_marker) {
768 P_marker[i_c] = sum;
769 sum++;
770 }
771 }
772 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
773 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
774 /* note that we need to map the column index in
775 P->col_coupleBlock back into the column index in
776 P_ext_couple */
777 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
778 if (P_marker[i_c] < row_marker) {
779 P_marker[i_c] = sum;
780 sum++;
781 }
782 }
783 }
784 }
785 }
786 }
787
788 /* Now we have the number of non-zero elements in RAP_ext, allocate
789 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
790 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
791 RAP_ext_idx = new index_t[sum];
792 RAP_ext_val = new double[sum * block_size];
793 RA_val = new double[block_size];
794 RAP_val = new double[block_size];
795
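/* Block layout used in the loops below: each non-zero is a dense
   row_block_size x col_block_size block stored column by column, so the
   (irb, icb) element of RA_val/RAP_val sits at index
   irb + row_block_size*icb. */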
796 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
797 sum = num_Pext_cols + num_Pmain_cols;
798 #pragma omp parallel for private(i) schedule(static)
799 for (i=0; i<sum; i++) P_marker[i] = -1;
800 #pragma omp parallel for private(i) schedule(static)
801 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
802 sum = 0;
803 RAP_ext_ptr[0] = 0;
804 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
805 row_marker = sum;
806 /* then, loop over elements in row i_r of R_couple */
807 j1_ub = R_couple->pattern->ptr[i_r+1];
808 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
809 i1 = R_couple->pattern->index[j1];
810 R_val = &(R_couple->val[j1*P_block_size]);
811
812 /* then, loop over elements in row i1 of A->col_coupleBlock */
813 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
814 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
815 i2 = A->col_coupleBlock->pattern->index[j2];
816 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
817 for (irb=0; irb<row_block_size; irb++)
818 for (icb=0; icb<col_block_size; icb++)
819 RA_val[irb+row_block_size*icb] = ZERO;
820 for (irb=0; irb<P_block_size; irb++) {
821 rtmp=R_val[irb];
822 for (icb=0; icb<col_block_size; icb++) {
823 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
824 }
825 }
826
827 /* check whether entry RA[i_r, i2] has been previously visited.
828 A new RAP entry is possible only if entry RA[i_r, i2] has not
829 been visited yet */
830 if (A_marker[i2] != i_r) {
831 /* first, mark entry RA[i_r,i2] as visited */
832 A_marker[i2] = i_r;
833
834 /* then loop over elements in row i2 of P_ext_main */
835 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
836 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
837 i_c = P->row_coupleBlock->pattern->index[j3];
838 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
839 for (irb=0; irb<row_block_size; irb++)
840 for (icb=0; icb<col_block_size; icb++)
841 RAP_val[irb+row_block_size*icb] = ZERO;
842 for (icb=0; icb<P_block_size; icb++) {
843 rtmp = temp_val[icb];
844 for (irb=0; irb<row_block_size; irb++) {
845 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
846 }
847 }
848
849 /* check whether entry RAP[i_r,i_c] has been created.
850 If not yet created, create the entry and increase
851 the total number of elements in RAP_ext */
852 if (P_marker[i_c] < row_marker) {
853 P_marker[i_c] = sum;
854 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
855 RAP_ext_idx[sum] = i_c + offset;
856 sum++;
857 } else {
858 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
859 for (ib=0; ib<block_size; ib++) {
860 temp_val[ib] += RAP_val[ib];
861 }
862 }
863 }
864
865 /* loop over elements in row i2 of P_ext_couple, do the same */
866 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
867 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
868 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
869 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
870 for (irb=0; irb<row_block_size; irb++)
871 for (icb=0; icb<col_block_size; icb++)
872 RAP_val[irb+row_block_size*icb]=ZERO;
873 for (icb=0; icb<P_block_size; icb++) {
874 rtmp = temp_val[icb];
875 for (irb=0; irb<row_block_size; irb++) {
876 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
877 }
878 }
879
880 if (P_marker[i_c] < row_marker) {
881 P_marker[i_c] = sum;
882 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
883 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
884 sum++;
885 } else {
886 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
887 for (ib=0; ib<block_size; ib++) {
888 temp_val[ib] += RAP_val[ib];
889 }
890 }
891 }
892
893 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
894 Only the contributions are added. */
895 } else {
896 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
897 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
898 i_c = P->row_coupleBlock->pattern->index[j3];
899 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
900 for (irb=0; irb<row_block_size; irb++)
901 for (icb=0; icb<col_block_size; icb++)
902 RAP_val[irb+row_block_size*icb]=ZERO;
903 for (icb=0; icb<P_block_size; icb++) {
904 rtmp = temp_val[icb];
905 for (irb=0; irb<row_block_size; irb++) {
906 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
907 }
908 }
909
910 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
911 for (ib=0; ib<block_size; ib++) {
912 temp_val[ib] += RAP_val[ib];
913 }
914 }
915 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
916 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
917 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
918 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
919 for (irb=0; irb<row_block_size; irb++)
920 for (icb=0; icb<col_block_size; icb++)
921 RAP_val[irb+row_block_size*icb]=ZERO;
922 for (icb=0; icb<P_block_size; icb++) {
923 rtmp = temp_val[icb];
924 for (irb=0; irb<row_block_size; irb++) {
925 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
926 }
927 }
928
929 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
930 for (ib=0; ib<block_size; ib++) {
931 temp_val[ib] += RAP_val[ib];
932 }
933 }
934 }
935 }
936
937 /* now loop over elements in row i1 of A->mainBlock, repeat
938 the process we have done to block A->col_coupleBlock */
939 j2_ub = A->mainBlock->pattern->ptr[i1+1];
940 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
941 i2 = A->mainBlock->pattern->index[j2];
942 temp_val = &(A->mainBlock->val[j2*block_size]);
943 for (irb=0; irb<row_block_size; irb++)
944 for (icb=0; icb<col_block_size; icb++)
945 RA_val[irb+row_block_size*icb]=ZERO;
946 for (irb=0; irb<P_block_size; irb++) {
947 rtmp=R_val[irb];
948 for (icb=0; icb<col_block_size; icb++) {
949 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
950 }
951 }
952
953 if (A_marker[i2 + num_Acouple_cols] != i_r) {
954 A_marker[i2 + num_Acouple_cols] = i_r;
955 j3_ub = P->mainBlock->pattern->ptr[i2+1];
956 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
957 i_c = P->mainBlock->pattern->index[j3];
958 temp_val = &(P->mainBlock->val[j3*P_block_size]);
959 for (irb=0; irb<row_block_size; irb++)
960 for (icb=0; icb<col_block_size; icb++)
961 RAP_val[irb+row_block_size*icb]=ZERO;
962 for (icb=0; icb<P_block_size; icb++) {
963 rtmp = temp_val[icb];
964 for (irb=0; irb<row_block_size; irb++) {
965 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
966 }
967 }
968
969 if (P_marker[i_c] < row_marker) {
970 P_marker[i_c] = sum;
971 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
972 RAP_ext_idx[sum] = i_c + offset;
973 sum++;
974 } else {
975 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
976 for (ib=0; ib<block_size; ib++) {
977 temp_val[ib] += RAP_val[ib];
978 }
979 }
980 }
981 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
982 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
983 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
984 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
985 for (irb=0; irb<row_block_size; irb++)
986 for (icb=0; icb<col_block_size; icb++)
987 RAP_val[irb+row_block_size*icb]=ZERO;
988 for (icb=0; icb<P_block_size; icb++) {
989 rtmp=temp_val[icb];
990 for (irb=0; irb<row_block_size; irb++) {
991 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
992 }
993 }
994
995 if (P_marker[i_c] < row_marker) {
996 P_marker[i_c] = sum;
997 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
998 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
999 sum++;
1000 } else {
1001 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1002 for (ib=0; ib<block_size; ib++) {
1003 temp_val[ib] += RAP_val[ib];
1004 }
1005 }
1006 }
1007 } else {
1008 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1009 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1010 i_c = P->mainBlock->pattern->index[j3];
1011 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1012 for (irb=0; irb<row_block_size; irb++)
1013 for (icb=0; icb<col_block_size; icb++)
1014 RAP_val[irb+row_block_size*icb]=ZERO;
1015 for (icb=0; icb<P_block_size; icb++) {
1016 rtmp=temp_val[icb];
1017 for (irb=0; irb<row_block_size; irb++) {
1018 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1019 }
1020 }
1021
1022 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1023 for (ib=0; ib<block_size; ib++) {
1024 temp_val[ib] += RAP_val[ib];
1025 }
1026 }
1027 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1028 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1029 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1030 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1031 for (irb=0; irb<row_block_size; irb++)
1032 for (icb=0; icb<col_block_size; icb++)
1033 RAP_val[irb+row_block_size*icb]=ZERO;
1034 for (icb=0; icb<P_block_size; icb++) {
1035 rtmp=temp_val[icb];
1036 for (irb=0; irb<row_block_size; irb++) {
1037 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1038 }
1039 }
1040
1041 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1042 for (ib=0; ib<block_size; ib++) {
1043 temp_val[ib] += RAP_val[ib];
1044 }
1045 }
1046 }
1047 }
1048 }
1049 RAP_ext_ptr[i_r+1] = sum;
1050 }
1051 delete[] P_marker;
1052 delete[] Pcouple_to_Pext;
1053
1054 /* Now we have the part of RAP[r,c] whose rows "r" are given by the
1055 column list of P->col_coupleBlock and whose columns "c" may cover
1056 the whole column range of the system matrix P. This part of the
1057 data is to be passed to the neighbouring processors and added to
1058 the corresponding RAP[r,c] entries held by those neighbouring
1059 processors */
1060 Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1061 &RAP_ext_val, global_id_P, block_size);
1062
1063 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1064 sum = RAP_ext_ptr[num_RAPext_rows];
1065 num_RAPext_cols = 0;
1066 if (num_Pext_cols || sum > 0) {
1067 temp = new index_t[num_Pext_cols+sum];
1068 j1_ub = offset + num_Pmain_cols;
1069 for (i=0, iptr=0; i<sum; i++) {
1070 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1071 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1072 }
1073 for (j=0; j<num_Pext_cols; j++, iptr++){
1074 temp[iptr] = global_id_P[j];
1075 }
1076
1077 if (iptr) {
1078 qsort(temp, (size_t)iptr, sizeof(index_t), util::comparIndex);
1079 num_RAPext_cols = 1;
1080 i = temp[0];
1081 for (j=1; j<iptr; j++) {
1082 if (temp[j] > i) {
1083 i = temp[j];
1084 temp[num_RAPext_cols++] = i;
1085 }
1086 }
1087 }
1088 }
1089
1090 /* build global_id_RAP and compress the column indices of RAP_ext */
1091 if(num_RAPext_cols){
1092 global_id_RAP = new index_t[num_RAPext_cols];
1093 #pragma omp parallel for private(i) schedule(static)
1094 for (i=0; i<num_RAPext_cols; i++)
1095 global_id_RAP[i] = temp[i];
1096 }
1097 if (num_Pext_cols || sum > 0)
1098 delete[] temp;
1099 j1_ub = offset + num_Pmain_cols;
1100 #pragma omp parallel for private(i, where_p) schedule(static)
1101 for (i=0; i<sum; i++) {
1102 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1103 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1104 /*XXX*/ num_RAPext_cols, sizeof(index_t), util::comparIndex);
1105 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1106 } else
1107 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1108 }
1109
1110 /* build the mapping */
1111 if (num_Pcouple_cols) {
1112 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1113 iptr = 0;
1114 for (i=0; i<num_RAPext_cols; i++)
1115 if (global_id_RAP[i] == P->global_id[iptr]) {
1116 Pcouple_to_RAP[iptr++] = i;
1117 if (iptr == num_Pcouple_cols) break;
1118 }
1119 }
1120
1121 if (num_Pext_cols) {
1122 Pext_to_RAP = new index_t[num_Pext_cols];
1123 iptr = 0;
1124 for (i=0; i<num_RAPext_cols; i++)
1125 if (global_id_RAP[i] == global_id_P[iptr]) {
1126 Pext_to_RAP[iptr++] = i;
1127 if (iptr == num_Pext_cols) break;
1128 }
1129 }
1130
1131 if (global_id_P){
1132 delete[] global_id_P;
1133 global_id_P = NULL;
1134 }
1135
1136 /* allocate and initialise the markers */
1137 sum = num_RAPext_cols + num_Pmain_cols;
1138 P_marker = new index_t[sum];
1139 #pragma omp parallel for private(i) schedule(static)
1140 for (i=0; i<sum; i++) P_marker[i] = -1;
1141 #pragma omp parallel for private(i) schedule(static)
1142 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1143
1144 /* Now, count the size of RAP. Start with rows in R_main */
1145 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1146 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1147 shared = P->col_coupler->connector->send->shared;
1148 i = 0;
1149 j = 0;
1150 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1151 /* Mark the start of row for both main block and couple block */
1152 row_marker = i;
1153 row_marker_ext = j;
1154
1155 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1156 in RAP_ext */
1157 P_marker[i_r] = i;
1158 i++;
1159 for (j1=0; j1<num_neighbors; j1++) {
1160 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1161 if (shared[j2] == i_r) {
1162 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1163 i_c = RAP_ext_idx[k];
1164 if (i_c < num_Pmain_cols) {
1165 if (P_marker[i_c] < row_marker) {
1166 P_marker[i_c] = i;
1167 i++;
1168 }
1169 } else {
1170 if (P_marker[i_c] < row_marker_ext) {
1171 P_marker[i_c] = j;
1172 j++;
1173 }
1174 }
1175 }
1176 break;
1177 }
1178 }
1179 }
1180
1181 /* then, loop over elements in row i_r of R_main */
1182 j1_ub = R_main->pattern->ptr[i_r+1];
1183 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1184 i1 = R_main->pattern->index[j1];
1185
1186 /* then, loop over elements in row i1 of A->col_coupleBlock */
1187 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1188 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1189 i2 = A->col_coupleBlock->pattern->index[j2];
1190
1191 /* check whether entry RA[i_r, i2] has been previously visited.
1192 A new RAP entry is possible only if entry RA[i_r, i2] has not
1193 been visited yet */
1194 if (A_marker[i2] != i_r) {
1195 /* first, mark entry RA[i_r,i2] as visited */
1196 A_marker[i2] = i_r;
1197
1198 /* then loop over elements in row i2 of P_ext_main */
1199 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1200 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1201 i_c = P->row_coupleBlock->pattern->index[j3];
1202
1203 /* check whether entry RAP[i_r,i_c] has been created.
1204 If not yet created, create the entry and increase
1205 the total number of elements in RAP_ext */
1206 if (P_marker[i_c] < row_marker) {
1207 P_marker[i_c] = i;
1208 i++;
1209 }
1210 }
1211
1212 /* loop over elements in row i2 of P_ext_couple, do the same */
1213 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1214 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1215 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1216 if (P_marker[i_c] < row_marker_ext) {
1217 P_marker[i_c] = j;
1218 j++;
1219 }
1220 }
1221 }
1222 }
1223
1224 /* now loop over elements in row i1 of A->mainBlock, repeat
1225 the process we have done to block A->col_coupleBlock */
1226 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1227 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1228 i2 = A->mainBlock->pattern->index[j2];
1229 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1230 A_marker[i2 + num_Acouple_cols] = i_r;
1231 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1232 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1233 i_c = P->mainBlock->pattern->index[j3];
1234 if (P_marker[i_c] < row_marker) {
1235 P_marker[i_c] = i;
1236 i++;
1237 }
1238 }
1239 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1240 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1241 /* note that we need to map the column index in
1242 P->col_coupleBlock back into the column index in
1243 P_ext_couple */
1244 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1245 if (P_marker[i_c] < row_marker_ext) {
1246 P_marker[i_c] = j;
1247 j++;
1248 }
1249 }
1250 }
1251 }
1252 }
1253 }
1254
1255 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1256 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1257 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1258 RAP_couple */
1259 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1260 RAP_main_idx = new index_t[i];
1261 RAP_main_val = new double[i * block_size];
1262 RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1263 RAP_couple_idx = new index_t[j];
1264 RAP_couple_val = new double[j * block_size];
1265
1266 RAP_main_ptr[num_Pmain_cols] = i;
1267 RAP_couple_ptr[num_Pmain_cols] = j;
1268
1269 #pragma omp parallel for private(i) schedule(static)
1270 for (i=0; i<sum; i++) P_marker[i] = -1;
1271 #pragma omp parallel for private(i) schedule(static)
1272 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1273
1274 /* Now, fill in the data for RAP_main and RAP_couple. Start with rows
1275 in R_main */
1276 i = 0;
1277 j = 0;
1278 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1279 /* Mark the start of row for both main block and couple block */
1280 row_marker = i;
1281 row_marker_ext = j;
1282 RAP_main_ptr[i_r] = row_marker;
1283 RAP_couple_ptr[i_r] = row_marker_ext;
1284
1285 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1286 in row i_r of RAP_ext */
1287 P_marker[i_r] = i;
1288 for (ib=0; ib<block_size; ib++)
1289 RAP_main_val[i*block_size+ib] = ZERO;
1290 RAP_main_idx[i] = i_r;
1291 i++;
1292
1293 for (j1=0; j1<num_neighbors; j1++) {
1294 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1295 if (shared[j2] == i_r) {
1296 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1297 i_c = RAP_ext_idx[k];
1298 if (i_c < num_Pmain_cols) {
1299 if (P_marker[i_c] < row_marker) {
1300 P_marker[i_c] = i;
1301 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1302 RAP_main_idx[i] = i_c;
1303 i++;
1304 } else {
1305 t1_val = &(RAP_ext_val[k*block_size]);
1306 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1307 for (ib=0; ib<block_size; ib++)
1308 t2_val[ib] += t1_val[ib];
1309 }
1310 } else {
1311 if (P_marker[i_c] < row_marker_ext) {
1312 P_marker[i_c] = j;
1313 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1314 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1315 j++;
1316 } else {
1317 t1_val = &(RAP_ext_val[k*block_size]);
1318 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1319 for (ib=0; ib<block_size; ib++)
1320 t2_val[ib] += t1_val[ib];
1321 }
1322 }
1323 }
1324 break;
1325 }
1326 }
1327 }
1328
1329 /* then, loop over elements in row i_r of R_main */
1330 j1_ub = R_main->pattern->ptr[i_r+1];
1331 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1332 i1 = R_main->pattern->index[j1];
1333 R_val = &(R_main->val[j1*P_block_size]);
1334
1335 /* then, loop over elements in row i1 of A->col_coupleBlock */
1336 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1337 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1338 i2 = A->col_coupleBlock->pattern->index[j2];
1339 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1340 for (irb=0; irb<row_block_size; irb++)
1341 for (icb=0; icb<col_block_size; icb++)
1342 RA_val[irb+row_block_size*icb]=ZERO;
1343 for (irb=0; irb<P_block_size; irb++) {
1344 rtmp=R_val[irb];
1345 for (icb=0; icb<col_block_size; icb++) {
1346 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1347 }
1348 }
1349
1350
1351 /* check whether entry RA[i_r, i2] has been previously visited.
1352 A new RAP entry is possible only if entry RA[i_r, i2] has not
1353 been visited yet */
1354 if (A_marker[i2] != i_r) {
1355 /* first, mark entry RA[i_r,i2] as visited */
1356 A_marker[i2] = i_r;
1357
1358 /* then loop over elements in row i2 of P_ext_main */
1359 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1360 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1361 i_c = P->row_coupleBlock->pattern->index[j3];
1362 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1363 for (irb=0; irb<row_block_size; irb++)
1364 for (icb=0; icb<col_block_size; icb++)
1365 RAP_val[irb+row_block_size*icb]=ZERO;
1366 for (icb=0; icb<P_block_size; icb++) {
1367 rtmp=temp_val[icb];
1368 for (irb=0; irb<row_block_size; irb++) {
1369 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1370 }
1371 }
1372
1373
1374 /* check whether entry RAP[i_r,i_c] has been created.
1375 If not yet created, create the entry and increase
1376 the total number of elements in RAP_ext */
1377 if (P_marker[i_c] < row_marker) {
1378 P_marker[i_c] = i;
1379 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1380 RAP_main_idx[i] = i_c;
1381 i++;
1382 } else {
1383 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1384 for (ib=0; ib<block_size; ib++) {
1385 temp_val[ib] += RAP_val[ib];
1386 }
1387 }
1388 }
1389
1390 /* loop over elements in row i2 of P_ext_couple, do the same */
1391 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1392 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1393 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1394 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1395 for (irb=0; irb<row_block_size; irb++)
1396 for (icb=0; icb<col_block_size; icb++)
1397 RAP_val[irb+row_block_size*icb]=ZERO;
1398 for (icb=0; icb<P_block_size; icb++) {
1399 rtmp=temp_val[icb];
1400 for (irb=0; irb<row_block_size; irb++) {
1401 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1402 }
1403 }
1404
1405 if (P_marker[i_c] < row_marker_ext) {
1406 P_marker[i_c] = j;
1407 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1408 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1409 j++;
1410 } else {
1411 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1412 for (ib=0; ib<block_size; ib++) {
1413 temp_val[ib] += RAP_val[ib];
1414 }
1415 }
1416 }
1417
1418 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1419 Only the contributions are added. */
1420 } else {
1421 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1422 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1423 i_c = P->row_coupleBlock->pattern->index[j3];
1424 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1425 for (irb=0; irb<row_block_size; irb++)
1426 for (icb=0; icb<col_block_size; icb++)
1427 RAP_val[irb+row_block_size*icb]=ZERO;
1428 for (icb=0; icb<P_block_size; icb++) {
1429 rtmp=temp_val[icb];
1430 for (irb=0; irb<row_block_size; irb++) {
1431 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1432 }
1433 }
1434
1435 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1436 for (ib=0; ib<block_size; ib++) {
1437 temp_val[ib] += RAP_val[ib];
1438 }
1439 }
1440 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1441 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1442 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1443 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1444 for (irb=0; irb<row_block_size; irb++)
1445 for (icb=0; icb<col_block_size; icb++)
1446 RAP_val[irb+row_block_size*icb]=ZERO;
1447 for (icb=0; icb<P_block_size; icb++) {
1448 rtmp=temp_val[icb];
1449 for (irb=0; irb<row_block_size; irb++) {
1450 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1451 }
1452 }
1453
1454 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1455 for (ib=0; ib<block_size; ib++) {
1456 temp_val[ib] += RAP_val[ib];
1457 }
1458 }
1459 }
1460 }
1461
1462 /* now loop over elements in row i1 of A->mainBlock, repeat
1463 the process we have done to block A->col_coupleBlock */
1464 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1465 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1466 i2 = A->mainBlock->pattern->index[j2];
1467 temp_val = &(A->mainBlock->val[j2*block_size]);
1468 for (irb=0; irb<row_block_size; irb++)
1469 for (icb=0; icb<col_block_size; icb++)
1470 RA_val[irb+row_block_size*icb]=ZERO;
1471 for (irb=0; irb<P_block_size; irb++) {
1472 rtmp=R_val[irb];
1473 for (icb=0; icb<col_block_size; icb++) {
1474 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1475 }
1476 }
1477
1478 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1479 A_marker[i2 + num_Acouple_cols] = i_r;
1480 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1481 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1482 i_c = P->mainBlock->pattern->index[j3];
1483 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1484 for (irb=0; irb<row_block_size; irb++)
1485 for (icb=0; icb<col_block_size; icb++)
1486 RAP_val[irb+row_block_size*icb]=ZERO;
1487 for (icb=0; icb<P_block_size; icb++) {
1488 rtmp=temp_val[icb];
1489 for (irb=0; irb<row_block_size; irb++) {
1490 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1491 }
1492 }
1493 if (P_marker[i_c] < row_marker) {
1494 P_marker[i_c] = i;
1495 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1496 RAP_main_idx[i] = i_c;
1497 i++;
1498 } else {
1499 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1500 for (ib=0; ib<block_size; ib++) {
1501 temp_val[ib] += RAP_val[ib];
1502 }
1503 }
1504 }
1505 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1506 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1507 /* note that we need to map the column index in
1508 P->col_coupleBlock back into the column index in
1509 P_ext_couple */
1510 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1511 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1512 for (irb=0; irb<row_block_size; irb++)
1513 for (icb=0; icb<col_block_size; icb++)
1514 RAP_val[irb+row_block_size*icb]=ZERO;
1515 for (icb=0; icb<P_block_size; icb++) {
1516 rtmp=temp_val[icb];
1517 for (irb=0; irb<row_block_size; irb++) {
1518 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1519 }
1520 }
1521
1522 if (P_marker[i_c] < row_marker_ext) {
1523 P_marker[i_c] = j;
1524 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1525 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1526 j++;
1527 } else {
1528 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1529 for (ib=0; ib<block_size; ib++) {
1530 temp_val[ib] += RAP_val[ib];
1531 }
1532 }
1533 }
1534
1535 } else {
1536 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1537 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1538 i_c = P->mainBlock->pattern->index[j3];
1539 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1540 for (irb=0; irb<row_block_size; irb++)
1541 for (icb=0; icb<col_block_size; icb++)
1542 RAP_val[irb+row_block_size*icb]=ZERO;
1543 for (icb=0; icb<P_block_size; icb++) {
1544 rtmp=temp_val[icb];
1545 for (irb=0; irb<row_block_size; irb++) {
1546 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1547 }
1548 }
1549
1550 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1551 for (ib=0; ib<block_size; ib++) {
1552 temp_val[ib] += RAP_val[ib];
1553 }
1554 }
1555 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1556 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1557 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1558 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1559 for (irb=0; irb<row_block_size; irb++)
1560 for (icb=0; icb<col_block_size; icb++)
1561 RAP_val[irb+row_block_size*icb]=ZERO;
1562 for (icb=0; icb<P_block_size; icb++) {
1563 rtmp = temp_val[icb];
1564 for (irb=0; irb<row_block_size; irb++) {
1565 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1566 }
1567 }
1568
1569 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1570 for (ib=0; ib<block_size; ib++) {
1571 temp_val[ib] += RAP_val[ib];
1572 }
1573 }
1574 }
1575 }
1576 }
1577
1578 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1579 if (i > row_marker) {
1580 offset = i - row_marker;
1581 temp = new index_t[offset];
1582 #pragma omp parallel for schedule(static) private(iptr)
1583 for (iptr=0; iptr<offset; iptr++)
1584 temp[iptr] = RAP_main_idx[row_marker+iptr];
1585 if (offset > 0) {
1586 qsort(temp, (size_t)offset, sizeof(index_t), util::comparIndex);
1587 }
1588 temp_val = new double[offset * block_size];
1589 #pragma omp parallel for schedule(static) private(iptr,k)
1590 for (iptr=0; iptr<offset; iptr++){
1591 k = P_marker[temp[iptr]];
1592 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1593 P_marker[temp[iptr]] = iptr + row_marker;
1594 }
1595 #pragma omp parallel for schedule(static) private(iptr)
1596 for (iptr=0; iptr<offset; iptr++){
1597 RAP_main_idx[row_marker+iptr] = temp[iptr];
1598 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1599 }
1600 delete[] temp;
1601 delete[] temp_val;
1602 }
1603 if (j > row_marker_ext) {
1604 offset = j - row_marker_ext;
1605 temp = new index_t[offset];
1606 #pragma omp parallel for schedule(static) private(iptr)
1607 for (iptr=0; iptr<offset; iptr++)
1608 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1609 if (offset > 0) {
1610 qsort(temp, (size_t)offset, sizeof(index_t), util::comparIndex);
1611 }
1612 temp_val = new double[offset * block_size];
1613 #pragma omp parallel for schedule(static) private(iptr, k)
1614 for (iptr=0; iptr<offset; iptr++){
1615 k = P_marker[temp[iptr] + num_Pmain_cols];
1616 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1617 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1618 }
1619 #pragma omp parallel for schedule(static) private(iptr)
1620 for (iptr=0; iptr<offset; iptr++){
1621 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1622 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1623 }
1624 delete[] temp;
1625 delete[] temp_val;
1626 }
1627 }
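 /* Illustrative sketch (not part of the build): the sort step above orders the
    column indices of each freshly assembled row and permutes the block values to
    match. A minimal stand-alone version of the same idea, assuming index_t and
    dim_t are the integer typedefs used in this file and a hypothetical helper
    name sortRowBlocks, could look like:

        #include <algorithm>
        #include <cstring>
        #include <vector>

        void sortRowBlocks(index_t* idx, double* val, dim_t n, dim_t block_size)
        {
            std::vector<index_t> perm(n);
            for (dim_t i = 0; i < n; i++) perm[i] = i;
            // order the positions of this row by their column index
            std::sort(perm.begin(), perm.end(),
                      [idx](index_t a, index_t b) { return idx[a] < idx[b]; });
            std::vector<index_t> idx2(n);
            std::vector<double> val2(n * block_size);
            for (dim_t i = 0; i < n; i++) {
                idx2[i] = idx[perm[i]];
                std::memcpy(&val2[i*block_size], &val[perm[i]*block_size],
                            block_size*sizeof(double));
            }
            std::copy(idx2.begin(), idx2.end(), idx);
            std::copy(val2.begin(), val2.end(), val);
        }

    The code above instead reuses P_marker to remember where each column's block
    currently sits, which avoids building an explicit permutation array. */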
1628
1629 delete[] RA_val;
1630 delete[] RAP_val;
1631 delete[] A_marker;
1632 delete[] Pext_to_RAP;
1633 delete[] Pcouple_to_RAP;
1634 delete[] RAP_ext_ptr;
1635 delete[] RAP_ext_idx;
1636 delete[] RAP_ext_val;
1637 R_main.reset();
1638 R_couple.reset();
1639
1640 /* Check whether there are empty columns in RAP_couple */
1641 #pragma omp parallel for schedule(static) private(i)
1642 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1643 /* the number of non-empty columns is accumulated in "k" */
1644 k = 0;
1645 j = RAP_couple_ptr[num_Pmain_cols];
1646 for (i=0; i<j; i++) {
1647 i1 = RAP_couple_idx[i];
1648 if (P_marker[i1]) {
1649 P_marker[i1] = 0;
1650 k++;
1651 }
1652 }
1653
1654 /* empty columns were found: compress them out and renumber the rest */
1655 if (k < num_RAPext_cols) {
1656 temp = new index_t[k];
1657 k = 0;
1658 for (i=0; i<num_RAPext_cols; i++)
1659 if (!P_marker[i]) {
1660 P_marker[i] = k;
1661 temp[k] = global_id_RAP[i];
1662 k++;
1663 }
1664 #pragma omp parallel for schedule(static) private(i, i1)
1665 for (i=0; i<j; i++) {
1666 i1 = RAP_couple_idx[i];
1667 RAP_couple_idx[i] = P_marker[i1];
1668 }
1669 num_RAPext_cols = k;
1670 delete[] global_id_RAP;
1671 global_id_RAP = temp;
1672 }
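 /* Illustrative note (not part of the build): the block above removes columns of
    RAP_couple that received no entries. The general pattern, for an index array
    "idx" of length nnz over "ncols" columns with one global id per column, is:

        std::vector<index_t> new_id(ncols, -1);                 // -1 == column unused
        for (dim_t i = 0; i < nnz; i++) new_id[idx[i]] = 0;     // mark used columns
        dim_t k = 0;
        for (dim_t c = 0; c < ncols; c++)
            if (new_id[c] == 0) {                               // keep used columns only
                new_id[c] = k;
                global_id[k] = global_id[c];
                k++;
            }
        for (dim_t i = 0; i < nnz; i++) idx[i] = new_id[idx[i]];
        ncols = k;                                              // only k columns remain

    Here P_marker plays the role of new_id and "temp" receives the surviving
    global ids. */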
1673 delete[] P_marker;
1674
1675 /******************************************************/
1676 /* Start to create the coarse level System Matrix A_c */
1677 /******************************************************/
1678 /* first, prepare the sender/receiver for the col_connector */
1679 dist = P->pattern->input_distribution->first_component;
1680 recv_len = new dim_t[size];
1681 send_len = new dim_t[size];
1682 neighbor = new Esys_MPI_rank[size];
1683 offsetInShared = new index_t[size+1];
1684 shared = new index_t[num_RAPext_cols];
1685 memset(recv_len, 0, sizeof(dim_t) * size);
1686 memset(send_len, 0, sizeof(dim_t) * size);
1687 num_neighbors = 0;
1688 offsetInShared[0] = 0;
1689 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1690 shared[i] = i + num_Pmain_cols;
1691 if (k <= global_id_RAP[i]) {
1692 if (recv_len[j] > 0) {
1693 neighbor[num_neighbors] = j;
1694 num_neighbors ++;
1695 offsetInShared[num_neighbors] = i;
1696 }
1697 while (k <= global_id_RAP[i]) {
1698 j++;
1699 k = dist[j+1];
1700 }
1701 }
1702 recv_len[j] ++;
1703 }
1704 if (recv_len[j] > 0) {
1705 neighbor[num_neighbors] = j;
1706 num_neighbors ++;
1707 offsetInShared[num_neighbors] = i;
1708 }
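 /* Illustrative note (not part of the build): global_id_RAP is sorted, so the
    receiver lists can be built in a single sweep against the cumulative column
    distribution "dist", where rank q owns the global ids in [dist[q], dist[q+1]).
    A stand-alone version of the owner lookup, under that assumption and with a
    hypothetical name ownerOfColumn, is simply:

        int ownerOfColumn(const index_t* dist, int size, index_t global_col)
        {
            int q = 0;
            // advance past ranks whose range ends at or before this id
            while (q < size && dist[q+1] <= global_col) q++;
            return q;
        }

    Because the ids are visited in increasing order, the lookup only ever moves
    forward, which is what the j/k bookkeeping in the loop above exploits. */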
1709
1710 SharedComponents_ptr recv(new SharedComponents(
1711 num_Pmain_cols, num_neighbors, neighbor, shared,
1712 offsetInShared, 1, 0, mpi_info));
1713
1714 #ifdef ESYS_MPI
1715 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1716 #endif
1717
1718 #ifdef ESYS_MPI
1719 mpi_requests=new MPI_Request[size*2];
1720 mpi_stati=new MPI_Status[size*2];
1721 #else
1722 mpi_requests=new int[size*2];
1723 mpi_stati=new int[size*2];
1724 #endif
1725 num_neighbors = 0;
1726 j = 0;
1727 offsetInShared[0] = 0;
1728 for (i=0; i<size; i++) {
1729 if (send_len[i] > 0) {
1730 neighbor[num_neighbors] = i;
1731 num_neighbors ++;
1732 j += send_len[i];
1733 offsetInShared[num_neighbors] = j;
1734 }
1735 }
1736 delete[] shared;
1737 shared = new index_t[j];
1738 for (i=0, j=0; i<num_neighbors; i++) {
1739 k = neighbor[i];
1740 #ifdef ESYS_MPI
1741 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1742 mpi_info->msg_tag_counter+k,
1743 mpi_info->comm, &mpi_requests[i]);
1744 #endif
1745 j += send_len[k];
1746 }
1747 for (i=0, j=0; i<recv->numNeighbors; i++) {
1748 k = recv->neighbor[i];
1749 #ifdef ESYS_MPI
1750 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1751 mpi_info->msg_tag_counter+rank,
1752 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1753 #endif
1754 j += recv_len[k];
1755 }
1756 #ifdef ESYS_MPI
1757 MPI_Waitall(num_neighbors + recv->numNeighbors,
1758 mpi_requests, mpi_stati);
1759 #endif
1760 ESYS_MPI_INC_COUNTER(*mpi_info, size);
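 /* Illustrative sketch (assumes ESYS_MPI; buffer and count names below are made
    up): the exchange above follows the usual pattern of posting all non-blocking
    receives first, then the matching sends, then one MPI_Waitall, and finally
    bumping the shared tag counter so later exchanges use fresh tags:

        std::vector<MPI_Request> req(nrecv + nsend);
        for (int p = 0; p < nrecv; p++)                          // post receives first
            MPI_Irecv(recv_buf[p], recv_count[p], MPI_INT, recv_rank[p],
                      tag_base + recv_rank[p], comm, &req[p]);
        for (int p = 0; p < nsend; p++)                          // then the sends
            MPI_Issend(send_buf[p], send_count[p], MPI_INT, send_rank[p],
                       tag_base + my_rank, comm, &req[nrecv + p]);
        MPI_Waitall(nrecv + nsend, req.data(), MPI_STATUSES_IGNORE);
        tag_base += comm_size;                                   // cf. ESYS_MPI_INC_COUNTER

    Tagging with "tag_base + sender rank" keeps messages from different exchanges
    and different sources from being confused. */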
1761
1762 j = offsetInShared[num_neighbors];
1763 offset = dist[rank];
1764 #pragma omp parallel for schedule(static) private(i)
1765 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1766
1767 SharedComponents_ptr send(new SharedComponents(
1768 num_Pmain_cols, num_neighbors, neighbor, shared,
1769 offsetInShared, 1, 0, mpi_info));
1770
1771 col_connector.reset(new Connector(send, recv));
1772 delete[] shared;
1773
1774 /* now, create the row distribution (output_distribution) and the column
1775 distribution (input_distribution) */
1776 input_dist.reset(new Distribution(mpi_info, dist, 1, 0));
1777 output_dist.reset(new Distribution(mpi_info, dist, 1, 0));
1778
1779 /* then, prepare the sender/receiver for the row_connector; first, gather
1780 the information for the sender */
1781 sum = RAP_couple_ptr[num_Pmain_cols];
1782 len = new dim_t[size];
1783 send_ptr = new index_t*[size];
1784 send_idx = new index_t*[size];
1785 #pragma omp parallel for schedule(static) private(i)
1786 for (i=0; i<size; i++) {
1787 send_ptr[i] = new index_t[num_Pmain_cols];
1788 send_idx[i] = new index_t[sum];
1789 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1790 }
1791 memset(len, 0, sizeof(dim_t) * size);
1792 recv = col_connector->recv;
1793 sum=0;
1794 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1795 i1 = RAP_couple_ptr[i_r];
1796 i2 = RAP_couple_ptr[i_r+1];
1797 if (i2 > i1) {
1798 /* then row i_r will be in the sender of row_connector; now check
1799 how many neighbours i_r needs to be sent to */
1800 for (j=i1; j<i2; j++) {
1801 i_c = RAP_couple_idx[j];
1802 /* find out the corresponding neighbor "p" of column i_c */
1803 for (p=0; p<recv->numNeighbors; p++) {
1804 if (i_c < recv->offsetInShared[p+1]) {
1805 k = recv->neighbor[p];
1806 if (send_ptr[k][i_r] == 0) sum++;
1807 send_ptr[k][i_r] ++;
1808 send_idx[k][len[k]] = global_id_RAP[i_c];
1809 len[k] ++;
1810 break;
1811 }
1812 }
1813 }
1814 }
1815 }
1816 if (global_id_RAP) {
1817 delete[] global_id_RAP;
1818 global_id_RAP = NULL;
1819 }
1820
1821 /* now allocate the sender */
1822 shared = new index_t[sum];
1823 memset(send_len, 0, sizeof(dim_t) * size);
1824 num_neighbors=0;
1825 offsetInShared[0] = 0;
1826 for (p=0, k=0; p<size; p++) {
1827 for (i=0; i<num_Pmain_cols; i++) {
1828 if (send_ptr[p][i] > 0) {
1829 shared[k] = i;
1830 k++;
1831 send_ptr[p][send_len[p]] = send_ptr[p][i];
1832 send_len[p]++;
1833 }
1834 }
1835 if (k > offsetInShared[num_neighbors]) {
1836 neighbor[num_neighbors] = p;
1837 num_neighbors ++;
1838 offsetInShared[num_neighbors] = k;
1839 }
1840 }
1841 send.reset(new SharedComponents(num_Pmain_cols, num_neighbors,
1842 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1843
1844 /* send/recv the number of rows to be sent from the current proc;
1845 recover the info for the receiver of row_connector from the sender */
1846 #ifdef ESYS_MPI
1847 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1848 #endif
1849 num_neighbors = 0;
1850 offsetInShared[0] = 0;
1851 j = 0;
1852 for (i=0; i<size; i++) {
1853 if (i != rank && recv_len[i] > 0) {
1854 neighbor[num_neighbors] = i;
1855 num_neighbors ++;
1856 j += recv_len[i];
1857 offsetInShared[num_neighbors] = j;
1858 }
1859 }
1860 delete[] shared;
1861 delete[] recv_len;
1862 shared = new index_t[j];
1863 k = offsetInShared[num_neighbors];
1864 #pragma omp parallel for schedule(static) private(i)
1865 for (i=0; i<k; i++) {
1866 shared[i] = i + num_Pmain_cols;
1867 }
1868 recv.reset(new SharedComponents(num_Pmain_cols, num_neighbors,
1869 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1870 row_connector.reset(new Connector(send, recv));
1871 delete[] shared;
1872
1873 /* send/recv pattern->ptr for rowCoupleBlock */
1874 num_RAPext_rows = offsetInShared[num_neighbors];
1875 row_couple_ptr = new index_t[num_RAPext_rows+1];
1876 for (p=0; p<num_neighbors; p++) {
1877 j = offsetInShared[p];
1878 i = offsetInShared[p+1];
1879 #ifdef ESYS_MPI
1880 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1881 mpi_info->msg_tag_counter+neighbor[p],
1882 mpi_info->comm, &mpi_requests[p]);
1883 #endif
1884 }
1885 send = row_connector->send;
1886 for (p=0; p<send->numNeighbors; p++) {
1887 #ifdef ESYS_MPI
1888 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1889 MPI_INT, send->neighbor[p],
1890 mpi_info->msg_tag_counter+rank,
1891 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1892 #endif
1893 }
1894 #ifdef ESYS_MPI
1895 MPI_Waitall(num_neighbors + send->numNeighbors,
1896 mpi_requests, mpi_stati);
1897 #endif
1898 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1899 delete[] send_len;
1900
1901 sum = 0;
1902 for (i=0; i<num_RAPext_rows; i++) {
1903 k = row_couple_ptr[i];
1904 row_couple_ptr[i] = sum;
1905 sum += k;
1906 }
1907 row_couple_ptr[num_RAPext_rows] = sum;
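 /* Illustrative note (not part of the build): the loop above turns the received
    per-row counts into CSR offsets with an exclusive prefix sum,

        index_t sum = 0;
        for (dim_t i = 0; i < n; i++) { index_t c = ptr[i]; ptr[i] = sum; sum += c; }
        ptr[n] = sum;   // total number of entries

    so that row i occupies positions [ptr[i], ptr[i+1]) of the index/value arrays
    that are exchanged next. */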
1908
1909 /* send/recv pattern->index for rowCoupleBlock */
1910 k = row_couple_ptr[num_RAPext_rows];
1911 row_couple_idx = new index_t[k];
1912 for (p=0; p<num_neighbors; p++) {
1913 j1 = row_couple_ptr[offsetInShared[p]];
1914 j2 = row_couple_ptr[offsetInShared[p+1]];
1915 #ifdef ESYS_MPI
1916 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1917 mpi_info->msg_tag_counter+neighbor[p],
1918 mpi_info->comm, &mpi_requests[p]);
1919 #endif
1920 }
1921 for (p=0; p<send->numNeighbors; p++) {
1922 #ifdef ESYS_MPI
1923 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1924 MPI_INT, send->neighbor[p],
1925 mpi_info->msg_tag_counter+rank,
1926 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1927 #endif
1928 }
1929 #ifdef ESYS_MPI
1930 MPI_Waitall(num_neighbors + send->numNeighbors,
1931 mpi_requests, mpi_stati);
1932 #endif
1933 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1934
1935 offset = input_dist->first_component[rank];
1936 k = row_couple_ptr[num_RAPext_rows];
1937 #pragma omp parallel for schedule(static) private(i)
1938 for (i=0; i<k; i++) {
1939 row_couple_idx[i] -= offset;
1940 }
1941 #pragma omp parallel for schedule(static) private(i)
1942 for (i=0; i<size; i++) {
1943 delete[] send_ptr[i];
1944 delete[] send_idx[i];
1945 }
1946 delete[] send_ptr;
1947 delete[] send_idx;
1948 delete[] len;
1949 delete[] offsetInShared;
1950 delete[] neighbor;
1951 delete[] mpi_requests;
1952 delete[] mpi_stati;
1953
1954
1955 /* Now, we can create the patterns for mainBlock and the two couple blocks */
1956 Pattern_ptr main_pattern(new Pattern(MATRIX_FORMAT_DEFAULT,
1957 num_Pmain_cols, num_Pmain_cols, RAP_main_ptr, RAP_main_idx));
1958 Pattern_ptr col_couple_pattern(new Pattern(
1959 MATRIX_FORMAT_DEFAULT, num_Pmain_cols, num_RAPext_cols,
1960 RAP_couple_ptr, RAP_couple_idx));
1961 Pattern_ptr row_couple_pattern(new Pattern(
1962 MATRIX_FORMAT_DEFAULT, num_RAPext_rows, num_Pmain_cols,
1963 row_couple_ptr, row_couple_idx));
1964
1965 /* next, create the system matrix */
1966 pattern.reset(new SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1967 output_dist, input_dist, main_pattern, col_couple_pattern,
1968 row_couple_pattern, col_connector, row_connector));
1969 out.reset(new SystemMatrix(A->type, pattern, row_block_size,
1970 col_block_size, false));
1971
1972 /* finally, fill in the data */
1973 memcpy(out->mainBlock->val, RAP_main_val,
1974 out->mainBlock->len* sizeof(double));
1975 memcpy(out->col_coupleBlock->val, RAP_couple_val,
1976 out->col_coupleBlock->len * sizeof(double));
1977
1978 delete[] RAP_main_val;
1979 delete[] RAP_couple_val;
1980 if (!Esys_noError()) {
1981 out.reset();
1982 }
1983 return out;
1984 }
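 /* Illustrative sketch (not part of the build): both interpolation-operator
    builders repeat the same dense kernel on small blocks stored column-major:
    RA = R_block * A_block followed by RAP = RA * P_block. A stand-alone version
    of one such multiply, under the same column-major layout assumption and with
    the hypothetical name blockGEMM, is:

        // C (nrow x ncol) = A (nrow x k) * B (k x ncol), all column-major
        void blockGEMM(double* C, const double* A, const double* B,
                       int nrow, int k, int ncol)
        {
            for (int ic = 0; ic < ncol; ic++) {
                for (int ir = 0; ir < nrow; ir++) {
                    double s = 0.0;
                    for (int ib = 0; ib < k; ib++)
                        s += A[ir + nrow*ib] * B[ib + k*ic];
                    C[ir + nrow*ic] = s;
                }
            }
        }

    In the functions above nrow == row_block_size, ncol == col_block_size, and the
    result block is either copied into RAP_*_val (new entry) or accumulated onto
    an existing block. */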
1985
1986
1987 SystemMatrix_ptr Preconditioner_AMG_buildInterpolationOperatorBlock(
1988 SystemMatrix_ptr A, SystemMatrix_ptr P,
1989 SystemMatrix_ptr R)
1990 {
1991 esysUtils::JMPI& mpi_info=A->mpi_info;
1992 SystemMatrix_ptr out;
1993 SystemMatrixPattern_ptr pattern;
1994 Distribution_ptr input_dist, output_dist;
1995 SharedComponents_ptr send, recv;
1996 Connector_ptr col_connector, row_connector;
1997 const dim_t row_block_size=A->row_block_size;
1998 const dim_t col_block_size=A->col_block_size;
1999 const dim_t block_size = A->block_size;
2000 const double ZERO = 0.0;
2001 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2002 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2003 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2004 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2005 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2006 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2007 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2008 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2009 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2010 index_t sum, i, j, k, iptr, irb, icb, ib;
2011 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2012 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2013 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2014 index_t row_marker_ext, *where_p=NULL;
2015 index_t **send_ptr=NULL, **send_idx=NULL;
2016 dim_t p, num_neighbors;
2017 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2018 Esys_MPI_rank *neighbor=NULL;
2019 #ifdef ESYS_MPI
2020 MPI_Request* mpi_requests=NULL;
2021 MPI_Status* mpi_stati=NULL;
2022 #else
2023 int *mpi_requests=NULL, *mpi_stati=NULL;
2024 #endif
2025
2026 /* two sparse matrices R_main and R_couple will be generated as the
2027 transposes of P_main and P_col_couple, respectively. Note that
2028 R_couple is actually the row_coupleBlock of R (R=P^T) */
2029 SparseMatrix_ptr R_main(P->mainBlock->getTranspose());
2030 SparseMatrix_ptr R_couple(P->col_coupleBlock->getTranspose());
2031
2032 /* generate P_ext, i.e. portion of P that is stored on neighbor procs
2033 and needed locally for triple matrix product RAP
2034 to be specific, P_ext (on processor k) is the group of rows in P, where
2035 the list of rows from processor q is given by
2036 A->col_coupler->connector->send->shared[rPtr...]
2037 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2038 on q.
2039 P_ext is represented by two sparse matrices P_ext_main and
2040 P_ext_couple */
2041 Preconditioner_AMG_extendB(A, P);
2042
2043 /* count the number of cols in P_ext_couple, resize the pattern of
2044 sparse matrix P_ext_couple with new compressed order, and then
2045 build the col id mapping from P->col_coupleBlock to
2046 P_ext_couple */
2047 num_Pmain_cols = P->mainBlock->numCols;
2048 num_Pcouple_cols = P->col_coupleBlock->numCols;
2049 num_Acouple_cols = A->col_coupleBlock->numCols;
2050 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2051 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2052 offset = P->col_distribution->first_component[rank];
2053 num_Pext_cols = 0;
2054 if (P->global_id) {
2055 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2056 and P->col_coupleBlock */
2057 iptr = 0;
2058 if (num_Pcouple_cols || sum > 0) {
2059 temp = new index_t[num_Pcouple_cols+sum];
2060 for (; iptr<sum; iptr++){
2061 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2062 }
2063 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2064 temp[iptr] = P->global_id[j];
2065 }
2066 }
2067 if (iptr) {
2068 qsort(temp, (size_t)iptr, sizeof(index_t), util::comparIndex);
2069 num_Pext_cols = 1;
2070 i = temp[0];
2071 for (j=1; j<iptr; j++) {
2072 if (temp[j] > i) {
2073 i = temp[j];
2074 temp[num_Pext_cols++] = i;
2075 }
2076 }
2077 }
2078
2079 /* resize the pattern of P_ext_couple */
2080 if(num_Pext_cols){
2081 global_id_P = new index_t[num_Pext_cols];
2082 for (i=0; i<num_Pext_cols; i++)
2083 global_id_P[i] = temp[i];
2084 }
2085 if (num_Pcouple_cols || sum > 0)
2086 delete[] temp;
2087 for (i=0; i<sum; i++) {
2088 where_p = (index_t *)bsearch(
2089 &(P->remote_coupleBlock->pattern->index[i]),
2090 global_id_P, num_Pext_cols,
2091 sizeof(index_t), util::comparIndex);
2092 P->remote_coupleBlock->pattern->index[i] =
2093 (index_t)(where_p -global_id_P);
2094 }
2095
2096 /* build the mapping */
2097 if (num_Pcouple_cols) {
2098 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2099 iptr = 0;
2100 for (i=0; i<num_Pext_cols; i++)
2101 if (global_id_P[i] == P->global_id[iptr]) {
2102 Pcouple_to_Pext[iptr++] = i;
2103 if (iptr == num_Pcouple_cols) break;
2104 }
2105 }
2106 }
2107
2108 /* allocate and initialise the markers */
2109 sum = num_Pext_cols + num_Pmain_cols;
2110 P_marker = new index_t[sum];
2111 A_marker = new index_t[num_A_cols];
2112 #pragma omp parallel for private(i) schedule(static)
2113 for (i=0; i<sum; i++) P_marker[i] = -1;
2114 #pragma omp parallel for private(i) schedule(static)
2115 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2116
2117 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2118 sum = 0;
2119 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2120 row_marker = sum;
2121 /* then, loop over elements in row i_r of R_couple */
2122 j1_ub = R_couple->pattern->ptr[i_r+1];
2123 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2124 i1 = R_couple->pattern->index[j1];
2125 /* then, loop over elements in row i1 of A->col_coupleBlock */
2126 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2127 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2128 i2 = A->col_coupleBlock->pattern->index[j2];
2129
2130 /* check whether entry RA[i_r, i2] has been visited before.
2131 A new RAP entry is possible only if entry RA[i_r, i2] has not
2132 been visited yet */
2133 if (A_marker[i2] != i_r) {
2134 /* first, mark entry RA[i_r,i2] as visited */
2135 A_marker[i2] = i_r;
2136
2137 /* then loop over elements in row i2 of P_ext_main */
2138 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2139 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2140 i_c = P->row_coupleBlock->pattern->index[j3];
2141
2142 /* check whether entry RAP[i_r,i_c] has been created.
2143 If not yet created, create the entry and increase
2144 the total number of elements in RAP_ext */
2145 if (P_marker[i_c] < row_marker) {
2146 P_marker[i_c] = sum;
2147 sum++;
2148 }
2149 }
2150
2151 /* loop over elements in row i2 of P_ext_couple, do the same */
2152 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2153 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2154 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2155 if (P_marker[i_c] < row_marker) {
2156 P_marker[i_c] = sum;
2157 sum++;
2158 }
2159 }
2160 }
2161 }
2162
2163 /* now loop over elements in row i1 of A->mainBlock, repeat
2164 the process we have done to block A->col_coupleBlock */
2165 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2166 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2167 i2 = A->mainBlock->pattern->index[j2];
2168 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2169 A_marker[i2 + num_Acouple_cols] = i_r;
2170 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2171 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2172 i_c = P->mainBlock->pattern->index[j3];
2173 if (P_marker[i_c] < row_marker) {
2174 P_marker[i_c] = sum;
2175 sum++;
2176 }
2177 }
2178 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2179 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2180 /* note that we need to map the column index in
2181 P->col_coupleBlock back into the column index in
2182 P_ext_couple */
2183 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2184 if (P_marker[i_c] < row_marker) {
2185 P_marker[i_c] = sum;
2186 sum++;
2187 }
2188 }
2189 }
2190 }
2191 }
2192 }
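 /* Illustrative note (not part of the build): the counting pass above is the
    usual symbolic phase of a sparse triple product. P_marker answers "has column
    c already been counted for the current row?" without clearing the array
    between rows: a column is new for row i_r iff P_marker[c] < row_marker, where
    row_marker is the value of "sum" at the start of the row. In isolation, with
    a hypothetical helper name countRow:

        // count distinct columns contributed by one sparse row, marker-style
        dim_t countRow(index_t row_marker, index_t& sum, index_t* P_marker,
                       const index_t* cols, dim_t ncols)
        {
            for (dim_t j = 0; j < ncols; j++) {
                const index_t c = cols[j];
                if (P_marker[c] < row_marker) { P_marker[c] = sum; sum++; }
            }
            return sum - row_marker;
        }

    A_marker plays the same role one level up, ensuring each row of P is expanded
    at most once per row of R. */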
2193
2194 /* Now we have the number of non-zero elements in RAP_ext, allocate
2195 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2196 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2197 RAP_ext_idx = new index_t[sum];
2198 RAP_ext_val = new double[sum * block_size];
2199 RA_val = new double[block_size];
2200 RAP_val = new double[block_size];
2201
2202 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2203 sum = num_Pext_cols + num_Pmain_cols;
2204 #pragma omp parallel for private(i) schedule(static)
2205 for (i=0; i<sum; i++) P_marker[i] = -1;
2206 #pragma omp parallel for private(i) schedule(static)
2207 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2208 sum = 0;
2209 RAP_ext_ptr[0] = 0;
2210 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2211 row_marker = sum;
2212 /* then, loop over elements in row i_r of R_couple */
2213 j1_ub = R_couple->pattern->ptr[i_r+1];
2214 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2215 i1 = R_couple->pattern->index[j1];
2216 R_val = &(R_couple->val[j1*block_size]);
2217
2218 /* then, loop over elements in row i1 of A->col_coupleBlock */
2219 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2220 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2221 i2 = A->col_coupleBlock->pattern->index[j2];
2222 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2223 for (irb=0; irb<row_block_size; irb++) {
2224 for (icb=0; icb<col_block_size; icb++) {
2225 rtmp = ZERO;
2226 for (ib=0; ib<col_block_size; ib++) {
2227 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2228 }
2229 RA_val[irb+row_block_size*icb]=rtmp;
2230 }
2231 }
2232
2233 /* check whether entry RA[i_r, i2] has been visited before.
2234 A new RAP entry is possible only if entry RA[i_r, i2] has not
2235 been visited yet */
2236 if (A_marker[i2] != i_r) {
2237 /* first, mark entry RA[i_r,i2] as visited */
2238 A_marker[i2] = i_r;
2239
2240 /* then loop over elements in row i2 of P_ext_main */
2241 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2242 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2243 i_c = P->row_coupleBlock->pattern->index[j3];
2244 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2245 for (irb=0; irb<row_block_size; irb++) {
2246 for (icb=0; icb<col_block_size; icb++) {
2247 rtmp = ZERO;
2248 for (ib=0; ib<col_block_size; ib++) {
2249 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2250 }
2251 RAP_val[irb+row_block_size*icb]=rtmp;
2252 }
2253 }
2254
2255
2256 /* check whether entry RAP[i_r,i_c] has been created.
2257 If not yet created, create the entry and increase
2258 the total number of elements in RAP_ext */
2259 if (P_marker[i_c] < row_marker) {
2260 P_marker[i_c] = sum;
2261 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2262 RAP_ext_idx[sum] = i_c + offset;
2263 sum++;
2264 } else {
2265 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2266 for (ib=0; ib<block_size; ib++) {
2267 temp_val[ib] += RAP_val[ib];
2268 }
2269 }
2270 }
2271
2272 /* loop over elements in row i2 of P_ext_couple, do the same */
2273 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2274 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2275 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2276 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2277 for (irb=0; irb<row_block_size; irb++) {
2278 for (icb=0; icb<col_block_size; icb++) {
2279 rtmp = ZERO;
2280 for (ib=0; ib<col_block_size; ib++) {
2281 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2282 }
2283 RAP_val[irb+row_block_size*icb]=rtmp;
2284 }
2285 }
2286
2287 if (P_marker[i_c] < row_marker) {
2288 P_marker[i_c] = sum;
2289 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2290 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2291 sum++;
2292 } else {
2293 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2294 for (ib=0; ib<block_size; ib++) {
2295 temp_val[ib] += RAP_val[ib];
2296 }
2297 }
2298 }
2299
2300 /* If entry RA[i_r, i2] has already been visited, no new RAP entry is
2301 created; only the contributions are added. */
2302 } else {
2303 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2304 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2305 i_c = P->row_coupleBlock->pattern->index[j3];
2306 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2307 for (irb=0; irb<row_block_size; irb++) {
2308 for (icb=0; icb<col_block_size; icb++) {
2309 rtmp = ZERO;
2310 for (ib=0; ib<col_block_size; ib++) {
2311 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2312 }
2313 RAP_val[irb+row_block_size*icb]=rtmp;
2314 }
2315 }
2316
2317 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2318 for (ib=0; ib<block_size; ib++) {
2319 temp_val[ib] += RAP_val[ib];
2320 }
2321 }
2322 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2323 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2324 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2325 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2326 for (irb=0; irb<row_block_size; irb++) {
2327 for (icb=0; icb<col_block_size; icb++) {
2328 rtmp = ZERO;
2329 for (ib=0; ib<col_block_size; ib++) {
2330 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2331 }
2332 RAP_val[irb+row_block_size*icb]=rtmp;
2333 }
2334 }
2335
2336 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2337 for (ib=0; ib<block_size; ib++) {
2338 temp_val[ib] += RAP_val[ib];
2339 }
2340 }
2341 }
2342 }
2343
2344 /* now loop over elements in row i1 of A->mainBlock, repeat
2345 the process we have done to block A->col_coupleBlock */
2346 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2347 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2348 i2 = A->mainBlock->pattern->index[j2];
2349 temp_val = &(A->mainBlock->val[j2*block_size]);
2350 for (irb=0; irb<row_block_size; irb++) {
2351 for (icb=0; icb<col_block_size; icb++) {
2352 rtmp = ZERO;
2353 for (ib=0; ib<col_block_size; ib++) {
2354 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2355 }
2356 RA_val[irb+row_block_size*icb]=rtmp;
2357 }
2358 }
2359
2360 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2361 A_marker[i2 + num_Acouple_cols] = i_r;
2362 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2363 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2364 i_c = P->mainBlock->pattern->index[j3];
2365 temp_val = &(P->mainBlock->val[j3*block_size]);
2366 for (irb=0; irb<row_block_size; irb++) {
2367 for (icb=0; icb<col_block_size; icb++) {
2368 rtmp = ZERO;
2369 for (ib=0; ib<col_block_size; ib++) {
2370 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2371 }
2372 RAP_val[irb+row_block_size*icb]=rtmp;
2373 }
2374 }
2375
2376 if (P_marker[i_c] < row_marker) {
2377 P_marker[i_c] = sum;
2378 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2379 RAP_ext_idx[sum] = i_c + offset;
2380 sum++;
2381 } else {
2382 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2383 for (ib=0; ib<block_size; ib++) {
2384 temp_val[ib] += RAP_val[ib];
2385 }
2386 }
2387 }
2388 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2389 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2390 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2391 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2392 for (irb=0; irb<row_block_size; irb++) {
2393 for (icb=0; icb<col_block_size; icb++) {
2394 rtmp = ZERO;
2395 for (ib=0; ib<col_block_size; ib++) {
2396 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2397 }
2398 RAP_val[irb+row_block_size*icb]=rtmp;
2399 }
2400 }
2401
2402 if (P_marker[i_c] < row_marker) {
2403 P_marker[i_c] = sum;
2404 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2405 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2406 sum++;
2407 } else {
2408 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2409 for (ib=0; ib<block_size; ib++) {
2410 temp_val[ib] += RAP_val[ib];
2411 }
2412 }
2413 }
2414 } else {
2415 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2416 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2417 i_c = P->mainBlock->pattern->index[j3];
2418 temp_val = &(P->mainBlock->val[j3*block_size]);
2419 for (irb=0; irb<row_block_size; irb++) {
2420 for (icb=0; icb<col_block_size; icb++) {
2421 rtmp = ZERO;
2422 for (ib=0; ib<col_block_size; ib++) {
2423 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2424 }
2425 RAP_val[irb+row_block_size*icb]=rtmp;
2426 }
2427 }
2428
2429 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2430 for (ib=0; ib<block_size; ib++) {
2431 temp_val[ib] += RAP_val[ib];
2432 }
2433 }
2434 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2435 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2436 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2437 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2438 for (irb=0; irb<row_block_size; irb++) {
2439 for (icb=0; icb<col_block_size; icb++) {
2440 rtmp = ZERO;
2441 for (ib=0; ib<col_block_size; ib++) {
2442 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2443 }
2444 RAP_val[irb+row_block_size*icb]=rtmp;
2445 }
2446 }
2447
2448 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2449 for (ib=0; ib<block_size; ib++) {
2450 temp_val[ib] += RAP_val[ib];
2451 }
2452 }
2453 }
2454 }
2455 }
2456 RAP_ext_ptr[i_r+1] = sum;
2457 }
2458 delete[] P_marker;
2459 delete[] Pcouple_to_Pext;
2460
2461 /* Now we have the part of RAP[r,c] where the rows "r" are given by
2462 the column list of P->col_coupleBlock, and the columns "c" may
2463 cover the whole column range of the system matrix P. This part
2464 of the data is to be passed to the neighbouring processors and
2465 added to the corresponding RAP[r,c] entries on those
2466 processors */
2467 Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2468 &RAP_ext_val, global_id_P, block_size);
2469
2470 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2471 sum = RAP_ext_ptr[num_RAPext_rows];
2472 num_RAPext_cols = 0;
2473 if (num_Pext_cols || sum > 0) {
2474 temp = new index_t[num_Pext_cols+sum];
2475 j1_ub = offset + num_Pmain_cols;
2476 for (i=0, iptr=0; i<sum; i++) {
2477 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2478 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2479 }
2480 for (j=0; j<num_Pext_cols; j++, iptr++){
2481 temp[iptr] = global_id_P[j];
2482 }
2483
2484 if (iptr) {
2485 qsort(temp, (size_t)iptr, sizeof(index_t), util::comparIndex);
2486 num_RAPext_cols = 1;
2487 i = temp[0];
2488 for (j=1; j<iptr; j++) {
2489 if (temp[j] > i) {
2490 i = temp[j];
2491 temp[num_RAPext_cols++] = i;
2492 }
2493 }
2494 }
2495 }
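 /* Illustrative note (not part of the build): the block above builds the sorted
    list of distinct external column ids by concatenating the candidates, sorting
    them, and compacting equal neighbours in place. The same effect, written with
    the standard library, would be roughly:

        std::sort(temp, temp + iptr);
        num_RAPext_cols = std::unique(temp, temp + iptr) - temp;

    The hand-written scan is kept here because the rest of the file works with
    qsort/bsearch over index_t and util::comparIndex. */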
2496
2497 /* store the sorted global ids of the external columns of RAP */
2498 if(num_RAPext_cols){
2499 global_id_RAP = new index_t[num_RAPext_cols];
2500 for (i=0; i<num_RAPext_cols; i++)
2501 global_id_RAP[i] = temp[i];
2502 }
2503 if (num_Pext_cols || sum > 0)
2504 delete[] temp;
2505 j1_ub = offset + num_Pmain_cols;
2506 for (i=0; i<sum; i++) {
2507 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2508 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2509 /*XXX*/ num_RAPext_cols, sizeof(index_t), util::comparIndex);
2510 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2511 } else
2512 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2513 }
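 /* Illustrative note (not part of the build): in the remapping above, indices
    owned locally are shifted by "offset" into the main-block range, and every
    other index is looked up in the sorted global_id_RAP list and replaced by its
    position, offset by num_Pmain_cols. A stand-alone version of the lookup,
    assuming global_id_RAP is sorted ascending and using the hypothetical name
    localColumn, is:

        #include <algorithm>

        index_t localColumn(index_t global_col, index_t offset, dim_t nmain,
                            const index_t* global_id, dim_t next)
        {
            if (global_col >= offset && global_col < offset + (index_t)nmain)
                return global_col - offset;                       // main-block column
            const index_t* p = std::lower_bound(global_id, global_id + next,
                                                global_col);
            return (index_t)nmain + (index_t)(p - global_id);     // couple-block column
        }

    */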
2514
2515 /* build the mapping */
2516 if (num_Pcouple_cols) {
2517 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
2518 iptr = 0;
2519 for (i=0; i<num_RAPext_cols; i++)
2520 if (global_id_RAP[i] == P->global_id[iptr]) {
2521 Pcouple_to_RAP[iptr++] = i;
2522 if (iptr == num_Pcouple_cols) break;
2523 }
2524 }
2525
2526 if (num_Pext_cols) {
2527 Pext_to_RAP = new index_t[num_Pext_cols];
2528 iptr = 0;
2529 for (i=0; i<num_RAPext_cols; i++)
2530 if (global_id_RAP[i] == global_id_P[iptr]) {
2531 Pext_to_RAP[iptr++] = i;
2532 if (iptr == num_Pext_cols) break;
2533 }
2534 }
2535
2536 if (global_id_P){
2537 delete[] global_id_P;
2538 global_id_P = NULL;
2539 }
2540
2541 /* allocate and initialise the markers */
2542 sum = num_RAPext_cols + num_Pmain_cols;
2543 P_marker = new index_t[sum];
2544 #pragma omp parallel for private(i) schedule(static)
2545 for (i=0; i<sum; i++) P_marker[i] = -1;
2546 #pragma omp parallel for private(i) schedule(static)
2547 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2548
2549 /* Now, count the size of RAP. Start with rows in R_main */
2550 num_neighbors = P->col_coupler->connector->send->numNeighbors;
2551 offsetInShared = P->col_coupler->connector->send->offsetInShared;
2552 shared = P->col_coupler->connector->send->shared;
2553 i = 0;
2554 j = 0;
2555 for (i_r=0