/[escript]/trunk/paso/src/AMG_Interpolation.cpp

Contents of /trunk/paso/src/AMG_Interpolation.cpp



Revision 4817
Fri Mar 28 08:04:09 2014 UTC (5 years, 7 months ago) by caltinay
File size: 121916 byte(s)
Coupler/Connector shared ptrs.

1 /*****************************************************************************
2 *
3 * Copyright (c) 2003-2014 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development 2012-2013 by School of Earth Sciences
12 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 *
14 *****************************************************************************/
15
16
17 /****************************************************************************/
18
19 /* Paso: defines AMG Interpolation */
20
21 /****************************************************************************/
22
23 /* Author: l.gao@uq.edu.au */
24
25 /****************************************************************************/
26
27 #include "Paso.h"
28 #include "SparseMatrix.h"
29 #include "PasoUtil.h"
30 #include "Preconditioner.h"
31
32 /****************************************************************************
33
34 Methods necessary for AMG preconditioner
35
36 construct n_C x n_C interpolation operator A_C from matrix A
37 and prolongation matrix P.
38
39 The coarsening operator A_C is defined by RAP where R=P^T.
40
41 *****************************************************************************/
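
/* Illustrative sketch only (hypothetical helper, not called by Paso): the
   Galerkin coarse operator A_C = R A P with R = P^T, spelled out for small
   dense row-major matrices. The routines in this file build the same triple
   product for distributed block-sparse system matrices. */
static void Paso_AMG_denseGalerkinSketch(const double* A,  /* n   x n   */
                                         const double* P,  /* n   x n_C */
                                         double* AP,       /* n   x n_C workspace */
                                         double* AC,       /* n_C x n_C result    */
                                         dim_t n, dim_t n_C)
{
    dim_t i, j, k;
    for (i=0; i<n*n_C; i++) AP[i] = 0.;
    for (i=0; i<n_C*n_C; i++) AC[i] = 0.;
    /* AP = A * P */
    for (i=0; i<n; i++)
        for (k=0; k<n; k++)
            for (j=0; j<n_C; j++)
                AP[i*n_C+j] += A[i*n+k] * P[k*n_C+j];
    /* A_C = P^T * AP = R A P */
    for (i=0; i<n_C; i++)
        for (k=0; k<n; k++)
            for (j=0; j<n_C; j++)
                AC[i*n_C+j] += P[k*n_C+i] * AP[k*n_C+j];
}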
42
43 /* Extend the system matrix B with two extra sparse matrices:
44 B_ext_main and B_ext_couple.
45 Together, these two sparse matrices represent
46 the portion of B that is stored on neighbour procs and
47 needed locally for triple matrix product (B^T) A B.
48
49 FOR NOW, we assume that the system matrix B has a NULL
50 row_coupleBlock and a NULL remote_coupleBlock. Based on
51 this assumption, we use the link row_coupleBlock for the sparse
52 matrix B_ext_main, and the link remote_coupleBlock for the sparse
53 matrix B_ext_couple.
54
55 To be specific, B_ext (on processor k) is a group of rows
56 in B, where the list of rows from processor q is given by
57 A->col_coupler->connector->send->shared[rPtr...]
58 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
59 on q. */
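
/* Minimal sketch of the column classification used when packing rows for a
   neighbour in the loops below: a global column id that falls into the
   neighbour's main column range [m_lb, m_ub) goes to B_ext_main with a local
   index, everything else goes to B_ext_couple keeping the global index.
   Hypothetical helper, shown only to document the convention. */
static void Paso_AMG_classifyColumnSketch(index_t m, index_t m_lb, index_t m_ub,
                                          index_t* idx_m, index_t* l_m,
                                          index_t* idx_c, index_t* l_c)
{
    if (m >= m_lb && m < m_ub) {
        idx_m[(*l_m)++] = m - m_lb;   /* main part: local column on the neighbour */
    } else {
        idx_c[(*l_c)++] = m;          /* couple part: keep the global column id */
    }
}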
60 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
61 {
62 paso::Pattern *pattern_main=NULL, *pattern_couple=NULL;
63 paso::Coupler_ptr coupler;
64 paso::SharedComponents_ptr send, recv;
65 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
66 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
67 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
68 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
69 index_t *idx_m=NULL, *idx_c=NULL;
70 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
71 index_t offset, len, block_size, block_size_size, max_num_cols;
72 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
73 dim_t *recv_degree=NULL, *send_degree=NULL;
74 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
75
76 if (size == 1) return;
77
78 if (B->row_coupleBlock) {
79 paso::SparseMatrix_free(B->row_coupleBlock);
80 B->row_coupleBlock = NULL;
81 }
82
83 if (B->row_coupleBlock || B->remote_coupleBlock) {
84 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
85 return;
86 }
87
88 /* send/receive the unknowns' global IDs */
89 num_main_cols = B->mainBlock->numCols;
90 cols = new double[num_main_cols];
91 offset = B->col_distribution->first_component[rank];
92 #pragma omp parallel for private(i) schedule(static)
93 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
94 if (B->global_id == NULL) {
95 coupler.reset(new paso::Coupler(B->col_coupler->connector, 1));
96 coupler->startCollect(cols);
97 }
98
99 recv_buf = new index_t[size];
100 recv_degree = new dim_t[size];
101 recv_offset = new index_t[size+1];
102 #pragma omp parallel for private(i) schedule(static)
103 for (i=0; i<size; i++){
104 recv_buf[i] = 0;
105 recv_degree[i] = 1;
106 recv_offset[i] = i;
107 }
108
109 block_size = B->block_size;
110 block_size_size = block_size * sizeof(double);
111 num_couple_cols = B->col_coupleBlock->numCols;
112 send = A->col_coupler->connector->send;
113 recv = A->col_coupler->connector->recv;
114 num_neighbors = send->numNeighbors;
115 p = send->offsetInShared[num_neighbors];
116 len = p * B->col_distribution->first_component[size];
117 send_buf = new double[len * block_size];
118 send_idx = new index_t[len];
119 send_offset = new index_t[(p+1)*2];
120 send_degree = new dim_t[num_neighbors];
121 i = num_main_cols + num_couple_cols;
122 send_m = new double[i * block_size];
123 send_c = new double[i * block_size];
124 idx_m = new index_t[i];
125 idx_c = new index_t[i];
126
127 /* wait for the unknowns' global IDs to arrive */
128 if (B->global_id == NULL) {
129 coupler->finishCollect();
130 global_id = new index_t[num_couple_cols];
131 #pragma omp parallel for private(i) schedule(static)
132 for (i=0; i<num_couple_cols; ++i)
133 global_id[i] = coupler->recv_buffer[i];
134 B->global_id = global_id;
135 } else
136 global_id = B->global_id;
137
138 /* distribute the number of cols in current col_coupleBlock to all ranks */
139 #ifdef ESYS_MPI
140 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
141 #endif
142
143 /* distribute global_ids of cols to be considered to all ranks*/
144 len = 0;
145 max_num_cols = 0;
146 for (i=0; i<size; i++){
147 recv_degree[i] = recv_buf[i];
148 recv_offset[i] = len;
149 len += recv_buf[i];
150 if (max_num_cols < recv_buf[i])
151 max_num_cols = recv_buf[i];
152 }
153 recv_offset[size] = len;
154 cols_array = new index_t[len];
155 #ifdef ESYS_MPI
156 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
157 #endif
158
159 /* first, prepare the ptr_ptr to be received */
160 q = recv->numNeighbors;
161 len = recv->offsetInShared[q];
162 ptr_ptr = new index_t[(len+1) * 2];
163 for (p=0; p<q; p++) {
164 row = recv->offsetInShared[p];
165 m = recv->offsetInShared[p + 1];
166 #ifdef ESYS_MPI
167 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
168 A->mpi_info->msg_tag_counter+recv->neighbor[p],
169 A->mpi_info->comm,
170 &(A->col_coupler->mpi_requests[p]));
171 #endif
172 }
173
174 /* now prepare the rows to be sent (the degree, the offset and the data) */
175 len = 0;
176 i0 = 0;
177 for (p=0; p<num_neighbors; p++) {
178 i = i0;
179 neighbor = send->neighbor[p];
180 m_lb = B->col_distribution->first_component[neighbor];
181 m_ub = B->col_distribution->first_component[neighbor + 1];
182 j_ub = send->offsetInShared[p + 1];
183 for (j=send->offsetInShared[p]; j<j_ub; j++) {
184 row = send->shared[j];
185 l_m = 0;
186 l_c = 0;
187 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
188 k_lb = B->col_coupleBlock->pattern->ptr[row];
189
190 /* check part of col_coupleBlock for data to be passed @row */
191 for (k=k_lb; k<k_ub; k++) {
192 m = global_id[B->col_coupleBlock->pattern->index[k]];
193 if (m > offset) break;
194 if (m>= m_lb && m < m_ub) {
195 /* data to be passed to sparse matrix B_ext_main */
196 idx_m[l_m] = m - m_lb;
197 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
198 l_m++;
199 } else {
200 /* data to be passed to sparse matrix B_ext_couple */
201 idx_c[l_c] = m;
202 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
203 l_c++;
204 }
205 }
206 k_lb = k;
207
208 /* check mainBlock for data to be passed @row to sparse
209 matrix B_ext_couple */
210 k_ub = B->mainBlock->pattern->ptr[row + 1];
211 k = B->mainBlock->pattern->ptr[row];
212 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
213 for (; k<k_ub; k++) {
214 m = B->mainBlock->pattern->index[k] + offset;
215 idx_c[l_c] = m;
216 l_c++;
217 }
218
219 /* check the remaining part of col_coupleBlock for data to
220 be passed @row to sparse matrix B_ext_couple */
221 k = k_lb;
222 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
223 for (k=k_lb; k<k_ub; k++) {
224 m = global_id[B->col_coupleBlock->pattern->index[k]];
225 if (m>= m_lb && m < m_ub) {
226 /* data to be passed to sparse matrix B_ext_main */
227 idx_m[l_m] = m - m_lb;
228 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
229 l_m++;
230 } else {
231 /* data to be passed to sparse matrix B_ext_couple */
232 idx_c[l_c] = m;
233 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
234 l_c++;
235 }
236 }
237
238 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
239 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
240 send_offset[2*i] = l_m;
241 len += l_m;
242 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
243 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
244 send_offset[2*i+1] = l_c;
245 len += l_c;
246 i++;
247 }
248
249 /* sending */
250 #ifdef ESYS_MPI
251 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
252 A->mpi_info->msg_tag_counter+rank,
253 A->mpi_info->comm,
254 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
255 #endif
256 send_degree[p] = len;
257 i0 = i;
258 }
259 delete[] send_m;
260 delete[] send_c;
261 delete[] idx_m;
262 delete[] idx_c;
263
264
265 q = recv->numNeighbors;
266 len = recv->offsetInShared[q];
267 ptr_main = new index_t[(len+1)];
268 ptr_couple = new index_t[(len+1)];
269
270 #ifdef ESYS_MPI
271 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
272 A->col_coupler->mpi_requests,
273 A->col_coupler->mpi_stati);
274 #endif
275 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
276
277 j = 0;
278 k = 0;
279 ptr_main[0] = 0;
280 ptr_couple[0] = 0;
281 for (i=0; i<len; i++) {
282 j += ptr_ptr[2*i];
283 k += ptr_ptr[2*i+1];
284 ptr_main[i+1] = j;
285 ptr_couple[i+1] = k;
286 }
287
288 delete[] ptr_ptr;
289 idx_main = new index_t[j];
290 idx_couple = new index_t[k];
291 ptr_idx = new index_t[j+k];
292 ptr_val = new double[(j+k) * block_size];
293
294 /* send/receive index array */
295 j=0;
296 k_ub = 0;
297 for (p=0; p<recv->numNeighbors; p++) {
298 k = recv->offsetInShared[p];
299 m = recv->offsetInShared[p+1];
300 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
301 if (i > 0) {
302 k_ub ++;
303 #ifdef ESYS_MPI
304 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
305 A->mpi_info->msg_tag_counter+recv->neighbor[p],
306 A->mpi_info->comm,
307 &(A->col_coupler->mpi_requests[p]));
308 #endif
309 }
310 j += i;
311 }
312
313 j=0;
314 k_ub = 0;
315 for (p=0; p<num_neighbors; p++) {
316 i = send_degree[p] - j;
317 if (i > 0){
318 k_ub ++;
319 #ifdef ESYS_MPI
320 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
321 A->mpi_info->msg_tag_counter+rank,
322 A->mpi_info->comm,
323 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
324 #endif
325 }
326 j = send_degree[p];
327 }
328
329 #ifdef ESYS_MPI
330 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
331 A->col_coupler->mpi_requests,
332 A->col_coupler->mpi_stati);
333 #endif
334 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
335
336 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
337 for (i=0; i<len; i++) {
338 j = ptr_main[i];
339 k = ptr_main[i+1];
340 m = ptr_couple[i];
341 for (p=j; p<k; p++) {
342 idx_main[p] = ptr_idx[m+p];
343 }
344 j = ptr_couple[i+1];
345 for (p=m; p<j; p++) {
346 idx_couple[p] = ptr_idx[k+p];
347 }
348 }
349 delete[] ptr_idx;
350
351 /* allocate pattern and sparse matrix for B_ext_main */
352 pattern_main = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
353 len, num_main_cols, ptr_main, idx_main);
354 B->row_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
355 pattern_main, B->row_block_size, B->col_block_size,
356 FALSE);
357 paso::Pattern_free(pattern_main);
358
359 /* allocate pattern and sparse matrix for B_ext_couple */
360 pattern_couple = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
361 len, B->col_distribution->first_component[size],
362 ptr_couple, idx_couple);
363 B->remote_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
364 pattern_couple, B->row_block_size, B->col_block_size,
365 FALSE);
366 paso::Pattern_free(pattern_couple);
367
368 /* send/receive value array */
369 j=0;
370 for (p=0; p<recv->numNeighbors; p++) {
371 k = recv->offsetInShared[p];
372 m = recv->offsetInShared[p+1];
373 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
374 #ifdef ESYS_MPI
375 if (i > 0)
376 MPI_Irecv(&(ptr_val[j]), i * block_size,
377 MPI_DOUBLE, recv->neighbor[p],
378 A->mpi_info->msg_tag_counter+recv->neighbor[p],
379 A->mpi_info->comm,
380 &(A->col_coupler->mpi_requests[p]));
381 #endif
382 j += (i * block_size);
383 }
384
385 j=0;
386 for (p=0; p<num_neighbors; p++) {
387 i = send_degree[p] - j;
388 #ifdef ESYS_MPI
389 if (i > 0)
390 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
391 A->mpi_info->msg_tag_counter+rank,
392 A->mpi_info->comm,
393 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
394 #endif
395 j = send_degree[p] ;
396 }
397
398 #ifdef ESYS_MPI
399 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
400 A->col_coupler->mpi_requests,
401 A->col_coupler->mpi_stati);
402 #endif
403 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
404
405 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
406 for (i=0; i<len; i++) {
407 j = ptr_main[i];
408 k = ptr_main[i+1];
409 m = ptr_couple[i];
410 for (p=j; p<k; p++) {
411 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
412 }
413 j = ptr_couple[i+1];
414 for (p=m; p<j; p++) {
415 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
416 }
417 }
418 delete[] ptr_val;
419
420
421 /* release all temporary memory allocations */
422 delete[] cols;
423 delete[] cols_array;
424 delete[] recv_offset;
425 delete[] recv_degree;
426 delete[] recv_buf;
427 delete[] send_buf;
428 delete[] send_offset;
429 delete[] send_degree;
430 delete[] send_idx;
431 }
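
#ifdef ESYS_MPI
/* The exchanges above (row offsets, then indices, then values) all follow the
   same non-blocking pattern: post every receive, post every matching send,
   then wait for the whole round. A self-contained sketch of one such round
   with hypothetical neighbour lists and contiguous buffers; the real code
   draws these from the coupler's SharedComponents and varies the tags. */
static void Paso_AMG_exchangeRoundSketch(index_t* recv_buf, const index_t* recv_count,
                                         const int* recv_neighbor, dim_t num_recv,
                                         index_t* send_buf, const index_t* send_count,
                                         const int* send_neighbor, dim_t num_send,
                                         int tag, MPI_Comm comm,
                                         MPI_Request* requests, /* >= num_recv+num_send */
                                         MPI_Status* stati)
{
    dim_t p;
    index_t offset = 0;
    for (p=0; p<num_recv; p++) {      /* post all receives first */
        MPI_Irecv(&recv_buf[offset], recv_count[p], MPI_INT, recv_neighbor[p],
                  tag, comm, &requests[p]);
        offset += recv_count[p];
    }
    offset = 0;
    for (p=0; p<num_send; p++) {      /* then the matching synchronous sends */
        MPI_Issend(&send_buf[offset], send_count[p], MPI_INT, send_neighbor[p],
                   tag, comm, &requests[num_recv+p]);
        offset += send_count[p];
    }
    MPI_Waitall(num_recv+num_send, requests, stati);   /* complete the round */
}
#endif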
432
433 /* As defined, the sparse matrix (let's call it T) given by T(ptr, idx, val)
434 has the same number of rows as P->col_coupleBlock->numCols. Now we need
435 to copy blocks of data in T to the neighbour processors, given by
436 P->col_coupler->connector->recv->neighbor[k] where k is in
437 [0, P->col_coupler->connector->recv->numNeighbors).
438 The rows to be copied to neighbour processor k are in the list defined by
439 P->col_coupler->connector->recv->offsetInShared[k] ...
440 P->col_coupler->connector->recv->offsetInShared[k+1] */
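
/* Inside the routine below, the received per-row degrees are converted into
   CSR row offsets with Paso_Util_cumsum. A minimal serial sketch of that
   conversion, under the assumption that Paso_Util_cumsum performs an in-place
   exclusive prefix sum over the first n entries and returns the total
   (hypothetical helper, for reference only): */
static index_t Paso_AMG_cumsumSketch(dim_t n, index_t* ptr)
{
    index_t sum = 0;
    dim_t i;
    for (i=0; i<n; i++) {
        const index_t degree = ptr[i];
        ptr[i] = sum;          /* offset of row i */
        sum += degree;
    }
    return sum;                /* total number of entries; becomes ptr[n] */
}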
441 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
442 index_t **p_ptr, index_t **p_idx, double **p_val,
443 index_t *global_id, index_t block_size)
444 {
445 paso::SharedComponents_ptr send, recv;
446 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
447 index_t i, j, p, m, n, size;
448 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
449 index_t *ptr=*p_ptr, *idx=*p_idx;
450 double *val=*p_val, *recv_val=NULL;
451 #ifdef ESYS_MPI
452 index_t rank = P->mpi_info->rank;
453 #endif
454
455 size = P->mpi_info->size;
456 send = P->col_coupler->connector->recv;
457 recv = P->col_coupler->connector->send;
458 send_neighbors = send->numNeighbors;
459 recv_neighbors = recv->numNeighbors;
460 send_rows = P->col_coupleBlock->numCols;
461 recv_rows = recv->offsetInShared[recv_neighbors];
462
463 send_degree = new index_t[send_rows];
464 recv_ptr = new index_t[recv_rows + 1];
465 #pragma omp parallel for schedule(static) private(i)
466 for (i=0; i<send_rows; i++)
467 send_degree[i] = ptr[i+1] - ptr[i];
468
469 /* First, send/receive the degree */
470 for (p=0; p<recv_neighbors; p++) { /* Receiving */
471 m = recv->offsetInShared[p];
472 n = recv->offsetInShared[p+1];
473 #ifdef ESYS_MPI
474 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
475 P->mpi_info->msg_tag_counter + recv->neighbor[p],
476 P->mpi_info->comm,
477 &(P->col_coupler->mpi_requests[p]));
478 #endif
479 }
480 for (p=0; p<send_neighbors; p++) { /* Sending */
481 m = send->offsetInShared[p];
482 n = send->offsetInShared[p+1];
483 #ifdef ESYS_MPI
484 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
485 P->mpi_info->msg_tag_counter + rank,
486 P->mpi_info->comm,
487 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
488 #endif
489 }
490 #ifdef ESYS_MPI
491 MPI_Waitall(send_neighbors+recv_neighbors,
492 P->col_coupler->mpi_requests,
493 P->col_coupler->mpi_stati);
494 #endif
495 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
496
497 delete[] send_degree;
498 m = Paso_Util_cumsum(recv_rows, recv_ptr);
499 recv_ptr[recv_rows] = m;
500 recv_idx = new index_t[m];
501 recv_val = new double[m * block_size];
502
503 /* Next, send/receive the index array */
504 j = 0;
505 for (p=0; p<recv_neighbors; p++) { /* Receiving */
506 m = recv->offsetInShared[p];
507 n = recv->offsetInShared[p+1];
508 i = recv_ptr[n] - recv_ptr[m];
509 if (i > 0) {
510 #ifdef ESYS_MPI
511 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
512 P->mpi_info->msg_tag_counter + recv->neighbor[p],
513 P->mpi_info->comm,
514 &(P->col_coupler->mpi_requests[p]));
515 #endif
516 }
517 j += i;
518 }
519
520 j = 0;
521 for (p=0; p<send_neighbors; p++) { /* Sending */
522 m = send->offsetInShared[p];
523 n = send->offsetInShared[p+1];
524 i = ptr[n] - ptr[m];
525 if (i >0) {
526 #ifdef ESYS_MPI
527 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
528 P->mpi_info->msg_tag_counter + rank,
529 P->mpi_info->comm,
530 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
531 #endif
532 j += i;
533 }
534 }
535 #ifdef ESYS_MPI
536 MPI_Waitall(send_neighbors+recv_neighbors,
537 P->col_coupler->mpi_requests,
538 P->col_coupler->mpi_stati);
539 #endif
540 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
541
542 /* Last, send/receive the data array */
543 j = 0;
544 for (p=0; p<recv_neighbors; p++) { /* Receiving */
545 m = recv->offsetInShared[p];
546 n = recv->offsetInShared[p+1];
547 i = recv_ptr[n] - recv_ptr[m];
548 #ifdef ESYS_MPI
549 if (i > 0)
550 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
551 P->mpi_info->msg_tag_counter + recv->neighbor[p],
552 P->mpi_info->comm,
553 &(P->col_coupler->mpi_requests[p]));
554 #endif
555 j += (i*block_size);
556 }
557
558 j = 0;
559 for (p=0; p<send_neighbors; p++) { /* Sending */
560 m = send->offsetInShared[p];
561 n = send->offsetInShared[p+1];
562 i = ptr[n] - ptr[m];
563 if (i >0) {
564 #ifdef ESYS_MPI
565 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
566 P->mpi_info->msg_tag_counter + rank,
567 P->mpi_info->comm,
568 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
569 #endif
570 j += i * block_size;
571 }
572 }
573 #ifdef ESYS_MPI
574 MPI_Waitall(send_neighbors+recv_neighbors,
575 P->col_coupler->mpi_requests,
576 P->col_coupler->mpi_stati);
577 #endif
578 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
579
580 /* Clean up and return with received ptr, index and data arrays */
581 delete[] ptr;
582 delete[] idx;
583 delete[] val;
584 *p_ptr = recv_ptr;
585 *p_idx = recv_idx;
586 *p_val = recv_val;
587 }
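
/* The interpolation-operator builder below starts from R = P^T, obtained with
   paso::SparseMatrix_getTranspose. For reference, a minimal serial CSR
   transpose for scalar entries (hypothetical helper, shown only to illustrate
   the operation): count the column occurrences, turn them into row offsets of
   the transpose, then scatter the entries. */
static void Paso_AMG_csrTransposeSketch(dim_t nrows, dim_t ncols,
                                        const index_t* ptr, const index_t* idx,
                                        const double* val,
                                        index_t* ptrT, index_t* idxT, double* valT)
{
    dim_t i;
    index_t k;
    for (i=0; i<=ncols; i++) ptrT[i] = 0;
    for (k=0; k<ptr[nrows]; k++) ptrT[idx[k]+1]++;     /* count per column    */
    for (i=0; i<ncols; i++) ptrT[i+1] += ptrT[i];      /* start of each column */
    for (i=0; i<nrows; i++) {
        for (k=ptr[i]; k<ptr[i+1]; k++) {
            const index_t pos = ptrT[idx[k]]++;        /* scatter entry        */
            idxT[pos] = i;
            valT[pos] = val[k];
        }
    }
    for (i=ncols; i>0; i--) ptrT[i] = ptrT[i-1];       /* undo the shift       */
    ptrT[0] = 0;
}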
588
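/* The triple product below is assembled in two passes per row: a symbolic
   pass that counts the entries of row i_r of RAP with the help of marker
   arrays (A_marker, P_marker), followed by a numeric pass that fills them in.
   A simplified serial sketch of the symbolic count for one row of a product
   C = R*A with a single marker array (hypothetical helper, scalar pattern
   only; the real code extends this to the three-factor product and to the
   main/couple block split): */
static dim_t Paso_AMG_countProductRowSketch(index_t i_r,
                                            const index_t* R_ptr, const index_t* R_idx,
                                            const index_t* A_ptr, const index_t* A_idx,
                                            index_t* marker /* initialised to -1 */)
{
    dim_t count = 0;
    index_t j1, j2;
    for (j1=R_ptr[i_r]; j1<R_ptr[i_r+1]; j1++) {
        const index_t i1 = R_idx[j1];                 /* column of R = row of A  */
        for (j2=A_ptr[i1]; j2<A_ptr[i1+1]; j2++) {
            const index_t i2 = A_idx[j2];             /* candidate column of C   */
            if (marker[i2] != i_r) {                  /* not yet seen in row i_r */
                marker[i2] = i_r;
                count++;
            }
        }
    }
    return count;
}
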
589 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
590 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
591 {
592 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
593 paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
594 Paso_SystemMatrix *out=NULL;
595 paso::SystemMatrixPattern *pattern=NULL;
596 paso::Distribution_ptr input_dist, output_dist;
597 paso::Connector_ptr col_connector, row_connector;
598 paso::Pattern *main_pattern=NULL;
599 paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
600 const dim_t row_block_size=A->row_block_size;
601 const dim_t col_block_size=A->col_block_size;
602 const dim_t block_size = A->block_size;
603 const dim_t P_block_size = P->block_size;
604 const double ZERO = 0.0;
605 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
606 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
607 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
608 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
609 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
610 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
611 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
612 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
613 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
614 index_t sum, i, j, k, iptr, irb, icb, ib;
615 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
616 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
617 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
618 index_t row_marker_ext, *where_p=NULL;
619 index_t **send_ptr=NULL, **send_idx=NULL;
620 dim_t p, num_neighbors;
621 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
622 Esys_MPI_rank *neighbor=NULL;
623 #ifdef ESYS_MPI
624 MPI_Request* mpi_requests=NULL;
625 MPI_Status* mpi_stati=NULL;
626 #else
627 int *mpi_requests=NULL, *mpi_stati=NULL;
628 #endif
629
630 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
631 return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
632
633 /* two sparse matrices R_main and R_couple will be generated, as the
634 transposes of P_main and P_col_couple, respectively. Note that
635 R_couple is actually the row_coupleBlock of R (R=P^T) */
636 R_main = paso::SparseMatrix_getTranspose(P->mainBlock);
637 if (size > 1)
638 R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock);
639
640 /* generate P_ext, i.e. the portion of P that is stored on neighbour procs
641 and needed locally for the triple matrix product RAP.
642 To be specific, P_ext (on processor k) is a group of rows in P, where
643 the list of rows from processor q is given by
644 A->col_coupler->connector->send->shared[rPtr...]
645 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
646 on q.
647 P_ext is represented by two sparse matrices P_ext_main and
648 P_ext_couple */
649 Paso_Preconditioner_AMG_extendB(A, P);
650
651 /* count the number of cols in P_ext_couple, resize the pattern of
652 sparse matrix P_ext_couple with new compressed order, and then
653 build the col id mapping from P->col_coupleBlock to
654 P_ext_couple */
655 num_Pmain_cols = P->mainBlock->numCols;
656 if (size > 1) {
657 num_Pcouple_cols = P->col_coupleBlock->numCols;
658 num_Acouple_cols = A->col_coupleBlock->numCols;
659 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
660 } else {
661 num_Pcouple_cols = 0;
662 num_Acouple_cols = 0;
663 sum = 0;
664 }
665 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
666 offset = P->col_distribution->first_component[rank];
667 num_Pext_cols = 0;
668 if (P->global_id) {
669 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
670 and P->col_coupleBlock */
671 iptr = 0;
672 if (num_Pcouple_cols || sum > 0) {
673 temp = new index_t[num_Pcouple_cols+sum];
674 #pragma omp parallel for lastprivate(iptr) schedule(static)
675 for (iptr=0; iptr<sum; iptr++){
676 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
677 }
678 for (j=0; j<num_Pcouple_cols; j++, iptr++){
679 temp[iptr] = P->global_id[j];
680 }
681 }
682 if (iptr) {
683 #ifdef USE_QSORTG
684 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
685 #else
686 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
687 #endif
688 num_Pext_cols = 1;
689 i = temp[0];
690 for (j=1; j<iptr; j++) {
691 if (temp[j] > i) {
692 i = temp[j];
693 temp[num_Pext_cols++] = i;
694 }
695 }
696 }
697 /* resize the pattern of P_ext_couple */
698 if(num_Pext_cols){
699 global_id_P = new index_t[num_Pext_cols];
700 #pragma omp parallel for private(i) schedule(static)
701 for (i=0; i<num_Pext_cols; i++)
702 global_id_P[i] = temp[i];
703 }
704 if (num_Pcouple_cols || sum > 0)
705 delete[] temp;
706 #pragma omp parallel for private(i, where_p) schedule(static)
707 for (i=0; i<sum; i++) {
708 where_p = (index_t *)bsearch(
709 &(P->remote_coupleBlock->pattern->index[i]),
710 global_id_P, num_Pext_cols,
711 sizeof(index_t), paso::comparIndex);
712 P->remote_coupleBlock->pattern->index[i] =
713 (index_t)(where_p -global_id_P);
714 }
715
716 /* build the mapping */
717 if (num_Pcouple_cols) {
718 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
719 iptr = 0;
720 for (i=0; i<num_Pext_cols; i++)
721 if (global_id_P[i] == P->global_id[iptr]) {
722 Pcouple_to_Pext[iptr++] = i;
723 if (iptr == num_Pcouple_cols) break;
724 }
725 }
726 }
727
728 /* allocate and initialise the markers */
729 sum = num_Pext_cols + num_Pmain_cols;
730 P_marker = new index_t[sum];
731 A_marker = new index_t[num_A_cols];
732 #pragma omp parallel for private(i) schedule(static)
733 for (i=0; i<sum; i++) P_marker[i] = -1;
734 #pragma omp parallel for private(i) schedule(static)
735 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
736
737 /* Now, count the size of RAP_ext. Start with rows in R_couple */
738 sum = 0;
739 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
740 row_marker = sum;
741 /* then, loop over elements in row i_r of R_couple */
742 j1_ub = R_couple->pattern->ptr[i_r+1];
743 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
744 i1 = R_couple->pattern->index[j1];
745 /* then, loop over elements in row i1 of A->col_coupleBlock */
746 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
747 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
748 i2 = A->col_coupleBlock->pattern->index[j2];
749
750 /* check whether entry RA[i_r, i2] has been previously visited.
751 RAP new entry is possible only if entry RA[i_r, i2] has not
752 been visited yet */
753 if (A_marker[i2] != i_r) {
754 /* first, mark entry RA[i_r,i2] as visited */
755 A_marker[i2] = i_r;
756
757 /* then loop over elements in row i2 of P_ext_main */
758 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
759 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
760 i_c = P->row_coupleBlock->pattern->index[j3];
761
762 /* check whether entry RAP[i_r,i_c] has been created.
763 If not yet created, create the entry and increase
764 the total number of elements in RAP_ext */
765 if (P_marker[i_c] < row_marker) {
766 P_marker[i_c] = sum;
767 sum++;
768 }
769 }
770
771 /* loop over elements in row i2 of P_ext_couple, do the same */
772 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
773 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
774 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
775 if (P_marker[i_c] < row_marker) {
776 P_marker[i_c] = sum;
777 sum++;
778 }
779 }
780 }
781 }
782
783 /* now loop over elements in row i1 of A->mainBlock, repeat
784 the process we have done to block A->col_coupleBlock */
785 j2_ub = A->mainBlock->pattern->ptr[i1+1];
786 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
787 i2 = A->mainBlock->pattern->index[j2];
788 if (A_marker[i2 + num_Acouple_cols] != i_r) {
789 A_marker[i2 + num_Acouple_cols] = i_r;
790 j3_ub = P->mainBlock->pattern->ptr[i2+1];
791 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
792 i_c = P->mainBlock->pattern->index[j3];
793 if (P_marker[i_c] < row_marker) {
794 P_marker[i_c] = sum;
795 sum++;
796 }
797 }
798 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
799 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
800 /* note that we need to map the column index in
801 P->col_coupleBlock back into the column index in
802 P_ext_couple */
803 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
804 if (P_marker[i_c] < row_marker) {
805 P_marker[i_c] = sum;
806 sum++;
807 }
808 }
809 }
810 }
811 }
812 }
813
814 /* Now we have the number of non-zero elements in RAP_ext, allocate
815 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
816 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
817 RAP_ext_idx = new index_t[sum];
818 RAP_ext_val = new double[sum * block_size];
819 RA_val = new double[block_size];
820 RAP_val = new double[block_size];
821
822 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
823 sum = num_Pext_cols + num_Pmain_cols;
824 #pragma omp parallel for private(i) schedule(static)
825 for (i=0; i<sum; i++) P_marker[i] = -1;
826 #pragma omp parallel for private(i) schedule(static)
827 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
828 sum = 0;
829 RAP_ext_ptr[0] = 0;
830 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
831 row_marker = sum;
832 /* then, loop over elements in row i_r of R_couple */
833 j1_ub = R_couple->pattern->ptr[i_r+1];
834 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
835 i1 = R_couple->pattern->index[j1];
836 R_val = &(R_couple->val[j1*P_block_size]);
837
838 /* then, loop over elements in row i1 of A->col_coupleBlock */
839 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
840 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
841 i2 = A->col_coupleBlock->pattern->index[j2];
842 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
843 for (irb=0; irb<row_block_size; irb++)
844 for (icb=0; icb<col_block_size; icb++)
845 RA_val[irb+row_block_size*icb] = ZERO;
846 for (irb=0; irb<P_block_size; irb++) {
847 rtmp=R_val[irb];
848 for (icb=0; icb<col_block_size; icb++) {
849 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
850 }
851 }
852
853 /* check whether entry RA[i_r, i2] has been previously visited.
854 RAP new entry is possible only if entry RA[i_r, i2] has not
855 been visited yet */
856 if (A_marker[i2] != i_r) {
857 /* first, mark entry RA[i_r,i2] as visited */
858 A_marker[i2] = i_r;
859
860 /* then loop over elements in row i2 of P_ext_main */
861 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
862 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
863 i_c = P->row_coupleBlock->pattern->index[j3];
864 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
865 for (irb=0; irb<row_block_size; irb++)
866 for (icb=0; icb<col_block_size; icb++)
867 RAP_val[irb+row_block_size*icb] = ZERO;
868 for (icb=0; icb<P_block_size; icb++) {
869 rtmp = temp_val[icb];
870 for (irb=0; irb<row_block_size; irb++) {
871 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
872 }
873 }
874
875 /* check whether entry RAP[i_r,i_c] has been created.
876 If not yet created, create the entry and increase
877 the total number of elements in RAP_ext */
878 if (P_marker[i_c] < row_marker) {
879 P_marker[i_c] = sum;
880 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
881 RAP_ext_idx[sum] = i_c + offset;
882 sum++;
883 } else {
884 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
885 for (ib=0; ib<block_size; ib++) {
886 temp_val[ib] += RAP_val[ib];
887 }
888 }
889 }
890
891 /* loop over elements in row i2 of P_ext_couple, do the same */
892 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
893 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
894 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
895 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
896 for (irb=0; irb<row_block_size; irb++)
897 for (icb=0; icb<col_block_size; icb++)
898 RAP_val[irb+row_block_size*icb]=ZERO;
899 for (icb=0; icb<P_block_size; icb++) {
900 rtmp = temp_val[icb];
901 for (irb=0; irb<row_block_size; irb++) {
902 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
903 }
904 }
905
906 if (P_marker[i_c] < row_marker) {
907 P_marker[i_c] = sum;
908 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
909 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
910 sum++;
911 } else {
912 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
913 for (ib=0; ib<block_size; ib++) {
914 temp_val[ib] += RAP_val[ib];
915 }
916 }
917 }
918
919 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
920 Only the contributions are added. */
921 } else {
922 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
923 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
924 i_c = P->row_coupleBlock->pattern->index[j3];
925 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
926 for (irb=0; irb<row_block_size; irb++)
927 for (icb=0; icb<col_block_size; icb++)
928 RAP_val[irb+row_block_size*icb]=ZERO;
929 for (icb=0; icb<P_block_size; icb++) {
930 rtmp = temp_val[icb];
931 for (irb=0; irb<row_block_size; irb++) {
932 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
933 }
934 }
935
936 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
937 for (ib=0; ib<block_size; ib++) {
938 temp_val[ib] += RAP_val[ib];
939 }
940 }
941 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
942 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
943 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
944 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
945 for (irb=0; irb<row_block_size; irb++)
946 for (icb=0; icb<col_block_size; icb++)
947 RAP_val[irb+row_block_size*icb]=ZERO;
948 for (icb=0; icb<P_block_size; icb++) {
949 rtmp = temp_val[icb];
950 for (irb=0; irb<row_block_size; irb++) {
951 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
952 }
953 }
954
955 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
956 for (ib=0; ib<block_size; ib++) {
957 temp_val[ib] += RAP_val[ib];
958 }
959 }
960 }
961 }
962
963 /* now loop over elements in row i1 of A->mainBlock, repeat
964 the process we have done to block A->col_coupleBlock */
965 j2_ub = A->mainBlock->pattern->ptr[i1+1];
966 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
967 i2 = A->mainBlock->pattern->index[j2];
968 temp_val = &(A->mainBlock->val[j2*block_size]);
969 for (irb=0; irb<row_block_size; irb++)
970 for (icb=0; icb<col_block_size; icb++)
971 RA_val[irb+row_block_size*icb]=ZERO;
972 for (irb=0; irb<P_block_size; irb++) {
973 rtmp=R_val[irb];
974 for (icb=0; icb<col_block_size; icb++) {
975 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
976 }
977 }
978
979 if (A_marker[i2 + num_Acouple_cols] != i_r) {
980 A_marker[i2 + num_Acouple_cols] = i_r;
981 j3_ub = P->mainBlock->pattern->ptr[i2+1];
982 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
983 i_c = P->mainBlock->pattern->index[j3];
984 temp_val = &(P->mainBlock->val[j3*P_block_size]);
985 for (irb=0; irb<row_block_size; irb++)
986 for (icb=0; icb<col_block_size; icb++)
987 RAP_val[irb+row_block_size*icb]=ZERO;
988 for (icb=0; icb<P_block_size; icb++) {
989 rtmp = temp_val[icb];
990 for (irb=0; irb<row_block_size; irb++) {
991 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
992 }
993 }
994
995 if (P_marker[i_c] < row_marker) {
996 P_marker[i_c] = sum;
997 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
998 RAP_ext_idx[sum] = i_c + offset;
999 sum++;
1000 } else {
1001 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1002 for (ib=0; ib<block_size; ib++) {
1003 temp_val[ib] += RAP_val[ib];
1004 }
1005 }
1006 }
1007 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1008 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1009 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1010 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1011 for (irb=0; irb<row_block_size; irb++)
1012 for (icb=0; icb<col_block_size; icb++)
1013 RAP_val[irb+row_block_size*icb]=ZERO;
1014 for (icb=0; icb<P_block_size; icb++) {
1015 rtmp=temp_val[icb];
1016 for (irb=0; irb<row_block_size; irb++) {
1017 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1018 }
1019 }
1020
1021 if (P_marker[i_c] < row_marker) {
1022 P_marker[i_c] = sum;
1023 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1024 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1025 sum++;
1026 } else {
1027 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1028 for (ib=0; ib<block_size; ib++) {
1029 temp_val[ib] += RAP_val[ib];
1030 }
1031 }
1032 }
1033 } else {
1034 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1035 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1036 i_c = P->mainBlock->pattern->index[j3];
1037 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1038 for (irb=0; irb<row_block_size; irb++)
1039 for (icb=0; icb<col_block_size; icb++)
1040 RAP_val[irb+row_block_size*icb]=ZERO;
1041 for (icb=0; icb<P_block_size; icb++) {
1042 rtmp=temp_val[icb];
1043 for (irb=0; irb<row_block_size; irb++) {
1044 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1045 }
1046 }
1047
1048 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1049 for (ib=0; ib<block_size; ib++) {
1050 temp_val[ib] += RAP_val[ib];
1051 }
1052 }
1053 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1054 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1055 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1056 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1057 for (irb=0; irb<row_block_size; irb++)
1058 for (icb=0; icb<col_block_size; icb++)
1059 RAP_val[irb+row_block_size*icb]=ZERO;
1060 for (icb=0; icb<P_block_size; icb++) {
1061 rtmp=temp_val[icb];
1062 for (irb=0; irb<row_block_size; irb++) {
1063 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1064 }
1065 }
1066
1067 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1068 for (ib=0; ib<block_size; ib++) {
1069 temp_val[ib] += RAP_val[ib];
1070 }
1071 }
1072 }
1073 }
1074 }
1075 RAP_ext_ptr[i_r+1] = sum;
1076 }
1077 delete[] P_marker;
1078 delete[] Pcouple_to_Pext;
1079
1080 /* Now we have part of RAP[r,c] where row "r" is the list of rows
1081 which is given by the column list of P->col_coupleBlock, and
1082 column "c" is the list of columns which possibly covers the
1083 whole column range of the system matrix P. This part of the data is to
1084 be passed to the neighbouring processors, and added to the corresponding
1085 RAP[r,c] entries on those processors */
1086 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1087 &RAP_ext_val, global_id_P, block_size);
1088
1089 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1090 sum = RAP_ext_ptr[num_RAPext_rows];
1091 num_RAPext_cols = 0;
1092 if (num_Pext_cols || sum > 0) {
1093 temp = new index_t[num_Pext_cols+sum];
1094 j1_ub = offset + num_Pmain_cols;
1095 for (i=0, iptr=0; i<sum; i++) {
1096 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1097 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1098 }
1099 for (j=0; j<num_Pext_cols; j++, iptr++){
1100 temp[iptr] = global_id_P[j];
1101 }
1102
1103 if (iptr) {
1104 #ifdef USE_QSORTG
1105 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1106 #else
1107 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1108 #endif
1109 num_RAPext_cols = 1;
1110 i = temp[0];
1111 for (j=1; j<iptr; j++) {
1112 if (temp[j] > i) {
1113 i = temp[j];
1114 temp[num_RAPext_cols++] = i;
1115 }
1116 }
1117 }
1118 }
1119
1120 /* resize the pattern of P_ext_couple */
1121 if(num_RAPext_cols){
1122 global_id_RAP = new index_t[num_RAPext_cols];
1123 #pragma omp parallel for private(i) schedule(static)
1124 for (i=0; i<num_RAPext_cols; i++)
1125 global_id_RAP[i] = temp[i];
1126 }
1127 if (num_Pext_cols || sum > 0)
1128 delete[] temp;
1129 j1_ub = offset + num_Pmain_cols;
1130 #pragma omp parallel for private(i, where_p) schedule(static)
1131 for (i=0; i<sum; i++) {
1132 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1133 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1134 /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex);
1135 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1136 } else
1137 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1138 }
1139
1140 /* build the mapping */
1141 if (num_Pcouple_cols) {
1142 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1143 iptr = 0;
1144 for (i=0; i<num_RAPext_cols; i++)
1145 if (global_id_RAP[i] == P->global_id[iptr]) {
1146 Pcouple_to_RAP[iptr++] = i;
1147 if (iptr == num_Pcouple_cols) break;
1148 }
1149 }
1150
1151 if (num_Pext_cols) {
1152 Pext_to_RAP = new index_t[num_Pext_cols];
1153 iptr = 0;
1154 for (i=0; i<num_RAPext_cols; i++)
1155 if (global_id_RAP[i] == global_id_P[iptr]) {
1156 Pext_to_RAP[iptr++] = i;
1157 if (iptr == num_Pext_cols) break;
1158 }
1159 }
1160
1161 if (global_id_P){
1162 delete[] global_id_P;
1163 global_id_P = NULL;
1164 }
1165
1166 /* allocate and initialise the markers */
1167 sum = num_RAPext_cols + num_Pmain_cols;
1168 P_marker = new index_t[sum];
1169 #pragma omp parallel for private(i) schedule(static)
1170 for (i=0; i<sum; i++) P_marker[i] = -1;
1171 #pragma omp parallel for private(i) schedule(static)
1172 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1173
1174 /* Now, count the size of RAP. Start with rows in R_main */
1175 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1176 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1177 shared = P->col_coupler->connector->send->shared;
1178 i = 0;
1179 j = 0;
1180 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1181 /* Mark the start of row for both main block and couple block */
1182 row_marker = i;
1183 row_marker_ext = j;
1184
1185 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1186 in RAP_ext */
1187 P_marker[i_r] = i;
1188 i++;
1189 for (j1=0; j1<num_neighbors; j1++) {
1190 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1191 if (shared[j2] == i_r) {
1192 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1193 i_c = RAP_ext_idx[k];
1194 if (i_c < num_Pmain_cols) {
1195 if (P_marker[i_c] < row_marker) {
1196 P_marker[i_c] = i;
1197 i++;
1198 }
1199 } else {
1200 if (P_marker[i_c] < row_marker_ext) {
1201 P_marker[i_c] = j;
1202 j++;
1203 }
1204 }
1205 }
1206 break;
1207 }
1208 }
1209 }
1210
1211 /* then, loop over elements in row i_r of R_main */
1212 j1_ub = R_main->pattern->ptr[i_r+1];
1213 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1214 i1 = R_main->pattern->index[j1];
1215
1216 /* then, loop over elements in row i1 of A->col_coupleBlock */
1217 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1218 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1219 i2 = A->col_coupleBlock->pattern->index[j2];
1220
1221 /* check whether entry RA[i_r, i2] has been previously visited.
1222 RAP new entry is possible only if entry RA[i_r, i2] has not
1223 been visited yet */
1224 if (A_marker[i2] != i_r) {
1225 /* first, mark entry RA[i_r,i2] as visited */
1226 A_marker[i2] = i_r;
1227
1228 /* then loop over elements in row i2 of P_ext_main */
1229 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1230 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1231 i_c = P->row_coupleBlock->pattern->index[j3];
1232
1233 /* check whether entry RAP[i_r,i_c] has been created.
1234 If not yet created, create the entry and increase
1235 the total number of elements in RAP_ext */
1236 if (P_marker[i_c] < row_marker) {
1237 P_marker[i_c] = i;
1238 i++;
1239 }
1240 }
1241
1242 /* loop over elements in row i2 of P_ext_couple, do the same */
1243 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1244 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1245 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1246 if (P_marker[i_c] < row_marker_ext) {
1247 P_marker[i_c] = j;
1248 j++;
1249 }
1250 }
1251 }
1252 }
1253
1254 /* now loop over elements in row i1 of A->mainBlock, repeat
1255 the process we have done to block A->col_coupleBlock */
1256 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1257 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1258 i2 = A->mainBlock->pattern->index[j2];
1259 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1260 A_marker[i2 + num_Acouple_cols] = i_r;
1261 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1262 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1263 i_c = P->mainBlock->pattern->index[j3];
1264 if (P_marker[i_c] < row_marker) {
1265 P_marker[i_c] = i;
1266 i++;
1267 }
1268 }
1269 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1270 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1271 /* note that we need to map the column index in
1272 P->col_coupleBlock back into the column index in
1273 P_ext_couple */
1274 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1275 if (P_marker[i_c] < row_marker_ext) {
1276 P_marker[i_c] = j;
1277 j++;
1278 }
1279 }
1280 }
1281 }
1282 }
1283 }
1284
1285 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1286 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1287 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1288 RAP_couple */
1289 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1290 RAP_main_idx = new index_t[i];
1291 RAP_main_val = new double[i * block_size];
1292 RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1293 RAP_couple_idx = new index_t[j];
1294 RAP_couple_val = new double[j * block_size];
1295
1296 RAP_main_ptr[num_Pmain_cols] = i;
1297 RAP_couple_ptr[num_Pmain_cols] = j;
1298
1299 #pragma omp parallel for private(i) schedule(static)
1300 for (i=0; i<sum; i++) P_marker[i] = -1;
1301 #pragma omp parallel for private(i) schedule(static)
1302 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1303
1304 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1305 in R_main */
1306 i = 0;
1307 j = 0;
1308 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1309 /* Mark the start of row for both main block and couple block */
1310 row_marker = i;
1311 row_marker_ext = j;
1312 RAP_main_ptr[i_r] = row_marker;
1313 RAP_couple_ptr[i_r] = row_marker_ext;
1314
1315 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1316 in row i_r of RAP_ext */
1317 P_marker[i_r] = i;
1318 for (ib=0; ib<block_size; ib++)
1319 RAP_main_val[i*block_size+ib] = ZERO;
1320 RAP_main_idx[i] = i_r;
1321 i++;
1322
1323 for (j1=0; j1<num_neighbors; j1++) {
1324 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1325 if (shared[j2] == i_r) {
1326 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1327 i_c = RAP_ext_idx[k];
1328 if (i_c < num_Pmain_cols) {
1329 if (P_marker[i_c] < row_marker) {
1330 P_marker[i_c] = i;
1331 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1332 RAP_main_idx[i] = i_c;
1333 i++;
1334 } else {
1335 t1_val = &(RAP_ext_val[k*block_size]);
1336 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1337 for (ib=0; ib<block_size; ib++)
1338 t2_val[ib] += t1_val[ib];
1339 }
1340 } else {
1341 if (P_marker[i_c] < row_marker_ext) {
1342 P_marker[i_c] = j;
1343 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1344 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1345 j++;
1346 } else {
1347 t1_val = &(RAP_ext_val[k*block_size]);
1348 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1349 for (ib=0; ib<block_size; ib++)
1350 t2_val[ib] += t1_val[ib];
1351 }
1352 }
1353 }
1354 break;
1355 }
1356 }
1357 }
1358
1359 /* then, loop over elements in row i_r of R_main */
1360 j1_ub = R_main->pattern->ptr[i_r+1];
1361 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1362 i1 = R_main->pattern->index[j1];
1363 R_val = &(R_main->val[j1*P_block_size]);
1364
1365 /* then, loop over elements in row i1 of A->col_coupleBlock */
1366 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1367 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1368 i2 = A->col_coupleBlock->pattern->index[j2];
1369 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1370 for (irb=0; irb<row_block_size; irb++)
1371 for (icb=0; icb<col_block_size; icb++)
1372 RA_val[irb+row_block_size*icb]=ZERO;
1373 for (irb=0; irb<P_block_size; irb++) {
1374 rtmp=R_val[irb];
1375 for (icb=0; icb<col_block_size; icb++) {
1376 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1377 }
1378 }
1379
1380
1381 /* check whether entry RA[i_r, i2] has been previously visited.
1382 RAP new entry is possible only if entry RA[i_r, i2] has not
1383 been visited yet */
1384 if (A_marker[i2] != i_r) {
1385 /* first, mark entry RA[i_r,i2] as visited */
1386 A_marker[i2] = i_r;
1387
1388 /* then loop over elements in row i2 of P_ext_main */
1389 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1390 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1391 i_c = P->row_coupleBlock->pattern->index[j3];
1392 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1393 for (irb=0; irb<row_block_size; irb++)
1394 for (icb=0; icb<col_block_size; icb++)
1395 RAP_val[irb+row_block_size*icb]=ZERO;
1396 for (icb=0; icb<P_block_size; icb++) {
1397 rtmp=temp_val[icb];
1398 for (irb=0; irb<row_block_size; irb++) {
1399 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1400 }
1401 }
1402
1403
1404 /* check whether entry RAP[i_r,i_c] has been created.
1405 If not yet created, create the entry and increase
1406 the total number of elements in RAP_ext */
1407 if (P_marker[i_c] < row_marker) {
1408 P_marker[i_c] = i;
1409 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1410 RAP_main_idx[i] = i_c;
1411 i++;
1412 } else {
1413 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1414 for (ib=0; ib<block_size; ib++) {
1415 temp_val[ib] += RAP_val[ib];
1416 }
1417 }
1418 }
1419
1420 /* loop over elements in row i2 of P_ext_couple, do the same */
1421 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1422 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1423 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1424 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1425 for (irb=0; irb<row_block_size; irb++)
1426 for (icb=0; icb<col_block_size; icb++)
1427 RAP_val[irb+row_block_size*icb]=ZERO;
1428 for (icb=0; icb<P_block_size; icb++) {
1429 rtmp=temp_val[icb];
1430 for (irb=0; irb<row_block_size; irb++) {
1431 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1432 }
1433 }
1434
1435 if (P_marker[i_c] < row_marker_ext) {
1436 P_marker[i_c] = j;
1437 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1438 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1439 j++;
1440 } else {
1441 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1442 for (ib=0; ib<block_size; ib++) {
1443 temp_val[ib] += RAP_val[ib];
1444 }
1445 }
1446 }
1447
1448 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1449 Only the contributions are added. */
1450 } else {
1451 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1452 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1453 i_c = P->row_coupleBlock->pattern->index[j3];
1454 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1455 for (irb=0; irb<row_block_size; irb++)
1456 for (icb=0; icb<col_block_size; icb++)
1457 RAP_val[irb+row_block_size*icb]=ZERO;
1458 for (icb=0; icb<P_block_size; icb++) {
1459 rtmp=temp_val[icb];
1460 for (irb=0; irb<row_block_size; irb++) {
1461 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1462 }
1463 }
1464
1465 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1466 for (ib=0; ib<block_size; ib++) {
1467 temp_val[ib] += RAP_val[ib];
1468 }
1469 }
1470 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1471 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1472 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1473 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1474 for (irb=0; irb<row_block_size; irb++)
1475 for (icb=0; icb<col_block_size; icb++)
1476 RAP_val[irb+row_block_size*icb]=ZERO;
1477 for (icb=0; icb<P_block_size; icb++) {
1478 rtmp=temp_val[icb];
1479 for (irb=0; irb<row_block_size; irb++) {
1480 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1481 }
1482 }
1483
1484 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1485 for (ib=0; ib<block_size; ib++) {
1486 temp_val[ib] += RAP_val[ib];
1487 }
1488 }
1489 }
1490 }
1491
1492 /* now loop over elements in row i1 of A->mainBlock, repeat
1493 the process we have done to block A->col_coupleBlock */
1494 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1495 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1496 i2 = A->mainBlock->pattern->index[j2];
1497 temp_val = &(A->mainBlock->val[j2*block_size]);
1498 for (irb=0; irb<row_block_size; irb++)
1499 for (icb=0; icb<col_block_size; icb++)
1500 RA_val[irb+row_block_size*icb]=ZERO;
1501 for (irb=0; irb<P_block_size; irb++) {
1502 rtmp=R_val[irb];
1503 for (icb=0; icb<col_block_size; icb++) {
1504 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1505 }
1506 }
1507
1508 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1509 A_marker[i2 + num_Acouple_cols] = i_r;
1510 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1511 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1512 i_c = P->mainBlock->pattern->index[j3];
1513 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1514 for (irb=0; irb<row_block_size; irb++)
1515 for (icb=0; icb<col_block_size; icb++)
1516 RAP_val[irb+row_block_size*icb]=ZERO;
1517 for (icb=0; icb<P_block_size; icb++) {
1518 rtmp=temp_val[icb];
1519 for (irb=0; irb<row_block_size; irb++) {
1520 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1521 }
1522 }
1523 if (P_marker[i_c] < row_marker) {
1524 P_marker[i_c] = i;
1525 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1526 RAP_main_idx[i] = i_c;
1527 i++;
1528 } else {
1529 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1530 for (ib=0; ib<block_size; ib++) {
1531 temp_val[ib] += RAP_val[ib];
1532 }
1533 }
1534 }
1535 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1536 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1537 /* note that we need to map the column index in
1538 P->col_coupleBlock back into the column index in
1539 P_ext_couple */
1540 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1541 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1542 for (irb=0; irb<row_block_size; irb++)
1543 for (icb=0; icb<col_block_size; icb++)
1544 RAP_val[irb+row_block_size*icb]=ZERO;
1545 for (icb=0; icb<P_block_size; icb++) {
1546 rtmp=temp_val[icb];
1547 for (irb=0; irb<row_block_size; irb++) {
1548 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1549 }
1550 }
1551
1552 if (P_marker[i_c] < row_marker_ext) {
1553 P_marker[i_c] = j;
1554 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1555 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1556 j++;
1557 } else {
1558 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1559 for (ib=0; ib<block_size; ib++) {
1560 temp_val[ib] += RAP_val[ib];
1561 }
1562 }
1563 }
1564
1565 } else {
1566 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1567 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1568 i_c = P->mainBlock->pattern->index[j3];
1569 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1570 for (irb=0; irb<row_block_size; irb++)
1571 for (icb=0; icb<col_block_size; icb++)
1572 RAP_val[irb+row_block_size*icb]=ZERO;
1573 for (icb=0; icb<P_block_size; icb++) {
1574 rtmp=temp_val[icb];
1575 for (irb=0; irb<row_block_size; irb++) {
1576 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1577 }
1578 }
1579
1580 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1581 for (ib=0; ib<block_size; ib++) {
1582 temp_val[ib] += RAP_val[ib];
1583 }
1584 }
1585 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1586 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1587 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1588 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1589 for (irb=0; irb<row_block_size; irb++)
1590 for (icb=0; icb<col_block_size; icb++)
1591 RAP_val[irb+row_block_size*icb]=ZERO;
1592 for (icb=0; icb<P_block_size; icb++) {
1593 rtmp = temp_val[icb];
1594 for (irb=0; irb<row_block_size; irb++) {
1595 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1596 }
1597 }
1598
1599 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1600 for (ib=0; ib<block_size; ib++) {
1601 temp_val[ib] += RAP_val[ib];
1602 }
1603 }
1604 }
1605 }
1606 }
1607
1608 /* sort RAP_main_idx/RAP_couple_idx and reorder the corresponding val arrays accordingly */
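     /* P_marker[i_c] still records where the value block of column i_c was
        written, so the value blocks can be permuted to follow the sorted
        column order */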
1609 if (i > row_marker) {
1610 offset = i - row_marker;
1611 temp = new index_t[offset];
1612 #pragma omp parallel for schedule(static) private(iptr)
1613 for (iptr=0; iptr<offset; iptr++)
1614 temp[iptr] = RAP_main_idx[row_marker+iptr];
1615 if (offset > 0) {
1616 #ifdef USE_QSORTG
1617 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1618 #else
1619 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1620 #endif
1621 }
1622 temp_val = new double[offset * block_size];
1623 #pragma omp parallel for schedule(static) private(iptr,k)
1624 for (iptr=0; iptr<offset; iptr++){
1625 k = P_marker[temp[iptr]];
1626 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1627 P_marker[temp[iptr]] = iptr + row_marker;
1628 }
1629 #pragma omp parallel for schedule(static) private(iptr)
1630 for (iptr=0; iptr<offset; iptr++){
1631 RAP_main_idx[row_marker+iptr] = temp[iptr];
1632 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1633 }
1634 delete[] temp;
1635 delete[] temp_val;
1636 }
1637 if (j > row_marker_ext) {
1638 offset = j - row_marker_ext;
1639 temp = new index_t[offset];
1640 #pragma omp parallel for schedule(static) private(iptr)
1641 for (iptr=0; iptr<offset; iptr++)
1642 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1643 if (offset > 0) {
1644 #ifdef USE_QSORTG
1645 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1646 #else
1647 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1648 #endif
1649 }
1650 temp_val = new double[offset * block_size];
1651 #pragma omp parallel for schedule(static) private(iptr, k)
1652 for (iptr=0; iptr<offset; iptr++){
1653 k = P_marker[temp[iptr] + num_Pmain_cols];
1654 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1655 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1656 }
1657 #pragma omp parallel for schedule(static) private(iptr)
1658 for (iptr=0; iptr<offset; iptr++){
1659 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1660 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1661 }
1662 delete[] temp;
1663 delete[] temp_val;
1664 }
1665 }
1666
1667 delete[] RA_val;
1668 delete[] RAP_val;
1669 delete[] A_marker;
1670 delete[] Pext_to_RAP;
1671 delete[] Pcouple_to_RAP;
1672 delete[] RAP_ext_ptr;
1673 delete[] RAP_ext_idx;
1674 delete[] RAP_ext_val;
1675 paso::SparseMatrix_free(R_main);
1676 paso::SparseMatrix_free(R_couple);
1677
1678 /* Check whether there are empty columns in RAP_couple */
1679 #pragma omp parallel for schedule(static) private(i)
1680 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1681 /* the number of non-empty columns is stored in "k" */
1682 k = 0;
1683 j = RAP_couple_ptr[num_Pmain_cols];
1684 for (i=0; i<j; i++) {
1685 i1 = RAP_couple_idx[i];
1686 if (P_marker[i1]) {
1687 P_marker[i1] = 0;
1688 k++;
1689 }
1690 }
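     /* P_marker[i1] is now 0 for columns that actually appear in RAP_couple
        and 1 for empty ones */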
1691
1692 /* if empty columns were found, compress the column numbering of RAP_couple */
1693 if (k < num_RAPext_cols) {
1694 temp = new index_t[k];
1695 k = 0;
1696 for (i=0; i<num_RAPext_cols; i++)
1697 if (!P_marker[i]) {
1698 P_marker[i] = k;
1699 temp[k] = global_id_RAP[i];
1700 k++;
1701 }
1702 #pragma omp parallel for schedule(static) private(i, i1)
1703 for (i=0; i<j; i++) {
1704 i1 = RAP_couple_idx[i];
1705 RAP_couple_idx[i] = P_marker[i1];
1706 }
1707 num_RAPext_cols = k;
1708 delete[] global_id_RAP;
1709 global_id_RAP = temp;
1710 }
1711 delete[] P_marker;
1712
1713 /******************************************************/
1714 /* Start to create the coarse level System Matrix A_c */
1715 /******************************************************/
1716 /* first, prepare the sender/receiver for the col_connector */
1717 dist = P->pattern->input_distribution->first_component;
1718 recv_len = new dim_t[size];
1719 send_len = new dim_t[size];
1720 neighbor = new Esys_MPI_rank[size];
1721 offsetInShared = new index_t[size+1];
1722 shared = new index_t[num_RAPext_cols];
1723 memset(recv_len, 0, sizeof(dim_t) * size);
1724 memset(send_len, 0, sizeof(dim_t) * size);
1725 num_neighbors = 0;
1726 offsetInShared[0] = 0;
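     /* global_id_RAP is in ascending order, so a single sweep assigns each
        external column of RAP to the rank that owns it according to dist[]
        and accumulates recv_len per neighbour */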
1727 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1728 shared[i] = i + num_Pmain_cols;
1729 if (k <= global_id_RAP[i]) {
1730 if (recv_len[j] > 0) {
1731 neighbor[num_neighbors] = j;
1732 num_neighbors ++;
1733 offsetInShared[num_neighbors] = i;
1734 }
1735 while (k <= global_id_RAP[i]) {
1736 j++;
1737 k = dist[j+1];
1738 }
1739 }
1740 recv_len[j] ++;
1741 }
1742 if (recv_len[j] > 0) {
1743 neighbor[num_neighbors] = j;
1744 num_neighbors ++;
1745 offsetInShared[num_neighbors] = i;
1746 }
1747
1748 paso::SharedComponents_ptr recv(new paso::SharedComponents(
1749 num_Pmain_cols, num_neighbors, neighbor, shared,
1750 offsetInShared, 1, 0, mpi_info));
1751
1752 #ifdef ESYS_MPI
1753 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1754 #endif
1755
1756 #ifdef ESYS_MPI
1757 mpi_requests=new MPI_Request[size*2];
1758 mpi_stati=new MPI_Status[size*2];
1759 #else
1760 mpi_requests=new int[size*2];
1761 mpi_stati=new int[size*2];
1762 #endif
1763 num_neighbors = 0;
1764 j = 0;
1765 offsetInShared[0] = 0;
1766 for (i=0; i<size; i++) {
1767 if (send_len[i] > 0) {
1768 neighbor[num_neighbors] = i;
1769 num_neighbors ++;
1770 j += send_len[i];
1771 offsetInShared[num_neighbors] = j;
1772 }
1773 }
1774 delete[] shared;
1775 shared = new index_t[j];
1776 for (i=0, j=0; i<num_neighbors; i++) {
1777 k = neighbor[i];
1778 #ifdef ESYS_MPI
1779 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1780 mpi_info->msg_tag_counter+k,
1781 mpi_info->comm, &mpi_requests[i]);
1782 #endif
1783 j += send_len[k];
1784 }
1785 for (i=0, j=0; i<recv->numNeighbors; i++) {
1786 k = recv->neighbor[i];
1787 #ifdef ESYS_MPI
1788 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1789 mpi_info->msg_tag_counter+rank,
1790 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1791 #endif
1792 j += recv_len[k];
1793 }
1794 #ifdef ESYS_MPI
1795 MPI_Waitall(num_neighbors + recv->numNeighbors,
1796 mpi_requests, mpi_stati);
1797 #endif
1798 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1799
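     /* the ids received into shared[] are global column indices owned by this
        rank; subtract dist[rank] to convert them to local indices for the
        send list */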
1800 j = offsetInShared[num_neighbors];
1801 offset = dist[rank];
1802 #pragma omp parallel for schedule(static) private(i)
1803 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1804
1805 paso::SharedComponents_ptr send(new paso::SharedComponents(
1806 num_Pmain_cols, num_neighbors, neighbor, shared,
1807 offsetInShared, 1, 0, mpi_info));
1808
1809 col_connector.reset(new paso::Connector(send, recv));
1810 delete[] shared;
1811
1812 /* now, create the row distribution (output_distribution) and col
1813 distribution (input_distribution) */
1814 input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1815 output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1816
1817 /* then, prepare the sender/receiver for the row_connector; first, prepare
1818 the information for the sender */
1819 sum = RAP_couple_ptr[num_Pmain_cols];
1820 len = new dim_t[size];
1821 send_ptr = new index_t*[size];
1822 send_idx = new index_t*[size];
1823 #pragma omp parallel for schedule(static) private(i)
1824 for (i=0; i<size; i++) {
1825 send_ptr[i] = new index_t[num_Pmain_cols];
1826 send_idx[i] = new index_t[sum];
1827 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1828 }
1829 memset(len, 0, sizeof(dim_t) * size);
1830 recv = col_connector->recv;
1831 sum=0;
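     /* for every local row i_r of RAP_couple, send_ptr[q][i_r] counts how many
        entries refer to columns owned by rank q, and send_idx[q] collects the
        global ids of those columns */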
1832 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1833 i1 = RAP_couple_ptr[i_r];
1834 i2 = RAP_couple_ptr[i_r+1];
1835 if (i2 > i1) {
1836 /* then row i_r will be in the sender of row_connector; now check
1837 how many neighbours i_r needs to be sent to */
1838 for (j=i1; j<i2; j++) {
1839 i_c = RAP_couple_idx[j];
1840 /* find out the corresponding neighbor "p" of column i_c */
1841 for (p=0; p<recv->numNeighbors; p++) {
1842 if (i_c < recv->offsetInShared[p+1]) {
1843 k = recv->neighbor[p];
1844 if (send_ptr[k][i_r] == 0) sum++;
1845 send_ptr[k][i_r] ++;
1846 send_idx[k][len[k]] = global_id_RAP[i_c];
1847 len[k] ++;
1848 break;
1849 }
1850 }
1851 }
1852 }
1853 }
1854 if (global_id_RAP) {
1855 delete[] global_id_RAP;
1856 global_id_RAP = NULL;
1857 }
1858
1859 /* now allocate the sender */
1860 shared = new index_t[sum];
1861 memset(send_len, 0, sizeof(dim_t) * size);
1862 num_neighbors=0;
1863 offsetInShared[0] = 0;
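     /* compact send_ptr[p]: its first send_len[p] entries become the per-row
        entry counts for the rows listed in shared[], and p is recorded as a
        neighbour whenever at least one row is shared with it */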
1864 for (p=0, k=0; p<size; p++) {
1865 for (i=0; i<num_Pmain_cols; i++) {
1866 if (send_ptr[p][i] > 0) {
1867 shared[k] = i;
1868 k++;
1869 send_ptr[p][send_len[p]] = send_ptr[p][i];
1870 send_len[p]++;
1871 }
1872 }
1873 if (k > offsetInShared[num_neighbors]) {
1874 neighbor[num_neighbors] = p;
1875 num_neighbors ++;
1876 offsetInShared[num_neighbors] = k;
1877 }
1878 }
1879 send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1880 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1881
1882 /* exchange the number of rows to be sent by each proc, i.e. recover the
1883 info for the receiver of row_connector from the sender */
1884 #ifdef ESYS_MPI
1885 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1886 #endif
1887 num_neighbors = 0;
1888 offsetInShared[0] = 0;
1889 j = 0;
1890 for (i=0; i<size; i++) {
1891 if (i != rank && recv_len[i] > 0) {
1892 neighbor[num_neighbors] = i;
1893 num_neighbors ++;
1894 j += recv_len[i];
1895 offsetInShared[num_neighbors] = j;
1896 }
1897 }
1898 delete[] shared;
1899 delete[] recv_len;
1900 shared = new index_t[j];
1901 k = offsetInShared[num_neighbors];
1902 #pragma omp parallel for schedule(static) private(i)
1903 for (i=0; i<k; i++) {
1904 shared[i] = i + num_Pmain_cols;
1905 }
1906 recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1907 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1908 row_connector.reset(new paso::Connector(send, recv));
1909 delete[] shared;
1910
1911 /* send/recv pattern->ptr for rowCoupleBlock */
1912 num_RAPext_rows = offsetInShared[num_neighbors];
1913 row_couple_ptr = new index_t[num_RAPext_rows+1];
1914 for (p=0; p<num_neighbors; p++) {
1915 j = offsetInShared[p];
1916 i = offsetInShared[p+1];
1917 #ifdef ESYS_MPI
1918 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1919 mpi_info->msg_tag_counter+neighbor[p],
1920 mpi_info->comm, &mpi_requests[p]);
1921 #endif
1922 }
1923 send = row_connector->send;
1924 for (p=0; p<send->numNeighbors; p++) {
1925 #ifdef ESYS_MPI
1926 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1927 MPI_INT, send->neighbor[p],
1928 mpi_info->msg_tag_counter+rank,
1929 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1930 #endif
1931 }
1932 #ifdef ESYS_MPI
1933 MPI_Waitall(num_neighbors + send->numNeighbors,
1934 mpi_requests, mpi_stati);
1935 #endif
1936 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1937 delete[] send_len;
1938
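     /* the values just received into row_couple_ptr are per-row entry counts;
        convert them to CSR offsets with a running sum */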
1939 sum = 0;
1940 for (i=0; i<num_RAPext_rows; i++) {
1941 k = row_couple_ptr[i];
1942 row_couple_ptr[i] = sum;
1943 sum += k;
1944 }
1945 row_couple_ptr[num_RAPext_rows] = sum;
1946
1947 /* send/recv pattern->index for rowCoupleBlock */
1948 k = row_couple_ptr[num_RAPext_rows];
1949 row_couple_idx = new index_t[k];
1950 for (p=0; p<num_neighbors; p++) {
1951 j1 = row_couple_ptr[offsetInShared[p]];
1952 j2 = row_couple_ptr[offsetInShared[p+1]];
1953 #ifdef ESYS_MPI
1954 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1955 mpi_info->msg_tag_counter+neighbor[p],
1956 mpi_info->comm, &mpi_requests[p]);
1957 #endif
1958 }
1959 for (p=0; p<send->numNeighbors; p++) {
1960 #ifdef ESYS_MPI
1961 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1962 MPI_INT, send->neighbor[p],
1963 mpi_info->msg_tag_counter+rank,
1964 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1965 #endif
1966 }
1967 #ifdef ESYS_MPI
1968 MPI_Waitall(num_neighbors + send->numNeighbors,
1969 mpi_requests, mpi_stati);
1970 #endif
1971 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1972
1973 offset = input_dist->first_component[rank];
1974 k = row_couple_ptr[num_RAPext_rows];
1975 #pragma omp parallel for schedule(static) private(i)
1976 for (i=0; i<k; i++) {
1977 row_couple_idx[i] -= offset;
1978 }
1979 #pragma omp parallel for schedule(static) private(i)
1980 for (i=0; i<size; i++) {
1981 delete[] send_ptr[i];
1982 delete[] send_idx[i];
1983 }
1984 delete[] send_ptr;
1985 delete[] send_idx;
1986 delete[] len;
1987 delete[] offsetInShared;
1988 delete[] neighbor;
1989 delete[] mpi_requests;
1990 delete[] mpi_stati;
1991 Esys_MPIInfo_free(mpi_info);
1992
1993 /* Now, we can create the patterns for mainBlock, col_coupleBlock and row_coupleBlock */
1994 main_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1995 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1996 col_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1997 num_Pmain_cols, num_RAPext_cols,
1998 RAP_couple_ptr, RAP_couple_idx);
1999 row_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2000 num_RAPext_rows, num_Pmain_cols,
2001 row_couple_ptr, row_couple_idx);
2002
2003 /* next, create the system matrix */
2004 pattern = new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
2005 output_dist, input_dist, main_pattern, col_couple_pattern,
2006 row_couple_pattern, col_connector, row_connector);
2007 out = Paso_SystemMatrix_alloc(A->type, pattern,
2008 row_block_size, col_block_size, FALSE);
2009
2010 /* finally, fill in the data */
2011 memcpy(out->mainBlock->val, RAP_main_val,
2012 out->mainBlock->len* sizeof(double));
2013 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2014 out->col_coupleBlock->len * sizeof(double));
2015
2016 /* Clean up */
2017 paso::SystemMatrixPattern_free(pattern);
2018 paso::Pattern_free(main_pattern);
2019 paso::Pattern_free(col_couple_pattern);
2020 paso::Pattern_free(row_couple_pattern);
2021 delete[] RAP_main_val;
2022 delete[] RAP_couple_val;
2023 if (Esys_noError()) {
2024 return out;
2025 } else {
2026 Paso_SystemMatrix_free(out);
2027 return NULL;
2028 }
2029 }
2030
2031
2032 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2033 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2034 {
2035 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2036 paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
2037 Paso_SystemMatrix *out=NULL;
2038 paso::SystemMatrixPattern *pattern=NULL;
2039 paso::Distribution_ptr input_dist, output_dist;
2040 paso::SharedComponents_ptr send, recv;
2041 paso::Connector_ptr col_connector, row_connector;
2042 paso::Pattern *main_pattern=NULL;
2043 paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2044 const dim_t row_block_size=A->row_block_size;
2045 const dim_t col_block_size=A->col_block_size;
2046 const dim_t block_size = A->block_size;
2047 const double ZERO = 0.0;
2048 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2049 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2050 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2051 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2052 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2053 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2054 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2055 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2056 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2057 index_t sum, i, j, k, iptr, irb, icb, ib;
2058 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2059 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2060 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2061 index_t row_marker_ext, *where_p=NULL;
2062 index_t **send_ptr=NULL, **send_idx=NULL;
2063 dim_t p, num_neighbors;
2064 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2065 Esys_MPI_rank *neighbor=NULL;
2066 #ifdef ESYS_MPI
2067 MPI_Request* mpi_requests=NULL;
2068 MPI_Status* mpi_stati=NULL;
2069 #else
2070 int *mpi_requests=NULL, *mpi_stati=NULL;
2071 #endif
2072
2073 /* two sparse matrices R_main and R_couple will be generated, as the
2074 transposes of P_main and P_col_couple, respectively. Note that
2075 R_couple is actually the row_coupleBlock of R (R=P^T) */
2076 R_main = paso::SparseMatrix_getTranspose(P->mainBlock);
2077 R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock);
2078
2079 /* generate P_ext, i.e. the portion of P that is stored on neighbour procs
2080 and needed locally for the triple matrix product RAP.
2081 To be specific, P_ext (on processor k) is the group of rows in P, where
2082 the list of rows from processor q is given by
2083 A->col_coupler->connector->send->shared[rPtr...]
2084 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2085 on q.
2086 P_ext is represented by two sparse matrices P_ext_main and
2087 P_ext_couple */
2088 Paso_Preconditioner_AMG_extendB(A, P);
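     /* after extendB, P->row_coupleBlock holds P_ext_main and
        P->remote_coupleBlock holds P_ext_couple (see the note above
        Paso_Preconditioner_AMG_extendB) */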
2089
2090 /* count the number of cols in P_ext_couple, resize the pattern of
2091 sparse matrix P_ext_couple with new compressed order, and then
2092 build the col id mapping from P->col_coupleBlock to
2093 P_ext_couple */
2094 num_Pmain_cols = P->mainBlock->numCols;
2095 num_Pcouple_cols = P->col_coupleBlock->numCols;
2096 num_Acouple_cols = A->col_coupleBlock->numCols;
2097 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2098 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2099 offset = P->col_distribution->first_component[rank];
2100 num_Pext_cols = 0;
2101 if (P->global_id) {
2102 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2103 and P->col_coupleBlock */
2104 iptr = 0;
2105 if (num_Pcouple_cols || sum > 0) {
2106 temp = new index_t[num_Pcouple_cols+sum];
2107 for (; iptr<sum; iptr++){
2108 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2109 }
2110 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2111 temp[iptr] = P->global_id[j];
2112 }
2113 }
2114 if (iptr) {
2115 #ifdef USE_QSORTG
2116 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2117 #else
2118 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2119 #endif
2120 num_Pext_cols = 1;
2121 i = temp[0];
2122 for (j=1; j<iptr; j++) {
2123 if (temp[j] > i) {
2124 i = temp[j];
2125 temp[num_Pext_cols++] = i;
2126 }
2127 }
2128 }
2129
2130 /* resize the pattern of P_ext_couple */
2131 if(num_Pext_cols){
2132 global_id_P = new index_t[num_Pext_cols];
2133 for (i=0; i<num_Pext_cols; i++)
2134 global_id_P[i] = temp[i];
2135 }
2136 if (num_Pcouple_cols || sum > 0)
2137 delete[] temp;
2138 for (i=0; i<sum; i++) {
2139 where_p = (index_t *)bsearch(
2140 &(P->remote_coupleBlock->pattern->index[i]),
2141 global_id_P, num_Pext_cols,
2142 sizeof(index_t), paso::comparIndex);
2143 P->remote_coupleBlock->pattern->index[i] =
2144 (index_t)(where_p -global_id_P);
2145 }
2146
2147 /* build the mapping */
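     /* the sweep below relies on P->global_id being in ascending order and
        being a subset of global_id_P */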
2148 if (num_Pcouple_cols) {
2149 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2150 iptr = 0;
2151 for (i=0; i<num_Pext_cols; i++)
2152 if (global_id_P[i] == P->global_id[iptr]) {
2153 Pcouple_to_Pext[iptr++] = i;
2154 if (iptr == num_Pcouple_cols) break;
2155 }
2156 }
2157 }
2158
2159 /* allocate and initialise the markers */
2160 sum = num_Pext_cols + num_Pmain_cols;
2161 P_marker = new index_t[sum];
2162 A_marker = new index_t[num_A_cols];
2163 #pragma omp parallel for private(i) schedule(static)
2164 for (i=0; i<sum; i++) P_marker[i] = -1;
2165 #pragma omp parallel for private(i) schedule(static)
2166 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2167
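     /* RAP_ext is built in two passes: this first, symbolic pass only counts
        the entries of each row, using A_marker/P_marker so that each
        (i_r,i_c) entry is counted once; the second pass further below
        recomputes the pattern and accumulates the values */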
2168 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2169 sum = 0;
2170 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2171 row_marker = sum;
2172 /* then, loop over elements in row i_r of R_couple */
2173 j1_ub = R_couple->pattern->ptr[i_r+1];
2174 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2175 i1 = R_couple->pattern->index[j1];
2176 /* then, loop over elements in row i1 of A->col_coupleBlock */
2177 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2178 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2179 i2 = A->col_coupleBlock->pattern->index[j2];
2180
2181 /* check whether entry RA[i_r, i2] has been previously visited.
2182 A new RAP entry is possible only if entry RA[i_r, i2] has not
2183 been visited yet */
2184 if (A_marker[i2] != i_r) {
2185 /* first, mark entry RA[i_r,i2] as visited */
2186 A_marker[i2] = i_r;
2187
2188 /* then loop over elements in row i2 of P_ext_main */
2189 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2190 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2191 i_c = P->row_coupleBlock->pattern->index[j3];
2192
2193 /* check whether entry RAP[i_r,i_c] has been created.
2194 If not yet created, create the entry and increase
2195 the total number of elements in RAP_ext */
2196 if (P_marker[i_c] < row_marker) {
2197 P_marker[i_c] = sum;
2198 sum++;
2199 }
2200 }
2201
2202 /* loop over elements in row i2 of P_ext_couple, do the same */
2203 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2204 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2205 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2206 if (P_marker[i_c] < row_marker) {
2207 P_marker[i_c] = sum;
2208 sum++;
2209 }
2210 }
2211 }
2212 }
2213
2214 /* now loop over elements in row i1 of A->mainBlock, repeat
2215 the process we have done to block A->col_coupleBlock */
2216 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2217 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2218 i2 = A->mainBlock->pattern->index[j2];
2219 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2220 A_marker[i2 + num_Acouple_cols] = i_r;
2221 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2222 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2223 i_c = P->mainBlock->pattern->index[j3];
2224 if (P_marker[i_c] < row_marker) {
2225 P_marker[i_c] = sum;
2226 sum++;
2227 }
2228 }
2229 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2230 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2231 /* note that we need to map the column index in
2232 P->col_coupleBlock back into the column index in
2233 P_ext_couple */
2234 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2235 if (P_marker[i_c] < row_marker) {
2236 P_marker[i_c] = sum;
2237 sum++;
2238 }
2239 }
2240 }
2241 }
2242 }
2243 }
2244
2245 /* Now we have the number of non-zero elements in RAP_ext, allocate
2246 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2247 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2248 RAP_ext_idx = new index_t[sum];
2249 RAP_ext_val = new double[sum * block_size];
2250 RA_val = new double[block_size];
2251 RAP_val = new double[block_size];
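     /* RA_val and RAP_val are scratch buffers for a single
        row_block_size x col_block_size block: RA_val accumulates the block
        product R[i_r,i1]*A[i1,i2], and RAP_val the product of RA_val with a
        block of P */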
2252
2253 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2254 sum = num_Pext_cols + num_Pmain_cols;
2255 #pragma omp parallel for private(i) schedule(static)
2256 for (i=0; i<sum; i++) P_marker[i] = -1;
2257 #pragma omp parallel for private(i) schedule(static)
2258 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2259 sum = 0;
2260 RAP_ext_ptr[0] = 0;
2261 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2262 row_marker = sum;
2263 /* then, loop over elements in row i_r of R_couple */
2264 j1_ub = R_couple->pattern->ptr[i_r+1];
2265 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2266 i1 = R_couple->pattern->index[j1];
2267 R_val = &(R_couple->val[j1*block_size]);
2268
2269 /* then, loop over elements in row i1 of A->col_coupleBlock */
2270 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2271 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2272 i2 = A->col_coupleBlock->pattern->index[j2];
2273 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2274 for (irb=0; irb<row_block_size; irb++) {
2275 for (icb=0; icb<col_block_size; icb++) {
2276 rtmp = ZERO;
2277 for (ib=0; ib<col_block_size; ib++) {
2278 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2279 }
2280 RA_val[irb+row_block_size*icb]=rtmp;
2281 }
2282 }
2283
2284 /* check whether entry RA[i_r, i2] has been previously visited.
2285 A new RAP entry is possible only if entry RA[i_r, i2] has not
2286 been visited yet */
2287 if (A_marker[i2] != i_r) {
2288 /* first, mark entry RA[i_r,i2] as visited */
2289 A_marker[i2] = i_r;
2290
2291 /* then loop over elements in row i2 of P_ext_main */
2292 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2293 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2294 i_c = P->row_coupleBlock->pattern->index[j3];
2295 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2296 for (irb=0; irb<row_block_size; irb++) {
2297 for (icb=0; icb<col_block_size; icb++) {
2298 rtmp = ZERO;
2299 for (ib=0; ib<col_block_size; ib++) {
2300 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2301 }
2302 RAP_val[irb+row_block_size*icb]=rtmp;
2303 }
2304 }
2305
2306
2307 /* check whether entry RAP[i_r,i_c] has been created.
2308 If not yet created, create the entry and increase
2309 the total number of elements in RAP_ext */
2310 if (P_marker[i_c] < row_marker) {
2311 P_marker[i_c] = sum;
2312 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2313 RAP_ext_idx[sum] = i_c + offset;
2314 sum++;
2315 } else {
2316 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2317 for (ib=0; ib<block_size; ib++) {
2318 temp_val[ib] += RAP_val[ib];
2319 }
2320 }
2321 }
2322
2323 /* loop over elements in row i2 of P_ext_couple, do the same */
2324 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2325 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2326 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2327 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2328 for (irb=0; irb<row_block_size; irb++) {
2329 for (icb=0; icb<col_block_size; icb++) {
2330 rtmp = ZERO;
2331 for (ib=0; ib<col_block_size; ib++) {
2332 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2333 }
2334 RAP_val[irb+row_block_size*icb]=rtmp;
2335 }
2336 }
2337
2338 if (P_marker[i_c] < row_marker) {
2339 P_marker[i_c] = sum;
2340 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2341 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2342 sum++;
2343 } else {
2344 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2345 for (ib=0; ib<block_size; ib++) {
2346 temp_val[ib] += RAP_val[ib];
2347 }
2348 }
2349 }
2350
2351 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
2352 is created; only the contributions are added. */
2353 } else {
2354 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2355 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2356 i_c = P->row_coupleBlock->pattern->index[j3];
2357 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2358 for (irb=0; irb<row_block_size; irb++) {
2359 for (icb=0; icb<col_block_size; icb++) {
2360 rtmp = ZERO;
2361 for (ib=0; ib<col_block_size; ib++) {
2362 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2363 }
2364 RAP_val[irb+row_block_size*icb]=rtmp;
2365 }
2366 }
2367
2368 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2369 for (ib=0; ib<block_size; ib++) {
2370 temp_val[ib] += RAP_val[ib];
2371 }
2372 }
2373 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2374 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2375 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2376 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2377 for (irb=0; irb<row_block_size; irb++) {
2378 for (icb=0; icb<col_block_size; icb++) {
2379 rtmp = ZERO;
2380 for (ib=0; ib<col_block_size; ib++) {
2381 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2382 }
2383 RAP_val[irb+row_block_size*icb]=rtmp;
2384 }
2385 }
2386
2387 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2388 for (ib=0; ib<block_size; ib++) {
2389 temp_val[ib] += RAP_val[ib];
2390 }
2391 }
2392 }
2393 }
2394
2395 /* now loop over elements in row i1 of A->mainBlock, repeat
2396 the process we have done to block A->col_coupleBlock */
2397 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2398 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2399 i2 = A->mainBlock->pattern->index[j2];
2400 temp_val = &(A->mainBlock->val[j2*block_size]);
2401 for (irb=0; irb<row_block_size; irb++) {
2402 for (icb=0; icb<col_block_size; icb++) {
2403 rtmp = ZERO;
2404 for (ib=0; ib<col_block_size; ib++) {
2405 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2406 }
2407 RA_val[irb+row_block_size*icb]=rtmp;
2408 }
2409 }
2410
2411 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2412 A_marker[i2 + num_Acouple_cols] = i_r;
2413 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2414 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2415 i_c = P->mainBlock->pattern->index[j3];
2416 temp_val = &(P->mainBlock->val[j3*block_size]);
2417 for (irb=0; irb<row_block_size; irb++) {
2418 for (icb=0; icb<col_block_size; icb++) {
2419 rtmp = ZERO;
2420 for (ib=0; ib<col_block_size; ib++) {
2421 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2422 }
2423 RAP_val[irb+row_block_size*icb]=rtmp;
2424 }
2425 }
2426
2427 if (P_marker[i_c] < row_marker) {
2428 P_marker[i_c] = sum;
2429 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2430 RAP_ext_idx[sum] = i_c + offset;
2431 sum++;
2432 } else {
2433 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2434 for (ib=0; ib<block_size; ib++) {
2435 temp_val[ib] += RAP_val[ib];
2436 }
2437 }
2438 }
2439 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2440 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2441 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2442 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2443 for (irb=0; irb<row_block_size; irb++) {
2444 for (icb=0; icb<col_block_size; icb++) {
2445 rtmp = ZERO;
2446 for (ib=0; ib<col_block_size; ib++) {
2447 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2448 }
2449 RAP_val[irb+row_block_size*icb]=rtmp;
2450 }
2451 }
2452
2453 if (P_marker[i_c] < row_marker) {
2454 P_marker[i_c] = sum;
2455 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2456 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2457 sum++;
2458 } else {
2459 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2460 for (ib=0; ib<block_size; ib++) {
2461 temp_val[ib] += RAP_val[ib];
2462 }
2463 }
2464 }
2465 } else {
2466 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2467 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2468 i_c = P->mainBlock->pattern->index[j3];
2469 temp_val = &(P->mainBlock->val[j3*block_size]);
2470 for (irb=0; irb<row_block_size; irb++) {
2471 for (icb=0; icb<col_block_size; icb++) {
2472 rtmp = ZERO;
2473 for (ib=0; ib<col_block_size; ib++) {
2474 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2475 }
2476 RAP_val[irb+row_block_size*icb]=rtmp;
2477 }
2478 }
2479
2480 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2481 for (ib=0; ib<block_size; ib++) {
2482 temp_val[ib] += RAP_val[ib];
2483 }
2484 }
2485 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2486 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2487 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2488 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2489 for (irb=0; irb<row_block_size; irb++) {
2490 for (icb=0; icb<col_block_size; icb++) {
2491 rtmp = ZERO;
2492 for (ib=0; ib<col_block_size; ib++) {
2493 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2494 }
2495 RAP_val[irb+row_block_size*icb]=rtmp;
2496 }
2497 }
2498
2499 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2500 for (ib=0; ib<block_size; ib++) {
2501 temp_val[ib] += RAP_val[ib];
2502 }
2503 }
2504 }
2505 }
2506 }
2507 RAP_ext_ptr[i_r+1] = sum;
2508 }
2509 delete[] P_marker;
2510 delete[] Pcouple_to_Pext;
2511
2512 /* Now we have the part of RAP[r,c] whose rows "r" are given by
2513 the column list of P->col_coupleBlock, and whose columns "c"
2514 may cover the whole column range of the system matrix P.
2515 This part of the data is to be passed to the neighbouring
2516 processors, and added to the corresponding RAP[r,c] entries
2517 in the neighbouring processors */
2518 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2519 &RAP_ext_val, global_id_P, block_size);
2520
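     /* on return, RAP_ext_ptr/idx/val have been replaced by the contributions
        received from the neighbouring processors for rows owned by this rank */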
2521 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2522 sum = RAP_ext_ptr[num_RAPext_rows];
2523 num_RAPext_cols = 0;
2524 if (num_Pext_cols || sum > 0) {
2525 temp = new index_t[num_Pext_cols+sum];
2526 j1_ub = offset + num_Pmain_cols;
2527 for (i=0, iptr=0; i<sum; i++) {
2528 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2529 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2530 }
2531 for (j=0; j<num_Pext_cols; j++, iptr++){
2532 temp[iptr] = global_id_P[j];
2533 }
2534
2535 if (iptr) {
2536 #ifdef USE_QSORTG
2537 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2538 #else
2539 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2540 #endif
2541 num_RAPext_cols = 1;
2542 i = temp[0];
2543 for (j=1; j<iptr; j++) {
2544 if (temp[j] > i) {
2545 i = temp[j];
2546 temp[num_RAPext_cols++] = i;
2547 }
2548 }
2549 }
2550 }
2551
2552 /* store the global ids and compress the column indices of RAP_ext */
2553 if(num_RAPext_cols){
2554 global_id_RAP = new index_t[num_RAPext_cols];
2555 for (i=0; i<num_RAPext_cols; i++)
2556 global_id_RAP[i] = temp[i];
2557 }
2558 if (num_Pext_cols || sum > 0)
2559 delete[] temp;
2560 j1_ub = offset + num_Pmain_cols;
2561 for (i=0; i<sum; i++) {
2562 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2563 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2564 /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex);
2565 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2566 } else
2567 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2568 }
2569
2570 /* build the mapping */
2571 if (num_Pcouple_cols) {
2572