Contents of /trunk/paso/src/AMG_Interpolation.cpp


Revision 4816
Fri Mar 28 06:16:02 2014 UTC by caltinay
File size: 122136 byte(s)
paso::SharedComponents now header-only and shared ptr managed.

1 /*****************************************************************************
2 *
3 * Copyright (c) 2003-2014 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development 2012-2013 by School of Earth Sciences
12 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 *
14 *****************************************************************************/
15
16
17 /****************************************************************************/
18
19 /* Paso: defines AMG Interpolation */
20
21 /****************************************************************************/
22
23 /* Author: l.gao@uq.edu.au */
24
25 /****************************************************************************/
26
27 #include "Paso.h"
28 #include "SparseMatrix.h"
29 #include "PasoUtil.h"
30 #include "Preconditioner.h"
31
32 /****************************************************************************
33
34 Methods necessary for the AMG preconditioner:
35
36 construct the n_C x n_C coarse-grid operator A_C from the matrix A
37 and the prolongation matrix P.
38
39 A_C is defined by the triple (Galerkin) product RAP, where R=P^T.
40
41 *****************************************************************************/
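
For orientation, the operator assembled throughout this file is the Galerkin
triple product. A minimal dense sketch of A_C = P^T A P (standalone C++; the
name galerkinRAP and the row-major layout are illustrative only, not Paso API):

    #include <vector>

    // Dense illustration only: A is n x n, P is n x nC, result A_C is nC x nC.
    // The code below computes the same product on distributed block-CSR data.
    std::vector<double> galerkinRAP(const std::vector<double>& A,
                                    const std::vector<double>& P,
                                    int n, int nC)
    {
        std::vector<double> AP(n*nC, 0.0), AC(nC*nC, 0.0);
        for (int i=0; i<n; i++)                  /* AP = A*P */
            for (int k=0; k<n; k++)
                for (int j=0; j<nC; j++)
                    AP[i*nC+j] += A[i*n+k] * P[k*nC+j];
        for (int i=0; i<nC; i++)                 /* A_C = P^T * AP */
            for (int k=0; k<n; k++)
                for (int j=0; j<nC; j++)
                    AC[i*nC+j] += P[k*nC+i] * AP[k*nC+j];
        return AC;
    }
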
42
43 /* Extend the system matrix B with two extra sparse matrices:
44 B_ext_main and B_ext_couple.
45 Together, these two sparse matrices represent the portion
46 of B that is stored on neighbouring processors and is
47 needed locally for the triple matrix product (B^T) A B.
48
49 FOR NOW, we assume that the system matrix B has a NULL
50 row_coupleBlock and a NULL remote_coupleBlock. Based on
51 this assumption, we use the row_coupleBlock link for sparse
52 matrix B_ext_main, and the remote_coupleBlock link for sparse
53 matrix B_ext_couple.
54
55 To be specific, B_ext (on processor k) is the group of rows
56 in B where the list of rows from processor q is given by
57 A->col_coupler->connector->send->shared[rPtr...]
58 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
59 on q. */
60 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
61 {
62 paso::Pattern *pattern_main=NULL, *pattern_couple=NULL;
63 paso::Coupler *coupler=NULL;
64 paso::SharedComponents_ptr send, recv;
65 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
66 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
67 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
68 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
69 index_t *idx_m=NULL, *idx_c=NULL;
70 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
71 index_t offset, len, block_size, block_size_size, max_num_cols;
72 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
73 dim_t *recv_degree=NULL, *send_degree=NULL;
74 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
75
76 if (size == 1) return;
77
78 if (B->row_coupleBlock) {
79 paso::SparseMatrix_free(B->row_coupleBlock);
80 B->row_coupleBlock = NULL;
81 }
82
83 if (B->row_coupleBlock || B->remote_coupleBlock) {
84 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
85 return;
86 }
87
88 /* send/receive the unknowns' global IDs */
89 num_main_cols = B->mainBlock->numCols;
90 cols = new double[num_main_cols];
91 offset = B->col_distribution->first_component[rank];
92 #pragma omp parallel for private(i) schedule(static)
93 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
94 if (B->global_id == NULL) {
95 coupler=paso::Coupler_alloc(B->col_coupler->connector, 1);
96 paso::Coupler_startCollect(coupler, cols);
97 }
98
99 recv_buf = new index_t[size];
100 recv_degree = new dim_t[size];
101 recv_offset = new index_t[size+1];
102 #pragma omp parallel for private(i) schedule(static)
103 for (i=0; i<size; i++){
104 recv_buf[i] = 0;
105 recv_degree[i] = 1;
106 recv_offset[i] = i;
107 }
108
109 block_size = B->block_size;
110 block_size_size = block_size * sizeof(double);
111 num_couple_cols = B->col_coupleBlock->numCols;
112 send = A->col_coupler->connector->send;
113 recv = A->col_coupler->connector->recv;
114 num_neighbors = send->numNeighbors;
115 p = send->offsetInShared[num_neighbors];
116 len = p * B->col_distribution->first_component[size];
117 send_buf = new double[len * block_size];
118 send_idx = new index_t[len];
119 send_offset = new index_t[(p+1)*2];
120 send_degree = new dim_t[num_neighbors];
121 i = num_main_cols + num_couple_cols;
122 send_m = new double[i * block_size];
123 send_c = new double[i * block_size];
124 idx_m = new index_t[i];
125 idx_c = new index_t[i];
126
127 /* wait for the unknowns' global IDs to arrive */
128 if (B->global_id == NULL) {
129 paso::Coupler_finishCollect(coupler);
130 global_id = new index_t[num_couple_cols];
131 #pragma omp parallel for private(i) schedule(static)
132 for (i=0; i<num_couple_cols; ++i)
133 global_id[i] = coupler->recv_buffer[i];
134 B->global_id = global_id;
135 paso::Coupler_free(coupler);
136 } else
137 global_id = B->global_id;
138
139 /* distribute the number of cols in current col_coupleBlock to all ranks */
140 #ifdef ESYS_MPI
141 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
142 #endif
143
144 /* distribute global_ids of cols to be considered to all ranks */
145 len = 0;
146 max_num_cols = 0;
147 for (i=0; i<size; i++){
148 recv_degree[i] = recv_buf[i];
149 recv_offset[i] = len;
150 len += recv_buf[i];
151 if (max_num_cols < recv_buf[i])
152 max_num_cols = recv_buf[i];
153 }
154 recv_offset[size] = len;
155 cols_array = new index_t[len];
156 #ifdef ESYS_MPI
157 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
158 #endif
159
160 /* first, prepare the ptr_ptr to be received */
161 q = recv->numNeighbors;
162 len = recv->offsetInShared[q];
163 ptr_ptr = new index_t[(len+1) * 2];
164 for (p=0; p<q; p++) {
165 row = recv->offsetInShared[p];
166 m = recv->offsetInShared[p + 1];
167 #ifdef ESYS_MPI
168 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
169 A->mpi_info->msg_tag_counter+recv->neighbor[p],
170 A->mpi_info->comm,
171 &(A->col_coupler->mpi_requests[p]));
172 #endif
173 }
174
175 /* now prepare the rows to be sent (the degree, the offset and the data) */
176 len = 0;
177 i0 = 0;
178 for (p=0; p<num_neighbors; p++) {
179 i = i0;
180 neighbor = send->neighbor[p];
181 m_lb = B->col_distribution->first_component[neighbor];
182 m_ub = B->col_distribution->first_component[neighbor + 1];
183 j_ub = send->offsetInShared[p + 1];
184 for (j=send->offsetInShared[p]; j<j_ub; j++) {
185 row = send->shared[j];
186 l_m = 0;
187 l_c = 0;
188 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
189 k_lb = B->col_coupleBlock->pattern->ptr[row];
190
191 /* check part of col_coupleBlock for data to be passed @row */
192 for (k=k_lb; k<k_ub; k++) {
193 m = global_id[B->col_coupleBlock->pattern->index[k]];
194 if (m > offset) break;
195 if (m>= m_lb && m < m_ub) {
196 /* data to be passed to sparse matrix B_ext_main */
197 idx_m[l_m] = m - m_lb;
198 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
199 l_m++;
200 } else {
201 /* data to be passed to sparse matrix B_ext_couple */
202 idx_c[l_c] = m;
203 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
204 l_c++;
205 }
206 }
207 k_lb = k;
208
209 /* check mainBlock for data to be passed @row to sparse
210 matrix B_ext_couple */
211 k_ub = B->mainBlock->pattern->ptr[row + 1];
212 k = B->mainBlock->pattern->ptr[row];
213 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
214 for (; k<k_ub; k++) {
215 m = B->mainBlock->pattern->index[k] + offset;
216 idx_c[l_c] = m;
217 l_c++;
218 }
219
220 /* check the remaining part of col_coupleBlock for data to
221 be passed @row to sparse matrix B_ext_couple */
222 k = k_lb;
223 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
224 for (k=k_lb; k<k_ub; k++) {
225 m = global_id[B->col_coupleBlock->pattern->index[k]];
226 if (m>= m_lb && m < m_ub) {
227 /* data to be passed to sparse matrix B_ext_main */
228 idx_m[l_m] = m - m_lb;
229 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
230 l_m++;
231 } else {
232 /* data to be passed to sparse matrix B_ext_couple */
233 idx_c[l_c] = m;
234 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
235 l_c++;
236 }
237 }
238
239 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
240 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
241 send_offset[2*i] = l_m;
242 len += l_m;
243 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
244 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
245 send_offset[2*i+1] = l_c;
246 len += l_c;
247 i++;
248 }
249
250 /* sending */
251 #ifdef ESYS_MPI
252 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
253 A->mpi_info->msg_tag_counter+rank,
254 A->mpi_info->comm,
255 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
256 #endif
257 send_degree[p] = len;
258 i0 = i;
259 }
260 delete[] send_m;
261 delete[] send_c;
262 delete[] idx_m;
263 delete[] idx_c;
264
265
266 q = recv->numNeighbors;
267 len = recv->offsetInShared[q];
268 ptr_main = new index_t[(len+1)];
269 ptr_couple = new index_t[(len+1)];
270
271 #ifdef ESYS_MPI
272 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
273 A->col_coupler->mpi_requests,
274 A->col_coupler->mpi_stati);
275 #endif
276 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
277
278 j = 0;
279 k = 0;
280 ptr_main[0] = 0;
281 ptr_couple[0] = 0;
282 for (i=0; i<len; i++) {
283 j += ptr_ptr[2*i];
284 k += ptr_ptr[2*i+1];
285 ptr_main[i+1] = j;
286 ptr_couple[i+1] = k;
287 }
288
289 delete[] ptr_ptr;
290 idx_main = new index_t[j];
291 idx_couple = new index_t[k];
292 ptr_idx = new index_t[j+k];
293 ptr_val = new double[(j+k) * block_size];
294
295 /* send/receive index array */
296 j=0;
297 k_ub = 0;
298 for (p=0; p<recv->numNeighbors; p++) {
299 k = recv->offsetInShared[p];
300 m = recv->offsetInShared[p+1];
301 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
302 if (i > 0) {
303 k_ub ++;
304 #ifdef ESYS_MPI
305 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
306 A->mpi_info->msg_tag_counter+recv->neighbor[p],
307 A->mpi_info->comm,
308 &(A->col_coupler->mpi_requests[p]));
309 #endif
310 }
311 j += i;
312 }
313
314 j=0;
315 k_ub = 0;
316 for (p=0; p<num_neighbors; p++) {
317 i = send_degree[p] - j;
318 if (i > 0){
319 k_ub ++;
320 #ifdef ESYS_MPI
321 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
322 A->mpi_info->msg_tag_counter+rank,
323 A->mpi_info->comm,
324 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
325 #endif
326 }
327 j = send_degree[p];
328 }
329
330 #ifdef ESYS_MPI
331 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
332 A->col_coupler->mpi_requests,
333 A->col_coupler->mpi_stati);
334 #endif
335 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
336
337 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
338 for (i=0; i<len; i++) {
339 j = ptr_main[i];
340 k = ptr_main[i+1];
341 m = ptr_couple[i];
342 for (p=j; p<k; p++) {
343 idx_main[p] = ptr_idx[m+p];
344 }
345 j = ptr_couple[i+1];
346 for (p=m; p<j; p++) {
347 idx_couple[p] = ptr_idx[k+p];
348 }
349 }
350 delete[] ptr_idx;
351
352 /* allocate pattern and sparsematrix for B_ext_main */
353 pattern_main = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
354 len, num_main_cols, ptr_main, idx_main);
355 B->row_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
356 pattern_main, B->row_block_size, B->col_block_size,
357 FALSE);
358 paso::Pattern_free(pattern_main);
359
360 /* allocate pattern and sparsematrix for B_ext_couple */
361 pattern_couple = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
362 len, B->col_distribution->first_component[size],
363 ptr_couple, idx_couple);
364 B->remote_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
365 pattern_couple, B->row_block_size, B->col_block_size,
366 FALSE);
367 paso::Pattern_free(pattern_couple);
368
369 /* send/receive value array */
370 j=0;
371 for (p=0; p<recv->numNeighbors; p++) {
372 k = recv->offsetInShared[p];
373 m = recv->offsetInShared[p+1];
374 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
375 #ifdef ESYS_MPI
376 if (i > 0)
377 MPI_Irecv(&(ptr_val[j]), i * block_size,
378 MPI_DOUBLE, recv->neighbor[p],
379 A->mpi_info->msg_tag_counter+recv->neighbor[p],
380 A->mpi_info->comm,
381 &(A->col_coupler->mpi_requests[p]));
382 #endif
383 j += (i * block_size);
384 }
385
386 j=0;
387 for (p=0; p<num_neighbors; p++) {
388 i = send_degree[p] - j;
389 #ifdef ESYS_MPI
390 if (i > 0)
391 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
392 A->mpi_info->msg_tag_counter+rank,
393 A->mpi_info->comm,
394 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
395 #endif
396 j = send_degree[p] ;
397 }
398
399 #ifdef ESYS_MPI
400 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
401 A->col_coupler->mpi_requests,
402 A->col_coupler->mpi_stati);
403 #endif
404 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
405
406 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
407 for (i=0; i<len; i++) {
408 j = ptr_main[i];
409 k = ptr_main[i+1];
410 m = ptr_couple[i];
411 for (p=j; p<k; p++) {
412 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
413 }
414 j = ptr_couple[i+1];
415 for (p=m; p<j; p++) {
416 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
417 }
418 }
419 delete[] ptr_val;
420
421
422 /* release all temp memory allocation */
423 delete[] cols;
424 delete[] cols_array;
425 delete[] recv_offset;
426 delete[] recv_degree;
427 delete[] recv_buf;
428 delete[] send_buf;
429 delete[] send_offset;
430 delete[] send_degree;
431 delete[] send_idx;
432 }
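
The send side of the routine above repeatedly walks the connector's
shared-row lists. As a standalone sketch, the offsetInShared/shared pair
partitions row indices by neighbour as follows (the SharedRows struct is
illustrative only; the field names follow the usage above):

    #include <cstdio>

    // Illustrative mirror of send->offsetInShared / send->shared as used above:
    // rows destined for neighbour p occupy shared[offsetInShared[p]] ..
    // shared[offsetInShared[p+1]-1].
    struct SharedRows {
        int numNeighbors;
        const int* neighbor;        // neighbour ranks, length numNeighbors
        const int* offsetInShared;  // length numNeighbors+1
        const int* shared;          // local row indices, grouped by neighbour
    };

    void printSharedRows(const SharedRows& send)
    {
        for (int p = 0; p < send.numNeighbors; p++)
            for (int j = send.offsetInShared[p]; j < send.offsetInShared[p+1]; j++)
                std::printf("row %d goes to rank %d\n",
                            send.shared[j], send.neighbor[p]);
    }
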
433
434 /* The sparse matrix T defined by T(ptr, idx, val) has the same number
435 of rows as P->col_coupleBlock->numCols. We need to copy blocks of
436 data in T to the neighbouring processors given by
437 P->col_coupler->connector->recv->neighbor[k], where k is in
438 [0, P->col_coupler->connector->recv->numNeighbors).
439 The rows to be copied to neighbour processor k are those in the range
440 P->col_coupler->connector->recv->offsetInShared[k] ...
441 P->col_coupler->connector->recv->offsetInShared[k+1] */
442 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
443 index_t **p_ptr, index_t **p_idx, double **p_val,
444 index_t *global_id, index_t block_size)
445 {
446 paso::SharedComponents_ptr send, recv;
447 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
448 index_t i, j, p, m, n, size;
449 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
450 index_t *ptr=*p_ptr, *idx=*p_idx;
451 double *val=*p_val, *recv_val=NULL;
452 #ifdef ESYS_MPI
453 index_t rank = P->mpi_info->rank;
454 #endif
455
456 size = P->mpi_info->size;
457 send = P->col_coupler->connector->recv;
458 recv = P->col_coupler->connector->send;
459 send_neighbors = send->numNeighbors;
460 recv_neighbors = recv->numNeighbors;
461 send_rows = P->col_coupleBlock->numCols;
462 recv_rows = recv->offsetInShared[recv_neighbors];
463
464 send_degree = new index_t[send_rows];
465 recv_ptr = new index_t[recv_rows + 1];
466 #pragma omp parallel for schedule(static) private(i)
467 for (i=0; i<send_rows; i++)
468 send_degree[i] = ptr[i+1] - ptr[i];
469
470 /* First, send/receive the degree */
471 for (p=0; p<recv_neighbors; p++) { /* Receiving */
472 m = recv->offsetInShared[p];
473 n = recv->offsetInShared[p+1];
474 #ifdef ESYS_MPI
475 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
476 P->mpi_info->msg_tag_counter + recv->neighbor[p],
477 P->mpi_info->comm,
478 &(P->col_coupler->mpi_requests[p]));
479 #endif
480 }
481 for (p=0; p<send_neighbors; p++) { /* Sending */
482 m = send->offsetInShared[p];
483 n = send->offsetInShared[p+1];
484 #ifdef ESYS_MPI
485 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
486 P->mpi_info->msg_tag_counter + rank,
487 P->mpi_info->comm,
488 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
489 #endif
490 }
491 #ifdef ESYS_MPI
492 MPI_Waitall(send_neighbors+recv_neighbors,
493 P->col_coupler->mpi_requests,
494 P->col_coupler->mpi_stati);
495 #endif
496 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
497
498 delete[] send_degree;
499 m = Paso_Util_cumsum(recv_rows, recv_ptr);
500 recv_ptr[recv_rows] = m;
501 recv_idx = new index_t[m];
502 recv_val = new double[m * block_size];
503
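
Paso_Util_cumsum above appears to convert the received per-row degrees into
CSR-style offsets in place and return the total, which the caller then stores
in recv_ptr[recv_rows]. A minimal equivalent under that assumption (the name
cumsum is illustrative):

    // Assumed behaviour of Paso_Util_cumsum: in-place exclusive prefix sum
    // over the first n entries, returning the total. On entry ptr[i] holds
    // the degree of row i; on exit it holds the offset of row i.
    int cumsum(int n, int* ptr)
    {
        int sum = 0;
        for (int i = 0; i < n; i++) {
            const int deg = ptr[i];
            ptr[i] = sum;
            sum += deg;
        }
        return sum;
    }
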
504 /* Next, send/receive the index array */
505 j = 0;
506 for (p=0; p<recv_neighbors; p++) { /* Receiving */
507 m = recv->offsetInShared[p];
508 n = recv->offsetInShared[p+1];
509 i = recv_ptr[n] - recv_ptr[m];
510 if (i > 0) {
511 #ifdef ESYS_MPI
512 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
513 P->mpi_info->msg_tag_counter + recv->neighbor[p],
514 P->mpi_info->comm,
515 &(P->col_coupler->mpi_requests[p]));
516 #endif
517 }
518 j += i;
519 }
520
521 j = 0;
522 for (p=0; p<send_neighbors; p++) { /* Sending */
523 m = send->offsetInShared[p];
524 n = send->offsetInShared[p+1];
525 i = ptr[n] - ptr[m];
526 if (i >0) {
527 #ifdef ESYS_MPI
528 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
529 P->mpi_info->msg_tag_counter + rank,
530 P->mpi_info->comm,
531 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
532 #endif
533 j += i;
534 }
535 }
536 #ifdef ESYS_MPI
537 MPI_Waitall(send_neighbors+recv_neighbors,
538 P->col_coupler->mpi_requests,
539 P->col_coupler->mpi_stati);
540 #endif
541 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
542
543 /* Last, send/receive the data array */
544 j = 0;
545 for (p=0; p<recv_neighbors; p++) { /* Receiving */
546 m = recv->offsetInShared[p];
547 n = recv->offsetInShared[p+1];
548 i = recv_ptr[n] - recv_ptr[m];
549 #ifdef ESYS_MPI
550 if (i > 0)
551 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
552 P->mpi_info->msg_tag_counter + recv->neighbor[p],
553 P->mpi_info->comm,
554 &(P->col_coupler->mpi_requests[p]));
555 #endif
556 j += (i*block_size);
557 }
558
559 j = 0;
560 for (p=0; p<send_neighbors; p++) { /* Sending */
561 m = send->offsetInShared[p];
562 n = send->offsetInShared[p+1];
563 i = ptr[n] - ptr[m];
564 if (i >0) {
565 #ifdef ESYS_MPI
566 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
567 P->mpi_info->msg_tag_counter + rank,
568 P->mpi_info->comm,
569 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
570 #endif
571 j += i * block_size;
572 }
573 }
574 #ifdef ESYS_MPI
575 MPI_Waitall(send_neighbors+recv_neighbors,
576 P->col_coupler->mpi_requests,
577 P->col_coupler->mpi_stati);
578 #endif
579 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
580
581 /* Clean up and return with received ptr, index and data arrays */
582 delete[] ptr;
583 delete[] idx;
584 delete[] val;
585 *p_ptr = recv_ptr;
586 *p_idx = recv_idx;
587 *p_val = recv_val;
588 }
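
The routine above exchanges variable-length rows in three rounds (degrees,
then indices, then values), each round a set of nonblocking
MPI_Irecv/MPI_Issend pairs closed by one MPI_Waitall. Stripped of Paso's
structures, one round looks like the sketch below (the buffer and length
arrays are hypothetical; the real code derives the tag from the sender's rank
and guards all MPI calls with ESYS_MPI):

    #include <mpi.h>

    // One exchange round: post all receives, post all sends, then complete
    // both sides with a single MPI_Waitall, as done three times above.
    void exchangeRound(MPI_Comm comm, int tag,
                       int nRecv, const int* recvRank, int** recvBuf, const int* recvLen,
                       int nSend, const int* sendRank, int** sendBuf, const int* sendLen,
                       MPI_Request* req, MPI_Status* stat)
    {
        for (int p = 0; p < nRecv; p++)
            MPI_Irecv(recvBuf[p], recvLen[p], MPI_INT, recvRank[p], tag, comm, &req[p]);
        for (int p = 0; p < nSend; p++)
            MPI_Issend(sendBuf[p], sendLen[p], MPI_INT, sendRank[p], tag, comm,
                       &req[nRecv + p]);
        MPI_Waitall(nRecv + nSend, req, stat);
    }
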
589
590 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
591 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
592 {
593 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
594 paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
595 Paso_SystemMatrix *out=NULL;
596 paso::SystemMatrixPattern *pattern=NULL;
597 paso::Distribution_ptr input_dist, output_dist;
598 paso::Connector *col_connector=NULL, *row_connector=NULL;
599 paso::Pattern *main_pattern=NULL;
600 paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
601 const dim_t row_block_size=A->row_block_size;
602 const dim_t col_block_size=A->col_block_size;
603 const dim_t block_size = A->block_size;
604 const dim_t P_block_size = P->block_size;
605 const double ZERO = 0.0;
606 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
607 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
608 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
609 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
610 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
611 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
612 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
613 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
614 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
615 index_t sum, i, j, k, iptr, irb, icb, ib;
616 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
617 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
618 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
619 index_t row_marker_ext, *where_p=NULL;
620 index_t **send_ptr=NULL, **send_idx=NULL;
621 dim_t p, num_neighbors;
622 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
623 Esys_MPI_rank *neighbor=NULL;
624 #ifdef ESYS_MPI
625 MPI_Request* mpi_requests=NULL;
626 MPI_Status* mpi_stati=NULL;
627 #else
628 int *mpi_requests=NULL, *mpi_stati=NULL;
629 #endif
630
631 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
632 return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
633
634 /* two sparse matrices R_main and R_couple will be generated, as the
635 transposes of P_main and P_col_couple, respectively. Note that
636 R_couple is actually the row_coupleBlock of R (R=P^T) */
637 R_main = paso::SparseMatrix_getTranspose(P->mainBlock);
638 if (size > 1)
639 R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock);
640
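
SparseMatrix_getTranspose performs the standard CSR transpose: count entries
per column, prefix-sum the counts into row offsets of the transpose, then
scatter. A self-contained sketch with scalar entries (names are illustrative,
not the Paso implementation):

    #include <vector>

    // Transpose an m x n CSR matrix (ptr, idx, val) into the n x m CSR
    // matrix (tptr, tidx, tval).
    void csrTranspose(int m, int n,
                      const std::vector<int>& ptr, const std::vector<int>& idx,
                      const std::vector<double>& val,
                      std::vector<int>& tptr, std::vector<int>& tidx,
                      std::vector<double>& tval)
    {
        tptr.assign(n+1, 0);
        tidx.resize(idx.size());
        tval.resize(val.size());
        for (size_t k = 0; k < idx.size(); k++) tptr[idx[k]+1]++; // column counts
        for (int j = 0; j < n; j++) tptr[j+1] += tptr[j];         // prefix sum
        std::vector<int> next(tptr.begin(), tptr.end()-1);        // insert cursors
        for (int i = 0; i < m; i++)
            for (int k = ptr[i]; k < ptr[i+1]; k++) {
                const int dst = next[idx[k]]++;
                tidx[dst] = i;
                tval[dst] = val[k];
            }
    }
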
641 /* generate P_ext, i.e. the portion of P that is stored on neighbour
642 procs and needed locally for the triple matrix product RAP.
643 To be specific, P_ext (on processor k) is the group of rows in P where
644 the list of rows from processor q is given by
645 A->col_coupler->connector->send->shared[rPtr...]
646 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
647 on q.
648 P_ext is represented by two sparse matrices P_ext_main and
649 P_ext_couple */
650 Paso_Preconditioner_AMG_extendB(A, P);
651
652 /* count the number of cols in P_ext_couple, resize the pattern of
653 sparse matrix P_ext_couple with new compressed order, and then
654 build the col id mapping from P->col_coupleBlock to
655 P_ext_couple */
656 num_Pmain_cols = P->mainBlock->numCols;
657 if (size > 1) {
658 num_Pcouple_cols = P->col_coupleBlock->numCols;
659 num_Acouple_cols = A->col_coupleBlock->numCols;
660 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
661 } else {
662 num_Pcouple_cols = 0;
663 num_Acouple_cols = 0;
664 sum = 0;
665 }
666 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
667 offset = P->col_distribution->first_component[rank];
668 num_Pext_cols = 0;
669 if (P->global_id) {
670 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
671 and P->col_coupleBlock */
672 iptr = 0;
673 if (num_Pcouple_cols || sum > 0) {
674 temp = new index_t[num_Pcouple_cols+sum];
675 #pragma omp parallel for lastprivate(iptr) schedule(static)
676 for (iptr=0; iptr<sum; iptr++){
677 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
678 }
679 for (j=0; j<num_Pcouple_cols; j++, iptr++){
680 temp[iptr] = P->global_id[j];
681 }
682 }
683 if (iptr) {
684 #ifdef USE_QSORTG
685 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
686 #else
687 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
688 #endif
689 num_Pext_cols = 1;
690 i = temp[0];
691 for (j=1; j<iptr; j++) {
692 if (temp[j] > i) {
693 i = temp[j];
694 temp[num_Pext_cols++] = i;
695 }
696 }
697 }
698 /* resize the pattern of P_ext_couple */
699 if(num_Pext_cols){
700 global_id_P = new index_t[num_Pext_cols];
701 #pragma omp parallel for private(i) schedule(static)
702 for (i=0; i<num_Pext_cols; i++)
703 global_id_P[i] = temp[i];
704 }
705 if (num_Pcouple_cols || sum > 0)
706 delete[] temp;
707 #pragma omp parallel for private(i, where_p) schedule(static)
708 for (i=0; i<sum; i++) {
709 where_p = (index_t *)bsearch(
710 &(P->remote_coupleBlock->pattern->index[i]),
711 global_id_P, num_Pext_cols,
712 sizeof(index_t), paso::comparIndex);
713 P->remote_coupleBlock->pattern->index[i] =
714 (index_t)(where_p -global_id_P);
715 }
716
717 /* build the mapping */
718 if (num_Pcouple_cols) {
719 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
720 iptr = 0;
721 for (i=0; i<num_Pext_cols; i++)
722 if (global_id_P[i] == P->global_id[iptr]) {
723 Pcouple_to_Pext[iptr++] = i;
724 if (iptr == num_Pcouple_cols) break;
725 }
726 }
727 }
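
The block above compresses a set of global column ids into a dense local
numbering: sort, de-duplicate, then locate each id by binary search; the same
idiom recurs below for the RAP_ext columns. A standalone sketch using
std::sort/std::lower_bound in place of qsortG/bsearch (names are illustrative):

    #include <algorithm>
    #include <vector>

    // Map each global id to a local index 0..k-1, where k is the number of
    // distinct ids. uniqueIds plays the role of global_id_P above.
    std::vector<int> compressIds(const std::vector<int>& ids,
                                 std::vector<int>& uniqueIds)
    {
        uniqueIds = ids;
        std::sort(uniqueIds.begin(), uniqueIds.end());
        uniqueIds.erase(std::unique(uniqueIds.begin(), uniqueIds.end()),
                        uniqueIds.end());
        std::vector<int> local(ids.size());
        for (size_t i = 0; i < ids.size(); i++)
            local[i] = (int)(std::lower_bound(uniqueIds.begin(), uniqueIds.end(),
                                              ids[i]) - uniqueIds.begin());
        return local;
    }
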
728
729 /* allocate and initialise the markers */
730 sum = num_Pext_cols + num_Pmain_cols;
731 P_marker = new index_t[sum];
732 A_marker = new index_t[num_A_cols];
733 #pragma omp parallel for private(i) schedule(static)
734 for (i=0; i<sum; i++) P_marker[i] = -1;
735 #pragma omp parallel for private(i) schedule(static)
736 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
737
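
The counting pass that follows (and the fill pass after it) is
Gustavson-style: a marker array remembers, per output row, which columns have
already been seen, so the work is proportional to the number of flops. A
minimal sketch of the counting pass for a scalar product C = R*P (hypothetical
names; the real code additionally splits columns between main and couple
blocks):

    #include <vector>

    // Count nnz of C = R*P: marker[c] >= rowStart means column c was already
    // counted for the current row. Same role as P_marker/A_marker below.
    int countProductNnz(int nRows, int nCols,
                        const std::vector<int>& Rptr, const std::vector<int>& Ridx,
                        const std::vector<int>& Pptr, const std::vector<int>& Pidx)
    {
        std::vector<int> marker(nCols, -1);
        int nnz = 0;
        for (int r = 0; r < nRows; r++) {
            const int rowStart = nnz;
            for (int j = Rptr[r]; j < Rptr[r+1]; j++) {
                const int k = Ridx[j];
                for (int t = Pptr[k]; t < Pptr[k+1]; t++) {
                    const int c = Pidx[t];
                    if (marker[c] < rowStart) { marker[c] = nnz; nnz++; }
                }
            }
        }
        return nnz;
    }
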
738 /* Now, count the size of RAP_ext. Start with rows in R_couple */
739 sum = 0;
740 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
741 row_marker = sum;
742 /* then, loop over elements in row i_r of R_couple */
743 j1_ub = R_couple->pattern->ptr[i_r+1];
744 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
745 i1 = R_couple->pattern->index[j1];
746 /* then, loop over elements in row i1 of A->col_coupleBlock */
747 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
748 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
749 i2 = A->col_coupleBlock->pattern->index[j2];
750
751 /* check whether entry RA[i_r, i2] has been previously visited.
752 A new RAP entry is possible only if entry RA[i_r, i2] has not
753 been visited yet */
754 if (A_marker[i2] != i_r) {
755 /* first, mark entry RA[i_r,i2] as visited */
756 A_marker[i2] = i_r;
757
758 /* then loop over elements in row i2 of P_ext_main */
759 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
760 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
761 i_c = P->row_coupleBlock->pattern->index[j3];
762
763 /* check whether entry RAP[i_r,i_c] has been created.
764 If not yet created, create the entry and increase
765 the total number of elements in RAP_ext */
766 if (P_marker[i_c] < row_marker) {
767 P_marker[i_c] = sum;
768 sum++;
769 }
770 }
771
772 /* loop over elements in row i2 of P_ext_couple, do the same */
773 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
774 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
775 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
776 if (P_marker[i_c] < row_marker) {
777 P_marker[i_c] = sum;
778 sum++;
779 }
780 }
781 }
782 }
783
784 /* now loop over elements in row i1 of A->mainBlock, repeat
785 the process we have done to block A->col_coupleBlock */
786 j2_ub = A->mainBlock->pattern->ptr[i1+1];
787 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
788 i2 = A->mainBlock->pattern->index[j2];
789 if (A_marker[i2 + num_Acouple_cols] != i_r) {
790 A_marker[i2 + num_Acouple_cols] = i_r;
791 j3_ub = P->mainBlock->pattern->ptr[i2+1];
792 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
793 i_c = P->mainBlock->pattern->index[j3];
794 if (P_marker[i_c] < row_marker) {
795 P_marker[i_c] = sum;
796 sum++;
797 }
798 }
799 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
800 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
801 /* note that we need to map the column index in
802 P->col_coupleBlock back into the column index in
803 P_ext_couple */
804 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
805 if (P_marker[i_c] < row_marker) {
806 P_marker[i_c] = sum;
807 sum++;
808 }
809 }
810 }
811 }
812 }
813 }
814
815 /* Now we have the number of non-zero elements in RAP_ext; allocate
816 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
817 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
818 RAP_ext_idx = new index_t[sum];
819 RAP_ext_val = new double[sum * block_size];
820 RA_val = new double[block_size];
821 RAP_val = new double[block_size];
822
823 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
824 sum = num_Pext_cols + num_Pmain_cols;
825 #pragma omp parallel for private(i) schedule(static)
826 for (i=0; i<sum; i++) P_marker[i] = -1;
827 #pragma omp parallel for private(i) schedule(static)
828 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
829 sum = 0;
830 RAP_ext_ptr[0] = 0;
831 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
832 row_marker = sum;
833 /* then, loop over elements in row i_r of R_couple */
834 j1_ub = R_couple->pattern->ptr[i_r+1];
835 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
836 i1 = R_couple->pattern->index[j1];
837 R_val = &(R_couple->val[j1*P_block_size]);
838
839 /* then, loop over elements in row i1 of A->col_coupleBlock */
840 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
841 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
842 i2 = A->col_coupleBlock->pattern->index[j2];
843 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
844 for (irb=0; irb<row_block_size; irb++)
845 for (icb=0; icb<col_block_size; icb++)
846 RA_val[irb+row_block_size*icb] = ZERO;
847 for (irb=0; irb<P_block_size; irb++) {
848 rtmp=R_val[irb];
849 for (icb=0; icb<col_block_size; icb++) {
850 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
851 }
852 }
853
854 /* check whether entry RA[i_r, i2] has been previously visited.
855 A new RAP entry is possible only if entry RA[i_r, i2] has not
856 been visited yet */
857 if (A_marker[i2] != i_r) {
858 /* first, mark entry RA[i_r,i2] as visited */
859 A_marker[i2] = i_r;
860
861 /* then loop over elements in row i2 of P_ext_main */
862 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
863 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
864 i_c = P->row_coupleBlock->pattern->index[j3];
865 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
866 for (irb=0; irb<row_block_size; irb++)
867 for (icb=0; icb<col_block_size; icb++)
868 RAP_val[irb+row_block_size*icb] = ZERO;
869 for (icb=0; icb<P_block_size; icb++) {
870 rtmp = temp_val[icb];
871 for (irb=0; irb<row_block_size; irb++) {
872 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
873 }
874 }
875
876 /* check whether entry RAP[i_r,i_c] has been created.
877 If not yet created, create the entry and increase
878 the total number of elements in RAP_ext */
879 if (P_marker[i_c] < row_marker) {
880 P_marker[i_c] = sum;
881 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
882 RAP_ext_idx[sum] = i_c + offset;
883 sum++;
884 } else {
885 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
886 for (ib=0; ib<block_size; ib++) {
887 temp_val[ib] += RAP_val[ib];
888 }
889 }
890 }
891
892 /* loop over elements in row i2 of P_ext_couple, do the same */
893 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
894 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
895 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
896 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
897 for (irb=0; irb<row_block_size; irb++)
898 for (icb=0; icb<col_block_size; icb++)
899 RAP_val[irb+row_block_size*icb]=ZERO;
900 for (icb=0; icb<P_block_size; icb++) {
901 rtmp = temp_val[icb];
902 for (irb=0; irb<row_block_size; irb++) {
903 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
904 }
905 }
906
907 if (P_marker[i_c] < row_marker) {
908 P_marker[i_c] = sum;
909 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
910 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
911 sum++;
912 } else {
913 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
914 for (ib=0; ib<block_size; ib++) {
915 temp_val[ib] += RAP_val[ib];
916 }
917 }
918 }
919
920 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
921 is created; only the contributions are added. */
922 } else {
923 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
924 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
925 i_c = P->row_coupleBlock->pattern->index[j3];
926 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
927 for (irb=0; irb<row_block_size; irb++)
928 for (icb=0; icb<col_block_size; icb++)
929 RAP_val[irb+row_block_size*icb]=ZERO;
930 for (icb=0; icb<P_block_size; icb++) {
931 rtmp = temp_val[icb];
932 for (irb=0; irb<row_block_size; irb++) {
933 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
934 }
935 }
936
937 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
938 for (ib=0; ib<block_size; ib++) {
939 temp_val[ib] += RAP_val[ib];
940 }
941 }
942 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
943 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
944 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
945 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
946 for (irb=0; irb<row_block_size; irb++)
947 for (icb=0; icb<col_block_size; icb++)
948 RAP_val[irb+row_block_size*icb]=ZERO;
949 for (icb=0; icb<P_block_size; icb++) {
950 rtmp = temp_val[icb];
951 for (irb=0; irb<row_block_size; irb++) {
952 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
953 }
954 }
955
956 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
957 for (ib=0; ib<block_size; ib++) {
958 temp_val[ib] += RAP_val[ib];
959 }
960 }
961 }
962 }
963
964 /* now loop over elements in row i1 of A->mainBlock, repeat
965 the process we have done to block A->col_coupleBlock */
966 j2_ub = A->mainBlock->pattern->ptr[i1+1];
967 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
968 i2 = A->mainBlock->pattern->index[j2];
969 temp_val = &(A->mainBlock->val[j2*block_size]);
970 for (irb=0; irb<row_block_size; irb++)
971 for (icb=0; icb<col_block_size; icb++)
972 RA_val[irb+row_block_size*icb]=ZERO;
973 for (irb=0; irb<P_block_size; irb++) {
974 rtmp=R_val[irb];
975 for (icb=0; icb<col_block_size; icb++) {
976 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
977 }
978 }
979
980 if (A_marker[i2 + num_Acouple_cols] != i_r) {
981 A_marker[i2 + num_Acouple_cols] = i_r;
982 j3_ub = P->mainBlock->pattern->ptr[i2+1];
983 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
984 i_c = P->mainBlock->pattern->index[j3];
985 temp_val = &(P->mainBlock->val[j3*P_block_size]);
986 for (irb=0; irb<row_block_size; irb++)
987 for (icb=0; icb<col_block_size; icb++)
988 RAP_val[irb+row_block_size*icb]=ZERO;
989 for (icb=0; icb<P_block_size; icb++) {
990 rtmp = temp_val[icb];
991 for (irb=0; irb<row_block_size; irb++) {
992 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
993 }
994 }
995
996 if (P_marker[i_c] < row_marker) {
997 P_marker[i_c] = sum;
998 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
999 RAP_ext_idx[sum] = i_c + offset;
1000 sum++;
1001 } else {
1002 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1003 for (ib=0; ib<block_size; ib++) {
1004 temp_val[ib] += RAP_val[ib];
1005 }
1006 }
1007 }
1008 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1009 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1010 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1011 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1012 for (irb=0; irb<row_block_size; irb++)
1013 for (icb=0; icb<col_block_size; icb++)
1014 RAP_val[irb+row_block_size*icb]=ZERO;
1015 for (icb=0; icb<P_block_size; icb++) {
1016 rtmp=temp_val[icb];
1017 for (irb=0; irb<row_block_size; irb++) {
1018 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1019 }
1020 }
1021
1022 if (P_marker[i_c] < row_marker) {
1023 P_marker[i_c] = sum;
1024 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1025 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1026 sum++;
1027 } else {
1028 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1029 for (ib=0; ib<block_size; ib++) {
1030 temp_val[ib] += RAP_val[ib];
1031 }
1032 }
1033 }
1034 } else {
1035 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1036 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1037 i_c = P->mainBlock->pattern->index[j3];
1038 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1039 for (irb=0; irb<row_block_size; irb++)
1040 for (icb=0; icb<col_block_size; icb++)
1041 RAP_val[irb+row_block_size*icb]=ZERO;
1042 for (icb=0; icb<P_block_size; icb++) {
1043 rtmp=temp_val[icb];
1044 for (irb=0; irb<row_block_size; irb++) {
1045 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1046 }
1047 }
1048
1049 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1050 for (ib=0; ib<block_size; ib++) {
1051 temp_val[ib] += RAP_val[ib];
1052 }
1053 }
1054 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1055 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1056 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1057 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1058 for (irb=0; irb<row_block_size; irb++)
1059 for (icb=0; icb<col_block_size; icb++)
1060 RAP_val[irb+row_block_size*icb]=ZERO;
1061 for (icb=0; icb<P_block_size; icb++) {
1062 rtmp=temp_val[icb];
1063 for (irb=0; irb<row_block_size; irb++) {
1064 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1065 }
1066 }
1067
1068 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1069 for (ib=0; ib<block_size; ib++) {
1070 temp_val[ib] += RAP_val[ib];
1071 }
1072 }
1073 }
1074 }
1075 }
1076 RAP_ext_ptr[i_r+1] = sum;
1077 }
1078 delete[] P_marker;
1079 delete[] Pcouple_to_Pext;
1080
1081 /* Now we have the part of RAP[r,c] where the rows "r" are those
1082 given by the column list of P->col_coupleBlock, and the columns
1083 "c" may cover the whole column range of the system matrix P.
1084 This part of the data is to be passed to the neighbouring
1085 processors and added to the corresponding RAP[r,c] entries on
1086 those processors */
1087 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1088 &RAP_ext_val, global_id_P, block_size);
1089
1090 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1091 sum = RAP_ext_ptr[num_RAPext_rows];
1092 num_RAPext_cols = 0;
1093 if (num_Pext_cols || sum > 0) {
1094 temp = new index_t[num_Pext_cols+sum];
1095 j1_ub = offset + num_Pmain_cols;
1096 for (i=0, iptr=0; i<sum; i++) {
1097 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1098 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1099 }
1100 for (j=0; j<num_Pext_cols; j++, iptr++){
1101 temp[iptr] = global_id_P[j];
1102 }
1103
1104 if (iptr) {
1105 #ifdef USE_QSORTG
1106 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1107 #else
1108 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
1109 #endif
1110 num_RAPext_cols = 1;
1111 i = temp[0];
1112 for (j=1; j<iptr; j++) {
1113 if (temp[j] > i) {
1114 i = temp[j];
1115 temp[num_RAPext_cols++] = i;
1116 }
1117 }
1118 }
1119 }
1120
1121 /* store the distinct global column ids of RAP_ext */
1122 if(num_RAPext_cols){
1123 global_id_RAP = new index_t[num_RAPext_cols];
1124 #pragma omp parallel for private(i) schedule(static)
1125 for (i=0; i<num_RAPext_cols; i++)
1126 global_id_RAP[i] = temp[i];
1127 }
1128 if (num_Pext_cols || sum > 0)
1129 delete[] temp;
1130 j1_ub = offset + num_Pmain_cols;
1131 #pragma omp parallel for private(i, where_p) schedule(static)
1132 for (i=0; i<sum; i++) {
1133 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1134 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1135 /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex);
1136 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1137 } else
1138 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1139 }
1140
1141 /* build the mapping */
1142 if (num_Pcouple_cols) {
1143 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1144 iptr = 0;
1145 for (i=0; i<num_RAPext_cols; i++)
1146 if (global_id_RAP[i] == P->global_id[iptr]) {
1147 Pcouple_to_RAP[iptr++] = i;
1148 if (iptr == num_Pcouple_cols) break;
1149 }
1150 }
1151
1152 if (num_Pext_cols) {
1153 Pext_to_RAP = new index_t[num_Pext_cols];
1154 iptr = 0;
1155 for (i=0; i<num_RAPext_cols; i++)
1156 if (global_id_RAP[i] == global_id_P[iptr]) {
1157 Pext_to_RAP[iptr++] = i;
1158 if (iptr == num_Pext_cols) break;
1159 }
1160 }
1161
1162 if (global_id_P){
1163 delete[] global_id_P;
1164 global_id_P = NULL;
1165 }
1166
1167 /* allocate and initialise the markers */
1168 sum = num_RAPext_cols + num_Pmain_cols;
1169 P_marker = new index_t[sum];
1170 #pragma omp parallel for private(i) schedule(static)
1171 for (i=0; i<sum; i++) P_marker[i] = -1;
1172 #pragma omp parallel for private(i) schedule(static)
1173 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1174
1175 /* Now, count the size of RAP. Start with rows in R_main */
1176 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1177 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1178 shared = P->col_coupler->connector->send->shared;
1179 i = 0;
1180 j = 0;
1181 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1182 /* Mark the start of row for both main block and couple block */
1183 row_marker = i;
1184 row_marker_ext = j;
1185
1186 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1187 in RAP_ext */
1188 P_marker[i_r] = i;
1189 i++;
1190 for (j1=0; j1<num_neighbors; j1++) {
1191 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1192 if (shared[j2] == i_r) {
1193 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1194 i_c = RAP_ext_idx[k];
1195 if (i_c < num_Pmain_cols) {
1196 if (P_marker[i_c] < row_marker) {
1197 P_marker[i_c] = i;
1198 i++;
1199 }
1200 } else {
1201 if (P_marker[i_c] < row_marker_ext) {
1202 P_marker[i_c] = j;
1203 j++;
1204 }
1205 }
1206 }
1207 break;
1208 }
1209 }
1210 }
1211
1212 /* then, loop over elements in row i_r of R_main */
1213 j1_ub = R_main->pattern->ptr[i_r+1];
1214 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1215 i1 = R_main->pattern->index[j1];
1216
1217 /* then, loop over elements in row i1 of A->col_coupleBlock */
1218 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1219 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1220 i2 = A->col_coupleBlock->pattern->index[j2];
1221
1222 /* check whether entry RA[i_r, i2] has been previously visited.
1223 A new RAP entry is possible only if entry RA[i_r, i2] has not
1224 been visited yet */
1225 if (A_marker[i2] != i_r) {
1226 /* first, mark entry RA[i_r,i2] as visited */
1227 A_marker[i2] = i_r;
1228
1229 /* then loop over elements in row i2 of P_ext_main */
1230 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1231 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232 i_c = P->row_coupleBlock->pattern->index[j3];
1233
1234 /* check whether entry RAP[i_r,i_c] has been created.
1235 If not yet created, create the entry and increase
1236 the total number of elements in RAP_main */
1237 if (P_marker[i_c] < row_marker) {
1238 P_marker[i_c] = i;
1239 i++;
1240 }
1241 }
1242
1243 /* loop over elements in row i2 of P_ext_couple, do the same */
1244 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1245 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1246 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1247 if (P_marker[i_c] < row_marker_ext) {
1248 P_marker[i_c] = j;
1249 j++;
1250 }
1251 }
1252 }
1253 }
1254
1255 /* now loop over elements in row i1 of A->mainBlock, repeat
1256 the process we have done to block A->col_coupleBlock */
1257 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1258 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1259 i2 = A->mainBlock->pattern->index[j2];
1260 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1261 A_marker[i2 + num_Acouple_cols] = i_r;
1262 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1263 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1264 i_c = P->mainBlock->pattern->index[j3];
1265 if (P_marker[i_c] < row_marker) {
1266 P_marker[i_c] = i;
1267 i++;
1268 }
1269 }
1270 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1271 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1272 /* note that we need to map the column index in
1273 P->col_coupleBlock back into the column index in
1274 P_ext_couple */
1275 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1276 if (P_marker[i_c] < row_marker_ext) {
1277 P_marker[i_c] = j;
1278 j++;
1279 }
1280 }
1281 }
1282 }
1283 }
1284 }
1285
1286 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1287 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1288 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1289 RAP_couple */
1290 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1291 RAP_main_idx = new index_t[i];
1292 RAP_main_val = new double[i * block_size];
1293 RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1294 RAP_couple_idx = new index_t[j];
1295 RAP_couple_val = new double[j * block_size];
1296
1297 RAP_main_ptr[num_Pmain_cols] = i;
1298 RAP_couple_ptr[num_Pmain_cols] = j;
1299
1300 #pragma omp parallel for private(i) schedule(static)
1301 for (i=0; i<sum; i++) P_marker[i] = -1;
1302 #pragma omp parallel for private(i) schedule(static)
1303 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1304
1305 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1306 in R_main */
1307 i = 0;
1308 j = 0;
1309 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1310 /* Mark the start of row for both main block and couple block */
1311 row_marker = i;
1312 row_marker_ext = j;
1313 RAP_main_ptr[i_r] = row_marker;
1314 RAP_couple_ptr[i_r] = row_marker_ext;
1315
1316 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1317 in row i_r of RAP_ext */
1318 P_marker[i_r] = i;
1319 for (ib=0; ib<block_size; ib++)
1320 RAP_main_val[i*block_size+ib] = ZERO;
1321 RAP_main_idx[i] = i_r;
1322 i++;
1323
1324 for (j1=0; j1<num_neighbors; j1++) {
1325 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1326 if (shared[j2] == i_r) {
1327 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1328 i_c = RAP_ext_idx[k];
1329 if (i_c < num_Pmain_cols) {
1330 if (P_marker[i_c] < row_marker) {
1331 P_marker[i_c] = i;
1332 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1333 RAP_main_idx[i] = i_c;
1334 i++;
1335 } else {
1336 t1_val = &(RAP_ext_val[k*block_size]);
1337 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1338 for (ib=0; ib<block_size; ib++)
1339 t2_val[ib] += t1_val[ib];
1340 }
1341 } else {
1342 if (P_marker[i_c] < row_marker_ext) {
1343 P_marker[i_c] = j;
1344 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1345 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1346 j++;
1347 } else {
1348 t1_val = &(RAP_ext_val[k*block_size]);
1349 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1350 for (ib=0; ib<block_size; ib++)
1351 t2_val[ib] += t1_val[ib];
1352 }
1353 }
1354 }
1355 break;
1356 }
1357 }
1358 }
1359
1360 /* then, loop over elements in row i_r of R_main */
1361 j1_ub = R_main->pattern->ptr[i_r+1];
1362 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1363 i1 = R_main->pattern->index[j1];
1364 R_val = &(R_main->val[j1*P_block_size]);
1365
1366 /* then, loop over elements in row i1 of A->col_coupleBlock */
1367 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1368 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1369 i2 = A->col_coupleBlock->pattern->index[j2];
1370 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1371 for (irb=0; irb<row_block_size; irb++)
1372 for (icb=0; icb<col_block_size; icb++)
1373 RA_val[irb+row_block_size*icb]=ZERO;
1374 for (irb=0; irb<P_block_size; irb++) {
1375 rtmp=R_val[irb];
1376 for (icb=0; icb<col_block_size; icb++) {
1377 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1378 }
1379 }
1380
1381
1382 /* check whether entry RA[i_r, i2] has been previously visited.
1383 A new RAP entry is possible only if entry RA[i_r, i2] has not
1384 been visited yet */
1385 if (A_marker[i2] != i_r) {
1386 /* first, mark entry RA[i_r,i2] as visited */
1387 A_marker[i2] = i_r;
1388
1389 /* then loop over elements in row i2 of P_ext_main */
1390 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1391 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1392 i_c = P->row_coupleBlock->pattern->index[j3];
1393 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1394 for (irb=0; irb<row_block_size; irb++)
1395 for (icb=0; icb<col_block_size; icb++)
1396 RAP_val[irb+row_block_size*icb]=ZERO;
1397 for (icb=0; icb<P_block_size; icb++) {
1398 rtmp=temp_val[icb];
1399 for (irb=0; irb<row_block_size; irb++) {
1400 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1401 }
1402 }
1403
1404
1405 /* check whether entry RAP[i_r,i_c] has been created.
1406 If not yet created, create the entry and increase
1407 the total number of elements in RAP_main */
1408 if (P_marker[i_c] < row_marker) {
1409 P_marker[i_c] = i;
1410 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1411 RAP_main_idx[i] = i_c;
1412 i++;
1413 } else {
1414 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1415 for (ib=0; ib<block_size; ib++) {
1416 temp_val[ib] += RAP_val[ib];
1417 }
1418 }
1419 }
1420
1421 /* loop over elements in row i2 of P_ext_couple, do the same */
1422 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1423 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1424 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1425 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1426 for (irb=0; irb<row_block_size; irb++)
1427 for (icb=0; icb<col_block_size; icb++)
1428 RAP_val[irb+row_block_size*icb]=ZERO;
1429 for (icb=0; icb<P_block_size; icb++) {
1430 rtmp=temp_val[icb];
1431 for (irb=0; irb<row_block_size; irb++) {
1432 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1433 }
1434 }
1435
1436 if (P_marker[i_c] < row_marker_ext) {
1437 P_marker[i_c] = j;
1438 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1439 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1440 j++;
1441 } else {
1442 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1443 for (ib=0; ib<block_size; ib++) {
1444 temp_val[ib] += RAP_val[ib];
1445 }
1446 }
1447 }
1448
1449 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
1450 is created; only the contributions are added. */
1451 } else {
1452 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1453 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1454 i_c = P->row_coupleBlock->pattern->index[j3];
1455 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1456 for (irb=0; irb<row_block_size; irb++)
1457 for (icb=0; icb<col_block_size; icb++)
1458 RAP_val[irb+row_block_size*icb]=ZERO;
1459 for (icb=0; icb<P_block_size; icb++) {
1460 rtmp=temp_val[icb];
1461 for (irb=0; irb<row_block_size; irb++) {
1462 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1463 }
1464 }
1465
1466 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1467 for (ib=0; ib<block_size; ib++) {
1468 temp_val[ib] += RAP_val[ib];
1469 }
1470 }
1471 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1472 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1473 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1474 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1475 for (irb=0; irb<row_block_size; irb++)
1476 for (icb=0; icb<col_block_size; icb++)
1477 RAP_val[irb+row_block_size*icb]=ZERO;
1478 for (icb=0; icb<P_block_size; icb++) {
1479 rtmp=temp_val[icb];
1480 for (irb=0; irb<row_block_size; irb++) {
1481 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1482 }
1483 }
1484
1485 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1486 for (ib=0; ib<block_size; ib++) {
1487 temp_val[ib] += RAP_val[ib];
1488 }
1489 }
1490 }
1491 }
1492
1493 /* now loop over elements in row i1 of A->mainBlock, repeat
1494 the process we have done to block A->col_coupleBlock */
1495 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1496 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1497 i2 = A->mainBlock->pattern->index[j2];
1498 temp_val = &(A->mainBlock->val[j2*block_size]);
1499 for (irb=0; irb<row_block_size; irb++)
1500 for (icb=0; icb<col_block_size; icb++)
1501 RA_val[irb+row_block_size*icb]=ZERO;
1502 for (irb=0; irb<P_block_size; irb++) {
1503 rtmp=R_val[irb];
1504 for (icb=0; icb<col_block_size; icb++) {
1505 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1506 }
1507 }
1508
1509 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1510 A_marker[i2 + num_Acouple_cols] = i_r;
1511 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1512 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1513 i_c = P->mainBlock->pattern->index[j3];
1514 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1515 for (irb=0; irb<row_block_size; irb++)
1516 for (icb=0; icb<col_block_size; icb++)
1517 RAP_val[irb+row_block_size*icb]=ZERO;
1518 for (icb=0; icb<P_block_size; icb++) {
1519 rtmp=temp_val[icb];
1520 for (irb=0; irb<row_block_size; irb++) {
1521 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1522 }
1523 }
1524 if (P_marker[i_c] < row_marker) {
1525 P_marker[i_c] = i;
1526 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1527 RAP_main_idx[i] = i_c;
1528 i++;
1529 } else {
1530 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1531 for (ib=0; ib<block_size; ib++) {
1532 temp_val[ib] += RAP_val[ib];
1533 }
1534 }
1535 }
1536 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1537 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1538 /* note that we need to map the column index in
1539 P->col_coupleBlock back into the column index in
1540 P_ext_couple */
1541 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1542 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1543 for (irb=0; irb<row_block_size; irb++)
1544 for (icb=0; icb<col_block_size; icb++)
1545 RAP_val[irb+row_block_size*icb]=ZERO;
1546 for (icb=0; icb<P_block_size; icb++) {
1547 rtmp=temp_val[icb];
1548 for (irb=0; irb<row_block_size; irb++) {
1549 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1550 }
1551 }
1552
1553 if (P_marker[i_c] < row_marker_ext) {
1554 P_marker[i_c] = j;
1555 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1556 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1557 j++;
1558 } else {
1559 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1560 for (ib=0; ib<block_size; ib++) {
1561 temp_val[ib] += RAP_val[ib];
1562 }
1563 }
1564 }
1565
1566 } else {
1567 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1568 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1569 i_c = P->mainBlock->pattern->index[j3];
1570 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1571 for (irb=0; irb<row_block_size; irb++)
1572 for (icb=0; icb<col_block_size; icb++)
1573 RAP_val[irb+row_block_size*icb]=ZERO;
1574 for (icb=0; icb<P_block_size; icb++) {
1575 rtmp=temp_val[icb];
1576 for (irb=0; irb<row_block_size; irb++) {
1577 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1578 }
1579 }
1580
1581 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1582 for (ib=0; ib<block_size; ib++) {
1583 temp_val[ib] += RAP_val[ib];
1584 }
1585 }
1586 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1587 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1588 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1589 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1590 for (irb=0; irb<row_block_size; irb++)
1591 for (icb=0; icb<col_block_size; icb++)
1592 RAP_val[irb+row_block_size*icb]=ZERO;
1593 for (icb=0; icb<P_block_size; icb++) {
1594 rtmp = temp_val[icb];
1595 for (irb=0; irb<row_block_size; irb++) {
1596 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1597 }
1598 }
1599
1600 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1601 for (ib=0; ib<block_size; ib++) {
1602 temp_val[ib] += RAP_val[ib];
1603 }
1604 }
1605 }
1606 }
1607 }
1608
1609 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1610 if (i > row_marker) {
1611 offset = i - row_marker;
1612 temp = new index_t[offset];
1613 #pragma omp parallel for schedule(static) private(iptr)
1614 for (iptr=0; iptr<offset; iptr++)
1615 temp[iptr] = RAP_main_idx[row_marker+iptr];
1616 if (offset > 0) {
1617 #ifdef USE_QSORTG
1618 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1619 #else
1620 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1621 #endif
1622 }
1623 temp_val = new double[offset * block_size];
1624 #pragma omp parallel for schedule(static) private(iptr,k)
1625 for (iptr=0; iptr<offset; iptr++){
1626 k = P_marker[temp[iptr]];
1627 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1628 P_marker[temp[iptr]] = iptr + row_marker;
1629 }
1630 #pragma omp parallel for schedule(static) private(iptr)
1631 for (iptr=0; iptr<offset; iptr++){
1632 RAP_main_idx[row_marker+iptr] = temp[iptr];
1633 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1634 }
1635 delete[] temp;
1636 delete[] temp_val;
1637 }
1638 if (j > row_marker_ext) {
1639 offset = j - row_marker_ext;
1640 temp = new index_t[offset];
1641 #pragma omp parallel for schedule(static) private(iptr)
1642 for (iptr=0; iptr<offset; iptr++)
1643 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1644 if (offset > 0) {
1645 #ifdef USE_QSORTG
1646 qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1647 #else
1648 qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex);
1649 #endif
1650 }
1651 temp_val = new double[offset * block_size];
1652 #pragma omp parallel for schedule(static) private(iptr, k)
1653 for (iptr=0; iptr<offset; iptr++){
1654 k = P_marker[temp[iptr] + num_Pmain_cols];
1655 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1656 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1657 }
1658 #pragma omp parallel for schedule(static) private(iptr)
1659 for (iptr=0; iptr<offset; iptr++){
1660 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1661 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1662 }
1663 delete[] temp;
1664 delete[] temp_val;
1665 }
1666 }
1667
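/* The assembly loops above rely on the classic sparse-accumulator
   ("marker") pattern: P_marker[c] records the position at which column c
   of the current row was first created, so a later visit accumulates into
   the existing entry instead of creating a duplicate. A minimal sketch of
   the idea with hypothetical names (nnz is the running entry counter,
   row_start the first entry of the current row):

       for (row = 0; row < n_rows; row++) {
           row_start = nnz;
           // for each contribution (col, contribution) to this row:
           if (marker[col] < row_start) {     // first visit in this row
               marker[col] = nnz;
               idx[nnz] = col;
               val[nnz] = contribution;
               nnz++;
           } else {                           // revisit: accumulate
               val[marker[col]] += contribution;
           }
       }

   Initialising marker[] to -1 once suffices; comparing against row_start
   instead of resetting per row keeps the cost linear in the number of
   generated entries. */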
1668 delete[] RA_val;
1669 delete[] RAP_val;
1670 delete[] A_marker;
1671 delete[] Pext_to_RAP;
1672 delete[] Pcouple_to_RAP;
1673 delete[] RAP_ext_ptr;
1674 delete[] RAP_ext_idx;
1675 delete[] RAP_ext_val;
1676 paso::SparseMatrix_free(R_main);
1677 paso::SparseMatrix_free(R_couple);
1678
1679 /* Check whether there are empty columns in RAP_couple */
1680 #pragma omp parallel for schedule(static) private(i)
1681 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1682    /* the number of non-empty columns is stored in "k" */
1683 k = 0;
1684 j = RAP_couple_ptr[num_Pmain_cols];
1685 for (i=0; i<j; i++) {
1686 i1 = RAP_couple_idx[i];
1687 if (P_marker[i1]) {
1688 P_marker[i1] = 0;
1689 k++;
1690 }
1691 }
1692
1693    /* empty columns were found, squeeze them out */
1694 if (k < num_RAPext_cols) {
1695 temp = new index_t[k];
1696 k = 0;
1697 for (i=0; i<num_RAPext_cols; i++)
1698 if (!P_marker[i]) {
1699 P_marker[i] = k;
1700 temp[k] = global_id_RAP[i];
1701 k++;
1702 }
1703 #pragma omp parallel for schedule(static) private(i, i1)
1704 for (i=0; i<j; i++) {
1705 i1 = RAP_couple_idx[i];
1706 RAP_couple_idx[i] = P_marker[i1];
1707 }
1708 num_RAPext_cols = k;
1709 delete[] global_id_RAP;
1710 global_id_RAP = temp;
1711 }
1712 delete[] P_marker;
1713
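/* Sketch of the column compression just performed: every column of
   RAP_couple that received no entry is squeezed out, with P_marker
   doubling as the old-to-new column map (hypothetical names):

       new_id = 0;
       for (c = 0; c < n_cols; c++)      // pass 1: number used columns
           if (used(c)) old_to_new[c] = new_id++;
       for (e = 0; e < nnz; e++)         // pass 2: rewrite stored indices
           idx[e] = old_to_new[idx[e]];

   The surviving global ids are copied to a compacted array at the same
   time, which is why global_id_RAP is replaced by "temp" above. */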
1714 /******************************************************/
1715 /* Start to create the coarse level System Matrix A_c */
1716 /******************************************************/
1717 /* first, prepare the sender/receiver for the col_connector */
1718 dist = P->pattern->input_distribution->first_component;
1719 recv_len = new dim_t[size];
1720 send_len = new dim_t[size];
1721 neighbor = new Esys_MPI_rank[size];
1722 offsetInShared = new index_t[size+1];
1723 shared = new index_t[num_RAPext_cols];
1724 memset(recv_len, 0, sizeof(dim_t) * size);
1725 memset(send_len, 0, sizeof(dim_t) * size);
1726 num_neighbors = 0;
1727 offsetInShared[0] = 0;
1728 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1729 shared[i] = i + num_Pmain_cols;
1730 if (k <= global_id_RAP[i]) {
1731 if (recv_len[j] > 0) {
1732 neighbor[num_neighbors] = j;
1733 num_neighbors ++;
1734 offsetInShared[num_neighbors] = i;
1735 }
1736 while (k <= global_id_RAP[i]) {
1737 j++;
1738 k = dist[j+1];
1739 }
1740 }
1741 recv_len[j] ++;
1742 }
1743 if (recv_len[j] > 0) {
1744 neighbor[num_neighbors] = j;
1745 num_neighbors ++;
1746 offsetInShared[num_neighbors] = i;
1747 }
1748
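/* The loop above assigns each (sorted) global column id to its owning
   rank by walking the distribution array in step with the ids: rank q
   owns the ids in [dist[q], dist[q+1]). A compact sketch of the same
   owner lookup, assuming ids[] is sorted ascending:

       for (i = 0, q = 0; i < n_ids; i++) {
           while (ids[i] >= dist[q+1]) q++;   // advance to the owning rank
           recv_count[q]++;                   // one more id to fetch from q
       }

   Because both ids[] and dist[] are sorted, the walk costs
   O(n_ids + size) rather than a binary search per id. */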
1749 paso::SharedComponents_ptr recv(new paso::SharedComponents(
1750 num_Pmain_cols, num_neighbors, neighbor, shared,
1751 offsetInShared, 1, 0, mpi_info));
1752
1753 #ifdef ESYS_MPI
1754 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1755 #endif
1756
1757 #ifdef ESYS_MPI
1758 mpi_requests=new MPI_Request[size*2];
1759 mpi_stati=new MPI_Status[size*2];
1760 #else
1761 mpi_requests=new int[size*2];
1762 mpi_stati=new int[size*2];
1763 #endif
1764 num_neighbors = 0;
1765 j = 0;
1766 offsetInShared[0] = 0;
1767 for (i=0; i<size; i++) {
1768 if (send_len[i] > 0) {
1769 neighbor[num_neighbors] = i;
1770 num_neighbors ++;
1771 j += send_len[i];
1772 offsetInShared[num_neighbors] = j;
1773 }
1774 }
1775 delete[] shared;
1776 shared = new index_t[j];
1777 for (i=0, j=0; i<num_neighbors; i++) {
1778 k = neighbor[i];
1779 #ifdef ESYS_MPI
1780 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1781 mpi_info->msg_tag_counter+k,
1782 mpi_info->comm, &mpi_requests[i]);
1783 #endif
1784 j += send_len[k];
1785 }
1786 for (i=0, j=0; i<recv->numNeighbors; i++) {
1787 k = recv->neighbor[i];
1788 #ifdef ESYS_MPI
1789 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1790 mpi_info->msg_tag_counter+rank,
1791 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1792 #endif
1793 j += recv_len[k];
1794 }
1795 #ifdef ESYS_MPI
1796 MPI_Waitall(num_neighbors + recv->numNeighbors,
1797 mpi_requests, mpi_stati);
1798 #endif
1799 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1800
1801 j = offsetInShared[num_neighbors];
1802 offset = dist[rank];
1803 #pragma omp parallel for schedule(static) private(i)
1804 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1805
1806 paso::SharedComponents_ptr send(new paso::SharedComponents(
1807 num_Pmain_cols, num_neighbors, neighbor, shared,
1808 offsetInShared, 1, 0, mpi_info));
1809
1810 col_connector = paso::Connector_alloc(send, recv);
1811 delete[] shared;
1812
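/* The connector just built follows a common two-phase handshake: an
   MPI_Alltoall on per-rank counts tells every rank how much each peer
   will request, after which matching Irecv/Issend pairs move the actual
   index lists. A hedged sketch of the counting phase (hypothetical
   buffers):

       // want[q]  = number of column ids this rank needs from rank q
       // serve[q] = number of ids rank q will request from this rank
       MPI_Alltoall(want, 1, MPI_INT, serve, 1, MPI_INT, comm);

   Note how the roles flip: the columns this rank receives are rows the
   owner must send, which is why recv_len above sizes the Issend calls
   and send_len the Irecv calls. */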
1813    /* now, create the row distribution (output_dist) and the column
1814       distribution (input_dist) */
1815 input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1816 output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0));
1817
1818    /* then, prepare the sender/receiver for the row_connector; first,
1819       prepare the information for the sender */
1820 sum = RAP_couple_ptr[num_Pmain_cols];
1821 len = new dim_t[size];
1822 send_ptr = new index_t*[size];
1823 send_idx = new index_t*[size];
1824 #pragma omp parallel for schedule(static) private(i)
1825 for (i=0; i<size; i++) {
1826 send_ptr[i] = new index_t[num_Pmain_cols];
1827 send_idx[i] = new index_t[sum];
1828 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1829 }
1830 memset(len, 0, sizeof(dim_t) * size);
1831 recv = col_connector->recv;
1832 sum=0;
1833 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1834 i1 = RAP_couple_ptr[i_r];
1835 i2 = RAP_couple_ptr[i_r+1];
1836 if (i2 > i1) {
1837          /* then row i_r will be in the sender of row_connector; now check
1838             how many neighbours i_r needs to be sent to */
1839 for (j=i1; j<i2; j++) {
1840 i_c = RAP_couple_idx[j];
1841 /* find out the corresponding neighbor "p" of column i_c */
1842 for (p=0; p<recv->numNeighbors; p++) {
1843 if (i_c < recv->offsetInShared[p+1]) {
1844 k = recv->neighbor[p];
1845 if (send_ptr[k][i_r] == 0) sum++;
1846 send_ptr[k][i_r] ++;
1847 send_idx[k][len[k]] = global_id_RAP[i_c];
1848 len[k] ++;
1849 break;
1850 }
1851 }
1852 }
1853 }
1854 }
1855 if (global_id_RAP) {
1856 delete[] global_id_RAP;
1857 global_id_RAP = NULL;
1858 }
1859
1860 /* now allocate the sender */
1861 shared = new index_t[sum];
1862 memset(send_len, 0, sizeof(dim_t) * size);
1863 num_neighbors=0;
1864 offsetInShared[0] = 0;
1865 for (p=0, k=0; p<size; p++) {
1866 for (i=0; i<num_Pmain_cols; i++) {
1867 if (send_ptr[p][i] > 0) {
1868 shared[k] = i;
1869 k++;
1870 send_ptr[p][send_len[p]] = send_ptr[p][i];
1871 send_len[p]++;
1872 }
1873 }
1874 if (k > offsetInShared[num_neighbors]) {
1875 neighbor[num_neighbors] = p;
1876 num_neighbors ++;
1877 offsetInShared[num_neighbors] = k;
1878 }
1879 }
1880 send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1881 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1882
1883    /* send/recv the number of rows to be sent from the current proc, i.e.
1884       recover the info for the receiver of row_connector from the sender */
1885 #ifdef ESYS_MPI
1886 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1887 #endif
1888 num_neighbors = 0;
1889 offsetInShared[0] = 0;
1890 j = 0;
1891 for (i=0; i<size; i++) {
1892 if (i != rank && recv_len[i] > 0) {
1893 neighbor[num_neighbors] = i;
1894 num_neighbors ++;
1895 j += recv_len[i];
1896 offsetInShared[num_neighbors] = j;
1897 }
1898 }
1899 delete[] shared;
1900 delete[] recv_len;
1901 shared = new index_t[j];
1902 k = offsetInShared[num_neighbors];
1903 #pragma omp parallel for schedule(static) private(i)
1904 for (i=0; i<k; i++) {
1905 shared[i] = i + num_Pmain_cols;
1906 }
1907 recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors,
1908 neighbor, shared, offsetInShared, 1, 0, mpi_info));
1909 row_connector = paso::Connector_alloc(send, recv);
1910 delete[] shared;
1911
1912 /* send/recv pattern->ptr for rowCoupleBlock */
1913 num_RAPext_rows = offsetInShared[num_neighbors];
1914 row_couple_ptr = new index_t[num_RAPext_rows+1];
1915 for (p=0; p<num_neighbors; p++) {
1916 j = offsetInShared[p];
1917 i = offsetInShared[p+1];
1918 #ifdef ESYS_MPI
1919 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1920 mpi_info->msg_tag_counter+neighbor[p],
1921 mpi_info->comm, &mpi_requests[p]);
1922 #endif
1923 }
1924 send = row_connector->send;
1925 for (p=0; p<send->numNeighbors; p++) {
1926 #ifdef ESYS_MPI
1927 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1928 MPI_INT, send->neighbor[p],
1929 mpi_info->msg_tag_counter+rank,
1930 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1931 #endif
1932 }
1933 #ifdef ESYS_MPI
1934 MPI_Waitall(num_neighbors + send->numNeighbors,
1935 mpi_requests, mpi_stati);
1936 #endif
1937 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1938 delete[] send_len;
1939
1940 sum = 0;
1941 for (i=0; i<num_RAPext_rows; i++) {
1942 k = row_couple_ptr[i];
1943 row_couple_ptr[i] = sum;
1944 sum += k;
1945 }
1946 row_couple_ptr[num_RAPext_rows] = sum;
1947
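/* The loop above converts per-row counts into CSR row offsets via an
   exclusive prefix sum; e.g. counts {2, 0, 3} become offsets
   {0, 2, 2, 5}, with the final entry holding the total. Equivalent
   sketch on a hypothetical count[] array of length n+1:

       sum = 0;
       for (i = 0; i < n; i++) {
           k = count[i];        // entries in row i
           count[i] = sum;      // becomes the start offset of row i
           sum += k;
       }
       count[n] = sum;          // total number of entries
*/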
1948 /* send/recv pattern->index for rowCoupleBlock */
1949 k = row_couple_ptr[num_RAPext_rows];
1950 row_couple_idx = new index_t[k];
1951 for (p=0; p<num_neighbors; p++) {
1952 j1 = row_couple_ptr[offsetInShared[p]];
1953 j2 = row_couple_ptr[offsetInShared[p+1]];
1954 #ifdef ESYS_MPI
1955 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1956 mpi_info->msg_tag_counter+neighbor[p],
1957 mpi_info->comm, &mpi_requests[p]);
1958 #endif
1959 }
1960 for (p=0; p<send->numNeighbors; p++) {
1961 #ifdef ESYS_MPI
1962 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1963 MPI_INT, send->neighbor[p],
1964 mpi_info->msg_tag_counter+rank,
1965 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1966 #endif
1967 }
1968 #ifdef ESYS_MPI
1969 MPI_Waitall(num_neighbors + send->numNeighbors,
1970 mpi_requests, mpi_stati);
1971 #endif
1972 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1973
1974 offset = input_dist->first_component[rank];
1975 k = row_couple_ptr[num_RAPext_rows];
1976 #pragma omp parallel for schedule(static) private(i)
1977 for (i=0; i<k; i++) {
1978 row_couple_idx[i] -= offset;
1979 }
1980 #pragma omp parallel for schedule(static) private(i)
1981 for (i=0; i<size; i++) {
1982 delete[] send_ptr[i];
1983 delete[] send_idx[i];
1984 }
1985 delete[] send_ptr;
1986 delete[] send_idx;
1987 delete[] len;
1988 delete[] offsetInShared;
1989 delete[] neighbor;
1990 delete[] mpi_requests;
1991 delete[] mpi_stati;
1992 Esys_MPIInfo_free(mpi_info);
1993
1994 /* Now, we can create pattern for mainBlock and coupleBlock */
1995 main_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1996 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1997 col_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1998 num_Pmain_cols, num_RAPext_cols,
1999 RAP_couple_ptr, RAP_couple_idx);
2000 row_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2001 num_RAPext_rows, num_Pmain_cols,
2002 row_couple_ptr, row_couple_idx);
2003
2004 /* next, create the system matrix */
2005 pattern = new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
2006 output_dist, input_dist, main_pattern, col_couple_pattern,
2007 row_couple_pattern, col_connector, row_connector);
2008 out = Paso_SystemMatrix_alloc(A->type, pattern,
2009 row_block_size, col_block_size, FALSE);
2010
2011    /* finally, fill in the data */
2012 memcpy(out->mainBlock->val, RAP_main_val,
2013 out->mainBlock->len* sizeof(double));
2014 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2015 out->col_coupleBlock->len * sizeof(double));
2016
2017 /* Clean up */
2018 paso::SystemMatrixPattern_free(pattern);
2019 paso::Pattern_free(main_pattern);
2020 paso::Pattern_free(col_couple_pattern);
2021 paso::Pattern_free(row_couple_pattern);
2022 paso::Connector_free(col_connector);
2023 paso::Connector_free(row_connector);
2024 delete[] RAP_main_val;
2025 delete[] RAP_couple_val;
2026 if (Esys_noError()) {
2027 return out;
2028 } else {
2029 Paso_SystemMatrix_free(out);
2030 return NULL;
2031 }
2032 }
2033
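/* Both builder variants, the one above and the block variant below,
   compute the Galerkin coarse operator A_C = R A P with R = P^T. As a
   reference for checking the distributed code, the dense single-process
   analogue is simply (illustrative only; n fine rows, m coarse columns):

       for (ic = 0; ic < m; ic++)
           for (jc = 0; jc < m; jc++) {
               double s = 0.0;
               for (i = 0; i < n; i++)
                   for (j = 0; j < n; j++)
                       s += P[i][ic] * A[i][j] * P[j][jc];
               C[ic][jc] = s;   // C = P^T * A * P
           }
*/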
2034
2035 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2036 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2037 {
2038 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2039 paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
2040 Paso_SystemMatrix *out=NULL;
2041 paso::SystemMatrixPattern *pattern=NULL;
2042 paso::Distribution_ptr input_dist, output_dist;
2043 paso::SharedComponents_ptr send, recv;
2044 paso::Connector *col_connector=NULL, *row_connector=NULL;
2045 paso::Pattern *main_pattern=NULL;
2046 paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2047 const dim_t row_block_size=A->row_block_size;
2048 const dim_t col_block_size=A->col_block_size;
2049 const dim_t block_size = A->block_size;
2050 const double ZERO = 0.0;
2051 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2052 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2053 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2054 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2055 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2056 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2057 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2058 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2059 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2060 index_t sum, i, j, k, iptr, irb, icb, ib;
2061 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2062 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2063 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2064 index_t row_marker_ext, *where_p=NULL;
2065 index_t **send_ptr=NULL, **send_idx=NULL;
2066 dim_t p, num_neighbors;
2067 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2068 Esys_MPI_rank *neighbor=NULL;
2069 #ifdef ESYS_MPI
2070 MPI_Request* mpi_requests=NULL;
2071 MPI_Status* mpi_stati=NULL;
2072 #else
2073 int *mpi_requests=NULL, *mpi_stati=NULL;
2074 #endif
2075
2076    /* two sparse matrices R_main and R_couple will be generated, as the
2077       transposes of P_main and P_col_couple, respectively. Note that
2078       R_couple is actually the row_coupleBlock of R (R=P^T) */
2079 R_main = paso::SparseMatrix_getTranspose(P->mainBlock);
2080 R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock);
2081
2082    /* generate P_ext, i.e. the portion of P that is stored on neighbour procs
2083       and needed locally for the triple matrix product RAP.
2084       To be specific, P_ext (on processor k) is the group of rows in P, where
2085       the list of rows from processor q is given by
2086            A->col_coupler->connector->send->shared[rPtr...]
2087            rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2088       on q.
2089       P_ext is represented by two sparse matrices P_ext_main and
2090       P_ext_couple */
2091 Paso_Preconditioner_AMG_extendB(A, P);
2092
2093 /* count the number of cols in P_ext_couple, resize the pattern of
2094 sparse matrix P_ext_couple with new compressed order, and then
2095 build the col id mapping from P->col_coupleBlock to
2096 P_ext_couple */
2097 num_Pmain_cols = P->mainBlock->numCols;
2098 num_Pcouple_cols = P->col_coupleBlock->numCols;
2099 num_Acouple_cols = A->col_coupleBlock->numCols;
2100 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2101 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2102 offset = P->col_distribution->first_component[rank];
2103 num_Pext_cols = 0;
2104 if (P->global_id) {
2105 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2106 and P->col_coupleBlock */
2107 iptr = 0;
2108 if (num_Pcouple_cols || sum > 0) {
2109 temp = new index_t[num_Pcouple_cols+sum];
2110 for (; iptr<sum; iptr++){
2111 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2112 }
2113 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2114 temp[iptr] = P->global_id[j];
2115 }
2116 }
2117 if (iptr) {
2118 #ifdef USE_QSORTG
2119 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2120 #else
2121 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2122 #endif
2123 num_Pext_cols = 1;
2124 i = temp[0];
2125 for (j=1; j<iptr; j++) {
2126 if (temp[j] > i) {
2127 i = temp[j];
2128 temp[num_Pext_cols++] = i;
2129 }
2130 }
2131 }
2132
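/* The sort-then-scan above is the standard deduplication idiom: after
   sorting, equal ids are adjacent, so one pass keeping only strictly
   increasing values yields the unique set. With the C++ standard
   library the same result would read (sketch, needs <algorithm>):

       std::sort(temp, temp + iptr);
       num_Pext_cols = std::unique(temp, temp + iptr) - temp;
*/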
2133 /* resize the pattern of P_ext_couple */
2134 if(num_Pext_cols){
2135 global_id_P = new index_t[num_Pext_cols];
2136 for (i=0; i<num_Pext_cols; i++)
2137 global_id_P[i] = temp[i];
2138 }
2139 if (num_Pcouple_cols || sum > 0)
2140 delete[] temp;
2141 for (i=0; i<sum; i++) {
2142 where_p = (index_t *)bsearch(
2143 &(P->remote_coupleBlock->pattern->index[i]),
2144 global_id_P, num_Pext_cols,
2145 sizeof(index_t), paso::comparIndex);
2146 P->remote_coupleBlock->pattern->index[i] =
2147 (index_t)(where_p -global_id_P);
2148 }
2149
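/* Each remote column id is translated from global to local numbering by
   binary search in the sorted global_id_P array; the local id is just
   the hit position. Sketch for a hypothetical id g:

       index_t* hit = (index_t*)bsearch(&g, global_id_P, num_Pext_cols,
                                        sizeof(index_t), paso::comparIndex);
       index_t local = (index_t)(hit - global_id_P);  // 0-based position

   This relies on every searched id being present in global_id_P, so
   bsearch never returns NULL here. */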
2150 /* build the mapping */
2151 if (num_Pcouple_cols) {
2152 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2153 iptr = 0;
2154 for (i=0; i<num_Pext_cols; i++)
2155 if (global_id_P[i] == P->global_id[iptr]) {
2156 Pcouple_to_Pext[iptr++] = i;
2157 if (iptr == num_Pcouple_cols) break;
2158 }
2159 }
2160 }
2161
2162    /* alloc and initialise the markers */
2163 sum = num_Pext_cols + num_Pmain_cols;
2164 P_marker = new index_t[sum];
2165 A_marker = new index_t[num_A_cols];
2166 #pragma omp parallel for private(i) schedule(static)
2167 for (i=0; i<sum; i++) P_marker[i] = -1;
2168 #pragma omp parallel for private(i) schedule(static)
2169 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2170
2171 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2172 sum = 0;
2173 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2174 row_marker = sum;
2175 /* then, loop over elements in row i_r of R_couple */
2176 j1_ub = R_couple->pattern->ptr[i_r+1];
2177 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2178 i1 = R_couple->pattern->index[j1];
2179 /* then, loop over elements in row i1 of A->col_coupleBlock */
2180 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2181 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2182 i2 = A->col_coupleBlock->pattern->index[j2];
2183
2184          /* check whether entry RA[i_r, i2] has been previously visited.
2185             A new RAP entry is possible only if entry RA[i_r, i2] has not
2186             been visited yet */
2187 if (A_marker[i2] != i_r) {
2188 /* first, mark entry RA[i_r,i2] as visited */
2189 A_marker[i2] = i_r;
2190
2191 /* then loop over elements in row i2 of P_ext_main */
2192 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2193 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2194 i_c = P->row_coupleBlock->pattern->index[j3];
2195
2196 /* check whether entry RAP[i_r,i_c] has been created.
2197 If not yet created, create the entry and increase
2198 the total number of elements in RAP_ext */
2199 if (P_marker[i_c] < row_marker) {
2200 P_marker[i_c] = sum;
2201 sum++;
2202 }
2203 }
2204
2205 /* loop over elements in row i2 of P_ext_couple, do the same */
2206 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2207 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2208 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2209 if (P_marker[i_c] < row_marker) {
2210 P_marker[i_c] = sum;
2211 sum++;
2212 }
2213 }
2214 }
2215 }
2216
2217 /* now loop over elements in row i1 of A->mainBlock, repeat
2218 the process we have done to block A->col_coupleBlock */
2219 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2220 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2221 i2 = A->mainBlock->pattern->index[j2];
2222 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2223 A_marker[i2 + num_Acouple_cols] = i_r;
2224 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2225 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2226 i_c = P->mainBlock->pattern->index[j3];
2227 if (P_marker[i_c] < row_marker) {
2228 P_marker[i_c] = sum;
2229 sum++;
2230 }
2231 }
2232 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2233 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2234 /* note that we need to map the column index in
2235 P->col_coupleBlock back into the column index in
2236 P_ext_couple */
2237 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2238 if (P_marker[i_c] < row_marker) {
2239 P_marker[i_c] = sum;
2240 sum++;
2241 }
2242 }
2243 }
2244 }
2245 }
2246 }
2247
2248    /* Now we have the number of non-zero elements in RAP_ext, allocate
2249       RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2250 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2251 RAP_ext_idx = new index_t[sum];
2252 RAP_ext_val = new double[sum * block_size];
2253 RA_val = new double[block_size];
2254 RAP_val = new double[block_size];
2255
2256    /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2257 sum = num_Pext_cols + num_Pmain_cols;
2258 #pragma omp parallel for private(i) schedule(static)
2259 for (i=0; i<sum; i++) P_marker[i] = -1;
2260 #pragma omp parallel for private(i) schedule(static)
2261 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2262 sum = 0;
2263 RAP_ext_ptr[0] = 0;
2264 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2265 row_marker = sum;
2266 /* then, loop over elements in row i_r of R_couple */
2267 j1_ub = R_couple->pattern->ptr[i_r+1];
2268 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2269 i1 = R_couple->pattern->index[j1];
2270 R_val = &(R_couple->val[j1*block_size]);
2271
2272 /* then, loop over elements in row i1 of A->col_coupleBlock */
2273 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2274 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2275 i2 = A->col_coupleBlock->pattern->index[j2];
2276 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2277 for (irb=0; irb<row_block_size; irb++) {
2278 for (icb=0; icb<col_block_size; icb++) {
2279 rtmp = ZERO;
2280 for (ib=0; ib<col_block_size; ib++) {
2281 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2282 }
2283 RA_val[irb+row_block_size*icb]=rtmp;
2284 }
2285 }
2286
2287          /* check whether entry RA[i_r, i2] has been previously visited.
2288             A new RAP entry is possible only if entry RA[i_r, i2] has not
2289             been visited yet */
2290 if (A_marker[i2] != i_r) {
2291 /* first, mark entry RA[i_r,i2] as visited */
2292 A_marker[i2] = i_r;
2293
2294 /* then loop over elements in row i2 of P_ext_main */
2295 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2296 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2297 i_c = P->row_coupleBlock->pattern->index[j3];
2298 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2299 for (irb=0; irb<row_block_size; irb++) {
2300 for (icb=0; icb<col_block_size; icb++) {
2301 rtmp = ZERO;
2302 for (ib=0; ib<col_block_size; ib++) {
2303 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2304 }
2305 RAP_val[irb+row_block_size*icb]=rtmp;
2306 }
2307 }
2308
2309
2310 /* check whether entry RAP[i_r,i_c] has been created.
2311 If not yet created, create the entry and increase
2312 the total number of elements in RAP_ext */
2313 if (P_marker[i_c] < row_marker) {
2314 P_marker[i_c] = sum;
2315 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2316 RAP_ext_idx[sum] = i_c + offset;
2317 sum++;
2318 } else {
2319 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2320 for (ib=0; ib<block_size; ib++) {
2321 temp_val[ib] += RAP_val[ib];
2322 }
2323 }
2324 }
2325
2326 /* loop over elements in row i2 of P_ext_couple, do the same */
2327 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2328 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2329 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2330 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2331 for (irb=0; irb<row_block_size; irb++) {
2332 for (icb=0; icb<col_block_size; icb++) {
2333 rtmp = ZERO;
2334 for (ib=0; ib<col_block_size; ib++) {
2335 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2336 }
2337 RAP_val[irb+row_block_size*icb]=rtmp;
2338 }
2339 }
2340
2341 if (P_marker[i_c] < row_marker) {
2342 P_marker[i_c] = sum;
2343 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2344 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2345 sum++;
2346 } else {
2347 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2348 for (ib=0; ib<block_size; ib++) {
2349 temp_val[ib] += RAP_val[ib];
2350 }
2351 }
2352 }
2353
2354          /* If entry RA[i_r, i2] has been visited before, no new RAP entry
2355             is created; only the contributions are added. */
2356 } else {
2357 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2358 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2359 i_c = P->row_coupleBlock->pattern->index[j3];
2360 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2361 for (irb=0; irb<row_block_size; irb++) {
2362 for (icb=0; icb<col_block_size; icb++) {
2363 rtmp = ZERO;
2364 for (ib=0; ib<col_block_size; ib++) {
2365 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2366 }
2367 RAP_val[irb+row_block_size*icb]=rtmp;
2368 }
2369 }
2370
2371 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2372 for (ib=0; ib<block_size; ib++) {
2373 temp_val[ib] += RAP_val[ib];
2374 }
2375 }
2376 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2377 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2378 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2379 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2380 for (irb=0; irb<row_block_size; irb++) {
2381 for (icb=0; icb<col_block_size; icb++) {
2382 rtmp = ZERO;
2383 for (ib=0; ib<col_block_size; ib++) {
2384 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2385 }
2386 RAP_val[irb+row_block_size*icb]=rtmp;
2387 }
2388 }
2389
2390 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2391 for (ib=0; ib<block_size; ib++) {
2392 temp_val[ib] += RAP_val[ib];
2393 }
2394 }
2395 }
2396 }
2397
2398 /* now loop over elements in row i1 of A->mainBlock, repeat
2399 the process we have done to block A->col_coupleBlock */
2400 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2401 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2402 i2 = A->mainBlock->pattern->index[j2];
2403 temp_val = &(A->mainBlock->val[j2*block_size]);
2404 for (irb=0; irb<row_block_size; irb++) {
2405 for (icb=0; icb<col_block_size; icb++) {
2406 rtmp = ZERO;
2407 for (ib=0; ib<col_block_size; ib++) {
2408 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2409 }
2410 RA_val[irb+row_block_size*icb]=rtmp;
2411 }
2412 }
2413
2414 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2415 A_marker[i2 + num_Acouple_cols] = i_r;
2416 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2417 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2418 i_c = P->mainBlock->pattern->index[j3];
2419 temp_val = &(P->mainBlock->val[j3*block_size]);
2420 for (irb=0; irb<row_block_size; irb++) {
2421 for (icb=0; icb<col_block_size; icb++) {
2422 rtmp = ZERO;
2423 for (ib=0; ib<col_block_size; ib++) {
2424 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2425 }
2426 RAP_val[irb+row_block_size*icb]=rtmp;
2427 }
2428 }
2429
2430 if (P_marker[i_c] < row_marker) {
2431 P_marker[i_c] = sum;
2432 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2433 RAP_ext_idx[sum] = i_c + offset;
2434 sum++;
2435 } else {
2436 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2437 for (ib=0; ib<block_size; ib++) {
2438 temp_val[ib] += RAP_val[ib];
2439 }
2440 }
2441 }
2442 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2443 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2444 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2445 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2446 for (irb=0; irb<row_block_size; irb++) {
2447 for (icb=0; icb<col_block_size; icb++) {
2448 rtmp = ZERO;
2449 for (ib=0; ib<col_block_size; ib++) {
2450 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2451 }
2452 RAP_val[irb+row_block_size*icb]=rtmp;
2453 }
2454 }
2455
2456 if (P_marker[i_c] < row_marker) {
2457 P_marker[i_c] = sum;
2458 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2459 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2460 sum++;
2461 } else {
2462 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2463 for (ib=0; ib<block_size; ib++) {
2464 temp_val[ib] += RAP_val[ib];
2465 }
2466 }
2467 }
2468 } else {
2469 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2470 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2471 i_c = P->mainBlock->pattern->index[j3];
2472 temp_val = &(P->mainBlock->val[j3*block_size]);
2473 for (irb=0; irb<row_block_size; irb++) {
2474 for (icb=0; icb<col_block_size; icb++) {
2475 rtmp = ZERO;
2476 for (ib=0; ib<col_block_size; ib++) {
2477 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2478 }
2479 RAP_val[irb+row_block_size*icb]=rtmp;
2480 }
2481 }
2482
2483 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2484 for (ib=0; ib<block_size; ib++) {
2485 temp_val[ib] += RAP_val[ib];
2486 }
2487 }
2488 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2489 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2490 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2491 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2492 for (irb=0; irb<row_block_size; irb++) {
2493 for (icb=0; icb<col_block_size; icb++) {
2494 rtmp = ZERO;
2495 for (ib=0; ib<col_block_size; ib++) {
2496 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2497 }
2498 RAP_val[irb+row_block_size*icb]=rtmp;
2499 }
2500 }
2501
2502 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2503 for (ib=0; ib<block_size; ib++) {
2504 temp_val[ib] += RAP_val[ib];
2505 }
2506 }
2507 }
2508 }
2509 }
2510 RAP_ext_ptr[i_r+1] = sum;
2511 }
2512 delete[] P_marker;
2513 delete[] Pcouple_to_Pext;
2514
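/* Throughout the numeric phase above the small dense blocks are stored
   column-major, i.e. entry (irb,icb) of a block lives at
   val[irb + row_block_size*icb]. In isolation, the per-block product
   RA = R_blk * A_blk computed in the loops reads (sketch; rbs/cbs are
   the row/column block sizes):

       for (irb = 0; irb < rbs; irb++)
           for (icb = 0; icb < cbs; icb++) {
               double s = 0.0;
               for (ib = 0; ib < cbs; ib++)
                   s += R_blk[irb + rbs*ib] * A_blk[ib + cbs*icb];
               RA_blk[irb + rbs*icb] = s;
           }
*/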
2515 /* Now we have part of RAP[r,c] where row "r" is the list of rows
2516 which is given by the column list of P->col_coupleBlock, and
2517 column "c" is the list of columns which possibly covers the
2518 whole column range of system matrix P. This part of data is to
2519 be passed to neighbouring processors, and added to corresponding
2520 RAP[r,c] entries in the neighbouring processors */
2521 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2522 &RAP_ext_val, global_id_P, block_size);
2523
2524 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2525 sum = RAP_ext_ptr[num_RAPext_rows];
2526 num_RAPext_cols = 0;
2527 if (num_Pext_cols || sum > 0) {
2528 temp = new index_t[num_Pext_cols+sum];
2529 j1_ub = offset + num_Pmain_cols;
2530 for (i=0, iptr=0; i<sum; i++) {
2531 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2532 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2533 }
2534 for (j=0; j<num_Pext_cols; j++, iptr++){
2535 temp[iptr] = global_id_P[j];
2536 }
2537
2538 if (iptr) {
2539 #ifdef USE_QSORTG
2540 qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2541 #else
2542 qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
2543 #endif
2544 num_RAPext_cols = 1;
2545 i = temp[0];
2546 for (j=1; j<iptr; j++) {
2547 if (temp[j] > i) {
2548 i = temp[j];
2549 temp[num_RAPext_cols++] = i;
2550 }
2551 }
2552 }
2553 }
2554
2555    /* store the global ids of the columns of RAP_ext */
2556 if(num_RAPext_cols){
2557 global_id_RAP = new index_t[num_RAPext_cols];
2558 for (i=0; i<num_RAPext_cols; i++)
2559 global_id_RAP[i] = temp[i];
2560 }
2561 if (num_Pext_cols || sum > 0)
2562 delete[] temp;
2563 j1_ub = offset + num_Pmain_cols;
2564 for (i=0; i<sum; i++) {
2565 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2566 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2567 /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex);
2568 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2569 } else