/[escript]/trunk/paso/src/AMG_Interpolation.cpp

Revision 4345 - Fri Mar 29 07:09:41 2013 UTC by jfenwick
Original Path: branches/doubleplusgood/paso/src/AMG_Interpolation.cpp
File size: 122380 byte(s)
Spelling fixes
1 /*****************************************************************************
2 *
3 * Copyright (c) 2003-2013 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development since 2012 by School of Earth Sciences
12 *
13 *****************************************************************************/
14
15
16 /************************************************************************************/
17
18 /* Paso: defines AMG Interpolation */
19
20 /************************************************************************************/
21
22 /* Author: l.gao@uq.edu.au */
23
24 /************************************************************************************/
25
26 #include "Paso.h"
27 #include "SparseMatrix.h"
28 #include "PasoUtil.h"
29 #include "Preconditioner.h"
30
31 /************************************************************************************
32
33 Methods necessary for AMG preconditioner
34
35 construct n_C x n_C interpolation operator A_C from matrix A
36 and prolongation matrix P.
37
38 The coarsening operator A_C is defined by RAP where R=P^T.
39
40 *************************************************************************************/
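/* Illustrative sketch only (hypothetical helper, not part of the Paso API):
   on small dense matrices the coarse operator A_C = R A P with R = P^T
   reduces to the loops below. The routines in this file compute the same
   product on the distributed block-sparse storage of Paso_SystemMatrix. */
static void Paso_AMG_denseGalerkinSketch(const double* A, const double* P,
                                         double* A_C, dim_t n, dim_t n_C)
{
    dim_t i, j, k, l;
    for (i=0; i<n_C; i++) {
        for (j=0; j<n_C; j++) {
            double s = 0.;
            /* (P^T A P)[i,j] = sum_{k,l} P[k,i] * A[k,l] * P[l,j] */
            for (k=0; k<n; k++)
                for (l=0; l<n; l++)
                    s += P[k*n_C+i] * A[k*n+l] * P[l*n_C+j];
            A_C[i*n_C+j] = s;
        }
    }
}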
41
42 /* Extend the system matrix B with two extra sparse matrices:
43 B_ext_main and B_ext_couple.
44 Together these two sparse matrices represent
45 the portion of B that is stored on neighbour procs and
46 needed locally for the triple matrix product (B^T) A B.
47
48 FOR NOW, we assume that the system matrix B has a NULL
49 row_coupleBlock and a NULL remote_coupleBlock. Based on
50 this assumption, we use the link row_coupleBlock for sparse
51 matrix B_ext_main, and the link remote_coupleBlock for sparse
52 matrix B_ext_couple.
53
54 To be specific, B_ext (on processor k) is the group of rows
55 in B where the list of rows coming from processor q is given by
56 A->col_coupler->connector->send->shared[rPtr...]
57 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
58 on q. */
59 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
60 {
61 Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
62 Paso_Coupler *coupler=NULL;
63 Paso_SharedComponents *send=NULL, *recv=NULL;
64 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
65 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
66 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
67 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
68 index_t *idx_m=NULL, *idx_c=NULL;
69 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
70 index_t offset, len, block_size, block_size_size, max_num_cols;
71 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
72 dim_t *recv_degree=NULL, *send_degree=NULL;
73 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
74
75 if (size == 1) return;
76
77 if (B->row_coupleBlock) {
78 Paso_SparseMatrix_free(B->row_coupleBlock);
79 B->row_coupleBlock = NULL;
80 }
81
82 if (B->row_coupleBlock || B->remote_coupleBlock) {
83 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
84 return;
85 }
86
87 /* send/receive the unknowns' global IDs */
88 num_main_cols = B->mainBlock->numCols;
89 cols = new double[num_main_cols];
90 offset = B->col_distribution->first_component[rank];
91 #pragma omp parallel for private(i) schedule(static)
92 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
93 if (B->global_id == NULL) {
94 coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
95 Paso_Coupler_startCollect(coupler, cols);
96 }
97
98 recv_buf = new index_t[size];
99 recv_degree = new dim_t[size];
100 recv_offset = new index_t[size+1];
101 #pragma omp parallel for private(i) schedule(static)
102 for (i=0; i<size; i++){
103 recv_buf[i] = 0;
104 recv_degree[i] = 1;
105 recv_offset[i] = i;
106 }
107
108 block_size = B->block_size;
109 block_size_size = block_size * sizeof(double);
110 num_couple_cols = B->col_coupleBlock->numCols;
111 send = A->col_coupler->connector->send;
112 recv = A->col_coupler->connector->recv;
113 num_neighbors = send->numNeighbors;
114 p = send->offsetInShared[num_neighbors];
115 len = p * B->col_distribution->first_component[size];
116 send_buf = new double[len * block_size];
117 send_idx = new index_t[len];
118 send_offset = new index_t[(p+1)*2];
119 send_degree = new dim_t[num_neighbors];
120 i = num_main_cols + num_couple_cols;
121 send_m = new double[i * block_size];
122 send_c = new double[i * block_size];
123 idx_m = new index_t[i];
124 idx_c = new index_t[i];
125
126 /* wait for the unknowns' global IDs to arrive */
127 if (B->global_id == NULL) {
128 Paso_Coupler_finishCollect(coupler);
129 global_id = new index_t[num_couple_cols];
130 #pragma omp parallel for private(i) schedule(static)
131 for (i=0; i<num_couple_cols; ++i)
132 global_id[i] = coupler->recv_buffer[i];
133 B->global_id = global_id;
134 Paso_Coupler_free(coupler);
135 } else
136 global_id = B->global_id;
137
138 /* distribute the number of cols in current col_coupleBlock to all ranks */
139 #ifdef ESYS_MPI
140 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
141 #endif
142
143 /* distribute the global IDs of the cols to be considered to all ranks */
144 len = 0;
145 max_num_cols = 0;
146 for (i=0; i<size; i++){
147 recv_degree[i] = recv_buf[i];
148 recv_offset[i] = len;
149 len += recv_buf[i];
150 if (max_num_cols < recv_buf[i])
151 max_num_cols = recv_buf[i];
152 }
153 recv_offset[size] = len;
154 cols_array = new index_t[len];
155 #ifdef ESYS_MPI
156 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
157 #endif
158
159 /* first, prepare the ptr_ptr to be received */
160 q = recv->numNeighbors;
161 len = recv->offsetInShared[q];
162 ptr_ptr = new index_t[(len+1) * 2];
163 for (p=0; p<q; p++) {
164 row = recv->offsetInShared[p];
165 m = recv->offsetInShared[p + 1];
166 #ifdef ESYS_MPI
167 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
168 A->mpi_info->msg_tag_counter+recv->neighbor[p],
169 A->mpi_info->comm,
170 &(A->col_coupler->mpi_requests[p]));
171 #endif
172 }
173
174 /* now prepare the rows to be sent (the degree, the offset and the data) */
175 len = 0;
176 i0 = 0;
177 for (p=0; p<num_neighbors; p++) {
178 i = i0;
179 neighbor = send->neighbor[p];
180 m_lb = B->col_distribution->first_component[neighbor];
181 m_ub = B->col_distribution->first_component[neighbor + 1];
182 j_ub = send->offsetInShared[p + 1];
183 for (j=send->offsetInShared[p]; j<j_ub; j++) {
184 row = send->shared[j];
185 l_m = 0;
186 l_c = 0;
187 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
188 k_lb = B->col_coupleBlock->pattern->ptr[row];
189
190 /* check part of col_coupleBlock for data to be passed @row */
191 for (k=k_lb; k<k_ub; k++) {
192 m = global_id[B->col_coupleBlock->pattern->index[k]];
193 if (m > offset) break;
194 if (m>= m_lb && m < m_ub) {
195 /* data to be passed to sparse matrix B_ext_main */
196 idx_m[l_m] = m - m_lb;
197 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
198 l_m++;
199 } else {
200 /* data to be passed to sparse matrix B_ext_couple */
201 idx_c[l_c] = m;
202 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
203 l_c++;
204 }
205 }
206 k_lb = k;
207
208 /* check mainBlock for data to be passed @row to sparse
209 matrix B_ext_couple */
210 k_ub = B->mainBlock->pattern->ptr[row + 1];
211 k = B->mainBlock->pattern->ptr[row];
212 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
213 for (; k<k_ub; k++) {
214 m = B->mainBlock->pattern->index[k] + offset;
215 idx_c[l_c] = m;
216 l_c++;
217 }
218
219 /* check the remaining part of col_coupleBlock for data to
220 be passed @row to sparse matrix B_ext_couple */
221 k = k_lb;
222 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
223 for (k=k_lb; k<k_ub; k++) {
224 m = global_id[B->col_coupleBlock->pattern->index[k]];
225 if (m>= m_lb && m < m_ub) {
226 /* data to be passed to sparse matrix B_ext_main */
227 idx_m[l_m] = m - m_lb;
228 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
229 l_m++;
230 } else {
231 /* data to be passed to sparse matrix B_ext_couple */
232 idx_c[l_c] = m;
233 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
234 l_c++;
235 }
236 }
237
238 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
239 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
240 send_offset[2*i] = l_m;
241 len += l_m;
242 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
243 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
244 send_offset[2*i+1] = l_c;
245 len += l_c;
246 i++;
247 }
248
249 /* sending */
250 #ifdef ESYS_MPI
251 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
252 A->mpi_info->msg_tag_counter+rank,
253 A->mpi_info->comm,
254 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
255 #endif
256 send_degree[p] = len;
257 i0 = i;
258 }
259 delete[] send_m;
260 delete[] send_c;
261 delete[] idx_m;
262 delete[] idx_c;
263
264
265 q = recv->numNeighbors;
266 len = recv->offsetInShared[q];
267 ptr_main = new index_t[(len+1)];
268 ptr_couple = new index_t[(len+1)];
269
270 #ifdef ESYS_MPI
271 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
272 A->col_coupler->mpi_requests,
273 A->col_coupler->mpi_stati);
274 #endif
275 A->mpi_info->msg_tag_counter += size;
276
277 j = 0;
278 k = 0;
279 ptr_main[0] = 0;
280 ptr_couple[0] = 0;
281 for (i=0; i<len; i++) {
282 j += ptr_ptr[2*i];
283 k += ptr_ptr[2*i+1];
284 ptr_main[i+1] = j;
285 ptr_couple[i+1] = k;
286 }
287
288 delete[] ptr_ptr;
289 idx_main = new index_t[j];
290 idx_couple = new index_t[k];
291 ptr_idx = new index_t[j+k];
292 ptr_val = new double[(j+k) * block_size];
293
294 /* send/receive index array */
295 j=0;
296 k_ub = 0;
297 for (p=0; p<recv->numNeighbors; p++) {
298 k = recv->offsetInShared[p];
299 m = recv->offsetInShared[p+1];
300 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
301 if (i > 0) {
302 k_ub ++;
303 #ifdef ESYS_MPI
304 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
305 A->mpi_info->msg_tag_counter+recv->neighbor[p],
306 A->mpi_info->comm,
307 &(A->col_coupler->mpi_requests[p]));
308 #endif
309 }
310 j += i;
311 }
312
313 j=0;
314 k_ub = 0;
315 for (p=0; p<num_neighbors; p++) {
316 i = send_degree[p] - j;
317 if (i > 0){
318 k_ub ++;
319 #ifdef ESYS_MPI
320 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
321 A->mpi_info->msg_tag_counter+rank,
322 A->mpi_info->comm,
323 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
324 #endif
325 }
326 j = send_degree[p];
327 }
328
329 #ifdef ESYS_MPI
330 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
331 A->col_coupler->mpi_requests,
332 A->col_coupler->mpi_stati);
333 #endif
334 A->mpi_info->msg_tag_counter += size;
335
336 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
337 for (i=0; i<len; i++) {
338 j = ptr_main[i];
339 k = ptr_main[i+1];
340 m = ptr_couple[i];
341 for (p=j; p<k; p++) {
342 idx_main[p] = ptr_idx[m+p];
343 }
344 j = ptr_couple[i+1];
345 for (p=m; p<j; p++) {
346 idx_couple[p] = ptr_idx[k+p];
347 }
348 }
349 delete[] ptr_idx;
350
351 /* allocate pattern and sparse matrix for B_ext_main */
352 pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
353 len, num_main_cols, ptr_main, idx_main);
354 B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
355 pattern_main, B->row_block_size, B->col_block_size,
356 FALSE);
357 Paso_Pattern_free(pattern_main);
358
359 /* allocate pattern and sparse matrix for B_ext_couple */
360 pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
361 len, B->col_distribution->first_component[size],
362 ptr_couple, idx_couple);
363 B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
364 pattern_couple, B->row_block_size, B->col_block_size,
365 FALSE);
366 Paso_Pattern_free(pattern_couple);
367
368 /* send/receive value array */
369 j=0;
370 for (p=0; p<recv->numNeighbors; p++) {
371 k = recv->offsetInShared[p];
372 m = recv->offsetInShared[p+1];
373 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
374 #ifdef ESYS_MPI
375 if (i > 0)
376 MPI_Irecv(&(ptr_val[j]), i * block_size,
377 MPI_DOUBLE, recv->neighbor[p],
378 A->mpi_info->msg_tag_counter+recv->neighbor[p],
379 A->mpi_info->comm,
380 &(A->col_coupler->mpi_requests[p]));
381 #endif
382 j += (i * block_size);
383 }
384
385 j=0;
386 for (p=0; p<num_neighbors; p++) {
387 i = send_degree[p] - j;
388 #ifdef ESYS_MPI
389 if (i > 0)
390 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
391 A->mpi_info->msg_tag_counter+rank,
392 A->mpi_info->comm,
393 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
394 #endif
395 j = send_degree[p] ;
396 }
397
398 #ifdef ESYS_MPI
399 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
400 A->col_coupler->mpi_requests,
401 A->col_coupler->mpi_stati);
402 #endif
403 A->mpi_info->msg_tag_counter += size;
404
405 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
406 for (i=0; i<len; i++) {
407 j = ptr_main[i];
408 k = ptr_main[i+1];
409 m = ptr_couple[i];
410 for (p=j; p<k; p++) {
411 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
412 }
413 j = ptr_couple[i+1];
414 for (p=m; p<j; p++) {
415 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
416 }
417 }
418 delete[] ptr_val;
419
420
421 /* release all temporary memory allocations */
422 delete[] cols;
423 delete[] cols_array;
424 delete[] recv_offset;
425 delete[] recv_degree;
426 delete[] recv_buf;
427 delete[] send_buf;
428 delete[] send_offset;
429 delete[] send_degree;
430 delete[] send_idx;
431 }
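/* The routine above exchanges the row degrees, the column indices and the
   values in separate rounds, each following the same non-blocking pattern:
   post one MPI_Irecv per receiving neighbour, one MPI_Issend per sending
   neighbour, MPI_Waitall, then advance msg_tag_counter by the communicator
   size so the next round uses fresh tags. A minimal sketch of one such
   round is given below (hypothetical helper, not part of the Paso API; it
   uses a single shared tag, whereas the code above derives the tag from
   msg_tag_counter and the ranks involved). */
#ifdef ESYS_MPI
static void Paso_AMG_exchangeRoundSketch(
        index_t* recv_buf, const index_t* recv_off, const int* recv_rank, dim_t num_recv,
        index_t* send_buf, const index_t* send_off, const int* send_rank, dim_t num_send,
        int tag, MPI_Comm comm, MPI_Request* requests, MPI_Status* stati)
{
    dim_t p;
    for (p=0; p<num_recv; p++)    /* post all receives first */
        MPI_Irecv(&recv_buf[recv_off[p]], recv_off[p+1]-recv_off[p], MPI_INT,
                  recv_rank[p], tag, comm, &requests[p]);
    for (p=0; p<num_send; p++)    /* then the matching sends */
        MPI_Issend(&send_buf[send_off[p]], send_off[p+1]-send_off[p], MPI_INT,
                   send_rank[p], tag, comm, &requests[num_recv+p]);
    MPI_Waitall(num_recv+num_send, requests, stati);    /* wait for completion */
}
#endif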
432
433 /* The sparse matrix T defined by T(ptr, idx, val)
434 has the same number of rows as P->col_coupleBlock->numCols. Now we need
435 to copy blocks of data in T to the neighbour processors given by
436 P->col_coupler->connector->recv->neighbor[k], where k is in
437 [0, P->col_coupler->connector->recv->numNeighbors).
438 The rows to be copied to neighbour processor k are in the list defined by
439 P->col_coupler->connector->recv->offsetInShared[k] ...
440 P->col_coupler->connector->recv->offsetInShared[k+1] */
441 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
442 index_t **p_ptr, index_t **p_idx, double **p_val,
443 index_t *global_id, index_t block_size)
444 {
445 Paso_SharedComponents *send=NULL, *recv=NULL;
446 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
447 index_t i, j, p, m, n, size;
448 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
449 index_t *ptr=*p_ptr, *idx=*p_idx;
450 double *val=*p_val, *recv_val=NULL;
451 #ifdef ESYS_MPI
452 index_t rank = P->mpi_info->rank;
453 #endif
454
455 size = P->mpi_info->size;
456 send = P->col_coupler->connector->recv;
457 recv = P->col_coupler->connector->send;
458 send_neighbors = send->numNeighbors;
459 recv_neighbors = recv->numNeighbors;
460 send_rows = P->col_coupleBlock->numCols;
461 recv_rows = recv->offsetInShared[recv_neighbors];
462
463 send_degree = new index_t[send_rows];
464 recv_ptr = new index_t[recv_rows + 1];
465 #pragma omp for schedule(static) private(i)
466 for (i=0; i<send_rows; i++)
467 send_degree[i] = ptr[i+1] - ptr[i];
468
469 /* First, send/receive the degree */
470 for (p=0; p<recv_neighbors; p++) { /* Receiving */
471 m = recv->offsetInShared[p];
472 n = recv->offsetInShared[p+1];
473 #ifdef ESYS_MPI
474 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
475 P->mpi_info->msg_tag_counter + recv->neighbor[p],
476 P->mpi_info->comm,
477 &(P->col_coupler->mpi_requests[p]));
478 #endif
479 }
480 for (p=0; p<send_neighbors; p++) { /* Sending */
481 m = send->offsetInShared[p];
482 n = send->offsetInShared[p+1];
483 #ifdef ESYS_MPI
484 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
485 P->mpi_info->msg_tag_counter + rank,
486 P->mpi_info->comm,
487 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
488 #endif
489 }
490 #ifdef ESYS_MPI
491 MPI_Waitall(send_neighbors+recv_neighbors,
492 P->col_coupler->mpi_requests,
493 P->col_coupler->mpi_stati);
494 #endif
495 P->mpi_info->msg_tag_counter += size;
496
497 delete[] send_degree;
498 m = Paso_Util_cumsum(recv_rows, recv_ptr);
499 recv_ptr[recv_rows] = m;
500 recv_idx = new index_t[m];
501 recv_val = new double[m * block_size];
502
503 /* Next, send/receive the index array */
504 j = 0;
505 for (p=0; p<recv_neighbors; p++) { /* Receiving */
506 m = recv->offsetInShared[p];
507 n = recv->offsetInShared[p+1];
508 i = recv_ptr[n] - recv_ptr[m];
509 if (i > 0) {
510 #ifdef ESYS_MPI
511 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
512 P->mpi_info->msg_tag_counter + recv->neighbor[p],
513 P->mpi_info->comm,
514 &(P->col_coupler->mpi_requests[p]));
515 #endif
516 }
517 j += i;
518 }
519
520 j = 0;
521 for (p=0; p<send_neighbors; p++) { /* Sending */
522 m = send->offsetInShared[p];
523 n = send->offsetInShared[p+1];
524 i = ptr[n] - ptr[m];
525 if (i >0) {
526 #ifdef ESYS_MPI
527 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
528 P->mpi_info->msg_tag_counter + rank,
529 P->mpi_info->comm,
530 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
531 #endif
532 j += i;
533 }
534 }
535 #ifdef ESYS_MPI
536 MPI_Waitall(send_neighbors+recv_neighbors,
537 P->col_coupler->mpi_requests,
538 P->col_coupler->mpi_stati);
539 #endif
540 P->mpi_info->msg_tag_counter += size;
541
542 /* Last, send/receive the data array */
543 j = 0;
544 for (p=0; p<recv_neighbors; p++) { /* Receiving */
545 m = recv->offsetInShared[p];
546 n = recv->offsetInShared[p+1];
547 i = recv_ptr[n] - recv_ptr[m];
548 #ifdef ESYS_MPI
549 if (i > 0)
550 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
551 P->mpi_info->msg_tag_counter + recv->neighbor[p],
552 P->mpi_info->comm,
553 &(P->col_coupler->mpi_requests[p]));
554 #endif
555 j += (i*block_size);
556 }
557
558 j = 0;
559 for (p=0; p<send_neighbors; p++) { /* Sending */
560 m = send->offsetInShared[p];
561 n = send->offsetInShared[p+1];
562 i = ptr[n] - ptr[m];
563 if (i >0) {
564 #ifdef ESYS_MPI
565 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
566 P->mpi_info->msg_tag_counter + rank,
567 P->mpi_info->comm,
568 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
569 #endif
570 j += i * block_size;
571 }
572 }
573 #ifdef ESYS_MPI
574 MPI_Waitall(send_neighbors+recv_neighbors,
575 P->col_coupler->mpi_requests,
576 P->col_coupler->mpi_stati);
577 #endif
578 P->mpi_info->msg_tag_counter += size;
579
580 /* Clean up and return with received ptr, index and data arrays */
581 delete[] ptr;
582 delete[] idx;
583 delete[] val;
584 *p_ptr = recv_ptr;
585 *p_idx = recv_idx;
586 *p_val = recv_val;
587 }
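/* In the routine above, Paso_Util_cumsum turns the received per-row degrees
   stored in recv_ptr[0..recv_rows-1] into CSR row offsets in place and
   returns the total number of entries, which the caller then stores in
   recv_ptr[recv_rows]. An equivalent serial exclusive prefix sum, shown
   here only as an illustrative sketch: */
static index_t Paso_AMG_degreesToOffsetsSketch(dim_t n, index_t* ptr)
{
    index_t sum = 0;
    dim_t i;
    for (i=0; i<n; i++) {
        const index_t d = ptr[i];   /* degree of row i */
        ptr[i] = sum;               /* becomes the offset of row i */
        sum += d;
    }
    return sum;                     /* total number of entries */
}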
588
589 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
590 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
591 {
592 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
593 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
594 Paso_SystemMatrix *out=NULL;
595 Paso_SystemMatrixPattern *pattern=NULL;
596 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
597 Paso_SharedComponents *send =NULL, *recv=NULL;
598 Paso_Connector *col_connector=NULL, *row_connector=NULL;
599 Paso_Pattern *main_pattern=NULL;
600 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
601 const dim_t row_block_size=A->row_block_size;
602 const dim_t col_block_size=A->col_block_size;
603 const dim_t block_size = A->block_size;
604 const dim_t P_block_size = P->block_size;
605 const double ZERO = 0.0;
606 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
607 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
608 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
609 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
610 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
611 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
612 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
613 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
614 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
615 index_t sum, i, j, k, iptr, irb, icb, ib;
616 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
617 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
618 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
619 index_t row_marker_ext, *where_p=NULL;
620 index_t **send_ptr=NULL, **send_idx=NULL;
621 dim_t p, num_neighbors;
622 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
623 Esys_MPI_rank *neighbor=NULL;
624 #ifdef ESYS_MPI
625 MPI_Request* mpi_requests=NULL;
626 MPI_Status* mpi_stati=NULL;
627 #else
628 int *mpi_requests=NULL, *mpi_stati=NULL;
629 #endif
630
631 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
632 return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
633
634 /* two sparse matrices R_main and R_couple will be generated, as the
635 transposes of P_main and P_col_couple, respectively. Note that
636 R_couple is actually the row_coupleBlock of R (R=P^T) */
637 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
638 if (size > 1)
639 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
640
641 /* generate P_ext, i.e. the portion of P that is stored on neighbour procs
642 and needed locally for the triple matrix product RAP.
643 To be specific, P_ext (on processor k) is the group of rows in P where
644 the list of rows coming from processor q is given by
645 A->col_coupler->connector->send->shared[rPtr...]
646 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
647 on q.
648 P_ext is represented by two sparse matrices, P_ext_main and
649 P_ext_couple */
650 Paso_Preconditioner_AMG_extendB(A, P);
651
652 /* count the number of cols in P_ext_couple, resize the pattern of
653 sparse matrix P_ext_couple to the new compressed order, and then
654 build the col id mapping from P->col_coupleBlock to
655 P_ext_couple */
656 num_Pmain_cols = P->mainBlock->numCols;
657 if (size > 1) {
658 num_Pcouple_cols = P->col_coupleBlock->numCols;
659 num_Acouple_cols = A->col_coupleBlock->numCols;
660 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
661 } else {
662 num_Pcouple_cols = 0;
663 num_Acouple_cols = 0;
664 sum = 0;
665 }
666 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
667 offset = P->col_distribution->first_component[rank];
668 num_Pext_cols = 0;
669 if (P->global_id) {
670 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
671 and P->col_coupleBlock */
672 iptr = 0;
673 if (num_Pcouple_cols || sum > 0) {
674 temp = new index_t[num_Pcouple_cols+sum];
675 #pragma omp parallel for lastprivate(iptr) schedule(static)
676 for (iptr=0; iptr<sum; iptr++){
677 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
678 }
679 for (j=0; j<num_Pcouple_cols; j++, iptr++){
680 temp[iptr] = P->global_id[j];
681 }
682 }
683 if (iptr) {
684 #ifdef USE_QSORTG
685 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
686 #else
687 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
688 #endif
689 num_Pext_cols = 1;
690 i = temp[0];
691 for (j=1; j<iptr; j++) {
692 if (temp[j] > i) {
693 i = temp[j];
694 temp[num_Pext_cols++] = i;
695 }
696 }
697 }
698 /* resize the pattern of P_ext_couple */
699 if(num_Pext_cols){
700 global_id_P = new index_t[num_Pext_cols];
701 #pragma omp parallel for private(i) schedule(static)
702 for (i=0; i<num_Pext_cols; i++)
703 global_id_P[i] = temp[i];
704 }
705 if (num_Pcouple_cols || sum > 0)
706 delete[] temp;
707 #pragma omp parallel for private(i, where_p) schedule(static)
708 for (i=0; i<sum; i++) {
709 where_p = (index_t *)bsearch(
710 &(P->remote_coupleBlock->pattern->index[i]),
711 global_id_P, num_Pext_cols,
712 sizeof(index_t), Paso_comparIndex);
713 P->remote_coupleBlock->pattern->index[i] =
714 (index_t)(where_p -global_id_P);
715 }
716
717 /* build the mapping */
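/* both global_id_P and P->global_id are sorted in ascending order, so a
   single linear pass matches each column of P->col_coupleBlock with its
   position in the compressed list of P_ext columns */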
718 if (num_Pcouple_cols) {
719 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
720 iptr = 0;
721 for (i=0; i<num_Pext_cols; i++)
722 if (global_id_P[i] == P->global_id[iptr]) {
723 Pcouple_to_Pext[iptr++] = i;
724 if (iptr == num_Pcouple_cols) break;
725 }
726 }
727 }
728
729 /* allocate and initialise the markers */
730 sum = num_Pext_cols + num_Pmain_cols;
731 P_marker = new index_t[sum];
732 A_marker = new index_t[num_A_cols];
733 #pragma omp parallel for private(i) schedule(static)
734 for (i=0; i<sum; i++) P_marker[i] = -1;
735 #pragma omp parallel for private(i) schedule(static)
736 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
737
738 /* Now, count the size of RAP_ext. Start with rows in R_couple */
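/* This is the symbolic phase of a two-pass sparse triple product: the rows
   of RAP_ext are traversed once just to count the non-zeros, and a second
   time (further below) to fill in indices and values. Two marker arrays
   avoid searching: A_marker[i2] == i_r records that column i2 of R*A has
   already been reached in row i_r, and P_marker[i_c] >= row_marker records
   that column i_c of RAP has already been counted for the current row. */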
739 sum = 0;
740 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
741 row_marker = sum;
742 /* then, loop over elements in row i_r of R_couple */
743 j1_ub = R_couple->pattern->ptr[i_r+1];
744 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
745 i1 = R_couple->pattern->index[j1];
746 /* then, loop over elements in row i1 of A->col_coupleBlock */
747 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
748 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
749 i2 = A->col_coupleBlock->pattern->index[j2];
750
751 /* check whether entry RA[i_r, i2] has been visited before.
752 A new RAP entry is possible only if entry RA[i_r, i2] has not
753 been visited yet */
754 if (A_marker[i2] != i_r) {
755 /* first, mark entry RA[i_r,i2] as visited */
756 A_marker[i2] = i_r;
757
758 /* then loop over elements in row i2 of P_ext_main */
759 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
760 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
761 i_c = P->row_coupleBlock->pattern->index[j3];
762
763 /* check whether entry RAP[i_r,i_c] has been created.
764 If not yet created, create the entry and increase
765 the total number of elements in RAP_ext */
766 if (P_marker[i_c] < row_marker) {
767 P_marker[i_c] = sum;
768 sum++;
769 }
770 }
771
772 /* loop over elements in row i2 of P_ext_couple, do the same */
773 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
774 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
775 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
776 if (P_marker[i_c] < row_marker) {
777 P_marker[i_c] = sum;
778 sum++;
779 }
780 }
781 }
782 }
783
784 /* now loop over elements in row i1 of A->mainBlock, repeat
785 the process we have done to block A->col_coupleBlock */
786 j2_ub = A->mainBlock->pattern->ptr[i1+1];
787 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
788 i2 = A->mainBlock->pattern->index[j2];
789 if (A_marker[i2 + num_Acouple_cols] != i_r) {
790 A_marker[i2 + num_Acouple_cols] = i_r;
791 j3_ub = P->mainBlock->pattern->ptr[i2+1];
792 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
793 i_c = P->mainBlock->pattern->index[j3];
794 if (P_marker[i_c] < row_marker) {
795 P_marker[i_c] = sum;
796 sum++;
797 }
798 }
799 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
800 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
801 /* note that we need to map the column index in
802 P->col_coupleBlock back into the column index in
803 P_ext_couple */
804 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
805 if (P_marker[i_c] < row_marker) {
806 P_marker[i_c] = sum;
807 sum++;
808 }
809 }
810 }
811 }
812 }
813 }
814
815 /* Now we have the number of non-zero elements in RAP_ext, allocate
816 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
817 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
818 RAP_ext_idx = new index_t[sum];
819 RAP_ext_val = new double[sum * block_size];
820 RA_val = new double[block_size];
821 RAP_val = new double[block_size];
822
823 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
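/* Numeric phase: the traversal of the symbolic phase is repeated, now
   multiplying the small dense blocks. Blocks of A are stored column-major,
   i.e. entry (irb,icb) sits at position irb + row_block_size*icb, while the
   blocks of P (and of R = P^T) are held as vectors of length P_block_size,
   which corresponds to P having diagonal blocks. Each contribution is built
   in two steps: RA_val = diag(R_block) * A_block, then
   RAP_val = RA_val * diag(P_block), accumulated into RAP_ext_val. */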
824 sum = num_Pext_cols + num_Pmain_cols;
825 #pragma omp parallel for private(i) schedule(static)
826 for (i=0; i<sum; i++) P_marker[i] = -1;
827 #pragma omp parallel for private(i) schedule(static)
828 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
829 sum = 0;
830 RAP_ext_ptr[0] = 0;
831 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
832 row_marker = sum;
833 /* then, loop over elements in row i_r of R_couple */
834 j1_ub = R_couple->pattern->ptr[i_r+1];
835 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
836 i1 = R_couple->pattern->index[j1];
837 R_val = &(R_couple->val[j1*P_block_size]);
838
839 /* then, loop over elements in row i1 of A->col_coupleBlock */
840 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
841 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
842 i2 = A->col_coupleBlock->pattern->index[j2];
843 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
844 for (irb=0; irb<row_block_size; irb++)
845 for (icb=0; icb<col_block_size; icb++)
846 RA_val[irb+row_block_size*icb] = ZERO;
847 for (irb=0; irb<P_block_size; irb++) {
848 rtmp=R_val[irb];
849 for (icb=0; icb<col_block_size; icb++) {
850 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
851 }
852 }
853
854 /* check whether entry RA[i_r, i2] has been visited before.
855 A new RAP entry is possible only if entry RA[i_r, i2] has not
856 been visited yet */
857 if (A_marker[i2] != i_r) {
858 /* first, mark entry RA[i_r,i2] as visited */
859 A_marker[i2] = i_r;
860
861 /* then loop over elements in row i2 of P_ext_main */
862 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
863 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
864 i_c = P->row_coupleBlock->pattern->index[j3];
865 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
866 for (irb=0; irb<row_block_size; irb++)
867 for (icb=0; icb<col_block_size; icb++)
868 RAP_val[irb+row_block_size*icb] = ZERO;
869 for (icb=0; icb<P_block_size; icb++) {
870 rtmp = temp_val[icb];
871 for (irb=0; irb<row_block_size; irb++) {
872 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
873 }
874 }
875
876 /* check whether entry RAP[i_r,i_c] has been created.
877 If not yet created, create the entry and increase
878 the total number of elements in RAP_ext */
879 if (P_marker[i_c] < row_marker) {
880 P_marker[i_c] = sum;
881 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
882 RAP_ext_idx[sum] = i_c + offset;
883 sum++;
884 } else {
885 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
886 for (ib=0; ib<block_size; ib++) {
887 temp_val[ib] += RAP_val[ib];
888 }
889 }
890 }
891
892 /* loop over elements in row i2 of P_ext_couple, do the same */
893 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
894 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
895 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
896 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
897 for (irb=0; irb<row_block_size; irb++)
898 for (icb=0; icb<col_block_size; icb++)
899 RAP_val[irb+row_block_size*icb]=ZERO;
900 for (icb=0; icb<P_block_size; icb++) {
901 rtmp = temp_val[icb];
902 for (irb=0; irb<row_block_size; irb++) {
903 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
904 }
905 }
906
907 if (P_marker[i_c] < row_marker) {
908 P_marker[i_c] = sum;
909 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
910 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
911 sum++;
912 } else {
913 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
914 for (ib=0; ib<block_size; ib++) {
915 temp_val[ib] += RAP_val[ib];
916 }
917 }
918 }
919
920 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
921 Only the contributions are added. */
922 } else {
923 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
924 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
925 i_c = P->row_coupleBlock->pattern->index[j3];
926 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
927 for (irb=0; irb<row_block_size; irb++)
928 for (icb=0; icb<col_block_size; icb++)
929 RAP_val[irb+row_block_size*icb]=ZERO;
930 for (icb=0; icb<P_block_size; icb++) {
931 rtmp = temp_val[icb];
932 for (irb=0; irb<row_block_size; irb++) {
933 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
934 }
935 }
936
937 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
938 for (ib=0; ib<block_size; ib++) {
939 temp_val[ib] += RAP_val[ib];
940 }
941 }
942 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
943 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
944 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
945 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
946 for (irb=0; irb<row_block_size; irb++)
947 for (icb=0; icb<col_block_size; icb++)
948 RAP_val[irb+row_block_size*icb]=ZERO;
949 for (icb=0; icb<P_block_size; icb++) {
950 rtmp = temp_val[icb];
951 for (irb=0; irb<row_block_size; irb++) {
952 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
953 }
954 }
955
956 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
957 for (ib=0; ib<block_size; ib++) {
958 temp_val[ib] += RAP_val[ib];
959 }
960 }
961 }
962 }
963
964 /* now loop over elements in row i1 of A->mainBlock, repeat
965 the process we have done to block A->col_coupleBlock */
966 j2_ub = A->mainBlock->pattern->ptr[i1+1];
967 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
968 i2 = A->mainBlock->pattern->index[j2];
969 temp_val = &(A->mainBlock->val[j2*block_size]);
970 for (irb=0; irb<row_block_size; irb++)
971 for (icb=0; icb<col_block_size; icb++)
972 RA_val[irb+row_block_size*icb]=ZERO;
973 for (irb=0; irb<P_block_size; irb++) {
974 rtmp=R_val[irb];
975 for (icb=0; icb<col_block_size; icb++) {
976 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
977 }
978 }
979
980 if (A_marker[i2 + num_Acouple_cols] != i_r) {
981 A_marker[i2 + num_Acouple_cols] = i_r;
982 j3_ub = P->mainBlock->pattern->ptr[i2+1];
983 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
984 i_c = P->mainBlock->pattern->index[j3];
985 temp_val = &(P->mainBlock->val[j3*P_block_size]);
986 for (irb=0; irb<row_block_size; irb++)
987 for (icb=0; icb<col_block_size; icb++)
988 RAP_val[irb+row_block_size*icb]=ZERO;
989 for (icb=0; icb<P_block_size; icb++) {
990 rtmp = temp_val[icb];
991 for (irb=0; irb<row_block_size; irb++) {
992 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
993 }
994 }
995
996 if (P_marker[i_c] < row_marker) {
997 P_marker[i_c] = sum;
998 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
999 RAP_ext_idx[sum] = i_c + offset;
1000 sum++;
1001 } else {
1002 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1003 for (ib=0; ib<block_size; ib++) {
1004 temp_val[ib] += RAP_val[ib];
1005 }
1006 }
1007 }
1008 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1009 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1010 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1011 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1012 for (irb=0; irb<row_block_size; irb++)
1013 for (icb=0; icb<col_block_size; icb++)
1014 RAP_val[irb+row_block_size*icb]=ZERO;
1015 for (icb=0; icb<P_block_size; icb++) {
1016 rtmp=temp_val[icb];
1017 for (irb=0; irb<row_block_size; irb++) {
1018 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1019 }
1020 }
1021
1022 if (P_marker[i_c] < row_marker) {
1023 P_marker[i_c] = sum;
1024 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1025 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1026 sum++;
1027 } else {
1028 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1029 for (ib=0; ib<block_size; ib++) {
1030 temp_val[ib] += RAP_val[ib];
1031 }
1032 }
1033 }
1034 } else {
1035 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1036 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1037 i_c = P->mainBlock->pattern->index[j3];
1038 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1039 for (irb=0; irb<row_block_size; irb++)
1040 for (icb=0; icb<col_block_size; icb++)
1041 RAP_val[irb+row_block_size*icb]=ZERO;
1042 for (icb=0; icb<P_block_size; icb++) {
1043 rtmp=temp_val[icb];
1044 for (irb=0; irb<row_block_size; irb++) {
1045 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1046 }
1047 }
1048
1049 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1050 for (ib=0; ib<block_size; ib++) {
1051 temp_val[ib] += RAP_val[ib];
1052 }
1053 }
1054 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1055 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1056 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1057 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1058 for (irb=0; irb<row_block_size; irb++)
1059 for (icb=0; icb<col_block_size; icb++)
1060 RAP_val[irb+row_block_size*icb]=ZERO;
1061 for (icb=0; icb<P_block_size; icb++) {
1062 rtmp=temp_val[icb];
1063 for (irb=0; irb<row_block_size; irb++) {
1064 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1065 }
1066 }
1067
1068 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1069 for (ib=0; ib<block_size; ib++) {
1070 temp_val[ib] += RAP_val[ib];
1071 }
1072 }
1073 }
1074 }
1075 }
1076 RAP_ext_ptr[i_r+1] = sum;
1077 }
1078 delete[] P_marker;
1079 delete[] Pcouple_to_Pext;
1080
1081 /* Now we have the part of RAP[r,c] where the row list "r" is given
1082 by the column list of P->col_coupleBlock, and the column list "c"
1083 possibly covers the whole column range of the system matrix P.
1084 This data is to be passed to the neighbouring processors and
1085 added to the corresponding RAP[r,c] entries on those
1086 processors */
1087 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1088 &RAP_ext_val, global_id_P, block_size);
1089
1090 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1091 sum = RAP_ext_ptr[num_RAPext_rows];
1092 num_RAPext_cols = 0;
1093 if (num_Pext_cols || sum > 0) {
1094 temp = new index_t[num_Pext_cols+sum];
1095 j1_ub = offset + num_Pmain_cols;
1096 for (i=0, iptr=0; i<sum; i++) {
1097 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1098 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1099 }
1100 for (j=0; j<num_Pext_cols; j++, iptr++){
1101 temp[iptr] = global_id_P[j];
1102 }
1103
1104 if (iptr) {
1105 #ifdef USE_QSORTG
1106 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1107 #else
1108 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1109 #endif
1110 num_RAPext_cols = 1;
1111 i = temp[0];
1112 for (j=1; j<iptr; j++) {
1113 if (temp[j] > i) {
1114 i = temp[j];
1115 temp[num_RAPext_cols++] = i;
1116 }
1117 }
1118 }
1119 }
1120
1121 /* build the list of global column IDs used by RAP_ext (global_id_RAP) */
1122 if(num_RAPext_cols){
1123 global_id_RAP = new index_t[num_RAPext_cols];
1124 #pragma omp parallel for private(i) schedule(static)
1125 for (i=0; i<num_RAPext_cols; i++)
1126 global_id_RAP[i] = temp[i];
1127 }
1128 if (num_Pext_cols || sum > 0)
1129 delete[] temp;
1130 j1_ub = offset + num_Pmain_cols;
1131 #pragma omp parallel for private(i, where_p) schedule(static)
1132 for (i=0; i<sum; i++) {
1133 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1134 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1135 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1136 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1137 } else
1138 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1139 }
1140
1141 /* build the mapping */
1142 if (num_Pcouple_cols) {
1143 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1144 iptr = 0;
1145 for (i=0; i<num_RAPext_cols; i++)
1146 if (global_id_RAP[i] == P->global_id[iptr]) {
1147 Pcouple_to_RAP[iptr++] = i;
1148 if (iptr == num_Pcouple_cols) break;
1149 }
1150 }
1151
1152 if (num_Pext_cols) {
1153 Pext_to_RAP = new index_t[num_Pext_cols];
1154 iptr = 0;
1155 for (i=0; i<num_RAPext_cols; i++)
1156 if (global_id_RAP[i] == global_id_P[iptr]) {
1157 Pext_to_RAP[iptr++] = i;
1158 if (iptr == num_Pext_cols) break;
1159 }
1160 }
1161
1162 if (global_id_P){
1163 delete[] global_id_P;
1164 global_id_P = NULL;
1165 }
1166
1167 /* allocate and initialise the markers */
1168 sum = num_RAPext_cols + num_Pmain_cols;
1169 P_marker = new index_t[sum];
1170 #pragma omp parallel for private(i) schedule(static)
1171 for (i=0; i<sum; i++) P_marker[i] = -1;
1172 #pragma omp parallel for private(i) schedule(static)
1173 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1174
1175 /* Now, count the size of RAP. Start with rows in R_main */
1176 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1177 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1178 shared = P->col_coupler->connector->send->shared;
1179 i = 0;
1180 j = 0;
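/* "i" counts the non-zeros that will go into the main block RAP_main
   (columns owned by this rank) and "j" counts those that will go into the
   couple block RAP_couple (columns owned by other ranks). For each local
   row i_r, the contributions received in RAP_ext from the neighbouring
   ranks are merged in first, followed by the locally computed products
   from R_main, A and P. */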
1181 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1182 /* Mark the start of row for both main block and couple block */
1183 row_marker = i;
1184 row_marker_ext = j;
1185
1186 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1187 in RAP_ext */
1188 P_marker[i_r] = i;
1189 i++;
1190 for (j1=0; j1<num_neighbors; j1++) {
1191 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1192 if (shared[j2] == i_r) {
1193 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1194 i_c = RAP_ext_idx[k];
1195 if (i_c < num_Pmain_cols) {
1196 if (P_marker[i_c] < row_marker) {
1197 P_marker[i_c] = i;
1198 i++;
1199 }
1200 } else {
1201 if (P_marker[i_c] < row_marker_ext) {
1202 P_marker[i_c] = j;
1203 j++;
1204 }
1205 }
1206 }
1207 break;
1208 }
1209 }
1210 }
1211
1212 /* then, loop over elements in row i_r of R_main */
1213 j1_ub = R_main->pattern->ptr[i_r+1];
1214 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1215 i1 = R_main->pattern->index[j1];
1216
1217 /* then, loop over elements in row i1 of A->col_coupleBlock */
1218 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1219 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1220 i2 = A->col_coupleBlock->pattern->index[j2];
1221
1222 /* check whether entry RA[i_r, i2] has been visited before.
1223 A new RAP entry is possible only if entry RA[i_r, i2] has not
1224 been visited yet */
1225 if (A_marker[i2] != i_r) {
1226 /* first, mark entry RA[i_r,i2] as visited */
1227 A_marker[i2] = i_r;
1228
1229 /* then loop over elements in row i2 of P_ext_main */
1230 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1231 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232 i_c = P->row_coupleBlock->pattern->index[j3];
1233
1234 /* check whether entry RAP[i_r,i_c] has been created.
1235 If not yet created, create the entry and increase
1236 the total number of elements in RAP_ext */
1237 if (P_marker[i_c] < row_marker) {
1238 P_marker[i_c] = i;
1239 i++;
1240 }
1241 }
1242
1243 /* loop over elements in row i2 of P_ext_couple, do the same */
1244 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1245 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1246 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1247 if (P_marker[i_c] < row_marker_ext) {
1248 P_marker[i_c] = j;
1249 j++;
1250 }
1251 }
1252 }
1253 }
1254
1255 /* now loop over elements in row i1 of A->mainBlock, repeat
1256 the process we have done to block A->col_coupleBlock */
1257 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1258 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1259 i2 = A->mainBlock->pattern->index[j2];
1260 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1261 A_marker[i2 + num_Acouple_cols] = i_r;
1262 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1263 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1264 i_c = P->mainBlock->pattern->index[j3];
1265 if (P_marker[i_c] < row_marker) {
1266 P_marker[i_c] = i;
1267 i++;
1268 }
1269 }
1270 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1271 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1272 /* note that we need to map the column index in
1273 P->col_coupleBlock back into the column index in
1274 P_ext_couple */
1275 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1276 if (P_marker[i_c] < row_marker_ext) {
1277 P_marker[i_c] = j;
1278 j++;
1279 }
1280 }
1281 }
1282 }
1283 }
1284 }
1285
1286 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1287 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1288 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1289 RAP_couple */
1290 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1291 RAP_main_idx = new index_t[i];
1292 RAP_main_val = new double[i * block_size];
1293 RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1294 RAP_couple_idx = new index_t[j];
1295 RAP_couple_val = new double[j * block_size];
1296
1297 RAP_main_ptr[num_Pmain_cols] = i;
1298 RAP_couple_ptr[num_Pmain_cols] = j;
1299
1300 #pragma omp parallel for private(i) schedule(static)
1301 for (i=0; i<sum; i++) P_marker[i] = -1;
1302 #pragma omp parallel for private(i) schedule(static)
1303 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1304
1305 /* Now, fill in the data for RAP_main and RAP_couple. Start with rows
1306 in R_main */
1307 i = 0;
1308 j = 0;
1309 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1310 /* Mark the start of row for both main block and couple block */
1311 row_marker = i;
1312 row_marker_ext = j;
1313 RAP_main_ptr[i_r] = row_marker;
1314 RAP_couple_ptr[i_r] = row_marker_ext;
1315
1316 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1317 in row i_r of RAP_ext */
1318 P_marker[i_r] = i;
1319 for (ib=0; ib<block_size; ib++)
1320 RAP_main_val[i*block_size+ib] = ZERO;
1321 RAP_main_idx[i] = i_r;
1322 i++;
1323
1324 for (j1=0; j1<num_neighbors; j1++) {
1325 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1326 if (shared[j2] == i_r) {
1327 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1328 i_c = RAP_ext_idx[k];
1329 if (i_c < num_Pmain_cols) {
1330 if (P_marker[i_c] < row_marker) {
1331 P_marker[i_c] = i;
1332 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1333 RAP_main_idx[i] = i_c;
1334 i++;
1335 } else {
1336 t1_val = &(RAP_ext_val[k*block_size]);
1337 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1338 for (ib=0; ib<block_size; ib++)
1339 t2_val[ib] += t1_val[ib];
1340 }
1341 } else {
1342 if (P_marker[i_c] < row_marker_ext) {
1343 P_marker[i_c] = j;
1344 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1345 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1346 j++;
1347 } else {
1348 t1_val = &(RAP_ext_val[k*block_size]);
1349 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1350 for (ib=0; ib<block_size; ib++)
1351 t2_val[ib] += t1_val[ib];
1352 }
1353 }
1354 }
1355 break;
1356 }
1357 }
1358 }
1359
1360 /* then, loop over elements in row i_r of R_main */
1361 j1_ub = R_main->pattern->ptr[i_r+1];
1362 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1363 i1 = R_main->pattern->index[j1];
1364 R_val = &(R_main->val[j1*P_block_size]);
1365
1366 /* then, loop over elements in row i1 of A->col_coupleBlock */
1367 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1368 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1369 i2 = A->col_coupleBlock->pattern->index[j2];
1370 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1371 for (irb=0; irb<row_block_size; irb++)
1372 for (icb=0; icb<col_block_size; icb++)
1373 RA_val[irb+row_block_size*icb]=ZERO;
1374 for (irb=0; irb<P_block_size; irb++) {
1375 rtmp=R_val[irb];
1376 for (icb=0; icb<col_block_size; icb++) {
1377 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1378 }
1379 }
1380
1381
1382 /* check whether entry RA[i_r, i2] has been visited before.
1383 A new RAP entry is possible only if entry RA[i_r, i2] has not
1384 been visited yet */
1385 if (A_marker[i2] != i_r) {
1386 /* first, mark entry RA[i_r,i2] as visited */
1387 A_marker[i2] = i_r;
1388
1389 /* then loop over elements in row i2 of P_ext_main */
1390 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1391 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1392 i_c = P->row_coupleBlock->pattern->index[j3];
1393 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1394 for (irb=0; irb<row_block_size; irb++)
1395 for (icb=0; icb<col_block_size; icb++)
1396 RAP_val[irb+row_block_size*icb]=ZERO;
1397 for (icb=0; icb<P_block_size; icb++) {
1398 rtmp=temp_val[icb];
1399 for (irb=0; irb<row_block_size; irb++) {
1400 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1401 }
1402 }
1403
1404
1405 /* check whether entry RAP[i_r,i_c] has been created.
1406 If not yet created, create the entry and increase
1407 the total number of elements in RAP_ext */
1408 if (P_marker[i_c] < row_marker) {
1409 P_marker[i_c] = i;
1410 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1411 RAP_main_idx[i] = i_c;
1412 i++;
1413 } else {
1414 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1415 for (ib=0; ib<block_size; ib++) {
1416 temp_val[ib] += RAP_val[ib];
1417 }
1418 }
1419 }
1420
1421 /* loop over elements in row i2 of P_ext_couple, do the same */
1422 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1423 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1424 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1425 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1426 for (irb=0; irb<row_block_size; irb++)
1427 for (icb=0; icb<col_block_size; icb++)
1428 RAP_val[irb+row_block_size*icb]=ZERO;
1429 for (icb=0; icb<P_block_size; icb++) {
1430 rtmp=temp_val[icb];
1431 for (irb=0; irb<row_block_size; irb++) {
1432 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1433 }
1434 }
1435
1436 if (P_marker[i_c] < row_marker_ext) {
1437 P_marker[i_c] = j;
1438 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1439 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1440 j++;
1441 } else {
1442 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1443 for (ib=0; ib<block_size; ib++) {
1444 temp_val[ib] += RAP_val[ib];
1445 }
1446 }
1447 }
1448
1449 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1450 Only the contributions are added. */
1451 } else {
1452 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1453 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1454 i_c = P->row_coupleBlock->pattern->index[j3];
1455 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1456 for (irb=0; irb<row_block_size; irb++)
1457 for (icb=0; icb<col_block_size; icb++)
1458 RAP_val[irb+row_block_size*icb]=ZERO;
1459 for (icb=0; icb<P_block_size; icb++) {
1460 rtmp=temp_val[icb];
1461 for (irb=0; irb<row_block_size; irb++) {
1462 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1463 }
1464 }
1465
1466 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1467 for (ib=0; ib<block_size; ib++) {
1468 temp_val[ib] += RAP_val[ib];
1469 }
1470 }
1471 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1472 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1473 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1474 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1475 for (irb=0; irb<row_block_size; irb++)
1476 for (icb=0; icb<col_block_size; icb++)
1477 RAP_val[irb+row_block_size*icb]=ZERO;
1478 for (icb=0; icb<P_block_size; icb++) {
1479 rtmp=temp_val[icb];
1480 for (irb=0; irb<row_block_size; irb++) {
1481 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1482 }
1483 }
1484
1485 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1486 for (ib=0; ib<block_size; ib++) {
1487 temp_val[ib] += RAP_val[ib];
1488 }
1489 }
1490 }
1491 }
1492
1493 /* now loop over elements in row i1 of A->mainBlock, repeat
1494 the process we have done to block A->col_coupleBlock */
1495 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1496 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1497 i2 = A->mainBlock->pattern->index[j2];
1498 temp_val = &(A->mainBlock->val[j2*block_size]);
1499 for (irb=0; irb<row_block_size; irb++)
1500 for (icb=0; icb<col_block_size; icb++)
1501 RA_val[irb+row_block_size*icb]=ZERO;
1502 for (irb=0; irb<P_block_size; irb++) {
1503 rtmp=R_val[irb];
1504 for (icb=0; icb<col_block_size; icb++) {
1505 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1506 }
1507 }
1508
1509 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1510 A_marker[i2 + num_Acouple_cols] = i_r;
1511 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1512 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1513 i_c = P->mainBlock->pattern->index[j3];
1514 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1515 for (irb=0; irb<row_block_size; irb++)
1516 for (icb=0; icb<col_block_size; icb++)
1517 RAP_val[irb+row_block_size*icb]=ZERO;
1518 for (icb=0; icb<P_block_size; icb++) {
1519 rtmp=temp_val[icb];
1520 for (irb=0; irb<row_block_size; irb++) {
1521 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1522 }
1523 }
1524 if (P_marker[i_c] < row_marker) {
1525 P_marker[i_c] = i;
1526 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1527 RAP_main_idx[i] = i_c;
1528 i++;
1529 } else {
1530 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1531 for (ib=0; ib<block_size; ib++) {
1532 temp_val[ib] += RAP_val[ib];
1533 }
1534 }
1535 }
1536 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1537 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1538 /* note that we need to map the column index in
1539 P->col_coupleBlock back into the column index in
1540 P_ext_couple */
1541 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1542 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1543 for (irb=0; irb<row_block_size; irb++)
1544 for (icb=0; icb<col_block_size; icb++)
1545 RAP_val[irb+row_block_size*icb]=ZERO;
1546 for (icb=0; icb<P_block_size; icb++) {
1547 rtmp=temp_val[icb];
1548 for (irb=0; irb<row_block_size; irb++) {
1549 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1550 }
1551 }
1552
1553 if (P_marker[i_c] < row_marker_ext) {
1554 P_marker[i_c] = j;
1555 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1556 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1557 j++;
1558 } else {
1559 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1560 for (ib=0; ib<block_size; ib++) {
1561 temp_val[ib] += RAP_val[ib];
1562 }
1563 }
1564 }
1565
1566 } else {
1567 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1568 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1569 i_c = P->mainBlock->pattern->index[j3];
1570 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1571 for (irb=0; irb<row_block_size; irb++)
1572 for (icb=0; icb<col_block_size; icb++)
1573 RAP_val[irb+row_block_size*icb]=ZERO;
1574 for (icb=0; icb<P_block_size; icb++) {
1575 rtmp=temp_val[icb];
1576 for (irb=0; irb<row_block_size; irb++) {
1577 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1578 }
1579 }
1580
1581 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1582 for (ib=0; ib<block_size; ib++) {
1583 temp_val[ib] += RAP_val[ib];
1584 }
1585 }
1586 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1587 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1588 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1589 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1590 for (irb=0; irb<row_block_size; irb++)
1591 for (icb=0; icb<col_block_size; icb++)
1592 RAP_val[irb+row_block_size*icb]=ZERO;
1593 for (icb=0; icb<P_block_size; icb++) {
1594 rtmp = temp_val[icb];
1595 for (irb=0; irb<row_block_size; irb++) {
1596 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1597 }
1598 }
1599
1600 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1601 for (ib=0; ib<block_size; ib++) {
1602 temp_val[ib] += RAP_val[ib];
1603 }
1604 }
1605 }
1606 }
1607 }
1608
1609 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1610 if (i > row_marker) {
1611 offset = i - row_marker;
1612 temp = new index_t[offset];
1613 #pragma omp parallel for schedule(static) private(iptr)
1614 for (iptr=0; iptr<offset; iptr++)
1615 temp[iptr] = RAP_main_idx[row_marker+iptr];
1616 if (offset > 0) {
1617 #ifdef USE_QSORTG
1618 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1619 #else
1620 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1621 #endif
1622 }
1623 temp_val = new double[offset * block_size];
1624 #pragma omp parallel for schedule(static) private(iptr,k)
1625 for (iptr=0; iptr<offset; iptr++){
1626 k = P_marker[temp[iptr]];
1627 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1628 P_marker[temp[iptr]] = iptr + row_marker;
1629 }
1630 #pragma omp parallel for schedule(static) private(iptr)
1631 for (iptr=0; iptr<offset; iptr++){
1632 RAP_main_idx[row_marker+iptr] = temp[iptr];
1633 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1634 }
1635 delete[] temp;
1636 delete[] temp_val;
1637 }
1638 if (j > row_marker_ext) {
1639 offset = j - row_marker_ext;
1640 temp = new index_t[offset];
1641 #pragma omp parallel for schedule(static) private(iptr)
1642 for (iptr=0; iptr<offset; iptr++)
1643 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1644 if (offset > 0) {
1645 #ifdef USE_QSORTG
1646 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1647 #else
1648 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1649 #endif
1650 }
1651 temp_val = new double[offset * block_size];
1652 #pragma omp parallel for schedule(static) private(iptr, k)
1653 for (iptr=0; iptr<offset; iptr++){
1654 k = P_marker[temp[iptr] + num_Pmain_cols];
1655 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1656 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1657 }
1658 #pragma omp parallel for schedule(static) private(iptr)
1659 for (iptr=0; iptr<offset; iptr++){
1660 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1661 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1662 }
1663 delete[] temp;
1664 delete[] temp_val;
1665 }
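/* A minimal sketch of the sort-and-reorder step above, assuming a
   hypothetical helper name and using insertion sort in place of the
   qsortG/qsort + P_marker relocation actually performed here: the column
   indices of one CSR row are put into ascending order while the block of
   values attached to each index travels with it.

   static void sort_row_entries_sketch(index_t *idx, double *val,
                                       dim_t n, dim_t block_size)
   {
      double *tmp_val = new double[block_size];
      for (dim_t p=1; p<n; p++) {
         index_t tmp_idx = idx[p];
         memcpy(tmp_val, &(val[p*block_size]), block_size*sizeof(double));
         dim_t q = p;
         for (; q>0 && idx[q-1]>tmp_idx; q--) {
            idx[q] = idx[q-1];
            memcpy(&(val[q*block_size]), &(val[(q-1)*block_size]),
                   block_size*sizeof(double));
         }
         idx[q] = tmp_idx;
         memcpy(&(val[q*block_size]), tmp_val, block_size*sizeof(double));
      }
      delete[] tmp_val;
   }
*/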
1666 }
1667
1668 delete[] RA_val;
1669 delete[] RAP_val;
1670 delete[] A_marker;
1671 delete[] Pext_to_RAP;
1672 delete[] Pcouple_to_RAP;
1673 delete[] RAP_ext_ptr;
1674 delete[] RAP_ext_idx;
1675 delete[] RAP_ext_val;
1676 Paso_SparseMatrix_free(R_main);
1677 Paso_SparseMatrix_free(R_couple);
1678
1679 /* Check whether there are empty columns in RAP_couple */
1680 #pragma omp parallel for schedule(static) private(i)
1681 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1682 /* the number of non-empty columns is stored in "k" */
1683 k = 0;
1684 j = RAP_couple_ptr[num_Pmain_cols];
1685 for (i=0; i<j; i++) {
1686 i1 = RAP_couple_idx[i];
1687 if (P_marker[i1]) {
1688 P_marker[i1] = 0;
1689 k++;
1690 }
1691 }
1692
1693 /* empty columns were found: renumber the remaining columns */
1694 if (k < num_RAPext_cols) {
1695 temp = new index_t[k];
1696 k = 0;
1697 for (i=0; i<num_RAPext_cols; i++)
1698 if (!P_marker[i]) {
1699 P_marker[i] = k;
1700 temp[k] = global_id_RAP[i];
1701 k++;
1702 }
1703 #pragma omp parallel for schedule(static) private(i, i1)
1704 for (i=0; i<j; i++) {
1705 i1 = RAP_couple_idx[i];
1706 RAP_couple_idx[i] = P_marker[i1];
1707 }
1708 num_RAPext_cols = k;
1709 delete[] global_id_RAP;
1710 global_id_RAP = temp;
1711 }
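/* The empty-column check above can be read as a two-pass compression; a
   self-contained sketch with hypothetical names (used[], new_id[]) instead
   of reusing P_marker for both the flag and the new numbering:

   static dim_t compress_couple_columns_sketch(index_t *couple_idx, dim_t nnz,
                                               index_t *global_id, dim_t num_cols)
   {
      bool *used = new bool[num_cols];
      index_t *new_id = new index_t[num_cols];
      for (dim_t i=0; i<num_cols; i++) used[i] = false;
      // pass 1: flag the columns that actually occur in the couple block
      for (dim_t i=0; i<nnz; i++) used[couple_idx[i]] = true;
      // pass 2: give surviving columns consecutive ids, keep their global ids
      dim_t k = 0;
      for (dim_t i=0; i<num_cols; i++) {
         if (used[i]) {
            new_id[i] = k;
            global_id[k] = global_id[i];
            k++;
         }
      }
      for (dim_t i=0; i<nnz; i++) couple_idx[i] = new_id[couple_idx[i]];
      delete[] used;
      delete[] new_id;
      return k;   // number of non-empty columns
   }
*/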
1712 delete[] P_marker;
1713
1714 /******************************************************/
1715 /* Start to create the coarse level System Matrix A_c */
1716 /******************************************************/
1717 /* first, prepare the sender/receiver for the col_connector */
1718 dist = P->pattern->input_distribution->first_component;
1719 recv_len = new dim_t[size];
1720 send_len = new dim_t[size];
1721 neighbor = new Esys_MPI_rank[size];
1722 offsetInShared = new index_t[size+1];
1723 shared = new index_t[num_RAPext_cols];
1724 memset(recv_len, 0, sizeof(dim_t) * size);
1725 memset(send_len, 0, sizeof(dim_t) * size);
1726 num_neighbors = 0;
1727 offsetInShared[0] = 0;
1728 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1729 shared[i] = i + num_Pmain_cols;
1730 if (k <= global_id_RAP[i]) {
1731 if (recv_len[j] > 0) {
1732 neighbor[num_neighbors] = j;
1733 num_neighbors ++;
1734 offsetInShared[num_neighbors] = i;
1735 }
1736 while (k <= global_id_RAP[i]) {
1737 j++;
1738 k = dist[j+1];
1739 }
1740 }
1741 recv_len[j] ++;
1742 }
1743 if (recv_len[j] > 0) {
1744 neighbor[num_neighbors] = j;
1745 num_neighbors ++;
1746 offsetInShared[num_neighbors] = i;
1747 }
1748 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1749 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1750
1751 #ifdef ESYS_MPI
1752 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1753 #endif
1754
1755 #ifdef ESYS_MPI
1756 mpi_requests=new MPI_Request[size*2];
1757 mpi_stati=new MPI_Status[size*2];
1758 #else
1759 mpi_requests=new int[size*2];
1760 mpi_stati=new int[size*2];
1761 #endif
1762 num_neighbors = 0;
1763 j = 0;
1764 offsetInShared[0] = 0;
1765 for (i=0; i<size; i++) {
1766 if (send_len[i] > 0) {
1767 neighbor[num_neighbors] = i;
1768 num_neighbors ++;
1769 j += send_len[i];
1770 offsetInShared[num_neighbors] = j;
1771 }
1772 }
1773 delete[] shared;
1774 shared = new index_t[j];
1775 for (i=0, j=0; i<num_neighbors; i++) {
1776 k = neighbor[i];
1777 #ifdef ESYS_MPI
1778 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1779 mpi_info->msg_tag_counter+k,
1780 mpi_info->comm, &mpi_requests[i]);
1781 #endif
1782 j += send_len[k];
1783 }
1784 for (i=0, j=0; i<recv->numNeighbors; i++) {
1785 k = recv->neighbor[i];
1786 #ifdef ESYS_MPI
1787 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1788 mpi_info->msg_tag_counter+rank,
1789 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1790 #endif
1791 j += recv_len[k];
1792 }
1793 #ifdef ESYS_MPI
1794 MPI_Waitall(num_neighbors + recv->numNeighbors,
1795 mpi_requests, mpi_stati);
1796 #endif
1797 mpi_info->msg_tag_counter += size;
1798
1799 j = offsetInShared[num_neighbors];
1800 offset = dist[rank];
1801 #pragma omp parallel for schedule(static) private(i)
1802 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1803 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1804 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1805
1806 col_connector = Paso_Connector_alloc(send, recv);
1807 delete[] shared;
1808 Paso_SharedComponents_free(recv);
1809 Paso_SharedComponents_free(send);
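/* The construction above follows a common two-step handshake: every rank
   first announces how many of each other rank's columns it needs
   (MPI_Alltoall on the counts), then the actual id lists are exchanged with
   point-to-point messages. A stripped-down sketch, assuming the buffers
   needed_ids[]/serve_ids[], the request array and the tag are prepared
   elsewhere (all names here are illustrative only):

   // counts_needed[p]: columns owned by rank p that this rank needs
   // counts_serve[p] : columns owned by this rank that rank p needs
   MPI_Alltoall(counts_needed, 1, MPI_INT, counts_serve, 1, MPI_INT, comm);
   int nreq = 0;
   for (int p=0; p<size; p++)
      if (counts_serve[p] > 0)
         MPI_Irecv(serve_ids[p], counts_serve[p], MPI_INT, p, tag,
                   comm, &requests[nreq++]);
   for (int p=0; p<size; p++)
      if (counts_needed[p] > 0)
         MPI_Issend(needed_ids[p], counts_needed[p], MPI_INT, p, tag,
                    comm, &requests[nreq++]);
   MPI_Waitall(nreq, requests, MPI_STATUSES_IGNORE);
*/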
1810
1811 /* now, create the row distribution (output_distribution) and the column
1812 distribution (input_distribution) */
1813 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1814 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1815
1816 /* then, prepare the sender/receiver for the row_connector; first, prepare
1817 the information for the sender */
1818 sum = RAP_couple_ptr[num_Pmain_cols];
1819 len = new dim_t[size];
1820 send_ptr = new index_t*[size];
1821 send_idx = new index_t*[size];
1822 #pragma omp parallel for schedule(static) private(i)
1823 for (i=0; i<size; i++) {
1824 send_ptr[i] = new index_t[num_Pmain_cols];
1825 send_idx[i] = new index_t[sum];
1826 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1827 }
1828 memset(len, 0, sizeof(dim_t) * size);
1829 recv = col_connector->recv;
1830 sum=0;
1831 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1832 i1 = RAP_couple_ptr[i_r];
1833 i2 = RAP_couple_ptr[i_r+1];
1834 if (i2 > i1) {
1835 /* then row i_r will be in the sender of row_connector; now check
1836 how many neighbours i_r needs to be sent to */
1837 for (j=i1; j<i2; j++) {
1838 i_c = RAP_couple_idx[j];
1839 /* find out the corresponding neighbor "p" of column i_c */
1840 for (p=0; p<recv->numNeighbors; p++) {
1841 if (i_c < recv->offsetInShared[p+1]) {
1842 k = recv->neighbor[p];
1843 if (send_ptr[k][i_r] == 0) sum++;
1844 send_ptr[k][i_r] ++;
1845 send_idx[k][len[k]] = global_id_RAP[i_c];
1846 len[k] ++;
1847 break;
1848 }
1849 }
1850 }
1851 }
1852 }
1853 if (global_id_RAP) {
1854 delete[] global_id_RAP;
1855 global_id_RAP = NULL;
1856 }
1857
1858 /* now allocate the sender */
1859 shared = new index_t[sum];
1860 memset(send_len, 0, sizeof(dim_t) * size);
1861 num_neighbors=0;
1862 offsetInShared[0] = 0;
1863 for (p=0, k=0; p<size; p++) {
1864 for (i=0; i<num_Pmain_cols; i++) {
1865 if (send_ptr[p][i] > 0) {
1866 shared[k] = i;
1867 k++;
1868 send_ptr[p][send_len[p]] = send_ptr[p][i];
1869 send_len[p]++;
1870 }
1871 }
1872 if (k > offsetInShared[num_neighbors]) {
1873 neighbor[num_neighbors] = p;
1874 num_neighbors ++;
1875 offsetInShared[num_neighbors] = k;
1876 }
1877 }
1878 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1879 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1880
1881 /* send/recv the number of rows to be sent from the current proc,
1882 i.e. recover the info for the receiver of row_connector from the sender */
1883 #ifdef ESYS_MPI
1884 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1885 #endif
1886 num_neighbors = 0;
1887 offsetInShared[0] = 0;
1888 j = 0;
1889 for (i=0; i<size; i++) {
1890 if (i != rank && recv_len[i] > 0) {
1891 neighbor[num_neighbors] = i;
1892 num_neighbors ++;
1893 j += recv_len[i];
1894 offsetInShared[num_neighbors] = j;
1895 }
1896 }
1897 delete[] shared;
1898 delete[] recv_len;
1899 shared = new index_t[j];
1900 k = offsetInShared[num_neighbors];
1901 #pragma omp parallel for schedule(static) private(i)
1902 for (i=0; i<k; i++) {
1903 shared[i] = i + num_Pmain_cols;
1904 }
1905 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1906 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1907 row_connector = Paso_Connector_alloc(send, recv);
1908 delete[] shared;
1909 Paso_SharedComponents_free(recv);
1910 Paso_SharedComponents_free(send);
1911
1912 /* send/recv pattern->ptr for rowCoupleBlock */
1913 num_RAPext_rows = offsetInShared[num_neighbors];
1914 row_couple_ptr = new index_t[num_RAPext_rows+1];
1915 for (p=0; p<num_neighbors; p++) {
1916 j = offsetInShared[p];
1917 i = offsetInShared[p+1];
1918 #ifdef ESYS_MPI
1919 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1920 mpi_info->msg_tag_counter+neighbor[p],
1921 mpi_info->comm, &mpi_requests[p]);
1922 #endif
1923 }
1924 send = row_connector->send;
1925 for (p=0; p<send->numNeighbors; p++) {
1926 #ifdef ESYS_MPI
1927 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1928 MPI_INT, send->neighbor[p],
1929 mpi_info->msg_tag_counter+rank,
1930 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1931 #endif
1932 }
1933 #ifdef ESYS_MPI
1934 MPI_Waitall(num_neighbors + send->numNeighbors,
1935 mpi_requests, mpi_stati);
1936 #endif
1937 mpi_info->msg_tag_counter += size;
1938 delete[] send_len;
1939
1940 sum = 0;
1941 for (i=0; i<num_RAPext_rows; i++) {
1942 k = row_couple_ptr[i];
1943 row_couple_ptr[i] = sum;
1944 sum += k;
1945 }
1946 row_couple_ptr[num_RAPext_rows] = sum;
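/* The loop above is an in-place exclusive prefix sum: the received per-row
   counts become CSR row offsets. The same idea as a hypothetical
   stand-alone helper:

   static void counts_to_csr_ptr_sketch(index_t *ptr, dim_t num_rows)
   {
      // on entry : ptr[i] holds the number of entries in row i
      // on return: ptr[i] is the offset of row i, ptr[num_rows] the total
      index_t total = 0;
      for (dim_t i=0; i<num_rows; i++) {
         index_t cnt = ptr[i];
         ptr[i] = total;
         total += cnt;
      }
      ptr[num_rows] = total;
   }
*/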
1947
1948 /* send/recv pattern->index for rowCoupleBlock */
1949 k = row_couple_ptr[num_RAPext_rows];
1950 row_couple_idx = new index_t[k];
1951 for (p=0; p<num_neighbors; p++) {
1952 j1 = row_couple_ptr[offsetInShared[p]];
1953 j2 = row_couple_ptr[offsetInShared[p+1]];
1954 #ifdef ESYS_MPI
1955 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1956 mpi_info->msg_tag_counter+neighbor[p],
1957 mpi_info->comm, &mpi_requests[p]);
1958 #endif
1959 }
1960 for (p=0; p<send->numNeighbors; p++) {
1961 #ifdef ESYS_MPI
1962 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1963 MPI_INT, send->neighbor[p],
1964 mpi_info->msg_tag_counter+rank,
1965 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1966 #endif
1967 }
1968 #ifdef ESYS_MPI
1969 MPI_Waitall(num_neighbors + send->numNeighbors,
1970 mpi_requests, mpi_stati);
1971 #endif
1972 mpi_info->msg_tag_counter += size;
1973
1974 offset = input_dist->first_component[rank];
1975 k = row_couple_ptr[num_RAPext_rows];
1976 #pragma omp parallel for schedule(static) private(i)
1977 for (i=0; i<k; i++) {
1978 row_couple_idx[i] -= offset;
1979 }
1980 #pragma omp parallel for schedule(static) private(i)
1981 for (i=0; i<size; i++) {
1982 delete[] send_ptr[i];
1983 delete[] send_idx[i];
1984 }
1985 delete[] send_ptr;
1986 delete[] send_idx;
1987 delete[] len;
1988 delete[] offsetInShared;
1989 delete[] neighbor;
1990 delete[] mpi_requests;
1991 delete[] mpi_stati;
1992 Esys_MPIInfo_free(mpi_info);
1993
1994 /* Now, we can create pattern for mainBlock and coupleBlock */
1995 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1996 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1997 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1998 num_Pmain_cols, num_RAPext_cols,
1999 RAP_couple_ptr, RAP_couple_idx);
2000 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2001 num_RAPext_rows, num_Pmain_cols,
2002 row_couple_ptr, row_couple_idx);
2003
2004 /* next, create the system matrix */
2005 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2006 output_dist, input_dist, main_pattern, col_couple_pattern,
2007 row_couple_pattern, col_connector, row_connector);
2008 out = Paso_SystemMatrix_alloc(A->type, pattern,
2009 row_block_size, col_block_size, FALSE);
2010
2011 /* finally, fill in the data */
2012 memcpy(out->mainBlock->val, RAP_main_val,
2013 out->mainBlock->len* sizeof(double));
2014 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2015 out->col_coupleBlock->len * sizeof(double));
2016
2017 /* Clean up */
2018 Paso_SystemMatrixPattern_free(pattern);
2019 Paso_Pattern_free(main_pattern);
2020 Paso_Pattern_free(col_couple_pattern);
2021 Paso_Pattern_free(row_couple_pattern);
2022 Paso_Connector_free(col_connector);
2023 Paso_Connector_free(row_connector);
2024 Paso_Distribution_free(output_dist);
2025 Paso_Distribution_free(input_dist);
2026 delete[] RAP_main_val;
2027 delete[] RAP_couple_val;
2028 if (Esys_noError()) {
2029 return out;
2030 } else {
2031 Paso_SystemMatrix_free(out);
2032 return NULL;
2033 }
2034 }
2035
2036
2037 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2038 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2039 {
2040 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2041 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
2042 Paso_SystemMatrix *out=NULL;
2043 Paso_SystemMatrixPattern *pattern=NULL;
2044 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
2045 Paso_SharedComponents *send =NULL, *recv=NULL;
2046 Paso_Connector *col_connector=NULL, *row_connector=NULL;
2047 Paso_Pattern *main_pattern=NULL;
2048 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2049 const dim_t row_block_size=A->row_block_size;
2050 const dim_t col_block_size=A->col_block_size;
2051 const dim_t block_size = A->block_size;
2052 const double ZERO = 0.0;
2053 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2054 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2055 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2056 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2057 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2058 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2059 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2060 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2061 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2062 index_t sum, i, j, k, iptr, irb, icb, ib;
2063 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2064 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2065 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2066 index_t row_marker_ext, *where_p=NULL;
2067 index_t **send_ptr=NULL, **send_idx=NULL;
2068 dim_t p, num_neighbors;
2069 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2070 Esys_MPI_rank *neighbor=NULL;
2071 #ifdef ESYS_MPI
2072 MPI_Request* mpi_requests=NULL;
2073 MPI_Status* mpi_stati=NULL;
2074 #else
2075 int *mpi_requests=NULL, *mpi_stati=NULL;
2076 #endif
2077
2078 /* two sparse matrices R_main and R_couple will be generated, as the
2079 transposes of P_main and P_col_couple, respectively. Note that
2080 R_couple is actually the row_coupleBlock of R (R=P^T) */
2081 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2082 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
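/* For reference, a textbook CSR transpose for the scalar case (block_size 1)
   is sketched below; it only illustrates the operation and is not
   necessarily how Paso_SparseMatrix_getTranspose is implemented (block
   matrices additionally need their blocks handled):

   static void csr_transpose_sketch(const index_t *ptr, const index_t *idx,
                                    const double *val, dim_t num_rows,
                                    dim_t num_cols, index_t *t_ptr,
                                    index_t *t_idx, double *t_val)
   {
      for (dim_t j=0; j<=num_cols; j++) t_ptr[j] = 0;
      // count the entries per column, i.e. per row of the transpose
      for (index_t k=0; k<ptr[num_rows]; k++) t_ptr[idx[k]+1]++;
      for (dim_t j=0; j<num_cols; j++) t_ptr[j+1] += t_ptr[j];
      // scatter the entries, using a moving cursor per transposed row
      index_t *cursor = new index_t[num_cols];
      for (dim_t j=0; j<num_cols; j++) cursor[j] = t_ptr[j];
      for (dim_t i=0; i<num_rows; i++) {
         for (index_t k=ptr[i]; k<ptr[i+1]; k++) {
            const index_t pos = cursor[idx[k]]++;
            t_idx[pos] = (index_t)i;
            t_val[pos] = val[k];
         }
      }
      delete[] cursor;
   }
*/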
2083
2084 /* generate P_ext, i.e. portion of P that is stored on neighbor procs
2085 and needed locally for triple matrix product RAP
2086 to be specific, P_ext (on processor k) are group of rows in P, where
2087 the list of rows from processor q is given by
2088 A->col_coupler->connector->send->shared[rPtr...]
2089 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2090 on q.
2091 P_ext is represented by two sparse matrices P_ext_main and
2092 P_ext_couple */
2093 Paso_Preconditioner_AMG_extendB(A, P);
2094
2095 /* count the number of cols in P_ext_couple, resize the pattern of
2096 sparse matrix P_ext_couple with new compressed order, and then
2097 build the col id mapping from P->col_coupleBlock to
2098 P_ext_couple */
2099 num_Pmain_cols = P->mainBlock->numCols;
2100 num_Pcouple_cols = P->col_coupleBlock->numCols;
2101 num_Acouple_cols = A->col_coupleBlock->numCols;
2102 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2103 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2104 offset = P->col_distribution->first_component[rank];
2105 num_Pext_cols = 0;
2106 if (P->global_id) {
2107 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2108 and P->col_coupleBlock */
2109 iptr = 0;
2110 if (num_Pcouple_cols || sum > 0) {
2111 temp = new index_t[num_Pcouple_cols+sum];
2112 for (; iptr<sum; iptr++){
2113 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2114 }
2115 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2116 temp[iptr] = P->global_id[j];
2117 }
2118 }
2119 if (iptr) {
2120 #ifdef USE_QSORTG
2121 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2122 #else
2123 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2124 #endif
2125 num_Pext_cols = 1;
2126 i = temp[0];
2127 for (j=1; j<iptr; j++) {
2128 if (temp[j] > i) {
2129 i = temp[j];
2130 temp[num_Pext_cols++] = i;
2131 }
2132 }
2133 }
2134
2135 /* resize the pattern of P_ext_couple */
2136 if(num_Pext_cols){
2137 global_id_P = new index_t[num_Pext_cols];
2138 for (i=0; i<num_Pext_cols; i++)
2139 global_id_P[i] = temp[i];
2140 }
2141 if (num_Pcouple_cols || sum > 0)
2142 delete[] temp;
2143 for (i=0; i<sum; i++) {
2144 where_p = (index_t *)bsearch(
2145 &(P->remote_coupleBlock->pattern->index[i]),
2146 global_id_P, num_Pext_cols,
2147 sizeof(index_t), Paso_comparIndex);
2148 P->remote_coupleBlock->pattern->index[i] =
2149 (index_t)(where_p -global_id_P);
2150 }
2151
2152 /* build the mapping */
2153 if (num_Pcouple_cols) {
2154 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2155 iptr = 0;
2156 for (i=0; i<num_Pext_cols; i++)
2157 if (global_id_P[i] == P->global_id[iptr]) {
2158 Pcouple_to_Pext[iptr++] = i;
2159 if (iptr == num_Pcouple_cols) break;
2160 }
2161 }
2162 }
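/* The block above boils down to "sort, deduplicate, renumber": collect the
   global column ids, sort them, drop duplicates, and afterwards translate
   every global id into its position in the sorted list (via bsearch). A
   reduced sketch of the sort/deduplicate part, reusing the comparator
   already used in this file:

   // ids[0..n-1] are global column ids, possibly with duplicates;
   // returns the number m of distinct ids, left sorted in ids[0..m-1]
   static dim_t sort_unique_sketch(index_t *ids, dim_t n)
   {
      if (n == 0) return 0;
      qsort(ids, (size_t)n, sizeof(index_t), Paso_comparIndex);
      dim_t m = 1;
      for (dim_t j=1; j<n; j++)
         if (ids[j] > ids[m-1]) ids[m++] = ids[j];
      return m;
   }

   // a global id g is then mapped to its compressed local id by
   //   where_p = (index_t*)bsearch(&g, ids, m, sizeof(index_t), Paso_comparIndex);
   //   local   = (index_t)(where_p - ids);
*/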
2163
2164 /* allocate and initialise the markers */
2165 sum = num_Pext_cols + num_Pmain_cols;
2166 P_marker = new index_t[sum];
2167 A_marker = new index_t[num_A_cols];
2168 #pragma omp parallel for private(i) schedule(static)
2169 for (i=0; i<sum; i++) P_marker[i] = -1;
2170 #pragma omp parallel for private(i) schedule(static)
2171 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2172
2173 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2174 sum = 0;
2175 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2176 row_marker = sum;
2177 /* then, loop over elements in row i_r of R_couple */
2178 j1_ub = R_couple->pattern->ptr[i_r+1];
2179 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2180 i1 = R_couple->pattern->index[j1];
2181 /* then, loop over elements in row i1 of A->col_coupleBlock */
2182 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2183 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2184 i2 = A->col_coupleBlock->pattern->index[j2];
2185
2186 /* check whether entry RA[i_r, i2] has been previously visited.
2187 A new RAP entry is possible only if entry RA[i_r, i2] has not
2188 been visited yet */
2189 if (A_marker[i2] != i_r) {
2190 /* first, mark entry RA[i_r,i2] as visited */
2191 A_marker[i2] = i_r;
2192
2193 /* then loop over elements in row i2 of P_ext_main */
2194 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2195 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2196 i_c = P->row_coupleBlock->pattern->index[j3];
2197
2198 /* check whether entry RAP[i_r,i_c] has been created.
2199 If not yet created, create the entry and increase
2200 the total number of elements in RAP_ext */
2201 if (P_marker[i_c] < row_marker) {
2202 P_marker[i_c] = sum;
2203 sum++;
2204 }
2205 }
2206
2207 /* loop over elements in row i2 of P_ext_couple, do the same */
2208 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2209 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2210 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2211 if (P_marker[i_c] < row_marker) {
2212 P_marker[i_c] = sum;
2213 sum++;
2214 }
2215 }
2216 }
2217 }
2218
2219 /* now loop over elements in row i1 of A->mainBlock, repeating
2220 the process applied to block A->col_coupleBlock */
2221 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2222 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2223 i2 = A->mainBlock->pattern->index[j2];
2224 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2225 A_marker[i2 + num_Acouple_cols] = i_r;
2226 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2227 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2228 i_c = P->mainBlock->pattern->index[j3];
2229 if (P_marker[i_c] < row_marker) {
2230 P_marker[i_c] = sum;
2231 sum++;
2232 }
2233 }
2234 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2235 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2236 /* note that we need to map the column index in
2237 P->col_coupleBlock back into the column index in
2238 P_ext_couple */
2239 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2240 if (P_marker[i_c] < row_marker) {
2241 P_marker[i_c] = sum;
2242 sum++;
2243 }
2244 }
2245 }
2246 }
2247 }
2248 }
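/* The pass above is the symbolic (counting) half of a Gustavson-style sparse
   triple product: A_marker records which intermediate columns i2 of R*A have
   already been expanded for the current row i_r, and P_marker records which
   output columns i_c have already been counted. The marker idea, reduced to
   a single product C = X*Y with hypothetical names (marker[] is filled with
   -1 before the first row and reused across rows):

   static dim_t count_row_nnz_sketch(dim_t i,
                                     const index_t *x_ptr, const index_t *x_idx,
                                     const index_t *y_ptr, const index_t *y_idx,
                                     index_t *marker)
   {
      dim_t nnz = 0;
      for (index_t jx=x_ptr[i]; jx<x_ptr[i+1]; jx++) {
         const index_t k = x_idx[jx];
         for (index_t jy=y_ptr[k]; jy<y_ptr[k+1]; jy++) {
            const index_t c = y_idx[jy];
            if (marker[c] != (index_t)i) {
               // first time column c shows up in row i of C
               marker[c] = (index_t)i;
               nnz++;
            }
         }
      }
      return nnz;
   }
*/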
2249
2250 /* Now we have the number of non-zero elements in RAP_ext, allocate
2251 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2252 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2253 RAP_ext_idx = new index_t[sum];
2254 RAP_ext_val = new double[sum * block_size];
2255 RA_val = new double[block_size];
2256 RAP_val = new double[block_size];
2257
2258 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2259 sum = num_Pext_cols + num_Pmain_cols;
2260 #pragma omp parallel for private(i) schedule(static)
2261 for (i=0; i<sum; i++) P_marker[i] = -1;
2262 #pragma omp parallel for private(i) schedule(static)
2263 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2264 sum = 0;
2265 RAP_ext_ptr[0] = 0;
2266 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2267 row_marker = sum;
2268 /* then, loop over elements in row i_r of R_couple */
2269 j1_ub = R_couple->pattern->ptr[i_r+1];
2270 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2271 i1 = R_couple->pattern->index[j1];
2272 R_val = &(R_couple->val[j1*block_size]);
2273
2274 /* then, loop over elements in row i1 of A->col_coupleBlock */
2275 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2276 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2277 i2 = A->col_coupleBlock->pattern->index[j2];
2278 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2279 for (irb=0; irb<row_block_size; irb++) {
2280 for (icb=0; icb<col_block_size; icb++) {
2281 rtmp = ZERO;
2282 for (ib=0; ib<col_block_size; ib++) {
2283 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2284 }
2285 RA_val[irb+row_block_size*icb]=rtmp;
2286 }
2287 }
2288
2289 /* check whether entry RA[i_r, i2] has been previously visited.
2290 A new RAP entry is possible only if entry RA[i_r, i2] has not
2291 been visited yet */
2292 if (A_marker[i2] != i_r) {
2293 /* first, mark entry RA[i_r,i2] as visited */
2294 A_marker[i2] = i_r;
2295
2296 /* then loop over elements in row i2 of P_ext_main */
2297 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2298 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2299 i_c = P->row_coupleBlock->pattern->index[j3];
2300 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2301 for (irb=0; irb<row_block_size; irb++) {
2302 for (icb=0; icb<col_block_size; icb++) {
2303 rtmp = ZERO;
2304 for (ib=0; ib<col_block_size; ib++) {
2305 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2306 }
2307 RAP_val[irb+row_block_size*icb]=rtmp;
2308 }
2309 }
2310
2311
2312 /* check whether entry RAP[i_r,i_c] has been created.
2313 If not yet created, create the entry and increase
2314 the total number of elements in RAP_ext */
2315 if (P_marker[i_c] < row_marker) {
2316 P_marker[i_c] = sum;
2317 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2318 RAP_ext_idx[sum] = i_c + offset;
2319 sum++;
2320 } else {
2321 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2322 for (ib=0; ib<block_size; ib++) {
2323 temp_val[ib] += RAP_val[ib];
2324 }
2325 }
2326 }
2327
2328 /* loop over elements in row i2 of P_ext_couple, do the same */
2329 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2330 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2331 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2332 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2333 for (irb=0; irb<row_block_size; irb++) {
2334 for (icb=0; icb<col_block_size; icb++) {
2335 rtmp = ZERO;
2336 for (ib=0; ib<col_block_size; ib++) {
2337 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2338 }
2339 RAP_val[irb+row_block_size*icb]=rtmp;
2340 }
2341 }
2342
2343 if (P_marker[i_c] < row_marker) {
2344 P_marker[i_c] = sum;
2345 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2346 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2347 sum++;
2348 } else {
2349 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2350 for (ib=0; ib<block_size; ib++) {
2351 temp_val[ib] += RAP_val[ib];
2352 }
2353 }
2354 }
2355
2356 /* If entry RA[i_r, i2] has already been visited, no new RAP entry is
2357 created; only the contributions are added. */
2358 } else {
2359 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2360 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2361 i_c = P->row_coupleBlock->pattern->index[j3];
2362 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2363 for (irb=0; irb<row_block_size; irb++) {
2364 for (icb=0; icb<col_block_size; icb++) {
2365 rtmp = ZERO;
2366 for (ib=0; ib<col_block_size; ib++) {
2367 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2368 }
2369 RAP_val[irb+row_block_size*icb]=rtmp;
2370 }
2371 }
2372
2373 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2374 for (ib=0; ib<block_size; ib++) {
2375 temp_val[ib] += RAP_val[ib];
2376 }
2377 }
2378 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2379 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2380 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2381 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2382 for (irb=0; irb<row_block_size; irb++) {
2383 for (icb=0; icb<col_block_size; icb++) {
2384 rtmp = ZERO;
2385 for (ib=0; ib<col_block_size; ib++) {
2386 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2387 }
2388 RAP_val[irb+row_block_size*icb]=rtmp;
2389 }
2390 }
2391
2392 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2393 for (ib=0; ib<block_size; ib++) {
2394 temp_val[ib] += RAP_val[ib];
2395 }
2396 }
2397 }
2398 }
2399
2400 /* now loop over elements in row i1 of A->mainBlock, repeating
2401 the process applied to block A->col_coupleBlock */
2402 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2403 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2404 i2 = A->mainBlock->pattern->index[j2];
2405 temp_val = &(A->mainBlock->val[j2*block_size]);
2406 for (irb=0; irb<row_block_size; irb++) {
2407 for (icb=0; icb<col_block_size; icb++) {
2408 rtmp = ZERO;
2409 for (ib=0; ib<col_block_size; ib++) {
2410 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2411 }
2412 RA_val[irb+row_block_size*icb]=rtmp;
2413 }
2414 }
2415
2416 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2417 A_marker[i2 + num_Acouple_cols] = i_r;
2418 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2419 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2420 i_c = P->mainBlock->pattern->index[j3];
2421 temp_val = &(P->mainBlock->val[j3*block_size]);
2422 for (irb=0; irb<row_block_size; irb++) {
2423 for (icb=0; icb<col_block_size; icb++) {
2424 rtmp = ZERO;
2425 for (ib=0; ib<col_block_size; ib++) {
2426 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2427 }
2428 RAP_val[irb+row_block_size*icb]=rtmp;
2429 }
2430 }
2431
2432 if (P_marker[i_c] < row_marker) {
2433 P_marker[i_c] = sum;
2434 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2435 RAP_ext_idx[sum] = i_c + offset;
2436 sum++;
2437 } else {
2438 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2439 for (ib=0; ib<block_size; ib++) {
2440 temp_val[ib] += RAP_val[ib];
2441 }
2442 }
2443 }
2444 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2445 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2446 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2447 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2448 for (irb=0; irb<row_block_size; irb++) {
2449 for (icb=0; icb<col_block_size; icb++) {
2450 rtmp = ZERO;
2451 for (ib=0; ib<col_block_size; ib++) {
2452 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2453 }
2454 RAP_val[irb+row_block_size*icb]=rtmp;
2455 }
2456 }
2457
2458 if (P_marker[i_c] < row_marker) {
2459 P_marker[i_c] = sum;
2460 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2461 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2462 sum++;
2463 } else {
2464 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2465 for (ib=0; ib<block_size; ib++) {
2466 temp_val[ib] += RAP_val[ib];
2467 }
2468 }
2469 }
2470 } else {
2471 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2472 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2473 i_c = P->mainBlock->pattern->index[j3];
2474 temp_val = &(P->mainBlock->val[j3*block_size]);
2475 for (irb=0; irb<row_block_size; irb++) {
2476 for (icb=0; icb<col_block_size; icb++) {
2477 rtmp = ZERO;
2478 for (ib=0; ib<col_block_size; ib++) {
2479 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2480 }
2481 RAP_val[irb+row_block_size*icb]=rtmp;
2482 }
2483 }
2484
2485 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2486 for (ib=0; ib<block_size; ib++) {
2487 temp_val[ib] += RAP_val[ib];
2488 }
2489 }
2490 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2491 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2492 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2493 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2494 for (irb=0; irb<row_block_size; irb++) {
2495 for (icb=0; icb<col_block_size; icb++) {
2496 rtmp = ZERO;
2497 for (ib=0; ib<col_block_size; ib++) {
2498 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2499 }
2500 RAP_val[irb+row_block_size*icb]=rtmp;
2501 }
2502 }
2503
2504 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2505 for (ib=0; ib<block_size; ib++) {
2506 temp_val[ib] += RAP_val[ib];
2507 }
2508 }
2509 }
2510 }
2511 }
2512 RAP_ext_ptr[i_r+1] = sum;
2513 }
2514 delete[] P_marker;
2515 delete[] Pcouple_to_Pext;
2516
2517 /* Now we have the part of RAP[r,c] whose rows "r" are given by the
2518 column list of P->col_coupleBlock, and whose columns "c" may
2519 cover the whole column range of the system matrix P. This part
2520 of the data is to be passed to the neighbouring processors, and
2521 added to the corresponding RAP[r,c] entries on those
2522 processors */
2523 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2524 &RAP_ext_val, global_id_P, block_size);
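/* Conceptually, what has to travel to a neighbouring processor for each
   shared row of RAP_ext is its length, its (global) column ids and its block
   values. The outline below is a generic sketch of such an exchange, NOT the
   actual Paso_Preconditioner_AMG_CopyRemoteData routine; buffer names,
   offsets and the neighbour arrays are illustrative only:

   // 1. exchange per-row counts so both sides can size their buffers
   // 2. exchange the column id arrays (MPI_INT)
   // 3. exchange the value arrays, block_size doubles per entry, e.g.:
   for (int p=0; p<num_nb; p++)
      MPI_Irecv(recv_val + recv_off[p]*block_size,
                recv_cnt[p]*block_size, MPI_DOUBLE, nb[p], tag,
                comm, &requests[p]);
   for (int p=0; p<num_nb; p++)
      MPI_Issend(send_val + send_off[p]*block_size,
                 send_cnt[p]*block_size, MPI_DOUBLE, nb[p], tag,
                 comm, &requests[num_nb+p]);
   MPI_Waitall(2*num_nb, requests, MPI_STATUSES_IGNORE);
   // 4. the receiver appends these rows to its local RAP rows, merging
   //    entries that refer to the same global column
*/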
2525
2526 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2527 sum = RAP_ext_ptr[num_RAPext_rows];
2528 num_RAPext_cols = 0;
2529 if (num_Pext_cols || sum > 0) {
2530 temp = new index_t[num_Pext_cols+sum];
2531 j1_ub = offset + num_Pmain_cols;
2532 for (i=0, iptr=0; i<sum; i++) {
2533 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2534 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2535 }
2536 for (j=0; j<num_Pext_cols; j++, iptr++){
2537 temp[iptr] = global_id_P[j];
2538 }
2539
2540 if (iptr) {
2541 #ifdef USE_QSORTG
2542 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2543 #else
2544 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2545 #endif
2546 num_RAPext_cols = 1;
2547 i = temp[0];
2548 for (j=1; j<iptr; j++) {
2549 if (temp[j] > i) {
2550 i = temp[j];
2551 temp[num_RAPext_cols++] = i;
2552 }
2553 }
2554 }
2555 }
2556
2557 /* build the global column id list for RAP_ext and remap its column indices */
2558 if(num_RAPext_cols){
2559 global_id_RAP = new index_t[num_RAPext_cols];
2560 for (i=0; i<num_RAPext_cols; i++)
2561 global_id_RAP[i] = temp[i];
2562 }
2563 if (num_Pext_cols || sum > 0)
2564 delete[] temp;
2565 j1_ub = offset + num_Pmain_cols;
2566 for (i=0; i<sum; i++) {
2567 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2568 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2569 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2570 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2571 } else
2572 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2573 }
2574
2575 /* build the mapping */
2576 if (num_Pcouple_cols) {
2577 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
2578 iptr = 0;
2579 for (i=0; i<num_RAPext_cols; i++)
2580 if (