/trunk/paso/src/AMG_Interpolation.cpp

Revision 4657 - Thu Feb 6 06:12:20 2014 UTC by jfenwick
File size: 122505 byte(s)
Updated copyright notices, added GeoComp.

1 /*****************************************************************************
2 *
3 * Copyright (c) 2003-2014 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development 2012-2013 by School of Earth Sciences
12 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 *
14 *****************************************************************************/
15
16
17 /************************************************************************************/
18
19 /* Paso: defines AMG Interpolation */
20
21 /************************************************************************************/
22
23 /* Author: l.gao@uq.edu.au */
24
25 /************************************************************************************/
26
27 #include "Paso.h"
28 #include "SparseMatrix.h"
29 #include "PasoUtil.h"
30 #include "Preconditioner.h"
31
32 /************************************************************************************
33
34 Methods necessary for the AMG preconditioner:
35
36 construct the n_C x n_C coarse grid operator A_C from the system
37 matrix A and the prolongation matrix P.
38
39 The coarse operator A_C is defined by the Galerkin product RAP, where R=P^T.
40
41 *************************************************************************************/
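
/* Editorial sketch: a minimal dense version of the Galerkin product
   A_C = P^T A P that the routines below assemble in distributed sparse
   form. It is illustrative only and not part of Paso; all names in it
   are made up. A is n x n, P is n x n_C, AC is n_C x n_C, row-major. */
static void dense_RAP_sketch(const double* A, const double* P,
                             double* AC, int n, int n_C)
{
    for (int r = 0; r < n_C; r++) {
        for (int c = 0; c < n_C; c++) {
            double s = 0.0;
            for (int i = 0; i < n; i++) {
                double t = 0.0;                 /* t = (A P)[i,c] */
                for (int j = 0; j < n; j++)
                    t += A[i*n + j] * P[j*n_C + c];
                s += P[i*n_C + r] * t;          /* R[r,i] = P[i,r] since R = P^T */
            }
            AC[r*n_C + c] = s;
        }
    }
}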
42
43 /* Extend the system matrix B with two extra sparse matrices:
44 B_ext_main and B_ext_couple.
45 Together, these two sparse matrices represent
46 the portion of B that is stored on neighbour procs and
47 needed locally for the triple matrix product (B^T) A B.
48
49 FOR NOW, we assume that the system matrix B has a NULL
50 row_coupleBlock and a NULL remote_coupleBlock. Based on
51 this assumption, we use the link row_coupleBlock for sparse
52 matrix B_ext_main, and the link remote_coupleBlock for sparse
53 matrix B_ext_couple.
54
55 To be specific, B_ext (on processor k) is the group of rows
56 of B where the list of rows from processor q is given by
57 A->col_coupler->connector->send->shared[rPtr...]
58 rPtr=A->col_coupler->connector->send->offsetInShared[k]
59 on q. */
60 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
61 {
62 Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
63 Paso_Coupler *coupler=NULL;
64 Paso_SharedComponents *send=NULL, *recv=NULL;
65 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
66 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
67 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
68 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
69 index_t *idx_m=NULL, *idx_c=NULL;
70 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
71 index_t offset, len, block_size, block_size_size, max_num_cols;
72 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
73 dim_t *recv_degree=NULL, *send_degree=NULL;
74 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
75
76 if (size == 1) return;
77
78 if (B->row_coupleBlock) {
79 Paso_SparseMatrix_free(B->row_coupleBlock);
80 B->row_coupleBlock = NULL;
81 }
82
83 /* row_coupleBlock was cleared just above, so only remote_coupleBlock can still hold a link here */
84 if (B->remote_coupleBlock) {
85 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
86 return;
87 }
88 /* send/receive the unknowns' global IDs */
89 num_main_cols = B->mainBlock->numCols;
90 cols = new double[num_main_cols];
91 offset = B->col_distribution->first_component[rank];
92 #pragma omp parallel for private(i) schedule(static)
93 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
94 if (B->global_id == NULL) {
95 coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
96 Paso_Coupler_startCollect(coupler, cols);
97 }
98
99 recv_buf = new index_t[size];
100 recv_degree = new dim_t[size];
101 recv_offset = new index_t[size+1];
102 #pragma omp parallel for private(i) schedule(static)
103 for (i=0; i<size; i++){
104 recv_buf[i] = 0;
105 recv_degree[i] = 1;
106 recv_offset[i] = i;
107 }
108
109 block_size = B->block_size;
110 block_size_size = block_size * sizeof(double);
111 num_couple_cols = B->col_coupleBlock->numCols;
112 send = A->col_coupler->connector->send;
113 recv = A->col_coupler->connector->recv;
114 num_neighbors = send->numNeighbors;
115 p = send->offsetInShared[num_neighbors];
116 len = p * B->col_distribution->first_component[size];
117 send_buf = new double[len * block_size];
118 send_idx = new index_t[len];
119 send_offset = new index_t[(p+1)*2];
120 send_degree = new dim_t[num_neighbors];
121 i = num_main_cols + num_couple_cols;
122 send_m = new double[i * block_size];
123 send_c = new double[i * block_size];
124 idx_m = new index_t[i];
125 idx_c = new index_t[i];
126
127 /* wait for the unknowns' global IDs to finish arriving */
128 if (B->global_id == NULL) {
129 Paso_Coupler_finishCollect(coupler);
130 global_id = new index_t[num_couple_cols];
131 #pragma omp parallel for private(i) schedule(static)
132 for (i=0; i<num_couple_cols; ++i)
133 global_id[i] = coupler->recv_buffer[i];
134 B->global_id = global_id;
135 Paso_Coupler_free(coupler);
136 } else
137 global_id = B->global_id;
138
139 /* distribute the number of cols in current col_coupleBlock to all ranks */
140 #ifdef ESYS_MPI
141 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
142 #endif
143
144 /* distribute global_ids of cols to be considered to all ranks*/
145 len = 0;
146 max_num_cols = 0;
147 for (i=0; i<size; i++){
148 recv_degree[i] = recv_buf[i];
149 recv_offset[i] = len;
150 len += recv_buf[i];
151 if (max_num_cols < recv_buf[i])
152 max_num_cols = recv_buf[i];
153 }
154 recv_offset[size] = len;
155 cols_array = new index_t[len];
156 #ifdef ESYS_MPI
157 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
158 #endif
159
160 /* first, prepare the ptr_ptr to be received */
161 q = recv->numNeighbors;
162 len = recv->offsetInShared[q];
163 ptr_ptr = new index_t[(len+1) * 2];
164 for (p=0; p<q; p++) {
165 row = recv->offsetInShared[p];
166 m = recv->offsetInShared[p + 1];
167 #ifdef ESYS_MPI
168 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
169 A->mpi_info->msg_tag_counter+recv->neighbor[p],
170 A->mpi_info->comm,
171 &(A->col_coupler->mpi_requests[p]));
172 #endif
173 }
174
175 /* now prepare the rows to be sent (the degree, the offset and the data) */
176 len = 0;
177 i0 = 0;
178 for (p=0; p<num_neighbors; p++) {
179 i = i0;
180 neighbor = send->neighbor[p];
181 m_lb = B->col_distribution->first_component[neighbor];
182 m_ub = B->col_distribution->first_component[neighbor + 1];
183 j_ub = send->offsetInShared[p + 1];
184 for (j=send->offsetInShared[p]; j<j_ub; j++) {
185 row = send->shared[j];
186 l_m = 0;
187 l_c = 0;
188 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
189 k_lb = B->col_coupleBlock->pattern->ptr[row];
190
191 /* check part of col_coupleBlock for data to be passed @row */
192 for (k=k_lb; k<k_ub; k++) {
193 m = global_id[B->col_coupleBlock->pattern->index[k]];
194 if (m > offset) break;
195 if (m>= m_lb && m < m_ub) {
196 /* data to be passed to sparse matrix B_ext_main */
197 idx_m[l_m] = m - m_lb;
198 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
199 l_m++;
200 } else {
201 /* data to be passed to sparse matrix B_ext_couple */
202 idx_c[l_c] = m;
203 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
204 l_c++;
205 }
206 }
207 k_lb = k;
208
209 /* check mainBlock for data to be passed @row to sparse
210 matrix B_ext_couple */
211 k_ub = B->mainBlock->pattern->ptr[row + 1];
212 k = B->mainBlock->pattern->ptr[row];
213 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
214 for (; k<k_ub; k++) {
215 m = B->mainBlock->pattern->index[k] + offset;
216 idx_c[l_c] = m;
217 l_c++;
218 }
219
220 /* check the remaining part of col_coupleBlock for data to
221 be passed @row to sparse matrix B_ext_couple */
222 k = k_lb;
223 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
224 for (k=k_lb; k<k_ub; k++) {
225 m = global_id[B->col_coupleBlock->pattern->index[k]];
226 if (m>= m_lb && m < m_ub) {
227 /* data to be passed to sparse matrix B_ext_main */
228 idx_m[l_m] = m - m_lb;
229 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
230 l_m++;
231 } else {
232 /* data to be passed to sparse matrix B_ext_couple */
233 idx_c[l_c] = m;
234 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
235 l_c++;
236 }
237 }
238
239 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
240 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
241 send_offset[2*i] = l_m;
242 len += l_m;
243 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
244 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
245 send_offset[2*i+1] = l_c;
246 len += l_c;
247 i++;
248 }
249
250 /* sending */
251 #ifdef ESYS_MPI
252 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
253 A->mpi_info->msg_tag_counter+rank,
254 A->mpi_info->comm,
255 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
256 #endif
257 send_degree[p] = len;
258 i0 = i;
259 }
260 delete[] send_m;
261 delete[] send_c;
262 delete[] idx_m;
263 delete[] idx_c;
264
265
266 q = recv->numNeighbors;
267 len = recv->offsetInShared[q];
268 ptr_main = new index_t[(len+1)];
269 ptr_couple = new index_t[(len+1)];
270
271 #ifdef ESYS_MPI
272 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
273 A->col_coupler->mpi_requests,
274 A->col_coupler->mpi_stati);
275 #endif
276 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
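/* Editorial note: every receive above is matched on a tag of the form
   msg_tag_counter + sender_rank, so bumping msg_tag_counter by "size"
   after each exchange round keeps tags from different rounds from ever
   colliding. */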
277
278 j = 0;
279 k = 0;
280 ptr_main[0] = 0;
281 ptr_couple[0] = 0;
282 for (i=0; i<len; i++) {
283 j += ptr_ptr[2*i];
284 k += ptr_ptr[2*i+1];
285 ptr_main[i+1] = j;
286 ptr_couple[i+1] = k;
287 }
288
289 delete[] ptr_ptr;
290 idx_main = new index_t[j];
291 idx_couple = new index_t[k];
292 ptr_idx = new index_t[j+k];
293 ptr_val = new double[(j+k) * block_size];
294
295 /* send/receive index array */
296 j=0;
297 k_ub = 0;
298 for (p=0; p<recv->numNeighbors; p++) {
299 k = recv->offsetInShared[p];
300 m = recv->offsetInShared[p+1];
301 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
302 if (i > 0) {
303 k_ub ++;
304 #ifdef ESYS_MPI
305 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
306 A->mpi_info->msg_tag_counter+recv->neighbor[p],
307 A->mpi_info->comm,
308 &(A->col_coupler->mpi_requests[p]));
309 #endif
310 }
311 j += i;
312 }
313
314 j=0;
315 k_ub = 0;
316 for (p=0; p<num_neighbors; p++) {
317 i = send_degree[p] - j;
318 if (i > 0){
319 k_ub ++;
320 #ifdef ESYS_MPI
321 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
322 A->mpi_info->msg_tag_counter+rank,
323 A->mpi_info->comm,
324 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
325 #endif
326 }
327 j = send_degree[p];
328 }
329
330 #ifdef ESYS_MPI
331 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
332 A->col_coupler->mpi_requests,
333 A->col_coupler->mpi_stati);
334 #endif
335 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
336
337 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
338 for (i=0; i<len; i++) {
339 j = ptr_main[i];
340 k = ptr_main[i+1];
341 m = ptr_couple[i];
342 for (p=j; p<k; p++) {
343 idx_main[p] = ptr_idx[m+p];
344 }
345 j = ptr_couple[i+1];
346 for (p=m; p<j; p++) {
347 idx_couple[p] = ptr_idx[k+p];
348 }
349 }
350 delete[] ptr_idx;
351
352 /* allocate pattern and sparsematrix for B_ext_main */
353 pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
354 len, num_main_cols, ptr_main, idx_main);
355 B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
356 pattern_main, B->row_block_size, B->col_block_size,
357 FALSE);
358 Paso_Pattern_free(pattern_main);
359
360 /* allocate pattern and sparsematrix for B_ext_couple */
361 pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
362 len, B->col_distribution->first_component[size],
363 ptr_couple, idx_couple);
364 B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
365 pattern_couple, B->row_block_size, B->col_block_size,
366 FALSE);
367 Paso_Pattern_free(pattern_couple);
368
369 /* send/receive value array */
370 j=0;
371 for (p=0; p<recv->numNeighbors; p++) {
372 k = recv->offsetInShared[p];
373 m = recv->offsetInShared[p+1];
374 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
375 #ifdef ESYS_MPI
376 if (i > 0)
377 MPI_Irecv(&(ptr_val[j]), i * block_size,
378 MPI_DOUBLE, recv->neighbor[p],
379 A->mpi_info->msg_tag_counter+recv->neighbor[p],
380 A->mpi_info->comm,
381 &(A->col_coupler->mpi_requests[p]));
382 #endif
383 j += (i * block_size);
384 }
385
386 j=0;
387 for (p=0; p<num_neighbors; p++) {
388 i = send_degree[p] - j;
389 #ifdef ESYS_MPI
390 if (i > 0)
391 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
392 A->mpi_info->msg_tag_counter+rank,
393 A->mpi_info->comm,
394 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
395 #endif
396 j = send_degree[p] ;
397 }
398
399 #ifdef ESYS_MPI
400 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
401 A->col_coupler->mpi_requests,
402 A->col_coupler->mpi_stati);
403 #endif
404 ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);
405
406 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
407 for (i=0; i<len; i++) {
408 j = ptr_main[i];
409 k = ptr_main[i+1];
410 m = ptr_couple[i];
411 for (p=j; p<k; p++) {
412 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
413 }
414 j = ptr_couple[i+1];
415 for (p=m; p<j; p++) {
416 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
417 }
418 }
419 delete[] ptr_val;
420
421
422 /* release all temporary memory allocations */
423 delete[] cols;
424 delete[] cols_array;
425 delete[] recv_offset;
426 delete[] recv_degree;
427 delete[] recv_buf;
428 delete[] send_buf;
429 delete[] send_offset;
430 delete[] send_degree;
431 delete[] send_idx;
432 }
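
/* Minimal sketch (editorial, not part of Paso) of the two-phase gather
   used in Paso_Preconditioner_AMG_extendB: first share the per-rank
   counts (done above with an MPI_Allgatherv of single ints), then
   gather the variable-length id lists with MPI_Allgatherv. All names
   below are made up for illustration; index_t values are sent as
   MPI_INT, as the code above does. */
#ifdef ESYS_MPI
static void allgather_ids_sketch(const index_t* my_ids, int my_count,
                                 index_t** all_ids, int* total, MPI_Comm comm)
{
    int size;
    MPI_Comm_size(comm, &size);
    int* counts  = new int[size];
    int* offsets = new int[size+1];
    /* phase 1: every rank learns how many ids each rank contributes */
    MPI_Allgather(&my_count, 1, MPI_INT, counts, 1, MPI_INT, comm);
    /* exclusive prefix sum turns counts into receive displacements */
    offsets[0] = 0;
    for (int p = 0; p < size; p++) offsets[p+1] = offsets[p] + counts[p];
    *total = offsets[size];
    *all_ids = new index_t[*total];
    /* phase 2: gather the variable-length lists themselves */
    MPI_Allgatherv(const_cast<index_t*>(my_ids), my_count, MPI_INT,
                   *all_ids, counts, offsets, MPI_INT, comm);
    delete[] counts;
    delete[] offsets;
}
#endif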
433
434 /* The sparse matrix T defined by (ptr, idx, val) has the same number
435 of rows as P->col_coupleBlock->numCols. Blocks of rows of T need to
436 be copied to the neighbour processors given by
437 P->col_coupler->connector->recv->neighbor[k], where k is in
438 [0, P->col_coupler->connector->recv->numNeighbors).
439 The rows to be copied to neighbour processor k are those listed in
440 P->col_coupler->connector->recv->offsetInShared[k] ...
441 P->col_coupler->connector->recv->offsetInShared[k+1] */
442 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
443 index_t **p_ptr, index_t **p_idx, double **p_val,
444 index_t *global_id, index_t block_size)
445 {
446 Paso_SharedComponents *send=NULL, *recv=NULL;
447 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
448 index_t i, j, p, m, n, size;
449 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
450 index_t *ptr=*p_ptr, *idx=*p_idx;
451 double *val=*p_val, *recv_val=NULL;
452 #ifdef ESYS_MPI
453 index_t rank = P->mpi_info->rank;
454 #endif
455
456 size = P->mpi_info->size;
457 send = P->col_coupler->connector->recv;
458 recv = P->col_coupler->connector->send;
459 send_neighbors = send->numNeighbors;
460 recv_neighbors = recv->numNeighbors;
461 send_rows = P->col_coupleBlock->numCols;
462 recv_rows = recv->offsetInShared[recv_neighbors];
463
464 send_degree = new index_t[send_rows];
465 recv_ptr = new index_t[recv_rows + 1];
466 #pragma omp parallel for schedule(static) private(i)
467 for (i=0; i<send_rows; i++)
468 send_degree[i] = ptr[i+1] - ptr[i];
469
470 /* First, send/receive the degree */
471 for (p=0; p<recv_neighbors; p++) { /* Receiving */
472 m = recv->offsetInShared[p];
473 n = recv->offsetInShared[p+1];
474 #ifdef ESYS_MPI
475 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
476 P->mpi_info->msg_tag_counter + recv->neighbor[p],
477 P->mpi_info->comm,
478 &(P->col_coupler->mpi_requests[p]));
479 #endif
480 }
481 for (p=0; p<send_neighbors; p++) { /* Sending */
482 m = send->offsetInShared[p];
483 n = send->offsetInShared[p+1];
484 #ifdef ESYS_MPI
485 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
486 P->mpi_info->msg_tag_counter + rank,
487 P->mpi_info->comm,
488 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
489 #endif
490 }
491 #ifdef ESYS_MPI
492 MPI_Waitall(send_neighbors+recv_neighbors,
493 P->col_coupler->mpi_requests,
494 P->col_coupler->mpi_stati);
495 #endif
496 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
497
498 delete[] send_degree;
499 m = Paso_Util_cumsum(recv_rows, recv_ptr);
500 recv_ptr[recv_rows] = m;
501 recv_idx = new index_t[m];
502 recv_val = new double[m * block_size];
503
504 /* Next, send/receive the index array */
505 j = 0;
506 for (p=0; p<recv_neighbors; p++) { /* Receiving */
507 m = recv->offsetInShared[p];
508 n = recv->offsetInShared[p+1];
509 i = recv_ptr[n] - recv_ptr[m];
510 if (i > 0) {
511 #ifdef ESYS_MPI
512 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
513 P->mpi_info->msg_tag_counter + recv->neighbor[p],
514 P->mpi_info->comm,
515 &(P->col_coupler->mpi_requests[p]));
516 #endif
517 }
518 j += i;
519 }
520
521 j = 0;
522 for (p=0; p<send_neighbors; p++) { /* Sending */
523 m = send->offsetInShared[p];
524 n = send->offsetInShared[p+1];
525 i = ptr[n] - ptr[m];
526 if (i >0) {
527 #ifdef ESYS_MPI
528 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
529 P->mpi_info->msg_tag_counter + rank,
530 P->mpi_info->comm,
531 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
532 #endif
533 j += i;
534 }
535 }
536 #ifdef ESYS_MPI
537 MPI_Waitall(send_neighbors+recv_neighbors,
538 P->col_coupler->mpi_requests,
539 P->col_coupler->mpi_stati);
540 #endif
541 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
542
543 /* Last, send/receive the data array */
544 j = 0;
545 for (p=0; p<recv_neighbors; p++) { /* Receiving */
546 m = recv->offsetInShared[p];
547 n = recv->offsetInShared[p+1];
548 i = recv_ptr[n] - recv_ptr[m];
549 #ifdef ESYS_MPI
550 if (i > 0)
551 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
552 P->mpi_info->msg_tag_counter + recv->neighbor[p],
553 P->mpi_info->comm,
554 &(P->col_coupler->mpi_requests[p]));
555 #endif
556 j += (i*block_size);
557 }
558
559 j = 0;
560 for (p=0; p<send_neighbors; p++) { /* Sending */
561 m = send->offsetInShared[p];
562 n = send->offsetInShared[p+1];
563 i = ptr[n] - ptr[m];
564 if (i >0) {
565 #ifdef ESYS_MPI
566 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
567 P->mpi_info->msg_tag_counter + rank,
568 P->mpi_info->comm,
569 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
570 #endif
571 j += i * block_size;
572 }
573 }
574 #ifdef ESYS_MPI
575 MPI_Waitall(send_neighbors+recv_neighbors,
576 P->col_coupler->mpi_requests,
577 P->col_coupler->mpi_stati);
578 #endif
579 ESYS_MPI_INC_COUNTER(*(P->mpi_info),size);
580
581 /* Clean up and return with received ptr, index and data arrays */
582 delete[] ptr;
583 delete[] idx;
584 delete[] val;
585 *p_ptr = recv_ptr;
586 *p_idx = recv_idx;
587 *p_val = recv_val;
588 }
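
/* Sketch (editorial) of the degree-to-offset conversion performed above
   by Paso_Util_cumsum: an in-place exclusive prefix sum over the first
   n entries that returns the total, so the caller can store it in
   ptr[n]. This stand-in only illustrates the assumed semantics of the
   Paso helper. */
static index_t cumsum_sketch(dim_t n, index_t* ptr)
{
    index_t sum = 0;
    for (dim_t i = 0; i < n; i++) {
        const index_t degree = ptr[i]; /* entry i currently holds a row degree */
        ptr[i] = sum;                  /* ...and is replaced by its start offset */
        sum += degree;
    }
    return sum;                        /* caller stores this in ptr[n] */
}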
589
590 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
591 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
592 {
593 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
594 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
595 Paso_SystemMatrix *out=NULL;
596 Paso_SystemMatrixPattern *pattern=NULL;
597 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
598 Paso_SharedComponents *send =NULL, *recv=NULL;
599 Paso_Connector *col_connector=NULL, *row_connector=NULL;
600 Paso_Pattern *main_pattern=NULL;
601 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
602 const dim_t row_block_size=A->row_block_size;
603 const dim_t col_block_size=A->col_block_size;
604 const dim_t block_size = A->block_size;
605 const dim_t P_block_size = P->block_size;
606 const double ZERO = 0.0;
607 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
608 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
609 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
610 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
611 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
612 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
613 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
614 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
615 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
616 index_t sum, i, j, k, iptr, irb, icb, ib;
617 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
618 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
619 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
620 index_t row_marker_ext, *where_p=NULL;
621 index_t **send_ptr=NULL, **send_idx=NULL;
622 dim_t p, num_neighbors;
623 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
624 Esys_MPI_rank *neighbor=NULL;
625 #ifdef ESYS_MPI
626 MPI_Request* mpi_requests=NULL;
627 MPI_Status* mpi_stati=NULL;
628 #else
629 int *mpi_requests=NULL, *mpi_stati=NULL;
630 #endif
631
632 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
633 return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
634
635 /* two sparse matrices R_main and R_couple will be generated, as the
636 transposes of P->mainBlock and P->col_coupleBlock, respectively.
637 Note that R_couple is actually the row_coupleBlock of R (R=P^T) */
638 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
639 if (size > 1)
640 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
641
642 /* generate P_ext, i.e. the portion of P that is stored on neighbour
643 procs and needed locally for the triple matrix product RAP.
644 To be specific, P_ext (on processor k) is the group of rows of P,
645 where the list of rows from processor q is given by
646 A->col_coupler->connector->send->shared[rPtr...]
647 rPtr=A->col_coupler->connector->send->offsetInShared[k]
648 on q.
649 P_ext is represented by two sparse matrices, P_ext_main and
650 P_ext_couple */
651 Paso_Preconditioner_AMG_extendB(A, P);
652
653 /* count the number of cols in P_ext_couple, resize the pattern of
654 sparse matrix P_ext_couple with new compressed order, and then
655 build the col id mapping from P->col_coupleBlock to
656 P_ext_couple */
657 num_Pmain_cols = P->mainBlock->numCols;
658 if (size > 1) {
659 num_Pcouple_cols = P->col_coupleBlock->numCols;
660 num_Acouple_cols = A->col_coupleBlock->numCols;
661 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
662 } else {
663 num_Pcouple_cols = 0;
664 num_Acouple_cols = 0;
665 sum = 0;
666 }
667 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
668 offset = P->col_distribution->first_component[rank];
669 num_Pext_cols = 0;
670 if (P->global_id) {
671 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
672 and P->col_coupleBlock */
673 iptr = 0;
674 if (num_Pcouple_cols || sum > 0) {
675 temp = new index_t[num_Pcouple_cols+sum];
676 #pragma omp parallel for lastprivate(iptr) schedule(static)
677 for (iptr=0; iptr<sum; iptr++){
678 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
679 }
680 for (j=0; j<num_Pcouple_cols; j++, iptr++){
681 temp[iptr] = P->global_id[j];
682 }
683 }
684 if (iptr) {
685 #ifdef USE_QSORTG
686 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
687 #else
688 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
689 #endif
690 num_Pext_cols = 1;
691 i = temp[0];
692 for (j=1; j<iptr; j++) {
693 if (temp[j] > i) {
694 i = temp[j];
695 temp[num_Pext_cols++] = i;
696 }
697 }
698 }
699 /* resize the pattern of P_ext_couple */
700 if(num_Pext_cols){
701 global_id_P = new index_t[num_Pext_cols];
702 #pragma omp parallel for private(i) schedule(static)
703 for (i=0; i<num_Pext_cols; i++)
704 global_id_P[i] = temp[i];
705 }
706 if (num_Pcouple_cols || sum > 0)
707 delete[] temp;
708 #pragma omp parallel for private(i, where_p) schedule(static)
709 for (i=0; i<sum; i++) {
710 where_p = (index_t *)bsearch(
711 &(P->remote_coupleBlock->pattern->index[i]),
712 global_id_P, num_Pext_cols,
713 sizeof(index_t), Paso_comparIndex);
714 P->remote_coupleBlock->pattern->index[i] =
715 (index_t)(where_p -global_id_P);
716 }
717
718 /* build the mapping */
719 if (num_Pcouple_cols) {
720 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
721 iptr = 0;
722 for (i=0; i<num_Pext_cols; i++)
723 if (global_id_P[i] == P->global_id[iptr]) {
724 Pcouple_to_Pext[iptr++] = i;
725 if (iptr == num_Pcouple_cols) break;
726 }
727 }
728 }
729
730 /* alloc and initialise the markers */
731 sum = num_Pext_cols + num_Pmain_cols;
732 P_marker = new index_t[sum];
733 A_marker = new index_t[num_A_cols];
734 #pragma omp parallel for private(i) schedule(static)
735 for (i=0; i<sum; i++) P_marker[i] = -1;
736 #pragma omp parallel for private(i) schedule(static)
737 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
738
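/* Editorial note on the marker trick used in the symbolic pass below
   (with a made-up example): row_marker snapshots "sum" at the start of
   row i_r, so a column i_c is new for this row exactly when
   P_marker[i_c] < row_marker -- any value >= row_marker can only have
   been written while processing row i_r itself. If, say, row 0 created
   entries 0..3 (sum == 4), then row 1 starts with row_marker == 4 and
   every stale marker 0..3 correctly reads as "not seen yet".
   A_marker works the same way for rows of A, but stores i_r directly,
   since each RA[i_r,i2] is expanded at most once per row. */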
739 /* Now, count the size of RAP_ext. Start with rows in R_couple */
740 sum = 0;
741 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
742 row_marker = sum;
743 /* then, loop over elements in row i_r of R_couple */
744 j1_ub = R_couple->pattern->ptr[i_r+1];
745 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
746 i1 = R_couple->pattern->index[j1];
747 /* then, loop over elements in row i1 of A->col_coupleBlock */
748 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
749 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
750 i2 = A->col_coupleBlock->pattern->index[j2];
751
752 /* check whether entry RA[i_r, i2] has been previously visited.
753 RAP new entry is possible only if entry RA[i_r, i2] has not
754 been visited yet */
755 if (A_marker[i2] != i_r) {
756 /* first, mark entry RA[i_r,i2] as visited */
757 A_marker[i2] = i_r;
758
759 /* then loop over elements in row i2 of P_ext_main */
760 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
761 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
762 i_c = P->row_coupleBlock->pattern->index[j3];
763
764 /* check whether entry RAP[i_r,i_c] has been created.
765 If not yet created, create the entry and increase
766 the total number of elements in RAP_ext */
767 if (P_marker[i_c] < row_marker) {
768 P_marker[i_c] = sum;
769 sum++;
770 }
771 }
772
773 /* loop over elements in row i2 of P_ext_couple, do the same */
774 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
775 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
776 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
777 if (P_marker[i_c] < row_marker) {
778 P_marker[i_c] = sum;
779 sum++;
780 }
781 }
782 }
783 }
784
785 /* now loop over elements in row i1 of A->mainBlock, repeat
786 the process we have done to block A->col_coupleBlock */
787 j2_ub = A->mainBlock->pattern->ptr[i1+1];
788 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
789 i2 = A->mainBlock->pattern->index[j2];
790 if (A_marker[i2 + num_Acouple_cols] != i_r) {
791 A_marker[i2 + num_Acouple_cols] = i_r;
792 j3_ub = P->mainBlock->pattern->ptr[i2+1];
793 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
794 i_c = P->mainBlock->pattern->index[j3];
795 if (P_marker[i_c] < row_marker) {
796 P_marker[i_c] = sum;
797 sum++;
798 }
799 }
800 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
801 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
802 /* note that we need to map the column index in
803 P->col_coupleBlock back into the column index in
804 P_ext_couple */
805 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
806 if (P_marker[i_c] < row_marker) {
807 P_marker[i_c] = sum;
808 sum++;
809 }
810 }
811 }
812 }
813 }
814 }
815
816 /* Now we have the number of non-zero elements in RAP_ext; allocate
817 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
818 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
819 RAP_ext_idx = new index_t[sum];
820 RAP_ext_val = new double[sum * block_size];
821 RA_val = new double[block_size];
822 RAP_val = new double[block_size];
823
824 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
825 sum = num_Pext_cols + num_Pmain_cols;
826 #pragma omp parallel for private(i) schedule(static)
827 for (i=0; i<sum; i++) P_marker[i] = -1;
828 #pragma omp parallel for private(i) schedule(static)
829 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
830 sum = 0;
831 RAP_ext_ptr[0] = 0;
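/* Second pass (editorial note): P_marker[i_c] now records the position
   of entry RAP[i_r,i_c] inside RAP_ext_val, so when the same (i_r,i_c)
   pair is produced again the "else" branches below accumulate into the
   existing block instead of creating a new entry. */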
832 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
833 row_marker = sum;
834 /* then, loop over elements in row i_r of R_couple */
835 j1_ub = R_couple->pattern->ptr[i_r+1];
836 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
837 i1 = R_couple->pattern->index[j1];
838 R_val = &(R_couple->val[j1*P_block_size]);
839
840 /* then, loop over elements in row i1 of A->col_coupleBlock */
841 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
842 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
843 i2 = A->col_coupleBlock->pattern->index[j2];
844 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
845 for (irb=0; irb<row_block_size; irb++)
846 for (icb=0; icb<col_block_size; icb++)
847 RA_val[irb+row_block_size*icb] = ZERO;
848 for (irb=0; irb<P_block_size; irb++) {
849 rtmp=R_val[irb];
850 for (icb=0; icb<col_block_size; icb++) {
851 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
852 }
853 }
854
855 /* check whether entry RA[i_r, i2] has been previously visited.
856 RAP new entry is possible only if entry RA[i_r, i2] has not
857 been visited yet */
858 if (A_marker[i2] != i_r) {
859 /* first, mark entry RA[i_r,i2] as visited */
860 A_marker[i2] = i_r;
861
862 /* then loop over elements in row i2 of P_ext_main */
863 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
864 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
865 i_c = P->row_coupleBlock->pattern->index[j3];
866 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
867 for (irb=0; irb<row_block_size; irb++)
868 for (icb=0; icb<col_block_size; icb++)
869 RAP_val[irb+row_block_size*icb] = ZERO;
870 for (icb=0; icb<P_block_size; icb++) {
871 rtmp = temp_val[icb];
872 for (irb=0; irb<row_block_size; irb++) {
873 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
874 }
875 }
876
877 /* check whether entry RAP[i_r,i_c] has been created.
878 If not yet created, create the entry and increase
879 the total number of elements in RAP_ext */
880 if (P_marker[i_c] < row_marker) {
881 P_marker[i_c] = sum;
882 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
883 RAP_ext_idx[sum] = i_c + offset;
884 sum++;
885 } else {
886 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
887 for (ib=0; ib<block_size; ib++) {
888 temp_val[ib] += RAP_val[ib];
889 }
890 }
891 }
892
893 /* loop over elements in row i2 of P_ext_couple, do the same */
894 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
895 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
896 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
897 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
898 for (irb=0; irb<row_block_size; irb++)
899 for (icb=0; icb<col_block_size; icb++)
900 RAP_val[irb+row_block_size*icb]=ZERO;
901 for (icb=0; icb<P_block_size; icb++) {
902 rtmp = temp_val[icb];
903 for (irb=0; irb<row_block_size; irb++) {
904 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
905 }
906 }
907
908 if (P_marker[i_c] < row_marker) {
909 P_marker[i_c] = sum;
910 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
911 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
912 sum++;
913 } else {
914 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
915 for (ib=0; ib<block_size; ib++) {
916 temp_val[ib] += RAP_val[ib];
917 }
918 }
919 }
920
921 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
922 is created; only the contributions are added. */
923 } else {
924 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
925 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
926 i_c = P->row_coupleBlock->pattern->index[j3];
927 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
928 for (irb=0; irb<row_block_size; irb++)
929 for (icb=0; icb<col_block_size; icb++)
930 RAP_val[irb+row_block_size*icb]=ZERO;
931 for (icb=0; icb<P_block_size; icb++) {
932 rtmp = temp_val[icb];
933 for (irb=0; irb<row_block_size; irb++) {
934 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
935 }
936 }
937
938 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
939 for (ib=0; ib<block_size; ib++) {
940 temp_val[ib] += RAP_val[ib];
941 }
942 }
943 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
944 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
945 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
946 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
947 for (irb=0; irb<row_block_size; irb++)
948 for (icb=0; icb<col_block_size; icb++)
949 RAP_val[irb+row_block_size*icb]=ZERO;
950 for (icb=0; icb<P_block_size; icb++) {
951 rtmp = temp_val[icb];
952 for (irb=0; irb<row_block_size; irb++) {
953 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
954 }
955 }
956
957 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
958 for (ib=0; ib<block_size; ib++) {
959 temp_val[ib] += RAP_val[ib];
960 }
961 }
962 }
963 }
964
965 /* now loop over elements in row i1 of A->mainBlock, repeat
966 the process we have done to block A->col_coupleBlock */
967 j2_ub = A->mainBlock->pattern->ptr[i1+1];
968 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
969 i2 = A->mainBlock->pattern->index[j2];
970 temp_val = &(A->mainBlock->val[j2*block_size]);
971 for (irb=0; irb<row_block_size; irb++)
972 for (icb=0; icb<col_block_size; icb++)
973 RA_val[irb+row_block_size*icb]=ZERO;
974 for (irb=0; irb<P_block_size; irb++) {
975 rtmp=R_val[irb];
976 for (icb=0; icb<col_block_size; icb++) {
977 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
978 }
979 }
980
981 if (A_marker[i2 + num_Acouple_cols] != i_r) {
982 A_marker[i2 + num_Acouple_cols] = i_r;
983 j3_ub = P->mainBlock->pattern->ptr[i2+1];
984 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
985 i_c = P->mainBlock->pattern->index[j3];
986 temp_val = &(P->mainBlock->val[j3*P_block_size]);
987 for (irb=0; irb<row_block_size; irb++)
988 for (icb=0; icb<col_block_size; icb++)
989 RAP_val[irb+row_block_size*icb]=ZERO;
990 for (icb=0; icb<P_block_size; icb++) {
991 rtmp = temp_val[icb];
992 for (irb=0; irb<row_block_size; irb++) {
993 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
994 }
995 }
996
997 if (P_marker[i_c] < row_marker) {
998 P_marker[i_c] = sum;
999 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1000 RAP_ext_idx[sum] = i_c + offset;
1001 sum++;
1002 } else {
1003 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1004 for (ib=0; ib<block_size; ib++) {
1005 temp_val[ib] += RAP_val[ib];
1006 }
1007 }
1008 }
1009 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1010 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1011 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1012 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1013 for (irb=0; irb<row_block_size; irb++)
1014 for (icb=0; icb<col_block_size; icb++)
1015 RAP_val[irb+row_block_size*icb]=ZERO;
1016 for (icb=0; icb<P_block_size; icb++) {
1017 rtmp=temp_val[icb];
1018 for (irb=0; irb<row_block_size; irb++) {
1019 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1020 }
1021 }
1022
1023 if (P_marker[i_c] < row_marker) {
1024 P_marker[i_c] = sum;
1025 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1026 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1027 sum++;
1028 } else {
1029 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1030 for (ib=0; ib<block_size; ib++) {
1031 temp_val[ib] += RAP_val[ib];
1032 }
1033 }
1034 }
1035 } else {
1036 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1037 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1038 i_c = P->mainBlock->pattern->index[j3];
1039 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1040 for (irb=0; irb<row_block_size; irb++)
1041 for (icb=0; icb<col_block_size; icb++)
1042 RAP_val[irb+row_block_size*icb]=ZERO;
1043 for (icb=0; icb<P_block_size; icb++) {
1044 rtmp=temp_val[icb];
1045 for (irb=0; irb<row_block_size; irb++) {
1046 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1047 }
1048 }
1049
1050 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1051 for (ib=0; ib<block_size; ib++) {
1052 temp_val[ib] += RAP_val[ib];
1053 }
1054 }
1055 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1056 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1057 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1058 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1059 for (irb=0; irb<row_block_size; irb++)
1060 for (icb=0; icb<col_block_size; icb++)
1061 RAP_val[irb+row_block_size*icb]=ZERO;
1062 for (icb=0; icb<P_block_size; icb++) {
1063 rtmp=temp_val[icb];
1064 for (irb=0; irb<row_block_size; irb++) {
1065 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1066 }
1067 }
1068
1069 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1070 for (ib=0; ib<block_size; ib++) {
1071 temp_val[ib] += RAP_val[ib];
1072 }
1073 }
1074 }
1075 }
1076 }
1077 RAP_ext_ptr[i_r+1] = sum;
1078 }
1079 delete[] P_marker;
1080 delete[] Pcouple_to_Pext;
1081
1082 /* Now we have the part of RAP[r,c] where the rows "r" are given by
1083 the column list of P->col_coupleBlock, and the columns "c" may
1084 cover the whole column range of the system matrix P. This part of
1085 the data is to be passed to the neighbouring processors, and is
1086 added to the corresponding RAP[r,c] entries on those
1087 processors */
1088 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1089 &RAP_ext_val, global_id_P, block_size);
1090
1091 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1092 sum = RAP_ext_ptr[num_RAPext_rows];
1093 num_RAPext_cols = 0;
1094 if (num_Pext_cols || sum > 0) {
1095 temp = new index_t[num_Pext_cols+sum];
1096 j1_ub = offset + num_Pmain_cols;
1097 for (i=0, iptr=0; i<sum; i++) {
1098 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1099 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1100 }
1101 for (j=0; j<num_Pext_cols; j++, iptr++){
1102 temp[iptr] = global_id_P[j];
1103 }
1104
1105 if (iptr) {
1106 #ifdef USE_QSORTG
1107 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1108 #else
1109 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1110 #endif
1111 num_RAPext_cols = 1;
1112 i = temp[0];
1113 for (j=1; j<iptr; j++) {
1114 if (temp[j] > i) {
1115 i = temp[j];
1116 temp[num_RAPext_cols++] = i;
1117 }
1118 }
1119 }
1120 }
1121
1122 /* collect the distinct global column ids of RAP_ext into global_id_RAP */
1123 if(num_RAPext_cols){
1124 global_id_RAP = new index_t[num_RAPext_cols];
1125 #pragma omp parallel for private(i) schedule(static)
1126 for (i=0; i<num_RAPext_cols; i++)
1127 global_id_RAP[i] = temp[i];
1128 }
1129 if (num_Pext_cols || sum > 0)
1130 delete[] temp;
1131 j1_ub = offset + num_Pmain_cols;
1132 #pragma omp parallel for private(i, where_p) schedule(static)
1133 for (i=0; i<sum; i++) {
1134 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1135 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1136 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1137 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1138 } else
1139 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1140 }
1141
1142 /* build the mapping */
1143 if (num_Pcouple_cols) {
1144 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
1145 iptr = 0;
1146 for (i=0; i<num_RAPext_cols; i++)
1147 if (global_id_RAP[i] == P->global_id[iptr]) {
1148 Pcouple_to_RAP[iptr++] = i;
1149 if (iptr == num_Pcouple_cols) break;
1150 }
1151 }
1152
1153 if (num_Pext_cols) {
1154 Pext_to_RAP = new index_t[num_Pext_cols];
1155 iptr = 0;
1156 for (i=0; i<num_RAPext_cols; i++)
1157 if (global_id_RAP[i] == global_id_P[iptr]) {
1158 Pext_to_RAP[iptr++] = i;
1159 if (iptr == num_Pext_cols) break;
1160 }
1161 }
1162
1163 if (global_id_P){
1164 delete[] global_id_P;
1165 global_id_P = NULL;
1166 }
1167
1168 /* alloc and initialise the markers */
1169 sum = num_RAPext_cols + num_Pmain_cols;
1170 P_marker = new index_t[sum];
1171 #pragma omp parallel for private(i) schedule(static)
1172 for (i=0; i<sum; i++) P_marker[i] = -1;
1173 #pragma omp parallel for private(i) schedule(static)
1174 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1175
1176 /* Now, count the size of RAP. Start with rows in R_main */
1177 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1178 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1179 shared = P->col_coupler->connector->send->shared;
1180 i = 0;
1181 j = 0;
1182 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1183 /* Mark the start of row for both main block and couple block */
1184 row_marker = i;
1185 row_marker_ext = j;
1186
1187 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1188 in RAP_ext */
1189 P_marker[i_r] = i;
1190 i++;
1191 for (j1=0; j1<num_neighbors; j1++) {
1192 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1193 if (shared[j2] == i_r) {
1194 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1195 i_c = RAP_ext_idx[k];
1196 if (i_c < num_Pmain_cols) {
1197 if (P_marker[i_c] < row_marker) {
1198 P_marker[i_c] = i;
1199 i++;
1200 }
1201 } else {
1202 if (P_marker[i_c] < row_marker_ext) {
1203 P_marker[i_c] = j;
1204 j++;
1205 }
1206 }
1207 }
1208 break;
1209 }
1210 }
1211 }
1212
1213 /* then, loop over elements in row i_r of R_main */
1214 j1_ub = R_main->pattern->ptr[i_r+1];
1215 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1216 i1 = R_main->pattern->index[j1];
1217
1218 /* then, loop over elements in row i1 of A->col_coupleBlock */
1219 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1220 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1221 i2 = A->col_coupleBlock->pattern->index[j2];
1222
1223 /* check whether entry RA[i_r, i2] has been previously visited.
1224 RAP new entry is possible only if entry RA[i_r, i2] has not
1225 been visited yet */
1226 if (A_marker[i2] != i_r) {
1227 /* first, mark entry RA[i_r,i2] as visited */
1228 A_marker[i2] = i_r;
1229
1230 /* then loop over elements in row i2 of P_ext_main */
1231 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1232 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1233 i_c = P->row_coupleBlock->pattern->index[j3];
1234
1235 /* check whether entry RAP[i_r,i_c] has been created.
1236 If not yet created, create the entry and increase
1237 the total number of elements in RAP_ext */
1238 if (P_marker[i_c] < row_marker) {
1239 P_marker[i_c] = i;
1240 i++;
1241 }
1242 }
1243
1244 /* loop over elements in row i2 of P_ext_couple, do the same */
1245 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1246 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1247 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1248 if (P_marker[i_c] < row_marker_ext) {
1249 P_marker[i_c] = j;
1250 j++;
1251 }
1252 }
1253 }
1254 }
1255
1256 /* now loop over elements in row i1 of A->mainBlock, repeat
1257 the process we have done to block A->col_coupleBlock */
1258 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1259 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1260 i2 = A->mainBlock->pattern->index[j2];
1261 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1262 A_marker[i2 + num_Acouple_cols] = i_r;
1263 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1264 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1265 i_c = P->mainBlock->pattern->index[j3];
1266 if (P_marker[i_c] < row_marker) {
1267 P_marker[i_c] = i;
1268 i++;
1269 }
1270 }
1271 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1272 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1273 /* note that we need to map the column index in
1274 P->col_coupleBlock back into the column index in
1275 P_ext_couple */
1276 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1277 if (P_marker[i_c] < row_marker_ext) {
1278 P_marker[i_c] = j;
1279 j++;
1280 }
1281 }
1282 }
1283 }
1284 }
1285 }
1286
1287 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1288 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1289 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1290 RAP_couple */
1291 RAP_main_ptr = new index_t[num_Pmain_cols+1];
1292 RAP_main_idx = new index_t[i];
1293 RAP_main_val = new double[i * block_size];
1294 RAP_couple_ptr = new index_t[num_Pmain_cols+1];
1295 RAP_couple_idx = new index_t[j];
1296 RAP_couple_val = new double[j * block_size];
1297
1298 RAP_main_ptr[num_Pmain_cols] = i;
1299 RAP_couple_ptr[num_Pmain_cols] = j;
1300
1301 #pragma omp parallel for private(i) schedule(static)
1302 for (i=0; i<sum; i++) P_marker[i] = -1;
1303 #pragma omp parallel for private(i) schedule(static)
1304 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1305
1306 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1307 in R_main */
1308 i = 0;
1309 j = 0;
1310 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1311 /* Mark the start of row for both main block and couple block */
1312 row_marker = i;
1313 row_marker_ext = j;
1314 RAP_main_ptr[i_r] = row_marker;
1315 RAP_couple_ptr[i_r] = row_marker_ext;
1316
1317 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1318 in row i_r of RAP_ext */
1319 P_marker[i_r] = i;
1320 for (ib=0; ib<block_size; ib++)
1321 RAP_main_val[i*block_size+ib] = ZERO;
1322 RAP_main_idx[i] = i_r;
1323 i++;
1324
1325 for (j1=0; j1<num_neighbors; j1++) {
1326 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1327 if (shared[j2] == i_r) {
1328 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1329 i_c = RAP_ext_idx[k];
1330 if (i_c < num_Pmain_cols) {
1331 if (P_marker[i_c] < row_marker) {
1332 P_marker[i_c] = i;
1333 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1334 RAP_main_idx[i] = i_c;
1335 i++;
1336 } else {
1337 t1_val = &(RAP_ext_val[k*block_size]);
1338 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1339 for (ib=0; ib<block_size; ib++)
1340 t2_val[ib] += t1_val[ib];
1341 }
1342 } else {
1343 if (P_marker[i_c] < row_marker_ext) {
1344 P_marker[i_c] = j;
1345 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1346 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1347 j++;
1348 } else {
1349 t1_val = &(RAP_ext_val[k*block_size]);
1350 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1351 for (ib=0; ib<block_size; ib++)
1352 t2_val[ib] += t1_val[ib];
1353 }
1354 }
1355 }
1356 break;
1357 }
1358 }
1359 }
1360
1361 /* then, loop over elements in row i_r of R_main */
1362 j1_ub = R_main->pattern->ptr[i_r+1];
1363 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1364 i1 = R_main->pattern->index[j1];
1365 R_val = &(R_main->val[j1*P_block_size]);
1366
1367 /* then, loop over elements in row i1 of A->col_coupleBlock */
1368 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1369 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1370 i2 = A->col_coupleBlock->pattern->index[j2];
1371 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1372 for (irb=0; irb<row_block_size; irb++)
1373 for (icb=0; icb<col_block_size; icb++)
1374 RA_val[irb+row_block_size*icb]=ZERO;
1375 for (irb=0; irb<P_block_size; irb++) {
1376 rtmp=R_val[irb];
1377 for (icb=0; icb<col_block_size; icb++) {
1378 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1379 }
1380 }
1381
1382
1383 /* check whether entry RA[i_r, i2] has been previously visited.
1384 RAP new entry is possible only if entry RA[i_r, i2] has not
1385 been visited yet */
1386 if (A_marker[i2] != i_r) {
1387 /* first, mark entry RA[i_r,i2] as visited */
1388 A_marker[i2] = i_r;
1389
1390 /* then loop over elements in row i2 of P_ext_main */
1391 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1392 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1393 i_c = P->row_coupleBlock->pattern->index[j3];
1394 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1395 for (irb=0; irb<row_block_size; irb++)
1396 for (icb=0; icb<col_block_size; icb++)
1397 RAP_val[irb+row_block_size*icb]=ZERO;
1398 for (icb=0; icb<P_block_size; icb++) {
1399 rtmp=temp_val[icb];
1400 for (irb=0; irb<row_block_size; irb++) {
1401 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1402 }
1403 }
1404
1405
1406 /* check whether entry RAP[i_r,i_c] has been created.
1407 If not yet created, create the entry and increase
1408 the total number of elements in RAP_ext */
1409 if (P_marker[i_c] < row_marker) {
1410 P_marker[i_c] = i;
1411 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1412 RAP_main_idx[i] = i_c;
1413 i++;
1414 } else {
1415 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1416 for (ib=0; ib<block_size; ib++) {
1417 temp_val[ib] += RAP_val[ib];
1418 }
1419 }
1420 }
1421
1422 /* loop over elements in row i2 of P_ext_couple, do the same */
1423 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1424 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1425 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1426 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1427 for (irb=0; irb<row_block_size; irb++)
1428 for (icb=0; icb<col_block_size; icb++)
1429 RAP_val[irb+row_block_size*icb]=ZERO;
1430 for (icb=0; icb<P_block_size; icb++) {
1431 rtmp=temp_val[icb];
1432 for (irb=0; irb<row_block_size; irb++) {
1433 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1434 }
1435 }
1436
1437 if (P_marker[i_c] < row_marker_ext) {
1438 P_marker[i_c] = j;
1439 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1440 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1441 j++;
1442 } else {
1443 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1444 for (ib=0; ib<block_size; ib++) {
1445 temp_val[ib] += RAP_val[ib];
1446 }
1447 }
1448 }
1449
1450 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
1451 is created; only the contributions are added. */
1452 } else {
1453 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1454 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1455 i_c = P->row_coupleBlock->pattern->index[j3];
1456 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1457 for (irb=0; irb<row_block_size; irb++)
1458 for (icb=0; icb<col_block_size; icb++)
1459 RAP_val[irb+row_block_size*icb]=ZERO;
1460 for (icb=0; icb<P_block_size; icb++) {
1461 rtmp=temp_val[icb];
1462 for (irb=0; irb<row_block_size; irb++) {
1463 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1464 }
1465 }
1466
1467 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1468 for (ib=0; ib<block_size; ib++) {
1469 temp_val[ib] += RAP_val[ib];
1470 }
1471 }
1472 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1473 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1474 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1475 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1476 for (irb=0; irb<row_block_size; irb++)
1477 for (icb=0; icb<col_block_size; icb++)
1478 RAP_val[irb+row_block_size*icb]=ZERO;
1479 for (icb=0; icb<P_block_size; icb++) {
1480 rtmp=temp_val[icb];
1481 for (irb=0; irb<row_block_size; irb++) {
1482 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1483 }
1484 }
1485
1486 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1487 for (ib=0; ib<block_size; ib++) {
1488 temp_val[ib] += RAP_val[ib];
1489 }
1490 }
1491 }
1492 }
1493
1494 /* now loop over elements in row i1 of A->mainBlock, repeat
1495 the process we have done to block A->col_coupleBlock */
1496 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1497 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1498 i2 = A->mainBlock->pattern->index[j2];
1499 temp_val = &(A->mainBlock->val[j2*block_size]);
1500 for (irb=0; irb<row_block_size; irb++)
1501 for (icb=0; icb<col_block_size; icb++)
1502 RA_val[irb+row_block_size*icb]=ZERO;
1503 for (irb=0; irb<P_block_size; irb++) {
1504 rtmp=R_val[irb];
1505 for (icb=0; icb<col_block_size; icb++) {
1506 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1507 }
1508 }
1509
1510 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1511 A_marker[i2 + num_Acouple_cols] = i_r;
1512 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1513 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1514 i_c = P->mainBlock->pattern->index[j3];
1515 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1516 for (irb=0; irb<row_block_size; irb++)
1517 for (icb=0; icb<col_block_size; icb++)
1518 RAP_val[irb+row_block_size*icb]=ZERO;
1519 for (icb=0; icb<P_block_size; icb++) {
1520 rtmp=temp_val[icb];
1521 for (irb=0; irb<row_block_size; irb++) {
1522 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1523 }
1524 }
1525 if (P_marker[i_c] < row_marker) {
1526 P_marker[i_c] = i;
1527 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1528 RAP_main_idx[i] = i_c;
1529 i++;
1530 } else {
1531 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1532 for (ib=0; ib<block_size; ib++) {
1533 temp_val[ib] += RAP_val[ib];
1534 }
1535 }
1536 }
1537 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1538 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1539 /* note that we need to map the column index in
1540 P->col_coupleBlock back into the column index in
1541 P_ext_couple */
1542 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1543 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1544 for (irb=0; irb<row_block_size; irb++)
1545 for (icb=0; icb<col_block_size; icb++)
1546 RAP_val[irb+row_block_size*icb]=ZERO;
1547 for (icb=0; icb<P_block_size; icb++) {
1548 rtmp=temp_val[icb];
1549 for (irb=0; irb<row_block_size; irb++) {
1550 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1551 }
1552 }
1553
1554 if (P_marker[i_c] < row_marker_ext) {
1555 P_marker[i_c] = j;
1556 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1557 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1558 j++;
1559 } else {
1560 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1561 for (ib=0; ib<block_size; ib++) {
1562 temp_val[ib] += RAP_val[ib];
1563 }
1564 }
1565 }
1566
1567 } else {
1568 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1569 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1570 i_c = P->mainBlock->pattern->index[j3];
1571 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1572 for (irb=0; irb<row_block_size; irb++)
1573 for (icb=0; icb<col_block_size; icb++)
1574 RAP_val[irb+row_block_size*icb]=ZERO;
1575 for (icb=0; icb<P_block_size; icb++) {
1576 rtmp=temp_val[icb];
1577 for (irb=0; irb<row_block_size; irb++) {
1578 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1579 }
1580 }
1581
1582 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1583 for (ib=0; ib<block_size; ib++) {
1584 temp_val[ib] += RAP_val[ib];
1585 }
1586 }
1587 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1588 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1589 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1590 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1591 for (irb=0; irb<row_block_size; irb++)
1592 for (icb=0; icb<col_block_size; icb++)
1593 RAP_val[irb+row_block_size*icb]=ZERO;
1594 for (icb=0; icb<P_block_size; icb++) {
1595 rtmp = temp_val[icb];
1596 for (irb=0; irb<row_block_size; irb++) {
1597 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1598 }
1599 }
1600
1601 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1602 for (ib=0; ib<block_size; ib++) {
1603 temp_val[ib] += RAP_val[ib];
1604 }
1605 }
1606 }
1607 }
1608 }
1609
1610 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1611 if (i > row_marker) {
1612 offset = i - row_marker;
1613 temp = new index_t[offset];
1614 #pragma omp parallel for schedule(static) private(iptr)
1615 for (iptr=0; iptr<offset; iptr++)
1616 temp[iptr] = RAP_main_idx[row_marker+iptr];
1617 if (offset > 0) {
1618 #ifdef USE_QSORTG
1619 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1620 #else
1621 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1622 #endif
1623 }
1624 temp_val = new double[offset * block_size];
1625 #pragma omp parallel for schedule(static) private(iptr,k)
1626 for (iptr=0; iptr<offset; iptr++){
1627 k = P_marker[temp[iptr]];
1628 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1629 P_marker[temp[iptr]] = iptr + row_marker;
1630 }
1631 #pragma omp parallel for schedule(static) private(iptr)
1632 for (iptr=0; iptr<offset; iptr++){
1633 RAP_main_idx[row_marker+iptr] = temp[iptr];
1634 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1635 }
1636 delete[] temp;
1637 delete[] temp_val;
1638 }
1639 if (j > row_marker_ext) {
1640 offset = j - row_marker_ext;
1641 temp = new index_t[offset];
1642 #pragma omp parallel for schedule(static) private(iptr)
1643 for (iptr=0; iptr<offset; iptr++)
1644 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1645 if (offset > 0) {
1646 #ifdef USE_QSORTG
1647 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1648 #else
1649 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1650 #endif
1651 }
1652 temp_val = new double[offset * block_size];
1653 #pragma omp parallel for schedule(static) private(iptr, k)
1654 for (iptr=0; iptr<offset; iptr++){
1655 k = P_marker[temp[iptr] + num_Pmain_cols];
1656 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1657 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1658 }
1659 #pragma omp parallel for schedule(static) private(iptr)
1660 for (iptr=0; iptr<offset; iptr++){
1661 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1662 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1663 }
1664 delete[] temp;
1665 delete[] temp_val;
1666 }
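/* The two blocks above sort each freshly built row segment of
   RAP_main_idx / RAP_couple_idx and move the attached value blocks along.
   A minimal stand-alone sketch of the same operation, using a sort
   permutation instead of the P_marker relocation table (names are
   illustrative, not part of the Paso API):

       #include <algorithm>
       #include <numeric>
       #include <vector>

       // Sort the column indices of one CSR row segment [lb, ub) and
       // reorder the attached blocks of val (block_size doubles per entry).
       void sort_row_segment(std::vector<int>& idx, std::vector<double>& val,
                             int lb, int ub, int block_size)
       {
           const int n = ub - lb;
           std::vector<int> perm(n);
           std::iota(perm.begin(), perm.end(), 0);
           std::sort(perm.begin(), perm.end(),
                     [&](int a, int b) { return idx[lb+a] < idx[lb+b]; });
           std::vector<int> new_idx(n);
           std::vector<double> new_val(n * block_size);
           for (int p = 0; p < n; p++) {
               new_idx[p] = idx[lb + perm[p]];
               for (int ib = 0; ib < block_size; ib++)
                   new_val[p*block_size + ib] = val[(lb + perm[p])*block_size + ib];
           }
           std::copy(new_idx.begin(), new_idx.end(), idx.begin() + lb);
           std::copy(new_val.begin(), new_val.end(), val.begin() + lb*block_size);
       }

   Both formulations cost O(n log n) per row; the marker-table variant
   above avoids allocating the permutation array. */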
1667 }
1668
1669 delete[] RA_val;
1670 delete[] RAP_val;
1671 delete[] A_marker;
1672 delete[] Pext_to_RAP;
1673 delete[] Pcouple_to_RAP;
1674 delete[] RAP_ext_ptr;
1675 delete[] RAP_ext_idx;
1676 delete[] RAP_ext_val;
1677 Paso_SparseMatrix_free(R_main);
1678 Paso_SparseMatrix_free(R_couple);
1679
1680 /* Check whether there are empty columns in RAP_couple */
1681 #pragma omp parallel for schedule(static) private(i)
1682 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1683 /* the number of non-empty columns is stored in "k" */
1684 k = 0;
1685 j = RAP_couple_ptr[num_Pmain_cols];
1686 for (i=0; i<j; i++) {
1687 i1 = RAP_couple_idx[i];
1688 if (P_marker[i1]) {
1689 P_marker[i1] = 0;
1690 k++;
1691 }
1692 }
1693
1694 /* empty columns found: compress them out */
1695 if (k < num_RAPext_cols) {
1696 temp = new index_t[k];
1697 k = 0;
1698 for (i=0; i<num_RAPext_cols; i++)
1699 if (!P_marker[i]) {
1700 P_marker[i] = k;
1701 temp[k] = global_id_RAP[i];
1702 k++;
1703 }
1704 #pragma omp parallel for schedule(static) private(i, i1)
1705 for (i=0; i<j; i++) {
1706 i1 = RAP_couple_idx[i];
1707 RAP_couple_idx[i] = P_marker[i1];
1708 }
1709 num_RAPext_cols = k;
1710 delete[] global_id_RAP;
1711 global_id_RAP = temp;
1712 }
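/* The compression above keeps the surviving columns in ascending column
   order, so the sortedness of RAP_couple_idx is preserved. A stand-alone
   sketch of the idea, assuming plain arrays (illustrative names, not the
   Paso API):

       #include <vector>

       // Drop unused columns of a coupling block: mark the columns that
       // appear, assign compressed ids in ascending column order, rewrite
       // the indices, and shrink the global id list. Returns the new
       // number of columns.
       int compress_columns(std::vector<int>& idx,
                            std::vector<long>& global_id, int ncols)
       {
           std::vector<int> new_id(ncols, -1);
           for (size_t i = 0; i < idx.size(); i++) new_id[idx[i]] = 0; // used
           int k = 0;
           std::vector<long> kept;
           for (int c = 0; c < ncols; c++)
               if (new_id[c] == 0) { new_id[c] = k++; kept.push_back(global_id[c]); }
           for (size_t i = 0; i < idx.size(); i++) idx[i] = new_id[idx[i]];
           global_id.swap(kept);
           return k;
       }
*/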
1713 delete[] P_marker;
1714
1715 /******************************************************/
1716 /* Start to create the coarse level System Matrix A_c */
1717 /******************************************************/
1718 /* first, prepare the sender/receiver for the col_connector */
1719 dist = P->pattern->input_distribution->first_component;
1720 recv_len = new dim_t[size];
1721 send_len = new dim_t[size];
1722 neighbor = new Esys_MPI_rank[size];
1723 offsetInShared = new index_t[size+1];
1724 shared = new index_t[num_RAPext_cols];
1725 memset(recv_len, 0, sizeof(dim_t) * size);
1726 memset(send_len, 0, sizeof(dim_t) * size);
1727 num_neighbors = 0;
1728 offsetInShared[0] = 0;
1729 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1730 shared[i] = i + num_Pmain_cols;
1731 if (k <= global_id_RAP[i]) {
1732 if (recv_len[j] > 0) {
1733 neighbor[num_neighbors] = j;
1734 num_neighbors ++;
1735 offsetInShared[num_neighbors] = i;
1736 }
1737 while (k <= global_id_RAP[i]) {
1738 j++;
1739 k = dist[j+1];
1740 }
1741 }
1742 recv_len[j] ++;
1743 }
1744 if (recv_len[j] > 0) {
1745 neighbor[num_neighbors] = j;
1746 num_neighbors ++;
1747 offsetInShared[num_neighbors] = i;
1748 }
1749 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1750 neighbor, shared, offsetInShared, 1, 0, mpi_info);
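/* The sweep above relies on global_id_RAP being sorted: it walks the ids
   and the ownership ranges dist[q]..dist[q+1] in lockstep, so each rank
   boundary is crossed at most once. A reduced sketch of just the counting
   part (the neighbour/offset bookkeeping is omitted; names illustrative):

       #include <vector>

       // dist[q]..dist[q+1] is the global row range owned by rank q;
       // gid must be sorted ascending and lie inside dist[0]..dist[size].
       void count_owners(const std::vector<long>& gid,
                         const std::vector<long>& dist,
                         std::vector<int>& recv_len)
       {
           int q = 0;
           for (size_t i = 0; i < gid.size(); i++) {
               while (gid[i] >= dist[q+1]) q++;  // advance to the owning rank
               recv_len[q]++;
           }
       }
*/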
1751
1752 #ifdef ESYS_MPI
1753 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1754 #endif
1755
1756 #ifdef ESYS_MPI
1757 mpi_requests=new MPI_Request[size*2];
1758 mpi_stati=new MPI_Status[size*2];
1759 #else
1760 mpi_requests=new int[size*2];
1761 mpi_stati=new int[size*2];
1762 #endif
1763 num_neighbors = 0;
1764 j = 0;
1765 offsetInShared[0] = 0;
1766 for (i=0; i<size; i++) {
1767 if (send_len[i] > 0) {
1768 neighbor[num_neighbors] = i;
1769 num_neighbors ++;
1770 j += send_len[i];
1771 offsetInShared[num_neighbors] = j;
1772 }
1773 }
1774 delete[] shared;
1775 shared = new index_t[j];
1776 for (i=0, j=0; i<num_neighbors; i++) {
1777 k = neighbor[i];
1778 #ifdef ESYS_MPI
1779 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1780 mpi_info->msg_tag_counter+k,
1781 mpi_info->comm, &mpi_requests[i]);
1782 #endif
1783 j += send_len[k];
1784 }
1785 for (i=0, j=0; i<recv->numNeighbors; i++) {
1786 k = recv->neighbor[i];
1787 #ifdef ESYS_MPI
1788 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1789 mpi_info->msg_tag_counter+rank,
1790 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1791 #endif
1792 j += recv_len[k];
1793 }
1794 #ifdef ESYS_MPI
1795 MPI_Waitall(num_neighbors + recv->numNeighbors,
1796 mpi_requests, mpi_stati);
1797 #endif
1798 ESYS_MPI_INC_COUNTER(*mpi_info, size);
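/* The exchange just completed is the nonblocking handshake used
   throughout this file: post all receives, post all synchronous-mode
   sends, wait on everything, then bump the tag counter so the next
   exchange cannot match stale messages. A self-contained sketch of the
   idiom (illustrative names, not the Paso API):

       #include <mpi.h>
       #include <vector>

       void exchange(MPI_Comm comm, int tag_base,
                     const std::vector<int>& peers,
                     std::vector<std::vector<int> >& recv_buf,
                     std::vector<std::vector<int> >& send_buf)
       {
           int rank;
           MPI_Comm_rank(comm, &rank);
           std::vector<MPI_Request> req(2 * peers.size());
           for (size_t p = 0; p < peers.size(); p++)      // receives first
               MPI_Irecv(&recv_buf[p][0], (int)recv_buf[p].size(), MPI_INT,
                         peers[p], tag_base + peers[p], comm, &req[p]);
           for (size_t p = 0; p < peers.size(); p++)      // tag = sender rank
               MPI_Issend(&send_buf[p][0], (int)send_buf[p].size(), MPI_INT,
                          peers[p], tag_base + rank, comm,
                          &req[peers.size() + p]);
           MPI_Waitall((int)req.size(), &req[0], MPI_STATUSES_IGNORE);
       }
*/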
1799
1800 j = offsetInShared[num_neighbors];
1801 offset = dist[rank];
1802 #pragma omp parallel for schedule(static) private(i)
1803 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1804 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1805 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1806
1807 col_connector = Paso_Connector_alloc(send, recv);
1808 delete[] shared;
1809 Paso_SharedComponents_free(recv);
1810 Paso_SharedComponents_free(send);
1811
1812 /* now, create the row distribution (output_distribution) and the col
1813 distribution (input_distribution) */
1814 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1815 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1816
1817 /* then, prepare the sender/receiver for the row_connector; first, prepare
1818 the information for the sender */
1819 sum = RAP_couple_ptr[num_Pmain_cols];
1820 len = new dim_t[size];
1821 send_ptr = new index_t*[size];
1822 send_idx = new index_t*[size];
1823 #pragma omp parallel for schedule(static) private(i)
1824 for (i=0; i<size; i++) {
1825 send_ptr[i] = new index_t[num_Pmain_cols];
1826 send_idx[i] = new index_t[sum];
1827 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1828 }
1829 memset(len, 0, sizeof(dim_t) * size);
1830 recv = col_connector->recv;
1831 sum=0;
1832 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1833 i1 = RAP_couple_ptr[i_r];
1834 i2 = RAP_couple_ptr[i_r+1];
1835 if (i2 > i1) {
1836 /* then row i_r will be in the sender of row_connector; now check
1837 how many neighbours row i_r needs to be sent to */
1838 for (j=i1; j<i2; j++) {
1839 i_c = RAP_couple_idx[j];
1840 /* find out the corresponding neighbor "p" of column i_c */
1841 for (p=0; p<recv->numNeighbors; p++) {
1842 if (i_c < recv->offsetInShared[p+1]) {
1843 k = recv->neighbor[p];
1844 if (send_ptr[k][i_r] == 0) sum++;
1845 send_ptr[k][i_r] ++;
1846 send_idx[k][len[k]] = global_id_RAP[i_c];
1847 len[k] ++;
1848 break;
1849 }
1850 }
1851 }
1852 }
1853 }
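/* Finding the neighbour p that owns coupled column i_c above uses a
   linear scan over recv->offsetInShared, where neighbour p owns the local
   id range offsetInShared[p]..offsetInShared[p+1]. Since the offsets are
   non-decreasing, a binary search gives the same answer; a sketch
   (illustrative name, not the Paso API):

       #include <algorithm>

       // Returns the first p with i_c < offsetInShared[p+1].
       int owner_of(int i_c, const int* offsetInShared, int numNeighbors)
       {
           const int* begin = offsetInShared + 1;
           const int* end = offsetInShared + numNeighbors + 1;
           return (int)(std::upper_bound(begin, end, i_c) - begin);
       }

   For the small neighbour counts typical here the linear scan is just as
   fast; the binary search only matters with many neighbours. */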
1854 if (global_id_RAP) {
1855 delete[] global_id_RAP;
1856 global_id_RAP = NULL;
1857 }
1858
1859 /* now allocate the sender */
1860 shared = new index_t[sum];
1861 memset(send_len, 0, sizeof(dim_t) * size);
1862 num_neighbors=0;
1863 offsetInShared[0] = 0;
1864 for (p=0, k=0; p<size; p++) {
1865 for (i=0; i<num_Pmain_cols; i++) {
1866 if (send_ptr[p][i] > 0) {
1867 shared[k] = i;
1868 k++;
1869 send_ptr[p][send_len[p]] = send_ptr[p][i];
1870 send_len[p]++;
1871 }
1872 }
1873 if (k > offsetInShared[num_neighbors]) {
1874 neighbor[num_neighbors] = p;
1875 num_neighbors ++;
1876 offsetInShared[num_neighbors] = k;
1877 }
1878 }
1879 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1880 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1881
1882 /* exchange the number of rows to be sent from the current proc, and
1883 recover the info for the receiver of row_connector from the sender */
1884 #ifdef ESYS_MPI
1885 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1886 #endif
1887 num_neighbors = 0;
1888 offsetInShared[0] = 0;
1889 j = 0;
1890 for (i=0; i<size; i++) {
1891 if (i != rank && recv_len[i] > 0) {
1892 neighbor[num_neighbors] = i;
1893 num_neighbors ++;
1894 j += recv_len[i];
1895 offsetInShared[num_neighbors] = j;
1896 }
1897 }
1898 delete[] shared;
1899 delete[] recv_len;
1900 shared = new index_t[j];
1901 k = offsetInShared[num_neighbors];
1902 #pragma omp parallel for schedule(static) private(i)
1903 for (i=0; i<k; i++) {
1904 shared[i] = i + num_Pmain_cols;
1905 }
1906 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1907 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1908 row_connector = Paso_Connector_alloc(send, recv);
1909 delete[] shared;
1910 Paso_SharedComponents_free(recv);
1911 Paso_SharedComponents_free(send);
1912
1913 /* send/recv pattern->ptr for rowCoupleBlock */
1914 num_RAPext_rows = offsetInShared[num_neighbors];
1915 row_couple_ptr = new index_t[num_RAPext_rows+1];
1916 for (p=0; p<num_neighbors; p++) {
1917 j = offsetInShared[p];
1918 i = offsetInShared[p+1];
1919 #ifdef ESYS_MPI
1920 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1921 mpi_info->msg_tag_counter+neighbor[p],
1922 mpi_info->comm, &mpi_requests[p]);
1923 #endif
1924 }
1925 send = row_connector->send;
1926 for (p=0; p<send->numNeighbors; p++) {
1927 #ifdef ESYS_MPI
1928 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1929 MPI_INT, send->neighbor[p],
1930 mpi_info->msg_tag_counter+rank,
1931 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1932 #endif
1933 }
1934 #ifdef ESYS_MPI
1935 MPI_Waitall(num_neighbors + send->numNeighbors,
1936 mpi_requests, mpi_stati);
1937 #endif
1938 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1939 delete[] send_len;
1940
1941 sum = 0;
1942 for (i=0; i<num_RAPext_rows; i++) {
1943 k = row_couple_ptr[i];
1944 row_couple_ptr[i] = sum;
1945 sum += k;
1946 }
1947 row_couple_ptr[num_RAPext_rows] = sum;
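/* The loop above converts per-row entry counts into CSR offsets in
   place: an exclusive prefix sum, after which row_couple_ptr[i] is the
   first entry of row i and row_couple_ptr[n] the total count. As a
   stand-alone sketch (illustrative name):

       #include <vector>

       // On entry ptr[0..n-1] holds counts; on exit ptr[i] is the offset
       // of row i and ptr[n] the total number of entries.
       void counts_to_offsets(std::vector<int>& ptr)  // size n+1
       {
           int sum = 0;
           for (size_t i = 0; i + 1 < ptr.size(); i++) {
               int k = ptr[i];
               ptr[i] = sum;
               sum += k;
           }
           ptr.back() = sum;
       }
*/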
1948
1949 /* send/recv pattern->index for rowCoupleBlock */
1950 k = row_couple_ptr[num_RAPext_rows];
1951 row_couple_idx = new index_t[k];
1952 for (p=0; p<num_neighbors; p++) {
1953 j1 = row_couple_ptr[offsetInShared[p]];
1954 j2 = row_couple_ptr[offsetInShared[p+1]];
1955 #ifdef ESYS_MPI
1956 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1957 mpi_info->msg_tag_counter+neighbor[p],
1958 mpi_info->comm, &mpi_requests[p]);
1959 #endif
1960 }
1961 for (p=0; p<send->numNeighbors; p++) {
1962 #ifdef ESYS_MPI
1963 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1964 MPI_INT, send->neighbor[p],
1965 mpi_info->msg_tag_counter+rank,
1966 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1967 #endif
1968 }
1969 #ifdef ESYS_MPI
1970 MPI_Waitall(num_neighbors + send->numNeighbors,
1971 mpi_requests, mpi_stati);
1972 #endif
1973 ESYS_MPI_INC_COUNTER(*mpi_info, size);
1974
1975 offset = input_dist->first_component[rank];
1976 k = row_couple_ptr[num_RAPext_rows];
1977 #pragma omp parallel for schedule(static) private(i)
1978 for (i=0; i<k; i++) {
1979 row_couple_idx[i] -= offset;
1980 }
1981 #pragma omp parallel for schedule(static) private(i)
1982 for (i=0; i<size; i++) {
1983 delete[] send_ptr[i];
1984 delete[] send_idx[i];
1985 }
1986 delete[] send_ptr;
1987 delete[] send_idx;
1988 delete[] len;
1989 delete[] offsetInShared;
1990 delete[] neighbor;
1991 delete[] mpi_requests;
1992 delete[] mpi_stati;
1993 Esys_MPIInfo_free(mpi_info);
1994
1995 /* Now, we can create the patterns for mainBlock and the two coupleBlocks */
1996 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1997 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1998 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1999 num_Pmain_cols, num_RAPext_cols,
2000 RAP_couple_ptr, RAP_couple_idx);
2001 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2002 num_RAPext_rows, num_Pmain_cols,
2003 row_couple_ptr, row_couple_idx);
2004
2005 /* next, create the system matrix */
2006 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2007 output_dist, input_dist, main_pattern, col_couple_pattern,
2008 row_couple_pattern, col_connector, row_connector);
2009 out = Paso_SystemMatrix_alloc(A->type, pattern,
2010 row_block_size, col_block_size, FALSE);
2011
2012 /* finally, fill in the data */
2013 memcpy(out->mainBlock->val, RAP_main_val,
2014 out->mainBlock->len* sizeof(double));
2015 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2016 out->col_coupleBlock->len * sizeof(double));
2017
2018 /* Clean up */
2019 Paso_SystemMatrixPattern_free(pattern);
2020 Paso_Pattern_free(main_pattern);
2021 Paso_Pattern_free(col_couple_pattern);
2022 Paso_Pattern_free(row_couple_pattern);
2023 Paso_Connector_free(col_connector);
2024 Paso_Connector_free(row_connector);
2025 Paso_Distribution_free(output_dist);
2026 Paso_Distribution_free(input_dist);
2027 delete[] RAP_main_val;
2028 delete[] RAP_couple_val;
2029 if (Esys_noError()) {
2030 return out;
2031 } else {
2032 Paso_SystemMatrix_free(out);
2033 return NULL;
2034 }
2035 }
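/* The function above is an instance of the classic two-pass Gustavson
   scheme for sparse matrix products: a symbolic pass counts the entries
   of each output row with a marker array, then a numeric pass fills
   indices and values. A compact serial sketch for scalar CSR, C = A*B
   (illustrative types and names, not the Paso API):

       #include <algorithm>
       #include <vector>

       struct CSR { int n; std::vector<int> ptr, idx; std::vector<double> val; };

       CSR multiply(const CSR& A, const CSR& B, int ncols_B)
       {
           CSR C; C.n = A.n; C.ptr.assign(A.n + 1, 0);
           std::vector<int> marker(ncols_B, -1);
           for (int i = 0; i < A.n; i++)                      // pass 1: count
               for (int ja = A.ptr[i]; ja < A.ptr[i+1]; ja++)
                   for (int jb = B.ptr[A.idx[ja]]; jb < B.ptr[A.idx[ja]+1]; jb++)
                       if (marker[B.idx[jb]] != i) { marker[B.idx[jb]] = i; C.ptr[i+1]++; }
           for (int i = 0; i < A.n; i++) C.ptr[i+1] += C.ptr[i];
           C.idx.resize(C.ptr[A.n]); C.val.assign(C.ptr[A.n], 0.0);
           std::fill(marker.begin(), marker.end(), -1);
           int nz = 0;
           for (int i = 0; i < A.n; i++) {                    // pass 2: fill
               int row_start = nz;
               for (int ja = A.ptr[i]; ja < A.ptr[i+1]; ja++)
                   for (int jb = B.ptr[A.idx[ja]]; jb < B.ptr[A.idx[ja]+1]; jb++) {
                       int c = B.idx[jb];
                       if (marker[c] < row_start) {           // new entry in row i
                           marker[c] = nz; C.idx[nz] = c;
                           C.val[nz] = A.val[ja] * B.val[jb]; nz++;
                       } else
                           C.val[marker[c]] += A.val[ja] * B.val[jb];
                   }
           }
           return C;
       }

   The code above extends this with block entries, the main/couple block
   split, and the MPI exchange of the RAP_ext rows. */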
2036
2037
2038 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2039 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2040 {
2041 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2042 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
2043 Paso_SystemMatrix *out=NULL;
2044 Paso_SystemMatrixPattern *pattern=NULL;
2045 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
2046 Paso_SharedComponents *send =NULL, *recv=NULL;
2047 Paso_Connector *col_connector=NULL, *row_connector=NULL;
2048 Paso_Pattern *main_pattern=NULL;
2049 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2050 const dim_t row_block_size=A->row_block_size;
2051 const dim_t col_block_size=A->col_block_size;
2052 const dim_t block_size = A->block_size;
2053 const double ZERO = 0.0;
2054 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2055 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2056 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2057 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2058 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2059 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2060 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2061 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2062 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2063 index_t sum, i, j, k, iptr, irb, icb, ib;
2064 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2065 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2066 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2067 index_t row_marker_ext, *where_p=NULL;
2068 index_t **send_ptr=NULL, **send_idx=NULL;
2069 dim_t p, num_neighbors;
2070 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2071 Esys_MPI_rank *neighbor=NULL;
2072 #ifdef ESYS_MPI
2073 MPI_Request* mpi_requests=NULL;
2074 MPI_Status* mpi_stati=NULL;
2075 #else
2076 int *mpi_requests=NULL, *mpi_stati=NULL;
2077 #endif
2078
2079 /* two sparse matrices R_main and R_couple will be generated, as the
2080 transposes of P_main and P_col_couple, respectively. Note that
2081 R_couple is actually the row_coupleBlock of R (R=P^T) */
2082 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2083 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2084
2085 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
2086 and needed locally for the triple matrix product RAP.
2087 To be specific, P_ext (on processor k) is the group of rows in P where
2088 the list of rows from processor q is given by
2089 A->col_coupler->connector->send->shared[rPtr...]
2090 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2091 on q.
2092 P_ext is represented by two sparse matrices P_ext_main and
2093 P_ext_couple */
2094 Paso_Preconditioner_AMG_extendB(A, P);
2095
2096 /* count the number of cols in P_ext_couple, re-index the pattern of
2097 sparse matrix P_ext_couple in the new compressed order, and then
2098 build the col id mapping from P->col_coupleBlock to
2099 P_ext_couple */
2100 num_Pmain_cols = P->mainBlock->numCols;
2101 num_Pcouple_cols = P->col_coupleBlock->numCols;
2102 num_Acouple_cols = A->col_coupleBlock->numCols;
2103 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2104 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2105 offset = P->col_distribution->first_component[rank];
2106 num_Pext_cols = 0;
2107 if (P->global_id) {
2108 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2109 and P->col_coupleBlock */
2110 iptr = 0;
2111 if (num_Pcouple_cols || sum > 0) {
2112 temp = new index_t[num_Pcouple_cols+sum];
2113 for (; iptr<sum; iptr++){
2114 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2115 }
2116 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2117 temp[iptr] = P->global_id[j];
2118 }
2119 }
2120 if (iptr) {
2121 #ifdef USE_QSORTG
2122 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2123 #else
2124 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2125 #endif
2126 num_Pext_cols = 1;
2127 i = temp[0];
2128 for (j=1; j<iptr; j++) {
2129 if (temp[j] > i) {
2130 i = temp[j];
2131 temp[num_Pext_cols++] = i;
2132 }
2133 }
2134 }
2135
2136 /* resize the pattern of P_ext_couple */
2137 if(num_Pext_cols){
2138 global_id_P = new index_t[num_Pext_cols];
2139 for (i=0; i<num_Pext_cols; i++)
2140 global_id_P[i] = temp[i];
2141 }
2142 if (num_Pcouple_cols || sum > 0)
2143 delete[] temp;
2144 for (i=0; i<sum; i++) {
2145 where_p = (index_t *)bsearch(
2146 &(P->remote_coupleBlock->pattern->index[i]),
2147 global_id_P, num_Pext_cols,
2148 sizeof(index_t), Paso_comparIndex);
2149 P->remote_coupleBlock->pattern->index[i] =
2150 (index_t)(where_p -global_id_P);
2151 }
2152
2153 /* build the mapping */
2154 if (num_Pcouple_cols) {
2155 Pcouple_to_Pext = new index_t[num_Pcouple_cols];
2156 iptr = 0;
2157 for (i=0; i<num_Pext_cols; i++)
2158 if (global_id_P[i] == P->global_id[iptr]) {
2159 Pcouple_to_Pext[iptr++] = i;
2160 if (iptr == num_Pcouple_cols) break;
2161 }
2162 }
2163 }
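/* The qsort + duplicate-skipping sweep above builds a sorted, unique
   list of global column ids; the position of an id in that list is its
   compressed local index, recovered later by bsearch. The same pair of
   operations as a stand-alone sketch (illustrative names):

       #include <algorithm>
       #include <vector>

       void compress(std::vector<long>& ids)   // sort and de-duplicate
       {
           std::sort(ids.begin(), ids.end());
           ids.erase(std::unique(ids.begin(), ids.end()), ids.end());
       }

       // ids must be sorted and unique, and must contain g.
       int local_index(const std::vector<long>& ids, long g)
       {
           return (int)(std::lower_bound(ids.begin(), ids.end(), g)
                        - ids.begin());
       }
*/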
2164
2165 /* allocate and initialise the markers */
2166 sum = num_Pext_cols + num_Pmain_cols;
2167 P_marker = new index_t[sum];
2168 A_marker = new index_t[num_A_cols];
2169 #pragma omp parallel for private(i) schedule(static)
2170 for (i=0; i<sum; i++) P_marker[i] = -1;
2171 #pragma omp parallel for private(i) schedule(static)
2172 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2173
2174 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2175 sum = 0;
2176 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2177 row_marker = sum;
2178 /* then, loop over elements in row i_r of R_couple */
2179 j1_ub = R_couple->pattern->ptr[i_r+1];
2180 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2181 i1 = R_couple->pattern->index[j1];
2182 /* then, loop over elements in row i1 of A->col_coupleBlock */
2183 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2184 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2185 i2 = A->col_coupleBlock->pattern->index[j2];
2186
2187 /* check whether entry RA[i_r, i2] has been previously visited.
2188 A new RAP entry is possible only if entry RA[i_r, i2] has not
2189 been visited yet */
2190 if (A_marker[i2] != i_r) {
2191 /* first, mark entry RA[i_r,i2] as visited */
2192 A_marker[i2] = i_r;
2193
2194 /* then loop over elements in row i2 of P_ext_main (linked to P->row_coupleBlock by Paso_Preconditioner_AMG_extendB) */
2195 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2196 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2197 i_c = P->row_coupleBlock->pattern->index[j3];
2198
2199 /* check whether entry RAP[i_r,i_c] has been created.
2200 If not yet created, create the entry and increase
2201 the total number of elements in RAP_ext */
2202 if (P_marker[i_c] < row_marker) {
2203 P_marker[i_c] = sum;
2204 sum++;
2205 }
2206 }
2207
2208 /* loop over elements in row i2 of P_ext_couple, do the same */
2209 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2210 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2211 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2212 if (P_marker[i_c] < row_marker) {
2213 P_marker[i_c] = sum;
2214 sum++;
2215 }
2216 }
2217 }
2218 }
2219
2220 /* now loop over elements in row i1 of A->mainBlock, repeat
2221 the process we have done to block A->col_coupleBlock */
2222 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2223 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2224 i2 = A->mainBlock->pattern->index[j2];
2225 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2226 A_marker[i2 + num_Acouple_cols] = i_r;
2227 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2228 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2229 i_c = P->mainBlock->pattern->index[j3];
2230 if (P_marker[i_c] < row_marker) {
2231 P_marker[i_c] = sum;
2232 sum++;
2233 }
2234 }
2235 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2236 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2237 /* note that we need to map the column index in
2238 P->col_coupleBlock back into the column index in
2239 P_ext_couple */
2240 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2241 if (P_marker[i_c] < row_marker) {
2242 P_marker[i_c] = sum;
2243 sum++;
2244 }
2245 }
2246 }
2247 }
2248 }
2249 }
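/* Note how the counting pass reuses one P_marker array across all rows
   without ever clearing it: P_marker[c] records the running entry count
   at which column c was last created, and "P_marker[c] < row_marker"
   (the count at the start of the current row) means "not yet seen in
   this row". A reduced sketch of the trick (illustrative names):

       #include <vector>

       // marker must be initialised to -1 once, before the first row.
       // cols holds the column indices encountered while processing one
       // output row (duplicates allowed). Returns the updated nnz count.
       int count_row_nnz(const std::vector<int>& cols,
                         std::vector<int>& marker, int nnz_so_far)
       {
           const int row_marker = nnz_so_far;
           for (size_t j = 0; j < cols.size(); j++)
               if (marker[cols[j]] < row_marker)
                   marker[cols[j]] = nnz_so_far++;
           return nnz_so_far;
       }

   This replaces an O(ncols) reset per row with O(1), which matters when
   rows are short but the column range is large. */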
2250
2251 /* Now we have the number of non-zero elements in RAP_ext, allocate
2252 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2253 RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
2254 RAP_ext_idx = new index_t[sum];
2255 RAP_ext_val = new double[sum * block_size];
2256 RA_val = new double[block_size];
2257 RAP_val = new double[block_size];
2258
2259 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2260 sum = num_Pext_cols + num_Pmain_cols;
2261 #pragma omp parallel for private(i) schedule(static)
2262 for (i=0; i<sum; i++) P_marker[i] = -1;
2263 #pragma omp parallel for private(i) schedule(static)
2264 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2265 sum = 0;
2266 RAP_ext_ptr[0] = 0;
2267 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2268 row_marker = sum;
2269 /* then, loop over elements in row i_r of R_couple */
2270 j1_ub = R_couple->pattern->ptr[i_r+1];
2271 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2272 i1 = R_couple->pattern->index[j1];
2273 R_val = &(R_couple->val[j1*block_size]);
2274
2275 /* then, loop over elements in row i1 of A->col_coupleBlock */
2276 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2277 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2278 i2 = A->col_coupleBlock->pattern->index[j2];
2279 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2280 for (irb=0; irb<row_block_size; irb++) {
2281 for (icb=0; icb<col_block_size; icb++) {
2282 rtmp = ZERO;
2283 for (ib=0; ib<col_block_size; ib++) {
2284 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2285 }
2286 RA_val[irb+row_block_size*icb]=rtmp;
2287 }
2288 }
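/* The triple loop above forms the dense block product RA = R_blk * A_blk
   in column-major layout: entry (r,c) of an m x n block is stored at
   r + m*c. Stand-alone, with the inner dimension made explicit
   (illustrative name; in the loop above m = row_block_size and
   kk = n = col_block_size):

       // out (m x n) = R_blk (m x kk) times A_blk (kk x n), column-major.
       void block_multiply(const double* R_blk, const double* A_blk,
                           double* out, int m, int kk, int n)
       {
           for (int r = 0; r < m; r++)
               for (int c = 0; c < n; c++) {
                   double s = 0.0;
                   for (int k = 0; k < kk; k++)
                       s += R_blk[r + m*k] * A_blk[k + kk*c];
                   out[r + m*c] = s;
               }
       }
*/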
2289
2290 /* check whether entry RA[i_r, i2] has been previously visited.
2291 A new RAP entry is possible only if entry RA[i_r, i2] has not
2292 been visited yet */
2293 if (A_marker[i2] != i_r) {
2294 /* first, mark entry RA[i_r,i2] as visited */
2295 A_marker[i2] = i_r;
2296
2297 /* then loop over elements in row i2 of P_ext_main */
2298 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2299 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2300 i_c = P->row_coupleBlock->pattern->index[j3];
2301 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2302 for (irb=0; irb<row_block_size; irb++) {
2303 for (icb=0; icb<col_block_size; icb++) {
2304 rtmp = ZERO;
2305 for (ib=0; ib<col_block_size; ib++) {
2306 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2307 }
2308 RAP_val[irb+row_block_size*icb]=rtmp;
2309 }
2310 }
2311
2312
2313 /* check whether entry RAP[i_r,i_c] has been created.
2314 If not yet created, create the entry and increase
2315 the total number of elements in RAP_ext */
2316 if (P_marker[i_c] < row_marker) {
2317 P_marker[i_c] = sum;
2318 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2319 RAP_ext_idx[sum] = i_c + offset;
2320 sum++;
2321 } else {
2322 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2323 for (ib=0; ib<block_size; ib++) {
2324 temp_val[ib] += RAP_val[ib];
2325 }
2326 }
2327 }
2328
2329 /* loop over elements in row i2 of P_ext_couple, do the same */
2330 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2331 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2332 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2333 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2334 for (irb=0; irb<row_block_size; irb++) {
2335 for (icb=0; icb<col_block_size; icb++) {
2336 rtmp = ZERO;
2337 for (ib=0; ib<col_block_size; ib++) {
2338 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2339 }
2340 RAP_val[irb+row_block_size*icb]=rtmp;
2341 }
2342 }
2343
2344 if (P_marker[i_c] < row_marker) {
2345 P_marker[i_c] = sum;
2346 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2347 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2348 sum++;
2349 } else {
2350 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2351 for (ib=0; ib<block_size; ib++) {
2352 temp_val[ib] += RAP_val[ib];
2353 }
2354 }
2355 }
2356
2357 /* If entry RA[i_r, i2] has already been visited, no new RAP entry
2358 is created; only the contributions are added. */
2359 } else {
2360 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2361 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2362 i_c = P->row_coupleBlock->pattern->index[j3];
2363 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2364 for (irb=0; irb<row_block_size; irb++) {
2365 for (icb=0; icb<col_block_size; icb++) {
2366 rtmp = ZERO;
2367 for (ib=0; ib<col_block_size; ib++) {
2368 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2369 }
2370 RAP_val[irb+row_block_size*icb]=rtmp;
2371 }
2372 }
2373
2374 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2375 for (ib=0; ib<block_size; ib++) {
2376 temp_val[ib] += RAP_val[ib];
2377 }
2378 }
2379 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2380 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2381 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2382 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2383 for (irb=0; irb<row_block_size; irb++) {
2384 for (icb=0; icb<col_block_size; icb++) {
2385 rtmp = ZERO;
2386 for (ib=0; ib<col_block_size; ib++) {
2387 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2388 }
2389 RAP_val[irb+row_block_size*icb]=rtmp;
2390 }
2391 }
2392
2393 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2394 for (ib=0; ib<block_size; ib++) {
2395 temp_val[ib] += RAP_val[ib];
2396 }
2397 }
2398 }
2399 }
2400
2401 /* now loop over elements in row i1 of A->mainBlock, repeat
2402 the process we have done to block A->col_coupleBlock */
2403 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2404 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2405 i2 = A->mainBlock->pattern->index[j2];
2406 temp_val = &(A->mainBlock->val[j2*block_size]);
2407 for (irb=0; irb<row_block_size; irb++) {
2408 for (icb=0; icb<col_block_size; icb++) {
2409 rtmp = ZERO;
2410 for (ib=0; ib<col_block_size; ib++) {
2411 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2412 }
2413 RA_val[irb+row_block_size*icb]=rtmp;
2414 }
2415 }
2416
2417 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2418 A_marker[i2 + num_Acouple_cols] = i_r;
2419 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2420 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2421 i_c = P->mainBlock->pattern->index[j3];
2422 temp_val = &(P->mainBlock->val[j3*block_size]);
2423 for (irb=0; irb<row_block_size; irb++) {
2424 for (icb=0; icb<col_block_size; icb++) {
2425 rtmp = ZERO;
2426 for (ib=0; ib<col_block_size; ib++) {
2427 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2428 }
2429 RAP_val[irb+row_block_size*icb]=rtmp;
2430 }
2431 }
2432
2433 if (P_marker[i_c] < row_marker) {
2434 P_marker[i_c] = sum;
2435 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2436 RAP_ext_idx[sum] = i_c + offset;
2437 sum++;
2438 } else {
2439 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2440 for (ib=0; ib<block_size; ib++) {
2441 temp_val[ib] += RAP_val[ib];
2442 }
2443 }
2444 }
2445 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2446 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2447 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2448 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2449 for (irb=0; irb<row_block_size; irb++) {
2450 for (icb=0; icb<col_block_size; icb++) {
2451 rtmp = ZERO;
2452 for (ib=0; ib<col_block_size; ib++) {
2453 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2454 }
2455 RAP_val[irb+row_block_size*icb]=rtmp;
2456 }
2457 }
2458
2459 if (P_marker[i_c] < row_marker) {
2460 P_marker[i_c] = sum;
2461 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2462 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2463 sum++;
2464 } else {
2465 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2466 for (ib=0; ib<block_size; ib++) {
2467 temp_val[ib] += RAP_val[ib];
2468 }
2469 }
2470 }
2471 } else {
2472 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2473 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2474 i_c = P->mainBlock->pattern->index[j3];
2475 temp_val = &(P->mainBlock->val[j3*block_size]);
2476 for (irb=0; irb<row_block_size; irb++) {
2477 for (icb=0; icb<col_block_size; icb++) {
2478 rtmp = ZERO;
2479 for (ib=0; ib<col_block_size; ib++) {
2480 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2481 }
2482 RAP_val[irb+row_block_size*icb]=rtmp;
2483 }
2484 }
2485
2486 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2487 for (ib=0; ib<block_size; ib++) {
2488 temp_val[ib] += RAP_val[ib];
2489 }
2490 }
2491 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2492 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2493 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2494 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2495 for (irb=0; irb<row_block_size; irb++) {
2496 for (icb=0; icb<col_block_size; icb++) {
2497 rtmp = ZERO;
2498 for (ib=0; ib<col_block_size; ib++) {
2499 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2500 }
2501 RAP_val[irb+row_block_size*icb]=rtmp;
2502 }
2503 }
2504
2505 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2506 for (ib=0; ib<block_size; ib++) {
2507 temp_val[ib] += RAP_val[ib];
2508 }
2509 }
2510 }
2511 }
2512 }
2513 RAP_ext_ptr[i_r+1] = sum;
2514 }
2515 delete[] P_marker;
2516 delete[] Pcouple_to_Pext;
2517
2518 /* Now we have the part of RAP[r,c] where the list of rows "r" is
2519 given by the column list of P->col_coupleBlock, and the list of
2520 columns "c" may cover the whole column range of the system
2521 matrix P. This part of the data is to be passed to the
2522 neighbouring processors, and added to the corresponding
2523 RAP[r,c] entries on those processors */
2524 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2525 &RAP_ext_val, global_id_P, block_size);
2526
2527 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2528 sum = RAP_ext_ptr[num_RAPext_rows];
2529 num_RAPext_cols = 0;
2530 if (num_Pext_cols || sum > 0) {
2531 temp = new index_t[num_Pext_cols+sum];
2532 j1_ub = offset + num_Pmain_cols;
2533 for (i=0, iptr=0; i<sum; i++) {
2534 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2535 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2536 }
2537 for (j=0; j<num_Pext_cols; j++, iptr++){
2538 temp[iptr] = global_id_P[j];
2539 }
2540
2541 if (iptr) {
2542 #ifdef USE_QSORTG
2543 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2544 #else
2545 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2546 #endif
2547 num_RAPext_cols = 1;
2548 i = temp[0];
2549 for (j=1; j<iptr; j++) {
2550 if (temp[j] > i) {
2551 i = temp[j];
2552 temp[num_RAPext_cols++] = i;
2553 }
2554 }
2555 }
2556 }
2557
2558 /* store the global ids of the columns of RAP_ext */
2559 if(num_RAPext_cols){
2560 global_id_RAP = new index_t[num_RAPext_cols];
2561 for (i=0; i<num_RAPext_cols; i++)
2562 global_id_RAP[i] = temp[i];
2563 }
2564 if (num_Pext_cols || sum > 0)
2565 delete[] temp;
2566 j1_ub = offset + num_Pmain_cols;
2567 for (i=0; i<sum; i++) {
2568 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2569 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2570 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2571 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2572 } else
2573 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2574 }
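/* The remapping above splits each global column id into two cases: ids
   inside this rank's own range become local main-block columns
   (subtract the offset); all others are looked up in the sorted
   global_id_RAP list and shifted past the main columns. As a sketch
   (illustrative name, not the Paso API):

       #include <algorithm>
       #include <vector>

       int map_col(long g, long offset, int num_main,
                   const std::vector<long>& remote_ids)  // sorted, unique
       {
           if (g >= offset && g < offset + (long)num_main)
               return (int)(g - offset);                 // local column
           return num_main + (int)(std::lower_bound(remote_ids.begin(),
                                   remote_ids.end(), g) - remote_ids.begin());
       }
*/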
2575
2576 /* build the mapping */
2577 if (num_Pcouple_cols) {
2578 Pcouple_to_RAP = new index_t[num_Pcouple_cols];
2579 iptr = 0;
2580 for (