
Contents of /trunk/paso/src/AMG_Interpolation.cpp



Revision 3821
Mon Feb 13 05:57:49 2012 UTC by lgao
Original Path: branches/amg_from_3530/paso/src/AMG_Interpolation.c
File MIME type: text/plain
File size: 122501 bytes
A bit of optimization using OpenMP.


1 //
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2011 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: defines AMG Interpolation */
18
19 /**************************************************************/
20
21 /* Author: l.gao@uq.edu.au */
22
23 /**************************************************************/
24
25 #include "Paso.h"
26 #include "SparseMatrix.h"
27 #include "PasoUtil.h"
28 #include "Preconditioner.h"
29
30 /**************************************************************
31
32 Methods necessary for the AMG preconditioner
33
34 construct the n_C x n_C coarse operator A_C from the matrix A
35 and the prolongation matrix P.
36
37 The coarse operator A_C is defined by RAP, where R=P^T.
38
39 ***************************************************************/
40
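/* For reference, a minimal dense sketch of the Galerkin product that
   the sparse kernels in this file implement. The names n, n_C, a, p
   and ac are illustrative only, not part of Paso's API; the block is
   wrapped in #if 0 so the listing stays compilable. */
#if 0
/* ac = P^T * A * P for dense row-major a (n x n) and p (n x n_C) */
static void dense_galerkin(dim_t n, dim_t n_C, const double *a,
                           const double *p, double *ac)
{
    dim_t i, j, k, l;
    for (i = 0; i < n_C; i++)
        for (j = 0; j < n_C; j++) {
            double s = 0.0;
            for (k = 0; k < n; k++)
                for (l = 0; l < n; l++)
                    s += p[k*n_C + i] * a[k*n + l] * p[l*n_C + j];
            ac[i*n_C + j] = s;
        }
}
#endif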
41 /* Extend the system matrix B with two extra sparse matrices:
42 B_ext_main and B_ext_couple
43 The combination of these two sparse matrices represents
44 the portion of B that is stored on neighbor procs and
45 needed locally for the triple matrix product (B^T) A B.
46
47 FOR NOW, we assume that the system matrix B has a NULL
48 row_coupleBlock and a NULL remote_coupleBlock. Based on
49 this assumption, we use the link row_coupleBlock for the sparse
50 matrix B_ext_main, and the link remote_coupleBlock for the sparse
51 matrix B_ext_couple.
52
53 To be specific, B_ext (on processor k) is the group of rows
54 in B, where the list of rows from processor q is given by
55 A->col_coupler->connector->send->shared[rPtr...]
56 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
57 on q. */
58 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
59 {
60 Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
61 Paso_Coupler *coupler=NULL;
62 Paso_SharedComponents *send=NULL, *recv=NULL;
63 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
64 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
65 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
66 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
67 index_t *idx_m=NULL, *idx_c=NULL;
68 index_t i, j, k, m, n, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
69 index_t offset, len, block_size, block_size_size, max_num_cols;
70 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
71 dim_t *recv_degree=NULL, *send_degree=NULL;
72 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
73
74 if (size == 1) return;
75
76 if (B->row_coupleBlock) {
77 Paso_SparseMatrix_free(B->row_coupleBlock);
78 B->row_coupleBlock = NULL;
79 }
80
81 if (B->remote_coupleBlock) { /* row_coupleBlock was just freed above */
82 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
83 return;
84 }
85
86 /* send/receive the unknowns' global IDs */
87 num_main_cols = B->mainBlock->numCols;
88 cols = TMPMEMALLOC(num_main_cols, double);
89 offset = B->col_distribution->first_component[rank];
90 #pragma omp parallel for private(i) schedule(static)
91 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
92 if (B->global_id == NULL) {
93 coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
94 Paso_Coupler_startCollect(coupler, cols);
95 }
96
97 recv_buf = TMPMEMALLOC(size, index_t);
98 recv_degree = TMPMEMALLOC(size, dim_t);
99 recv_offset = TMPMEMALLOC(size+1, index_t);
100 #pragma omp parallel for private(i) schedule(static)
101 for (i=0; i<size; i++){
102 recv_buf[i] = 0;
103 recv_degree[i] = 1;
104 recv_offset[i] = i;
105 }
106
107 block_size = B->block_size;
108 block_size_size = block_size * sizeof(double);
109 num_couple_cols = B->col_coupleBlock->numCols;
110 send = A->col_coupler->connector->send;
111 recv = A->col_coupler->connector->recv;
112 num_neighbors = send->numNeighbors;
113 p = send->offsetInShared[num_neighbors];
114 len = p * B->col_distribution->first_component[size];
115 send_buf = TMPMEMALLOC(len * block_size, double);
116 send_idx = TMPMEMALLOC(len, index_t);
117 send_offset = TMPMEMALLOC((p+1)*2, index_t);
118 send_degree = TMPMEMALLOC(num_neighbors, dim_t);
119 i = num_main_cols + num_couple_cols;
120 send_m = TMPMEMALLOC(i * block_size, double);
121 send_c = TMPMEMALLOC(i * block_size, double);
122 idx_m = TMPMEMALLOC(i, index_t);
123 idx_c = TMPMEMALLOC(i, index_t);
124
125 /* wait for the arrival of the unknowns' global IDs */
126 if (B->global_id == NULL) {
127 Paso_Coupler_finishCollect(coupler);
128 global_id = MEMALLOC(num_couple_cols, index_t);
129 #pragma omp parallel for private(i) schedule(static)
130 for (i=0; i<num_couple_cols; ++i)
131 global_id[i] = coupler->recv_buffer[i];
132 B->global_id = global_id;
133 Paso_Coupler_free(coupler);
134 } else
135 global_id = B->global_id;
136
137 /* distribute the number of cols in current col_coupleBlock to all ranks */
138 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
139
140 /* distribute the global ids of the cols to be considered to all ranks */
141 len = 0;
142 max_num_cols = 0;
143 for (i=0; i<size; i++){
144 recv_degree[i] = recv_buf[i];
145 recv_offset[i] = len;
146 len += recv_buf[i];
147 if (max_num_cols < recv_buf[i])
148 max_num_cols = recv_buf[i];
149 }
150 recv_offset[size] = len;
151 cols_array = TMPMEMALLOC(len, index_t);
152 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
153
154 /* first, prepare the ptr_ptr to be received */
155 q = recv->numNeighbors;
156 len = recv->offsetInShared[q];
157 ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
158 for (p=0; p<q; p++) {
159 row = recv->offsetInShared[p];
160 m = recv->offsetInShared[p + 1];
161 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
162 A->mpi_info->msg_tag_counter+recv->neighbor[p],
163 A->mpi_info->comm,
164 &(A->col_coupler->mpi_requests[p]));
165 }
166
167 /* now prepare the rows to be sent (the degree, the offset and the data) */
168 len = 0;
169 i0 = 0;
170 for (p=0; p<num_neighbors; p++) {
171 i = i0;
172 neighbor = send->neighbor[p];
173 m_lb = B->col_distribution->first_component[neighbor];
174 m_ub = B->col_distribution->first_component[neighbor + 1];
175 j_ub = send->offsetInShared[p + 1];
176 for (j=send->offsetInShared[p]; j<j_ub; j++) {
177 row = send->shared[j];
178 l_m = 0;
179 l_c = 0;
180 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
181 k_lb = B->col_coupleBlock->pattern->ptr[row];
182
183 /* check part of col_coupleBlock for data to be passed @row */
184 for (k=k_lb; k<k_ub; k++) {
185 m = global_id[B->col_coupleBlock->pattern->index[k]];
186 if (m > offset) break;
187 if (m>= m_lb && m < m_ub) {
188 /* data to be passed to sparse matrix B_ext_main */
189 idx_m[l_m] = m - m_lb;
190 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
191 l_m++;
192 } else {
193 /* data to be passed to sparse matrix B_ext_couple */
194 idx_c[l_c] = m;
195 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
196 l_c++;
197 }
198 }
199 k_lb = k;
200
201 /* check mainBlock for data to be passed @row to sparse
202 matrix B_ext_couple */
203 k_ub = B->mainBlock->pattern->ptr[row + 1];
204 k = B->mainBlock->pattern->ptr[row];
205 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
206 for (; k<k_ub; k++) {
207 m = B->mainBlock->pattern->index[k] + offset;
208 idx_c[l_c] = m;
209 l_c++;
210 }
211
212 /* check the rest part of col_coupleBlock for data to
213 be passed @row to sparse matrix B_ext_couple */
214 k = k_lb;
215 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
216 for (k=k_lb; k<k_ub; k++) {
217 m = global_id[B->col_coupleBlock->pattern->index[k]];
218 if (m>= m_lb && m < m_ub) {
219 /* data to be passed to sparse matrix B_ext_main */
220 idx_m[l_m] = m - m_lb;
221 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
222 l_m++;
223 } else {
224 /* data to be passed to sparse matrix B_ext_couple */
225 idx_c[l_c] = m;
226 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
227 l_c++;
228 }
229 }
230
231 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
232 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
233 send_offset[2*i] = l_m;
234 len += l_m;
235 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
236 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
237 send_offset[2*i+1] = l_c;
238 len += l_c;
239 i++;
240 }
241
242 /* sending */
243 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
244 A->mpi_info->msg_tag_counter+rank,
245 A->mpi_info->comm,
246 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
247 send_degree[p] = len;
248 i0 = i;
249 }
250 TMPMEMFREE(send_m);
251 TMPMEMFREE(send_c);
252 TMPMEMFREE(idx_m);
253 TMPMEMFREE(idx_c);
254
255
256 q = recv->numNeighbors;
257 len = recv->offsetInShared[q];
258 ptr_main = MEMALLOC((len+1), index_t);
259 ptr_couple = MEMALLOC((len+1), index_t);
260
261
262 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
263 A->col_coupler->mpi_requests,
264 A->col_coupler->mpi_stati);
265 A->mpi_info->msg_tag_counter += size;
266
267 j = 0;
268 k = 0;
269 ptr_main[0] = 0;
270 ptr_couple[0] = 0;
271 // #pragma omp parallel for private(i,j,k) schedule(static)
272 for (i=0; i<len; i++) {
273 j += ptr_ptr[2*i];
274 k += ptr_ptr[2*i+1];
275 ptr_main[i+1] = j;
276 ptr_couple[i+1] = k;
277 }
278
279 TMPMEMFREE(ptr_ptr);
280 idx_main = MEMALLOC(j, index_t);
281 idx_couple = MEMALLOC(k, index_t);
282 ptr_idx = TMPMEMALLOC(j+k, index_t);
283 ptr_val = TMPMEMALLOC((j+k) * block_size, double);
284
285 /* send/receive index array */
286 j=0;
287 k_ub = 0;
288 for (p=0; p<recv->numNeighbors; p++) {
289 k = recv->offsetInShared[p];
290 m = recv->offsetInShared[p+1];
291 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
292 if (i > 0) {
293 k_ub ++;
294 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
295 A->mpi_info->msg_tag_counter+recv->neighbor[p],
296 A->mpi_info->comm,
297 &(A->col_coupler->mpi_requests[p]));
298 }
299 j += i;
300 }
301
302 j=0;
303 k_ub = 0;
304 for (p=0; p<num_neighbors; p++) {
305 i = send_degree[p] - j;
306 if (i > 0){
307 k_ub ++;
308 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
309 A->mpi_info->msg_tag_counter+rank,
310 A->mpi_info->comm,
311 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
312 }
313 j = send_degree[p];
314 }
315
316 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
317 A->col_coupler->mpi_requests,
318 A->col_coupler->mpi_stati);
319 A->mpi_info->msg_tag_counter += size;
320
321 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
322 for (i=0; i<len; i++) {
323 j = ptr_main[i];
324 k = ptr_main[i+1];
325 m = ptr_couple[i];
326 for (p=j; p<k; p++) {
327 idx_main[p] = ptr_idx[m+p];
328 }
329 j = ptr_couple[i+1];
330 for (p=m; p<j; p++) {
331 idx_couple[p] = ptr_idx[k+p];
332 }
333 }
334 TMPMEMFREE(ptr_idx);
335
336 /* allocate pattern and sparsematrix for B_ext_main */
337 pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
338 len, num_main_cols, ptr_main, idx_main);
339 B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
340 pattern_main, B->row_block_size, B->col_block_size,
341 FALSE);
342 Paso_Pattern_free(pattern_main);
343
344 /* allocate pattern and sparsematrix for B_ext_couple */
345 pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
346 len, B->col_distribution->first_component[size],
347 ptr_couple, idx_couple);
348 B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
349 pattern_couple, B->row_block_size, B->col_block_size,
350 FALSE);
351 Paso_Pattern_free(pattern_couple);
352
353 /* send/receive value array */
354 j=0;
355 for (p=0; p<recv->numNeighbors; p++) {
356 k = recv->offsetInShared[p];
357 m = recv->offsetInShared[p+1];
358 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
359 if (i > 0)
360 MPI_Irecv(&(ptr_val[j]), i * block_size,
361 MPI_DOUBLE, recv->neighbor[p],
362 A->mpi_info->msg_tag_counter+recv->neighbor[p],
363 A->mpi_info->comm,
364 &(A->col_coupler->mpi_requests[p]));
365 j += (i * block_size);
366 }
367
368 j=0;
369 for (p=0; p<num_neighbors; p++) {
370 i = send_degree[p] - j;
371 if (i > 0)
372 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
373 A->mpi_info->msg_tag_counter+rank,
374 A->mpi_info->comm,
375 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
376 j = send_degree[p] ;
377 }
378
379 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
380 A->col_coupler->mpi_requests,
381 A->col_coupler->mpi_stati);
382 A->mpi_info->msg_tag_counter += size;
383
384 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
385 for (i=0; i<len; i++) {
386 j = ptr_main[i];
387 k = ptr_main[i+1];
388 m = ptr_couple[i];
389 for (p=j; p<k; p++) {
390 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
391 }
392 j = ptr_couple[i+1];
393 for (p=m; p<j; p++) {
394 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
395 }
396 }
397 TMPMEMFREE(ptr_val);
398
399
400 /* release all temp memory allocation */
401 TMPMEMFREE(cols);
402 TMPMEMFREE(cols_array);
403 TMPMEMFREE(recv_offset);
404 TMPMEMFREE(recv_degree);
405 TMPMEMFREE(recv_buf);
406 TMPMEMFREE(send_buf);
407 TMPMEMFREE(send_offset);
408 TMPMEMFREE(send_degree);
409 TMPMEMFREE(send_idx);
410 }
411
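/* A reduced sketch of the row splitting performed in the send loop of
   Paso_Preconditioner_AMG_extendB above: each entry of a CSR row is
   routed either to the "main" part (columns owned by the receiving
   rank, i.e. global ids in [m_lb, m_ub)) or to the "couple" part
   (all other columns, which keep their global ids). The function and
   argument names are illustrative, not part of Paso. */
#if 0
static void split_row(index_t k_lb, index_t k_ub, const index_t *global_col,
                      index_t m_lb, index_t m_ub,
                      index_t *idx_m, dim_t *l_m,
                      index_t *idx_c, dim_t *l_c)
{
    index_t k, m;
    *l_m = 0;
    *l_c = 0;
    for (k = k_lb; k < k_ub; k++) {
        m = global_col[k];
        if (m >= m_lb && m < m_ub)
            idx_m[(*l_m)++] = m - m_lb;   /* becomes a local column on the receiver */
        else
            idx_c[(*l_c)++] = m;          /* stays global */
    }
}
#endif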
412 /* The sparse matrix (call it T) defined by T(ptr, idx, val)
413 has the same number of rows as P->col_coupleBlock->numCols. Now we need
414 to copy blocks of data in T to the neighbor processors, defined by
415 P->col_coupler->connector->recv->neighbor[k] where k is in
416 [0, P->col_coupler->connector->recv->numNeighbors).
417 The rows to be copied to neighbor processor k are given by the list
418 P->col_coupler->connector->recv->offsetInShared[k] ...
419 P->col_coupler->connector->recv->offsetInShared[k+1] */
420 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
421 index_t **p_ptr, index_t **p_idx, double **p_val,
422 index_t *global_id, index_t block_size)
423 {
424 Paso_SharedComponents *send=NULL, *recv=NULL;
425 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
426 index_t i, j, p, m, n, rank, size, offset;
427 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
428 index_t *ptr=*p_ptr, *idx=*p_idx;
429 double *val=*p_val, *recv_val=NULL;
430
431 rank = P->mpi_info->rank;
432 size = P->mpi_info->size;
433 send = P->col_coupler->connector->recv;
434 recv = P->col_coupler->connector->send;
435 send_neighbors = send->numNeighbors;
436 recv_neighbors = recv->numNeighbors;
437 send_rows = P->col_coupleBlock->numCols;
438 recv_rows = recv->offsetInShared[recv_neighbors];
439 offset = P->col_distribution->first_component[rank];
440
441 send_degree = TMPMEMALLOC(send_rows, index_t);
442 recv_ptr = MEMALLOC(recv_rows + 1, index_t);
443 #pragma omp parallel for schedule(static) private(i)
444 for (i=0; i<send_rows; i++)
445 send_degree[i] = ptr[i+1] - ptr[i];
446
447 /* First, send/receive the degree */
448 for (p=0; p<recv_neighbors; p++) { /* Receiving */
449 m = recv->offsetInShared[p];
450 n = recv->offsetInShared[p+1];
451 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
452 P->mpi_info->msg_tag_counter + recv->neighbor[p],
453 P->mpi_info->comm,
454 &(P->col_coupler->mpi_requests[p]));
455 }
456 for (p=0; p<send_neighbors; p++) { /* Sending */
457 m = send->offsetInShared[p];
458 n = send->offsetInShared[p+1];
459 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
460 P->mpi_info->msg_tag_counter + rank,
461 P->mpi_info->comm,
462 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
463 }
464 MPI_Waitall(send_neighbors+recv_neighbors,
465 P->col_coupler->mpi_requests,
466 P->col_coupler->mpi_stati);
467 P->mpi_info->msg_tag_counter += size;
468
469 TMPMEMFREE(send_degree);
470 m = Paso_Util_cumsum(recv_rows, recv_ptr);
471 recv_ptr[recv_rows] = m;
472 recv_idx = MEMALLOC(m, index_t);
473 recv_val = MEMALLOC(m * block_size, double);
474
475 /* Next, send/receive the index array */
476 j = 0;
477 for (p=0; p<recv_neighbors; p++) { /* Receiving */
478 m = recv->offsetInShared[p];
479 n = recv->offsetInShared[p+1];
480 i = recv_ptr[n] - recv_ptr[m];
481 if (i > 0) {
482 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
483 P->mpi_info->msg_tag_counter + recv->neighbor[p],
484 P->mpi_info->comm,
485 &(P->col_coupler->mpi_requests[p]));
486 }
487 j += i;
488 }
489
490 j = 0;
491 for (p=0; p<send_neighbors; p++) { /* Sending */
492 m = send->offsetInShared[p];
493 n = send->offsetInShared[p+1];
494 i = ptr[n] - ptr[m];
495 if (i >0) {
496 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
497 P->mpi_info->msg_tag_counter + rank,
498 P->mpi_info->comm,
499 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
500 j += i;
501 }
502 }
503 MPI_Waitall(send_neighbors+recv_neighbors,
504 P->col_coupler->mpi_requests,
505 P->col_coupler->mpi_stati);
506 P->mpi_info->msg_tag_counter += size;
507
508 /* Last, send/receive the data array */
509 j = 0;
510 for (p=0; p<recv_neighbors; p++) { /* Receiving */
511 m = recv->offsetInShared[p];
512 n = recv->offsetInShared[p+1];
513 i = recv_ptr[n] - recv_ptr[m];
514 if (i > 0)
515 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
516 P->mpi_info->msg_tag_counter + recv->neighbor[p],
517 P->mpi_info->comm,
518 &(P->col_coupler->mpi_requests[p]));
519 j += (i*block_size);
520 }
521
522 j = 0;
523 for (p=0; p<send_neighbors; p++) { /* Sending */
524 m = send->offsetInShared[p];
525 n = send->offsetInShared[p+1];
526 i = ptr[n] - ptr[m];
527 if (i >0) {
528 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
529 P->mpi_info->msg_tag_counter + rank,
530 P->mpi_info->comm,
531 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
532 j += i * block_size;
533 }
534 }
535 MPI_Waitall(send_neighbors+recv_neighbors,
536 P->col_coupler->mpi_requests,
537 P->col_coupler->mpi_stati);
538 P->mpi_info->msg_tag_counter += size;
539
540 /* Clean up and return with the received ptr, index and data arrays */
541 MEMFREE(ptr);
542 MEMFREE(idx);
543 MEMFREE(val);
544 *p_ptr = recv_ptr;
545 *p_idx = recv_idx;
546 *p_val = recv_val;
547 }
548
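/* The exchange in Paso_Preconditioner_AMG_CopyRemoteData repeats one
   fixed pattern three times (row degrees, column indices, values):
   post all receives, post all sends, wait on everything, then advance
   the tag counter by the communicator size. A minimal sketch of one
   such round; every name below is illustrative, not Paso's API. */
#if 0
static void exchange_round(int nrecv, const int *recv_rank, const int *recv_cnt,
                           const int *recv_dsp, index_t *recv_buf,
                           int nsend, const int *send_rank, const int *send_cnt,
                           const int *send_dsp, index_t *send_buf,
                           int tag, int my_rank, MPI_Comm comm, MPI_Request *req)
{
    int p;
    for (p = 0; p < nrecv; p++)   /* receives are tagged with the sender's rank */
        MPI_Irecv(&recv_buf[recv_dsp[p]], recv_cnt[p], MPI_INT, recv_rank[p],
                  tag + recv_rank[p], comm, &req[p]);
    for (p = 0; p < nsend; p++)   /* ... so sends are tagged with our own rank */
        MPI_Issend(&send_buf[send_dsp[p]], send_cnt[p], MPI_INT, send_rank[p],
                   tag + my_rank, comm, &req[nrecv + p]);
    MPI_Waitall(nrecv + nsend, req, MPI_STATUSES_IGNORE);
}
#endif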
549 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
550 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
551 {
552 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
553 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
554 Paso_SystemMatrix *out=NULL;
555 Paso_SystemMatrixPattern *pattern=NULL;
556 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
557 Paso_SharedComponents *send =NULL, *recv=NULL;
558 Paso_Connector *col_connector=NULL, *row_connector=NULL;
559 Paso_Coupler *coupler=NULL;
560 Paso_Pattern *main_pattern=NULL;
561 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
562 const dim_t row_block_size=A->row_block_size;
563 const dim_t col_block_size=A->col_block_size;
564 const dim_t block_size = A->block_size;
565 const dim_t P_block_size = P->block_size;
566 const dim_t num_threads=omp_get_max_threads();
567 const double ZERO = 0.0;
568 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
569 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
570 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
571 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
572 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
573 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
574 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
575 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
576 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
577 index_t sum, i, j, k, iptr, irb, icb, ib;
578 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
579 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
580 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
581 index_t row_marker_ext, *where_p=NULL;
582 index_t **send_ptr=NULL, **send_idx=NULL;
583 dim_t l, p, q, global_label, num_neighbors;
584 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
585 Esys_MPI_rank *neighbor=NULL;
586 #ifdef ESYS_MPI
587 MPI_Request* mpi_requests=NULL;
588 MPI_Status* mpi_stati=NULL;
589 #else
590 int *mpi_requests=NULL, *mpi_stati=NULL;
591 #endif
592
593 // if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
594 // return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);
595
596 /* two sparse matrices R_main and R_couple will be generated as the
597 transposes of P_main and P_col_couple, respectively. Note that
598 R_couple is actually the row_coupleBlock of R (R=P^T) */
599 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
600 if (size > 1)
601 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
602
603 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
604 and needed locally for the triple matrix product RAP.
605 To be specific, P_ext (on processor k) is the group of rows in P, where
606 the list of rows from processor q is given by
607 A->col_coupler->connector->send->shared[rPtr...]
608 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
609 on q.
610 P_ext is represented by the two sparse matrices P_ext_main and
611 P_ext_couple */
612 Paso_Preconditioner_AMG_extendB(A, P);
613
614 /* count the number of cols in P_ext_couple, resize the pattern of
615 sparse matrix P_ext_couple with new compressed order, and then
616 build the col id mapping from P->col_coupleBlock to
617 P_ext_couple */
618 num_Pmain_cols = P->mainBlock->numCols;
619 if (size > 1) {
620 num_Pcouple_cols = P->col_coupleBlock->numCols;
621 num_Acouple_cols = A->col_coupleBlock->numCols;
622 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
623 } else {
624 num_Pcouple_cols = 0;
625 num_Acouple_cols = 0;
626 sum = 0;
627 }
628 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
629 offset = P->col_distribution->first_component[rank];
630 num_Pext_cols = 0;
631 if (P->global_id) {
632 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
633 and P->col_coupleBlock */
634 iptr = 0;
635 if (num_Pcouple_cols || sum > 0) {
636 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
637 #pragma omp parallel for lastprivate(iptr) schedule(static)
638 for (iptr=0; iptr<sum; iptr++){
639 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
640 }
641 for (j=0; j<num_Pcouple_cols; j++, iptr++){
642 temp[iptr] = P->global_id[j];
643 }
644 }
645 if (iptr) {
646 #ifdef USE_QSORTG
647 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
648 #else
649 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
650 #endif
651 num_Pext_cols = 1;
652 i = temp[0];
653 for (j=1; j<iptr; j++) {
654 if (temp[j] > i) {
655 i = temp[j];
656 temp[num_Pext_cols++] = i;
657 }
658 }
659 }
660 /* resize the pattern of P_ext_couple */
661 if(num_Pext_cols){
662 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
663 #pragma omp parallel for private(i) schedule(static)
664 for (i=0; i<num_Pext_cols; i++)
665 global_id_P[i] = temp[i];
666 }
667 if (num_Pcouple_cols || sum > 0)
668 TMPMEMFREE(temp);
669 #pragma omp parallel for private(i, where_p) schedule(static)
670 for (i=0; i<sum; i++) {
671 where_p = (index_t *)bsearch(
672 &(P->remote_coupleBlock->pattern->index[i]),
673 global_id_P, num_Pext_cols,
674 sizeof(index_t), Paso_comparIndex);
675 P->remote_coupleBlock->pattern->index[i] =
676 (index_t)(where_p -global_id_P);
677 }
678
679 /* build the mapping */
680 if (num_Pcouple_cols) {
681 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
682 iptr = 0;
683 for (i=0; i<num_Pext_cols; i++)
684 if (global_id_P[i] == P->global_id[iptr]) {
685 Pcouple_to_Pext[iptr++] = i;
686 if (iptr == num_Pcouple_cols) break;
687 }
688 }
689 }
690
691 /* alloc and initialize the markers */
692 sum = num_Pext_cols + num_Pmain_cols;
693 P_marker = TMPMEMALLOC(sum, index_t);
694 A_marker = TMPMEMALLOC(num_A_cols, index_t);
695 #pragma omp parallel for private(i) schedule(static)
696 for (i=0; i<sum; i++) P_marker[i] = -1;
697 #pragma omp parallel for private(i) schedule(static)
698 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
699
700 /* Now, count the size of RAP_ext. Start with rows in R_couple */
701 sum = 0;
702 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
703 row_marker = sum;
704 /* then, loop over elements in row i_r of R_couple */
705 j1_ub = R_couple->pattern->ptr[i_r+1];
706 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
707 i1 = R_couple->pattern->index[j1];
708 /* then, loop over elements in row i1 of A->col_coupleBlock */
709 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
710 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
711 i2 = A->col_coupleBlock->pattern->index[j2];
712
713 /* check whether entry RA[i_r, i2] has been previously visited.
714 RAP new entry is possible only if entry RA[i_r, i2] has not
715 been visited yet */
716 if (A_marker[i2] != i_r) {
717 /* first, mark entry RA[i_r,i2] as visited */
718 A_marker[i2] = i_r;
719
720 /* then loop over elements in row i2 of P_ext_main */
721 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
722 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
723 i_c = P->row_coupleBlock->pattern->index[j3];
724
725 /* check whether entry RAP[i_r,i_c] has been created.
726 If not yet created, create the entry and increase
727 the total number of elements in RAP_ext */
728 if (P_marker[i_c] < row_marker) {
729 P_marker[i_c] = sum;
730 sum++;
731 }
732 }
733
734 /* loop over elements in row i2 of P_ext_couple, do the same */
735 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
736 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
737 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
738 if (P_marker[i_c] < row_marker) {
739 P_marker[i_c] = sum;
740 sum++;
741 }
742 }
743 }
744 }
745
746 /* now loop over elements in row i1 of A->mainBlock, repeat
747 the process we have done to block A->col_coupleBlock */
748 j2_ub = A->mainBlock->pattern->ptr[i1+1];
749 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
750 i2 = A->mainBlock->pattern->index[j2];
751 if (A_marker[i2 + num_Acouple_cols] != i_r) {
752 A_marker[i2 + num_Acouple_cols] = i_r;
753 j3_ub = P->mainBlock->pattern->ptr[i2+1];
754 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
755 i_c = P->mainBlock->pattern->index[j3];
756 if (P_marker[i_c] < row_marker) {
757 P_marker[i_c] = sum;
758 sum++;
759 }
760 }
761 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
762 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
763 /* note that we need to map the column index in
764 P->col_coupleBlock back into the column index in
765 P_ext_couple */
766 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
767 if (P_marker[i_c] < row_marker) {
768 P_marker[i_c] = sum;
769 sum++;
770 }
771 }
772 }
773 }
774 }
775 }
776
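/* The counting pass above relies on the classic marker technique for
   symbolic sparse products: P_marker[c] < row_marker means column c has
   not been touched in the current output row yet, so a new non-zero is
   recorded; A_marker plays the same role one level up so rows of P are
   not expanded twice. A stripped-down sketch for a single CSR product
   C = A*B (illustrative names, scalar entries only): */
#if 0
static index_t count_nnz(dim_t n_rows,
                         const index_t *A_ptr, const index_t *A_idx,
                         const index_t *B_ptr, const index_t *B_idx,
                         index_t *marker /* size = #cols of B, all -1 */)
{
    index_t nnz = 0, row_start, jA, jB, c;
    dim_t i;
    for (i = 0; i < n_rows; i++) {
        row_start = nnz;
        for (jA = A_ptr[i]; jA < A_ptr[i+1]; jA++)
            for (jB = B_ptr[A_idx[jA]]; jB < B_ptr[A_idx[jA]+1]; jB++) {
                c = B_idx[jB];
                if (marker[c] < row_start) { /* first visit in this row */
                    marker[c] = nnz;
                    nnz++;
                }
            }
    }
    return nnz;
}
#endif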
777 /* Now that we have the number of non-zero elements in RAP_ext, allocate
778 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
779 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
780 RAP_ext_idx = MEMALLOC(sum, index_t);
781 RAP_ext_val = MEMALLOC(sum * block_size, double);
782 RA_val = TMPMEMALLOC(block_size, double);
783 RAP_val = TMPMEMALLOC(block_size, double);
784
785 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
786 sum = num_Pext_cols + num_Pmain_cols;
787 #pragma omp parallel for private(i) schedule(static)
788 for (i=0; i<sum; i++) P_marker[i] = -1;
789 #pragma omp parallel for private(i) schedule(static)
790 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
791 sum = 0;
792 RAP_ext_ptr[0] = 0;
793 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
794 row_marker = sum;
795 /* then, loop over elements in row i_r of R_couple */
796 j1_ub = R_couple->pattern->ptr[i_r+1];
797 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
798 i1 = R_couple->pattern->index[j1];
799 R_val = &(R_couple->val[j1*P_block_size]);
800
801 /* then, loop over elements in row i1 of A->col_coupleBlock */
802 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
803 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
804 i2 = A->col_coupleBlock->pattern->index[j2];
805 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
806 for (irb=0; irb<row_block_size; irb++)
807 for (icb=0; icb<col_block_size; icb++)
808 RA_val[irb+row_block_size*icb] = ZERO;
809 for (irb=0; irb<P_block_size; irb++) {
810 rtmp=R_val[irb];
811 for (icb=0; icb<col_block_size; icb++) {
812 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
813 }
814 }
815
816 /* check whether entry RA[i_r, i2] has been previously visited.
817 RAP new entry is possible only if entry RA[i_r, i2] has not
818 been visited yet */
819 if (A_marker[i2] != i_r) {
820 /* first, mark entry RA[i_r,i2] as visited */
821 A_marker[i2] = i_r;
822
823 /* then loop over elements in row i2 of P_ext_main */
824 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
825 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
826 i_c = P->row_coupleBlock->pattern->index[j3];
827 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
828 for (irb=0; irb<row_block_size; irb++)
829 for (icb=0; icb<col_block_size; icb++)
830 RAP_val[irb+row_block_size*icb] = ZERO;
831 for (icb=0; icb<P_block_size; icb++) {
832 rtmp = temp_val[icb];
833 for (irb=0; irb<row_block_size; irb++) {
834 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
835 }
836 }
837
838 /* check whether entry RAP[i_r,i_c] has been created.
839 If not yet created, create the entry and increase
840 the total number of elements in RAP_ext */
841 if (P_marker[i_c] < row_marker) {
842 P_marker[i_c] = sum;
843 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
844 RAP_ext_idx[sum] = i_c + offset;
845 sum++;
846 } else {
847 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
848 for (ib=0; ib<block_size; ib++) {
849 temp_val[ib] += RAP_val[ib];
850 }
851 }
852 }
853
854 /* loop over elements in row i2 of P_ext_couple, do the same */
855 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
856 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
857 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
858 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
859 for (irb=0; irb<row_block_size; irb++)
860 for (icb=0; icb<col_block_size; icb++)
861 RAP_val[irb+row_block_size*icb]=ZERO;
862 for (icb=0; icb<P_block_size; icb++) {
863 rtmp = temp_val[icb];
864 for (irb=0; irb<row_block_size; irb++) {
865 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
866 }
867 }
868
869 if (P_marker[i_c] < row_marker) {
870 P_marker[i_c] = sum;
871 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
872 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
873 sum++;
874 } else {
875 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
876 for (ib=0; ib<block_size; ib++) {
877 temp_val[ib] += RAP_val[ib];
878 }
879 }
880 }
881
882 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
883 Only the contributions are added. */
884 } else {
885 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
886 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
887 i_c = P->row_coupleBlock->pattern->index[j3];
888 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
889 for (irb=0; irb<row_block_size; irb++)
890 for (icb=0; icb<col_block_size; icb++)
891 RAP_val[irb+row_block_size*icb]=ZERO;
892 for (icb=0; icb<P_block_size; icb++) {
893 rtmp = temp_val[icb];
894 for (irb=0; irb<row_block_size; irb++) {
895 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
896 }
897 }
898
899 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
900 for (ib=0; ib<block_size; ib++) {
901 temp_val[ib] += RAP_val[ib];
902 }
903 }
904 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
905 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
906 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
907 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
908 for (irb=0; irb<row_block_size; irb++)
909 for (icb=0; icb<col_block_size; icb++)
910 RAP_val[irb+row_block_size*icb]=ZERO;
911 for (icb=0; icb<P_block_size; icb++) {
912 rtmp = temp_val[icb];
913 for (irb=0; irb<row_block_size; irb++) {
914 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
915 }
916 }
917
918 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
919 for (ib=0; ib<block_size; ib++) {
920 temp_val[ib] += RAP_val[ib];
921 }
922 }
923 }
924 }
925
926 /* now loop over elements in row i1 of A->mainBlock, repeat
927 the process we have done to block A->col_coupleBlock */
928 j2_ub = A->mainBlock->pattern->ptr[i1+1];
929 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
930 i2 = A->mainBlock->pattern->index[j2];
931 temp_val = &(A->mainBlock->val[j2*block_size]);
932 for (irb=0; irb<row_block_size; irb++)
933 for (icb=0; icb<col_block_size; icb++)
934 RA_val[irb+row_block_size*icb]=ZERO;
935 for (irb=0; irb<P_block_size; irb++) {
936 rtmp=R_val[irb];
937 for (icb=0; icb<col_block_size; icb++) {
938 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
939 }
940 }
941
942 if (A_marker[i2 + num_Acouple_cols] != i_r) {
943 A_marker[i2 + num_Acouple_cols] = i_r;
944 j3_ub = P->mainBlock->pattern->ptr[i2+1];
945 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
946 i_c = P->mainBlock->pattern->index[j3];
947 temp_val = &(P->mainBlock->val[j3*P_block_size]);
948 for (irb=0; irb<row_block_size; irb++)
949 for (icb=0; icb<col_block_size; icb++)
950 RAP_val[irb+row_block_size*icb]=ZERO;
951 for (icb=0; icb<P_block_size; icb++) {
952 rtmp = temp_val[icb];
953 for (irb=0; irb<row_block_size; irb++) {
954 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
955 }
956 }
957
958 if (P_marker[i_c] < row_marker) {
959 P_marker[i_c] = sum;
960 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
961 RAP_ext_idx[sum] = i_c + offset;
962 sum++;
963 } else {
964 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
965 for (ib=0; ib<block_size; ib++) {
966 temp_val[ib] += RAP_val[ib];
967 }
968 }
969 }
970 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
971 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
972 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
973 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
974 for (irb=0; irb<row_block_size; irb++)
975 for (icb=0; icb<col_block_size; icb++)
976 RAP_val[irb+row_block_size*icb]=ZERO;
977 for (icb=0; icb<P_block_size; icb++) {
978 rtmp=temp_val[icb];
979 for (irb=0; irb<row_block_size; irb++) {
980 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
981 }
982 }
983
984 if (P_marker[i_c] < row_marker) {
985 P_marker[i_c] = sum;
986 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
987 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
988 sum++;
989 } else {
990 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
991 for (ib=0; ib<block_size; ib++) {
992 temp_val[ib] += RAP_val[ib];
993 }
994 }
995 }
996 } else {
997 j3_ub = P->mainBlock->pattern->ptr[i2+1];
998 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
999 i_c = P->mainBlock->pattern->index[j3];
1000 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1001 for (irb=0; irb<row_block_size; irb++)
1002 for (icb=0; icb<col_block_size; icb++)
1003 RAP_val[irb+row_block_size*icb]=ZERO;
1004 for (icb=0; icb<P_block_size; icb++) {
1005 rtmp=temp_val[icb];
1006 for (irb=0; irb<row_block_size; irb++) {
1007 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1008 }
1009 }
1010
1011 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1012 for (ib=0; ib<block_size; ib++) {
1013 temp_val[ib] += RAP_val[ib];
1014 }
1015 }
1016 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1017 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1018 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1019 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1020 for (irb=0; irb<row_block_size; irb++)
1021 for (icb=0; icb<col_block_size; icb++)
1022 RAP_val[irb+row_block_size*icb]=ZERO;
1023 for (icb=0; icb<P_block_size; icb++) {
1024 rtmp=temp_val[icb];
1025 for (irb=0; irb<row_block_size; irb++) {
1026 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1027 }
1028 }
1029
1030 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1031 for (ib=0; ib<block_size; ib++) {
1032 temp_val[ib] += RAP_val[ib];
1033 }
1034 }
1035 }
1036 }
1037 }
1038 RAP_ext_ptr[i_r+1] = sum;
1039 }
1040 TMPMEMFREE(P_marker);
1041 TMPMEMFREE(Pcouple_to_Pext);
1042
1043 /* Now we have the part of RAP[r,c] where row "r" runs over the rows
1044 given by the column list of P->col_coupleBlock, and
1045 column "c" runs over a list of columns which possibly covers the
1046 whole column range of the system matrix P. This part of the data is to
1047 be passed to the neighboring processors, and added to the corresponding
1048 RAP[r,c] entries on the neighboring processors */
1049 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1050 &RAP_ext_val, global_id_P, block_size);
1051
1052 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1053 sum = RAP_ext_ptr[num_RAPext_rows];
1054 num_RAPext_cols = 0;
1055 if (num_Pext_cols || sum > 0) {
1056 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
1057 j1_ub = offset + num_Pmain_cols;
1058 for (i=0, iptr=0; i<sum; i++) {
1059 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1060 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1061 }
1062 for (j=0; j<num_Pext_cols; j++, iptr++){
1063 temp[iptr] = global_id_P[j];
1064 }
1065
1066 if (iptr) {
1067 #ifdef USE_QSORTG
1068 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1069 #else
1070 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1071 #endif
1072 num_RAPext_cols = 1;
1073 i = temp[0];
1074 for (j=1; j<iptr; j++) {
1075 if (temp[j] > i) {
1076 i = temp[j];
1077 temp[num_RAPext_cols++] = i;
1078 }
1079 }
1080 }
1081 }
1082
1083 /* store the distinct global column ids in global_id_RAP */
1084 if(num_RAPext_cols){
1085 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
1086 #pragma omp parallel for private(i) schedule(static)
1087 for (i=0; i<num_RAPext_cols; i++)
1088 global_id_RAP[i] = temp[i];
1089 }
1090 if (num_Pext_cols || sum > 0)
1091 TMPMEMFREE(temp);
1092 j1_ub = offset + num_Pmain_cols;
1093 #pragma omp parallel for private(i, where_p) schedule(static)
1094 for (i=0; i<sum; i++) {
1095 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1096 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1097 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1098 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1099 } else
1100 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1101 }
1102
1103 /* build the mapping */
1104 if (num_Pcouple_cols) {
1105 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
1106 iptr = 0;
1107 for (i=0; i<num_RAPext_cols; i++)
1108 if (global_id_RAP[i] == P->global_id[iptr]) {
1109 Pcouple_to_RAP[iptr++] = i;
1110 if (iptr == num_Pcouple_cols) break;
1111 }
1112 }
1113
1114 if (num_Pext_cols) {
1115 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
1116 iptr = 0;
1117 for (i=0; i<num_RAPext_cols; i++)
1118 if (global_id_RAP[i] == global_id_P[iptr]) {
1119 Pext_to_RAP[iptr++] = i;
1120 if (iptr == num_Pext_cols) break;
1121 }
1122 }
1123
1124 if (global_id_P){
1125 TMPMEMFREE(global_id_P);
1126 global_id_P = NULL;
1127 }
1128
1129 /* alloc and initialize the markers */
1130 sum = num_RAPext_cols + num_Pmain_cols;
1131 P_marker = TMPMEMALLOC(sum, index_t);
1132 #pragma omp parallel for private(i) schedule(static)
1133 for (i=0; i<sum; i++) P_marker[i] = -1;
1134 #pragma omp parallel for private(i) schedule(static)
1135 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1136
1137 /* Now, count the size of RAP. Start with rows in R_main */
1138 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1139 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1140 shared = P->col_coupler->connector->send->shared;
1141 i = 0;
1142 j = 0;
1143 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1144 /* Mark the start of row for both main block and couple block */
1145 row_marker = i;
1146 row_marker_ext = j;
1147
1148 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1149 in RAP_ext */
1150 P_marker[i_r] = i;
1151 i++;
1152 for (j1=0; j1<num_neighbors; j1++) {
1153 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1154 if (shared[j2] == i_r) {
1155 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1156 i_c = RAP_ext_idx[k];
1157 if (i_c < num_Pmain_cols) {
1158 if (P_marker[i_c] < row_marker) {
1159 P_marker[i_c] = i;
1160 i++;
1161 }
1162 } else {
1163 if (P_marker[i_c] < row_marker_ext) {
1164 P_marker[i_c] = j;
1165 j++;
1166 }
1167 }
1168 }
1169 break;
1170 }
1171 }
1172 }
1173
1174 /* then, loop over elements in row i_r of R_main */
1175 j1_ub = R_main->pattern->ptr[i_r+1];
1176 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1177 i1 = R_main->pattern->index[j1];
1178
1179 /* then, loop over elements in row i1 of A->col_coupleBlock */
1180 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1181 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1182 i2 = A->col_coupleBlock->pattern->index[j2];
1183
1184 /* check whether entry RA[i_r, i2] has been previously visited.
1185 RAP new entry is possible only if entry RA[i_r, i2] has not
1186 been visited yet */
1187 if (A_marker[i2] != i_r) {
1188 /* first, mark entry RA[i_r,i2] as visited */
1189 A_marker[i2] = i_r;
1190
1191 /* then loop over elements in row i2 of P_ext_main */
1192 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1193 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1194 i_c = P->row_coupleBlock->pattern->index[j3];
1195
1196 /* check whether entry RAP[i_r,i_c] has been created.
1197 If not yet created, create the entry and increase
1198 the total number of elements in RAP_ext */
1199 if (P_marker[i_c] < row_marker) {
1200 P_marker[i_c] = i;
1201 i++;
1202 }
1203 }
1204
1205 /* loop over elements in row i2 of P_ext_couple, do the same */
1206 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1207 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1208 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1209 if (P_marker[i_c] < row_marker_ext) {
1210 P_marker[i_c] = j;
1211 j++;
1212 }
1213 }
1214 }
1215 }
1216
1217 /* now loop over elements in row i1 of A->mainBlock, repeat
1218 the process we have done to block A->col_coupleBlock */
1219 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1220 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1221 i2 = A->mainBlock->pattern->index[j2];
1222 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1223 A_marker[i2 + num_Acouple_cols] = i_r;
1224 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1225 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1226 i_c = P->mainBlock->pattern->index[j3];
1227 if (P_marker[i_c] < row_marker) {
1228 P_marker[i_c] = i;
1229 i++;
1230 }
1231 }
1232 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1233 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1234 /* note that we need to map the column index in
1235 P->col_coupleBlock back into the column index in
1236 P_ext_couple */
1237 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1238 if (P_marker[i_c] < row_marker_ext) {
1239 P_marker[i_c] = j;
1240 j++;
1241 }
1242 }
1243 }
1244 }
1245 }
1246 }
1247
1248 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1249 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1250 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1251 RAP_couple */
1252 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1253 RAP_main_idx = MEMALLOC(i, index_t);
1254 RAP_main_val = TMPMEMALLOC(i * block_size, double);
1255 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1256 RAP_couple_idx = MEMALLOC(j, index_t);
1257 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
1258
1259 RAP_main_ptr[num_Pmain_cols] = i;
1260 RAP_couple_ptr[num_Pmain_cols] = j;
1261
1262 #pragma omp parallel for private(i) schedule(static)
1263 for (i=0; i<sum; i++) P_marker[i] = -1;
1264 #pragma omp parallel for private(i) schedule(static)
1265 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1266
1267 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1268 in R_main */
1269 i = 0;
1270 j = 0;
1271 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1272 /* Mark the start of row for both main block and couple block */
1273 row_marker = i;
1274 row_marker_ext = j;
1275 RAP_main_ptr[i_r] = row_marker;
1276 RAP_couple_ptr[i_r] = row_marker_ext;
1277
1278 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1279 in row i_r of RAP_ext */
1280 P_marker[i_r] = i;
1281 for (ib=0; ib<block_size; ib++)
1282 RAP_main_val[i*block_size+ib] = ZERO;
1283 RAP_main_idx[i] = i_r;
1284 i++;
1285
1286 for (j1=0; j1<num_neighbors; j1++) {
1287 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1288 if (shared[j2] == i_r) {
1289 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1290 i_c = RAP_ext_idx[k];
1291 if (i_c < num_Pmain_cols) {
1292 if (P_marker[i_c] < row_marker) {
1293 P_marker[i_c] = i;
1294 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1295 RAP_main_idx[i] = i_c;
1296 i++;
1297 } else {
1298 t1_val = &(RAP_ext_val[k*block_size]);
1299 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1300 for (ib=0; ib<block_size; ib++)
1301 t2_val[ib] += t1_val[ib];
1302 }
1303 } else {
1304 if (P_marker[i_c] < row_marker_ext) {
1305 P_marker[i_c] = j;
1306 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1307 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1308 j++;
1309 } else {
1310 t1_val = &(RAP_ext_val[k*block_size]);
1311 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1312 for (ib=0; ib<block_size; ib++)
1313 t2_val[ib] += t1_val[ib];
1314 }
1315 }
1316 }
1317 break;
1318 }
1319 }
1320 }
1321
1322 /* then, loop over elements in row i_r of R_main */
1323 j1_ub = R_main->pattern->ptr[i_r+1];
1324 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1325 i1 = R_main->pattern->index[j1];
1326 R_val = &(R_main->val[j1*P_block_size]);
1327
1328 /* then, loop over elements in row i1 of A->col_coupleBlock */
1329 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1330 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1331 i2 = A->col_coupleBlock->pattern->index[j2];
1332 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1333 for (irb=0; irb<row_block_size; irb++)
1334 for (icb=0; icb<col_block_size; icb++)
1335 RA_val[irb+row_block_size*icb]=ZERO;
1336 for (irb=0; irb<P_block_size; irb++) {
1337 rtmp=R_val[irb];
1338 for (icb=0; icb<col_block_size; icb++) {
1339 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb]; /* row_block_size stride, as in the other RA_val updates */
1340 }
1341 }
1342
1343
1344 /* check whether entry RA[i_r, i2] has been previously visited.
1345 RAP new entry is possible only if entry RA[i_r, i2] has not
1346 been visited yet */
1347 if (A_marker[i2] != i_r) {
1348 /* first, mark entry RA[i_r,i2] as visited */
1349 A_marker[i2] = i_r;
1350
1351 /* then loop over elements in row i2 of P_ext_main */
1352 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1353 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1354 i_c = P->row_coupleBlock->pattern->index[j3];
1355 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1356 for (irb=0; irb<row_block_size; irb++)
1357 for (icb=0; icb<col_block_size; icb++)
1358 RAP_val[irb+row_block_size*icb]=ZERO;
1359 for (icb=0; icb<P_block_size; icb++) {
1360 rtmp=temp_val[icb];
1361 for (irb=0; irb<row_block_size; irb++) {
1362 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1363 }
1364 }
1365
1366
1367 /* check whether entry RAP[i_r,i_c] has been created.
1368 If not yet created, create the entry and increase
1369 the total number of elements in RAP_ext */
1370 if (P_marker[i_c] < row_marker) {
1371 P_marker[i_c] = i;
1372 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1373 RAP_main_idx[i] = i_c;
1374 i++;
1375 } else {
1376 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1377 for (ib=0; ib<block_size; ib++) {
1378 temp_val[ib] += RAP_val[ib];
1379 }
1380 }
1381 }
1382
1383 /* loop over elements in row i2 of P_ext_couple, do the same */
1384 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1385 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1386 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1387 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1388 for (irb=0; irb<row_block_size; irb++)
1389 for (icb=0; icb<col_block_size; icb++)
1390 RAP_val[irb+row_block_size*icb]=ZERO;
1391 for (icb=0; icb<P_block_size; icb++) {
1392 rtmp=temp_val[icb];
1393 for (irb=0; irb<row_block_size; irb++) {
1394 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1395 }
1396 }
1397
1398 if (P_marker[i_c] < row_marker_ext) {
1399 P_marker[i_c] = j;
1400 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1401 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1402 j++;
1403 } else {
1404 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1405 for (ib=0; ib<block_size; ib++) {
1406 temp_val[ib] += RAP_val[ib];
1407 }
1408 }
1409 }
1410
1411 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1412 Only the contributions are added. */
1413 } else {
1414 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1415 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1416 i_c = P->row_coupleBlock->pattern->index[j3];
1417 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1418 for (irb=0; irb<row_block_size; irb++)
1419 for (icb=0; icb<col_block_size; icb++)
1420 RAP_val[irb+row_block_size*icb]=ZERO;
1421 for (icb=0; icb<P_block_size; icb++) {
1422 rtmp=temp_val[icb];
1423 for (irb=0; irb<row_block_size; irb++) {
1424 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1425 }
1426 }
1427
1428 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1429 for (ib=0; ib<block_size; ib++) {
1430 temp_val[ib] += RAP_val[ib];
1431 }
1432 }
1433 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1434 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1435 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1436 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1437 for (irb=0; irb<row_block_size; irb++)
1438 for (icb=0; icb<col_block_size; icb++)
1439 RAP_val[irb+row_block_size*icb]=ZERO;
1440 for (icb=0; icb<P_block_size; icb++) {
1441 rtmp=temp_val[icb];
1442 for (irb=0; irb<row_block_size; irb++) {
1443 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1444 }
1445 }
1446
1447 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1448 for (ib=0; ib<block_size; ib++) {
1449 temp_val[ib] += RAP_val[ib];
1450 }
1451 }
1452 }
1453 }
1454
1455 /* now loop over elements in row i1 of A->mainBlock, repeat
1456 the process we have done to block A->col_coupleBlock */
1457 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1458 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1459 i2 = A->mainBlock->pattern->index[j2];
1460 temp_val = &(A->mainBlock->val[j2*block_size]);
1461 for (irb=0; irb<row_block_size; irb++)
1462 for (icb=0; icb<col_block_size; icb++)
1463 RA_val[irb+row_block_size*icb]=ZERO;
1464 for (irb=0; irb<P_block_size; irb++) {
1465 rtmp=R_val[irb];
1466 for (icb=0; icb<col_block_size; icb++) {
1467 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1468 }
1469 }
1470
1471 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1472 A_marker[i2 + num_Acouple_cols] = i_r;
1473 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1474 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1475 i_c = P->mainBlock->pattern->index[j3];
1476 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1477 for (irb=0; irb<row_block_size; irb++)
1478 for (icb=0; icb<col_block_size; icb++)
1479 RAP_val[irb+row_block_size*icb]=ZERO;
1480 for (icb=0; icb<P_block_size; icb++) {
1481 rtmp=temp_val[icb];
1482 for (irb=0; irb<row_block_size; irb++) {
1483 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1484 }
1485 }
1486 if (P_marker[i_c] < row_marker) {
1487 P_marker[i_c] = i;
1488 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1489 RAP_main_idx[i] = i_c;
1490 i++;
1491 } else {
1492 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1493 for (ib=0; ib<block_size; ib++) {
1494 temp_val[ib] += RAP_val[ib];
1495 }
1496 }
1497 }
1498 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1499 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1500 /* note that we need to map the column index in
1501 P->col_coupleBlock back into the column index in
1502 P_ext_couple */
1503 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1504 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1505 for (irb=0; irb<row_block_size; irb++)
1506 for (icb=0; icb<col_block_size; icb++)
1507 RAP_val[irb+row_block_size*icb]=ZERO;
1508 for (icb=0; icb<P_block_size; icb++) {
1509 rtmp=temp_val[icb];
1510 for (irb=0; irb<row_block_size; irb++) {
1511 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1512 }
1513 }
1514
1515 if (P_marker[i_c] < row_marker_ext) {
1516 P_marker[i_c] = j;
1517 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1518 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1519 j++;
1520 } else {
1521 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1522 for (ib=0; ib<block_size; ib++) {
1523 temp_val[ib] += RAP_val[ib];
1524 }
1525 }
1526 }
1527
1528 } else {
1529 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1530 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1531 i_c = P->mainBlock->pattern->index[j3];
1532 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1533 for (irb=0; irb<row_block_size; irb++)
1534 for (icb=0; icb<col_block_size; icb++)
1535 RAP_val[irb+row_block_size*icb]=ZERO;
1536 for (icb=0; icb<P_block_size; icb++) {
1537 rtmp=temp_val[icb];
1538 for (irb=0; irb<row_block_size; irb++) {
1539 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1540 }
1541 }
1542
1543 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1544 for (ib=0; ib<block_size; ib++) {
1545 temp_val[ib] += RAP_val[ib];
1546 }
1547 }
1548 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1549 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1550 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1551 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1552 for (irb=0; irb<row_block_size; irb++)
1553 for (icb=0; icb<col_block_size; icb++)
1554 RAP_val[irb+row_block_size*icb]=ZERO;
1555 for (icb=0; icb<P_block_size; icb++) {
1556 rtmp = temp_val[icb];
1557 for (irb=0; irb<row_block_size; irb++) {
1558 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1559 }
1560 }
1561
1562 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1563 for (ib=0; ib<block_size; ib++) {
1564 temp_val[ib] += RAP_val[ib];
1565 }
1566 }
1567 }
1568 }
1569 }
1570
1571 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
1572 if (i > row_marker) {
1573 offset = i - row_marker;
1574 temp = TMPMEMALLOC(offset, index_t);
1575 #pragma omp parallel for schedule(static) private(iptr)
1576 for (iptr=0; iptr<offset; iptr++)
1577 temp[iptr] = RAP_main_idx[row_marker+iptr];
1578 if (offset > 0) {
1579 #ifdef USE_QSORTG
1580 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1581 #else
1582 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1583 #endif
1584 }
1585 temp_val = TMPMEMALLOC(offset * block_size, double);
1586 #pragma omp parallel for schedule(static) private(iptr,k)
1587 for (iptr=0; iptr<offset; iptr++){
1588 k = P_marker[temp[iptr]];
1589 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1590 P_marker[temp[iptr]] = iptr + row_marker;
1591 }
1592 #pragma omp parallel for schedule(static) private(iptr)
1593 for (iptr=0; iptr<offset; iptr++){
1594 RAP_main_idx[row_marker+iptr] = temp[iptr];
1595 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1596 }
1597 TMPMEMFREE(temp);
1598 TMPMEMFREE(temp_val);
1599 }
1600 if (j > row_marker_ext) {
1601 offset = j - row_marker_ext;
1602 temp = TMPMEMALLOC(offset, index_t);
1603 #pragma omp parallel for schedule(static) private(iptr)
1604 for (iptr=0; iptr<offset; iptr++)
1605 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1606 if (offset > 0) {
1607 #ifdef USE_QSORTG
1608 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1609 #else
1610 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1611 #endif
1612 }
1613 temp_val = TMPMEMALLOC(offset * block_size, double);
1614 #pragma omp parallel for schedule(static) private(iptr, k)
1615 for (iptr=0; iptr<offset; iptr++){
1616 k = P_marker[temp[iptr] + num_Pmain_cols];
1617 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1618 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1619 }
1620 #pragma omp parallel for schedule(static) private(iptr)
1621 for (iptr=0; iptr<offset; iptr++){
1622 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1623 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1624 }
1625 TMPMEMFREE(temp);
1626 TMPMEMFREE(temp_val);
1627 }
1628 }
1629
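/* Editor's note: the sorting step above uses a marker-based permutation:
   for each row, the freshly appended column indices are sorted, and
   P_marker (which still maps each column id to the slot where its value
   block was written) is used to drag the block values into the new order.
   A minimal sketch of the same idea for a scalar CSR row segment [lo,hi),
   assuming hypothetical arrays idx/val/tmp/tmp_val and a marker filled in
   during assembly (illustration only, not part of the build):

       n = hi - lo;
       for (t=0; t<n; t++) tmp[t] = idx[lo+t];
       qsort(tmp, (size_t)n, sizeof(index_t), Paso_comparIndex);
       for (t=0; t<n; t++) {
           k = marker[tmp[t]];            old slot of column tmp[t]
           tmp_val[t] = val[k];
           marker[tmp[t]] = lo + t;       new, sorted slot
       }
       for (t=0; t<n; t++) {
           idx[lo+t] = tmp[t];
           val[lo+t] = tmp_val[t];
       }
*/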
1630 TMPMEMFREE(RA_val);
1631 TMPMEMFREE(RAP_val);
1632 TMPMEMFREE(A_marker);
1633 TMPMEMFREE(Pext_to_RAP);
1634 TMPMEMFREE(Pcouple_to_RAP);
1635 MEMFREE(RAP_ext_ptr);
1636 MEMFREE(RAP_ext_idx);
1637 MEMFREE(RAP_ext_val);
1638 Paso_SparseMatrix_free(R_main);
1639 Paso_SparseMatrix_free(R_couple);
1640
1641 /* Check whether there are empty columns in RAP_couple */
1642 #pragma omp parallel for schedule(static) private(i)
1643 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1644 /* num of non-empty columns is stored in "k" */
1645 k = 0;
1646 j = RAP_couple_ptr[num_Pmain_cols];
1647 for (i=0; i<j; i++) {
1648 i1 = RAP_couple_idx[i];
1649 if (P_marker[i1]) {
1650 P_marker[i1] = 0;
1651 k++;
1652 }
1653 }
1654
1655 /* empty columns are found */
1656 if (k < num_RAPext_cols) {
1657 temp = TMPMEMALLOC(k, index_t);
1658 k = 0;
1659 for (i=0; i<num_RAPext_cols; i++)
1660 if (!P_marker[i]) {
1661 P_marker[i] = k;
1662 temp[k] = global_id_RAP[i];
1663 k++;
1664 }
1665 #pragma omp parallel for schedule(static) private(i, i1)
1666 for (i=0; i<j; i++) {
1667 i1 = RAP_couple_idx[i];
1668 RAP_couple_idx[i] = P_marker[i1];
1669 }
1670 num_RAPext_cols = k;
1671 TMPMEMFREE(global_id_RAP);
1672 global_id_RAP = temp;
1673 }
1674 TMPMEMFREE(P_marker);
1675
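/* Editor's note: the compression above renumbers the columns of
   RAP_couple after empty ones are dropped: P_marker[i] receives the new
   index of column i, global_id_RAP is shrunk to the surviving ids, and
   every stored index is rewritten through P_marker. The same renumbering
   on a plain index array (hypothetical names, illustration only):

       k = 0;
       for (i=0; i<ncols; i++)
           if (used[i]) new_id[i] = k++;
       for (t=0; t<nnz; t++)
           idx[t] = new_id[idx[t]];
*/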
1676 /******************************************************/
1677 /* Start to create the coarse level System Matrix A_c */
1678 /******************************************************/
1679 /* first, prepare the sender/receiver for the col_connector */
1680 dist = P->pattern->input_distribution->first_component;
1681 recv_len = TMPMEMALLOC(size, dim_t);
1682 send_len = TMPMEMALLOC(size, dim_t);
1683 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
1684 offsetInShared = TMPMEMALLOC(size+1, index_t);
1685 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
1686 memset(recv_len, 0, sizeof(dim_t) * size);
1687 memset(send_len, 0, sizeof(dim_t) * size);
1688 num_neighbors = 0;
1689 offsetInShared[0] = 0;
1690 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1691 shared[i] = i + num_Pmain_cols;
1692 if (k <= global_id_RAP[i]) {
1693 if (recv_len[j] > 0) {
1694 neighbor[num_neighbors] = j;
1695 num_neighbors ++;
1696 offsetInShared[num_neighbors] = i;
1697 }
1698 while (k <= global_id_RAP[i]) {
1699 j++;
1700 k = dist[j+1];
1701 }
1702 }
1703 recv_len[j] ++;
1704 }
1705 if (recv_len[j] > 0) {
1706 neighbor[num_neighbors] = j;
1707 num_neighbors ++;
1708 offsetInShared[num_neighbors] = i;
1709 }
1710 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1711 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1712
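/* Editor's note: the loop above relies on global_id_RAP being sorted:
   it walks the ids once, in step with the cumulative distribution dist,
   where dist[q] is the first global column owned by rank q and dist[q+1]
   is one past its last. The equivalent owner lookup for a single global
   id gid (hypothetical names, illustration only):

       q = 0;
       while (gid >= dist[q+1]) q++;
*/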
1713 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1714
1715 #ifdef ESYS_MPI
1716 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
1717 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
1718 #else
1719 mpi_requests=TMPMEMALLOC(size*2,int);
1720 mpi_stati=TMPMEMALLOC(size*2,int);
1721 #endif
1722 num_neighbors = 0;
1723 j = 0;
1724 offsetInShared[0] = 0;
1725 for (i=0; i<size; i++) {
1726 if (send_len[i] > 0) {
1727 neighbor[num_neighbors] = i;
1728 num_neighbors ++;
1729 j += send_len[i];
1730 offsetInShared[num_neighbors] = j;
1731 }
1732 }
1733 TMPMEMFREE(shared);
1734 shared = TMPMEMALLOC(j, index_t);
1735 for (i=0, j=0; i<num_neighbors; i++) {
1736 k = neighbor[i];
1737 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1738 mpi_info->msg_tag_counter+k,
1739 mpi_info->comm, &mpi_requests[i]);
1740 j += send_len[k];
1741 }
1742 for (i=0, j=0; i<recv->numNeighbors; i++) {
1743 k = recv->neighbor[i];
1744 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1745 mpi_info->msg_tag_counter+rank,
1746 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1747 j += recv_len[k];
1748 }
1749 MPI_Waitall(num_neighbors + recv->numNeighbors,
1750 mpi_requests, mpi_stati);
1751 mpi_info->msg_tag_counter += size;
1752
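/* Editor's note: the exchange above follows the usual pattern in this
   file: post all MPI_Irecv's, post all MPI_Issend's, then one
   MPI_Waitall; tags are msg_tag_counter plus the sender's rank so that
   concurrent exchanges do not collide, and msg_tag_counter is advanced
   by size afterwards. Skeleton of the pattern (hypothetical names,
   illustration only):

       for (p=0; p<num_recv; p++)
           MPI_Irecv(rbuf[p], rlen[p], MPI_INT, src[p],
                     tag0 + src[p], comm, &req[p]);
       for (p=0; p<num_send; p++)
           MPI_Issend(sbuf[p], slen[p], MPI_INT, dst[p],
                      tag0 + rank, comm, &req[num_recv + p]);
       MPI_Waitall(num_recv + num_send, req, stat);
       tag0 += size;
*/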
1753 j = offsetInShared[num_neighbors];
1754 offset = dist[rank];
1755 #pragma omp parallel for schedule(static) private(i)
1756 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1757 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1758 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1759
1760 col_connector = Paso_Connector_alloc(send, recv);
1761 TMPMEMFREE(shared);
1762 Paso_SharedComponents_free(recv);
1763 Paso_SharedComponents_free(send);
1764
1765 /* now, create row distribution (output_distribution) and col
1766 distribution (input_distribution) */
1767 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1768 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1769
1770 /* then, prepare the sender/receiver for the row_connector, first, prepare
1771 the information for sender */
1772 sum = RAP_couple_ptr[num_Pmain_cols];
1773 len = TMPMEMALLOC(size, dim_t);
1774 send_ptr = TMPMEMALLOC(size, index_t*);
1775 send_idx = TMPMEMALLOC(size, index_t*);
1776 #pragma omp parallel for schedule(static) private(i)
1777 for (i=0; i<size; i++) {
1778 send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
1779 send_idx[i] = TMPMEMALLOC(sum, index_t);
1780 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1781 }
1782 memset(len, 0, sizeof(dim_t) * size);
1783 recv = col_connector->recv;
1784 sum=0;
1785 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1786 i1 = RAP_couple_ptr[i_r];
1787 i2 = RAP_couple_ptr[i_r+1];
1788 if (i2 > i1) {
1789 /* then row i_r will be in the sender of row_connector, now check
1790 how many neighbors i_r needs to be sent to */
1791 for (j=i1; j<i2; j++) {
1792 i_c = RAP_couple_idx[j];
1793 /* find out the corresponding neighbor "p" of column i_c */
1794 for (p=0; p<recv->numNeighbors; p++) {
1795 if (i_c < recv->offsetInShared[p+1]) {
1796 k = recv->neighbor[p];
1797 if (send_ptr[k][i_r] == 0) sum++;
1798 send_ptr[k][i_r] ++;
1799 send_idx[k][len[k]] = global_id_RAP[i_c];
1800 len[k] ++;
1801 break;
1802 }
1803 }
1804 }
1805 }
1806 }
1807 if (global_id_RAP) {
1808 TMPMEMFREE(global_id_RAP);
1809 global_id_RAP = NULL;
1810 }
1811
1812 /* now allocate the sender */
1813 shared = TMPMEMALLOC(sum, index_t);
1814 memset(send_len, 0, sizeof(dim_t) * size);
1815 num_neighbors=0;
1816 offsetInShared[0] = 0;
1817 for (p=0, k=0; p<size; p++) {
1818 for (i=0; i<num_Pmain_cols; i++) {
1819 if (send_ptr[p][i] > 0) {
1820 shared[k] = i;
1821 k++;
1822 send_ptr[p][send_len[p]] = send_ptr[p][i];
1823 send_len[p]++;
1824 }
1825 }
1826 if (k > offsetInShared[num_neighbors]) {
1827 neighbor[num_neighbors] = p;
1828 num_neighbors ++;
1829 offsetInShared[num_neighbors] = k;
1830 }
1831 }
1832 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1833 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1834
1835 /* send/recv the number of rows to be sent from the current proc;
1836 recover info for the receiver of row_connector from the sender */
1837 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1838 num_neighbors = 0;
1839 offsetInShared[0] = 0;
1840 j = 0;
1841 for (i=0; i<size; i++) {
1842 if (i != rank && recv_len[i] > 0) {
1843 neighbor[num_neighbors] = i;
1844 num_neighbors ++;
1845 j += recv_len[i];
1846 offsetInShared[num_neighbors] = j;
1847 }
1848 }
1849 TMPMEMFREE(shared);
1850 TMPMEMFREE(recv_len);
1851 shared = TMPMEMALLOC(j, index_t);
1852 k = offsetInShared[num_neighbors];
1853 #pragma omp parallel for schedule(static) private(i)
1854 for (i=0; i<k; i++) {
1855 shared[i] = i + num_Pmain_cols;
1856 }
1857 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1858 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1859 row_connector = Paso_Connector_alloc(send, recv);
1860 TMPMEMFREE(shared);
1861 Paso_SharedComponents_free(recv);
1862 Paso_SharedComponents_free(send);
1863
1864 /* send/recv pattern->ptr for rowCoupleBlock */
1865 num_RAPext_rows = offsetInShared[num_neighbors];
1866 row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
1867 for (p=0; p<num_neighbors; p++) {
1868 j = offsetInShared[p];
1869 i = offsetInShared[p+1];
1870 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1871 mpi_info->msg_tag_counter+neighbor[p],
1872 mpi_info->comm, &mpi_requests[p]);
1873 }
1874 send = row_connector->send;
1875 for (p=0; p<send->numNeighbors; p++) {
1876 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1877 MPI_INT, send->neighbor[p],
1878 mpi_info->msg_tag_counter+rank,
1879 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1880 }
1881 MPI_Waitall(num_neighbors + send->numNeighbors,
1882 mpi_requests, mpi_stati);
1883 mpi_info->msg_tag_counter += size;
1884 TMPMEMFREE(send_len);
1885
1886 sum = 0;
1887 for (i=0; i<num_RAPext_rows; i++) {
1888 k = row_couple_ptr[i];
1889 row_couple_ptr[i] = sum;
1890 sum += k;
1891 }
1892 row_couple_ptr[num_RAPext_rows] = sum;
1893
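/* Editor's note: the loop above turns the received per-row entry counts
   into CSR-style offsets in place, i.e. an exclusive prefix sum
   (hypothetical names, illustration only):

       sum = 0;
       for (i=0; i<n; i++) { k = cnt[i]; cnt[i] = sum; sum += k; }
       cnt[n] = sum;
*/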
1894 /* send/recv pattern->index for rowCoupleBlock */
1895 k = row_couple_ptr[num_RAPext_rows];
1896 row_couple_idx = MEMALLOC(k, index_t);
1897 for (p=0; p<num_neighbors; p++) {
1898 j1 = row_couple_ptr[offsetInShared[p]];
1899 j2 = row_couple_ptr[offsetInShared[p+1]];
1900 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1901 mpi_info->msg_tag_counter+neighbor[p],
1902 mpi_info->comm, &mpi_requests[p]);
1903 }
1904 for (p=0; p<send->numNeighbors; p++) {
1905 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1906 MPI_INT, send->neighbor[p],
1907 mpi_info->msg_tag_counter+rank,
1908 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1909 }
1910 MPI_Waitall(num_neighbors + send->numNeighbors,
1911 mpi_requests, mpi_stati);
1912 mpi_info->msg_tag_counter += size;
1913
1914 offset = input_dist->first_component[rank];
1915 k = row_couple_ptr[num_RAPext_rows];
1916 #pragma omp parallel for schedule(static) private(i)
1917 for (i=0; i<k; i++) {
1918 row_couple_idx[i] -= offset;
1919 }
1920 #pragma omp parallel for schedule(static) private(i)
1921 for (i=0; i<size; i++) {
1922 TMPMEMFREE(send_ptr[i]);
1923 TMPMEMFREE(send_idx[i]);
1924 }
1925 TMPMEMFREE(send_ptr);
1926 TMPMEMFREE(send_idx);
1927 TMPMEMFREE(len);
1928 TMPMEMFREE(offsetInShared);
1929 TMPMEMFREE(neighbor);
1930 TMPMEMFREE(mpi_requests);
1931 TMPMEMFREE(mpi_stati);
1932 Esys_MPIInfo_free(mpi_info);
1933
1934 /* Now, we can create pattern for mainBlock and coupleBlock */
1935 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1936 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1937 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1938 num_Pmain_cols, num_RAPext_cols,
1939 RAP_couple_ptr, RAP_couple_idx);
1940 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1941 num_RAPext_rows, num_Pmain_cols,
1942 row_couple_ptr, row_couple_idx);
1943
1944 /* next, create the system matrix */
1945 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
1946 output_dist, input_dist, main_pattern, col_couple_pattern,
1947 row_couple_pattern, col_connector, row_connector);
1948 out = Paso_SystemMatrix_alloc(A->type, pattern,
1949 row_block_size, col_block_size, FALSE);
1950
1951 /* finally, fill in the data */
1952 memcpy(out->mainBlock->val, RAP_main_val,
1953 out->mainBlock->len * sizeof(double));
1954 memcpy(out->col_coupleBlock->val, RAP_couple_val,
1955 out->col_coupleBlock->len * sizeof(double));
1956
1957 /* Clean up */
1958 Paso_SystemMatrixPattern_free(pattern);
1959 Paso_Pattern_free(main_pattern);
1960 Paso_Pattern_free(col_couple_pattern);
1961 Paso_Pattern_free(row_couple_pattern);
1962 Paso_Connector_free(col_connector);
1963 Paso_Connector_free(row_connector);
1964 Paso_Distribution_free(output_dist);
1965 Paso_Distribution_free(input_dist);
1966 TMPMEMFREE(RAP_main_val);
1967 TMPMEMFREE(RAP_couple_val);
1968 if (Esys_noError()) {
1969 return out;
1970 } else {
1971 Paso_SystemMatrix_free(out);
1972 return NULL;
1973 }
1974 }
1975
1976
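/* Editor's illustration: both interpolation builders in this file count
   the fill of the triple product row by row with two marker arrays
   before allocating anything. The hypothetical helper below (not
   referenced by this file) shows the technique for one row r of R*A*P on
   plain CSR patterns; both markers must be initialised to -1 by the
   caller, and row_start is the running entry count on entry. */
static index_t sketch_count_RAP_row(const index_t* R_ptr, const index_t* R_idx,
                                    const index_t* A_ptr, const index_t* A_idx,
                                    const index_t* P_ptr, const index_t* P_idx,
                                    index_t* A_marker, index_t* P_marker,
                                    const index_t r, const index_t row_start)
{
   index_t sum = row_start, j1, j2, j3, i1, i2, i_c;
   for (j1=R_ptr[r]; j1<R_ptr[r+1]; j1++) {
      i1 = R_idx[j1];
      for (j2=A_ptr[i1]; j2<A_ptr[i1+1]; j2++) {
         i2 = A_idx[j2];
         if (A_marker[i2] == r) continue;    /* RA[r,i2] already visited */
         A_marker[i2] = r;
         for (j3=P_ptr[i2]; j3<P_ptr[i2+1]; j3++) {
            i_c = P_idx[j3];
            if (P_marker[i_c] < row_start) { /* first appearance of RAP[r,i_c] */
               P_marker[i_c] = sum;
               sum++;
            }
         }
      }
   }
   return sum;   /* total entry count after processing row r */
}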
1977 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
1978 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
1979 {
1980 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
1981 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
1982 Paso_SystemMatrix *out=NULL;
1983 Paso_SystemMatrixPattern *pattern=NULL;
1984 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
1985 Paso_SharedComponents *send =NULL, *recv=NULL;
1986 Paso_Connector *col_connector=NULL, *row_connector=NULL;
1987 Paso_Coupler *coupler=NULL;
1988 Paso_Pattern *main_pattern=NULL;
1989 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
1990 const dim_t row_block_size=A->row_block_size;
1991 const dim_t col_block_size=A->col_block_size;
1992 const dim_t block_size = A->block_size;
1993 const dim_t P_block_size = P->block_size;
1994 const dim_t num_threads=omp_get_max_threads();
1995 const double ZERO = 0.0;
1996 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
1997 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
1998 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
1999 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2000 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2001 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2002 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2003 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2004 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2005 index_t sum, i, j, k, iptr, irb, icb, ib;
2006 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2007 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2008 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2009 index_t row_marker_ext, *where_p=NULL;
2010 index_t **send_ptr=NULL, **send_idx=NULL;
2011 dim_t l, p, q, global_label, num_neighbors;
2012 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2013 Esys_MPI_rank *neighbor=NULL;
2014 #ifdef ESYS_MPI
2015 MPI_Request* mpi_requests=NULL;
2016 MPI_Status* mpi_stati=NULL;
2017 #else
2018 int *mpi_requests=NULL, *mpi_stati=NULL;
2019 #endif
2020
2021 /* two sparse matrices R_main and R_couple will be generated, as the
2022 transposes of P_main and P_col_couple, respectively. Note that
2023 R_couple is actually the row_coupleBlock of R (R=P^T) */
2024 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2025 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2026
2027 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
2028 and needed locally for the triple matrix product RAP.
2029 To be specific, P_ext (on processor k) is the group of rows in P, where
2030 the list of rows from processor q is given by
2031 A->col_coupler->connector->send->shared[rPtr...]
2032 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2033 on q.
2034 P_ext is represented by two sparse matrices P_ext_main and
2035 P_ext_couple */
2036 Paso_Preconditioner_AMG_extendB(A, P);
2037
2038 /* count the number of cols in P_ext_couple, resize the pattern of
2039 sparse matrix P_ext_couple with new compressed order, and then
2040 build the col id mapping from P->col_coupleBlock to
2041 P_ext_couple */
2042 num_Pmain_cols = P->mainBlock->numCols;
2043 num_Pcouple_cols = P->col_coupleBlock->numCols;
2044 num_Acouple_cols = A->col_coupleBlock->numCols;
2045 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2046 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2047 offset = P->col_distribution->first_component[rank];
2048 num_Pext_cols = 0;
2049 if (P->global_id) {
2050 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2051 and P->col_coupleBlock */
2052 iptr = 0;
2053 if (num_Pcouple_cols || sum > 0) {
2054 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
2055 for (; iptr<sum; iptr++){
2056 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2057 }
2058 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2059 temp[iptr] = P->global_id[j];
2060 }
2061 }
2062 if (iptr) {
2063 #ifdef USE_QSORTG
2064 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2065 #else
2066 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2067 #endif
2068 num_Pext_cols = 1;
2069 i = temp[0];
2070 for (j=1; j<iptr; j++) {
2071 if (temp[j] > i) {
2072 i = temp[j];
2073 temp[num_Pext_cols++] = i;
2074 }
2075 }
2076 }
2077
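/* Editor's note: the block above deduplicates the gathered global ids:
   sort first, then keep each value that strictly exceeds the last one
   kept. Generic form (hypothetical names, illustration only):

       qsort(a, (size_t)n, sizeof(index_t), Paso_comparIndex);
       m = (n > 0) ? 1 : 0;
       for (t=1; t<n; t++)
           if (a[t] > a[m-1]) a[m++] = a[t];
*/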
2078 /* resize the pattern of P_ext_couple */
2079 if(num_Pext_cols){
2080 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
2081 for (i=0; i<num_Pext_cols; i++)
2082 global_id_P[i] = temp[i];
2083 }
2084 if (num_Pcouple_cols || sum > 0)
2085 TMPMEMFREE(temp);
2086 for (i=0; i<sum; i++) {
2087 where_p = (index_t *)bsearch(
2088 &(P->remote_coupleBlock->pattern->index[i]),
2089 global_id_P, num_Pext_cols,
2090 sizeof(index_t), Paso_comparIndex);
2091 P->remote_coupleBlock->pattern->index[i] =
2092 (index_t)(where_p -global_id_P);
2093 }
2094
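/* Editor's note: since global_id_P is sorted, bsearch locates each
   remote global id, and the pointer difference is directly the new,
   compressed column index. In isolation (hypothetical names; a NULL
   result cannot occur here because every searched id was inserted into
   global_id_P above):

       where_p = (index_t *)bsearch(&gid, global_id_P, num_Pext_cols,
                                    sizeof(index_t), Paso_comparIndex);
       new_idx = (index_t)(where_p - global_id_P);
*/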
2095 /* build the mapping */
2096 if (num_Pcouple_cols) {
2097 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
2098 iptr = 0;
2099 for (i=0; i<num_Pext_cols; i++)
2100 if (global_id_P[i] == P->global_id[iptr]) {
2101 Pcouple_to_Pext[iptr++] = i;
2102 if (iptr == num_Pcouple_cols) break;
2103 }
2104 }
2105 }
2106
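/* Editor's note: P->global_id and global_id_P are both sorted and the
   former is a subset of the latter, so Pcouple_to_Pext is built by one
   merged scan instead of a search per entry. The generic subset-mapping
   scan (hypothetical names, illustration only):

       t = 0;
       for (i=0; i<n_super && t<n_sub; i++)
           if (super[i] == sub[t]) map[t++] = i;
*/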
2107 /* alloc and initialize the markers */
2108 sum = num_Pext_cols + num_Pmain_cols;
2109 P_marker = TMPMEMALLOC(sum, index_t);
2110 A_marker = TMPMEMALLOC(num_A_cols, index_t);
2111 #pragma omp parallel for private(i) schedule(static)
2112 for (i=0; i<sum; i++) P_marker[i] = -1;
2113 #pragma omp parallel for private(i) schedule(static)
2114 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2115
2116 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2117 sum = 0;
2118 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2119 row_marker = sum;
2120 /* then, loop over elements in row i_r of R_couple */
2121 j1_ub = R_couple->pattern->ptr[i_r+1];
2122 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2123 i1 = R_couple->pattern->index[j1];
2124 /* then, loop over elements in row i1 of A->col_coupleBlock */
2125 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2126 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2127 i2 = A->col_coupleBlock->pattern->index[j2];
2128
2129 /* check whether entry RA[i_r, i2] has been previously visited.
2130 A new RAP entry is possible only if entry RA[i_r, i2] has not
2131 been visited yet */
2132 if (A_marker[i2] != i_r) {
2133 /* first, mark entry RA[i_r,i2] as visited */
2134 A_marker[i2] = i_r;
2135
2136 /* then loop over elements in row i2 of P_ext_main */
2137 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2138 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2139 i_c = P->row_coupleBlock->pattern->index[j3];
2140
2141 /* check whether entry RAP[i_r,i_c] has been created.
2142 If not yet created, create the entry and increase
2143 the total number of elements in RAP_ext */
2144 if (P_marker[i_c] < row_marker) {
2145 P_marker[i_c] = sum;
2146 sum++;
2147 }
2148 }
2149
2150 /* loop over elements in row i2 of P_ext_couple, do the same */
2151 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2152 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2153 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2154 if (P_marker[i_c] < row_marker) {
2155 P_marker[i_c] = sum;
2156 sum++;
2157 }
2158 }
2159 }
2160 }
2161
2162 /* now loop over elements in row i1 of A->mainBlock, repeat
2163 the process we have done to block A->col_coupleBlock */
2164 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2165 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2166 i2 = A->mainBlock->pattern->index[j2];
2167 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2168 A_marker[i2 + num_Acouple_cols] = i_r;
2169 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2170 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2171 i_c = P->mainBlock->pattern->index[j3];
2172 if (P_marker[i_c] < row_marker) {
2173 P_marker[i_c] = sum;
2174 sum++;
2175 }
2176 }
2177 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2178 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2179 /* note that we need to map the column index in
2180 P->col_coupleBlock back into the column index in
2181 P_ext_couple */
2182 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2183 if (P_marker[i_c] < row_marker) {
2184 P_marker[i_c] = sum;
2185 sum++;
2186 }
2187 }
2188 }
2189 }
2190 }
2191 }
2192
2193 /* Now we have the number of non-zero elements in RAP_ext, allocate
2194 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2195 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
2196 RAP_ext_idx = MEMALLOC(sum, index_t);
2197 RAP_ext_val = MEMALLOC(sum * block_size, double);
2198 RA_val = TMPMEMALLOC(block_size, double);
2199 RAP_val = TMPMEMALLOC(block_size, double);
2200
2201 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2202 sum = num_Pext_cols + num_Pmain_cols;
2203 #pragma omp parallel for private(i) schedule(static)
2204 for (i=0; i<sum; i++) P_marker[i] = -1;
2205 #pragma omp parallel for private(i) schedule(static)
2206 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2207 sum = 0;
2208 RAP_ext_ptr[0] = 0;
2209 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2210 row_marker = sum;
2211 /* then, loop over elements in row i_r of R_couple */
2212 j1_ub = R_couple->pattern->ptr[i_r+1];
2213 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2214 i1 = R_couple->pattern->index[j1];
2215 R_val = &(R_couple->val[j1*block_size]);
2216
2217 /* then, loop over elements in row i1 of A->col_coupleBlock */
2218 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2219 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2220 i2 = A->col_coupleBlock->pattern->index[j2];
2221 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2222 for (irb=0; irb<row_block_size; irb++) {
2223 for (icb=0; icb<col_block_size; icb++) {
2224 rtmp = ZERO;
2225 for (ib=0; ib<col_block_size; ib++) {
2226 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2227 }
2228 RA_val[irb+row_block_size*icb]=rtmp;
2229 }
2230 }
2231
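/* Editor's note: within each block_size chunk the entry (irb,icb) is
   stored at irb + row_block_size*icb, so the triple loop above is a
   small dense matrix product accumulating
   RA(irb,icb) = sum_ib R(irb,ib) * A(ib,icb). Standalone form for
   nr x nk times nk x nc blocks (hypothetical names, illustration only):

       for (irb=0; irb<nr; irb++)
           for (icb=0; icb<nc; icb++) {
               rtmp = 0.0;
               for (ib=0; ib<nk; ib++)
                   rtmp += Rb[irb + nr*ib] * Ab[ib + nk*icb];
               RAb[irb + nr*icb] = rtmp;
           }
*/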
2232 /* check whether entry RA[i_r, i2] has been previously visited.
2233 A new RAP entry is possible only if entry RA[i_r, i2] has not
2234 been visited yet */
2235 if (A_marker[i2] != i_r) {
2236 /* first, mark entry RA[i_r,i2] as visited */
2237 A_marker[i2] = i_r;
2238
2239 /* then loop over elements in row i2 of P_ext_main */
2240 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2241 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2242 i_c = P->row_coupleBlock->pattern->index[j3];
2243 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2244 for (irb=0; irb<row_block_size; irb++) {
2245 for (icb=0; icb<col_block_size; icb++) {
2246 rtmp = ZERO;
2247 for (ib=0; ib<col_block_size; ib++) {
2248 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2249 }
2250 RAP_val[irb+row_block_size*icb]=rtmp;
2251 }
2252 }
2253
2254
2255 /* check whether entry RAP[i_r,i_c] has been created.
2256 If not yet created, create the entry and increase
2257 the total number of elements in RAP_ext */
2258 if (P_marker[i_c] < row_marker) {
2259 P_marker[i_c] = sum;
2260 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2261 RAP_ext_idx[sum] = i_c + offset;
2262 sum++;
2263 } else {
2264 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2265 for (ib=0; ib<block_size; ib++) {
2266 temp_val[ib] += RAP_val[ib];
2267 }
2268 }
2269 }
2270
2271 /* loop over elements in row i2 of P_ext_couple, do the same */
2272 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2273 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2274 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2275 //RAP_val = RA_val * P->remote_coupleBlock->val[j3];
2276 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2277 for (irb=0; irb<row_block_size; irb++) {
2278 for (icb=0; icb<col_block_size; icb++) {
2279 rtmp = ZERO;
2280 for (ib=0; ib<col_block_size; ib++) {
2281 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2282 }
2283 RAP_val[irb+row_block_size*icb]=rtmp;
2284 }
2285 }
2286
2287 if (P_marker[i_c] < row_marker) {
2288 P_marker[i_c] = sum;
2289 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2290 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2291 sum++;
2292 } else {
2293 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2294 for (ib=0; ib<block_size; ib++) {
2295 temp_val[ib] += RAP_val[ib];
2296 }
2297 }
2298 }
2299
2300 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2301 Only the contributions are added. */
2302 } else {
2303 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2304 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2305 i_c = P->row_coupleBlock->pattern->index[j3];
2306 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2307 for (irb=0; irb<row_block_size; irb++) {
2308 for (icb=0; icb<col_block_size; icb++) {
2309 rtmp = ZERO;
2310 for (ib=0; ib<col_block_size; ib++) {
2311 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2312 }
2313 RAP_val[irb+row_block_size*icb]=rtmp;
2314 }
2315 }
2316
2317 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2318 for (ib=0; ib<block_size; ib++) {
2319 temp_val[ib] += RAP_val[ib];
2320 }
2321 }
2322 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2323 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2324 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2325 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2326 for (irb=0; irb<row_block_size; irb++) {
2327 for (icb=0; icb<col_block_size; icb++) {
2328 rtmp = ZERO;
2329 for (ib=0; ib<col_block_size; ib++) {
2330 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2331 }
2332 RAP_val[irb+row_block_size*icb]=rtmp;
2333 }
2334 }
2335
2336 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2337 for (ib=0; ib<block_size; ib++) {
2338 temp_val[ib] += RAP_val[ib];
2339 }
2340 }
2341 }
2342 }
2343
2344 /* now loop over elements in row i1 of A->mainBlock, repeat
2345 the process we have done to block A->col_coupleBlock */
2346 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2347 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2348 i2 = A->mainBlock->pattern->index[j2];
2349 temp_val = &(A->mainBlock->val[j2*block_size]);
2350 for (irb=0; irb<row_block_size; irb++) {
2351 for (icb=0; icb<col_block_size; icb++) {
2352 rtmp = ZERO;
2353 for (ib=0; ib<col_block_size; ib++) {
2354 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2355 }
2356 RA_val[irb+row_block_size*icb]=rtmp;
2357 }
2358 }
2359
2360 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2361 A_marker[i2 + num_Acouple_cols] = i_r;
2362 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2363 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2364 i_c = P->mainBlock->pattern->index[j3];
2365 temp_val = &(P->mainBlock->val[j3*block_size]);
2366 for (irb=0; irb<row_block_size; irb++) {
2367 for (icb=0; icb<col_block_size; icb++) {
2368 rtmp = ZERO;
2369 for (ib=0; ib<col_block_size; ib++) {
2370 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2371 }
2372 RAP_val[irb+row_block_size*icb]=rtmp;
2373 }
2374 }
2375
2376 if (P_marker[i_c] < row_marker) {
2377 P_marker[i_c] = sum;
2378 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2379 RAP_ext_idx[sum] = i_c + offset;
2380 sum++;
2381 } else {
2382 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2383 for (ib=0; ib<block_size; ib++) {
2384 temp_val[ib] += RAP_val[ib];
2385 }
2386 }
2387 }
2388 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2389 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2390 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2391 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2392 for (irb=0; irb<row_block_size; irb++) {
2393 for (icb=0; icb<col_block_size; icb++) {
2394 rtmp = ZERO;
2395 for (ib=0; ib<col_block_size; ib++) {
2396 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2397 }
2398 RAP_val[irb+row_block_size*icb]=rtmp;
2399 }
2400 }
2401
2402 if (P_marker[i_c] < row_marker) {
2403 P_marker[i_c] = sum;
2404 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2405 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2406 sum++;
2407 } else {
2408 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2409 for (ib=0; ib<block_size; ib++) {
2410 temp_val[ib] += RAP_val[ib];
2411 }
2412 }
2413 }
2414 } else {
2415 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2416 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2417 i_c = P->mainBlock->pattern->index[j3];
2418 temp_val = &(P->mainBlock->val[j3*block_size]);
2419 for (irb=0; irb<row_block_size; irb++) {
2420 for (icb=0; icb<col_block_size; icb++) {
2421 rtmp = ZERO;
2422 for (ib=0; ib<col_block_size; ib++) {
2423 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2424 }
2425 RAP_val[irb+row_block_size*icb]=rtmp;
2426 }
2427 }
2428
2429 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2430 for (ib=0; ib<block_size; ib++) {
2431 temp_val[ib] += RAP_val[ib];
2432 }
2433 }
2434 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2435 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2436 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2437 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2438 for (irb=0; irb<row_block_size; irb++) {
2439 for (icb=0; icb<col_block_size; icb++) {
2440 rtmp = ZERO;
2441 for (ib=0; ib<col_block_size; ib++) {
2442 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2443 }
2444 RAP_val[irb+row_block_size*icb]=rtmp;
2445 }
2446 }
2447
2448 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2449 for (ib=0; ib<block_size; ib++) {
2450 temp_val[ib] += RAP_val[ib];
2451 }
2452 }
2453 }
2454 }
2455 }
2456 RAP_ext_ptr[i_r+1] = sum;
2457 }
2458 TMPMEMFREE(P_marker);
2459 TMPMEMFREE(Pcouple_to_Pext);
2460
2461 /* Now we have the part of RAP[r,c] where row "r" runs over the rows
2462 given by the column list of P->col_coupleBlock, and column "c"
2463 runs over a list of columns which possibly covers the
2464 whole column range of P. This data is to
2465 be passed to the neighboring processors and added to the corresponding
2466 RAP[r,c] entries on those processors */
2467 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2468 &RAP_ext_val, global_id_P, block_size);
2469
2470 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2471 sum = RAP_ext_ptr[num_RAPext_rows];
2472 num_RAPext_cols = 0;
2473 if (num_Pext_cols || sum > 0) {
2474 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
2475 j1_ub = offset + num_Pmain_cols;
2476 for (i=0, iptr=0; i<sum; i++) {
2477 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2478 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2479 }
2480 for (j=0; j<num_Pext_cols; j++, iptr++){
2481 temp[iptr] = global_id_P[j];
2482 }
2483
2484 if (iptr) {
2485 #ifdef USE_QSORTG
2486 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2487 #else
2488 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2489 #endif
2490 num_RAPext_cols = 1;
2491 i = temp[0];
2492 for (j=1; j<iptr; j++) {
2493 if (temp[j] > i) {
2494 i = temp[j];
2495 temp[num_RAPext_cols++] = i;
2496 }
2497 }
2498 }
2499 }
2500
2501 /* store the distinct global column ids of RAP_ext */
2502 if(num_RAPext_cols){
2503 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
2504 for (i=0; i<num_RAPext_cols; i++)
2505 global_id_RAP[i] = temp[i];
2506 }
2507 if (num_Pext_cols || sum > 0)
2508 TMPMEMFREE(temp);
2509 j1_ub = offset + num_Pmain_cols;
2510 for (i=0; i<sum; i++) {
2511 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2512 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2513 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2514 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2515 } else
2516 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2517 }
2518
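/* Editor's note: after the exchange, RAP_ext_idx mixes local columns
   (stored as global indices inside [offset, offset+num_Pmain_cols)) and
   remote columns (arbitrary global ids). The test above classifies each
   id and remaps it: local ids simply drop the offset, remote ids are
   located in the sorted global_id_RAP and placed after the
   num_Pmain_cols local columns (hypothetical names, including the
   helper position_in_sorted; illustration only):

       if (gid >= offset && gid < offset + num_Pmain_cols)
           new_idx = gid - offset;
       else
           new_idx = num_Pmain_cols + position_in_sorted(global_id_RAP, gid);
*/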
2519 /* build the mapping */
2520 if (num_Pcouple_cols) {
2521 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
2522 iptr = 0;
2523 for (i=0; i<num_RAPext_cols; i++)
2524 if (global_id_RAP[i] == P->global_id[iptr]) {
2525 Pcouple_to_RAP[iptr++] = i;
2526 if (iptr == num_Pcouple_cols) break;
2527 }
2528 }
2529
2530 if (num_Pext_cols) {
2531 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
2532 iptr = 0;
2533 for (i=0; i<num_RAPext_cols; i++)
2534 if (global_id_RAP[i] == global_id_P[iptr]) {
2535 Pext_to_RAP[iptr++] = i;
2536 if (iptr == num_Pext_cols) break;
2537 }
2538 }
2539
2540 if (global_id_P){
2541 TMPMEMFREE(global_id_P);
2542 global_id_P = NULL;
2543 }
2544
2545 /* alloc and initialize the markers */
2546 sum = num_RAPext_cols + num_Pmain_cols;
2547 P_marker = TMPMEMALLOC(sum, index_t);
2548 #pragma omp parallel for private(i) schedule(static)
2549 for (i=0; i<sum; i++) P_marker[i] = -1;
2550 #pragma omp parallel for private(i) schedule(static)
2551 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2552
2553 /* Now, count the size of RAP. Start with rows in R_main */
2554 num_neighbors = P->col_coupler->connector->send->numNeighbors;
2555 offsetInShared = P->col_coupler->connector->send->offsetInShared;
2556 shared = P->col_coupler->connector->send->shared;
2557 i = 0;
2558 j = 0;
2559 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2560 /* Mark the start of row for both main block and couple block */
2561 row_marker = i;
2562 row_marker_ext = j;
2563
2564 /* Mark the diagonal element RAP[i_r, i_r], and other elements
2565 in RAP_ext */
2566 P_marker[i_r] = i;
2567 i++;
2568 for (j1=0; j1<num_neighbors; j1++) {
2569 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2570 if (shared[j2] == i_r) {
2571 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2572 i_c = RAP_ext_idx[k];
2573 if (i_c < num_Pmain_cols) {
2574 if (P_marker[i_c] < row_marker) {
2575 P_marker[i_c] = i;
2576 i++;
2577 }
2578 } else {
2579 if (P_marker[i_c] < row_marker_ext) {
2580 P_marker[i_c] = j;
2581 j++;
2582 }
2583 }
2584 }
2585 break;
2586 }
2587 }
2588 }
2589
2590 /* then, loop over elements in row i_r of R_main */
2591 j1_ub = R_main->pattern->ptr[i_r+1];
2592 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2593 i1 = R_main->pattern->index[j1];
2594
2595 /* then, loop over elements in row i1 of A->col_coupleBlock */
2596 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2597 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2598 i2 = A->col_coupleBlock->pattern->index[j2];
2599
2600 /* check whether entry RA[i_r, i2] has been previously visited.
2601 A new RAP entry is possible only if entry RA[i_r, i2] has not
2602 been visited yet */
2603 if (A_marker[i2] != i_r) {
2604 /* first, mark entry RA[i_r,i2] as visited */
2605 A_marker[i2] = i_r;
2606
2607 /* then loop over elements in row i2 of P_ext_main */
2608 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2609 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2610 i_c = P->row_coupleBlock->pattern->index[j3];
2611
2612 /* check whether entry RAP[i_r,i_c] has been created.
2613 If not yet created, create the entry and increase
2614 the total number of elements in RAP_ext */
2615 if (P_marker[i_c] < row_marker) {
2616 P_marker[i_c] = i;
2617 i++;
2618 }
2619 }
2620
2621 /* loop over elements in row i2 of P_ext_couple, do the same */
2622 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2623 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2624 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2625 if (P_marker[i_c] < row_marker_ext) {
2626 P_marker[i_c] = j;
2627 j++;
2628 }
2629 }
2630 }
2631 }
2632
2633 /* now loop over elements in row i1 of A->mainBlock, repeat
2634 the process we have done to block A->col_coupleBlock */
2635 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2636 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2637 i2 = A->mainBlock->pattern->index[j2];
2638 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2639 A_marker[i2 + num_Acouple_cols] = i_r;
2640 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2641 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2642 i_c = P->mainBlock->pattern->index[j3];
2643 if (P_marker[i_c] < row_marker) {
2644 P_marker[i_c] = i;
2645 i++;
2646 }
2647 }
2648 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2649 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2650 /* note that we need to map the column index in
2651 P->col_coupleBlock back into the column index in
2652 P_ext_couple */
2653 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2654 if (P_marker[i_c] < row_marker_ext) {
2655 P_marker[i_c] = j;
2656 j++;
2657 }
2658 }
2659 }
2660 }
2661 }
2662 }
2663
2664 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
2665 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2666 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2667 RAP_couple */
2668 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2669 RAP_main_idx = MEMALLOC(i, index_t);
2670 RAP_main_val = TMPMEMALLOC(i * block_size, double);
2671 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2672 RAP_couple_idx = MEMALLOC(j, index_t);
2673 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
2674
2675 RAP_main_ptr[num_Pmain_cols] = i;
2676 RAP_couple_ptr[num_Pmain_cols] = j;
2677
2678 #pragma omp parallel for private(i) schedule(static)
2679 for (i=0; i<sum; i++) P_marker[i] = -1;
2680 #pragma omp parallel for private(i) schedule(static)
2681 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2682
2683 /* Now, fill in the data for RAP_main and RAP_couple. Start with rows
2684 in R_main */
2685 i = 0;
2686 j = 0;
2687 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2688 /* Mark the start of row for both main block and couple block */
2689 row_marker = i;
2690 row_marker_ext = j;
2691 RAP_main_ptr[i_r] = row_marker;
2692 RAP_couple_ptr[i_r] = row_marker_ext;
2693
2694 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
2695 in row i_r of RAP_ext */
2696 P_marker[i_r] = i;
2697 memset(&(RAP_main_val[i*block_size]), 0, block_size*sizeof(double));
2698 RAP_main_idx[i] = i_r;
2699 i++;
2700
2701 for (j1=0; j1<num_neighbors; j1++) {
2702 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2703 if (shared[j2] == i_r) {
2704 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2705 i_c = RAP_ext_idx[k];
2706 if (i_c < num_Pmain_cols) {
2707 if (P_marker[i_c] < row_marker) {
2708 P_marker[i_c] = i;
2709 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2710 RAP_main_idx[i] = i_c;
2711 i++;
2712 } else {
2713 temp_val = &(RAP_ext_val[k*block_size]);
2714 RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2715 for (ib=0; ib<block_size; ib++)
2716 RAP_val[ib] += temp_val[ib];
2717 }
2718 } else {
2719 if (P_marker[i_c] < row_marker_ext) {
2720 P_marker[i_c] = j;
2721 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2722 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2723 j++;
2724 } else {
2725 temp_val = &(RAP_ext_val[k*block_size]);
2726 RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2727 for (ib=0; ib<block_size; ib++)
2728 RAP_val[ib] += temp_val[ib];
2729 }
2730 }
2731 }
2732 break;
2733 }
2734 }
2735 }
2736
2737 /* then, loop over elements in row i_r of R_main */
2738 j1_ub = R_main->pattern->ptr[i_r+1];
2739 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2740 i1 = R_main->pattern->index[j1];
2741 R_val = &(R_main->val[j1*block_size]);
2742
2743 /* then, loop over elements in row i1 of A->col_coupleBlock */
2744 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2745 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2746 i2 = A->col_coupleBlock->pattern->index[j2];
2747 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2748 for (irb=0; irb<row_block_size; irb++) {
2749 for (icb=0; icb<col_block_size; icb++) {
2750 rtmp = ZERO;
2751 for (ib=0; ib<col_block_size; ib++) {
2752 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2753 }
2754 RA_val[irb+row_block_size*icb]=rtmp;
2755 }
2756 }
2757
2758
2759 /* check whether entry RA[i_r, i2] has been previously visited.
2760 A new RAP entry is possible only if entry RA[i_r, i2] has not
2761 been visited yet */
2762 if (A_marker[i2] != i_r) {
2763 /* first, mark entry RA[i_r,i2] as visited */
2764 A_marker[i2] = i_r;
2765
2766 /* then loop over elements in row i2 of P_ext_main */
2767 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2768 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2769 i_c = P->row_coupleBlock->pattern->index[j3];
2770 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2771 for (irb=0; irb<row_block_size; irb++) {
2772 for (icb=0; icb<col_block_size; icb++) {
2773 rtmp = ZERO;
2774 for (ib=0; ib<col_block_size; ib++) {
2775 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2776 }
2777 RAP_val[irb+row_block_size*icb]=rtmp;
2778 }
2779 }
2780
2781
2782 /* check whether entry RAP[i_r,i_c] has been created.
2783 If not yet created, create the entry and increase
2784 the total number of elements in RAP_ext */
2785 if (P_marker[i_c] < row_marker) {
2786 P_marker[i_c] = i;
2787 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2788 RAP_main_idx[i] = i_c;
2789 i++;
2790 } else {
2791 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2792 for (ib=0; ib<block_size; ib++) {
2793 temp_val[ib] += RAP_val[ib];
2794 }
2795 }
2796 }
2797
2798 /* loop over elements in row i2 of P_ext_couple, do the same */
2799 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2800 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2801 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2802 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2803 for (irb=0; irb<row_block_size; irb++) {
2804 for (icb=0; icb<col_block_size; icb++) {
2805 rtmp = ZERO;
2806 for (ib=0; ib<col_block_size; ib++) {
2807 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2808 }
2809 RAP_val[irb+row_block_size*icb]=rtmp;
2810 }
2811 }
2812
2813 if (P_marker[i_c] < row_marker_ext) {
2814 P_marker[i_c] = j;
2815 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2816 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2817 j++;
2818 } else {
2819 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2820 for (ib=0; ib<block_size; ib++) {
2821 temp_val[ib] += RAP_val[ib];
2822 }
2823 }
2824 }
2825
2826 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2827 Only the contributions are added. */
2828 } else {
2829 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2830 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2831 i_c = P->row_coupleBlock->pattern->index[j3];
2832 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2833 for (irb=0; irb<row_block_size; irb++) {
2834 for (icb=0; icb<col_block_size; icb++) {
2835 rtmp = ZERO;
2836 for (ib=0; ib<col_block_size; ib++) {
2837 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2838 }
2839 RAP_val[irb+row_block_size*icb]=rtmp;
2840 }
2841 }
2842
2843 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2844 for (ib=0; ib<block_size; ib++) {
2845 temp_val[ib] += RAP_val[ib];
2846 }
2847 }
2848 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2849 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2850 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2851 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2852 for (irb=0; irb<row_block_size; irb++) {
2853 for (icb=0; icb<col_block_size; icb++) {
2854 rtmp = ZERO;
2855 for (ib=0; ib<col_block_size; ib++) {
2856 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2857 }
2858 RAP_val[irb+row_block_size*icb]=rtmp;
2859 }
2860 }
2861
2862 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2863 for (ib=0; ib<block_size; ib++) {
2864 temp_val[ib] += RAP_val[ib];
2865 }
2866 }
2867 }
2868 }
2869
2870 /* now loop over elements in row i1 of A->mainBlock, repeat
2871 the process we have done to block A->col_coupleBlock */
2872 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2873 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2874 i2 = A->mainBlock->pattern->index[j2];
2875 temp_val = &(A->mainBlock->val[j2*block_size]);
2876 for (irb=0; irb<row_block_size; irb++) {
2877 for (icb=0; icb<col_block_size; icb++) {
2878 rtmp = ZERO;
2879 for (ib=0; ib<col_block_size; ib++) {
2880 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2881 }
2882 RA_val[irb+row_block_size*icb]=rtmp;
2883 }
2884 }
2885
2886 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2887 A_marker[i2 + num_Acouple_cols] = i_r;
2888 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2889 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2890 i_c = P->mainBlock->pattern->index[j3];
2891 temp_val = &(P->mainBlock->val[j3*block_size]);
2892 for (irb=0; irb<row_block_size; irb++) {
2893 for (icb=0; icb<col_block_size; icb++) {
2894 rtmp = ZERO;
2895 for (ib=0; ib<col_block_size; ib++) {
2896 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2897 }
2898 RAP_val[irb+row_block_size*icb]=rtmp;
2899 }
2900 }
2901
2902 if (P_marker[i_c] < row_marker) {
2903 P_marker[i_c] = i;
2904 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2905 RAP_main_idx[i] = i_c;
2906 i++;
2907 } else {
2908 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2909 for (ib=0; ib<block_size; ib++) {
2910 temp_val[ib] += RAP_val[ib];
2911 }
2912 }
2913 }
2914 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2915 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2916 /* note that we need to map the column index in
2917 P->col_coupleBlock back into the column index in
2918 P_ext_couple */
2919 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2920 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2921 for (irb=0; irb<row_block_size; irb++) {
2922 for (icb=0; icb<col_block_size; icb++) {
2923 rtmp = ZERO;
2924 for (ib=0; ib<col_block_size; ib++) {
2925 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2926 }
2927 RAP_val[irb+row_block_size*icb]=rtmp;
2928 }
2929 }
2930
2931 if (P_marker[i_c] < row_marker_ext) {
2932 P_marker[i_c] = j;
2933 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2934 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2935 j++;
2936 } else {
2937 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2938 for (ib=0; ib<block_size; ib++) {
2939 temp_val[ib] += RAP_val[ib];
2940 }
2941 }
2942 }
2943
2944 } else {
2945 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2946 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2947 i_c = P->mainBlock->pattern->index[j3];
2948 temp_val = &(P->mainBlock->val[j3*block_size]);
2949 for (irb=0; irb<row_block_size; irb++) {
2950 for (icb=0; icb<col_block_size; icb++) {
2951 rtmp = ZERO;
2952 for (ib=0; ib<col_block_size; ib++) {
2953 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2954 }
2955 RAP_val[irb+row_block_size*icb]=rtmp;
2956 }
2957 }
2958
2959 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2960 for (ib=0; ib<block_size; ib++) {
2961 temp_val[ib] += RAP_val[ib];
2962 }
2963 }
2964 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2965 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2966 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2967 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2968 for (irb=0; irb<row_block_size; irb++) {
2969 for (icb=0; icb<col_block_size; icb++) {
2970 rtmp = ZERO;
2971 for (ib=0; ib<col_block_size; ib++) {
2972 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2973 }
2974 RAP_val[irb+row_block_size*icb]=rtmp;
2975 }
2976 }
2977
2978 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2979 for (ib=0; ib<block_size; ib++) {
2980 temp_val[ib] += RAP_val[ib];
2981 }
2982 }
2983 }
2984 }
2985 }
2986
2987 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
2988 if (i > row_marker) {
2989 offset = i - row_marker;
2990 temp = TMPMEMALLOC(offset, index_t);
2991 for (iptr=0; iptr<offset; iptr++)
2992 temp[iptr] = RAP_main_idx[row_marker+iptr];
2993 if (offset > 0) {
2994 #ifdef USE_QSORTG
2995 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
2996 #else
2997 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
2998 #endif
2999 }
3000 temp_val = TMPMEMALLOC(offset * block_size, double);
3001 for (iptr=0; iptr<offset; iptr++){
3002 k = P_marker[temp[iptr]];
3003 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
3004 P_marker[temp[iptr]] = iptr + row_marker;
3005 }
3006 for (iptr=0; iptr<offset; iptr++){
3007 RAP_main_idx[row_marker+iptr] = temp[iptr];
3008 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3009 }
3010 TMPMEMFREE(temp);
3011 TMPMEMFREE(temp_val);
3012 }
3013 if (j > row_marker_ext) {
3014 offset = j - row_marker_ext;
3015 temp = TMPMEMALLOC(offset, index_t);
3016 for (iptr=0; iptr<offset; iptr++)
3017 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
3018 if (offset > 0) {
3019 #ifdef USE_QSORTG
3020 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3021 #else
3022 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3023 #endif
3024 }
3025 temp_val = TMPMEMALLOC(offset * block_size, double);
3026 for (iptr=0; iptr<offset; iptr++){
3027 k = P_marker[temp[iptr] + num_Pmain_cols];
3028 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
3029 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
3030 }
3031 for (iptr=0; iptr<offset; iptr++){
3032 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
3033 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3034 }
3035 TMPMEMFREE(temp);
3036 TMPMEMFREE(temp_val);
3037 }
3038 }
3039
3040 TMPMEMFREE(RA_val);
3041 TMPMEMFREE(RAP_val);
3042 TMPMEMFREE(A_marker);
3043 TMPMEMFREE(Pext_to_RAP);
3044 TMPMEMFREE(Pcouple_to_RAP);
3045 MEMFREE(RAP_ext_ptr);
3046 MEMFREE(RAP_ext_idx);
3047 MEMFREE(RAP_ext_val);
3048 Paso_SparseMatrix_free(R_main);
3049 Paso_SparseMatrix_free(R_couple);
3050
3051 /* Check whether there are empty columns in RAP_couple */
3052 #pragma omp parallel for schedule(static) private(i)
3053 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
3054 /* num of non-empty columns is stored in "k" */
3055 k = 0;
3056 j = RAP_couple_ptr[num_Pmain_cols];
3057 for (i=0; i<j; i++) {
3058 i1 = RAP_couple_idx[i];
3059 if (P_marker[i1]) {
3060 P_marker[i1] = 0;
3061 k++;
3062 }
3063 }
3064
3065 /* empty columns are found */
3066 if (k < num_RAPext_cols) {
3067 temp = TMPMEMALLOC(k, index_t);
3068 k = 0;
3069 for (i=0; i<num_RAPext_cols; i++)
3070 if (!P_marker[i]) {
3071 P_marker[i] = k;
3072 temp[k] = global_id_RAP[i];
3073 k++;
3074 }
3075 for (i=0; i<j; i++) {
3076 i1 = RAP_couple_idx[i];
3077 RAP_couple_idx[i] = P_marker[i1];
3078 }
3079 num_RAPext_cols = k;
3080 TMPMEMFREE(global_id_RAP);
3081 global_id_RAP = temp;
3082 }
3083 TMPMEMFREE(P_marker);
3084
3085 /******************************************************/
3086 /* Start to create the coarse level System Matrix A_c */
3087 /******************************************************/
3088 /* first, prepare the sender/receiver for the col_connector */
3089 dist = P->pattern->input_distribution->first_component;
3090 recv_len = TMPMEMALLOC(size, dim_t);
3091 send_len = TMPMEMALLOC(size, dim_t);
3092 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
3093 offsetInShared = TMPMEMALLOC(size+1, index_t);
3094 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
3095 memset(recv_len, 0, sizeof(dim_t) * size);
3096 memset(send_len, 0, sizeof(dim_t) * size);
3097 num_neighbors = 0;
3098 offsetInShared[0] = 0;
3099 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
3100 shared[i] = i + num_Pmain_cols;
3101 if (k <= global_id_RAP[i]) {
3102 if (recv_len[j] > 0) {
3103 neighbor[num_neighbors] = j;
3104 num_neighbors ++;
3105 offsetInShared[num_neighbors] = i;
3106 }
3107 while (k <= global_id_RAP[i]) {
3108 j++;
3109 k = dist[j+1];
3110 }
3111 }
3112 recv_len[j] ++;
3113 }
3114 if (recv_len[j] > 0) {
3115 neighbor[num_neighbors] = j;
3116 num_neighbors ++;
3117 offsetInShared[num_neighbors] = i;
3118 }
3119 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3120 neighbor, shared, offsetInShared, 1, 0, mpi_info);
3121
  MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);

#ifdef ESYS_MPI
  mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
  mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
#else
  mpi_requests=TMPMEMALLOC(size*2,int);
  mpi_stati=TMPMEMALLOC(size*2,int);
#endif
  num_neighbors = 0;
  j = 0;
  offsetInShared[0] = 0;
  for (i=0; i<size; i++) {
    if (send_len[i] > 0) {
      neighbor[num_neighbors] = i;
      num_neighbors ++;
      j += send_len[i];
      offsetInShared[num_neighbors] = j;
    }
  }
  TMPMEMFREE(shared);
  shared = TMPMEMALLOC(j, index_t);
  for (i=0, j=0; i<num_neighbors; i++) {
    k = neighbor[i];
    MPI_Irecv(&shared[j], send_len[k], MPI_INT, k,
              mpi_info->msg_tag_counter+k,
              mpi_info->comm, &mpi_requests[i]);
    j += send_len[k];
  }
  for (i=0, j=0; i<recv->numNeighbors; i++) {
    k = recv->neighbor[i];
    MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[i+num_neighbors]);
    j += recv_len[k];
  }
  MPI_Waitall(num_neighbors + recv->numNeighbors,
              mpi_requests, mpi_stati);
  mpi_info->msg_tag_counter += size;

  j = offsetInShared[num_neighbors];
  offset = dist[rank];
  for (i=0; i<j; i++) shared[i] = shared[i] - offset;
  send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                     neighbor, shared, offsetInShared, 1, 0, mpi_info);

  col_connector = Paso_Connector_alloc(send, recv);
  TMPMEMFREE(shared);
  Paso_SharedComponents_free(recv);
  Paso_SharedComponents_free(send);

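  /* At this point col_connector describes the coarse-level column coupling:
     the "recv" side lists the external coarse columns this rank references
     (local ids num_Pmain_cols, num_Pmain_cols+1, ...), and the "send" side
     lists, per neighbor, which of this rank's own coarse columns (converted
     to local ids above by subtracting dist[rank]) that neighbor needs. */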
  /* now, create row distribution (output_distribution) and col
     distribution (input_distribution) */
  input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
  output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);

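  /* Both distributions are built from the same first_component array "dist",
     i.e. the rows and columns of the coarse matrix are partitioned
     identically across ranks, following the input distribution of P. */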
  /* then, prepare the sender/receiver for the row_connector. First, prepare
     the information for the sender */
  sum = RAP_couple_ptr[num_Pmain_cols];
  len = TMPMEMALLOC(size, dim_t);
  send_ptr = TMPMEMALLOC(size, index_t*);
  send_idx = TMPMEMALLOC(size, index_t*);
  for (i=0; i<size; i++) {
    send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
    send_idx[i] = TMPMEMALLOC(sum, index_t);
    memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
  }
  memset(len, 0, sizeof(dim_t) * size);
  recv = col_connector->recv;
  sum=0;
  for (i_r=0; i_r<num_Pmain_cols; i_r++) {
    i1 = RAP_couple_ptr[i_r];
    i2 = RAP_couple_ptr[i_r+1];
    if (i2 > i1) {
      /* then row i_r will be in the sender of row_connector, now check
         how many neighbors i_r needs to be sent to */
      for (j=i1; j<i2; j++) {
        i_c = RAP_couple_idx[j];
        /* find out the corresponding neighbor "p" of column i_c */
        for (p=0; p<recv->numNeighbors; p++) {
          if (i_c < recv->offsetInShared[p+1]) {
            k = recv->neighbor[p];
            if (send_ptr[k][i_r] == 0) sum++;
            send_ptr[k][i_r] ++;
            send_idx[k][len[k]] = global_id_RAP[i_c];
            len[k] ++;
            break;
          }
        }
      }
    }
  }
  if (global_id_RAP) {
    TMPMEMFREE(global_id_RAP);
    global_id_RAP = NULL;
  }

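  /* After this scan, for every rank k:
       send_ptr[k][i_r] = number of entries of local row i_r that sit in
                          columns owned by rank k (0 if none), and
       send_idx[k][...] = the global column ids of those entries in row
                          order, with len[k] their total count.
     "sum" counts the distinct (row, neighbor) pairs, i.e. the total number
     of shared-row entries needed for the sender of row_connector. */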
  /* now allocate the sender */
  shared = TMPMEMALLOC(sum, index_t);
  memset(send_len, 0, sizeof(dim_t) * size);
  num_neighbors=0;
  offsetInShared[0] = 0;
  for (p=0, k=0; p<size; p++) {
    for (i=0; i<num_Pmain_cols; i++) {
      if (send_ptr[p][i] > 0) {
        shared[k] = i;
        k++;
        send_ptr[p][send_len[p]] = send_ptr[p][i];
        send_len[p]++;
      }
    }
    if (k > offsetInShared[num_neighbors]) {
      neighbor[num_neighbors] = p;
      num_neighbors ++;
      offsetInShared[num_neighbors] = k;
    }
  }
  send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                     neighbor, shared, offsetInShared, 1, 0, mpi_info);

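  /* Note that send_ptr[p] is compacted in place: the per-row counts of the
     send_len[p] rows shared with rank p are moved to the front of the array,
     so entry q now holds the count for the q-th shared row rather than being
     indexed by the local row number. This works because send_len[p] never
     overtakes the scan index i. */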
  /* send/recv the number of rows to be sent from the current proc, and
     recover the info for the receiver of row_connector from the sender */
  MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
  num_neighbors = 0;
  offsetInShared[0] = 0;
  j = 0;
  for (i=0; i<size; i++) {
    if (i != rank && recv_len[i] > 0) {
      neighbor[num_neighbors] = i;
      num_neighbors ++;
      j += recv_len[i];
      offsetInShared[num_neighbors] = j;
    }
  }
  TMPMEMFREE(shared);
  TMPMEMFREE(recv_len);
  shared = TMPMEMALLOC(j, index_t);
  k = offsetInShared[num_neighbors];
  for (i=0; i<k; i++) {
    shared[i] = i + num_Pmain_cols;
  }
  recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                     neighbor, shared, offsetInShared, 1, 0, mpi_info);
  row_connector = Paso_Connector_alloc(send, recv);
  TMPMEMFREE(shared);
  Paso_SharedComponents_free(recv);
  Paso_SharedComponents_free(send);

  /* send/recv pattern->ptr for rowCoupleBlock */
  num_RAPext_rows = offsetInShared[num_neighbors];
  row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
  for (p=0; p<num_neighbors; p++) {
    j = offsetInShared[p];
    i = offsetInShared[p+1];
    MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
              mpi_info->msg_tag_counter+neighbor[p],
              mpi_info->comm, &mpi_requests[p]);
  }
  send = row_connector->send;
  for (p=0; p<send->numNeighbors; p++) {
    MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
               MPI_INT, send->neighbor[p],
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[p+num_neighbors]);
  }
  MPI_Waitall(num_neighbors + send->numNeighbors,
              mpi_requests, mpi_stati);
  mpi_info->msg_tag_counter += size;
  TMPMEMFREE(send_len);

  sum = 0;
  for (i=0; i<num_RAPext_rows; i++) {
    k = row_couple_ptr[i];
    row_couple_ptr[i] = sum;
    sum += k;
  }
  row_couple_ptr[num_RAPext_rows] = sum;
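  /* The loop above converts the received per-row entry counts into CSR row
     offsets via an exclusive prefix sum, e.g. counts {2, 0, 3} become
     offsets {0, 2, 2} with row_couple_ptr[num_RAPext_rows] = 5, the total
     number of entries. */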

  /* send/recv pattern->index for rowCoupleBlock */
  k = row_couple_ptr[num_RAPext_rows];
  row_couple_idx = MEMALLOC(k, index_t);
  for (p=0; p<num_neighbors; p++) {
    j1 = row_couple_ptr[offsetInShared[p]];
    j2 = row_couple_ptr[offsetInShared[p+1]];
    MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
              mpi_info->msg_tag_counter+neighbor[p],
              mpi_info->comm, &mpi_requests[p]);
  }
  for (p=0; p<send->numNeighbors; p++) {
    MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
               MPI_INT, send->neighbor[p],
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[p+num_neighbors]);
  }
  MPI_Waitall(num_neighbors + send->numNeighbors,
              mpi_requests, mpi_stati);
  mpi_info->msg_tag_counter += size;

  offset = input_dist->first_component[rank];
  k = row_couple_ptr[num_RAPext_rows];
  for (i=0; i<k; i++) {
    row_couple_idx[i] -= offset;
  }

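  /* The received indices are global column ids that all fall into this
     rank's own range [dist[rank], dist[rank+1]), so subtracting the first
     component turns them into local column indices of the main block. */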
  for (i=0; i<size; i++) {
    TMPMEMFREE(send_ptr[i]);
    TMPMEMFREE(send_idx[i]);
  }
  TMPMEMFREE(send_ptr);
  TMPMEMFREE(send_idx);
  TMPMEMFREE(len);
  TMPMEMFREE(offsetInShared);
  TMPMEMFREE(neighbor);
  TMPMEMFREE(mpi_requests);
  TMPMEMFREE(mpi_stati);
  Esys_MPIInfo_free(mpi_info);

  /* Now, we can create pattern for mainBlock and coupleBlock */
  main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
                                    num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
  col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                                          num_Pmain_cols, num_RAPext_cols,
                                          RAP_couple_ptr, RAP_couple_idx);
  row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                                          num_RAPext_rows, num_Pmain_cols,
                                          row_couple_ptr, row_couple_idx);

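  /* The three CSR patterns cover the usual Paso decomposition of a
     distributed matrix: main_pattern is the square local block
     (num_Pmain_cols x num_Pmain_cols), col_couple_pattern holds local rows
     with external columns, and row_couple_pattern holds external rows that
     reference local columns. */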
  /* next, create the system matrix */
  pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
        output_dist, input_dist, main_pattern, col_couple_pattern,
        row_couple_pattern, col_connector, row_connector);
  out = Paso_SystemMatrix_alloc(A->type, pattern,
        row_block_size, col_block_size, FALSE);

  /* finally, fill in the data */
  memcpy(out->mainBlock->val, RAP_main_val,
         out->mainBlock->len * sizeof(double));
  memcpy(out->col_coupleBlock->val, RAP_couple_val,
         out->col_coupleBlock->len * sizeof(double));

  /* Clean up */
  Paso_SystemMatrixPattern_free(pattern);
  Paso_Pattern_free(main_pattern);
  Paso_Pattern_free(col_couple_pattern);
  Paso_Pattern_free(row_couple_pattern);
  Paso_Connector_free(col_connector);
  Paso_Connector_free(row_connector);
  Paso_Distribution_free(output_dist);
  Paso_Distribution_free(input_dist);
  TMPMEMFREE(RAP_main_val);
  TMPMEMFREE(RAP_couple_val);
  if (Esys_noError()) {
    return out;
  } else {
    Paso_SystemMatrix_free(out);
    return NULL;
  }
}
