
Contents of /trunk/paso/src/AMG_Interpolation.c



Revision 3828
Wed Feb 15 03:27:58 2012 UTC by lgao
File MIME type: text/plain
File size: 123385 byte(s)
Fix all warnings reported on guineapig.


1 /*******************************************************
2 *
3 * Copyright (c) 2003-2011 by University of Queensland
4 * Earth Systems Science Computational Center (ESSCC)
5 * http://www.uq.edu.au/esscc
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 *******************************************************/
12
13
14 /**************************************************************/
15
16 /* Paso: defines AMG Interpolation */
17
18 /**************************************************************/
19
20 /* Author: l.gao@uq.edu.au */
21
22 /**************************************************************/
23
24 #include "Paso.h"
25 #include "SparseMatrix.h"
26 #include "PasoUtil.h"
27 #include "Preconditioner.h"
28
29 /**************************************************************
30
31 Methods necessary for the AMG preconditioner:
32
33 construct the n_C x n_C coarse-grid operator A_C from the matrix A
34 and the prolongation matrix P.
35
36 The coarsening operator A_C is defined by RAP, where R=P^T.
37
38 ***************************************************************/
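/* Entry-wise, with R = P^T this means

       (A_C)[i,j] = sum_k sum_l P[k,i] * A[k,l] * P[l,j]

   i.e. each coarse entry accumulates fine-grid couplings through the
   prolongation weights. The code below computes the product in two
   passes per block: a symbolic pass that counts and places the
   non-zero pattern, and a numeric pass that fills in the values. */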
39
40 /* Extend the system matrix B with two extra sparse matrices:
41 B_ext_main and B_ext_couple.
42 Together these two sparse matrices represent the portion
43 of B that is stored on neighbor procs and needed locally
44 for the triple matrix product (B^T) A B.
45
46 FOR NOW, we assume that the system matrix B has a NULL
47 row_coupleBlock and a NULL remote_coupleBlock. Based on
48 this assumption, we link row_coupleBlock to the sparse
49 matrix B_ext_main, and remote_coupleBlock to the sparse
50 matrix B_ext_couple.
51
52 To be specific, B_ext (on processor k) is the group of rows
53 of B where the list of rows from processor q is given by
54 A->col_coupler->connector->send->shared[rPtr...]
55 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
56 on q. */
57 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
58 {
59 Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
60 Paso_Coupler *coupler=NULL;
61 Paso_SharedComponents *send=NULL, *recv=NULL;
62 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
63 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
64 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
65 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
66 index_t *idx_m=NULL, *idx_c=NULL;
67 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
68 index_t offset, len, block_size, block_size_size, max_num_cols;
69 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
70 dim_t *recv_degree=NULL, *send_degree=NULL;
71 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
72
73 if (size == 1) return;
74
75 if (B->row_coupleBlock) {
76 Paso_SparseMatrix_free(B->row_coupleBlock);
77 B->row_coupleBlock = NULL;
78 }
79
80 if (B->remote_coupleBlock) { /* row_coupleBlock was just cleared above */
81 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to remote_coupleBlock has already been set.");
82 return;
83 }
84
85 /* start sending/receiving the unknowns' global IDs */
86 num_main_cols = B->mainBlock->numCols;
87 cols = TMPMEMALLOC(num_main_cols, double);
88 offset = B->col_distribution->first_component[rank];
89 #pragma omp parallel for private(i) schedule(static)
90 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
91 if (B->global_id == NULL) {
92 coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
93 Paso_Coupler_startCollect(coupler, cols);
94 }
95
96 recv_buf = TMPMEMALLOC(size, index_t);
97 recv_degree = TMPMEMALLOC(size, dim_t);
98 recv_offset = TMPMEMALLOC(size+1, index_t);
99 #pragma omp parallel for private(i) schedule(static)
100 for (i=0; i<size; i++){
101 recv_buf[i] = 0;
102 recv_degree[i] = 1;
103 recv_offset[i] = i;
104 }
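   /* recv_degree and recv_offset are the recvcounts/displs arguments of
      the MPI_Allgatherv below: every rank contributes exactly one integer
      (its num_couple_cols), so all counts are 1 and the displacements
      are 0, 1, ..., size-1. */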
105
106 block_size = B->block_size;
107 block_size_size = block_size * sizeof(double);
108 num_couple_cols = B->col_coupleBlock->numCols;
109 send = A->col_coupler->connector->send;
110 recv = A->col_coupler->connector->recv;
111 num_neighbors = send->numNeighbors;
112 p = send->offsetInShared[num_neighbors];
113 len = p * B->col_distribution->first_component[size];
114 send_buf = TMPMEMALLOC(len * block_size, double);
115 send_idx = TMPMEMALLOC(len, index_t);
116 send_offset = TMPMEMALLOC((p+1)*2, index_t);
117 send_degree = TMPMEMALLOC(num_neighbors, dim_t);
118 i = num_main_cols + num_couple_cols;
119 send_m = TMPMEMALLOC(i * block_size, double);
120 send_c = TMPMEMALLOC(i * block_size, double);
121 idx_m = TMPMEMALLOC(i, index_t);
122 idx_c = TMPMEMALLOC(i, index_t);
123
124 /* wait until the unknowns' global IDs have been received */
125 if (B->global_id == NULL) {
126 Paso_Coupler_finishCollect(coupler);
127 global_id = MEMALLOC(num_couple_cols, index_t);
128 #pragma omp parallel for private(i) schedule(static)
129 for (i=0; i<num_couple_cols; ++i)
130 global_id[i] = coupler->recv_buffer[i];
131 B->global_id = global_id;
132 Paso_Coupler_free(coupler);
133 } else
134 global_id = B->global_id;
135
136 /* distribute the number of cols in the current col_coupleBlock to all ranks */
137 #ifdef ESYS_MPI
138 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
139 #endif
140
141 /* distribute the global ids of the cols to be considered to all ranks */
142 len = 0;
143 max_num_cols = 0;
144 for (i=0; i<size; i++){
145 recv_degree[i] = recv_buf[i];
146 recv_offset[i] = len;
147 len += recv_buf[i];
148 if (max_num_cols < recv_buf[i])
149 max_num_cols = recv_buf[i];
150 }
151 recv_offset[size] = len;
152 cols_array = TMPMEMALLOC(len, index_t);
153 #ifdef ESYS_MPI
154 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
155 #endif
156
157 /* first, prepare the ptr_ptr to be received */
158 q = recv->numNeighbors;
159 len = recv->offsetInShared[q];
160 ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
161 for (p=0; p<q; p++) {
162 row = recv->offsetInShared[p];
163 m = recv->offsetInShared[p + 1];
164 #ifdef ESYS_MPI
165 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
166 A->mpi_info->msg_tag_counter+recv->neighbor[p],
167 A->mpi_info->comm,
168 &(A->col_coupler->mpi_requests[p]));
169 #endif
170 }
171
172 /* now prepare the rows to be sent (the degree, the offset and the data) */
173 len = 0;
174 i0 = 0;
175 for (p=0; p<num_neighbors; p++) {
176 i = i0;
177 neighbor = send->neighbor[p];
178 m_lb = B->col_distribution->first_component[neighbor];
179 m_ub = B->col_distribution->first_component[neighbor + 1];
180 j_ub = send->offsetInShared[p + 1];
181 for (j=send->offsetInShared[p]; j<j_ub; j++) {
182 row = send->shared[j];
183 l_m = 0;
184 l_c = 0;
185 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
186 k_lb = B->col_coupleBlock->pattern->ptr[row];
187
188 /* check part of col_coupleBlock for data to be passed @row */
189 for (k=k_lb; k<k_ub; k++) {
190 m = global_id[B->col_coupleBlock->pattern->index[k]];
191 if (m > offset) break;
192 if (m>= m_lb && m < m_ub) {
193 /* data to be passed to sparse matrix B_ext_main */
194 idx_m[l_m] = m - m_lb;
195 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
196 l_m++;
197 } else {
198 /* data to be passed to sparse matrix B_ext_couple */
199 idx_c[l_c] = m;
200 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
201 l_c++;
202 }
203 }
204 k_lb = k;
205
206 /* check mainBlock for data to be passed @row to sparse
207 matrix B_ext_couple */
208 k_ub = B->mainBlock->pattern->ptr[row + 1];
209 k = B->mainBlock->pattern->ptr[row];
210 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
211 for (; k<k_ub; k++) {
212 m = B->mainBlock->pattern->index[k] + offset;
213 idx_c[l_c] = m;
214 l_c++;
215 }
216
217 /* check the remaining part of col_coupleBlock for data
218 to be passed @row to sparse matrix B_ext_couple */
219 k = k_lb;
220 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
221 for (k=k_lb; k<k_ub; k++) {
222 m = global_id[B->col_coupleBlock->pattern->index[k]];
223 if (m>= m_lb && m < m_ub) {
224 /* data to be passed to sparse matrix B_ext_main */
225 idx_m[l_m] = m - m_lb;
226 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
227 l_m++;
228 } else {
229 /* data to be passed to sparse matrix B_ext_couple */
230 idx_c[l_c] = m;
231 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
232 l_c++;
233 }
234 }
235
236 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
237 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
238 send_offset[2*i] = l_m;
239 len += l_m;
240 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
241 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
242 send_offset[2*i+1] = l_c;
243 len += l_c;
244 i++;
245 }
246
247 /* sending */
248 #ifdef ESYS_MPI
249 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
250 A->mpi_info->msg_tag_counter+rank,
251 A->mpi_info->comm,
252 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
253 #endif
254 send_degree[p] = len;
255 i0 = i;
256 }
257 TMPMEMFREE(send_m);
258 TMPMEMFREE(send_c);
259 TMPMEMFREE(idx_m);
260 TMPMEMFREE(idx_c);
261
262
263 q = recv->numNeighbors;
264 len = recv->offsetInShared[q];
265 ptr_main = MEMALLOC((len+1), index_t);
266 ptr_couple = MEMALLOC((len+1), index_t);
267
268 #ifdef ESYS_MPI
269 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
270 A->col_coupler->mpi_requests,
271 A->col_coupler->mpi_stati);
272 #endif
273 A->mpi_info->msg_tag_counter += size;
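   /* Message tags are chosen as msg_tag_counter + <rank of the sending
      process>, which keeps concurrent point-to-point exchanges apart;
      advancing the counter by size after each round ensures that later
      rounds never reuse a tag. */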
274
275 j = 0;
276 k = 0;
277 ptr_main[0] = 0;
278 ptr_couple[0] = 0;
279 for (i=0; i<len; i++) {
280 j += ptr_ptr[2*i];
281 k += ptr_ptr[2*i+1];
282 ptr_main[i+1] = j;
283 ptr_couple[i+1] = k;
284 }
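   /* The received ptr_ptr holds, for each row, the interleaved pair
      (main degree, couple degree). The running sums j and k above turn
      these degrees into CRS row-pointer arrays: for example, the degree
      pairs (2,1) and (0,3) for two rows give ptr_main = {0,2,2} and
      ptr_couple = {0,1,4}. */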
285
286 TMPMEMFREE(ptr_ptr);
287 idx_main = MEMALLOC(j, index_t);
288 idx_couple = MEMALLOC(k, index_t);
289 ptr_idx = TMPMEMALLOC(j+k, index_t);
290 ptr_val = TMPMEMALLOC((j+k) * block_size, double);
291
292 /* send/receive index array */
293 j=0;
294 k_ub = 0;
295 for (p=0; p<recv->numNeighbors; p++) {
296 k = recv->offsetInShared[p];
297 m = recv->offsetInShared[p+1];
298 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
299 if (i > 0) {
300 k_ub ++;
301 #ifdef ESYS_MPI
302 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
303 A->mpi_info->msg_tag_counter+recv->neighbor[p],
304 A->mpi_info->comm,
305 &(A->col_coupler->mpi_requests[p]));
306 #endif
307 }
308 j += i;
309 }
310
311 j=0;
312 k_ub = 0;
313 for (p=0; p<num_neighbors; p++) {
314 i = send_degree[p] - j;
315 if (i > 0){
316 k_ub ++;
317 #ifdef ESYS_MPI
318 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
319 A->mpi_info->msg_tag_counter+rank,
320 A->mpi_info->comm,
321 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
322 #endif
323 }
324 j = send_degree[p];
325 }
326
327 #ifdef ESYS_MPI
328 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
329 A->col_coupler->mpi_requests,
330 A->col_coupler->mpi_stati);
331 #endif
332 A->mpi_info->msg_tag_counter += size;
333
334 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
335 for (i=0; i<len; i++) {
336 j = ptr_main[i];
337 k = ptr_main[i+1];
338 m = ptr_couple[i];
339 for (p=j; p<k; p++) {
340 idx_main[p] = ptr_idx[m+p];
341 }
342 j = ptr_couple[i+1];
343 for (p=m; p<j; p++) {
344 idx_couple[p] = ptr_idx[k+p];
345 }
346 }
347 TMPMEMFREE(ptr_idx);
348
349 /* allocate pattern and sparse matrix for B_ext_main */
350 pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
351 len, num_main_cols, ptr_main, idx_main);
352 B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
353 pattern_main, B->row_block_size, B->col_block_size,
354 FALSE);
355 Paso_Pattern_free(pattern_main);
356
357 /* allocate pattern and sparse matrix for B_ext_couple */
358 pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
359 len, B->col_distribution->first_component[size],
360 ptr_couple, idx_couple);
361 B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
362 pattern_couple, B->row_block_size, B->col_block_size,
363 FALSE);
364 Paso_Pattern_free(pattern_couple);
365
366 /* send/receive value array */
367 j=0;
368 for (p=0; p<recv->numNeighbors; p++) {
369 k = recv->offsetInShared[p];
370 m = recv->offsetInShared[p+1];
371 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
372 #ifdef ESYS_MPI
373 if (i > 0)
374 MPI_Irecv(&(ptr_val[j]), i * block_size,
375 MPI_DOUBLE, recv->neighbor[p],
376 A->mpi_info->msg_tag_counter+recv->neighbor[p],
377 A->mpi_info->comm,
378 &(A->col_coupler->mpi_requests[p]));
379 #endif
380 j += (i * block_size);
381 }
382
383 j=0;
384 for (p=0; p<num_neighbors; p++) {
385 i = send_degree[p] - j;
386 #ifdef ESYS_MPI
387 if (i > 0)
388 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
389 A->mpi_info->msg_tag_counter+rank,
390 A->mpi_info->comm,
391 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
392 #endif
393 j = send_degree[p] ;
394 }
395
396 #ifdef ESYS_MPI
397 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
398 A->col_coupler->mpi_requests,
399 A->col_coupler->mpi_stati);
400 #endif
401 A->mpi_info->msg_tag_counter += size;
402
403 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
404 for (i=0; i<len; i++) {
405 j = ptr_main[i];
406 k = ptr_main[i+1];
407 m = ptr_couple[i];
408 for (p=j; p<k; p++) {
409 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
410 }
411 j = ptr_couple[i+1];
412 for (p=m; p<j; p++) {
413 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
414 }
415 }
416 TMPMEMFREE(ptr_val);
417
418
419 /* release all temporary memory allocations */
420 TMPMEMFREE(cols);
421 TMPMEMFREE(cols_array);
422 TMPMEMFREE(recv_offset);
423 TMPMEMFREE(recv_degree);
424 TMPMEMFREE(recv_buf);
425 TMPMEMFREE(send_buf);
426 TMPMEMFREE(send_offset);
427 TMPMEMFREE(send_degree);
428 TMPMEMFREE(send_idx);
429 }
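/* On return from Paso_Preconditioner_AMG_extendB, B->row_coupleBlock
   holds B_ext_main (columns indexed relative to the local main block)
   and B->remote_coupleBlock holds B_ext_couple (columns indexed by
   global ids), as described in the comment above the function. */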
430
431 /* The sparse matrix (let's call it T) defined by T(ptr, idx, val)
432 has the same number of rows as P->col_coupleBlock->numCols. Now we need
433 to copy blocks of data in T to the neighbor processors given by
434 P->col_coupler->connector->recv->neighbor[k], where k is in
435 [0, P->col_coupler->connector->recv->numNeighbors).
436 The rows to be copied to neighbor processor k are in the list defined by
437 P->col_coupler->connector->recv->offsetInShared[k] ...
438 P->col_coupler->connector->recv->offsetInShared[k+1] */
439 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
440 index_t **p_ptr, index_t **p_idx, double **p_val,
441 index_t *global_id, index_t block_size)
442 {
443 Paso_SharedComponents *send=NULL, *recv=NULL;
444 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
445 index_t i, j, p, m, n, rank, size, offset;
446 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
447 index_t *ptr=*p_ptr, *idx=*p_idx;
448 double *val=*p_val, *recv_val=NULL;
449
450 rank = P->mpi_info->rank;
451 size = P->mpi_info->size;
452 send = P->col_coupler->connector->recv;
453 recv = P->col_coupler->connector->send;
454 send_neighbors = send->numNeighbors;
455 recv_neighbors = recv->numNeighbors;
456 send_rows = P->col_coupleBlock->numCols;
457 recv_rows = recv->offsetInShared[recv_neighbors];
458 offset = P->col_distribution->first_component[rank];
459
460 send_degree = TMPMEMALLOC(send_rows, index_t);
461 recv_ptr = MEMALLOC(recv_rows + 1, index_t);
462 #pragma omp for schedule(static) private(i)
463 for (i=0; i<send_rows; i++)
464 send_degree[i] = ptr[i+1] - ptr[i];
465
466 /* First, send/receive the degree */
467 for (p=0; p<recv_neighbors; p++) { /* Receiving */
468 m = recv->offsetInShared[p];
469 n = recv->offsetInShared[p+1];
470 #ifdef ESYS_MPI
471 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
472 P->mpi_info->msg_tag_counter + recv->neighbor[p],
473 P->mpi_info->comm,
474 &(P->col_coupler->mpi_requests[p]));
475 #endif
476 }
477 for (p=0; p<send_neighbors; p++) { /* Sending */
478 m = send->offsetInShared[p];
479 n = send->offsetInShared[p+1];
480 #ifdef ESYS_MPI
481 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
482 P->mpi_info->msg_tag_counter + rank,
483 P->mpi_info->comm,
484 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
485 #endif
486 }
487 #ifdef ESYS_MPI
488 MPI_Waitall(send_neighbors+recv_neighbors,
489 P->col_coupler->mpi_requests,
490 P->col_coupler->mpi_stati);
491 #endif
492 P->mpi_info->msg_tag_counter += size;
493
494 TMPMEMFREE(send_degree);
495 m = Paso_Util_cumsum(recv_rows, recv_ptr);
496 recv_ptr[recv_rows] = m;
497 recv_idx = MEMALLOC(m, index_t);
498 recv_val = MEMALLOC(m * block_size, double);
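   /* Paso_Util_cumsum replaces the received per-row degrees in recv_ptr
      by their exclusive prefix sums and returns the total, e.g. degrees
      {2,0,3} become {0,2,2} with m = 5, turning recv_ptr into a CRS
      row-pointer array of length recv_rows+1. */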
499
500 /* Next, send/receive the index array */
501 j = 0;
502 for (p=0; p<recv_neighbors; p++) { /* Receiving */
503 m = recv->offsetInShared[p];
504 n = recv->offsetInShared[p+1];
505 i = recv_ptr[n] - recv_ptr[m];
506 if (i > 0) {
507 #ifdef ESYS_MPI
508 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
509 P->mpi_info->msg_tag_counter + recv->neighbor[p],
510 P->mpi_info->comm,
511 &(P->col_coupler->mpi_requests[p]));
512 #endif
513 }
514 j += i;
515 }
516
517 j = 0;
518 for (p=0; p<send_neighbors; p++) { /* Sending */
519 m = send->offsetInShared[p];
520 n = send->offsetInShared[p+1];
521 i = ptr[n] - ptr[m];
522 if (i >0) {
523 #ifdef ESYS_MPI
524 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
525 P->mpi_info->msg_tag_counter + rank,
526 P->mpi_info->comm,
527 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
528 #endif
529 j += i;
530 }
531 }
532 #ifdef ESYS_MPI
533 MPI_Waitall(send_neighbors+recv_neighbors,
534 P->col_coupler->mpi_requests,
535 P->col_coupler->mpi_stati);
536 #endif
537 P->mpi_info->msg_tag_counter += size;
538
539 /* Last, send/receive the data array */
540 j = 0;
541 for (p=0; p<recv_neighbors; p++) { /* Receiving */
542 m = recv->offsetInShared[p];
543 n = recv->offsetInShared[p+1];
544 i = recv_ptr[n] - recv_ptr[m];
545 #ifdef ESYS_MPI
546 if (i > 0)
547 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
548 P->mpi_info->msg_tag_counter + recv->neighbor[p],
549 P->mpi_info->comm,
550 &(P->col_coupler->mpi_requests[p]));
551 #endif
552 j += (i*block_size);
553 }
554
555 j = 0;
556 for (p=0; p<send_neighbors; p++) { /* Sending */
557 m = send->offsetInShared[p];
558 n = send->offsetInShared[p+1];
559 i = ptr[n] - ptr[m];
560 if (i >0) {
561 #ifdef ESYS_MPI
562 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
563 P->mpi_info->msg_tag_counter + rank,
564 P->mpi_info->comm,
565 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
566 #endif
567 j += i * block_size;
568 }
569 }
570 #ifdef ESYS_MPI
571 MPI_Waitall(send_neighbors+recv_neighbors,
572 P->col_coupler->mpi_requests,
573 P->col_coupler->mpi_stati);
574 #endif
575 P->mpi_info->msg_tag_counter += size;
576
577 /* Clean up and return with the received ptr, index and data arrays */
578 MEMFREE(ptr);
579 MEMFREE(idx);
580 MEMFREE(val);
581 *p_ptr = recv_ptr;
582 *p_idx = recv_idx;
583 *p_val = recv_val;
584 }
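/* Note the reversed roles in the function above: the rows of T live on
   the column side of P's coupler, so the data flows against the usual
   direction and the coupler's "recv" side acts as the sender while its
   "send" side acts as the receiver. */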
585
586 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
587 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
588 {
589 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
590 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
591 Paso_SystemMatrix *out=NULL;
592 Paso_SystemMatrixPattern *pattern=NULL;
593 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
594 Paso_SharedComponents *send =NULL, *recv=NULL;
595 Paso_Connector *col_connector=NULL, *row_connector=NULL;
596 Paso_Pattern *main_pattern=NULL;
597 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
598 const dim_t row_block_size=A->row_block_size;
599 const dim_t col_block_size=A->col_block_size;
600 const dim_t block_size = A->block_size;
601 const dim_t P_block_size = P->block_size;
602 const double ZERO = 0.0;
603 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
604 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
605 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
606 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
607 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
608 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
609 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
610 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
611 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
612 index_t sum, i, j, k, iptr, irb, icb, ib;
613 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
614 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
615 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
616 index_t row_marker_ext, *where_p=NULL;
617 index_t **send_ptr=NULL, **send_idx=NULL;
618 dim_t p, num_neighbors;
619 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
620 Esys_MPI_rank *neighbor=NULL;
621 #ifdef ESYS_MPI
622 MPI_Request* mpi_requests=NULL;
623 MPI_Status* mpi_stati=NULL;
624 #else
625 int *mpi_requests=NULL, *mpi_stati=NULL;
626 #endif
627
628 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
629 return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
630
631 /* two sparse matrices R_main and R_couple will be generated as the
632 transposes of P_main and P_col_couple, respectively. Note that
633 R_couple is actually the row_coupleBlock of R (R=P^T) */
634 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
635 if (size > 1)
636 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
637
638 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
639 and needed locally for the triple matrix product RAP.
640 To be specific, P_ext (on processor k) is the group of rows of P where
641 the list of rows from processor q is given by
642 A->col_coupler->connector->send->shared[rPtr...]
643 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
644 on q.
645 P_ext is represented by the two sparse matrices P_ext_main and
646 P_ext_couple */
647 Paso_Preconditioner_AMG_extendB(A, P);
648
649 /* count the number of cols in P_ext_couple, resize the pattern of
650 sparse matrix P_ext_couple with new compressed order, and then
651 build the col id mapping from P->col_coupleBlock to
652 P_ext_couple */
653 num_Pmain_cols = P->mainBlock->numCols;
654 if (size > 1) {
655 num_Pcouple_cols = P->col_coupleBlock->numCols;
656 num_Acouple_cols = A->col_coupleBlock->numCols;
657 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
658 } else {
659 num_Pcouple_cols = 0;
660 num_Acouple_cols = 0;
661 sum = 0;
662 }
663 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
664 offset = P->col_distribution->first_component[rank];
665 num_Pext_cols = 0;
666 if (P->global_id) {
667 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
668 and P->col_coupleBlock */
669 iptr = 0;
670 if (num_Pcouple_cols || sum > 0) {
671 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
672 #pragma omp parallel for lastprivate(iptr) schedule(static)
673 for (iptr=0; iptr<sum; iptr++){
674 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
675 }
676 for (j=0; j<num_Pcouple_cols; j++, iptr++){
677 temp[iptr] = P->global_id[j];
678 }
679 }
680 if (iptr) {
681 #ifdef USE_QSORTG
682 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
683 #else
684 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
685 #endif
686 num_Pext_cols = 1;
687 i = temp[0];
688 for (j=1; j<iptr; j++) {
689 if (temp[j] > i) {
690 i = temp[j];
691 temp[num_Pext_cols++] = i;
692 }
693 }
694 }
695 /* resize the pattern of P_ext_couple */
696 if(num_Pext_cols){
697 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
698 #pragma omp parallel for private(i) schedule(static)
699 for (i=0; i<num_Pext_cols; i++)
700 global_id_P[i] = temp[i];
701 }
702 if (num_Pcouple_cols || sum > 0)
703 TMPMEMFREE(temp);
704 #pragma omp parallel for private(i, where_p) schedule(static)
705 for (i=0; i<sum; i++) {
706 where_p = (index_t *)bsearch(
707 &(P->remote_coupleBlock->pattern->index[i]),
708 global_id_P, num_Pext_cols,
709 sizeof(index_t), Paso_comparIndex);
710 P->remote_coupleBlock->pattern->index[i] =
711 (index_t)(where_p -global_id_P);
712 }
713
714 /* build the mapping */
715 if (num_Pcouple_cols) {
716 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
717 iptr = 0;
718 for (i=0; i<num_Pext_cols; i++)
719 if (global_id_P[i] == P->global_id[iptr]) {
720 Pcouple_to_Pext[iptr++] = i;
721 if (iptr == num_Pcouple_cols) break;
722 }
723 }
724 }
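   /* The forward scan building Pcouple_to_Pext relies on both lists
      being in ascending order: global_id_P was just sorted and
      deduplicated, and the scan assumes P->global_id is ascending as
      well. */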
725
726 /* allocate and initialize the markers */
727 sum = num_Pext_cols + num_Pmain_cols;
728 P_marker = TMPMEMALLOC(sum, index_t);
729 A_marker = TMPMEMALLOC(num_A_cols, index_t);
730 #pragma omp parallel for private(i) schedule(static)
731 for (i=0; i<sum; i++) P_marker[i] = -1;
732 #pragma omp parallel for private(i) schedule(static)
733 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
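   /* P_marker and A_marker implement the usual sparse-accumulator trick
      for symbolic sparse products: a marker value older than the
      start-of-row sentinel (row_marker, resp. the row index i_r) means
      the entry has not been touched in the current row, so the arrays
      never have to be reset between rows. */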
734
735 /* Now, count the size of RAP_ext. Start with rows in R_couple */
736 sum = 0;
737 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
738 row_marker = sum;
739 /* then, loop over elements in row i_r of R_couple */
740 j1_ub = R_couple->pattern->ptr[i_r+1];
741 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
742 i1 = R_couple->pattern->index[j1];
743 /* then, loop over elements in row i1 of A->col_coupleBlock */
744 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
745 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
746 i2 = A->col_coupleBlock->pattern->index[j2];
747
748 /* check whether entry RA[i_r, i2] has been previously visited.
749 A new RAP entry is possible only if entry RA[i_r, i2] has
750 not been visited yet */
751 if (A_marker[i2] != i_r) {
752 /* first, mark entry RA[i_r,i2] as visited */
753 A_marker[i2] = i_r;
754
755 /* then loop over elements in row i2 of P_ext_main */
756 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
757 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
758 i_c = P->row_coupleBlock->pattern->index[j3];
759
760 /* check whether entry RAP[i_r,i_c] has been created.
761 If not yet created, create the entry and increase
762 the total number of elements in RAP_ext */
763 if (P_marker[i_c] < row_marker) {
764 P_marker[i_c] = sum;
765 sum++;
766 }
767 }
768
769 /* loop over elements in row i2 of P_ext_couple, do the same */
770 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
771 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
772 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
773 if (P_marker[i_c] < row_marker) {
774 P_marker[i_c] = sum;
775 sum++;
776 }
777 }
778 }
779 }
780
781 /* now loop over elements in row i1 of A->mainBlock, repeat
782 the process we have done to block A->col_coupleBlock */
783 j2_ub = A->mainBlock->pattern->ptr[i1+1];
784 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
785 i2 = A->mainBlock->pattern->index[j2];
786 if (A_marker[i2 + num_Acouple_cols] != i_r) {
787 A_marker[i2 + num_Acouple_cols] = i_r;
788 j3_ub = P->mainBlock->pattern->ptr[i2+1];
789 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
790 i_c = P->mainBlock->pattern->index[j3];
791 if (P_marker[i_c] < row_marker) {
792 P_marker[i_c] = sum;
793 sum++;
794 }
795 }
796 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
797 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
798 /* note that we need to map the column index in
799 P->col_coupleBlock back into the column index in
800 P_ext_couple */
801 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
802 if (P_marker[i_c] < row_marker) {
803 P_marker[i_c] = sum;
804 sum++;
805 }
806 }
807 }
808 }
809 }
810 }
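   /* The pass above was purely symbolic: it only counted entries via
      the markers. The pass below repeats the same traversal numerically,
      using RA_val as scratch for one block of R*A and RAP_val for one
      block of (R*A)*P. */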
811
812 /* Now we have the number of non-zero elements in RAP_ext. Allocate
813 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
814 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
815 RAP_ext_idx = MEMALLOC(sum, index_t);
816 RAP_ext_val = MEMALLOC(sum * block_size, double);
817 RA_val = TMPMEMALLOC(block_size, double);
818 RAP_val = TMPMEMALLOC(block_size, double);
819
820 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
821 sum = num_Pext_cols + num_Pmain_cols;
822 #pragma omp parallel for private(i) schedule(static)
823 for (i=0; i<sum; i++) P_marker[i] = -1;
824 #pragma omp parallel for private(i) schedule(static)
825 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
826 sum = 0;
827 RAP_ext_ptr[0] = 0;
828 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
829 row_marker = sum;
830 /* then, loop over elements in row i_r of R_couple */
831 j1_ub = R_couple->pattern->ptr[i_r+1];
832 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
833 i1 = R_couple->pattern->index[j1];
834 R_val = &(R_couple->val[j1*P_block_size]);
835
836 /* then, loop over elements in row i1 of A->col_coupleBlock */
837 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
838 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
839 i2 = A->col_coupleBlock->pattern->index[j2];
840 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
841 for (irb=0; irb<row_block_size; irb++)
842 for (icb=0; icb<col_block_size; icb++)
843 RA_val[irb+row_block_size*icb] = ZERO;
844 for (irb=0; irb<P_block_size; irb++) {
845 rtmp=R_val[irb];
846 for (icb=0; icb<col_block_size; icb++) {
847 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
848 }
849 }
850
851 /* check whether entry RA[i_r, i2] has been previously visited.
852 A new RAP entry is possible only if entry RA[i_r, i2] has
853 not been visited yet */
854 if (A_marker[i2] != i_r) {
855 /* first, mark entry RA[i_r,i2] as visited */
856 A_marker[i2] = i_r;
857
858 /* then loop over elements in row i2 of P_ext_main */
859 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
860 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
861 i_c = P->row_coupleBlock->pattern->index[j3];
862 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
863 for (irb=0; irb<row_block_size; irb++)
864 for (icb=0; icb<col_block_size; icb++)
865 RAP_val[irb+row_block_size*icb] = ZERO;
866 for (icb=0; icb<P_block_size; icb++) {
867 rtmp = temp_val[icb];
868 for (irb=0; irb<row_block_size; irb++) {
869 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
870 }
871 }
872
873 /* check whether entry RAP[i_r,i_c] has been created.
874 If not yet created, create the entry and increase
875 the total number of elements in RAP_ext */
876 if (P_marker[i_c] < row_marker) {
877 P_marker[i_c] = sum;
878 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
879 RAP_ext_idx[sum] = i_c + offset;
880 sum++;
881 } else {
882 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
883 for (ib=0; ib<block_size; ib++) {
884 temp_val[ib] += RAP_val[ib];
885 }
886 }
887 }
888
889 /* loop over elements in row i2 of P_ext_couple, do the same */
890 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
891 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
892 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
893 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
894 for (irb=0; irb<row_block_size; irb++)
895 for (icb=0; icb<col_block_size; icb++)
896 RAP_val[irb+row_block_size*icb]=ZERO;
897 for (icb=0; icb<P_block_size; icb++) {
898 rtmp = temp_val[icb];
899 for (irb=0; irb<row_block_size; irb++) {
900 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
901 }
902 }
903
904 if (P_marker[i_c] < row_marker) {
905 P_marker[i_c] = sum;
906 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
907 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
908 sum++;
909 } else {
910 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
911 for (ib=0; ib<block_size; ib++) {
912 temp_val[ib] += RAP_val[ib];
913 }
914 }
915 }
916
917 /* If entry RA[i_r, i2] has been visited, no new RAP entry is
918 created; only the contributions are added. */
919 } else {
920 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
921 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
922 i_c = P->row_coupleBlock->pattern->index[j3];
923 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
924 for (irb=0; irb<row_block_size; irb++)
925 for (icb=0; icb<col_block_size; icb++)
926 RAP_val[irb+row_block_size*icb]=ZERO;
927 for (icb=0; icb<P_block_size; icb++) {
928 rtmp = temp_val[icb];
929 for (irb=0; irb<row_block_size; irb++) {
930 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
931 }
932 }
933
934 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
935 for (ib=0; ib<block_size; ib++) {
936 temp_val[ib] += RAP_val[ib];
937 }
938 }
939 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
940 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
941 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
942 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
943 for (irb=0; irb<row_block_size; irb++)
944 for (icb=0; icb<col_block_size; icb++)
945 RAP_val[irb+row_block_size*icb]=ZERO;
946 for (icb=0; icb<P_block_size; icb++) {
947 rtmp = temp_val[icb];
948 for (irb=0; irb<row_block_size; irb++) {
949 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
950 }
951 }
952
953 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
954 for (ib=0; ib<block_size; ib++) {
955 temp_val[ib] += RAP_val[ib];
956 }
957 }
958 }
959 }
960
961 /* now loop over elements in row i1 of A->mainBlock, repeat
962 the process we have done to block A->col_coupleBlock */
963 j2_ub = A->mainBlock->pattern->ptr[i1+1];
964 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
965 i2 = A->mainBlock->pattern->index[j2];
966 temp_val = &(A->mainBlock->val[j2*block_size]);
967 for (irb=0; irb<row_block_size; irb++)
968 for (icb=0; icb<col_block_size; icb++)
969 RA_val[irb+row_block_size*icb]=ZERO;
970 for (irb=0; irb<P_block_size; irb++) {
971 rtmp=R_val[irb];
972 for (icb=0; icb<col_block_size; icb++) {
973 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
974 }
975 }
976
977 if (A_marker[i2 + num_Acouple_cols] != i_r) {
978 A_marker[i2 + num_Acouple_cols] = i_r;
979 j3_ub = P->mainBlock->pattern->ptr[i2+1];
980 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
981 i_c = P->mainBlock->pattern->index[j3];
982 temp_val = &(P->mainBlock->val[j3*P_block_size]);
983 for (irb=0; irb<row_block_size; irb++)
984 for (icb=0; icb<col_block_size; icb++)
985 RAP_val[irb+row_block_size*icb]=ZERO;
986 for (icb=0; icb<P_block_size; icb++) {
987 rtmp = temp_val[icb];
988 for (irb=0; irb<row_block_size; irb++) {
989 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
990 }
991 }
992
993 if (P_marker[i_c] < row_marker) {
994 P_marker[i_c] = sum;
995 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
996 RAP_ext_idx[sum] = i_c + offset;
997 sum++;
998 } else {
999 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1000 for (ib=0; ib<block_size; ib++) {
1001 temp_val[ib] += RAP_val[ib];
1002 }
1003 }
1004 }
1005 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1006 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1007 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1008 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1009 for (irb=0; irb<row_block_size; irb++)
1010 for (icb=0; icb<col_block_size; icb++)
1011 RAP_val[irb+row_block_size*icb]=ZERO;
1012 for (icb=0; icb<P_block_size; icb++) {
1013 rtmp=temp_val[icb];
1014 for (irb=0; irb<row_block_size; irb++) {
1015 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1016 }
1017 }
1018
1019 if (P_marker[i_c] < row_marker) {
1020 P_marker[i_c] = sum;
1021 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1022 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1023 sum++;
1024 } else {
1025 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1026 for (ib=0; ib<block_size; ib++) {
1027 temp_val[ib] += RAP_val[ib];
1028 }
1029 }
1030 }
1031 } else {
1032 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1033 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1034 i_c = P->mainBlock->pattern->index[j3];
1035 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1036 for (irb=0; irb<row_block_size; irb++)
1037 for (icb=0; icb<col_block_size; icb++)
1038 RAP_val[irb+row_block_size*icb]=ZERO;
1039 for (icb=0; icb<P_block_size; icb++) {
1040 rtmp=temp_val[icb];
1041 for (irb=0; irb<row_block_size; irb++) {
1042 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1043 }
1044 }
1045
1046 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1047 for (ib=0; ib<block_size; ib++) {
1048 temp_val[ib] += RAP_val[ib];
1049 }
1050 }
1051 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1052 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1053 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1054 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1055 for (irb=0; irb<row_block_size; irb++)
1056 for (icb=0; icb<col_block_size; icb++)
1057 RAP_val[irb+row_block_size*icb]=ZERO;
1058 for (icb=0; icb<P_block_size; icb++) {
1059 rtmp=temp_val[icb];
1060 for (irb=0; irb<row_block_size; irb++) {
1061 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1062 }
1063 }
1064
1065 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1066 for (ib=0; ib<block_size; ib++) {
1067 temp_val[ib] += RAP_val[ib];
1068 }
1069 }
1070 }
1071 }
1072 }
1073 RAP_ext_ptr[i_r+1] = sum;
1074 }
1075 TMPMEMFREE(P_marker);
1076 TMPMEMFREE(Pcouple_to_Pext);
1077
1078 /* Now we have the part of RAP[r,c] where the rows "r" are given
1079 by the column list of P->col_coupleBlock, and the columns "c"
1080 possibly cover the whole column range of the system matrix P.
1081 This part of the data is to be passed to the neighboring
1082 processors, and added to the corresponding RAP[r,c] entries
1083 on those processors */
1084 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1085 &RAP_ext_val, global_id_P, block_size);
1086
1087 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1088 sum = RAP_ext_ptr[num_RAPext_rows];
1089 num_RAPext_cols = 0;
1090 if (num_Pext_cols || sum > 0) {
1091 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
1092 j1_ub = offset + num_Pmain_cols;
1093 for (i=0, iptr=0; i<sum; i++) {
1094 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1095 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1096 }
1097 for (j=0; j<num_Pext_cols; j++, iptr++){
1098 temp[iptr] = global_id_P[j];
1099 }
1100
1101 if (iptr) {
1102 #ifdef USE_QSORTG
1103 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1104 #else
1105 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1106 #endif
1107 num_RAPext_cols = 1;
1108 i = temp[0];
1109 for (j=1; j<iptr; j++) {
1110 if (temp[j] > i) {
1111 i = temp[j];
1112 temp[num_RAPext_cols++] = i;
1113 }
1114 }
1115 }
1116 }
1117
1118 /* collect the global ids of the external columns of RAP */
1119 if(num_RAPext_cols){
1120 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
1121 #pragma omp parallel for private(i) schedule(static)
1122 for (i=0; i<num_RAPext_cols; i++)
1123 global_id_RAP[i] = temp[i];
1124 }
1125 if (num_Pext_cols || sum > 0)
1126 TMPMEMFREE(temp);
1127 j1_ub = offset + num_Pmain_cols;
1128 #pragma omp parallel for private(i, where_p) schedule(static)
1129 for (i=0; i<sum; i++) {
1130 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1131 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1132 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1133 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1134 } else
1135 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1136 }
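   /* Every external index is present in global_id_RAP, since that list
      was collected from exactly these RAP_ext_idx entries plus
      global_id_P, so the bsearch cannot fail and the pointer difference
      yields the compressed local column number. */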
1137
1138 /* build the mapping */
1139 if (num_Pcouple_cols) {
1140 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
1141 iptr = 0;
1142 for (i=0; i<num_RAPext_cols; i++)
1143 if (global_id_RAP[i] == P->global_id[iptr]) {
1144 Pcouple_to_RAP[iptr++] = i;
1145 if (iptr == num_Pcouple_cols) break;
1146 }
1147 }
1148
1149 if (num_Pext_cols) {
1150 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
1151 iptr = 0;
1152 for (i=0; i<num_RAPext_cols; i++)
1153 if (global_id_RAP[i] == global_id_P[iptr]) {
1154 Pext_to_RAP[iptr++] = i;
1155 if (iptr == num_Pext_cols) break;
1156 }
1157 }
1158
1159 if (global_id_P){
1160 TMPMEMFREE(global_id_P);
1161 global_id_P = NULL;
1162 }
1163
1164 /* allocate and initialize the markers */
1165 sum = num_RAPext_cols + num_Pmain_cols;
1166 P_marker = TMPMEMALLOC(sum, index_t);
1167 #pragma omp parallel for private(i) schedule(static)
1168 for (i=0; i<sum; i++) P_marker[i] = -1;
1169 #pragma omp parallel for private(i) schedule(static)
1170 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1171
1172 /* Now, count the size of RAP. Start with rows in R_main */
1173 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1174 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1175 shared = P->col_coupler->connector->send->shared;
1176 i = 0;
1177 j = 0;
1178 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1179 /* Mark the start of row for both main block and couple block */
1180 row_marker = i;
1181 row_marker_ext = j;
1182
1183 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1184 in RAP_ext */
1185 P_marker[i_r] = i;
1186 i++;
1187 for (j1=0; j1<num_neighbors; j1++) {
1188 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1189 if (shared[j2] == i_r) {
1190 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1191 i_c = RAP_ext_idx[k];
1192 if (i_c < num_Pmain_cols) {
1193 if (P_marker[i_c] < row_marker) {
1194 P_marker[i_c] = i;
1195 i++;
1196 }
1197 } else {
1198 if (P_marker[i_c] < row_marker_ext) {
1199 P_marker[i_c] = j;
1200 j++;
1201 }
1202 }
1203 }
1204 break;
1205 }
1206 }
1207 }
1208
1209 /* then, loop over elements in row i_r of R_main */
1210 j1_ub = R_main->pattern->ptr[i_r+1];
1211 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1212 i1 = R_main->pattern->index[j1];
1213
1214 /* then, loop over elements in row i1 of A->col_coupleBlock */
1215 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1216 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1217 i2 = A->col_coupleBlock->pattern->index[j2];
1218
1219 /* check whether entry RA[i_r, i2] has been previously visited.
1220 A new RAP entry is possible only if entry RA[i_r, i2] has
1221 not been visited yet */
1222 if (A_marker[i2] != i_r) {
1223 /* first, mark entry RA[i_r,i2] as visited */
1224 A_marker[i2] = i_r;
1225
1226 /* then loop over elements in row i2 of P_ext_main */
1227 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1228 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1229 i_c = P->row_coupleBlock->pattern->index[j3];
1230
1231 /* check whether entry RAP[i_r,i_c] has been created.
1232 If not yet created, create the entry and increase
1233 the total number of elements in RAP_ext */
1234 if (P_marker[i_c] < row_marker) {
1235 P_marker[i_c] = i;
1236 i++;
1237 }
1238 }
1239
1240 /* loop over elements in row i2 of P_ext_couple, do the same */
1241 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1242 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1243 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1244 if (P_marker[i_c] < row_marker_ext) {
1245 P_marker[i_c] = j;
1246 j++;
1247 }
1248 }
1249 }
1250 }
1251
1252 /* now loop over elements in row i1 of A->mainBlock, repeat
1253 the process we have done to block A->col_coupleBlock */
1254 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1255 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1256 i2 = A->mainBlock->pattern->index[j2];
1257 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1258 A_marker[i2 + num_Acouple_cols] = i_r;
1259 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1260 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1261 i_c = P->mainBlock->pattern->index[j3];
1262 if (P_marker[i_c] < row_marker) {
1263 P_marker[i_c] = i;
1264 i++;
1265 }
1266 }
1267 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1268 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1269 /* note that we need to map the column index in
1270 P->col_coupleBlock back into the column index in
1271 P_ext_couple */
1272 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1273 if (P_marker[i_c] < row_marker_ext) {
1274 P_marker[i_c] = j;
1275 j++;
1276 }
1277 }
1278 }
1279 }
1280 }
1281 }
1282
1283 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1284 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1285 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1286 RAP_couple */
1287 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1288 RAP_main_idx = MEMALLOC(i, index_t);
1289 RAP_main_val = TMPMEMALLOC(i * block_size, double);
1290 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1291 RAP_couple_idx = MEMALLOC(j, index_t);
1292 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
1293
1294 RAP_main_ptr[num_Pmain_cols] = i;
1295 RAP_couple_ptr[num_Pmain_cols] = j;
1296
1297 #pragma omp parallel for private(i) schedule(static)
1298 for (i=0; i<sum; i++) P_marker[i] = -1;
1299 #pragma omp parallel for private(i) schedule(static)
1300 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1301
1302 /* Now, fill in the data for RAP_main and RAP_couple, starting with
1303 rows in R_main */
1304 i = 0;
1305 j = 0;
1306 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1307 /* Mark the start of row for both main block and couple block */
1308 row_marker = i;
1309 row_marker_ext = j;
1310 RAP_main_ptr[i_r] = row_marker;
1311 RAP_couple_ptr[i_r] = row_marker_ext;
1312
1313 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1314 in row i_r of RAP_ext */
1315 P_marker[i_r] = i;
1316 for (ib=0; ib<block_size; ib++)
1317 RAP_main_val[i*block_size+ib] = ZERO;
1318 RAP_main_idx[i] = i_r;
1319 i++;
1320
1321 for (j1=0; j1<num_neighbors; j1++) {
1322 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1323 if (shared[j2] == i_r) {
1324 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1325 i_c = RAP_ext_idx[k];
1326 if (i_c < num_Pmain_cols) {
1327 if (P_marker[i_c] < row_marker) {
1328 P_marker[i_c] = i;
1329 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1330 RAP_main_idx[i] = i_c;
1331 i++;
1332 } else {
1333 t1_val = &(RAP_ext_val[k*block_size]);
1334 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1335 for (ib=0; ib<block_size; ib++)
1336 t2_val[ib] += t1_val[ib];
1337 }
1338 } else {
1339 if (P_marker[i_c] < row_marker_ext) {
1340 P_marker[i_c] = j;
1341 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1342 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1343 j++;
1344 } else {
1345 t1_val = &(RAP_ext_val[k*block_size]);
1346 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1347 for (ib=0; ib<block_size; ib++)
1348 t2_val[ib] += t1_val[ib];
1349 }
1350 }
1351 }
1352 break;
1353 }
1354 }
1355 }
1356
1357 /* then, loop over elements in row i_r of R_main */
1358 j1_ub = R_main->pattern->ptr[i_r+1];
1359 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1360 i1 = R_main->pattern->index[j1];
1361 R_val = &(R_main->val[j1*P_block_size]);
1362
1363 /* then, loop over elements in row i1 of A->col_coupleBlock */
1364 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1365 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1366 i2 = A->col_coupleBlock->pattern->index[j2];
1367 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1368 for (irb=0; irb<row_block_size; irb++)
1369 for (icb=0; icb<col_block_size; icb++)
1370 RA_val[irb+row_block_size*icb]=ZERO;
1371 for (irb=0; irb<P_block_size; irb++) {
1372 rtmp=R_val[irb];
1373 for (icb=0; icb<col_block_size; icb++) {
1374 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb]; /* row_block_size stride, matching the other RA_val accumulations */
1375 }
1376 }
1377
1378
1379 /* check whether entry RA[i_r, i2] has been previously visited.
1380 A new RAP entry is possible only if entry RA[i_r, i2] has
1381 not been visited yet */
1382 if (A_marker[i2] != i_r) {
1383 /* first, mark entry RA[i_r,i2] as visited */
1384 A_marker[i2] = i_r;
1385
1386 /* then loop over elements in row i2 of P_ext_main */
1387 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1388 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1389 i_c = P->row_coupleBlock->pattern->index[j3];
1390 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1391 for (irb=0; irb<row_block_size; irb++)
1392 for (icb=0; icb<col_block_size; icb++)
1393 RAP_val[irb+row_block_size*icb]=ZERO;
1394 for (icb=0; icb<P_block_size; icb++) {
1395 rtmp=temp_val[icb];
1396 for (irb=0; irb<row_block_size; irb++) {
1397 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1398 }
1399 }
1400
1401
1402 /* check whether entry RAP[i_r,i_c] has been created.
1403 If not yet created, create the entry and increase
1404 the total number of elements in RAP_ext */
1405 if (P_marker[i_c] < row_marker) {
1406 P_marker[i_c] = i;
1407 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1408 RAP_main_idx[i] = i_c;
1409 i++;
1410 } else {
1411 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1412 for (ib=0; ib<block_size; ib++) {
1413 temp_val[ib] += RAP_val[ib];
1414 }
1415 }
1416 }
1417
1418 /* loop over elements in row i2 of P_ext_couple, do the same */
1419 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1420 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1421 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1422 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1423 for (irb=0; irb<row_block_size; irb++)
1424 for (icb=0; icb<col_block_size; icb++)
1425 RAP_val[irb+row_block_size*icb]=ZERO;
1426 for (icb=0; icb<P_block_size; icb++) {
1427 rtmp=temp_val[icb];
1428 for (irb=0; irb<row_block_size; irb++) {
1429 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1430 }
1431 }
1432
1433 if (P_marker[i_c] < row_marker_ext) {
1434 P_marker[i_c] = j;
1435 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1436 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1437 j++;
1438 } else {
1439 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1440 for (ib=0; ib<block_size; ib++) {
1441 temp_val[ib] += RAP_val[ib];
1442 }
1443 }
1444 }
1445
1446 /* If entry RA[i_r, i2] has been visited, no new RAP entry is
1447 created; only the contributions are added. */
1448 } else {
1449 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1450 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1451 i_c = P->row_coupleBlock->pattern->index[j3];
1452 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1453 for (irb=0; irb<row_block_size; irb++)
1454 for (icb=0; icb<col_block_size; icb++)
1455 RAP_val[irb+row_block_size*icb]=ZERO;
1456 for (icb=0; icb<P_block_size; icb++) {
1457 rtmp=temp_val[icb];
1458 for (irb=0; irb<row_block_size; irb++) {
1459 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1460 }
1461 }
1462
1463 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1464 for (ib=0; ib<block_size; ib++) {
1465 temp_val[ib] += RAP_val[ib];
1466 }
1467 }
1468 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1469 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1470 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1471 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1472 for (irb=0; irb<row_block_size; irb++)
1473 for (icb=0; icb<col_block_size; icb++)
1474 RAP_val[irb+row_block_size*icb]=ZERO;
1475 for (icb=0; icb<P_block_size; icb++) {
1476 rtmp=temp_val[icb];
1477 for (irb=0; irb<row_block_size; irb++) {
1478 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1479 }
1480 }
1481
1482 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1483 for (ib=0; ib<block_size; ib++) {
1484 temp_val[ib] += RAP_val[ib];
1485 }
1486 }
1487 }
1488 }
1489
1490 /* now loop over elements in row i1 of A->mainBlock, repeat
1491 the process we have done to block A->col_coupleBlock */
1492 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1493 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1494 i2 = A->mainBlock->pattern->index[j2];
1495 temp_val = &(A->mainBlock->val[j2*block_size]);
1496 for (irb=0; irb<row_block_size; irb++)
1497 for (icb=0; icb<col_block_size; icb++)
1498 RA_val[irb+row_block_size*icb]=ZERO;
1499 for (irb=0; irb<P_block_size; irb++) {
1500 rtmp=R_val[irb];
1501 for (icb=0; icb<col_block_size; icb++) {
1502 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1503 }
1504 }
1505
1506 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1507 A_marker[i2 + num_Acouple_cols] = i_r;
1508 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1509 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1510 i_c = P->mainBlock->pattern->index[j3];
1511 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1512 for (irb=0; irb<row_block_size; irb++)
1513 for (icb=0; icb<col_block_size; icb++)
1514 RAP_val[irb+row_block_size*icb]=ZERO;
1515 for (icb=0; icb<P_block_size; icb++) {
1516 rtmp=temp_val[icb];
1517 for (irb=0; irb<row_block_size; irb++) {
1518 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1519 }
1520 }
1521 if (P_marker[i_c] < row_marker) {
1522 P_marker[i_c] = i;
1523 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1524 RAP_main_idx[i] = i_c;
1525 i++;
1526 } else {
1527 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1528 for (ib=0; ib<block_size; ib++) {
1529 temp_val[ib] += RAP_val[ib];
1530 }
1531 }
1532 }
1533 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1534 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1535 /* note that we need to map the column index in
1536 P->col_coupleBlock back into the column index in
1537 P_ext_couple */
1538 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1539 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1540 for (irb=0; irb<row_block_size; irb++)
1541 for (icb=0; icb<col_block_size; icb++)
1542 RAP_val[irb+row_block_size*icb]=ZERO;
1543 for (icb=0; icb<P_block_size; icb++) {
1544 rtmp=temp_val[icb];
1545 for (irb=0; irb<row_block_size; irb++) {
1546 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1547 }
1548 }
1549
1550 if (P_marker[i_c] < row_marker_ext) {
1551 P_marker[i_c] = j;
1552 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1553 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1554 j++;
1555 } else {
1556 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1557 for (ib=0; ib<block_size; ib++) {
1558 temp_val[ib] += RAP_val[ib];
1559 }
1560 }
1561 }
1562
1563 } else {
1564 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1565 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1566 i_c = P->mainBlock->pattern->index[j3];
1567 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1568 for (irb=0; irb<row_block_size; irb++)
1569 for (icb=0; icb<col_block_size; icb++)
1570 RAP_val[irb+row_block_size*icb]=ZERO;
1571 for (icb=0; icb<P_block_size; icb++) {
1572 rtmp=temp_val[icb];
1573 for (irb=0; irb<row_block_size; irb++) {
1574 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1575 }
1576 }
1577
1578 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1579 for (ib=0; ib<block_size; ib++) {
1580 temp_val[ib] += RAP_val[ib];
1581 }
1582 }
1583 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1584 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1585 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1586 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1587 for (irb=0; irb<row_block_size; irb++)
1588 for (icb=0; icb<col_block_size; icb++)
1589 RAP_val[irb+row_block_size*icb]=ZERO;
1590 for (icb=0; icb<P_block_size; icb++) {
1591 rtmp = temp_val[icb];
1592 for (irb=0; irb<row_block_size; irb++) {
1593 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1594 }
1595 }
1596
1597 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1598 for (ib=0; ib<block_size; ib++) {
1599 temp_val[ib] += RAP_val[ib];
1600 }
1601 }
1602 }
1603 }
1604 }
1605
1606 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
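/* temp collects the column indices of the row segment just built;
   after qsort, P_marker (which still maps each column to its old
   position) is used to permute the value blocks into the same
   ascending-column order */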
1607 if (i > row_marker) {
1608 offset = i - row_marker;
1609 temp = TMPMEMALLOC(offset, index_t);
1610 #pragma omp parallel for schedule(static) private(iptr)
1611 for (iptr=0; iptr<offset; iptr++)
1612 temp[iptr] = RAP_main_idx[row_marker+iptr];
1613 if (offset > 0) {
1614 #ifdef USE_QSORTG
1615 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1616 #else
1617 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1618 #endif
1619 }
1620 temp_val = TMPMEMALLOC(offset * block_size, double);
1621 #pragma omp parallel for schedule(static) private(iptr,k)
1622 for (iptr=0; iptr<offset; iptr++){
1623 k = P_marker[temp[iptr]];
1624 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1625 P_marker[temp[iptr]] = iptr + row_marker;
1626 }
1627 #pragma omp parallel for schedule(static) private(iptr)
1628 for (iptr=0; iptr<offset; iptr++){
1629 RAP_main_idx[row_marker+iptr] = temp[iptr];
1630 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1631 }
1632 TMPMEMFREE(temp);
1633 TMPMEMFREE(temp_val);
1634 }
1635 if (j > row_marker_ext) {
1636 offset = j - row_marker_ext;
1637 temp = TMPMEMALLOC(offset, index_t);
1638 #pragma omp parallel for schedule(static) private(iptr)
1639 for (iptr=0; iptr<offset; iptr++)
1640 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1641 if (offset > 0) {
1642 #ifdef USE_QSORTG
1643 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1644 #else
1645 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1646 #endif
1647 }
1648 temp_val = TMPMEMALLOC(offset * block_size, double);
1649 #pragma omp parallel for schedule(static) private(iptr, k)
1650 for (iptr=0; iptr<offset; iptr++){
1651 k = P_marker[temp[iptr] + num_Pmain_cols];
1652 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1653 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1654 }
1655 #pragma omp parallel for schedule(static) private(iptr)
1656 for (iptr=0; iptr<offset; iptr++){
1657 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1658 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1659 }
1660 TMPMEMFREE(temp);
1661 TMPMEMFREE(temp_val);
1662 }
1663 }
1664
1665 TMPMEMFREE(RA_val);
1666 TMPMEMFREE(RAP_val);
1667 TMPMEMFREE(A_marker);
1668 TMPMEMFREE(Pext_to_RAP);
1669 TMPMEMFREE(Pcouple_to_RAP);
1670 MEMFREE(RAP_ext_ptr);
1671 MEMFREE(RAP_ext_idx);
1672 MEMFREE(RAP_ext_val);
1673 Paso_SparseMatrix_free(R_main);
1674 Paso_SparseMatrix_free(R_couple);
1675
1676 /* Check whether there are empty columns in RAP_couple */
1677 #pragma omp parallel for schedule(static) private(i)
1678 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1679 /* num of non-empty columns is stored in "k" */
1680 k = 0;
1681 j = RAP_couple_ptr[num_Pmain_cols];
1682 for (i=0; i<j; i++) {
1683 i1 = RAP_couple_idx[i];
1684 if (P_marker[i1]) {
1685 P_marker[i1] = 0;
1686 k++;
1687 }
1688 }
1689
1690 /* empty columns were found: compress them out and renumber */
1691 if (k < num_RAPext_cols) {
1692 temp = TMPMEMALLOC(k, index_t);
1693 k = 0;
1694 for (i=0; i<num_RAPext_cols; i++)
1695 if (!P_marker[i]) {
1696 P_marker[i] = k;
1697 temp[k] = global_id_RAP[i];
1698 k++;
1699 }
1700 #pragma omp parallel for schedule(static) private(i, i1)
1701 for (i=0; i<j; i++) {
1702 i1 = RAP_couple_idx[i];
1703 RAP_couple_idx[i] = P_marker[i1];
1704 }
1705 num_RAPext_cols = k;
1706 TMPMEMFREE(global_id_RAP);
1707 global_id_RAP = temp;
1708 }
1709 TMPMEMFREE(P_marker);
1710
1711 /******************************************************/
1712 /* Start to create the coarse level System Matrix A_c */
1713 /******************************************************/
1714 /* first, prepare the sender/receiver for the col_connector */
1715 dist = P->pattern->input_distribution->first_component;
1716 recv_len = TMPMEMALLOC(size, dim_t);
1717 send_len = TMPMEMALLOC(size, dim_t);
1718 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
1719 offsetInShared = TMPMEMALLOC(size+1, index_t);
1720 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
1721 memset(recv_len, 0, sizeof(dim_t) * size);
1722 memset(send_len, 0, sizeof(dim_t) * size);
1723 num_neighbors = 0;
1724 offsetInShared[0] = 0;
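/* global_id_RAP is sorted in ascending order, and dist[] gives each
   rank's global index range, so a single sweep groups the external
   columns by their owner rank j */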
1725 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1726 shared[i] = i + num_Pmain_cols;
1727 if (k <= global_id_RAP[i]) {
1728 if (recv_len[j] > 0) {
1729 neighbor[num_neighbors] = j;
1730 num_neighbors ++;
1731 offsetInShared[num_neighbors] = i;
1732 }
1733 while (k <= global_id_RAP[i]) {
1734 j++;
1735 k = dist[j+1];
1736 }
1737 }
1738 recv_len[j] ++;
1739 }
1740 if (recv_len[j] > 0) {
1741 neighbor[num_neighbors] = j;
1742 num_neighbors ++;
1743 offsetInShared[num_neighbors] = i;
1744 }
1745 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1746 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1747
1748 #ifdef ESYS_MPI
1749 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1750 #endif
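/* the all-to-all transposes the exchange plan: after it, send_len[p]
   holds the number of columns that rank p needs from this rank */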
1751
1752 #ifdef ESYS_MPI
1753 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
1754 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
1755 #else
1756 mpi_requests=TMPMEMALLOC(size*2,int);
1757 mpi_stati=TMPMEMALLOC(size*2,int);
1758 #endif
1759 num_neighbors = 0;
1760 j = 0;
1761 offsetInShared[0] = 0;
1762 for (i=0; i<size; i++) {
1763 if (send_len[i] > 0) {
1764 neighbor[num_neighbors] = i;
1765 num_neighbors ++;
1766 j += send_len[i];
1767 offsetInShared[num_neighbors] = j;
1768 }
1769 }
1770 TMPMEMFREE(shared);
1771 shared = TMPMEMALLOC(j, index_t);
1772 for (i=0, j=0; i<num_neighbors; i++) {
1773 k = neighbor[i];
1774 #ifdef ESYS_MPI
1775 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1776 mpi_info->msg_tag_counter+k,
1777 mpi_info->comm, &mpi_requests[i]);
1778 #endif
1779 j += send_len[k];
1780 }
1781 for (i=0, j=0; i<recv->numNeighbors; i++) {
1782 k = recv->neighbor[i];
1783 #ifdef ESYS_MPI
1784 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1785 mpi_info->msg_tag_counter+rank,
1786 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1787 #endif
1788 j += recv_len[k];
1789 }
1790 #ifdef ESYS_MPI
1791 MPI_Waitall(num_neighbors + recv->numNeighbors,
1792 mpi_requests, mpi_stati);
1793 #endif
1794 mpi_info->msg_tag_counter += size;
1795
1796 j = offsetInShared[num_neighbors];
1797 offset = dist[rank];
1798 #pragma omp parallel for schedule(static) private(i)
1799 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
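/* the received ids are global indices owned by this rank; subtracting
   dist[rank] converts them to local indices for the send list */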
1800 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1801 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1802
1803 col_connector = Paso_Connector_alloc(send, recv);
1804 TMPMEMFREE(shared);
1805 Paso_SharedComponents_free(recv);
1806 Paso_SharedComponents_free(send);
1807
1808 /* now, create row distribution (output_distribution) and col
1809 distribution (input_distribution) */
1810 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1811 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1812
1813 /* then, prepare the sender/receiver for the row_connector, first, prepare
1814 the information for sender */
1815 sum = RAP_couple_ptr[num_Pmain_cols];
1816 len = TMPMEMALLOC(size, dim_t);
1817 send_ptr = TMPMEMALLOC(size, index_t*);
1818 send_idx = TMPMEMALLOC(size, index_t*);
1819 #pragma omp parallel for schedule(static) private(i)
1820 for (i=0; i<size; i++) {
1821 send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
1822 send_idx[i] = TMPMEMALLOC(sum, index_t);
1823 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1824 }
1825 memset(len, 0, sizeof(dim_t) * size);
1826 recv = col_connector->recv;
1827 sum=0;
1828 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1829 i1 = RAP_couple_ptr[i_r];
1830 i2 = RAP_couple_ptr[i_r+1];
1831 if (i2 > i1) {
1832 /* then row i_r will be in the sender of row_connector, now check
1833 how many neighbors i_r needs to be sent to */
1834 for (j=i1; j<i2; j++) {
1835 i_c = RAP_couple_idx[j];
1836 /* find out the corresponding neighbor "p" of column i_c */
1837 for (p=0; p<recv->numNeighbors; p++) {
1838 if (i_c < recv->offsetInShared[p+1]) {
1839 k = recv->neighbor[p];
1840 if (send_ptr[k][i_r] == 0) sum++;
1841 send_ptr[k][i_r] ++;
1842 send_idx[k][len[k]] = global_id_RAP[i_c];
1843 len[k] ++;
1844 break;
1845 }
1846 }
1847 }
1848 }
1849 }
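/* at this point: sum is the number of (row, neighbor) pairs to be sent,
   len[k] counts the column entries going to rank k, and send_ptr[k][i]
   counts the entries of row i destined for rank k */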
1850 if (global_id_RAP) {
1851 TMPMEMFREE(global_id_RAP);
1852 global_id_RAP = NULL;
1853 }
1854
1855 /* now allocate the sender */
1856 shared = TMPMEMALLOC(sum, index_t);
1857 memset(send_len, 0, sizeof(dim_t) * size);
1858 num_neighbors=0;
1859 offsetInShared[0] = 0;
1860 for (p=0, k=0; p<size; p++) {
1861 for (i=0; i<num_Pmain_cols; i++) {
1862 if (send_ptr[p][i] > 0) {
1863 shared[k] = i;
1864 k++;
1865 send_ptr[p][send_len[p]] = send_ptr[p][i];
1866 send_len[p]++;
1867 }
1868 }
1869 if (k > offsetInShared[num_neighbors]) {
1870 neighbor[num_neighbors] = p;
1871 num_neighbors ++;
1872 offsetInShared[num_neighbors] = k;
1873 }
1874 }
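/* send_ptr[p] has been compacted in place: its first send_len[p]
   entries now hold the per-row counts for just the rows listed in
   shared[] for neighbor p */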
1875 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1876 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1877
1878 /* send/recv the number of rows to be sent from the current proc;
1879 this recovers the receiver info of row_connector from the sender */
1880 #ifdef ESYS_MPI
1881 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1882 #endif
1883 num_neighbors = 0;
1884 offsetInShared[0] = 0;
1885 j = 0;
1886 for (i=0; i<size; i++) {
1887 if (i != rank && recv_len[i] > 0) {
1888 neighbor[num_neighbors] = i;
1889 num_neighbors ++;
1890 j += recv_len[i];
1891 offsetInShared[num_neighbors] = j;
1892 }
1893 }
1894 TMPMEMFREE(shared);
1895 TMPMEMFREE(recv_len);
1896 shared = TMPMEMALLOC(j, index_t);
1897 k = offsetInShared[num_neighbors];
1898 #pragma omp parallel for schedule(static) private(i)
1899 for (i=0; i<k; i++) {
1900 shared[i] = i + num_Pmain_cols;
1901 }
1902 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1903 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1904 row_connector = Paso_Connector_alloc(send, recv);
1905 TMPMEMFREE(shared);
1906 Paso_SharedComponents_free(recv);
1907 Paso_SharedComponents_free(send);
1908
1909 /* send/recv pattern->ptr for rowCoupleBlock */
1910 num_RAPext_rows = offsetInShared[num_neighbors];
1911 row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
1912 for (p=0; p<num_neighbors; p++) {
1913 j = offsetInShared[p];
1914 i = offsetInShared[p+1];
1915 #ifdef ESYS_MPI
1916 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1917 mpi_info->msg_tag_counter+neighbor[p],
1918 mpi_info->comm, &mpi_requests[p]);
1919 #endif
1920 }
1921 send = row_connector->send;
1922 for (p=0; p<send->numNeighbors; p++) {
1923 #ifdef ESYS_MPI
1924 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1925 MPI_INT, send->neighbor[p],
1926 mpi_info->msg_tag_counter+rank,
1927 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1928 #endif
1929 }
1930 #ifdef ESYS_MPI
1931 MPI_Waitall(num_neighbors + send->numNeighbors,
1932 mpi_requests, mpi_stati);
1933 #endif
1934 mpi_info->msg_tag_counter += size;
1935 TMPMEMFREE(send_len);
1936
1937 sum = 0;
1938 for (i=0; i<num_RAPext_rows; i++) {
1939 k = row_couple_ptr[i];
1940 row_couple_ptr[i] = sum;
1941 sum += k;
1942 }
1943 row_couple_ptr[num_RAPext_rows] = sum;
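/* the loop above converts per-row counts into CSR offsets, e.g. counts
   {2, 1, 3} become offsets {0, 2, 3, 6} */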
1944
1945 /* send/recv pattern->index for rowCoupleBlock */
1946 k = row_couple_ptr[num_RAPext_rows];
1947 row_couple_idx = MEMALLOC(k, index_t);
1948 for (p=0; p<num_neighbors; p++) {
1949 j1 = row_couple_ptr[offsetInShared[p]];
1950 j2 = row_couple_ptr[offsetInShared[p+1]];
1951 #ifdef ESYS_MPI
1952 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1953 mpi_info->msg_tag_counter+neighbor[p],
1954 mpi_info->comm, &mpi_requests[p]);
1955 #endif
1956 }
1957 for (p=0; p<send->numNeighbors; p++) {
1958 #ifdef ESYS_MPI
1959 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1960 MPI_INT, send->neighbor[p],
1961 mpi_info->msg_tag_counter+rank,
1962 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1963 #endif
1964 }
1965 #ifdef ESYS_MPI
1966 MPI_Waitall(num_neighbors + send->numNeighbors,
1967 mpi_requests, mpi_stati);
1968 #endif
1969 mpi_info->msg_tag_counter += size;
1970
1971 offset = input_dist->first_component[rank];
1972 k = row_couple_ptr[num_RAPext_rows];
1973 #pragma omp parallel for schedule(static) private(i)
1974 for (i=0; i<k; i++) {
1975 row_couple_idx[i] -= offset;
1976 }
1977 #pragma omp parallel for schedule(static) private(i)
1978 for (i=0; i<size; i++) {
1979 TMPMEMFREE(send_ptr[i]);
1980 TMPMEMFREE(send_idx[i]);
1981 }
1982 TMPMEMFREE(send_ptr);
1983 TMPMEMFREE(send_idx);
1984 TMPMEMFREE(len);
1985 TMPMEMFREE(offsetInShared);
1986 TMPMEMFREE(neighbor);
1987 TMPMEMFREE(mpi_requests);
1988 TMPMEMFREE(mpi_stati);
1989 Esys_MPIInfo_free(mpi_info);
1990
1991 /* Now, we can create pattern for mainBlock and coupleBlock */
1992 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1993 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1994 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1995 num_Pmain_cols, num_RAPext_cols,
1996 RAP_couple_ptr, RAP_couple_idx);
1997 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1998 num_RAPext_rows, num_Pmain_cols,
1999 row_couple_ptr, row_couple_idx);
2000
2001 /* next, create the system matrix */
2002 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2003 output_dist, input_dist, main_pattern, col_couple_pattern,
2004 row_couple_pattern, col_connector, row_connector);
2005 out = Paso_SystemMatrix_alloc(A->type, pattern,
2006 row_block_size, col_block_size, FALSE);
2007
2008 /* finally, fill in the data */
2009 memcpy(out->mainBlock->val, RAP_main_val,
2010 out->mainBlock->len* sizeof(double));
2011 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2012 out->col_coupleBlock->len * sizeof(double));
2013
2014 /* Clean up */
2015 Paso_SystemMatrixPattern_free(pattern);
2016 Paso_Pattern_free(main_pattern);
2017 Paso_Pattern_free(col_couple_pattern);
2018 Paso_Pattern_free(row_couple_pattern);
2019 Paso_Connector_free(col_connector);
2020 Paso_Connector_free(row_connector);
2021 Paso_Distribution_free(output_dist);
2022 Paso_Distribution_free(input_dist);
2023 TMPMEMFREE(RAP_main_val);
2024 TMPMEMFREE(RAP_couple_val);
2025 if (Esys_noError()) {
2026 return out;
2027 } else {
2028 Paso_SystemMatrix_free(out);
2029 return NULL;
2030 }
2031 }
2032
2033
2034 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2035 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2036 {
2037 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2038 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
2039 Paso_SystemMatrix *out=NULL;
2040 Paso_SystemMatrixPattern *pattern=NULL;
2041 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
2042 Paso_SharedComponents *send =NULL, *recv=NULL;
2043 Paso_Connector *col_connector=NULL, *row_connector=NULL;
2044 Paso_Pattern *main_pattern=NULL;
2045 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2046 const dim_t row_block_size=A->row_block_size;
2047 const dim_t col_block_size=A->col_block_size;
2048 const dim_t block_size = A->block_size;
2049 const double ZERO = 0.0;
2050 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2051 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2052 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2053 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2054 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2055 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2056 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2057 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2058 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2059 index_t sum, i, j, k, iptr, irb, icb, ib;
2060 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2061 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2062 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2063 index_t row_marker_ext, *where_p=NULL;
2064 index_t **send_ptr=NULL, **send_idx=NULL;
2065 dim_t p, num_neighbors;
2066 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2067 Esys_MPI_rank *neighbor=NULL;
2068 #ifdef ESYS_MPI
2069 MPI_Request* mpi_requests=NULL;
2070 MPI_Status* mpi_stati=NULL;
2071 #else
2072 int *mpi_requests=NULL, *mpi_stati=NULL;
2073 #endif
2074
2075 /* two sparse matrices R_main and R_couple will be generated as the
2076 transposes of P_main and P_col_couple, respectively. Note that
2077 R_couple is actually the row_coupleBlock of R (R=P^T) */
2078 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2079 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2080
2081 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
2082 and needed locally for the triple matrix product RAP.
2083 To be specific, P_ext (on processor k) is the group of rows in P, where
2084 the list of rows from processor q is given by
2085 A->col_coupler->connector->send->shared[rPtr...]
2086 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2087 on q.
2088 P_ext is represented by two sparse matrices, P_ext_main and
2089 P_ext_couple */
2090 Paso_Preconditioner_AMG_extendB(A, P);
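/* on return, P->row_coupleBlock is presumably linked to P_ext_main and
   P->remote_coupleBlock to P_ext_couple, which is why the loops below
   read those two blocks */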
2091
2092 /* count the number of cols in P_ext_couple, resize the pattern of
2093 sparse matrix P_ext_couple with the new compressed order, and then
2094 build the col id mapping from P->col_coupleBlock to
2095 P_ext_couple */
2096 num_Pmain_cols = P->mainBlock->numCols;
2097 num_Pcouple_cols = P->col_coupleBlock->numCols;
2098 num_Acouple_cols = A->col_coupleBlock->numCols;
2099 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2100 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2101 offset = P->col_distribution->first_component[rank];
2102 num_Pext_cols = 0;
2103 if (P->global_id) {
2104 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2105 and P->col_coupleBlock */
2106 iptr = 0;
2107 if (num_Pcouple_cols || sum > 0) {
2108 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
2109 for (; iptr<sum; iptr++){
2110 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2111 }
2112 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2113 temp[iptr] = P->global_id[j];
2114 }
2115 }
2116 if (iptr) {
2117 #ifdef USE_QSORTG
2118 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2119 #else
2120 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2121 #endif
2122 num_Pext_cols = 1;
2123 i = temp[0];
2124 for (j=1; j<iptr; j++) {
2125 if (temp[j] > i) {
2126 i = temp[j];
2127 temp[num_Pext_cols++] = i;
2128 }
2129 }
2130 }
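/* the scan above removes duplicates from the sorted list, e.g.
   {3, 3, 7, 9, 9} collapses to {3, 7, 9}; num_Pext_cols is the number
   of distinct external columns */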
2131
2132 /* resize the pattern of P_ext_couple */
2133 if(num_Pext_cols){
2134 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
2135 for (i=0; i<num_Pext_cols; i++)
2136 global_id_P[i] = temp[i];
2137 }
2138 if (num_Pcouple_cols || sum > 0)
2139 TMPMEMFREE(temp);
2140 for (i=0; i<sum; i++) {
2141 where_p = (index_t *)bsearch(
2142 &(P->remote_coupleBlock->pattern->index[i]),
2143 global_id_P, num_Pext_cols,
2144 sizeof(index_t), Paso_comparIndex);
2145 P->remote_coupleBlock->pattern->index[i] =
2146 (index_t)(where_p -global_id_P);
2147 }
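/* each remote column id has now been replaced by its position in the
   sorted global_id_P list, i.e. translated from a global to a local
   (compressed) index */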
2148
2149 /* build the mapping */
2150 if (num_Pcouple_cols) {
2151 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
2152 iptr = 0;
2153 for (i=0; i<num_Pext_cols; i++)
2154 if (global_id_P[i] == P->global_id[iptr]) {
2155 Pcouple_to_Pext[iptr++] = i;
2156 if (iptr == num_Pcouple_cols) break;
2157 }
2158 }
2159 }
2160
2161 /* alloc and initialize the markers */
2162 sum = num_Pext_cols + num_Pmain_cols;
2163 P_marker = TMPMEMALLOC(sum, index_t);
2164 A_marker = TMPMEMALLOC(num_A_cols, index_t);
2165 #pragma omp parallel for private(i) schedule(static)
2166 for (i=0; i<sum; i++) P_marker[i] = -1;
2167 #pragma omp parallel for private(i) schedule(static)
2168 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2169
2170 /* Now, count the size of RAP_ext. Start with rows in R_couple */
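/* symbolic phase, using the classic two-marker scheme: P_marker[c] <
   row_marker means column c has not yet been inserted into the current
   row, while A_marker remembers which RA[i_r, *] entries were already
   expanded so each is processed only once per row */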
2171 sum = 0;
2172 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2173 row_marker = sum;
2174 /* then, loop over elements in row i_r of R_couple */
2175 j1_ub = R_couple->pattern->ptr[i_r+1];
2176 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2177 i1 = R_couple->pattern->index[j1];
2178 /* then, loop over elements in row i1 of A->col_coupleBlock */
2179 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2180 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2181 i2 = A->col_coupleBlock->pattern->index[j2];
2182
2183 /* check whether entry RA[i_r, i2] has been previously visited.
2184 A new RAP entry is possible only if entry RA[i_r, i2] has not
2185 been visited yet */
2186 if (A_marker[i2] != i_r) {
2187 /* first, mark entry RA[i_r,i2] as visited */
2188 A_marker[i2] = i_r;
2189
2190 /* then loop over elements in row i2 of P_ext_main */
2191 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2192 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2193 i_c = P->row_coupleBlock->pattern->index[j3];
2194
2195 /* check whether entry RAP[i_r,i_c] has been created.
2196 If not yet created, create the entry and increase
2197 the total number of elements in RAP_ext */
2198 if (P_marker[i_c] < row_marker) {
2199 P_marker[i_c] = sum;
2200 sum++;
2201 }
2202 }
2203
2204 /* loop over elements in row i2 of P_ext_couple, do the same */
2205 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2206 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2207 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2208 if (P_marker[i_c] < row_marker) {
2209 P_marker[i_c] = sum;
2210 sum++;
2211 }
2212 }
2213 }
2214 }
2215
2216 /* now loop over elements in row i1 of A->mainBlock, repeating
2217 the process applied above to block A->col_coupleBlock */
2218 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2219 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2220 i2 = A->mainBlock->pattern->index[j2];
2221 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2222 A_marker[i2 + num_Acouple_cols] = i_r;
2223 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2224 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2225 i_c = P->mainBlock->pattern->index[j3];
2226 if (P_marker[i_c] < row_marker) {
2227 P_marker[i_c] = sum;
2228 sum++;
2229 }
2230 }
2231 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2232 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2233 /* note that we need to map the column index in
2234 P->col_coupleBlock back into the column index in
2235 P_ext_couple */
2236 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2237 if (P_marker[i_c] < row_marker) {
2238 P_marker[i_c] = sum;
2239 sum++;
2240 }
2241 }
2242 }
2243 }
2244 }
2245 }
2246
2247 /* Now we have the number of non-zero elements in RAP_ext, allocate
2248 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2249 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
2250 RAP_ext_idx = MEMALLOC(sum, index_t);
2251 RAP_ext_val = MEMALLOC(sum * block_size, double);
2252 RA_val = TMPMEMALLOC(block_size, double);
2253 RAP_val = TMPMEMALLOC(block_size, double);
2254
2255 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2256 sum = num_Pext_cols + num_Pmain_cols;
2257 #pragma omp parallel for private(i) schedule(static)
2258 for (i=0; i<sum; i++) P_marker[i] = -1;
2259 #pragma omp parallel for private(i) schedule(static)
2260 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2261 sum = 0;
2262 RAP_ext_ptr[0] = 0;
2263 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2264 row_marker = sum;
2265 /* then, loop over elements in row i_r of R_couple */
2266 j1_ub = R_couple->pattern->ptr[i_r+1];
2267 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2268 i1 = R_couple->pattern->index[j1];
2269 R_val = &(R_couple->val[j1*block_size]);
2270
2271 /* then, loop over elements in row i1 of A->col_coupleBlock */
2272 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2273 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2274 i2 = A->col_coupleBlock->pattern->index[j2];
2275 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
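/* dense block product RA = R[i_r,i1] * A[i1,i2]:
   RA_val(irb,icb) = sum_ib R(irb,ib) * A(ib,icb), with blocks stored
   column-major (row index varies fastest) */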
2276 for (irb=0; irb<row_block_size; irb++) {
2277 for (icb=0; icb<col_block_size; icb++) {
2278 rtmp = ZERO;
2279 for (ib=0; ib<col_block_size; ib++) {
2280 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2281 }
2282 RA_val[irb+row_block_size*icb]=rtmp;
2283 }
2284 }
2285
2286 /* check whether entry RA[i_r, i2] has been previously visited.
2287 A new RAP entry is possible only if entry RA[i_r, i2] has not
2288 been visited yet */
2289 if (A_marker[i2] != i_r) {
2290 /* first, mark entry RA[i_r,i2] as visited */
2291 A_marker[i2] = i_r;
2292
2293 /* then loop over elements in row i2 of P_ext_main */
2294 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2295 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2296 i_c = P->row_coupleBlock->pattern->index[j3];
2297 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2298 for (irb=0; irb<row_block_size; irb++) {
2299 for (icb=0; icb<col_block_size; icb++) {
2300 rtmp = ZERO;
2301 for (ib=0; ib<col_block_size; ib++) {
2302 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2303 }
2304 RAP_val[irb+row_block_size*icb]=rtmp;
2305 }
2306 }
2307
2308
2309 /* check whether entry RAP[i_r,i_c] has been created.
2310 If not yet created, create the entry and increase
2311 the total number of elements in RAP_ext */
2312 if (P_marker[i_c] < row_marker) {
2313 P_marker[i_c] = sum;
2314 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2315 RAP_ext_idx[sum] = i_c + offset;
2316 sum++;
2317 } else {
2318 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2319 for (ib=0; ib<block_size; ib++) {
2320 temp_val[ib] += RAP_val[ib];
2321 }
2322 }
2323 }
2324
2325 /* loop over elements in row i2 of P_ext_couple, do the same */
2326 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2327 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2328 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2329 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2330 for (irb=0; irb<row_block_size; irb++) {
2331 for (icb=0; icb<col_block_size; icb++) {
2332 rtmp = ZERO;
2333 for (ib=0; ib<col_block_size; ib++) {
2334 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2335 }
2336 RAP_val[irb+row_block_size*icb]=rtmp;
2337 }
2338 }
2339
2340 if (P_marker[i_c] < row_marker) {
2341 P_marker[i_c] = sum;
2342 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2343 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2344 sum++;
2345 } else {
2346 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2347 for (ib=0; ib<block_size; ib++) {
2348 temp_val[ib] += RAP_val[ib];
2349 }
2350 }
2351 }
2352
2353 /* If entry RA[i_r, i2] has been visited, no new RAP entry is created.
2354 Only the contributions are added. */
2355 } else {
2356 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2357 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2358 i_c = P->row_coupleBlock->pattern->index[j3];
2359 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2360 for (irb=0; irb<row_block_size; irb++) {
2361 for (icb=0; icb<col_block_size; icb++) {
2362 rtmp = ZERO;
2363 for (ib=0; ib<col_block_size; ib++) {
2364 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2365 }
2366 RAP_val[irb+row_block_size*icb]=rtmp;
2367 }
2368 }
2369
2370 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2371 for (ib=0; ib<block_size; ib++) {
2372 temp_val[ib] += RAP_val[ib];
2373 }
2374 }
2375 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2376 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2377 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2378 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2379 for (irb=0; irb<row_block_size; irb++) {
2380 for (icb=0; icb<col_block_size; icb++) {
2381 rtmp = ZERO;
2382 for (ib=0; ib<col_block_size; ib++) {
2383 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2384 }
2385 RAP_val[irb+row_block_size*icb]=rtmp;
2386 }
2387 }
2388
2389 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2390 for (ib=0; ib<block_size; ib++) {
2391 temp_val[ib] += RAP_val[ib];
2392 }
2393 }
2394 }
2395 }
2396
2397 /* now loop over elements in row i1 of A->mainBlock, repeating
2398 the process applied above to block A->col_coupleBlock */
2399 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2400 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2401 i2 = A->mainBlock->pattern->index[j2];
2402 temp_val = &(A->mainBlock->val[j2*block_size]);
2403 for (irb=0; irb<row_block_size; irb++) {
2404 for (icb=0; icb<col_block_size; icb++) {
2405 rtmp = ZERO;
2406 for (ib=0; ib<col_block_size; ib++) {
2407 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2408 }
2409 RA_val[irb+row_block_size*icb]=rtmp;
2410 }
2411 }
2412
2413 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2414 A_marker[i2 + num_Acouple_cols] = i_r;
2415 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2416 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2417 i_c = P->mainBlock->pattern->index[j3];
2418 temp_val = &(P->mainBlock->val[j3*block_size]);
2419 for (irb=0; irb<row_block_size; irb++) {
2420 for (icb=0; icb<col_block_size; icb++) {
2421 rtmp = ZERO;
2422 for (ib=0; ib<col_block_size; ib++) {
2423 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2424 }
2425 RAP_val[irb+row_block_size*icb]=rtmp;
2426 }
2427 }
2428
2429 if (P_marker[i_c] < row_marker) {
2430 P_marker[i_c] = sum;
2431 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2432 RAP_ext_idx[sum] = i_c + offset;
2433 sum++;
2434 } else {
2435 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2436 for (ib=0; ib<block_size; ib++) {
2437 temp_val[ib] += RAP_val[ib];
2438 }
2439 }
2440 }
2441 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2442 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2443 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2444 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2445 for (irb=0; irb<row_block_size; irb++) {
2446 for (icb=0; icb<col_block_size; icb++) {
2447 rtmp = ZERO;
2448 for (ib=0; ib<col_block_size; ib++) {
2449 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2450 }
2451 RAP_val[irb+row_block_size*icb]=rtmp;
2452 }
2453 }
2454
2455 if (P_marker[i_c] < row_marker) {
2456 P_marker[i_c] = sum;
2457 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2458 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2459 sum++;
2460 } else {
2461 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2462 for (ib=0; ib<block_size; ib++) {
2463 temp_val[ib] += RAP_val[ib];
2464 }
2465 }
2466 }
2467 } else {
2468 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2469 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2470 i_c = P->mainBlock->pattern->index[j3];
2471 temp_val = &(P->mainBlock->val[j3*block_size]);
2472 for (irb=0; irb<row_block_size; irb++) {
2473 for (icb=0; icb<col_block_size; icb++) {
2474 rtmp = ZERO;
2475 for (ib=0; ib<col_block_size; ib++) {
2476 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2477 }
2478 RAP_val[irb+row_block_size*icb]=rtmp;
2479 }
2480 }
2481
2482 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2483 for (ib=0; ib<block_size; ib++) {
2484 temp_val[ib] += RAP_val[ib];
2485 }
2486 }
2487 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2488 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2489 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2490 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2491 for (irb=0; irb<row_block_size; irb++) {
2492 for (icb=0; icb<col_block_size; icb++) {
2493 rtmp = ZERO;
2494 for (ib=0; ib<col_block_size; ib++) {
2495 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2496 }
2497 RAP_val[irb+row_block_size*icb]=rtmp;
2498 }
2499 }
2500
2501 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2502 for (ib=0; ib<block_size; ib++) {
2503 temp_val[ib] += RAP_val[ib];
2504 }
2505 }
2506 }
2507 }
2508 }
2509 RAP_ext_ptr[i_r+1] = sum;
2510 }
2511 TMPMEMFREE(P_marker);
2512 TMPMEMFREE(Pcouple_to_Pext);
2513
2514 /* Now we have the part of RAP[r,c] where row "r" is the list of rows
2515 given by the column list of P->col_coupleBlock, and column "c" is
2516 the list of columns which possibly covers the whole column range of
2517 the system matrix P. This part of the data is to be passed to the
2518 neighboring processors, and added to the corresponding RAP[r,c]
2519 entries there */
2520 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2521 &RAP_ext_val, global_id_P, block_size);
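/* RAP_ext_ptr/idx/val now hold the rows received back from the
   neighboring ranks; the column indices are still global ids at this
   point */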
2522
2523 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2524 sum = RAP_ext_ptr[num_RAPext_rows];
2525 num_RAPext_cols = 0;
2526 if (num_Pext_cols || sum > 0) {
2527 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
2528 j1_ub = offset + num_Pmain_cols;
2529 for (i=0, iptr=0; i<sum; i++) {
2530 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2531 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2532 }
2533 for (j=0; j<num_Pext_cols; j++, iptr++){
2534 temp[iptr] = global_id_P[j];
2535 }
2536
2537 if (iptr) {
2538 #ifdef USE_QSORTG
2539 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2540 #else
2541 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2542 #endif
2543 num_RAPext_cols = 1;
2544 i = temp[0];
2545 for (j=1; j<iptr; j++) {
2546 if (temp[j] > i) {
2547 i = temp[j];
2548 temp[num_RAPext_cols++] = i;
2549 }
2550 }
2551 }
2552 }
2553
2554 /* store the global ids of the external columns of RAP */
2555 if(num_RAPext_cols){
2556 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
2557 for (i=0; i<num_RAPext_cols; i++)
2558 global_id_RAP[i] = temp[i];
2559 }
2560 if (num_Pext_cols || sum > 0)
2561 TMPMEMFREE(temp);
2562 j1_ub = offset + num_Pmain_cols;
2563 for (i=0; i<sum; i++) {
2564 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2565 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2566 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2567 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2568 } else
2569 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2570 }
2571
2572 /* build the mapping */
2573 if (num_Pcouple_cols) {
2574 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
2575 iptr = 0;
2576 for (i=0; i<num_RAPext_cols; i++)
2577 if (global_id_RAP[i] == P->global_id[iptr]) {
2578 Pcouple_to_RAP[iptr++] = i;
2579 if (iptr == num_Pcouple_cols) break;
2580 }
2581 }
2582
2583 if (num_Pext_cols) {
2584 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
2585 iptr = 0;
2586 for (i=0; i<num_RAPext_cols; i++)
2587 if (global_id_RAP[i] == global_id_P[iptr]) {
2588 Pext_to_RAP[iptr++] = i;
2589 if (iptr == num_Pext_cols) break;
2590 }
2591 }
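/* both mappings rely on the id lists being sorted in ascending order,
   so a single merge-style pass matches each old id to its position in
   global_id_RAP */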
2592
2593 if (global_id_P){
2594 TMPMEMFREE(global_id_P);
2595 global_id_P = NULL;
2596 }
2597
2598 /* alloc and initialize the markers */
2599 sum = num_RAPext_cols + num_Pmain_cols;
2600 P_marker = TMPMEMALLOC(sum, index_t);
2601 #pragma omp parallel for private(i) schedule(static)
2602 for (i=0; i<sum; i++) P_marker[i] = -1;
2603 #pragma omp parallel for private(i) schedule(static)
2604 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2605
2606 /* Now, count the size of RAP. Start with rows in R_main */
2607 num_neighbors = P->col_coupler->connector->send->numNeighbors;
2608 offsetInShared = P->col_coupler->connector->send->offsetInShared;
2609 shared = P->col_coupler->connector->send->shared;
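/* the send shared list of P's col_coupler records which local rows were
   sent to which neighbor; the rows of RAP_ext received back follow the
   same ordering, so shared[j2] == i_r locates row i_r's external
   contributions */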
2610 i = 0;
2611 j = 0;
2612 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2613 /* Mark the start of row for both main block and couple block */
2614 row_marker = i;
2615 row_marker_ext = j;
2616
2617 /* Mark the diagonal element RAP[i_r, i_r], and other elements
2618 in RAP_ext */
2619 P_marker[i_r] = i;
2620 i++;
2621 for (j1=0; j1<num_neighbors; j1++) {
2622 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2623 if (shared[j2] == i_r) {
2624 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2625 i_c = RAP_ext_idx[k];
2626 if (i_c < num_Pmain_cols) {
2627 if (P_marker[i_c] < row_marker) {
2628 P_marker[i_c] = i;
2629 i++;
2630 }
2631 } else {
2632 if (P_marker[i_c] < row_marker_ext) {
2633 P_marker[i_c] = j;
2634 j++;
2635 }
2636 }
2637 }
2638 break;
2639 }
2640 }
2641 }
2642
2643 /* then, loop over elements in row i_r of R_main */
2644 j1_ub = R_main->pattern->ptr[i_r+1];
2645 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2646 i1 = R_main->pattern->index[j1];
2647
2648 /* then, loop over elements in row i1 of A->col_coupleBlock */
2649 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2650 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2651 i2 = A->col_coupleBlock->pattern->index[j2];
2652
2653 /* check whether entry RA[i_r, i2] has been previously visited.
2654 A new RAP entry is possible only if entry RA[i_r, i2] has not
2655 been visited yet */
2656 if (A_marker[i2] != i_r) {
2657 /* first, mark entry RA[i_r,i2] as visited */
2658 A_marker[i2] = i_r;
2659
2660 /* then loop over elements in row i2 of P_ext_main */
2661 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2662 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2663 i_c = P->row_coupleBlock->pattern->index[j3];
2664
2665 /* check whether entry RAP[i_r,i_c] has been created.
2666 If not yet created, create the entry and increase
2667 the total number of elements in RAP_main */
2668 if (P_marker[i_c] < row_marker) {
2669 P_marker[i_c] = i;
2670 i++;
2671 }
2672 }
2673
2674 /* loop over elements in row i2 of P_ext_couple, do the same */
2675 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2676 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2677 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2678 if (P_marker[i_c] < row_marker_ext) {
2679 P_marker[i_c] = j;
2680 j++;
2681 }
2682 }
2683 }
2684 }
2685
2686 /* now loop over elements in row i1 of A->mainBlock, repeating
2687 the process applied above to block A->col_coupleBlock */
2688 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2689 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2690 i2 = A->mainBlock->pattern->index[j2];
2691 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2692 A_marker[i2 + num_Acouple_cols] = i_r;
2693 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2694 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2695 i_c = P->mainBlock->pattern->index[j3];
2696 if (P_marker[i_c] < row_marker) {
2697 P_marker[i_c] = i;
2698 i++;
2699 }
2700 }
2701 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2702 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2703 /* note that we need to map the column index in
2704 P->col_coupleBlock back into the column index in
2705 P_ext_couple */
2706 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2707 if (P_marker[i_c] < row_marker_ext) {
2708 P_marker[i_c] = j;
2709 j++;
2710 }
2711 }
2712 }
2713 }
2714 }
2715 }
2716
2717 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
2718 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2719 and RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2720 RAP_couple */
2721 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2722 RAP_main_idx = MEMALLOC(i, index_t);
2723 RAP_main_val = TMPMEMALLOC(i * block_size, double);
2724 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2725 RAP_couple_idx = MEMALLOC(j, index_t);
2726 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
2727
2728 RAP_main_ptr[num_Pmain_cols] = i;
2729 RAP_couple_ptr[num_Pmain_cols] = j;
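/* the totals from the symbolic pass are stored as the final ptr
   entries; the numeric pass below recomputes each row start in the
   same order */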
2730
2731 #pragma omp parallel for private(i) schedule(static)
2732 for (i=0; i<sum; i++) P_marker[i] = -1;
2733 #pragma omp parallel for private(i) schedule(static)
2734 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2735
2736 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
2737 in R_main */
2738 i = 0;
2739 j = 0;
2740 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2741 /* Mark the start of row for both main block and couple block */
2742 row_marker = i;
2743 row_marker_ext = j;
2744 RAP_main_ptr[i_r] = row_marker;
2745 RAP_couple_ptr[i_r] = row_marker_ext;
2746
2747 /* Mark and set up the diagonal element RAP[i_r, i_r], and elements
2748 in row i_r of RAP_ext */
2749 P_marker[i_r] = i;
2750 memset(&(RAP_main_val[i*block_size]), 0, block_size*sizeof(double)); /* zero the whole diagonal block, not just its first component */
2751 RAP_main_idx[i] = i_r;
2752 i++;
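/* the diagonal entry RAP[i_r,i_r] is always allocated first, presumably
   so that the coarse matrix keeps a full diagonal even when the
   accumulated value remains zero */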
2753
2754 for (j1=0; j1<num_neighbors; j1++) {
2755 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2756 if (shared[j2] == i_r) {
2757 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2758 i_c = RAP_ext_idx[k];
2759 if (i_c < num_Pmain_cols) {
2760 if (P_marker[i_c] < row_marker) {
2761 P_marker[i_c] = i;
2762 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2763 RAP_main_idx[i] = i_c;
2764 i++;
2765 } else {
2766 temp_val = &(RAP_ext_val[k*block_size]);
2767 RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2768 for (ib=0; ib<block_size; ib++)
2769 RAP_val[ib] += temp_val[ib];
2770 }
2771 } else {
2772 if (P_marker[i_c] < row_marker_ext) {
2773 P_marker[i_c] = j;
2774 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2775 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2776 j++;
2777 } else {
2778 temp_val = &(RAP_ext_val[k*block_size]);
2779 RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2780 for (ib=0; ib<block_size; ib++)
2781 RAP_val[ib] += temp_val[ib];
2782 }
2783 }
2784 }
2785 break;
2786 }
2787 }
2788 }
2789
2790 /* then, loop over elements in row i_r of R_main */
2791 j1_ub = R_main->pattern->ptr[i_r+1];
2792 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2793 i1 = R_main->pattern->index[j1];
2794 R_val = &(R_main->val[j1*block_size]);
2795
2796 /* then, loop over elements in row i1 of A->col_coupleBlock */
2797 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2798 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2799 i2 = A->col_coupleBlock->pattern->index[j2];
2800 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2801 for (irb=0; irb<row_block_size; irb++) {
2802 for (icb=0; icb<col_block_size; icb++) {
2803 rtmp = ZERO;
2804 for (ib=0; ib<col_block_size; ib++) {
2805 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2806 }
2807 RA_val[irb+row_block_size*icb]=rtmp;
2808 }
2809 }
2810
2811
2812 /* check whether entry RA[i_r, i2] has been previously visited.
2813 A new RAP entry is possible only if entry RA[i_r, i2] has not
2814 been visited yet */
2815 if (A_marker[i2] != i_r) {
2816 /* first, mark entry RA[i_r,i2] as visited */
2817 A_marker[i2] = i_r;
2818
2819 /* then loop over elements in row i2 of P_ext_main */
2820 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2821 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2822 i_c = P->row_coupleBlock->pattern->index[j3];
2823 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2824 for (irb=0; irb<row_block_size; irb++) {
2825 for (icb=0; icb<col_block_size; icb++) {
2826 rtmp = ZERO;
2827 for (ib=0; ib<col_block_size; ib++) {
2828 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2829 }
2830 RAP_val[irb+row_block_size*icb]=rtmp;
2831 }
2832 }
2833
2834
2835 /* check whether entry RAP[i_r,i_c] has been created.
2836 If not yet created, create the entry and increase
2837 the total number of elements in RAP_main */
2838 if (P_marker[i_c] < row_marker) {
2839 P_marker[i_c] = i;
2840 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2841 RAP_main_idx[i] = i_c;
2842 i++;
2843 } else {
2844 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2845 for (ib=0; ib<block_size; ib++) {
2846 temp_val[ib] += RAP_val[ib];
2847 }
2848 }
2849 }
2850
2851 /* loop over elements in row i2 of P_ext_couple, do the same */
2852 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2853 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2854 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2855 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2856 for (irb=0; irb<row_block_size; irb++) {
2857 for (icb=0; icb<col_block_size; icb++) {
2858 rtmp = ZERO;
2859 for (ib=0; ib<col_block_size; ib++) {
2860 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2861 }
2862 RAP_val[irb+row_block_size*icb]=rtmp;
2863 }
2864 }
2865
2866 if (P_marker[i_c] < row_marker_ext) {
2867 P_marker[i_c] = j;
2868 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2869 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2870 j++;
2871 } else {
2872 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2873 for (ib=0; ib<block_size; ib++) {
2874 temp_val[ib] += RAP_val[ib];
2875 }
2876 }
2877 }
2878
2879 /* If entry RA[i_r, i2] has been visited, no new RAP entry is created.
2880 Only the contributions are added. */
2881 } else {
2882 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2883 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2884 i_c = P->row_coupleBlock->pattern->index[j3];
2885 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2886 for (irb=0; irb<row_block_size; irb++) {
2887 for (icb=0; icb<col_block_size; icb++) {
2888 rtmp = ZERO;
2889 for (ib=0; ib<col_block_size; ib++) {
2890 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2891 }
2892 RAP_val[irb+row_block_size*icb]=rtmp;
2893 }
2894 }
2895
2896 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2897 for (ib=0; ib<block_size; ib++) {
2898 temp_val[ib] += RAP_val[ib];
2899 }
2900 }
2901 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2902 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2903 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2904 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2905 for (irb=0; irb<row_block_size; irb++) {
2906 for (icb=0; icb<col_block_size; icb++) {
2907 rtmp = ZERO;
2908 for (ib=0; ib<col_block_size; ib++) {
2909 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2910 }
2911 RAP_val[irb+row_block_size*icb]=rtmp;
2912 }
2913 }
2914
2915 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2916 for (ib=0; ib<block_size; ib++) {
2917 temp_val[ib] += RAP_val[ib];
2918 }
2919 }
2920 }
2921 }
2922
2923 /* now loop over elements in row i1 of A->mainBlock, repeating
2924 the process applied above to block A->col_coupleBlock */
2925 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2926 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2927 i2 = A->mainBlock->pattern->index[j2];
2928 temp_val = &(A->mainBlock->val[j2*block_size]);
2929 for (irb=0; irb<row_block_size; irb++) {
2930 for (icb=0; icb<col_block_size; icb++) {
2931 rtmp = ZERO;
2932 for (ib=0; ib<col_block_size; ib++) {
2933 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2934 }
2935 RA_val[irb+row_block_size*icb]=rtmp;
2936 }
2937 }
2938
2939 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2940 A_marker[i2 + num_Acouple_cols] = i_r;
2941 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2942 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2943 i_c = P->mainBlock->pattern->index[j3];
2944 temp_val = &(P->mainBlock->val[j3*block_size]);
2945 for (irb=0; irb<row_block_size; irb++) {
2946 for (icb=0; icb<col_block_size; icb++) {
2947 rtmp = ZERO;
2948 for (ib=0; ib<col_block_size; ib++) {
2949 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2950 }
2951 RAP_val[irb+row_block_size*icb]=rtmp;
2952 }
2953 }
2954
2955 if (P_marker[i_c] < row_marker) {
2956 P_marker[i_c] = i;
2957 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2958 RAP_main_idx[i] = i_c;
2959 i++;
2960 } else {
2961 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2962 for (ib=0; ib<block_size; ib++) {
2963 temp_val[ib] += RAP_val[ib];
2964 }
2965 }
2966 }
2967 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2968 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2969 /* note that we need to map the column index in
2970 P->col_coupleBlock back into the column index in
2971 P_ext_couple */
2972 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2973 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2974 for (irb=0; irb<row_block_size; irb++) {
2975 for (icb=0; icb<col_block_size; icb++) {
2976 rtmp = ZERO;
2977 for (ib=0; ib<col_block_size; ib++) {
2978 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2979 }
2980 RAP_val[irb+row_block_size*icb]=rtmp;
2981 }
2982 }
2983
2984 if (P_marker[i_c] < row_marker_ext) {
2985 P_marker[i_c] = j;
2986 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2987 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2988 j++;
2989 } else {
2990 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2991 for (ib=0; ib<block_size; ib++) {
2992 temp_val[ib] += RAP_val[ib];
2993 }
2994 }
2995 }
2996
2997 } else {
2998 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2999 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3000 i_c = P->mainBlock->pattern->index[j3];
3001 temp_val = &(P->mainBlock->val[j3*block_size]);
3002 for (irb=0; irb<row_block_size; irb++) {
3003 for (icb=0; icb<col_block_size; icb++) {
3004 rtmp = ZERO;
3005 for (ib=0; ib<col_block_size; ib++) {
3006 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3007 }
3008 RAP_val[irb+row_block_size*icb]=rtmp;
3009 }
3010 }
3011
3012 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3013 for (ib=0; ib<block_size; ib++) {
3014 temp_val[ib] += RAP_val[ib];
3015 }
3016 }
3017 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
3018 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3019 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3020 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
3021 for (irb=0; irb<row_block_size; irb++) {
3022 for (icb=0; icb<col_block_size; icb++) {
3023 rtmp = ZERO;
3024 for (ib=0; ib<col_block_size; ib++) {
3025 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3026 }
3027 RAP_val[irb+row_block_size*icb]=rtmp;
3028 }
3029 }
3030
3031 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3032 for (ib=0; ib<block_size; ib++) {
3033 temp_val[ib] += RAP_val[ib];
3034 }
3035 }
3036 }
3037 }
3038 }
3039
3040 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
3041 if (i > row_marker) {
3042 offset = i - row_marker;
3043 temp = TMPMEMALLOC(offset, index_t);
3044 for (iptr=0; iptr<offset; iptr++)
3045 temp[iptr] = RAP_main_idx[row_marker+iptr];
3046 if (offset > 0) {
3047 #ifdef USE_QSORTG
3048 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3049 #else
3050 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3051 #endif
3052 }
3053 temp_val = TMPMEMALLOC(offset * block_size, double);
3054 for (iptr=0; iptr<offset; iptr++){
3055 k = P_marker[temp[iptr]];
3056 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
3057 P_marker[temp[iptr]] = iptr + row_marker;
3058 }
3059 for (iptr=0; iptr<offset; iptr++){
3060 RAP_main_idx[row_marker+iptr] = temp[iptr];
3061 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3062 }
3063 TMPMEMFREE(temp);
3064 TMPMEMFREE(temp_val);
3065 }
3066 if (j > row_marker_ext) {
3067 offset = j - row_marker_ext;
3068 temp = TMPMEMALLOC(offset, index_t);
3069 for (iptr=0; iptr<offset; iptr++)
3070 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
3071 if (offset > 0) {
3072 #ifdef USE_QSORTG
3073 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3074 #else
3075 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3076 #endif
3077 }
3078 temp_val = TMPMEMALLOC(offset * block_size, double);
3079 for (iptr=0; iptr<offset; iptr++){
3080 k = P_marker[temp[iptr] + num_Pmain_cols];
3081 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
3082 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
3083 }
3084 for (iptr=0; iptr<offset; iptr++){
3085 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
3086 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3087 }
3088 TMPMEMFREE(temp);
3089 TMPMEMFREE(temp_val);
3090 }
3091 }
3092
3093 TMPMEMFREE(RA_val);
3094 TMPMEMFREE(RAP_val);
3095 TMPMEMFREE(A_marker);
3096 TMPMEMFREE(Pext_to_RAP);
3097 TMPMEMFREE(Pcouple_to_RAP);
3098 MEMFREE(RAP_ext_ptr);
3099 MEMFREE(RAP_ext_idx);
3100 MEMFREE(RAP_ext_val);
3101 Paso_SparseMatrix_free(R_main);
3102 Paso_SparseMatrix_free(R_couple);
3103
3104 /* Check whether there are empty columns in RAP_couple */
3105 #pragma omp parallel for schedule(static) private(i)
3106 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
3107 /* num of non-empty columns is stored in "k" */
3108 k = 0;
3109 j = RAP_couple_ptr[num_Pmain_cols];
3110 for (i=0; i<j; i++) {
3111 i1 = RAP_couple_idx[i];
3112 if (P_marker[i1]) {
3113 P_marker[i1] = 0;
3114 k++;
3115 }
3116 }
3117
3118 /* empty columns were found: compress them out and renumber */
3119 if (k < num_RAPext_cols) {
3120 temp = TMPMEMALLOC(k, index_t);
3121 k = 0;
3122 for (i=0; i<num_RAPext_cols; i++)
3123 if (!P_marker[i]) {
3124 P_marker[i] = k;
3125 temp[k] = global_id_RAP[i];
3126 k++;
3127 }
3128 for (i=0; i<j; i++) {
3129 i1 = RAP_couple_idx[i];
3130 RAP_couple_idx[i] = P_marker[i1];
3131 }
3132 num_RAPext_cols = k;
3133 TMPMEMFREE(global_id_RAP);
3134 global_id_RAP = temp;
3135 }
3136 TMPMEMFREE(P_marker);
3137
3138 /******************************************************/
3139 /* Start to create the coarse level System Matrix A_c */
3140 /******************************************************/
3141 /* first, prepare the sender/receiver for the col_connector */
3142 dist = P->pattern->input_distribution->first_component;
3143 recv_len = TMPMEMALLOC(size, dim_t);
3144 send_len = TMPMEMALLOC(size, dim_t);
3145 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
3146 offsetInShared = TMPMEMALLOC(size+1, index_t);
3147 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
3148 memset(recv_len, 0, sizeof(dim_t) * size);
3149 memset(send_len, 0, sizeof(dim_t) * size);
3150 num_neighbors = 0;
3151 offsetInShared[0] = 0;
3152 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
3153 shared[i] = i + num_Pmain_cols;
3154 if (k <= global_id_RAP[i]) {
3155 if (recv_len[j] > 0) {
3156 neighbor[num_neighbors] = j;
3157 num_neighbors ++;
3158 offsetInShared[num_neighbors] = i;
3159 }
3160 while (k <= global_id_RAP[i]) {
3161 j++;
3162 k = dist[j+1];
3163 }
3164 }
3165 recv_len[j] ++;
3166 }
3167 if (recv_len[j] > 0) {
3168 neighbor[num_neighbors] = j;
3169 num_neighbors ++;
3170 offsetInShared[num_neighbors] = i;
3171 }
3172 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3173 neighbor, shared, offsetInShared, 1, 0, mpi_info);
3174
3175 #ifdef ESYS_MPI
3176 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
3177 #endif

#ifdef ESYS_MPI
  mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
  mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
#else
  mpi_requests=TMPMEMALLOC(size*2,int);
  mpi_stati=TMPMEMALLOC(size*2,int);
#endif
  num_neighbors = 0;
  j = 0;
  offsetInShared[0] = 0;
  for (i=0; i<size; i++) {
    if (send_len[i] > 0) {
      neighbor[num_neighbors] = i;
      num_neighbors++;
      j += send_len[i];
      offsetInShared[num_neighbors] = j;
    }
  }
  TMPMEMFREE(shared);
  shared = TMPMEMALLOC(j, index_t);
  for (i=0, j=0; i<num_neighbors; i++) {
    k = neighbor[i];
#ifdef ESYS_MPI
    MPI_Irecv(&shared[j], send_len[k], MPI_INT, k,
              mpi_info->msg_tag_counter+k,
              mpi_info->comm, &mpi_requests[i]);
#endif
    j += send_len[k];
  }
  for (i=0, j=0; i<recv->numNeighbors; i++) {
    k = recv->neighbor[i];
#ifdef ESYS_MPI
    MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[i+num_neighbors]);
#endif
    j += recv_len[k];
  }
#ifdef ESYS_MPI
  MPI_Waitall(num_neighbors + recv->numNeighbors,
              mpi_requests, mpi_stati);
#endif
  mpi_info->msg_tag_counter += size;
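  /* At this point the handshake is complete: we sent global_id_RAP
     (grouped by owner) to every rank we take columns from, and received
     into "shared" the global ids of our own columns that each neighbor
     references, grouped per neighbor. */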

  j = offsetInShared[num_neighbors];
  offset = dist[rank];
  for (i=0; i<j; i++) shared[i] = shared[i] - offset;
  send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                     neighbor, shared, offsetInShared, 1, 0, mpi_info);

  col_connector = Paso_Connector_alloc(send, recv);
  TMPMEMFREE(shared);
  Paso_SharedComponents_free(recv);
  Paso_SharedComponents_free(send);
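  /* col_connector now pairs a sender (our own coarse columns, converted
     to local indices by subtracting dist[rank]) with a receiver (the
     external columns this rank takes from its neighbors). */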

  /* now, create the row distribution (output_dist) and the column
     distribution (input_dist) */
  input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
  output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);

  /* then, prepare the sender/receiver for the row_connector; first,
     gather the information for the sender */
  sum = RAP_couple_ptr[num_Pmain_cols];
  len = TMPMEMALLOC(size, dim_t);
  send_ptr = TMPMEMALLOC(size, index_t*);
  send_idx = TMPMEMALLOC(size, index_t*);
  for (i=0; i<size; i++) {
    send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
    send_idx[i] = TMPMEMALLOC(sum, index_t);
    memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
  }
  memset(len, 0, sizeof(dim_t) * size);
  recv = col_connector->recv;
  sum = 0;
  for (i_r=0; i_r<num_Pmain_cols; i_r++) {
    i1 = RAP_couple_ptr[i_r];
    i2 = RAP_couple_ptr[i_r+1];
    if (i2 > i1) {
      /* then row i_r will be in the sender of row_connector; now check
         how many neighbors row i_r needs to be sent to */
      for (j=i1; j<i2; j++) {
        i_c = RAP_couple_idx[j];
        /* find out the corresponding neighbor "p" of column i_c */
        for (p=0; p<recv->numNeighbors; p++) {
          if (i_c < recv->offsetInShared[p+1]) {
            k = recv->neighbor[p];
            if (send_ptr[k][i_r] == 0) sum++;
            send_ptr[k][i_r]++;
            send_idx[k][len[k]] = global_id_RAP[i_c];
            len[k]++;
            break;
          }
        }
      }
    }
  }
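  /* After this scan, for every neighbor k: send_ptr[k][i_r] holds the
     number of entries of row i_r that couple to columns owned by k,
     send_idx[k] lists the global column ids of those entries, and "sum"
     counts the distinct (row, neighbor) pairs, i.e. the total number of
     row copies to be sent. */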
  if (global_id_RAP) {
    TMPMEMFREE(global_id_RAP);
    global_id_RAP = NULL;
  }

  /* now allocate the sender */
  shared = TMPMEMALLOC(sum, index_t);
  memset(send_len, 0, sizeof(dim_t) * size);
  num_neighbors = 0;
  offsetInShared[0] = 0;
  for (p=0, k=0; p<size; p++) {
    for (i=0; i<num_Pmain_cols; i++) {
      if (send_ptr[p][i] > 0) {
        shared[k] = i;
        k++;
        send_ptr[p][send_len[p]] = send_ptr[p][i];
        send_len[p]++;
      }
    }
    if (k > offsetInShared[num_neighbors]) {
      neighbor[num_neighbors] = p;
      num_neighbors++;
      offsetInShared[num_neighbors] = k;
    }
  }
  send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                     neighbor, shared, offsetInShared, 1, 0, mpi_info);
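  /* Note that send_ptr[p] is compacted in place: its first send_len[p]
     entries now hold the per-row entry counts for just the rows actually
     sent to p, in the same order as those rows appear in "shared". */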

  /* send/recv the number of rows to be sent from the current proc;
     this recovers the receiver info of row_connector from the sender info */
#ifdef ESYS_MPI
  MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
#endif
  num_neighbors = 0;
  offsetInShared[0] = 0;
  j = 0;
  for (i=0; i<size; i++) {
    if (i != rank && recv_len[i] > 0) {
      neighbor[num_neighbors] = i;
      num_neighbors++;
      j += recv_len[i];
      offsetInShared[num_neighbors] = j;
    }
  }
  TMPMEMFREE(shared);
  TMPMEMFREE(recv_len);
  shared = TMPMEMALLOC(j, index_t);
  k = offsetInShared[num_neighbors];
  for (i=0; i<k; i++) {
    shared[i] = i + num_Pmain_cols;
  }
  recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                     neighbor, shared, offsetInShared, 1, 0, mpi_info);
  row_connector = Paso_Connector_alloc(send, recv);
  TMPMEMFREE(shared);
  Paso_SharedComponents_free(recv);
  Paso_SharedComponents_free(send);
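  /* row_connector mirrors col_connector: the rows received from other
     ranks (which will back the rowCoupleBlock) are numbered locally as
     num_Pmain_cols, num_Pmain_cols+1, ... */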

  /* send/recv pattern->ptr for rowCoupleBlock */
  num_RAPext_rows = offsetInShared[num_neighbors];
  row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
  for (p=0; p<num_neighbors; p++) {
    j = offsetInShared[p];
    i = offsetInShared[p+1];
#ifdef ESYS_MPI
    MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
              mpi_info->msg_tag_counter+neighbor[p],
              mpi_info->comm, &mpi_requests[p]);
#endif
  }
  send = row_connector->send;
  for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
    MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
               MPI_INT, send->neighbor[p],
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
  }
#ifdef ESYS_MPI
  MPI_Waitall(num_neighbors + send->numNeighbors,
              mpi_requests, mpi_stati);
#endif
  mpi_info->msg_tag_counter += size;
  TMPMEMFREE(send_len);

  sum = 0;
  for (i=0; i<num_RAPext_rows; i++) {
    k = row_couple_ptr[i];
    row_couple_ptr[i] = sum;
    sum += k;
  }
  row_couple_ptr[num_RAPext_rows] = sum;
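  /* The loop above turns the received per-row counts into CSR offsets by
     an exclusive prefix sum; e.g. counts {2, 0, 3} become offsets
     {0, 2, 2, 5}, with the final entry (5) giving the total number of
     column indices to receive next. */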

  /* send/recv pattern->index for rowCoupleBlock */
  k = row_couple_ptr[num_RAPext_rows];
  row_couple_idx = MEMALLOC(k, index_t);
  for (p=0; p<num_neighbors; p++) {
    j1 = row_couple_ptr[offsetInShared[p]];
    j2 = row_couple_ptr[offsetInShared[p+1]];
#ifdef ESYS_MPI
    MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
              mpi_info->msg_tag_counter+neighbor[p],
              mpi_info->comm, &mpi_requests[p]);
#endif
  }
  for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
    MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
               MPI_INT, send->neighbor[p],
               mpi_info->msg_tag_counter+rank,
               mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
  }
#ifdef ESYS_MPI
  MPI_Waitall(num_neighbors + send->numNeighbors,
              mpi_requests, mpi_stati);
#endif
  mpi_info->msg_tag_counter += size;

  offset = input_dist->first_component[rank];
  k = row_couple_ptr[num_RAPext_rows];
  for (i=0; i<k; i++) {
    row_couple_idx[i] -= offset;
  }
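  /* The received indices are global ids of columns owned by this rank
     (that is how send_idx was filled on the sending side), so subtracting
     this rank's first global id converts them to local mainBlock column
     indices. */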

  for (i=0; i<size; i++) {
    TMPMEMFREE(send_ptr[i]);
    TMPMEMFREE(send_idx[i]);
  }
  TMPMEMFREE(send_ptr);
  TMPMEMFREE(send_idx);
  TMPMEMFREE(len);
  TMPMEMFREE(offsetInShared);
  TMPMEMFREE(neighbor);
  TMPMEMFREE(mpi_requests);
  TMPMEMFREE(mpi_stati);
  Esys_MPIInfo_free(mpi_info);

  /* Now, we can create the patterns for mainBlock and coupleBlock */
  main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
                                    num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
  col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                                          num_Pmain_cols, num_RAPext_cols,
                                          RAP_couple_ptr, RAP_couple_idx);
  row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                                          num_RAPext_rows, num_Pmain_cols,
                                          row_couple_ptr, row_couple_idx);
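  /* main_pattern is the square pattern over the num_Pmain_cols locally
     owned rows and columns; col_couple_pattern connects local rows to the
     num_RAPext_cols external columns; row_couple_pattern connects the
     num_RAPext_rows external rows back to local columns. */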

  /* next, create the system matrix */
  pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
                                           output_dist, input_dist, main_pattern, col_couple_pattern,
                                           row_couple_pattern, col_connector, row_connector);
  out = Paso_SystemMatrix_alloc(A->type, pattern,
                                row_block_size, col_block_size, FALSE);

  /* finally, fill in the data */
  memcpy(out->mainBlock->val, RAP_main_val,
         out->mainBlock->len * sizeof(double));
  memcpy(out->col_coupleBlock->val, RAP_couple_val,
         out->col_coupleBlock->len * sizeof(double));
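  /* A flat memcpy suffices here because the value arrays were assembled
     in the same entry order as the ptr/idx arrays that now back the
     patterns of "out". */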

  /* Clean up */
  Paso_SystemMatrixPattern_free(pattern);
  Paso_Pattern_free(main_pattern);
  Paso_Pattern_free(col_couple_pattern);
  Paso_Pattern_free(row_couple_pattern);
  Paso_Connector_free(col_connector);
  Paso_Connector_free(row_connector);
  Paso_Distribution_free(output_dist);
  Paso_Distribution_free(input_dist);
  TMPMEMFREE(RAP_main_val);
  TMPMEMFREE(RAP_couple_val);
  if (Esys_noError()) {
    return out;
  } else {
    Paso_SystemMatrix_free(out);
    return NULL;
  }
}
