
Contents of /trunk/paso/src/AMG_Interpolation.cpp



Revision 3981
Fri Sep 21 02:47:54 2012 UTC by jfenwick
Original Path: trunk/paso/src/AMG_Interpolation.c
File MIME type: text/plain
File size: 123580 bytes
First pass of updating copyright notices
1 /*****************************************************************************
2 *
3 * Copyright (c) 2003-2012 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development since 2012 by School of Earth Sciences
12 *
13 *****************************************************************************/
14
15
16 /************************************************************************************/
17
18 /* Paso: defines AMG Interpolation */
19
20 /************************************************************************************/
21
22 /* Author: l.gao@uq.edu.au */
23
24 /************************************************************************************/
25
26 #include "Paso.h"
27 #include "SparseMatrix.h"
28 #include "PasoUtil.h"
29 #include "Preconditioner.h"
30
31 /************************************************************************************
32
33 Methods necessary for the AMG preconditioner:
34
35 construct n_C x n_C interpolation operator A_C from matrix A
36 and prolongation matrix P.
37
38 The coarse operator A_C is defined by A_C = RAP, where R = P^T.
39
40 *************************************************************************************/
41
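/* For orientation, a minimal dense sketch (not part of Paso) of the
   Galerkin triple product A_C = R A P with R = P^T, for n fine and n_C
   coarse unknowns stored row-major. The sparse code below computes the
   same product block-wise, distributed over MPI ranks. */
#if 0
static void dense_RAP(int n, int n_C, const double *A, const double *P,
                      double *A_C)
{
    int i, j, k, l;
    for (i = 0; i < n_C; i++) {
        for (j = 0; j < n_C; j++) {
            double s = 0.0;
            /* A_C[i,j] = sum_{k,l} P[k,i] * A[k,l] * P[l,j] */
            for (k = 0; k < n; k++)
                for (l = 0; l < n; l++)
                    s += P[k*n_C+i] * A[k*n+l] * P[l*n_C+j];
            A_C[i*n_C+j] = s;
        }
    }
}
#endif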
42 /* Extend the system matrix B with two extra sparse matrices:
43 B_ext_main and B_ext_couple.
44 Together, these two sparse matrices represent the portion
45 of B that is stored on neighbor processes and is needed
46 locally for the triple matrix product (B^T) A B.
47
48 FOR NOW, we assume that the system matrix B has a NULL
49 row_coupleBlock and a NULL remote_coupleBlock. Based on
50 this assumption, we use the row_coupleBlock link for the sparse
51 matrix B_ext_main, and the remote_coupleBlock link for the sparse
52 matrix B_ext_couple.
53
54 To be specific, B_ext (on processor k) is the group of rows
55 in B, where the list of rows from processor q is given by
56 A->col_coupler->connector->send->shared[rPtr...]
57 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
58 on q. */
59 void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
60 {
61 Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
62 Paso_Coupler *coupler=NULL;
63 Paso_SharedComponents *send=NULL, *recv=NULL;
64 double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
65 index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
66 index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
67 index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
68 index_t *idx_m=NULL, *idx_c=NULL;
69 index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
70 index_t offset, len, block_size, block_size_size, max_num_cols;
71 index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
72 dim_t *recv_degree=NULL, *send_degree=NULL;
73 dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;
74
75 if (size == 1) return;
76
77 if (B->row_coupleBlock) {
78 Paso_SparseMatrix_free(B->row_coupleBlock);
79 B->row_coupleBlock = NULL;
80 }
81
82 if (B->remote_coupleBlock) { /* row_coupleBlock was just released above, so only the remote link can still be set */
83 Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
84 return;
85 }
86
87 /* send/receive the global IDs of the unknowns */
88 num_main_cols = B->mainBlock->numCols;
89 cols = TMPMEMALLOC(num_main_cols, double);
90 offset = B->col_distribution->first_component[rank];
91 #pragma omp parallel for private(i) schedule(static)
92 for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
93 if (B->global_id == NULL) {
94 coupler=Paso_Coupler_alloc(B->col_coupler->connector, 1);
95 Paso_Coupler_startCollect(coupler, cols);
96 }
97
98 recv_buf = TMPMEMALLOC(size, index_t);
99 recv_degree = TMPMEMALLOC(size, dim_t);
100 recv_offset = TMPMEMALLOC(size+1, index_t);
101 #pragma omp parallel for private(i) schedule(static)
102 for (i=0; i<size; i++){
103 recv_buf[i] = 0;
104 recv_degree[i] = 1;
105 recv_offset[i] = i;
106 }
107
108 block_size = B->block_size;
109 block_size_size = block_size * sizeof(double);
110 num_couple_cols = B->col_coupleBlock->numCols;
111 send = A->col_coupler->connector->send;
112 recv = A->col_coupler->connector->recv;
113 num_neighbors = send->numNeighbors;
114 p = send->offsetInShared[num_neighbors];
115 len = p * B->col_distribution->first_component[size];
116 send_buf = TMPMEMALLOC(len * block_size, double);
117 send_idx = TMPMEMALLOC(len, index_t);
118 send_offset = TMPMEMALLOC((p+1)*2, index_t);
119 send_degree = TMPMEMALLOC(num_neighbors, dim_t);
120 i = num_main_cols + num_couple_cols;
121 send_m = TMPMEMALLOC(i * block_size, double);
122 send_c = TMPMEMALLOC(i * block_size, double);
123 idx_m = TMPMEMALLOC(i, index_t);
124 idx_c = TMPMEMALLOC(i, index_t);
125
126 /* wait for the global IDs of the unknowns to arrive */
127 if (B->global_id == NULL) {
128 Paso_Coupler_finishCollect(coupler);
129 global_id = MEMALLOC(num_couple_cols, index_t);
130 #pragma omp parallel for private(i) schedule(static)
131 for (i=0; i<num_couple_cols; ++i)
132 global_id[i] = coupler->recv_buffer[i];
133 B->global_id = global_id;
134 Paso_Coupler_free(coupler);
135 } else
136 global_id = B->global_id;
137
138 /* distribute the number of cols in current col_coupleBlock to all ranks */
139 #ifdef ESYS_MPI
140 MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
141 #endif
142
143 /* distribute global_ids of cols to be considered to all ranks */
144 len = 0;
145 max_num_cols = 0;
146 for (i=0; i<size; i++){
147 recv_degree[i] = recv_buf[i];
148 recv_offset[i] = len;
149 len += recv_buf[i];
150 if (max_num_cols < recv_buf[i])
151 max_num_cols = recv_buf[i];
152 }
153 recv_offset[size] = len;
154 cols_array = TMPMEMALLOC(len, index_t);
155 #ifdef ESYS_MPI
156 MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
157 #endif
158
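/* The two Allgatherv calls above are the standard two-phase pattern for
   gathering variable-length data: first exchange the per-rank counts
   (here recv_degree is all 1s, so it is effectively an MPI_Allgather),
   then turn the counts into displacements and gather the payload. A
   minimal sketch with hypothetical names (counts, displs, all_ids): */
#if 0
    {
        int r, nloc = (int)num_couple_cols;
        int *counts = TMPMEMALLOC(size, int);
        int *displs = TMPMEMALLOC(size+1, int);
        index_t *all_ids;
        MPI_Allgather(&nloc, 1, MPI_INT, counts, 1, MPI_INT,
                      A->mpi_info->comm);
        displs[0] = 0;
        for (r = 0; r < size; r++) displs[r+1] = displs[r] + counts[r];
        all_ids = TMPMEMALLOC(displs[size], index_t);
        MPI_Allgatherv(global_id, nloc, MPI_INT, all_ids, counts, displs,
                       MPI_INT, A->mpi_info->comm);
    }
#endif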
159 /* first, prepare the ptr_ptr to be received */
160 q = recv->numNeighbors;
161 len = recv->offsetInShared[q];
162 ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
163 for (p=0; p<q; p++) {
164 row = recv->offsetInShared[p];
165 m = recv->offsetInShared[p + 1];
166 #ifdef ESYS_MPI
167 MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
168 A->mpi_info->msg_tag_counter+recv->neighbor[p],
169 A->mpi_info->comm,
170 &(A->col_coupler->mpi_requests[p]));
171 #endif
172 }
173
174 /* now prepare the rows to be sent (the degree, the offset and the data) */
175 len = 0;
176 i0 = 0;
177 for (p=0; p<num_neighbors; p++) {
178 i = i0;
179 neighbor = send->neighbor[p];
180 m_lb = B->col_distribution->first_component[neighbor];
181 m_ub = B->col_distribution->first_component[neighbor + 1];
182 j_ub = send->offsetInShared[p + 1];
183 for (j=send->offsetInShared[p]; j<j_ub; j++) {
184 row = send->shared[j];
185 l_m = 0;
186 l_c = 0;
187 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
188 k_lb = B->col_coupleBlock->pattern->ptr[row];
189
190 /* check part of col_coupleBlock for data to be passed @row */
191 for (k=k_lb; k<k_ub; k++) {
192 m = global_id[B->col_coupleBlock->pattern->index[k]];
193 if (m > offset) break;
194 if (m>= m_lb && m < m_ub) {
195 /* data to be passed to sparse matrix B_ext_main */
196 idx_m[l_m] = m - m_lb;
197 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
198 l_m++;
199 } else {
200 /* data to be passed to sparse matrix B_ext_couple */
201 idx_c[l_c] = m;
202 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
203 l_c++;
204 }
205 }
206 k_lb = k;
207
208 /* check mainBlock for data to be passed @row to sparse
209 matrix B_ext_couple */
210 k_ub = B->mainBlock->pattern->ptr[row + 1];
211 k = B->mainBlock->pattern->ptr[row];
212 memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
213 for (; k<k_ub; k++) {
214 m = B->mainBlock->pattern->index[k] + offset;
215 idx_c[l_c] = m;
216 l_c++;
217 }
218
219 /* check the remaining part of col_coupleBlock for data to
220 be passed @row to sparse matrix B_ext_couple */
221 k = k_lb;
222 k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
223 for (k=k_lb; k<k_ub; k++) {
224 m = global_id[B->col_coupleBlock->pattern->index[k]];
225 if (m>= m_lb && m < m_ub) {
226 /* data to be passed to sparse matrix B_ext_main */
227 idx_m[l_m] = m - m_lb;
228 memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
229 l_m++;
230 } else {
231 /* data to be passed to sparse matrix B_ext_couple */
232 idx_c[l_c] = m;
233 memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
234 l_c++;
235 }
236 }
237
238 memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
239 memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
240 send_offset[2*i] = l_m;
241 len += l_m;
242 memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
243 memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
244 send_offset[2*i+1] = l_c;
245 len += l_c;
246 i++;
247 }
248
249 /* sending */
250 #ifdef ESYS_MPI
251 MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
252 A->mpi_info->msg_tag_counter+rank,
253 A->mpi_info->comm,
254 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
255 #endif
256 send_degree[p] = len;
257 i0 = i;
258 }
259 TMPMEMFREE(send_m);
260 TMPMEMFREE(send_c);
261 TMPMEMFREE(idx_m);
262 TMPMEMFREE(idx_c);
263
264
265 q = recv->numNeighbors;
266 len = recv->offsetInShared[q];
267 ptr_main = MEMALLOC((len+1), index_t);
268 ptr_couple = MEMALLOC((len+1), index_t);
269
270 #ifdef ESYS_MPI
271 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
272 A->col_coupler->mpi_requests,
273 A->col_coupler->mpi_stati);
274 #endif
275 A->mpi_info->msg_tag_counter += size;
276
277 j = 0;
278 k = 0;
279 ptr_main[0] = 0;
280 ptr_couple[0] = 0;
281 for (i=0; i<len; i++) {
282 j += ptr_ptr[2*i];
283 k += ptr_ptr[2*i+1];
284 ptr_main[i+1] = j;
285 ptr_couple[i+1] = k;
286 }
287
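/* The loop above is a running (inclusive) prefix sum: the received pairs
   in ptr_ptr hold, per row, the number of entries destined for the main
   and the couple block, and the sums turn those counts into the CSR row
   pointers ptr_main/ptr_couple. The generic pattern, as a sketch: */
#if 0
    /* degrees d[0..n-1] -> CSR row pointer ptr[0..n] */
    ptr[0] = 0;
    for (i = 0; i < n; i++) ptr[i+1] = ptr[i] + d[i];
#endif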
288 TMPMEMFREE(ptr_ptr);
289 idx_main = MEMALLOC(j, index_t);
290 idx_couple = MEMALLOC(k, index_t);
291 ptr_idx = TMPMEMALLOC(j+k, index_t);
292 ptr_val = TMPMEMALLOC((j+k) * block_size, double);
293
294 /* send/receive index array */
295 j=0;
296 k_ub = 0;
297 for (p=0; p<recv->numNeighbors; p++) {
298 k = recv->offsetInShared[p];
299 m = recv->offsetInShared[p+1];
300 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
301 if (i > 0) {
302 k_ub ++;
303 #ifdef ESYS_MPI
304 MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
305 A->mpi_info->msg_tag_counter+recv->neighbor[p],
306 A->mpi_info->comm,
307 &(A->col_coupler->mpi_requests[p]));
308 #endif
309 }
310 j += i;
311 }
312
313 j=0;
314 k_ub = 0;
315 for (p=0; p<num_neighbors; p++) {
316 i = send_degree[p] - j;
317 if (i > 0){
318 k_ub ++;
319 #ifdef ESYS_MPI
320 MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
321 A->mpi_info->msg_tag_counter+rank,
322 A->mpi_info->comm,
323 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
324 #endif
325 }
326 j = send_degree[p];
327 }
328
329 #ifdef ESYS_MPI
330 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
331 A->col_coupler->mpi_requests,
332 A->col_coupler->mpi_stati);
333 #endif
334 A->mpi_info->msg_tag_counter += size;
335
336 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
337 for (i=0; i<len; i++) {
338 j = ptr_main[i];
339 k = ptr_main[i+1];
340 m = ptr_couple[i];
341 for (p=j; p<k; p++) {
342 idx_main[p] = ptr_idx[m+p];
343 }
344 j = ptr_couple[i+1];
345 for (p=m; p<j; p++) {
346 idx_couple[p] = ptr_idx[k+p];
347 }
348 }
349 TMPMEMFREE(ptr_idx);
350
351 /* allocate pattern and sparsematrix for B_ext_main */
352 pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
353 len, num_main_cols, ptr_main, idx_main);
354 B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
355 pattern_main, B->row_block_size, B->col_block_size,
356 FALSE);
357 Paso_Pattern_free(pattern_main);
358
359 /* allocate pattern and sparsematrix for B_ext_couple */
360 pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
361 len, B->col_distribution->first_component[size],
362 ptr_couple, idx_couple);
363 B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
364 pattern_couple, B->row_block_size, B->col_block_size,
365 FALSE);
366 Paso_Pattern_free(pattern_couple);
367
368 /* send/receive value array */
369 j=0;
370 for (p=0; p<recv->numNeighbors; p++) {
371 k = recv->offsetInShared[p];
372 m = recv->offsetInShared[p+1];
373 i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
374 #ifdef ESYS_MPI
375 if (i > 0)
376 MPI_Irecv(&(ptr_val[j]), i * block_size,
377 MPI_DOUBLE, recv->neighbor[p],
378 A->mpi_info->msg_tag_counter+recv->neighbor[p],
379 A->mpi_info->comm,
380 &(A->col_coupler->mpi_requests[p]));
381 #endif
382 j += (i * block_size);
383 }
384
385 j=0;
386 for (p=0; p<num_neighbors; p++) {
387 i = send_degree[p] - j;
388 #ifdef ESYS_MPI
389 if (i > 0)
390 MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
391 A->mpi_info->msg_tag_counter+rank,
392 A->mpi_info->comm,
393 &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
394 #endif
395 j = send_degree[p] ;
396 }
397
398 #ifdef ESYS_MPI
399 MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
400 A->col_coupler->mpi_requests,
401 A->col_coupler->mpi_stati);
402 #endif
403 A->mpi_info->msg_tag_counter += size;
404
405 #pragma omp parallel for private(i,j,k,m,p) schedule(static)
406 for (i=0; i<len; i++) {
407 j = ptr_main[i];
408 k = ptr_main[i+1];
409 m = ptr_couple[i];
410 for (p=j; p<k; p++) {
411 memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
412 }
413 j = ptr_couple[i+1];
414 for (p=m; p<j; p++) {
415 memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
416 }
417 }
418 TMPMEMFREE(ptr_val);
419
420
421 /* release all temp memory allocation */
422 TMPMEMFREE(cols);
423 TMPMEMFREE(cols_array);
424 TMPMEMFREE(recv_offset);
425 TMPMEMFREE(recv_degree);
426 TMPMEMFREE(recv_buf);
427 TMPMEMFREE(send_buf);
428 TMPMEMFREE(send_offset);
429 TMPMEMFREE(send_degree);
430 TMPMEMFREE(send_idx);
431 }
432
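/* Illustration only: how Paso_Preconditioner_AMG_extendB splits one shared
   row into a "main" part (columns owned by the receiving rank, shifted to
   local indices) and a "couple" part (all other columns, kept global). The
   sketch assumes a plain CSR row with global column ids col_g and the
   receiver's ownership range [m_lb, m_ub). */
#if 0
static void split_row(dim_t nnz, const index_t *col_g,
                      index_t m_lb, index_t m_ub,
                      index_t *idx_m, dim_t *l_m,   /* main part, local ids */
                      index_t *idx_c, dim_t *l_c)   /* couple part, global ids */
{
    dim_t k;
    *l_m = 0;
    *l_c = 0;
    for (k = 0; k < nnz; k++) {
        if (col_g[k] >= m_lb && col_g[k] < m_ub)
            idx_m[(*l_m)++] = col_g[k] - m_lb;
        else
            idx_c[(*l_c)++] = col_g[k];
    }
}
#endif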
433 /* The sparse matrix (call it T) defined by T(ptr, idx, val)
434 has the same number of rows as P->col_coupleBlock->numCols. Now we need
435 to copy blocks of data in T to the neighbor processors given by
436 P->col_coupler->connector->recv->neighbor[k], where k is in
437 [0, P->col_coupler->connector->recv->numNeighbors).
438 The rows to be copied to neighbor processor k are given by the list
439 P->col_coupler->connector->recv->offsetInShared[k] ...
440 P->col_coupler->connector->recv->offsetInShared[k+1] */
441 void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
442 index_t **p_ptr, index_t **p_idx, double **p_val,
443 index_t *global_id, index_t block_size)
444 {
445 Paso_SharedComponents *send=NULL, *recv=NULL;
446 index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
447 index_t i, j, p, m, n, size;
448 index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
449 index_t *ptr=*p_ptr, *idx=*p_idx;
450 double *val=*p_val, *recv_val=NULL;
451 #ifdef ESYS_MPI
452 index_t rank = P->mpi_info->rank;
453 #endif
454
455 size = P->mpi_info->size;
456 send = P->col_coupler->connector->recv;
457 recv = P->col_coupler->connector->send;
458 send_neighbors = send->numNeighbors;
459 recv_neighbors = recv->numNeighbors;
460 send_rows = P->col_coupleBlock->numCols;
461 recv_rows = recv->offsetInShared[recv_neighbors];
462
463 send_degree = TMPMEMALLOC(send_rows, index_t);
464 recv_ptr = MEMALLOC(recv_rows + 1, index_t);
465 #pragma omp for schedule(static) private(i)
466 for (i=0; i<send_rows; i++)
467 send_degree[i] = ptr[i+1] - ptr[i];
468
469 /* First, send/receive the degree */
470 for (p=0; p<recv_neighbors; p++) { /* Receiving */
471 m = recv->offsetInShared[p];
472 n = recv->offsetInShared[p+1];
473 #ifdef ESYS_MPI
474 MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
475 P->mpi_info->msg_tag_counter + recv->neighbor[p],
476 P->mpi_info->comm,
477 &(P->col_coupler->mpi_requests[p]));
478 #endif
479 }
480 for (p=0; p<send_neighbors; p++) { /* Sending */
481 m = send->offsetInShared[p];
482 n = send->offsetInShared[p+1];
483 #ifdef ESYS_MPI
484 MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
485 P->mpi_info->msg_tag_counter + rank,
486 P->mpi_info->comm,
487 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
488 #endif
489 }
490 #ifdef ESYS_MPI
491 MPI_Waitall(send_neighbors+recv_neighbors,
492 P->col_coupler->mpi_requests,
493 P->col_coupler->mpi_stati);
494 #endif
495 P->mpi_info->msg_tag_counter += size;
496
497 TMPMEMFREE(send_degree);
498 m = Paso_Util_cumsum(recv_rows, recv_ptr);
499 recv_ptr[recv_rows] = m;
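/* Sketch of the assumed semantics of Paso_Util_cumsum: replace the first
   len entries by their exclusive prefix sum and return the total. This is
   consistent with recv_ptr being completed above and then used as a CSR
   row pointer from here on. */
#if 0
static index_t cumsum_sketch(dim_t len, index_t *x)
{
    dim_t i;
    index_t run = 0, tmp;
    for (i = 0; i < len; i++) {
        tmp = x[i];
        x[i] = run;
        run += tmp;
    }
    return run;
}
#endif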
500 recv_idx = MEMALLOC(m, index_t);
501 recv_val = MEMALLOC(m * block_size, double);
502
503 /* Next, send/receive the index array */
504 j = 0;
505 for (p=0; p<recv_neighbors; p++) { /* Receiving */
506 m = recv->offsetInShared[p];
507 n = recv->offsetInShared[p+1];
508 i = recv_ptr[n] - recv_ptr[m];
509 if (i > 0) {
510 #ifdef ESYS_MPI
511 MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
512 P->mpi_info->msg_tag_counter + recv->neighbor[p],
513 P->mpi_info->comm,
514 &(P->col_coupler->mpi_requests[p]));
515 #endif
516 }
517 j += i;
518 }
519
520 j = 0;
521 for (p=0; p<send_neighbors; p++) { /* Sending */
522 m = send->offsetInShared[p];
523 n = send->offsetInShared[p+1];
524 i = ptr[n] - ptr[m];
525 if (i >0) {
526 #ifdef ESYS_MPI
527 MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
528 P->mpi_info->msg_tag_counter + rank,
529 P->mpi_info->comm,
530 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
531 #endif
532 j += i;
533 }
534 }
535 #ifdef ESYS_MPI
536 MPI_Waitall(send_neighbors+recv_neighbors,
537 P->col_coupler->mpi_requests,
538 P->col_coupler->mpi_stati);
539 #endif
540 P->mpi_info->msg_tag_counter += size;
541
542 /* Last, send/receive the data array */
543 j = 0;
544 for (p=0; p<recv_neighbors; p++) { /* Receiving */
545 m = recv->offsetInShared[p];
546 n = recv->offsetInShared[p+1];
547 i = recv_ptr[n] - recv_ptr[m];
548 #ifdef ESYS_MPI
549 if (i > 0)
550 MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
551 P->mpi_info->msg_tag_counter + recv->neighbor[p],
552 P->mpi_info->comm,
553 &(P->col_coupler->mpi_requests[p]));
554 #endif
555 j += (i*block_size);
556 }
557
558 j = 0;
559 for (p=0; p<send_neighbors; p++) { /* Sending */
560 m = send->offsetInShared[p];
561 n = send->offsetInShared[p+1];
562 i = ptr[n] - ptr[m];
563 if (i >0) {
564 #ifdef ESYS_MPI
565 MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
566 P->mpi_info->msg_tag_counter + rank,
567 P->mpi_info->comm,
568 &(P->col_coupler->mpi_requests[p+recv_neighbors]));
569 #endif
570 j += i * block_size;
571 }
572 }
573 #ifdef ESYS_MPI
574 MPI_Waitall(send_neighbors+recv_neighbors,
575 P->col_coupler->mpi_requests,
576 P->col_coupler->mpi_stati);
577 #endif
578 P->mpi_info->msg_tag_counter += size;
579
580 /* Clean up and return with the received ptr, index and data arrays */
581 MEMFREE(ptr);
582 MEMFREE(idx);
583 MEMFREE(val);
584 *p_ptr = recv_ptr;
585 *p_idx = recv_idx;
586 *p_val = recv_val;
587 }
588
589 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
590 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
591 {
592 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
593 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
594 Paso_SystemMatrix *out=NULL;
595 Paso_SystemMatrixPattern *pattern=NULL;
596 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
597 Paso_SharedComponents *send =NULL, *recv=NULL;
598 Paso_Connector *col_connector=NULL, *row_connector=NULL;
599 Paso_Pattern *main_pattern=NULL;
600 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
601 const dim_t row_block_size=A->row_block_size;
602 const dim_t col_block_size=A->col_block_size;
603 const dim_t block_size = A->block_size;
604 const dim_t P_block_size = P->block_size;
605 const double ZERO = 0.0;
606 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
607 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
608 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
609 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
610 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
611 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
612 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
613 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
614 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
615 index_t sum, i, j, k, iptr, irb, icb, ib;
616 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
617 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
618 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
619 index_t row_marker_ext, *where_p=NULL;
620 index_t **send_ptr=NULL, **send_idx=NULL;
621 dim_t p, num_neighbors;
622 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
623 Esys_MPI_rank *neighbor=NULL;
624 #ifdef ESYS_MPI
625 MPI_Request* mpi_requests=NULL;
626 MPI_Status* mpi_stati=NULL;
627 #else
628 int *mpi_requests=NULL, *mpi_stati=NULL;
629 #endif
630
631 /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
632 return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/
633
634 /* Two sparse matrices R_main and R_couple will be generated, as the
635 transposes of P_main and P_col_couple, respectively. Note that
636 R_couple is actually the row_coupleBlock of R (R=P^T) */
637 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
638 if (size > 1)
639 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
640
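/* Paso_SparseMatrix_getTranspose is used here to form R = P^T blockwise.
   For reference, a standard counting-sort CSR transpose looks like this
   sketch (hypothetical plain-CSR arrays, not the Paso data structures): */
#if 0
static void csr_transpose(dim_t nrow, dim_t ncol,
                          const index_t *ptr, const index_t *idx,
                          const double *val,
                          index_t *tptr, index_t *tidx, double *tval)
{
    dim_t i, j;
    index_t p, q;
    for (j = 0; j <= ncol; j++) tptr[j] = 0;
    for (p = 0; p < ptr[nrow]; p++) tptr[idx[p]+1]++;   /* count per column */
    for (j = 0; j < ncol; j++) tptr[j+1] += tptr[j];    /* prefix sum */
    for (i = 0; i < nrow; i++) {                        /* scatter */
        for (p = ptr[i]; p < ptr[i+1]; p++) {
            q = tptr[idx[p]]++;
            tidx[q] = i;
            tval[q] = val[p];
        }
    }
    for (j = ncol; j > 0; j--) tptr[j] = tptr[j-1];     /* restore starts */
    tptr[0] = 0;
}
#endif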
641 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
642 and needed locally for the triple matrix product RAP.
643 To be specific, P_ext (on processor k) is the group of rows in P, where
644 the list of rows from processor q is given by
645 A->col_coupler->connector->send->shared[rPtr...]
646 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
647 on q.
648 P_ext is represented by two sparse matrices P_ext_main and
649 P_ext_couple */
650 Paso_Preconditioner_AMG_extendB(A, P);
651
652 /* count the number of cols in P_ext_couple, resize the pattern of
653 sparse matrix P_ext_couple with new compressed order, and then
654 build the col id mapping from P->col_coupleBlock to
655 P_ext_couple */
656 num_Pmain_cols = P->mainBlock->numCols;
657 if (size > 1) {
658 num_Pcouple_cols = P->col_coupleBlock->numCols;
659 num_Acouple_cols = A->col_coupleBlock->numCols;
660 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
661 } else {
662 num_Pcouple_cols = 0;
663 num_Acouple_cols = 0;
664 sum = 0;
665 }
666 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
667 offset = P->col_distribution->first_component[rank];
668 num_Pext_cols = 0;
669 if (P->global_id) {
670 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
671 and P->col_coupleBlock */
672 iptr = 0;
673 if (num_Pcouple_cols || sum > 0) {
674 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
675 #pragma omp parallel for lastprivate(iptr) schedule(static)
676 for (iptr=0; iptr<sum; iptr++){
677 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
678 }
679 for (j=0; j<num_Pcouple_cols; j++, iptr++){
680 temp[iptr] = P->global_id[j];
681 }
682 }
683 if (iptr) {
684 #ifdef USE_QSORTG
685 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
686 #else
687 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
688 #endif
689 num_Pext_cols = 1;
690 i = temp[0];
691 for (j=1; j<iptr; j++) {
692 if (temp[j] > i) {
693 i = temp[j];
694 temp[num_Pext_cols++] = i;
695 }
696 }
697 }
698 /* resize the pattern of P_ext_couple */
699 if(num_Pext_cols){
700 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
701 #pragma omp parallel for private(i) schedule(static)
702 for (i=0; i<num_Pext_cols; i++)
703 global_id_P[i] = temp[i];
704 }
705 if (num_Pcouple_cols || sum > 0)
706 TMPMEMFREE(temp);
707 #pragma omp parallel for private(i, where_p) schedule(static)
708 for (i=0; i<sum; i++) {
709 where_p = (index_t *)bsearch(
710 &(P->remote_coupleBlock->pattern->index[i]),
711 global_id_P, num_Pext_cols,
712 sizeof(index_t), Paso_comparIndex);
713 P->remote_coupleBlock->pattern->index[i] =
714 (index_t)(where_p -global_id_P);
715 }
716
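/* The block above compresses an arbitrary set of global column ids into a
   dense local numbering: sort, strip duplicates, then map each id to its
   position with bsearch. The pattern in isolation, as a sketch (ids, n and
   g are hypothetical): */
#if 0
    {
        index_t g, local, n_unique = 0;
        index_t *where;
        qsort(ids, (size_t)n, sizeof(index_t), Paso_comparIndex);
        for (j = 0; j < n; j++)
            if (n_unique == 0 || ids[j] > ids[n_unique-1])
                ids[n_unique++] = ids[j];
        where = (index_t *)bsearch(&g, ids, n_unique, sizeof(index_t),
                                   Paso_comparIndex);
        local = (index_t)(where - ids);   /* local index of global id g */
    }
#endif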
717 /* build the mapping */
718 if (num_Pcouple_cols) {
719 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
720 iptr = 0;
721 for (i=0; i<num_Pext_cols; i++)
722 if (global_id_P[i] == P->global_id[iptr]) {
723 Pcouple_to_Pext[iptr++] = i;
724 if (iptr == num_Pcouple_cols) break;
725 }
726 }
727 }
728
729 /* alloc and initialize the markers */
730 sum = num_Pext_cols + num_Pmain_cols;
731 P_marker = TMPMEMALLOC(sum, index_t);
732 A_marker = TMPMEMALLOC(num_A_cols, index_t);
733 #pragma omp parallel for private(i) schedule(static)
734 for (i=0; i<sum; i++) P_marker[i] = -1;
735 #pragma omp parallel for private(i) schedule(static)
736 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
737
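/* The marker trick used in the passes below: P_marker/A_marker are set to
   -1 once and then reused for every row. At the start of row i_r the
   current non-zero count is saved in row_marker; a column c with
   P_marker[c] < row_marker has not been touched in this row yet, so it
   creates a new entry, and P_marker[c] then records that entry's slot.
   The pattern on a single symbolic pass, as a sketch (ptr, idx, out_ptr
   hypothetical): */
#if 0
    {
        index_t nnz = 0, r, q, c, rm;
        for (r = 0; r < nrow; r++) {
            rm = nnz;                        /* row_marker */
            for (q = ptr[r]; q < ptr[r+1]; q++) {
                c = idx[q];
                if (P_marker[c] < rm) {      /* first visit of c in row r */
                    P_marker[c] = nnz;       /* remember its slot */
                    nnz++;
                }
            }
            out_ptr[r+1] = nnz;
        }
    }
#endif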
738 /* Now, count the size of RAP_ext. Start with rows in R_couple */
739 sum = 0;
740 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
741 row_marker = sum;
742 /* then, loop over elements in row i_r of R_couple */
743 j1_ub = R_couple->pattern->ptr[i_r+1];
744 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
745 i1 = R_couple->pattern->index[j1];
746 /* then, loop over elements in row i1 of A->col_coupleBlock */
747 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
748 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
749 i2 = A->col_coupleBlock->pattern->index[j2];
750
751 /* check whether entry RA[i_r, i2] has been previously visited.
752 RAP new entry is possible only if entry RA[i_r, i2] has not
753 been visited yet */
754 if (A_marker[i2] != i_r) {
755 /* first, mark entry RA[i_r,i2] as visited */
756 A_marker[i2] = i_r;
757
758 /* then loop over elements in row i2 of P_ext_main */
759 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
760 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
761 i_c = P->row_coupleBlock->pattern->index[j3];
762
763 /* check whether entry RAP[i_r,i_c] has been created.
764 If not yet created, create the entry and increase
765 the total number of elements in RAP_ext */
766 if (P_marker[i_c] < row_marker) {
767 P_marker[i_c] = sum;
768 sum++;
769 }
770 }
771
772 /* loop over elements in row i2 of P_ext_couple, do the same */
773 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
774 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
775 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
776 if (P_marker[i_c] < row_marker) {
777 P_marker[i_c] = sum;
778 sum++;
779 }
780 }
781 }
782 }
783
784 /* now loop over elements in row i1 of A->mainBlock, repeat
785 the process we have done to block A->col_coupleBlock */
786 j2_ub = A->mainBlock->pattern->ptr[i1+1];
787 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
788 i2 = A->mainBlock->pattern->index[j2];
789 if (A_marker[i2 + num_Acouple_cols] != i_r) {
790 A_marker[i2 + num_Acouple_cols] = i_r;
791 j3_ub = P->mainBlock->pattern->ptr[i2+1];
792 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
793 i_c = P->mainBlock->pattern->index[j3];
794 if (P_marker[i_c] < row_marker) {
795 P_marker[i_c] = sum;
796 sum++;
797 }
798 }
799 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
800 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
801 /* note that we need to map the column index in
802 P->col_coupleBlock back into the column index in
803 P_ext_couple */
804 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
805 if (P_marker[i_c] < row_marker) {
806 P_marker[i_c] = sum;
807 sum++;
808 }
809 }
810 }
811 }
812 }
813 }
814
815 /* Now we have the number of non-zero elements in RAP_ext. Allocate
816 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
817 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
818 RAP_ext_idx = MEMALLOC(sum, index_t);
819 RAP_ext_val = MEMALLOC(sum * block_size, double);
820 RA_val = TMPMEMALLOC(block_size, double);
821 RAP_val = TMPMEMALLOC(block_size, double);
822
823 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
824 sum = num_Pext_cols + num_Pmain_cols;
825 #pragma omp parallel for private(i) schedule(static)
826 for (i=0; i<sum; i++) P_marker[i] = -1;
827 #pragma omp parallel for private(i) schedule(static)
828 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
829 sum = 0;
830 RAP_ext_ptr[0] = 0;
831 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
832 row_marker = sum;
833 /* then, loop over elements in row i_r of R_couple */
834 j1_ub = R_couple->pattern->ptr[i_r+1];
835 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
836 i1 = R_couple->pattern->index[j1];
837 R_val = &(R_couple->val[j1*P_block_size]);
838
839 /* then, loop over elements in row i1 of A->col_coupleBlock */
840 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
841 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
842 i2 = A->col_coupleBlock->pattern->index[j2];
843 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
844 for (irb=0; irb<row_block_size; irb++)
845 for (icb=0; icb<col_block_size; icb++)
846 RA_val[irb+row_block_size*icb] = ZERO;
847 for (irb=0; irb<P_block_size; irb++) {
848 rtmp=R_val[irb];
849 for (icb=0; icb<col_block_size; icb++) {
850 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
851 }
852 }
853
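/* Block layout and product used throughout the numeric phase: blocks are
   stored column-major, entry (irb,icb) at irb + row_block_size*icb. Since
   R carries only P_block_size values per block (a diagonal block of P^T,
   cf. the MATRIX_FORMAT_DIAGONAL_BLOCK dispatch commented out above), the
   block triple product reduces to row scaling by R and column scaling by
   P. As a sketch, for square blocks of size bs (hypothetical names): */
#if 0
    for (irb = 0; irb < bs; irb++)
        for (icb = 0; icb < bs; icb++)
            RAPblk[irb + bs*icb] = Rv[irb] * Ablk[irb + bs*icb] * Pv[icb];
#endif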
854 /* check whether entry RA[i_r, i2] has been previously visited.
855 RAP new entry is possible only if entry RA[i_r, i2] has not
856 been visited yet */
857 if (A_marker[i2] != i_r) {
858 /* first, mark entry RA[i_r,i2] as visited */
859 A_marker[i2] = i_r;
860
861 /* then loop over elements in row i2 of P_ext_main */
862 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
863 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
864 i_c = P->row_coupleBlock->pattern->index[j3];
865 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
866 for (irb=0; irb<row_block_size; irb++)
867 for (icb=0; icb<col_block_size; icb++)
868 RAP_val[irb+row_block_size*icb] = ZERO;
869 for (icb=0; icb<P_block_size; icb++) {
870 rtmp = temp_val[icb];
871 for (irb=0; irb<row_block_size; irb++) {
872 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
873 }
874 }
875
876 /* check whether entry RAP[i_r,i_c] has been created.
877 If not yet created, create the entry and increase
878 the total number of elements in RAP_ext */
879 if (P_marker[i_c] < row_marker) {
880 P_marker[i_c] = sum;
881 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
882 RAP_ext_idx[sum] = i_c + offset;
883 sum++;
884 } else {
885 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
886 for (ib=0; ib<block_size; ib++) {
887 temp_val[ib] += RAP_val[ib];
888 }
889 }
890 }
891
892 /* loop over elements in row i2 of P_ext_couple, do the same */
893 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
894 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
895 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
896 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
897 for (irb=0; irb<row_block_size; irb++)
898 for (icb=0; icb<col_block_size; icb++)
899 RAP_val[irb+row_block_size*icb]=ZERO;
900 for (icb=0; icb<P_block_size; icb++) {
901 rtmp = temp_val[icb];
902 for (irb=0; irb<row_block_size; irb++) {
903 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
904 }
905 }
906
907 if (P_marker[i_c] < row_marker) {
908 P_marker[i_c] = sum;
909 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
910 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
911 sum++;
912 } else {
913 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
914 for (ib=0; ib<block_size; ib++) {
915 temp_val[ib] += RAP_val[ib];
916 }
917 }
918 }
919
920 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
921 Only the contributions are added. */
922 } else {
923 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
924 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
925 i_c = P->row_coupleBlock->pattern->index[j3];
926 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
927 for (irb=0; irb<row_block_size; irb++)
928 for (icb=0; icb<col_block_size; icb++)
929 RAP_val[irb+row_block_size*icb]=ZERO;
930 for (icb=0; icb<P_block_size; icb++) {
931 rtmp = temp_val[icb];
932 for (irb=0; irb<row_block_size; irb++) {
933 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
934 }
935 }
936
937 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
938 for (ib=0; ib<block_size; ib++) {
939 temp_val[ib] += RAP_val[ib];
940 }
941 }
942 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
943 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
944 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
945 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
946 for (irb=0; irb<row_block_size; irb++)
947 for (icb=0; icb<col_block_size; icb++)
948 RAP_val[irb+row_block_size*icb]=ZERO;
949 for (icb=0; icb<P_block_size; icb++) {
950 rtmp = temp_val[icb];
951 for (irb=0; irb<row_block_size; irb++) {
952 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
953 }
954 }
955
956 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
957 for (ib=0; ib<block_size; ib++) {
958 temp_val[ib] += RAP_val[ib];
959 }
960 }
961 }
962 }
963
964 /* now loop over elements in row i1 of A->mainBlock, repeat
965 the process we have done to block A->col_coupleBlock */
966 j2_ub = A->mainBlock->pattern->ptr[i1+1];
967 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
968 i2 = A->mainBlock->pattern->index[j2];
969 temp_val = &(A->mainBlock->val[j2*block_size]);
970 for (irb=0; irb<row_block_size; irb++)
971 for (icb=0; icb<col_block_size; icb++)
972 RA_val[irb+row_block_size*icb]=ZERO;
973 for (irb=0; irb<P_block_size; irb++) {
974 rtmp=R_val[irb];
975 for (icb=0; icb<col_block_size; icb++) {
976 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
977 }
978 }
979
980 if (A_marker[i2 + num_Acouple_cols] != i_r) {
981 A_marker[i2 + num_Acouple_cols] = i_r;
982 j3_ub = P->mainBlock->pattern->ptr[i2+1];
983 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
984 i_c = P->mainBlock->pattern->index[j3];
985 temp_val = &(P->mainBlock->val[j3*P_block_size]);
986 for (irb=0; irb<row_block_size; irb++)
987 for (icb=0; icb<col_block_size; icb++)
988 RAP_val[irb+row_block_size*icb]=ZERO;
989 for (icb=0; icb<P_block_size; icb++) {
990 rtmp = temp_val[icb];
991 for (irb=0; irb<row_block_size; irb++) {
992 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
993 }
994 }
995
996 if (P_marker[i_c] < row_marker) {
997 P_marker[i_c] = sum;
998 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
999 RAP_ext_idx[sum] = i_c + offset;
1000 sum++;
1001 } else {
1002 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1003 for (ib=0; ib<block_size; ib++) {
1004 temp_val[ib] += RAP_val[ib];
1005 }
1006 }
1007 }
1008 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1009 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1010 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1011 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1012 for (irb=0; irb<row_block_size; irb++)
1013 for (icb=0; icb<col_block_size; icb++)
1014 RAP_val[irb+row_block_size*icb]=ZERO;
1015 for (icb=0; icb<P_block_size; icb++) {
1016 rtmp=temp_val[icb];
1017 for (irb=0; irb<row_block_size; irb++) {
1018 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1019 }
1020 }
1021
1022 if (P_marker[i_c] < row_marker) {
1023 P_marker[i_c] = sum;
1024 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
1025 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
1026 sum++;
1027 } else {
1028 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1029 for (ib=0; ib<block_size; ib++) {
1030 temp_val[ib] += RAP_val[ib];
1031 }
1032 }
1033 }
1034 } else {
1035 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1036 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1037 i_c = P->mainBlock->pattern->index[j3];
1038 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1039 for (irb=0; irb<row_block_size; irb++)
1040 for (icb=0; icb<col_block_size; icb++)
1041 RAP_val[irb+row_block_size*icb]=ZERO;
1042 for (icb=0; icb<P_block_size; icb++) {
1043 rtmp=temp_val[icb];
1044 for (irb=0; irb<row_block_size; irb++) {
1045 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1046 }
1047 }
1048
1049 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1050 for (ib=0; ib<block_size; ib++) {
1051 temp_val[ib] += RAP_val[ib];
1052 }
1053 }
1054 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1055 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1056 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1057 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1058 for (irb=0; irb<row_block_size; irb++)
1059 for (icb=0; icb<col_block_size; icb++)
1060 RAP_val[irb+row_block_size*icb]=ZERO;
1061 for (icb=0; icb<P_block_size; icb++) {
1062 rtmp=temp_val[icb];
1063 for (irb=0; irb<row_block_size; irb++) {
1064 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1065 }
1066 }
1067
1068 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
1069 for (ib=0; ib<block_size; ib++) {
1070 temp_val[ib] += RAP_val[ib];
1071 }
1072 }
1073 }
1074 }
1075 }
1076 RAP_ext_ptr[i_r+1] = sum;
1077 }
1078 TMPMEMFREE(P_marker);
1079 TMPMEMFREE(Pcouple_to_Pext);
1080
1081 /* Now we have the part of RAP[r,c] where the rows "r" are given
1082 by the column list of P->col_coupleBlock, and the columns "c"
1083 possibly cover the whole column range of the system matrix P.
1084 This part of the data is to be passed to the neighboring
1085 processors, and added to the corresponding RAP[r,c] entries
1086 on those processors */
1087 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
1088 &RAP_ext_val, global_id_P, block_size);
1089
1090 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
1091 sum = RAP_ext_ptr[num_RAPext_rows];
1092 num_RAPext_cols = 0;
1093 if (num_Pext_cols || sum > 0) {
1094 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
1095 j1_ub = offset + num_Pmain_cols;
1096 for (i=0, iptr=0; i<sum; i++) {
1097 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
1098 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
1099 }
1100 for (j=0; j<num_Pext_cols; j++, iptr++){
1101 temp[iptr] = global_id_P[j];
1102 }
1103
1104 if (iptr) {
1105 #ifdef USE_QSORTG
1106 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1107 #else
1108 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
1109 #endif
1110 num_RAPext_cols = 1;
1111 i = temp[0];
1112 for (j=1; j<iptr; j++) {
1113 if (temp[j] > i) {
1114 i = temp[j];
1115 temp[num_RAPext_cols++] = i;
1116 }
1117 }
1118 }
1119 }
1120
1121 /* store the global ids of the columns of RAP_ext */
1122 if(num_RAPext_cols){
1123 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
1124 #pragma omp parallel for private(i) schedule(static)
1125 for (i=0; i<num_RAPext_cols; i++)
1126 global_id_RAP[i] = temp[i];
1127 }
1128 if (num_Pext_cols || sum > 0)
1129 TMPMEMFREE(temp);
1130 j1_ub = offset + num_Pmain_cols;
1131 #pragma omp parallel for private(i, where_p) schedule(static)
1132 for (i=0; i<sum; i++) {
1133 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
1134 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
1135 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
1136 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
1137 } else
1138 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
1139 }
1140
1141 /* build the mapping */
1142 if (num_Pcouple_cols) {
1143 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
1144 iptr = 0;
1145 for (i=0; i<num_RAPext_cols; i++)
1146 if (global_id_RAP[i] == P->global_id[iptr]) {
1147 Pcouple_to_RAP[iptr++] = i;
1148 if (iptr == num_Pcouple_cols) break;
1149 }
1150 }
1151
1152 if (num_Pext_cols) {
1153 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
1154 iptr = 0;
1155 for (i=0; i<num_RAPext_cols; i++)
1156 if (global_id_RAP[i] == global_id_P[iptr]) {
1157 Pext_to_RAP[iptr++] = i;
1158 if (iptr == num_Pext_cols) break;
1159 }
1160 }
1161
1162 if (global_id_P){
1163 TMPMEMFREE(global_id_P);
1164 global_id_P = NULL;
1165 }
1166
1167 /* alloc and initialize the markers */
1168 sum = num_RAPext_cols + num_Pmain_cols;
1169 P_marker = TMPMEMALLOC(sum, index_t);
1170 #pragma omp parallel for private(i) schedule(static)
1171 for (i=0; i<sum; i++) P_marker[i] = -1;
1172 #pragma omp parallel for private(i) schedule(static)
1173 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1174
1175 /* Now, count the size of RAP. Start with rows in R_main */
1176 num_neighbors = P->col_coupler->connector->send->numNeighbors;
1177 offsetInShared = P->col_coupler->connector->send->offsetInShared;
1178 shared = P->col_coupler->connector->send->shared;
1179 i = 0;
1180 j = 0;
1181 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1182 /* Mark the start of row for both main block and couple block */
1183 row_marker = i;
1184 row_marker_ext = j;
1185
1186 /* Mark the diagonal element RAP[i_r, i_r], and other elements
1187 in RAP_ext */
1188 P_marker[i_r] = i;
1189 i++;
1190 for (j1=0; j1<num_neighbors; j1++) {
1191 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1192 if (shared[j2] == i_r) {
1193 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1194 i_c = RAP_ext_idx[k];
1195 if (i_c < num_Pmain_cols) {
1196 if (P_marker[i_c] < row_marker) {
1197 P_marker[i_c] = i;
1198 i++;
1199 }
1200 } else {
1201 if (P_marker[i_c] < row_marker_ext) {
1202 P_marker[i_c] = j;
1203 j++;
1204 }
1205 }
1206 }
1207 break;
1208 }
1209 }
1210 }
1211
1212 /* then, loop over elements in row i_r of R_main */
1213 j1_ub = R_main->pattern->ptr[i_r+1];
1214 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1215 i1 = R_main->pattern->index[j1];
1216
1217 /* then, loop over elements in row i1 of A->col_coupleBlock */
1218 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1219 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1220 i2 = A->col_coupleBlock->pattern->index[j2];
1221
1222 /* check whether entry RA[i_r, i2] has been previously visited.
1223 RAP new entry is possible only if entry RA[i_r, i2] has not
1224 been visited yet */
1225 if (A_marker[i2] != i_r) {
1226 /* first, mark entry RA[i_r,i2] as visited */
1227 A_marker[i2] = i_r;
1228
1229 /* then loop over elements in row i2 of P_ext_main */
1230 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1231 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1232 i_c = P->row_coupleBlock->pattern->index[j3];
1233
1234 /* check whether entry RAP[i_r,i_c] has been created.
1235 If not yet created, create the entry and increase
1236 the total number of elements in RAP_ext */
1237 if (P_marker[i_c] < row_marker) {
1238 P_marker[i_c] = i;
1239 i++;
1240 }
1241 }
1242
1243 /* loop over elements in row i2 of P_ext_couple, do the same */
1244 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1245 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1246 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1247 if (P_marker[i_c] < row_marker_ext) {
1248 P_marker[i_c] = j;
1249 j++;
1250 }
1251 }
1252 }
1253 }
1254
1255 /* now loop over elements in row i1 of A->mainBlock, repeat
1256 the process we have done to block A->col_coupleBlock */
1257 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1258 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1259 i2 = A->mainBlock->pattern->index[j2];
1260 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1261 A_marker[i2 + num_Acouple_cols] = i_r;
1262 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1263 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1264 i_c = P->mainBlock->pattern->index[j3];
1265 if (P_marker[i_c] < row_marker) {
1266 P_marker[i_c] = i;
1267 i++;
1268 }
1269 }
1270 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1271 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1272 /* note that we need to map the column index in
1273 P->col_coupleBlock back into the column index in
1274 P_ext_couple */
1275 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1276 if (P_marker[i_c] < row_marker_ext) {
1277 P_marker[i_c] = j;
1278 j++;
1279 }
1280 }
1281 }
1282 }
1283 }
1284 }
1285
1286 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
1287 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
1288 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
1289 RAP_couple */
1290 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1291 RAP_main_idx = MEMALLOC(i, index_t);
1292 RAP_main_val = TMPMEMALLOC(i * block_size, double);
1293 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
1294 RAP_couple_idx = MEMALLOC(j, index_t);
1295 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
1296
1297 RAP_main_ptr[num_Pmain_cols] = i;
1298 RAP_couple_ptr[num_Pmain_cols] = j;
1299
1300 #pragma omp parallel for private(i) schedule(static)
1301 for (i=0; i<sum; i++) P_marker[i] = -1;
1302 #pragma omp parallel for private(i) schedule(static)
1303 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
1304
1305 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
1306 in R_main */
1307 i = 0;
1308 j = 0;
1309 for (i_r=0; i_r<num_Pmain_cols; i_r++){
1310 /* Mark the start of row for both main block and couple block */
1311 row_marker = i;
1312 row_marker_ext = j;
1313 RAP_main_ptr[i_r] = row_marker;
1314 RAP_couple_ptr[i_r] = row_marker_ext;
1315
1316 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
1317 in row i_r of RAP_ext */
1318 P_marker[i_r] = i;
1319 for (ib=0; ib<block_size; ib++)
1320 RAP_main_val[i*block_size+ib] = ZERO;
1321 RAP_main_idx[i] = i_r;
1322 i++;
1323
1324 for (j1=0; j1<num_neighbors; j1++) {
1325 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
1326 if (shared[j2] == i_r) {
1327 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
1328 i_c = RAP_ext_idx[k];
1329 if (i_c < num_Pmain_cols) {
1330 if (P_marker[i_c] < row_marker) {
1331 P_marker[i_c] = i;
1332 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1333 RAP_main_idx[i] = i_c;
1334 i++;
1335 } else {
1336 t1_val = &(RAP_ext_val[k*block_size]);
1337 t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
1338 for (ib=0; ib<block_size; ib++)
1339 t2_val[ib] += t1_val[ib];
1340 }
1341 } else {
1342 if (P_marker[i_c] < row_marker_ext) {
1343 P_marker[i_c] = j;
1344 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
1345 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1346 j++;
1347 } else {
1348 t1_val = &(RAP_ext_val[k*block_size]);
1349 t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
1350 for (ib=0; ib<block_size; ib++)
1351 t2_val[ib] += t1_val[ib];
1352 }
1353 }
1354 }
1355 break;
1356 }
1357 }
1358 }
1359
1360 /* then, loop over elements in row i_r of R_main */
1361 j1_ub = R_main->pattern->ptr[i_r+1];
1362 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
1363 i1 = R_main->pattern->index[j1];
1364 R_val = &(R_main->val[j1*P_block_size]);
1365
1366 /* then, loop over elements in row i1 of A->col_coupleBlock */
1367 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
1368 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1369 i2 = A->col_coupleBlock->pattern->index[j2];
1370 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
1371 for (irb=0; irb<row_block_size; irb++)
1372 for (icb=0; icb<col_block_size; icb++)
1373 RA_val[irb+row_block_size*icb]=ZERO;
1374 for (irb=0; irb<P_block_size; irb++) {
1375 rtmp=R_val[irb];
1376 for (icb=0; icb<col_block_size; icb++) {
1377 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb]; /* indexed with row_block_size, matching every other RA_val update */
1378 }
1379 }
1380
1381
1382 /* check whether entry RA[i_r, i2] has been previously visited.
1383 RAP new entry is possible only if entry RA[i_r, i2] has not
1384 been visited yet */
1385 if (A_marker[i2] != i_r) {
1386 /* first, mark entry RA[i_r,i2] as visited */
1387 A_marker[i2] = i_r;
1388
1389 /* then loop over elements in row i2 of P_ext_main */
1390 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1391 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1392 i_c = P->row_coupleBlock->pattern->index[j3];
1393 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1394 for (irb=0; irb<row_block_size; irb++)
1395 for (icb=0; icb<col_block_size; icb++)
1396 RAP_val[irb+row_block_size*icb]=ZERO;
1397 for (icb=0; icb<P_block_size; icb++) {
1398 rtmp=temp_val[icb];
1399 for (irb=0; irb<row_block_size; irb++) {
1400 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1401 }
1402 }
1403
1404
1405 /* check whether entry RAP[i_r,i_c] has been created.
1406 If not yet created, create the entry and increase
1407 the total number of elements in RAP_ext */
1408 if (P_marker[i_c] < row_marker) {
1409 P_marker[i_c] = i;
1410 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1411 RAP_main_idx[i] = i_c;
1412 i++;
1413 } else {
1414 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1415 for (ib=0; ib<block_size; ib++) {
1416 temp_val[ib] += RAP_val[ib];
1417 }
1418 }
1419 }
1420
1421 /* loop over elements in row i2 of P_ext_couple, do the same */
1422 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1423 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1424 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1425 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1426 for (irb=0; irb<row_block_size; irb++)
1427 for (icb=0; icb<col_block_size; icb++)
1428 RAP_val[irb+row_block_size*icb]=ZERO;
1429 for (icb=0; icb<P_block_size; icb++) {
1430 rtmp=temp_val[icb];
1431 for (irb=0; irb<row_block_size; irb++) {
1432 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1433 }
1434 }
1435
1436 if (P_marker[i_c] < row_marker_ext) {
1437 P_marker[i_c] = j;
1438 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1439 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1440 j++;
1441 } else {
1442 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1443 for (ib=0; ib<block_size; ib++) {
1444 temp_val[ib] += RAP_val[ib];
1445 }
1446 }
1447 }
1448
1449 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
1450 Only the contributions are added. */
1451 } else {
1452 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
1453 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1454 i_c = P->row_coupleBlock->pattern->index[j3];
1455 temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
1456 for (irb=0; irb<row_block_size; irb++)
1457 for (icb=0; icb<col_block_size; icb++)
1458 RAP_val[irb+row_block_size*icb]=ZERO;
1459 for (icb=0; icb<P_block_size; icb++) {
1460 rtmp=temp_val[icb];
1461 for (irb=0; irb<row_block_size; irb++) {
1462 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1463 }
1464 }
1465
1466 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1467 for (ib=0; ib<block_size; ib++) {
1468 temp_val[ib] += RAP_val[ib];
1469 }
1470 }
1471 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
1472 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1473 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1474 temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
1475 for (irb=0; irb<row_block_size; irb++)
1476 for (icb=0; icb<col_block_size; icb++)
1477 RAP_val[irb+row_block_size*icb]=ZERO;
1478 for (icb=0; icb<P_block_size; icb++) {
1479 rtmp=temp_val[icb];
1480 for (irb=0; irb<row_block_size; irb++) {
1481 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1482 }
1483 }
1484
1485 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1486 for (ib=0; ib<block_size; ib++) {
1487 temp_val[ib] += RAP_val[ib];
1488 }
1489 }
1490 }
1491 }
1492
1493 /* now loop over elements in row i1 of A->mainBlock, repeat
1494 the process we have done to block A->col_coupleBlock */
1495 j2_ub = A->mainBlock->pattern->ptr[i1+1];
1496 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
1497 i2 = A->mainBlock->pattern->index[j2];
1498 temp_val = &(A->mainBlock->val[j2*block_size]);
1499 for (irb=0; irb<row_block_size; irb++)
1500 for (icb=0; icb<col_block_size; icb++)
1501 RA_val[irb+row_block_size*icb]=ZERO;
1502 for (irb=0; irb<P_block_size; irb++) {
1503 rtmp=R_val[irb];
1504 for (icb=0; icb<col_block_size; icb++) {
1505 RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
1506 }
1507 }
1508
1509 if (A_marker[i2 + num_Acouple_cols] != i_r) {
1510 A_marker[i2 + num_Acouple_cols] = i_r;
1511 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1512 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1513 i_c = P->mainBlock->pattern->index[j3];
1514 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1515 for (irb=0; irb<row_block_size; irb++)
1516 for (icb=0; icb<col_block_size; icb++)
1517 RAP_val[irb+row_block_size*icb]=ZERO;
1518 for (icb=0; icb<P_block_size; icb++) {
1519 rtmp=temp_val[icb];
1520 for (irb=0; irb<row_block_size; irb++) {
1521 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1522 }
1523 }
1524 if (P_marker[i_c] < row_marker) {
1525 P_marker[i_c] = i;
1526 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
1527 RAP_main_idx[i] = i_c;
1528 i++;
1529 } else {
1530 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1531 for (ib=0; ib<block_size; ib++) {
1532 temp_val[ib] += RAP_val[ib];
1533 }
1534 }
1535 }
1536 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1537 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1538 /* note that we need to map the column index in
1539 P->col_coupleBlock back into the column index in
1540 P_ext_couple */
1541 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1542 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1543 for (irb=0; irb<row_block_size; irb++)
1544 for (icb=0; icb<col_block_size; icb++)
1545 RAP_val[irb+row_block_size*icb]=ZERO;
1546 for (icb=0; icb<P_block_size; icb++) {
1547 rtmp=temp_val[icb];
1548 for (irb=0; irb<row_block_size; irb++) {
1549 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1550 }
1551 }
1552
1553 if (P_marker[i_c] < row_marker_ext) {
1554 P_marker[i_c] = j;
1555 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
1556 RAP_couple_idx[j] = i_c - num_Pmain_cols;
1557 j++;
1558 } else {
1559 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1560 for (ib=0; ib<block_size; ib++) {
1561 temp_val[ib] += RAP_val[ib];
1562 }
1563 }
1564 }
1565
1566 } else {
1567 j3_ub = P->mainBlock->pattern->ptr[i2+1];
1568 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1569 i_c = P->mainBlock->pattern->index[j3];
1570 temp_val = &(P->mainBlock->val[j3*P_block_size]);
1571 for (irb=0; irb<row_block_size; irb++)
1572 for (icb=0; icb<col_block_size; icb++)
1573 RAP_val[irb+row_block_size*icb]=ZERO;
1574 for (icb=0; icb<P_block_size; icb++) {
1575 rtmp=temp_val[icb];
1576 for (irb=0; irb<row_block_size; irb++) {
1577 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1578 }
1579 }
1580
1581 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
1582 for (ib=0; ib<block_size; ib++) {
1583 temp_val[ib] += RAP_val[ib];
1584 }
1585 }
1586 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
1587 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
1588 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
1589 temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
1590 for (irb=0; irb<row_block_size; irb++)
1591 for (icb=0; icb<col_block_size; icb++)
1592 RAP_val[irb+row_block_size*icb]=ZERO;
1593 for (icb=0; icb<P_block_size; icb++) {
1594 rtmp = temp_val[icb];
1595 for (irb=0; irb<row_block_size; irb++) {
1596 RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
1597 }
1598 }
1599
1600 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
1601 for (ib=0; ib<block_size; ib++) {
1602 temp_val[ib] += RAP_val[ib];
1603 }
1604 }
1605 }
1606 }
1607 }
1608
1609 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
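         /* Illustrative note: entries of each row were appended in visit order,
            so a row holding, say, idx = {7, 2, 5} is sorted to {2, 5, 7}; the
            P_marker array, which still maps each column index to its slot in
            this row, is then used to move the matching value blocks into the
            same sorted order. */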
1610 if (i > row_marker) {
1611 offset = i - row_marker;
1612 temp = TMPMEMALLOC(offset, index_t);
1613 #pragma omp parallel for schedule(static) private(iptr)
1614 for (iptr=0; iptr<offset; iptr++)
1615 temp[iptr] = RAP_main_idx[row_marker+iptr];
1616 if (offset > 0) {
1617 #ifdef USE_QSORTG
1618 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1619 #else
1620 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1621 #endif
1622 }
1623 temp_val = TMPMEMALLOC(offset * block_size, double);
1624 #pragma omp parallel for schedule(static) private(iptr,k)
1625 for (iptr=0; iptr<offset; iptr++){
1626 k = P_marker[temp[iptr]];
1627 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1628 P_marker[temp[iptr]] = iptr + row_marker;
1629 }
1630 #pragma omp parallel for schedule(static) private(iptr)
1631 for (iptr=0; iptr<offset; iptr++){
1632 RAP_main_idx[row_marker+iptr] = temp[iptr];
1633 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1634 }
1635 TMPMEMFREE(temp);
1636 TMPMEMFREE(temp_val);
1637 }
1638 if (j > row_marker_ext) {
1639 offset = j - row_marker_ext;
1640 temp = TMPMEMALLOC(offset, index_t);
1641 #pragma omp parallel for schedule(static) private(iptr)
1642 for (iptr=0; iptr<offset; iptr++)
1643 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1644 if (offset > 0) {
1645 #ifdef USE_QSORTG
1646 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1647 #else
1648 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1649 #endif
1650 }
1651 temp_val = TMPMEMALLOC(offset * block_size, double);
1652 #pragma omp parallel for schedule(static) private(iptr, k)
1653 for (iptr=0; iptr<offset; iptr++){
1654 k = P_marker[temp[iptr] + num_Pmain_cols];
1655 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1656 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1657 }
1658 #pragma omp parallel for schedule(static) private(iptr)
1659 for (iptr=0; iptr<offset; iptr++){
1660 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1661 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1662 }
1663 TMPMEMFREE(temp);
1664 TMPMEMFREE(temp_val);
1665 }
1666 }
1667
1668 TMPMEMFREE(RA_val);
1669 TMPMEMFREE(RAP_val);
1670 TMPMEMFREE(A_marker);
1671 TMPMEMFREE(Pext_to_RAP);
1672 TMPMEMFREE(Pcouple_to_RAP);
1673 MEMFREE(RAP_ext_ptr);
1674 MEMFREE(RAP_ext_idx);
1675 MEMFREE(RAP_ext_val);
1676 Paso_SparseMatrix_free(R_main);
1677 Paso_SparseMatrix_free(R_couple);
1678
1679 /* Check whether there are empty columns in RAP_couple */
1680 #pragma omp parallel for schedule(static) private(i)
1681 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1682 /* num of non-empty columns is stored in "k" */
1683 k = 0;
1684 j = RAP_couple_ptr[num_Pmain_cols];
1685 for (i=0; i<j; i++) {
1686 i1 = RAP_couple_idx[i];
1687 if (P_marker[i1]) {
1688 P_marker[i1] = 0;
1689 k++;
1690 }
1691 }
1692
1693 /* empty columns were found */
1694 if (k < num_RAPext_cols) {
1695 temp = TMPMEMALLOC(k, index_t);
1696 k = 0;
1697 for (i=0; i<num_RAPext_cols; i++)
1698 if (!P_marker[i]) {
1699 P_marker[i] = k;
1700 temp[k] = global_id_RAP[i];
1701 k++;
1702 }
1703 #pragma omp parallel for schedule(static) private(i, i1)
1704 for (i=0; i<j; i++) {
1705 i1 = RAP_couple_idx[i];
1706 RAP_couple_idx[i] = P_marker[i1];
1707 }
1708 num_RAPext_cols = k;
1709 TMPMEMFREE(global_id_RAP);
1710 global_id_RAP = temp;
1711 }
1712 TMPMEMFREE(P_marker);
1713
1714 /******************************************************/
1715 /* Start to create the coarse level System Matrix A_c */
1716 /******************************************************/
1717 /* first, prepare the sender/receiver for the col_connector */
1718 dist = P->pattern->input_distribution->first_component;
1719 recv_len = TMPMEMALLOC(size, dim_t);
1720 send_len = TMPMEMALLOC(size, dim_t);
1721 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
1722 offsetInShared = TMPMEMALLOC(size+1, index_t);
1723 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
1724 memset(recv_len, 0, sizeof(dim_t) * size);
1725 memset(send_len, 0, sizeof(dim_t) * size);
1726 num_neighbors = 0;
1727 offsetInShared[0] = 0;
1728 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1729 shared[i] = i + num_Pmain_cols;
1730 if (k <= global_id_RAP[i]) {
1731 if (recv_len[j] > 0) {
1732 neighbor[num_neighbors] = j;
1733 num_neighbors ++;
1734 offsetInShared[num_neighbors] = i;
1735 }
1736 while (k <= global_id_RAP[i]) {
1737 j++;
1738 k = dist[j+1];
1739 }
1740 }
1741 recv_len[j] ++;
1742 }
1743 if (recv_len[j] > 0) {
1744 neighbor[num_neighbors] = j;
1745 num_neighbors ++;
1746 offsetInShared[num_neighbors] = i;
1747 }
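      /* For example (illustrative only): with dist = {0, 4, 8, 12} and sorted
         global_id_RAP = {1, 5, 6, 9}, the scan above yields recv_len = {1, 2, 1},
         neighbor = {0, 1, 2} and offsetInShared = {0, 1, 3, 4}: each external
         column is attributed to the rank whose range in "dist" contains its
         global id. */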
1748 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1749 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1750
1751 #ifdef ESYS_MPI
1752 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1753 #endif
1754
1755 #ifdef ESYS_MPI
1756 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
1757 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
1758 #else
1759 mpi_requests=TMPMEMALLOC(size*2,int);
1760 mpi_stati=TMPMEMALLOC(size*2,int);
1761 #endif
1762 num_neighbors = 0;
1763 j = 0;
1764 offsetInShared[0] = 0;
1765 for (i=0; i<size; i++) {
1766 if (send_len[i] > 0) {
1767 neighbor[num_neighbors] = i;
1768 num_neighbors ++;
1769 j += send_len[i];
1770 offsetInShared[num_neighbors] = j;
1771 }
1772 }
1773 TMPMEMFREE(shared);
1774 shared = TMPMEMALLOC(j, index_t);
1775 for (i=0, j=0; i<num_neighbors; i++) {
1776 k = neighbor[i];
1777 #ifdef ESYS_MPI
1778 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1779 mpi_info->msg_tag_counter+k,
1780 mpi_info->comm, &mpi_requests[i]);
1781 #endif
1782 j += send_len[k];
1783 }
1784 for (i=0, j=0; i<recv->numNeighbors; i++) {
1785 k = recv->neighbor[i];
1786 #ifdef ESYS_MPI
1787 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1788 mpi_info->msg_tag_counter+rank,
1789 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1790 #endif
1791 j += recv_len[k];
1792 }
1793 #ifdef ESYS_MPI
1794 MPI_Waitall(num_neighbors + recv->numNeighbors,
1795 mpi_requests, mpi_stati);
1796 #endif
1797 mpi_info->msg_tag_counter += size;
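      /* the tag counter is advanced by "size" because each message above used a
         tag offset by its sender's rank; this presumably keeps the next round of
         messages from clashing with any still in flight. */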
1798
1799 j = offsetInShared[num_neighbors];
1800 offset = dist[rank];
1801 #pragma omp parallel for schedule(static) private(i)
1802 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1803 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1804 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1805
1806 col_connector = Paso_Connector_alloc(send, recv);
1807 TMPMEMFREE(shared);
1808 Paso_SharedComponents_free(recv);
1809 Paso_SharedComponents_free(send);
1810
1811 /* now, create row distribution (output_distribution) and col
1812 distribution (input_distribution) */
1813 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1814 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1815
1816 /* then, prepare the sender/receiver for the row_connector; first, prepare
1817 the information for the sender */
1818 sum = RAP_couple_ptr[num_Pmain_cols];
1819 len = TMPMEMALLOC(size, dim_t);
1820 send_ptr = TMPMEMALLOC(size, index_t*);
1821 send_idx = TMPMEMALLOC(size, index_t*);
1822 #pragma omp parallel for schedule(static) private(i)
1823 for (i=0; i<size; i++) {
1824 send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
1825 send_idx[i] = TMPMEMALLOC(sum, index_t);
1826 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1827 }
1828 memset(len, 0, sizeof(dim_t) * size);
1829 recv = col_connector->recv;
1830 sum=0;
1831 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1832 i1 = RAP_couple_ptr[i_r];
1833 i2 = RAP_couple_ptr[i_r+1];
1834 if (i2 > i1) {
1835 /* then row i_r will be in the sender of row_connector, now check
1836 how many neighbors row i_r needs to be sent to */
1837 for (j=i1; j<i2; j++) {
1838 i_c = RAP_couple_idx[j];
1839 /* find out the corresponding neighbor "p" of column i_c */
1840 for (p=0; p<recv->numNeighbors; p++) {
1841 if (i_c < recv->offsetInShared[p+1]) {
1842 k = recv->neighbor[p];
1843 if (send_ptr[k][i_r] == 0) sum++;
1844 send_ptr[k][i_r] ++;
1845 send_idx[k][len[k]] = global_id_RAP[i_c];
1846 len[k] ++;
1847 break;
1848 }
1849 }
1850 }
1851 }
1852 }
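   /* Illustrative summary: after this loop, send_ptr[k][i_r] holds the number
      of entries of local row i_r destined for rank k, send_idx[k] lists their
      global column ids in visit order, len[k] is the total per rank, and "sum"
      counts the distinct (row, rank) pairs, i.e. the total length of the
      sender's "shared" list. */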
1853 if (global_id_RAP) {
1854 TMPMEMFREE(global_id_RAP);
1855 global_id_RAP = NULL;
1856 }
1857
1858 /* now allocate the sender */
1859 shared = TMPMEMALLOC(sum, index_t);
1860 memset(send_len, 0, sizeof(dim_t) * size);
1861 num_neighbors=0;
1862 offsetInShared[0] = 0;
1863 for (p=0, k=0; p<size; p++) {
1864 for (i=0; i<num_Pmain_cols; i++) {
1865 if (send_ptr[p][i] > 0) {
1866 shared[k] = i;
1867 k++;
1868 send_ptr[p][send_len[p]] = send_ptr[p][i];
1869 send_len[p]++;
1870 }
1871 }
1872 if (k > offsetInShared[num_neighbors]) {
1873 neighbor[num_neighbors] = p;
1874 num_neighbors ++;
1875 offsetInShared[num_neighbors] = k;
1876 }
1877 }
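   /* Note: send_ptr[p] is compacted in place here -- rows that send nothing to
      rank p are squeezed out, so afterwards send_ptr[p][0..send_len[p]-1] holds
      the entry counts of only those rows actually sent, in the same order as
      they appear in "shared". */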
1878 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1879 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1880
1881 /* send/recv the number of rows to be sent from the current proc;
1882 recover info for the receiver of row_connector from the sender */
1883 #ifdef ESYS_MPI
1884 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1885 #endif
1886 num_neighbors = 0;
1887 offsetInShared[0] = 0;
1888 j = 0;
1889 for (i=0; i<size; i++) {
1890 if (i != rank && recv_len[i] > 0) {
1891 neighbor[num_neighbors] = i;
1892 num_neighbors ++;
1893 j += recv_len[i];
1894 offsetInShared[num_neighbors] = j;
1895 }
1896 }
1897 TMPMEMFREE(shared);
1898 TMPMEMFREE(recv_len);
1899 shared = TMPMEMALLOC(j, index_t);
1900 k = offsetInShared[num_neighbors];
1901 #pragma omp parallel for schedule(static) private(i)
1902 for (i=0; i<k; i++) {
1903 shared[i] = i + num_Pmain_cols;
1904 }
1905 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1906 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1907 row_connector = Paso_Connector_alloc(send, recv);
1908 TMPMEMFREE(shared);
1909 Paso_SharedComponents_free(recv);
1910 Paso_SharedComponents_free(send);
1911
1912 /* send/recv pattern->ptr for rowCoupleBlock */
1913 num_RAPext_rows = offsetInShared[num_neighbors];
1914 row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
1915 for (p=0; p<num_neighbors; p++) {
1916 j = offsetInShared[p];
1917 i = offsetInShared[p+1];
1918 #ifdef ESYS_MPI
1919 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1920 mpi_info->msg_tag_counter+neighbor[p],
1921 mpi_info->comm, &mpi_requests[p]);
1922 #endif
1923 }
1924 send = row_connector->send;
1925 for (p=0; p<send->numNeighbors; p++) {
1926 #ifdef ESYS_MPI
1927 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1928 MPI_INT, send->neighbor[p],
1929 mpi_info->msg_tag_counter+rank,
1930 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1931 #endif
1932 }
1933 #ifdef ESYS_MPI
1934 MPI_Waitall(num_neighbors + send->numNeighbors,
1935 mpi_requests, mpi_stati);
1936 #endif
1937 mpi_info->msg_tag_counter += size;
1938 TMPMEMFREE(send_len);
1939
1940 sum = 0;
1941 for (i=0; i<num_RAPext_rows; i++) {
1942 k = row_couple_ptr[i];
1943 row_couple_ptr[i] = sum;
1944 sum += k;
1945 }
1946 row_couple_ptr[num_RAPext_rows] = sum;
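   /* a standard exclusive prefix sum: received row lengths such as {2, 0, 3}
      become the CSR offsets row_couple_ptr = {0, 2, 2, 5}. */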
1947
1948 /* send/recv pattern->index for rowCoupleBlock */
1949 k = row_couple_ptr[num_RAPext_rows];
1950 row_couple_idx = MEMALLOC(k, index_t);
1951 for (p=0; p<num_neighbors; p++) {
1952 j1 = row_couple_ptr[offsetInShared[p]];
1953 j2 = row_couple_ptr[offsetInShared[p+1]];
1954 #ifdef ESYS_MPI
1955 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1956 mpi_info->msg_tag_counter+neighbor[p],
1957 mpi_info->comm, &mpi_requests[p]);
1958 #endif
1959 }
1960 for (p=0; p<send->numNeighbors; p++) {
1961 #ifdef ESYS_MPI
1962 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1963 MPI_INT, send->neighbor[p],
1964 mpi_info->msg_tag_counter+rank,
1965 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1966 #endif
1967 }
1968 #ifdef ESYS_MPI
1969 MPI_Waitall(num_neighbors + send->numNeighbors,
1970 mpi_requests, mpi_stati);
1971 #endif
1972 mpi_info->msg_tag_counter += size;
1973
1974 offset = input_dist->first_component[rank];
1975 k = row_couple_ptr[num_RAPext_rows];
1976 #pragma omp parallel for schedule(static) private(i)
1977 for (i=0; i<k; i++) {
1978 row_couple_idx[i] -= offset;
1979 }
1980 #pragma omp parallel for schedule(static) private(i)
1981 for (i=0; i<size; i++) {
1982 TMPMEMFREE(send_ptr[i]);
1983 TMPMEMFREE(send_idx[i]);
1984 }
1985 TMPMEMFREE(send_ptr);
1986 TMPMEMFREE(send_idx);
1987 TMPMEMFREE(len);
1988 TMPMEMFREE(offsetInShared);
1989 TMPMEMFREE(neighbor);
1990 TMPMEMFREE(mpi_requests);
1991 TMPMEMFREE(mpi_stati);
1992 Esys_MPIInfo_free(mpi_info);
1993
1994 /* Now, we can create pattern for mainBlock and coupleBlock */
1995 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1996 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1997 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1998 num_Pmain_cols, num_RAPext_cols,
1999 RAP_couple_ptr, RAP_couple_idx);
2000 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
2001 num_RAPext_rows, num_Pmain_cols,
2002 row_couple_ptr, row_couple_idx);
2003
2004 /* next, create the system matrix */
2005 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2006 output_dist, input_dist, main_pattern, col_couple_pattern,
2007 row_couple_pattern, col_connector, row_connector);
2008 out = Paso_SystemMatrix_alloc(A->type, pattern,
2009 row_block_size, col_block_size, FALSE);
2010
2011 /* finally, fill in the data */
2012 memcpy(out->mainBlock->val, RAP_main_val,
2013 out->mainBlock->len * sizeof(double));
2014 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2015 out->col_coupleBlock->len * sizeof(double));
2016
2017 /* Clean up */
2018 Paso_SystemMatrixPattern_free(pattern);
2019 Paso_Pattern_free(main_pattern);
2020 Paso_Pattern_free(col_couple_pattern);
2021 Paso_Pattern_free(row_couple_pattern);
2022 Paso_Connector_free(col_connector);
2023 Paso_Connector_free(row_connector);
2024 Paso_Distribution_free(output_dist);
2025 Paso_Distribution_free(input_dist);
2026 TMPMEMFREE(RAP_main_val);
2027 TMPMEMFREE(RAP_couple_val);
2028 if (Esys_noError()) {
2029 return out;
2030 } else {
2031 Paso_SystemMatrix_free(out);
2032 return NULL;
2033 }
2034 }
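/* Both the routine above and the Block variant below rely on the classic
   marker technique to size each row of the sparse triple product RAP. A
   minimal sketch of that idea on plain CSR patterns is given below for
   reference; it is illustrative only (the function name and arguments are
   hypothetical, not part of Paso) and it omits the A_marker shortcut used
   above, which avoids re-walking rows of P already visited for the current
   row of R. */
#if 0
static dim_t csr_rap_row_count(const index_t* R_ptr, const index_t* R_idx,
                               const index_t* A_ptr, const index_t* A_idx,
                               const index_t* P_ptr, const index_t* P_idx,
                               index_t* marker, /* one slot per column of P, initialised to -1 */
                               index_t row)
{
   dim_t nnz = 0;
   index_t jr, ja, jp;
   for (jr = R_ptr[row]; jr < R_ptr[row+1]; jr++) {
      const index_t i1 = R_idx[jr];
      for (ja = A_ptr[i1]; ja < A_ptr[i1+1]; ja++) {
         const index_t i2 = A_idx[ja];
         for (jp = P_ptr[i2]; jp < P_ptr[i2+1]; jp++) {
            const index_t i_c = P_idx[jp];
            if (marker[i_c] != row) { /* first time column i_c shows up in this row */
               marker[i_c] = row;
               nnz++;
            }
         }
      }
   }
   return nnz;
}
#endif
/* The code in this file stores slot positions in P_marker instead of row ids,
   so the same array can later route each value block to the right entry. */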
2035
2036
2037 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2038 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2039 {
2040 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2041 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
2042 Paso_SystemMatrix *out=NULL;
2043 Paso_SystemMatrixPattern *pattern=NULL;
2044 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
2045 Paso_SharedComponents *send =NULL, *recv=NULL;
2046 Paso_Connector *col_connector=NULL, *row_connector=NULL;
2047 Paso_Pattern *main_pattern=NULL;
2048 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2049 const dim_t row_block_size=A->row_block_size;
2050 const dim_t col_block_size=A->col_block_size;
2051 const dim_t block_size = A->block_size;
2052 const double ZERO = 0.0;
2053 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2054 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2055 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2056 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2057 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2058 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2059 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2060 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2061 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2062 index_t sum, i, j, k, iptr, irb, icb, ib;
2063 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2064 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2065 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2066 index_t row_marker_ext, *where_p=NULL;
2067 index_t **send_ptr=NULL, **send_idx=NULL;
2068 dim_t p, num_neighbors;
2069 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2070 Esys_MPI_rank *neighbor=NULL;
2071 #ifdef ESYS_MPI
2072 MPI_Request* mpi_requests=NULL;
2073 MPI_Status* mpi_stati=NULL;
2074 #else
2075 int *mpi_requests=NULL, *mpi_stati=NULL;
2076 #endif
2077
2078 /* two sparse matrices R_main and R_couple will be generated, as the
2079 transposes of P_main and P_col_couple, respectively. Note that
2080 R_couple is actually the row_coupleBlock of R (R=P^T) */
2081 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2082 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2083
2084 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
2085 and needed locally for the triple matrix product RAP.
2086 To be specific, P_ext (on processor k) is the group of rows in P, where
2087 the list of rows from processor q is given by
2088 A->col_coupler->connector->send->shared[rPtr...]
2089 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2090 on q.
2091 P_ext is represented by two sparse matrices P_ext_main and
2092 P_ext_couple */
2093 Paso_Preconditioner_AMG_extendB(A, P);
2094
2095 /* count the number of cols in P_ext_couple, resize the pattern of
2096 sparse matrix P_ext_couple with new compressed order, and then
2097 build the col id mapping from P->col_coupleBlock to
2098 P_ext_couple */
2099 num_Pmain_cols = P->mainBlock->numCols;
2100 num_Pcouple_cols = P->col_coupleBlock->numCols;
2101 num_Acouple_cols = A->col_coupleBlock->numCols;
2102 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2103 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2104 offset = P->col_distribution->first_component[rank];
2105 num_Pext_cols = 0;
2106 if (P->global_id) {
2107 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2108 and P->col_coupleBlock */
2109 iptr = 0;
2110 if (num_Pcouple_cols || sum > 0) {
2111 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
2112 for (; iptr<sum; iptr++){
2113 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2114 }
2115 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2116 temp[iptr] = P->global_id[j];
2117 }
2118 }
2119 if (iptr) {
2120 #ifdef USE_QSORTG
2121 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2122 #else
2123 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2124 #endif
2125 num_Pext_cols = 1;
2126 i = temp[0];
2127 for (j=1; j<iptr; j++) {
2128 if (temp[j] > i) {
2129 i = temp[j];
2130 temp[num_Pext_cols++] = i;
2131 }
2132 }
2133 }
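      /* for example, a sorted temp of {3, 3, 5, 7, 7} collapses in place to
         {3, 5, 7} with num_Pext_cols = 3. */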
2134
2135 /* resize the pattern of P_ext_couple */
2136 if(num_Pext_cols){
2137 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
2138 for (i=0; i<num_Pext_cols; i++)
2139 global_id_P[i] = temp[i];
2140 }
2141 if (num_Pcouple_cols || sum > 0)
2142 TMPMEMFREE(temp);
2143 for (i=0; i<sum; i++) {
2144 where_p = (index_t *)bsearch(
2145 &(P->remote_coupleBlock->pattern->index[i]),
2146 global_id_P, num_Pext_cols,
2147 sizeof(index_t), Paso_comparIndex);
2148 P->remote_coupleBlock->pattern->index[i] =
2149 (index_t)(where_p -global_id_P);
2150 }
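      /* bsearch returns a pointer into global_id_P, so the pointer difference
         is the compressed local column index; e.g. with global_id_P = {10, 42, 97}
         a stored global id of 42 is rewritten as local index 1. */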
2151
2152 /* build the mapping */
2153 if (num_Pcouple_cols) {
2154 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
2155 iptr = 0;
2156 for (i=0; i<num_Pext_cols; i++)
2157 if (global_id_P[i] == P->global_id[iptr]) {
2158 Pcouple_to_Pext[iptr++] = i;
2159 if (iptr == num_Pcouple_cols) break;
2160 }
2161 }
2162 }
2163
2164 /* alloc and initialize the markers */
2165 sum = num_Pext_cols + num_Pmain_cols;
2166 P_marker = TMPMEMALLOC(sum, index_t);
2167 A_marker = TMPMEMALLOC(num_A_cols, index_t);
2168 #pragma omp parallel for private(i) schedule(static)
2169 for (i=0; i<sum; i++) P_marker[i] = -1;
2170 #pragma omp parallel for private(i) schedule(static)
2171 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2172
2173 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2174 sum = 0;
2175 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2176 row_marker = sum;
2177 /* then, loop over elements in row i_r of R_couple */
2178 j1_ub = R_couple->pattern->ptr[i_r+1];
2179 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2180 i1 = R_couple->pattern->index[j1];
2181 /* then, loop over elements in row i1 of A->col_coupleBlock */
2182 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2183 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2184 i2 = A->col_coupleBlock->pattern->index[j2];
2185
2186 /* check whether entry RA[i_r, i2] has been previously visited.
2187 A new RAP entry is possible only if entry RA[i_r, i2] has not
2188 been visited yet */
2189 if (A_marker[i2] != i_r) {
2190 /* first, mark entry RA[i_r,i2] as visited */
2191 A_marker[i2] = i_r;
2192
2193 /* then loop over elements in row i2 of P_ext_main */
2194 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2195 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2196 i_c = P->row_coupleBlock->pattern->index[j3];
2197
2198 /* check whether entry RAP[i_r,i_c] has been created.
2199 If not yet created, create the entry and increase
2200 the total number of elements in RAP_ext */
2201 if (P_marker[i_c] < row_marker) {
2202 P_marker[i_c] = sum;
2203 sum++;
2204 }
2205 }
2206
2207 /* loop over elements in row i2 of P_ext_couple, do the same */
2208 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2209 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2210 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2211 if (P_marker[i_c] < row_marker) {
2212 P_marker[i_c] = sum;
2213 sum++;
2214 }
2215 }
2216 }
2217 }
2218
2219 /* now loop over elements in row i1 of A->mainBlock, repeat
2220 the process we have done to block A->col_coupleBlock */
2221 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2222 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2223 i2 = A->mainBlock->pattern->index[j2];
2224 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2225 A_marker[i2 + num_Acouple_cols] = i_r;
2226 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2227 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2228 i_c = P->mainBlock->pattern->index[j3];
2229 if (P_marker[i_c] < row_marker) {
2230 P_marker[i_c] = sum;
2231 sum++;
2232 }
2233 }
2234 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2235 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2236 /* note that we need to map the column index in
2237 P->col_coupleBlock back into the column index in
2238 P_ext_couple */
2239 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2240 if (P_marker[i_c] < row_marker) {
2241 P_marker[i_c] = sum;
2242 sum++;
2243 }
2244 }
2245 }
2246 }
2247 }
2248 }
2249
2250 /* Now we have the number of non-zero elements in RAP_ext, allocate
2251 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2252 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
2253 RAP_ext_idx = MEMALLOC(sum, index_t);
2254 RAP_ext_val = MEMALLOC(sum * block_size, double);
2255 RA_val = TMPMEMALLOC(block_size, double);
2256 RAP_val = TMPMEMALLOC(block_size, double);
2257
2258 /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
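      /* second pass: the loop below repeats the traversal of the counting pass
         above, but this time writes column indices and accumulates the block
         values; the markers are reset first so every row starts clean. */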
2259 sum = num_Pext_cols + num_Pmain_cols;
2260 #pragma omp parallel for private(i) schedule(static)
2261 for (i=0; i<sum; i++) P_marker[i] = -1;
2262 #pragma omp parallel for private(i) schedule(static)
2263 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2264 sum = 0;
2265 RAP_ext_ptr[0] = 0;
2266 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2267 row_marker = sum;
2268 /* then, loop over elements in row i_r of R_couple */
2269 j1_ub = R_couple->pattern->ptr[i_r+1];
2270 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2271 i1 = R_couple->pattern->index[j1];
2272 R_val = &(R_couple->val[j1*block_size]);
2273
2274 /* then, loop over elements in row i1 of A->col_coupleBlock */
2275 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2276 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2277 i2 = A->col_coupleBlock->pattern->index[j2];
2278 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2279 for (irb=0; irb<row_block_size; irb++) {
2280 for (icb=0; icb<col_block_size; icb++) {
2281 rtmp = ZERO;
2282 for (ib=0; ib<col_block_size; ib++) {
2283 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2284 }
2285 RA_val[irb+row_block_size*icb]=rtmp;
2286 }
2287 }
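            /* blocks are stored column-major: entry (irb, icb) of a block sits
               at offset irb + row_block_size*icb, so the loops above form the
               dense block product RA_val = R_val * temp_val; e.g. for 2x2
               blocks, RA(0,1) = R(0,0)*A(0,1) + R(0,1)*A(1,1) lands in
               RA_val[2]. */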
2288
2289 /* check whether entry RA[i_r, i2] has been previously visited.
2290 A new RAP entry is possible only if entry RA[i_r, i2] has not
2291 been visited yet */
2292 if (A_marker[i2] != i_r) {
2293 /* first, mark entry RA[i_r,i2] as visited */
2294 A_marker[i2] = i_r;
2295
2296 /* then loop over elements in row i2 of P_ext_main */
2297 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2298 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2299 i_c = P->row_coupleBlock->pattern->index[j3];
2300 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2301 for (irb=0; irb<row_block_size; irb++) {
2302 for (icb=0; icb<col_block_size; icb++) {
2303 rtmp = ZERO;
2304 for (ib=0; ib<col_block_size; ib++) {
2305 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2306 }
2307 RAP_val[irb+row_block_size*icb]=rtmp;
2308 }
2309 }
2310
2311
2312 /* check whether entry RAP[i_r,i_c] has been created.
2313 If not yet created, create the entry and increase
2314 the total number of elements in RAP_ext */
2315 if (P_marker[i_c] < row_marker) {
2316 P_marker[i_c] = sum;
2317 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2318 RAP_ext_idx[sum] = i_c + offset;
2319 sum++;
2320 } else {
2321 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2322 for (ib=0; ib<block_size; ib++) {
2323 temp_val[ib] += RAP_val[ib];
2324 }
2325 }
2326 }
2327
2328 /* loop over elements in row i2 of P_ext_couple, do the same */
2329 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2330 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2331 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2332 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2333 for (irb=0; irb<row_block_size; irb++) {
2334 for (icb=0; icb<col_block_size; icb++) {
2335 rtmp = ZERO;
2336 for (ib=0; ib<col_block_size; ib++) {
2337 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2338 }
2339 RAP_val[irb+row_block_size*icb]=rtmp;
2340 }
2341 }
2342
2343 if (P_marker[i_c] < row_marker) {
2344 P_marker[i_c] = sum;
2345 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2346 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2347 sum++;
2348 } else {
2349 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2350 for (ib=0; ib<block_size; ib++) {
2351 temp_val[ib] += RAP_val[ib];
2352 }
2353 }
2354 }
2355
2356 /* If entry RA[i_r, i2] has already been visited, no new RAP entry is
2357 created; only the contributions are added. */
2358 } else {
2359 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2360 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2361 i_c = P->row_coupleBlock->pattern->index[j3];
2362 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2363 for (irb=0; irb<row_block_size; irb++) {
2364 for (icb=0; icb<col_block_size; icb++) {
2365 rtmp = ZERO;
2366 for (ib=0; ib<col_block_size; ib++) {
2367 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2368 }
2369 RAP_val[irb+row_block_size*icb]=rtmp;
2370 }
2371 }
2372
2373 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2374 for (ib=0; ib<block_size; ib++) {
2375 temp_val[ib] += RAP_val[ib];
2376 }
2377 }
2378 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2379 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2380 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2381 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2382 for (irb=0; irb<row_block_size; irb++) {
2383 for (icb=0; icb<col_block_size; icb++) {
2384 rtmp = ZERO;
2385 for (ib=0; ib<col_block_size; ib++) {
2386 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2387 }
2388 RAP_val[irb+row_block_size*icb]=rtmp;
2389 }
2390 }
2391
2392 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2393 for (ib=0; ib<block_size; ib++) {
2394 temp_val[ib] += RAP_val[ib];
2395 }
2396 }
2397 }
2398 }
2399
2400 /* now loop over elements in row i1 of A->mainBlock, repeat
2401 the process we have done to block A->col_coupleBlock */
2402 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2403 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2404 i2 = A->mainBlock->pattern->index[j2];
2405 temp_val = &(A->mainBlock->val[j2*block_size]);
2406 for (irb=0; irb<row_block_size; irb++) {
2407 for (icb=0; icb<col_block_size; icb++) {
2408 rtmp = ZERO;
2409 for (ib=0; ib<col_block_size; ib++) {
2410 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2411 }
2412 RA_val[irb+row_block_size*icb]=rtmp;
2413 }
2414 }
2415
2416 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2417 A_marker[i2 + num_Acouple_cols] = i_r;
2418 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2419 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2420 i_c = P->mainBlock->pattern->index[j3];
2421 temp_val = &(P->mainBlock->val[j3*block_size]);
2422 for (irb=0; irb<row_block_size; irb++) {
2423 for (icb=0; icb<col_block_size; icb++) {
2424 rtmp = ZERO;
2425 for (ib=0; ib<col_block_size; ib++) {
2426 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2427 }
2428 RAP_val[irb+row_block_size*icb]=rtmp;
2429 }
2430 }
2431
2432 if (P_marker[i_c] < row_marker) {
2433 P_marker[i_c] = sum;
2434 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2435 RAP_ext_idx[sum] = i_c + offset;
2436 sum++;
2437 } else {
2438 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2439 for (ib=0; ib<block_size; ib++) {
2440 temp_val[ib] += RAP_val[ib];
2441 }
2442 }
2443 }
2444 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2445 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2446 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2447 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2448 for (irb=0; irb<row_block_size; irb++) {
2449 for (icb=0; icb<col_block_size; icb++) {
2450 rtmp = ZERO;
2451 for (ib=0; ib<col_block_size; ib++) {
2452 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2453 }
2454 RAP_val[irb+row_block_size*icb]=rtmp;
2455 }
2456 }
2457
2458 if (P_marker[i_c] < row_marker) {
2459 P_marker[i_c] = sum;
2460 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2461 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2462 sum++;
2463 } else {
2464 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2465 for (ib=0; ib<block_size; ib++) {
2466 temp_val[ib] += RAP_val[ib];
2467 }
2468 }
2469 }
2470 } else {
2471 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2472 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2473 i_c = P->mainBlock->pattern->index[j3];
2474 temp_val = &(P->mainBlock->val[j3*block_size]);
2475 for (irb=0; irb<row_block_size; irb++) {
2476 for (icb=0; icb<col_block_size; icb++) {
2477 rtmp = ZERO;
2478 for (ib=0; ib<col_block_size; ib++) {
2479 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2480 }
2481 RAP_val[irb+row_block_size*icb]=rtmp;
2482 }
2483 }
2484
2485 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2486 for (ib=0; ib<block_size; ib++) {
2487 temp_val[ib] += RAP_val[ib];
2488 }
2489 }
2490 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2491 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2492 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2493 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2494 for (irb=0; irb<row_block_size; irb++) {
2495 for (icb=0; icb<col_block_size; icb++) {
2496 rtmp = ZERO;
2497 for (ib=0; ib<col_block_size; ib++) {
2498 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2499 }
2500 RAP_val[irb+row_block_size*icb]=rtmp;
2501 }
2502 }
2503
2504 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2505 for (ib=0; ib<block_size; ib++) {
2506 temp_val[ib] += RAP_val[ib];
2507 }
2508 }
2509 }
2510 }
2511 }
2512 RAP_ext_ptr[i_r+1] = sum;
2513 }
2514 TMPMEMFREE(P_marker);
2515 TMPMEMFREE(Pcouple_to_Pext);
2516
2517 /* Now we have part of RAP[r,c] where row "r" is the list of rows
2518 which is given by the column list of P->col_coupleBlock, and
2519 column "c" is the list of columns which possibly covers the
2520 whole column range of the system matrix P. This part of the data
2521 is to be passed to neighboring processors, and added to the
2522 corresponding RAP[r,c] entries in the neighboring processors */
2523 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2524 &RAP_ext_val, global_id_P, block_size);
2525
2526 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2527 sum = RAP_ext_ptr[num_RAPext_rows];
2528 num_RAPext_cols = 0;
2529 if (num_Pext_cols || sum > 0) {
2530 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
2531 j1_ub = offset + num_Pmain_cols;
2532 for (i=0, iptr=0; i<sum; i++) {
2533 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2534 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2535 }
2536 for (j=0; j<num_Pext_cols; j++, iptr++){
2537 temp[iptr] = global_id_P[j];
2538 }
2539
2540 if (iptr) {
2541 #ifdef USE_QSORTG
2542 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2543 #else
2544 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2545 #endif
2546 num_RAPext_cols = 1;
2547 i = temp[0];
2548 for (j=1; j<iptr; j++) {
2549 if (temp[j] > i) {
2550 i = temp[j];
2551 temp[num_RAPext_cols++] = i;
2552 }
2553 }
2554 }
2555 }
2556
2557 /* resize the pattern of RAP_ext with the compressed column order */
2558 if(num_RAPext_cols){
2559 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
2560 for (i=0; i<num_RAPext_cols; i++)
2561 global_id_RAP[i] = temp[i];
2562 }
2563 if (num_Pext_cols || sum > 0)
2564 TMPMEMFREE(temp);
2565 j1_ub = offset + num_Pmain_cols;
2566 for (i=0; i<sum; i++) {
2567 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2568 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2569 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2570 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2571 } else
2572 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2573 }
2574
2575 /* build the mapping */
2576 if (num_Pcouple_cols) {
2577 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
2578 iptr = 0;
2579 for (i=0; i<num_RAPext_cols; i++)
2580 if (global_id_RAP[i] == P->global_id[iptr]) {
2581 Pcouple_to_RAP[iptr++] = i;
2582 if (iptr == num_Pcouple_cols) break;
2583 }
2584 }
2585
2586 if (num_Pext_cols) {
2587 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
2588 iptr = 0;
2589 for (i=0; i<num_RAPext_cols; i++)
2590 if (global_id_RAP[i] == global_id_P[iptr]) {
2591 Pext_to_RAP[iptr++] = i;
2592 if (iptr == num_Pext_cols) break;
2593 }
2594 }
2595
2596 if (global_id_P){
2597 TMPMEMFREE(global_id_P);
2598 global_id_P = NULL;
2599 }
2600
2601 /* alloc and initialize the markers */
2602 sum = num_RAPext_cols + num_Pmain_cols;
2603 P_marker = TMPMEMALLOC(sum, index_t);
2604 #pragma omp parallel for private(i) schedule(static)
2605 for (i=0; i<sum; i++) P_marker[i] = -1;
2606 #pragma omp parallel for private(i) schedule(static)
2607 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2608
2609 /* Now, count the size of RAP. Start with rows in R_main */
2610 num_neighbors = P->col_coupler->connector->send->numNeighbors;
2611 offsetInShared = P->col_coupler->connector->send->offsetInShared;
2612 shared = P->col_coupler->connector->send->shared;
2613 i = 0;
2614 j = 0;
2615 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2616 /* Mark the start of row for both main block and couple block */
2617 row_marker = i;
2618 row_marker_ext = j;
2619
2620 /* Mark the diagonal element RAP[i_r, i_r], and other elements
2621 in RAP_ext */
2622 P_marker[i_r] = i;
2623 i++;
2624 for (j1=0; j1<num_neighbors; j1++) {
2625 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2626 if (shared[j2] == i_r) {
2627 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2628 i_c = RAP_ext_idx[k];
2629 if (i_c < num_Pmain_cols) {
2630 if (P_marker[i_c] < row_marker) {
2631 P_marker[i_c] = i;
2632 i++;
2633 }
2634 } else {
2635 if (P_marker[i_c] < row_marker_ext) {
2636 P_marker[i_c] = j;
2637 j++;
2638 }
2639 }
2640 }
2641 break;
2642 }
2643 }
2644 }
2645
2646 /* then, loop over elements in row i_r of R_main */
2647 j1_ub = R_main->pattern->ptr[i_r+1];
2648 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2649 i1 = R_main->pattern->index[j1];
2650
2651 /* then, loop over elements in row i1 of A->col_coupleBlock */
2652 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2653 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2654 i2 = A->col_coupleBlock->pattern->index[j2];
2655
2656 /* check whether entry RA[i_r, i2] has been previously visited.
2657 A new RAP entry is possible only if entry RA[i_r, i2] has not
2658 been visited yet */
2659 if (A_marker[i2] != i_r) {
2660 /* first, mark entry RA[i_r,i2] as visited */
2661 A_marker[i2] = i_r;
2662
2663 /* then loop over elements in row i2 of P_ext_main */
2664 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2665 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2666 i_c = P->row_coupleBlock->pattern->index[j3];
2667
2668 /* check whether entry RAP[i_r,i_c] has been created.
2669 If not yet created, create the entry and increase
2670 the total number of elements in RAP_main */
2671 if (P_marker[i_c] < row_marker) {
2672 P_marker[i_c] = i;
2673 i++;
2674 }
2675 }
2676
2677 /* loop over elements in row i2 of P_ext_couple, do the same */
2678 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2679 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2680 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2681 if (P_marker[i_c] < row_marker_ext) {
2682 P_marker[i_c] = j;
2683 j++;
2684 }
2685 }
2686 }
2687 }
2688
2689 /* now loop over elements in row i1 of A->mainBlock, repeat
2690 the process we have done to block A->col_coupleBlock */
2691 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2692 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2693 i2 = A->mainBlock->pattern->index[j2];
2694 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2695 A_marker[i2 + num_Acouple_cols] = i_r;
2696 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2697 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2698 i_c = P->mainBlock->pattern->index[j3];
2699 if (P_marker[i_c] < row_marker) {
2700 P_marker[i_c] = i;
2701 i++;
2702 }
2703 }
2704 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2705 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2706 /* note that we need to map the column index in
2707 P->col_coupleBlock back into the column index in
2708 P_ext_couple */
2709 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2710 if (P_marker[i_c] < row_marker_ext) {
2711 P_marker[i_c] = j;
2712 j++;
2713 }
2714 }
2715 }
2716 }
2717 }
2718 }
2719
2720 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
2721 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2722 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2723 RAP_couple */
2724 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2725 RAP_main_idx = MEMALLOC(i, index_t);
2726 RAP_main_val = TMPMEMALLOC(i * block_size, double);
2727 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2728 RAP_couple_idx = MEMALLOC(j, index_t);
2729 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
2730
2731 RAP_main_ptr[num_Pmain_cols] = i;
2732 RAP_couple_ptr[num_Pmain_cols] = j;
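   /* the totals from the counting pass close off both ptr arrays up front; the
      fill pass below then rewrites the leading entries row by row, and each
      per-row segment is sorted afterwards. */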
2733
2734 #pragma omp parallel for private(i) schedule(static)
2735 for (i=0; i<sum; i++) P_marker[i] = -1;
2736 #pragma omp parallel for private(i) schedule(static)
2737 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2738
2739 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
2740 in R_main */
2741 i = 0;
2742 j = 0;
2743 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2744 /* Mark the start of row for both main block and couple block */
2745 row_marker = i;
2746 row_marker_ext = j;
2747 RAP_main_ptr[i_r] = row_marker;
2748 RAP_couple_ptr[i_r] = row_marker_ext;
2749
2750 /* Mark and setup the diagonal element RAP[i_r, i_r], and elements
2751 in row i_r of RAP_ext */
2752 P_marker[i_r] = i;
2753 for (ib=0; ib<block_size; ib++) RAP_main_val[i*block_size+ib] = ZERO; /* zero the whole diagonal block, not just its first entry */
2754 RAP_main_idx[i] = i_r;
2755 i++;
2756
2757 for (j1=0; j1<num_neighbors; j1++) {
2758 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2759 if (shared[j2] == i_r) {
2760 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2761 i_c = RAP_ext_idx[k];
2762 if (i_c < num_Pmain_cols) {
2763 if (P_marker[i_c] < row_marker) {
2764 P_marker[i_c] = i;
2765 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2766 RAP_main_idx[i] = i_c;
2767 i++;
2768 } else {
2769 temp_val = &(RAP_ext_val[k*block_size]);
2770 RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2771 for (ib=0; ib<block_size; ib++)
2772 RAP_val[ib] += temp_val[ib];
2773 }
2774 } else {
2775 if (P_marker[i_c] < row_marker_ext) {
2776 P_marker[i_c] = j;
2777 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2778 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2779 j++;
2780 } else {
2781 temp_val = &(RAP_ext_val[k*block_size]);
2782 RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2783 for (ib=0; ib<block_size; ib++)
2784 RAP_val[ib] += temp_val[ib];
2785 }
2786 }
2787 }
2788 break;
2789 }
2790 }
2791 }
2792
2793 /* then, loop over elements in row i_r of R_main */
2794 j1_ub = R_main->pattern->ptr[i_r+1];
2795 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2796 i1 = R_main->pattern->index[j1];
2797 R_val = &(R_main->val[j1*block_size]);
2798
2799 /* then, loop over elements in row i1 of A->col_coupleBlock */
2800 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2801 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2802 i2 = A->col_coupleBlock->pattern->index[j2];
2803 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2804 for (irb=0; irb<row_block_size; irb++) {
2805 for (icb=0; icb<col_block_size; icb++) {
2806 rtmp = ZERO;
2807 for (ib=0; ib<col_block_size; ib++) {
2808 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2809 }
2810 RA_val[irb+row_block_size*icb]=rtmp;
2811 }
2812 }
2813
2814
2815 /* check whether entry RA[i_r, i2] has been previously visited.
2816 A new RAP entry is possible only if entry RA[i_r, i2] has not
2817 been visited yet */
2818 if (A_marker[i2] != i_r) {
2819 /* first, mark entry RA[i_r,i2] as visited */
2820 A_marker[i2] = i_r;
2821
2822 /* then loop over elements in row i2 of P_ext_main */
2823 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2824 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2825 i_c = P->row_coupleBlock->pattern->index[j3];
2826 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2827 for (irb=0; irb<row_block_size; irb++) {
2828 for (icb=0; icb<col_block_size; icb++) {
2829 rtmp = ZERO;
2830 for (ib=0; ib<col_block_size; ib++) {
2831 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2832 }
2833 RAP_val[irb+row_block_size*icb]=rtmp;
2834 }
2835 }
2836
2837
2838 /* check whether entry RAP[i_r,i_c] has been created.
2839 If not yet created, create the entry and increase
2840 the total number of elements in RAP_main */
2841 if (P_marker[i_c] < row_marker) {
2842 P_marker[i_c] = i;
2843 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2844 RAP_main_idx[i] = i_c;
2845 i++;
2846 } else {
2847 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2848 for (ib=0; ib<block_size; ib++) {
2849 temp_val[ib] += RAP_val[ib];
2850 }
2851 }
2852 }
2853
2854 /* loop over elements in row i2 of P_ext_couple, do the same */
2855 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2856 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2857 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2858 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2859 for (irb=0; irb<row_block_size; irb++) {
2860 for (icb=0; icb<col_block_size; icb++) {
2861 rtmp = ZERO;
2862 for (ib=0; ib<col_block_size; ib++) {
2863 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2864 }
2865 RAP_val[irb+row_block_size*icb]=rtmp;
2866 }
2867 }
2868
2869 if (P_marker[i_c] < row_marker_ext) {
2870 P_marker[i_c] = j;
2871 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2872 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2873 j++;
2874 } else {
2875 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2876 for (ib=0; ib<block_size; ib++) {
2877 temp_val[ib] += RAP_val[ib];
2878 }
2879 }
2880 }
2881
2882 /* If entry RA[i_r, i2] has already been visited, no new RAP entry is
2883 created; only the contributions are added. */
2884 } else {
2885 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2886 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2887 i_c = P->row_coupleBlock->pattern->index[j3];
2888 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2889 for (irb=0; irb<row_block_size; irb++) {
2890 for (icb=0; icb<col_block_size; icb++) {
2891 rtmp = ZERO;
2892 for (ib=0; ib<col_block_size; ib++) {
2893 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2894 }
2895 RAP_val[irb+row_block_size*icb]=rtmp;
2896 }
2897 }
2898
2899 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2900 for (ib=0; ib<block_size; ib++) {
2901 temp_val[ib] += RAP_val[ib];
2902 }
2903 }
2904 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2905 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2906 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2907 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2908 for (irb=0; irb<row_block_size; irb++) {
2909 for (icb=0; icb<col_block_size; icb++) {
2910 rtmp = ZERO;
2911 for (ib=0; ib<col_block_size; ib++) {
2912 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2913 }
2914 RAP_val[irb+row_block_size*icb]=rtmp;
2915 }
2916 }
2917
2918 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2919 for (ib=0; ib<block_size; ib++) {
2920 temp_val[ib] += RAP_val[ib];
2921 }
2922 }
2923 }
2924 }
2925
2926 /* now loop over elements in row i1 of A->mainBlock, repeat
2927 the process we have done to block A->col_coupleBlock */
2928 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2929 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2930 i2 = A->mainBlock->pattern->index[j2];
2931 temp_val = &(A->mainBlock->val[j2*block_size]);
2932 for (irb=0; irb<row_block_size; irb++) {
2933 for (icb=0; icb<col_block_size; icb++) {
2934 rtmp = ZERO;
2935 for (ib=0; ib<col_block_size; ib++) {
2936 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2937 }
2938 RA_val[irb+row_block_size*icb]=rtmp;
2939 }
2940 }
2941
2942 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2943 A_marker[i2 + num_Acouple_cols] = i_r;
2944 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2945 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2946 i_c = P->mainBlock->pattern->index[j3];
2947 temp_val = &(P->mainBlock->val[j3*block_size]);
2948 for (irb=0; irb<row_block_size; irb++) {
2949 for (icb=0; icb<col_block_size; icb++) {
2950 rtmp = ZERO;
2951 for (ib=0; ib<col_block_size; ib++) {
2952 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2953 }
2954 RAP_val[irb+row_block_size*icb]=rtmp;
2955 }
2956 }
2957
2958 if (P_marker[i_c] < row_marker) {
2959 P_marker[i_c] = i;
2960 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2961 RAP_main_idx[i] = i_c;
2962 i++;
2963 } else {
2964 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2965 for (ib=0; ib<block_size; ib++) {
2966 temp_val[ib] += RAP_val[ib];
2967 }
2968 }
2969 }
2970 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2971 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2972 /* note that we need to map the column index in
2973 P->col_coupleBlock back into the column index in
2974 P_ext_couple */
2975 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2976 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2977 for (irb=0; irb<row_block_size; irb++) {
2978 for (icb=0; icb<col_block_size; icb++) {
2979 rtmp = ZERO;
2980 for (ib=0; ib<col_block_size; ib++) {
2981 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2982 }
2983 RAP_val[irb+row_block_size*icb]=rtmp;
2984 }
2985 }
2986
2987 if (P_marker[i_c] < row_marker_ext) {
2988 P_marker[i_c] = j;
2989 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2990 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2991 j++;
2992 } else {
2993 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2994 for (ib=0; ib<block_size; ib++) {
2995 temp_val[ib] += RAP_val[ib];
2996 }
2997 }
2998 }
2999
3000 } else {
3001 j3_ub = P->mainBlock->pattern->ptr[i2+1];
3002 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3003 i_c = P->mainBlock->pattern->index[j3];
3004 temp_val = &(P->mainBlock->val[j3*block_size]);
3005 for (irb=0; irb<row_block_size; irb++) {
3006 for (icb=0; icb<col_block_size; icb++) {
3007 rtmp = ZERO;
3008 for (ib=0; ib<col_block_size; ib++) {
3009 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3010 }
3011 RAP_val[irb+row_block_size*icb]=rtmp;
3012 }
3013 }
3014
3015 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3016 for (ib=0; ib<block_size; ib++) {
3017 temp_val[ib] += RAP_val[ib];
3018 }
3019 }
3020 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
3021 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3022 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3023 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
3024 for (irb=0; irb<row_block_size; irb++) {
3025 for (icb=0; icb<col_block_size; icb++) {
3026 rtmp = ZERO;
3027 for (ib=0; ib<col_block_size; ib++) {
3028 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3029 }
3030 RAP_val[irb+row_block_size*icb]=rtmp;
3031 }
3032 }
3033
3034 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3035 for (ib=0; ib<block_size; ib++) {
3036 temp_val[ib] += RAP_val[ib];
3037 }
3038 }
3039 }
3040 }
3041 }
3042
3043 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
3044 if (i > row_marker) {
3045 offset = i - row_marker;
3046 temp = TMPMEMALLOC(offset, index_t);
3047 for (iptr=0; iptr<offset; iptr++)
3048 temp[iptr] = RAP_main_idx[row_marker+iptr];
3049 if (offset > 0) {
3050 #ifdef USE_QSORTG
3051 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3052 #else
3053 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3054 #endif
3055 }
3056 temp_val = TMPMEMALLOC(offset * block_size, double);
3057 for (iptr=0; iptr<offset; iptr++){
3058 k = P_marker[temp[iptr]];
3059 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
3060 P_marker[temp[iptr]] = iptr + row_marker;
3061 }
3062 for (iptr=0; iptr<offset; iptr++){
3063 RAP_main_idx[row_marker+iptr] = temp[iptr];
3064 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3065 }
3066 TMPMEMFREE(temp);
3067 TMPMEMFREE(temp_val);
3068 }
3069 if (j > row_marker_ext) {
3070 offset = j - row_marker_ext;
3071 temp = TMPMEMALLOC(offset, index_t);
3072 for (iptr=0; iptr<offset; iptr++)
3073 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
3074 if (offset > 0) {
3075 #ifdef USE_QSORTG
3076 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3077 #else
3078 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3079 #endif
3080 }
3081 temp_val = TMPMEMALLOC(offset * block_size, double);
3082 for (iptr=0; iptr<offset; iptr++){
3083 k = P_marker[temp[iptr] + num_Pmain_cols];
3084 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
3085 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
3086 }
3087 for (iptr=0; iptr<offset; iptr++){
3088 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
3089 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3090 }
3091 TMPMEMFREE(temp);
3092 TMPMEMFREE(temp_val);
3093 }
3094 }
3095
3096 TMPMEMFREE(RA_val);
3097 TMPMEMFREE(RAP_val);
3098 TMPMEMFREE(A_marker);
3099 TMPMEMFREE(Pext_to_RAP);
3100 TMPMEMFREE(Pcouple_to_RAP);
3101 MEMFREE(RAP_ext_ptr);
3102 MEMFREE(RAP_ext_idx);
3103 MEMFREE(RAP_ext_val);
3104 Paso_SparseMatrix_free(R_main);
3105 Paso_SparseMatrix_free(R_couple);
3106
3107 /* Check whether there are empty columns in RAP_couple */
3108 #pragma omp parallel for schedule(static) private(i)
3109 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
3110 /* num of non-empty columns is stored in "k" */
3111 k = 0;
3112 j = RAP_couple_ptr[num_Pmain_cols];
3113 for (i=0; i<j; i++) {
3114 i1 = RAP_couple_idx[i];
3115 if (P_marker[i1]) {
3116 P_marker[i1] = 0;
3117 k++;
3118 }
3119 }
3120
3121 /* empty columns were found */
3122 if (k < num_RAPext_cols) {
3123 temp = TMPMEMALLOC(k, index_t);
3124 k = 0;
3125 for (i=0; i<num_RAPext_cols; i++)
3126 if (!P_marker[i]) {
3127 P_marker[i] = k;
3128 temp[k] = global_id_RAP[i];
3129 k++;
3130 }
3131 for (i=0; i<j; i++) {
3132 i1 = RAP_couple_idx[i];
3133 RAP_couple_idx[i] = P_marker[i1];
3134 }
3135 num_RAPext_cols = k;
3136 TMPMEMFREE(global_id_RAP);
3137 global_id_RAP = temp;
3138 }
3139 TMPMEMFREE(P_marker);
3140
3141 /******************************************************/
3142 /* Start to create the coarse level System Matrix A_c */
3143 /******************************************************/
3144 /* first, prepare the sender/receiver for the col_connector */
3145 dist = P->pattern->input_distribution->first_component;
3146 recv_len = TMPMEMALLOC(size, dim_t);
3147 send_len = TMPMEMALLOC(size, dim_t);
3148 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
3149 offsetInShared = TMPMEMALLOC(size+1, index_t);
3150 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
3151 memset(recv_len, 0, sizeof(dim_t) * size);
3152 memset(send_len, 0, sizeof(dim_t) * size);
3153 num_neighbors = 0;
3154 offsetInShared[0] = 0;
3155 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
3156 shared[i] = i + num_Pmain_cols;
3157 if (k <= global_id_RAP[i]) {
3158 if (recv_len[j] > 0) {
3159 neighbor[num_neighbors] = j;
3160 num_neighbors ++;
3161 offsetInShared[num_neighbors] = i;
3162 }
3163 while (k <= global_id_RAP[i]) {
3164 j++;
3165 k = dist[j+1];
3166 }
3167 }
3168 recv_len[j] ++;
3169 }
3170 if (recv_len[j] > 0) {
3171 neighbor[num_neighbors] = j;
3172 num_neighbors ++;
3173 offsetInShared[num_neighbors] = i;
3174 }
3175 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3176 neighbor, shared, offsetInShared, 1, 0, mpi_info);

#ifdef ESYS_MPI
    MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
#endif

#ifdef ESYS_MPI
    mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
    mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
#else
    mpi_requests=TMPMEMALLOC(size*2,int);
    mpi_stati=TMPMEMALLOC(size*2,int);
#endif
    num_neighbors = 0;
    j = 0;
    offsetInShared[0] = 0;
    for (i=0; i<size; i++) {
        if (send_len[i] > 0) {
            neighbor[num_neighbors] = i;
            num_neighbors ++;
            j += send_len[i];
            offsetInShared[num_neighbors] = j;
        }
    }
    TMPMEMFREE(shared);
    shared = TMPMEMALLOC(j, index_t);
    for (i=0, j=0; i<num_neighbors; i++) {
        k = neighbor[i];
#ifdef ESYS_MPI
        MPI_Irecv(&shared[j], send_len[k], MPI_INT, k,
                  mpi_info->msg_tag_counter+k,
                  mpi_info->comm, &mpi_requests[i]);
#endif
        j += send_len[k];
    }
    for (i=0, j=0; i<recv->numNeighbors; i++) {
        k = recv->neighbor[i];
#ifdef ESYS_MPI
        MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
                   mpi_info->msg_tag_counter+rank,
                   mpi_info->comm, &mpi_requests[i+num_neighbors]);
#endif
        j += recv_len[k];
    }
#ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + recv->numNeighbors,
                mpi_requests, mpi_stati);
#endif
    mpi_info->msg_tag_counter += size;

    j = offsetInShared[num_neighbors];
    offset = dist[rank];
    for (i=0; i<j; i++) shared[i] = shared[i] - offset;
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                       neighbor, shared, offsetInShared, 1, 0, mpi_info);

    col_connector = Paso_Connector_alloc(send, recv);
    TMPMEMFREE(shared);
    Paso_SharedComponents_free(recv);
    Paso_SharedComponents_free(send);
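    /* Note on the exchange above: each rank Issends the global ids it
       needs (its receiver list) to the owning ranks, and Irecvs the ids
       that its neighbours need from it; after shifting by dist[rank] these
       become the local indices of the sender side of col_connector. */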

    /* now, create the row distribution (output_dist) and the column
       distribution (input_dist) */
    input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
    output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);

    /* then, prepare the sender/receiver for the row_connector; first,
       gather the information for the sender */
    sum = RAP_couple_ptr[num_Pmain_cols];
    len = TMPMEMALLOC(size, dim_t);
    send_ptr = TMPMEMALLOC(size, index_t*);
    send_idx = TMPMEMALLOC(size, index_t*);
    for (i=0; i<size; i++) {
        send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
        send_idx[i] = TMPMEMALLOC(sum, index_t);
        memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
    }
    memset(len, 0, sizeof(dim_t) * size);
    recv = col_connector->recv;
    sum=0;
    for (i_r=0; i_r<num_Pmain_cols; i_r++) {
        i1 = RAP_couple_ptr[i_r];
        i2 = RAP_couple_ptr[i_r+1];
        if (i2 > i1) {
            /* then row i_r will be in the sender of row_connector; now check
               how many neighbours row i_r needs to be sent to */
            for (j=i1; j<i2; j++) {
                i_c = RAP_couple_idx[j];
                /* find out the corresponding neighbour "p" of column i_c */
                for (p=0; p<recv->numNeighbors; p++) {
                    if (i_c < recv->offsetInShared[p+1]) {
                        k = recv->neighbor[p];
                        if (send_ptr[k][i_r] == 0) sum++;
                        send_ptr[k][i_r] ++;
                        send_idx[k][len[k]] = global_id_RAP[i_c];
                        len[k] ++;
                        break;
                    }
                }
            }
        }
    }
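    /* After this scan, send_ptr[q][i_r] counts the entries of row i_r that
       rank q needs, send_idx[q] lists their global column ids in row order
       (len[q] entries in total), and "sum" is the number of distinct
       (row, neighbour) pairs, i.e. the size of the sender's shared array
       allocated below. */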
    if (global_id_RAP) {
        TMPMEMFREE(global_id_RAP);
        global_id_RAP = NULL;
    }

    /* now allocate the sender */
    shared = TMPMEMALLOC(sum, index_t);
    memset(send_len, 0, sizeof(dim_t) * size);
    num_neighbors=0;
    offsetInShared[0] = 0;
    for (p=0, k=0; p<size; p++) {
        for (i=0; i<num_Pmain_cols; i++) {
            if (send_ptr[p][i] > 0) {
                shared[k] = i;
                k++;
                send_ptr[p][send_len[p]] = send_ptr[p][i];
                send_len[p]++;
            }
        }
        if (k > offsetInShared[num_neighbors]) {
            neighbor[num_neighbors] = p;
            num_neighbors ++;
            offsetInShared[num_neighbors] = k;
        }
    }
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                       neighbor, shared, offsetInShared, 1, 0, mpi_info);
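    /* Note that send_ptr[p] is compacted in place: the per-row counts of
       the send_len[p] non-empty rows are moved to the front of the array,
       so they can later be shipped directly as the row counts of
       rowCoupleBlock. */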

    /* exchange the number of rows to be sent from the current proc;
       this recovers the receiver info of row_connector from the sender
       info */
#ifdef ESYS_MPI
    MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
#endif
    num_neighbors = 0;
    offsetInShared[0] = 0;
    j = 0;
    for (i=0; i<size; i++) {
        if (i != rank && recv_len[i] > 0) {
            neighbor[num_neighbors] = i;
            num_neighbors ++;
            j += recv_len[i];
            offsetInShared[num_neighbors] = j;
        }
    }
    TMPMEMFREE(shared);
    TMPMEMFREE(recv_len);
    shared = TMPMEMALLOC(j, index_t);
    k = offsetInShared[num_neighbors];
    for (i=0; i<k; i++) {
        shared[i] = i + num_Pmain_cols;
    }
    recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                       neighbor, shared, offsetInShared, 1, 0, mpi_info);
    row_connector = Paso_Connector_alloc(send, recv);
    TMPMEMFREE(shared);
    Paso_SharedComponents_free(recv);
    Paso_SharedComponents_free(send);

    /* send/recv pattern->ptr for rowCoupleBlock */
    num_RAPext_rows = offsetInShared[num_neighbors];
    row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
    for (p=0; p<num_neighbors; p++) {
        j = offsetInShared[p];
        i = offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
                  mpi_info->msg_tag_counter+neighbor[p],
                  mpi_info->comm, &mpi_requests[p]);
#endif
    }
    send = row_connector->send;
    for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
        MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
                   MPI_INT, send->neighbor[p],
                   mpi_info->msg_tag_counter+rank,
                   mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + send->numNeighbors,
                mpi_requests, mpi_stati);
#endif
    mpi_info->msg_tag_counter += size;
    TMPMEMFREE(send_len);

    sum = 0;
    for (i=0; i<num_RAPext_rows; i++) {
        k = row_couple_ptr[i];
        row_couple_ptr[i] = sum;
        sum += k;
    }
    row_couple_ptr[num_RAPext_rows] = sum;
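    /* The received values are per-row counts; the exclusive prefix sum
       above turns them into CSR row offsets. For instance (hypothetical
       values), counts {3, 0, 2} become row_couple_ptr = {0, 3, 3, 5}
       with sum = 5. */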

    /* send/recv pattern->index for rowCoupleBlock */
    k = row_couple_ptr[num_RAPext_rows];
    row_couple_idx = MEMALLOC(k, index_t);
    for (p=0; p<num_neighbors; p++) {
        j1 = row_couple_ptr[offsetInShared[p]];
        j2 = row_couple_ptr[offsetInShared[p+1]];
#ifdef ESYS_MPI
        MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
                  mpi_info->msg_tag_counter+neighbor[p],
                  mpi_info->comm, &mpi_requests[p]);
#endif
    }
    for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
        MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
                   MPI_INT, send->neighbor[p],
                   mpi_info->msg_tag_counter+rank,
                   mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + send->numNeighbors,
                mpi_requests, mpi_stati);
#endif
    mpi_info->msg_tag_counter += size;

    offset = input_dist->first_component[rank];
    k = row_couple_ptr[num_RAPext_rows];
    for (i=0; i<k; i++) {
        row_couple_idx[i] -= offset;
    }
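    /* The received indices are global column ids owned by this rank; the
       subtraction localises them. E.g. (hypothetical values) with
       dist = {0,4,8} on rank 1, a received global id 6 becomes local
       id 2. */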

    for (i=0; i<size; i++) {
        TMPMEMFREE(send_ptr[i]);
        TMPMEMFREE(send_idx[i]);
    }
    TMPMEMFREE(send_ptr);
    TMPMEMFREE(send_idx);
    TMPMEMFREE(len);
    TMPMEMFREE(offsetInShared);
    TMPMEMFREE(neighbor);
    TMPMEMFREE(mpi_requests);
    TMPMEMFREE(mpi_stati);
    Esys_MPIInfo_free(mpi_info);

    /* Now, we can create the patterns for mainBlock and the coupleBlocks */
    main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
                        num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
    col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                        num_Pmain_cols, num_RAPext_cols,
                        RAP_couple_ptr, RAP_couple_idx);
    row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                        num_RAPext_rows, num_Pmain_cols,
                        row_couple_ptr, row_couple_idx);
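    /* main_pattern couples local rows to local columns, col_couple_pattern
       local rows to the num_RAPext_cols external columns, and
       row_couple_pattern the num_RAPext_rows external rows back to local
       columns, matching the block layout of a Paso_SystemMatrix. */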

    /* next, create the system matrix */
    pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
                output_dist, input_dist, main_pattern, col_couple_pattern,
                row_couple_pattern, col_connector, row_connector);
    out = Paso_SystemMatrix_alloc(A->type, pattern,
                row_block_size, col_block_size, FALSE);

    /* finally, fill in the data */
    memcpy(out->mainBlock->val, RAP_main_val,
           out->mainBlock->len * sizeof(double));
    memcpy(out->col_coupleBlock->val, RAP_couple_val,
           out->col_coupleBlock->len * sizeof(double));

    /* Clean up */
    Paso_SystemMatrixPattern_free(pattern);
    Paso_Pattern_free(main_pattern);
    Paso_Pattern_free(col_couple_pattern);
    Paso_Pattern_free(row_couple_pattern);
    Paso_Connector_free(col_connector);
    Paso_Connector_free(row_connector);
    Paso_Distribution_free(output_dist);
    Paso_Distribution_free(input_dist);
    TMPMEMFREE(RAP_main_val);
    TMPMEMFREE(RAP_couple_val);
    if (Esys_noError()) {
        return out;
    } else {
        Paso_SystemMatrix_free(out);
        return NULL;
    }
}
