/[escript]/trunk/paso/src/AMG_Interpolation.cpp

Revision 3911 - Thu Jun 14 01:01:03 2012 UTC by jfenwick
Original Path: trunk/paso/src/AMG_Interpolation.c
File MIME type: text/plain
File size: 123351 byte(s)
Copyright changes

/*******************************************************
*
* Copyright (c) 2003-2012 by University of Queensland
* Earth Systems Science Computational Center (ESSCC)
* http://www.uq.edu.au/esscc
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
*******************************************************/


/**************************************************************/

/* Paso: defines AMG Interpolation */

/**************************************************************/

/* Author: l.gao@uq.edu.au */

/**************************************************************/

#include "Paso.h"
#include "SparseMatrix.h"
#include "PasoUtil.h"
#include "Preconditioner.h"

/**************************************************************

   Methods necessary for the AMG preconditioner:

   construct the n_C x n_C coarse-level operator A_C from matrix A
   and prolongation matrix P.

   The coarsening operator A_C is defined by RAP where R=P^T.

***************************************************************/
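
/* For reference, the triple product computed below is, entry-wise,

       A_C[i][j] = sum_{k,l} P[k][i] * A[k][l] * P[l][j],

   i.e. A_C = P^T A P, the Galerkin coarse-grid operator. A dense sketch
   (illustrative only; the routines below work on the distributed CSR
   blocks and never form these dense loops):

       for (i=0; i<n_C; i++)
           for (j=0; j<n_C; j++)
               for (k=0; k<n; k++)
                   for (l=0; l<n; l++)
                       A_C[i][j] += P[k][i] * A[k][l] * P[l][j];
*/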
39
/* Extend the system matrix B with two extra sparse matrices:
   B_ext_main and B_ext_couple.
   Together these two sparse matrices represent the portion of B
   that is stored on neighbor procs and needed locally for the
   triple matrix product (B^T) A B.

   FOR NOW, we assume that the system matrix B has a NULL
   row_coupleBlock and a NULL remote_coupleBlock. Based on
   this assumption, we link row_coupleBlock to the sparse
   matrix B_ext_main, and remote_coupleBlock to the sparse
   matrix B_ext_couple.

   To be specific, B_ext (on processor k) is the group of rows
   in B, where the list of rows from processor q is given by
     A->col_coupler->connector->send->shared[rPtr...]
     rPtr=A->col_coupler->connector->send->OffsetInShared[k]
   on q. */
void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
{
    Paso_Pattern *pattern_main=NULL, *pattern_couple=NULL;
    Paso_Coupler *coupler=NULL;
    Paso_SharedComponents *send=NULL, *recv=NULL;
    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
    index_t *idx_m=NULL, *idx_c=NULL;
    index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
    index_t offset, len, block_size, block_size_size, max_num_cols;
    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
    dim_t *recv_degree=NULL, *send_degree=NULL;
    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;

    if (size == 1) return;

    if (B->row_coupleBlock) {
        Paso_SparseMatrix_free(B->row_coupleBlock);
        B->row_coupleBlock = NULL;
    }

    if (B->row_coupleBlock || B->remote_coupleBlock) {
        Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
        return;
    }

    /* send/receive the unknowns' global IDs */
    num_main_cols = B->mainBlock->numCols;
    cols = TMPMEMALLOC(num_main_cols, double);
    offset = B->col_distribution->first_component[rank];
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
    if (B->global_id == NULL) {
        coupler = Paso_Coupler_alloc(B->col_coupler->connector, 1);
        Paso_Coupler_startCollect(coupler, cols);
    }

    recv_buf = TMPMEMALLOC(size, index_t);
    recv_degree = TMPMEMALLOC(size, dim_t);
    recv_offset = TMPMEMALLOC(size+1, index_t);
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<size; i++){
        recv_buf[i] = 0;
        recv_degree[i] = 1;
        recv_offset[i] = i;
    }

    block_size = B->block_size;
    block_size_size = block_size * sizeof(double);
    num_couple_cols = B->col_coupleBlock->numCols;
    send = A->col_coupler->connector->send;
    recv = A->col_coupler->connector->recv;
    num_neighbors = send->numNeighbors;
    p = send->offsetInShared[num_neighbors];
    len = p * B->col_distribution->first_component[size];
    send_buf = TMPMEMALLOC(len * block_size, double);
    send_idx = TMPMEMALLOC(len, index_t);
    send_offset = TMPMEMALLOC((p+1)*2, index_t);
    send_degree = TMPMEMALLOC(num_neighbors, dim_t);
    i = num_main_cols + num_couple_cols;
    send_m = TMPMEMALLOC(i * block_size, double);
    send_c = TMPMEMALLOC(i * block_size, double);
    idx_m = TMPMEMALLOC(i, index_t);
    idx_c = TMPMEMALLOC(i, index_t);

    /* wait for the unknowns' global IDs to arrive */
    if (B->global_id == NULL) {
        Paso_Coupler_finishCollect(coupler);
        global_id = MEMALLOC(num_couple_cols, index_t);
        #pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_couple_cols; ++i)
            global_id[i] = coupler->recv_buffer[i];
        B->global_id = global_id;
        Paso_Coupler_free(coupler);
    } else
        global_id = B->global_id;

    /* distribute the number of cols in the current col_coupleBlock to all ranks */
#ifdef ESYS_MPI
    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
#endif

    /* distribute the global IDs of the cols to be considered to all ranks */
    len = 0;
    max_num_cols = 0;
    for (i=0; i<size; i++){
        recv_degree[i] = recv_buf[i];
        recv_offset[i] = len;
        len += recv_buf[i];
        if (max_num_cols < recv_buf[i])
            max_num_cols = recv_buf[i];
    }
    recv_offset[size] = len;
    cols_array = TMPMEMALLOC(len, index_t);
#ifdef ESYS_MPI
    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
#endif
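
    /* Note: the two Allgatherv calls above first gather one count per rank
       (recv_degree[i]=1, recv_offset[i]=i), then reuse recv_degree and
       recv_offset as the true per-rank lengths and displacements to gather
       the variable-length global-ID lists into cols_array. */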

    /* first, prepare the ptr_ptr to be received */
    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_ptr = TMPMEMALLOC((len+1) * 2, index_t);
    for (p=0; p<q; p++) {
        row = recv->offsetInShared[p];
        m = recv->offsetInShared[p + 1];
#ifdef ESYS_MPI
        MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
                  A->mpi_info->msg_tag_counter+recv->neighbor[p],
                  A->mpi_info->comm,
                  &(A->col_coupler->mpi_requests[p]));
#endif
    }

    /* now prepare the rows to be sent (the degree, the offset and the data) */
    len = 0;
    i0 = 0;
    for (p=0; p<num_neighbors; p++) {
        i = i0;
        neighbor = send->neighbor[p];
        m_lb = B->col_distribution->first_component[neighbor];
        m_ub = B->col_distribution->first_component[neighbor + 1];
        j_ub = send->offsetInShared[p + 1];
        for (j=send->offsetInShared[p]; j<j_ub; j++) {
            row = send->shared[j];
            l_m = 0;
            l_c = 0;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            k_lb = B->col_coupleBlock->pattern->ptr[row];

            /* check part of col_coupleBlock for data to be passed @row */
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m > offset) break;
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }
            k_lb = k;

            /* check mainBlock for data to be passed @row to sparse
               matrix B_ext_couple */
            k_ub = B->mainBlock->pattern->ptr[row + 1];
            k = B->mainBlock->pattern->ptr[row];
            memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
            for (; k<k_ub; k++) {
                m = B->mainBlock->pattern->index[k] + offset;
                idx_c[l_c] = m;
                l_c++;
            }

            /* check the rest of col_coupleBlock for data to
               be passed @row to sparse matrix B_ext_couple */
            k = k_lb;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }

            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
            send_offset[2*i] = l_m;
            len += l_m;
            memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
            memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
            send_offset[2*i+1] = l_c;
            len += l_c;
            i++;
        }

        /* sending */
#ifdef ESYS_MPI
        MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
                   A->mpi_info->msg_tag_counter+rank,
                   A->mpi_info->comm,
                   &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
#endif
        send_degree[p] = len;
        i0 = i;
    }
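
    /* Note: every shared row above is split into two runs: columns that the
       receiving neighbor owns locally (sent as local indices for
       B_ext_main) and all remaining columns (sent as global IDs for
       B_ext_couple). send_offset stores the two run lengths per row,
       interleaved, which is why ptr_ptr is received with a factor of 2. */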
    TMPMEMFREE(send_m);
    TMPMEMFREE(send_c);
    TMPMEMFREE(idx_m);
    TMPMEMFREE(idx_c);


    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_main = MEMALLOC((len+1), index_t);
    ptr_couple = MEMALLOC((len+1), index_t);

#ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
#endif
    A->mpi_info->msg_tag_counter += size;

    j = 0;
    k = 0;
    ptr_main[0] = 0;
    ptr_couple[0] = 0;
    for (i=0; i<len; i++) {
        j += ptr_ptr[2*i];
        k += ptr_ptr[2*i+1];
        ptr_main[i+1] = j;
        ptr_couple[i+1] = k;
    }
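
    /* Note: the loop above turns the interleaved per-row counts in ptr_ptr
       into two CSR row-pointer arrays by running sums, e.g. main counts
       {2,0,3} become ptr_main = {0,2,2,5}. */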

    TMPMEMFREE(ptr_ptr);
    idx_main = MEMALLOC(j, index_t);
    idx_couple = MEMALLOC(k, index_t);
    ptr_idx = TMPMEMALLOC(j+k, index_t);
    ptr_val = TMPMEMALLOC((j+k) * block_size, double);

    /* send/receive the index array */
    j = 0;
    k_ub = 0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        if (i > 0) {
            k_ub ++;
#ifdef ESYS_MPI
            MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
                      A->mpi_info->msg_tag_counter+recv->neighbor[p],
                      A->mpi_info->comm,
                      &(A->col_coupler->mpi_requests[p]));
#endif
        }
        j += i;
    }

    j = 0;
    k_ub = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
        if (i > 0){
            k_ub ++;
#ifdef ESYS_MPI
            MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
                       A->mpi_info->msg_tag_counter+rank,
                       A->mpi_info->comm,
                       &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
#endif
        }
        j = send_degree[p];
    }

#ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
#endif
    A->mpi_info->msg_tag_counter += size;

    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            idx_main[p] = ptr_idx[m+p];
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            idx_couple[p] = ptr_idx[k+p];
        }
    }
    TMPMEMFREE(ptr_idx);

    /* allocate pattern and sparse matrix for B_ext_main */
    pattern_main = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, num_main_cols, ptr_main, idx_main);
    B->row_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_main, B->row_block_size, B->col_block_size,
                FALSE);
    Paso_Pattern_free(pattern_main);

    /* allocate pattern and sparse matrix for B_ext_couple */
    pattern_couple = Paso_Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, B->col_distribution->first_component[size],
                ptr_couple, idx_couple);
    B->remote_coupleBlock = Paso_SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_couple, B->row_block_size, B->col_block_size,
                FALSE);
    Paso_Pattern_free(pattern_couple);

    /* send/receive the value array */
    j = 0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(ptr_val[j]), i * block_size,
                      MPI_DOUBLE, recv->neighbor[p],
                      A->mpi_info->msg_tag_counter+recv->neighbor[p],
                      A->mpi_info->comm,
                      &(A->col_coupler->mpi_requests[p]));
#endif
        j += (i * block_size);
    }

    j = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
                       A->mpi_info->msg_tag_counter+rank,
                       A->mpi_info->comm,
                       &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
#endif
        j = send_degree[p];
    }

#ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
#endif
    A->mpi_info->msg_tag_counter += size;

    #pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
        }
    }
    TMPMEMFREE(ptr_val);


    /* release all temporary memory allocations */
    TMPMEMFREE(cols);
    TMPMEMFREE(cols_array);
    TMPMEMFREE(recv_offset);
    TMPMEMFREE(recv_degree);
    TMPMEMFREE(recv_buf);
    TMPMEMFREE(send_buf);
    TMPMEMFREE(send_offset);
    TMPMEMFREE(send_degree);
    TMPMEMFREE(send_idx);
}

/* As defined, the sparse matrix (let's call it T) given by T(ptr, idx, val)
   has the same number of rows as P->col_coupleBlock->numCols. Now we need
   to copy blocks of data in T to the neighbor processors, given by
     P->col_coupler->connector->recv->neighbor[k] where k is in
     [0, P->col_coupler->connector->recv->numNeighbors).
   The rows to be copied to neighbor processor k are given by the list
     P->col_coupler->connector->recv->offsetInShared[k] ...
     P->col_coupler->connector->recv->offsetInShared[k+1] */
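/* Note: because these rows travel against the direction of P's coupler,
   the code below deliberately uses connector->recv as its send side and
   connector->send as its receive side. */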
void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
        index_t **p_ptr, index_t **p_idx, double **p_val,
        index_t *global_id, index_t block_size)
{
    Paso_SharedComponents *send=NULL, *recv=NULL;
    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
    index_t i, j, p, m, n, size;
    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
    index_t *ptr=*p_ptr, *idx=*p_idx;
    double *val=*p_val, *recv_val=NULL;
#ifdef ESYS_MPI
    index_t rank = P->mpi_info->rank;
#endif

    size = P->mpi_info->size;
    send = P->col_coupler->connector->recv;
    recv = P->col_coupler->connector->send;
    send_neighbors = send->numNeighbors;
    recv_neighbors = recv->numNeighbors;
    send_rows = P->col_coupleBlock->numCols;
    recv_rows = recv->offsetInShared[recv_neighbors];

    send_degree = TMPMEMALLOC(send_rows, index_t);
    recv_ptr = MEMALLOC(recv_rows + 1, index_t);
    #pragma omp parallel for schedule(static) private(i)
    for (i=0; i<send_rows; i++)
        send_degree[i] = ptr[i+1] - ptr[i];

    /* First, send/receive the degree */
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
                  P->mpi_info->msg_tag_counter + recv->neighbor[p],
                  P->mpi_info->comm,
                  &(P->col_coupler->mpi_requests[p]));
#endif
    }
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
                   P->mpi_info->msg_tag_counter + rank,
                   P->mpi_info->comm,
                   &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    P->mpi_info->msg_tag_counter += size;

    TMPMEMFREE(send_degree);
    m = Paso_Util_cumsum(recv_rows, recv_ptr);
    recv_ptr[recv_rows] = m;
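
    /* Note: Paso_Util_cumsum replaces the received per-row degrees with
       their running sum and returns the total, so recv_ptr becomes a CSR
       row-pointer array, e.g. degrees {2,0,3} -> recv_ptr {0,2,2} with
       m = 5 stored in recv_ptr[recv_rows]. */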
    recv_idx = MEMALLOC(m, index_t);
    recv_val = MEMALLOC(m * block_size, double);

    /* Next, send/receive the index array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
                      P->mpi_info->msg_tag_counter + recv->neighbor[p],
                      P->mpi_info->comm,
                      &(P->col_coupler->mpi_requests[p]));
#endif
        }
        j += i;
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
                       P->mpi_info->msg_tag_counter + rank,
                       P->mpi_info->comm,
                       &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    P->mpi_info->msg_tag_counter += size;

    /* Last, send/receive the data array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
                      P->mpi_info->msg_tag_counter + recv->neighbor[p],
                      P->mpi_info->comm,
                      &(P->col_coupler->mpi_requests[p]));
#endif
        j += (i*block_size);
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
                       P->mpi_info->msg_tag_counter + rank,
                       P->mpi_info->comm,
                       &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i * block_size;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    P->mpi_info->msg_tag_counter += size;

    /* Clean up and return with the received ptr, index and data arrays */
    MEMFREE(ptr);
    MEMFREE(idx);
    MEMFREE(val);
    *p_ptr = recv_ptr;
    *p_idx = recv_idx;
    *p_val = recv_val;
}

Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
        Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
{
    Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
    Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
    Paso_SystemMatrix *out=NULL;
    Paso_SystemMatrixPattern *pattern=NULL;
    Paso_Distribution *input_dist=NULL, *output_dist=NULL;
    Paso_SharedComponents *send=NULL, *recv=NULL;
    Paso_Connector *col_connector=NULL, *row_connector=NULL;
    Paso_Pattern *main_pattern=NULL;
    Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern=NULL;
    const dim_t row_block_size=A->row_block_size;
    const dim_t col_block_size=A->col_block_size;
    const dim_t block_size = A->block_size;
    const dim_t P_block_size = P->block_size;
    const double ZERO = 0.0;
    double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
    double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
    index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
    index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
    index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
    index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
    index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
    index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
    index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
    index_t sum, i, j, k, iptr, irb, icb, ib;
    index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
    index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
    index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
    index_t row_marker_ext, *where_p=NULL;
    index_t **send_ptr=NULL, **send_idx=NULL;
    dim_t p, num_neighbors;
    dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
    Esys_MPI_rank *neighbor=NULL;
#ifdef ESYS_MPI
    MPI_Request* mpi_requests=NULL;
    MPI_Status* mpi_stati=NULL;
#else
    int *mpi_requests=NULL, *mpi_stati=NULL;
#endif

/*  if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
       return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R);*/

    /* two sparse matrices R_main and R_couple will be generated, as the
       transposes of P_main and P_col_couple, respectively. Note that
       R_couple is actually the row_coupleBlock of R (R=P^T) */
    R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
    if (size > 1)
        R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);

    /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
       and needed locally for the triple matrix product RAP.
       To be specific, P_ext (on processor k) is the group of rows in P, where
       the list of rows from processor q is given by
         A->col_coupler->connector->send->shared[rPtr...]
         rPtr=A->col_coupler->connector->send->OffsetInShared[k]
       on q.
       P_ext is represented by the two sparse matrices P_ext_main and
       P_ext_couple */
    Paso_Preconditioner_AMG_extendB(A, P);

    /* count the number of cols in P_ext_couple, resize the pattern of
       sparse matrix P_ext_couple with the new compressed order, and then
       build the col id mapping from P->col_coupleBlock to
       P_ext_couple */
    num_Pmain_cols = P->mainBlock->numCols;
    if (size > 1) {
        num_Pcouple_cols = P->col_coupleBlock->numCols;
        num_Acouple_cols = A->col_coupleBlock->numCols;
        sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
    } else {
        num_Pcouple_cols = 0;
        num_Acouple_cols = 0;
        sum = 0;
    }
    num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
    offset = P->col_distribution->first_component[rank];
    num_Pext_cols = 0;
    if (P->global_id) {
        /* first count the number of cols "num_Pext_cols" in both P_ext_couple
           and P->col_coupleBlock */
        iptr = 0;
        if (num_Pcouple_cols || sum > 0) {
            temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
            #pragma omp parallel for lastprivate(iptr) schedule(static)
            for (iptr=0; iptr<sum; iptr++){
                temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
            }
            for (j=0; j<num_Pcouple_cols; j++, iptr++){
                temp[iptr] = P->global_id[j];
            }
        }
        if (iptr) {
#ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
#else
            qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
#endif
            num_Pext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_Pext_cols++] = i;
                }
            }
        }
        /* resize the pattern of P_ext_couple */
        if (num_Pext_cols) {
            global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
            #pragma omp parallel for private(i) schedule(static)
            for (i=0; i<num_Pext_cols; i++)
                global_id_P[i] = temp[i];
        }
        if (num_Pcouple_cols || sum > 0)
            TMPMEMFREE(temp);
        #pragma omp parallel for private(i, where_p) schedule(static)
        for (i=0; i<sum; i++) {
            where_p = (index_t *)bsearch(
                        &(P->remote_coupleBlock->pattern->index[i]),
                        global_id_P, num_Pext_cols,
                        sizeof(index_t), Paso_comparIndex);
            P->remote_coupleBlock->pattern->index[i] =
                        (index_t)(where_p - global_id_P);
        }

        /* build the mapping */
        if (num_Pcouple_cols) {
            Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
            iptr = 0;
            for (i=0; i<num_Pext_cols; i++)
                if (global_id_P[i] == P->global_id[iptr]) {
                    Pcouple_to_Pext[iptr++] = i;
                    if (iptr == num_Pcouple_cols) break;
                }
        }
    }
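
    /* Note: the block above is a sort/unique/bsearch compression: all global
       column ids seen by P_ext_couple are sorted, duplicates are dropped,
       and every pattern index is then replaced by its position in the
       deduplicated list global_id_P. */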

    /* alloc and initialize the markers */
    sum = num_Pext_cols + num_Pmain_cols;
    P_marker = TMPMEMALLOC(sum, index_t);
    A_marker = TMPMEMALLOC(num_A_cols, index_t);
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
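
    /* Note: the markers implement the usual sparse-accumulator trick for
       sparse matrix products: A_marker[i2]==i_r records that the pair
       RA[i_r,i2] was already visited in the current row, and
       P_marker[i_c] < row_marker tests whether column i_c is touched for
       the first time in row i_r, so no per-row reset of the arrays is
       needed. */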

    /* Now, count the size of RAP_ext. Start with rows in R_couple */
    sum = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++){
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_couple->pattern->index[j1];
            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index in
                           P_ext_couple */
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }
        }
    }
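
    /* Note: this was the symbolic pass; it only sizes RAP_ext. The loop
       further below repeats the identical traversal as a numeric pass,
       this time accumulating the block values into RAP_ext_val. */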

    /* Now we have the number of non-zero elements in RAP_ext, allocate
       RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
    RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
    RAP_ext_idx = MEMALLOC(sum, index_t);
    RAP_ext_val = MEMALLOC(sum * block_size, double);
    RA_val = TMPMEMALLOC(block_size, double);
    RAP_val = TMPMEMALLOC(block_size, double);

    /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
    sum = num_Pext_cols + num_Pmain_cols;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
    sum = 0;
    RAP_ext_ptr[0] = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++){
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_couple->pattern->index[j1];
            R_val = &(R_couple->val[j1*P_block_size]);

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];
                temp_val = &(A->col_coupleBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }
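
                /* Note: blocks are stored column-major, so
                   RA_val[irb+row_block_size*icb] is entry (irb,icb) of the
                   block product R_block*A_block computed just above. */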

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                /* If entry RA[i_r, i2] has been visited, no new RAP entry is
                   created. Only the contributions are added. */
                } else {
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                temp_val = &(A->mainBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                } else {
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }
        }
        RAP_ext_ptr[i_r+1] = sum;
    }
    TMPMEMFREE(P_marker);
    TMPMEMFREE(Pcouple_to_Pext);

    /* Now we have the part of RAP[r,c] where row "r" is the list of rows
       given by the column list of P->col_coupleBlock, and column "c" is
       the list of columns which possibly covers the whole column range of
       the system matrix P. This part of the data is to be passed to the
       neighboring processors, and added to the corresponding RAP[r,c]
       entries in those processors */
    Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
                &RAP_ext_val, global_id_P, block_size);

    num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
    sum = RAP_ext_ptr[num_RAPext_rows];
    num_RAPext_cols = 0;
    if (num_Pext_cols || sum > 0) {
        temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
        j1_ub = offset + num_Pmain_cols;
        for (i=0, iptr=0; i<sum; i++) {
            if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
                temp[iptr++] = RAP_ext_idx[i];                      /* XXX */
        }
        for (j=0; j<num_Pext_cols; j++, iptr++){
            temp[iptr] = global_id_P[j];
        }

        if (iptr) {
#ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
#else
            qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
#endif
            num_RAPext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_RAPext_cols++] = i;
                }
            }
        }
    }

    /* map the global column indices of RAP_ext into the new compressed order */
    if (num_RAPext_cols) {
        global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
        #pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_RAPext_cols; i++)
            global_id_RAP[i] = temp[i];
    }
    if (num_Pext_cols || sum > 0)
        TMPMEMFREE(temp);
    j1_ub = offset + num_Pmain_cols;
    #pragma omp parallel for private(i, where_p) schedule(static)
    for (i=0; i<sum; i++) {
        if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
            where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
                        /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
            RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
        } else
            RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
    }

    /* build the mapping */
    if (num_Pcouple_cols) {
        Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == P->global_id[iptr]) {
                Pcouple_to_RAP[iptr++] = i;
                if (iptr == num_Pcouple_cols) break;
            }
    }

    if (num_Pext_cols) {
        Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == global_id_P[iptr]) {
                Pext_to_RAP[iptr++] = i;
                if (iptr == num_Pext_cols) break;
            }
    }

    if (global_id_P){
        TMPMEMFREE(global_id_P);
        global_id_P = NULL;
    }

    /* alloc and initialize the markers */
    sum = num_RAPext_cols + num_Pmain_cols;
    P_marker = TMPMEMALLOC(sum, index_t);
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;

    /* Now, count the size of RAP. Start with rows in R_main */
    num_neighbors = P->col_coupler->connector->send->numNeighbors;
    offsetInShared = P->col_coupler->connector->send->offsetInShared;
    shared = P->col_coupler->connector->send->shared;
    i = 0;
    j = 0;
    for (i_r=0; i_r<num_Pmain_cols; i_r++){
        /* Mark the start of row for both main block and couple block */
        row_marker = i;
        row_marker_ext = j;

        /* Mark the diagonal element RAP[i_r, i_r], and other elements
           in RAP_ext */
        P_marker[i_r] = i;
        i++;
        for (j1=0; j1<num_neighbors; j1++) {
            for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
                if (shared[j2] == i_r) {
                    for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
                        i_c = RAP_ext_idx[k];
                        if (i_c < num_Pmain_cols) {
                            if (P_marker[i_c] < row_marker) {
                                P_marker[i_c] = i;
                                i++;
                            }
                        } else {
                            if (P_marker[i_c] < row_marker_ext) {
                                P_marker[i_c] = j;
                                j++;
                            }
                        }
                    }
                    break;
                }
            }
        }

        /* then, loop over elements in row i_r of R_main */
        j1_ub = R_main->pattern->ptr[i_r+1];
        for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_main->pattern->index[j1];

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            i++;
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            j++;
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            i++;
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index in
                           P_ext_couple */
                        i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            j++;
                        }
                    }
                }
            }
        }
    }
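
    /* Note: at this point i is the number of non-zeros counted for RAP_main
       (columns owned locally) and j the number for RAP_couple (columns
       living on other ranks); the allocations below use exactly these
       sizes. */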

    /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
       Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
       and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
       RAP_couple */
    RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
    RAP_main_idx = MEMALLOC(i, index_t);
    RAP_main_val = TMPMEMALLOC(i * block_size, double);
    RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
    RAP_couple_idx = MEMALLOC(j, index_t);
    RAP_couple_val = TMPMEMALLOC(j * block_size, double);

    RAP_main_ptr[num_Pmain_cols] = i;
    RAP_couple_ptr[num_Pmain_cols] = j;

    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
    #pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;

    /* Now, fill in the data for RAP_main and RAP_couple. Start with rows
       in R_main */
    i = 0;
    j = 0;
    for (i_r=0; i_r<num_Pmain_cols; i_r++){
        /* Mark the start of row for both main block and couple block */
        row_marker = i;
        row_marker_ext = j;
        RAP_main_ptr[i_r] = row_marker;
        RAP_couple_ptr[i_r] = row_marker_ext;

        /* Mark and set up the diagonal element RAP[i_r, i_r], and elements
           in row i_r of RAP_ext */
        P_marker[i_r] = i;
        for (ib=0; ib<block_size; ib++)
            RAP_main_val[i*block_size+ib] = ZERO;
        RAP_main_idx[i] = i_r;
        i++;

        for (j1=0; j1<num_neighbors; j1++) {
            for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
                if (shared[j2] == i_r) {
                    for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
                        i_c = RAP_ext_idx[k];
                        if (i_c < num_Pmain_cols) {
                            if (P_marker[i_c] < row_marker) {
                                P_marker[i_c] = i;
                                memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
                                RAP_main_idx[i] = i_c;
                                i++;
                            } else {
                                t1_val = &(RAP_ext_val[k*block_size]);
                                t2_val = &(RAP_main_val[P_marker[i_c]*block_size]);
                                for (ib=0; ib<block_size; ib++)
                                    t2_val[ib] += t1_val[ib];
                            }
                        } else {
                            if (P_marker[i_c] < row_marker_ext) {
                                P_marker[i_c] = j;
                                memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
                                RAP_couple_idx[j] = i_c - num_Pmain_cols;
                                j++;
                            } else {
                                t1_val = &(RAP_ext_val[k*block_size]);
                                t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
                                for (ib=0; ib<block_size; ib++)
                                    t2_val[ib] += t1_val[ib];
                            }
                        }
                    }
                    break;
                }
            }
        }

        /* then, loop over elements in row i_r of R_main */
        j1_ub = R_main->pattern->ptr[i_r+1];
        for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_main->pattern->index[j1];
            R_val = &(R_main->val[j1*P_block_size]);

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];
                temp_val = &(A->col_coupleBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_main_idx[i] = i_c;
                            i++;
                        } else {
                            temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_couple_idx[j] = i_c - num_Pmain_cols;
                            j++;
                        } else {
                            temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                /* If entry RA[i_r, i2] has been visited, no new RAP entry is
                   created. Only the contributions are added. */
                } else {
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done for block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                temp_val = &(A->mainBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = i;
                            memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_main_idx[i] = i_c;
                            i++;
                        } else {
                            temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index in
                           P_ext_couple */
                        i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker_ext) {
                            P_marker[i_c] = j;
                            memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_couple_idx[j] = i_c - num_Pmain_cols;
                            j++;
                        } else {
                            temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                } else {
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }
        }
1606
1607 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
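             /* Note: the entries appended for the current row since row_marker
                are in discovery order, not ascending column order, while the
                Paso pattern expects sorted column indices. The two blocks below
                sort the new indices and permute the corresponding value blocks,
                updating P_marker to the entries' final positions. */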
1608 if (i > row_marker) {
1609 offset = i - row_marker;
1610 temp = TMPMEMALLOC(offset, index_t);
1611 #pragma omp parallel for schedule(static) private(iptr)
1612 for (iptr=0; iptr<offset; iptr++)
1613 temp[iptr] = RAP_main_idx[row_marker+iptr];
1614 if (offset > 0) {
1615 #ifdef USE_QSORTG
1616 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1617 #else
1618 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1619 #endif
1620 }
1621 temp_val = TMPMEMALLOC(offset * block_size, double);
1622 #pragma omp parallel for schedule(static) private(iptr,k)
1623 for (iptr=0; iptr<offset; iptr++){
1624 k = P_marker[temp[iptr]];
1625 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
1626 P_marker[temp[iptr]] = iptr + row_marker;
1627 }
1628 #pragma omp parallel for schedule(static) private(iptr)
1629 for (iptr=0; iptr<offset; iptr++){
1630 RAP_main_idx[row_marker+iptr] = temp[iptr];
1631 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1632 }
1633 TMPMEMFREE(temp);
1634 TMPMEMFREE(temp_val);
1635 }
1636 if (j > row_marker_ext) {
1637 offset = j - row_marker_ext;
1638 temp = TMPMEMALLOC(offset, index_t);
1639 #pragma omp parallel for schedule(static) private(iptr)
1640 for (iptr=0; iptr<offset; iptr++)
1641 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
1642 if (offset > 0) {
1643 #ifdef USE_QSORTG
1644 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1645 #else
1646 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
1647 #endif
1648 }
1649 temp_val = TMPMEMALLOC(offset * block_size, double);
1650 #pragma omp parallel for schedule(static) private(iptr, k)
1651 for (iptr=0; iptr<offset; iptr++){
1652 k = P_marker[temp[iptr] + num_Pmain_cols];
1653 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
1654 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
1655 }
1656 #pragma omp parallel for schedule(static) private(iptr)
1657 for (iptr=0; iptr<offset; iptr++){
1658 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
1659 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
1660 }
1661 TMPMEMFREE(temp);
1662 TMPMEMFREE(temp_val);
1663 }
1664 }
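     /* At this point the local rows of RAP are complete: RAP_main_{ptr,idx,val}
        hold the mainBlock part and RAP_couple_{ptr,idx,val} the couple part,
        with couple columns numbered by their position in global_id_RAP. */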
1665
1666 TMPMEMFREE(RA_val);
1667 TMPMEMFREE(RAP_val);
1668 TMPMEMFREE(A_marker);
1669 TMPMEMFREE(Pext_to_RAP);
1670 TMPMEMFREE(Pcouple_to_RAP);
1671 MEMFREE(RAP_ext_ptr);
1672 MEMFREE(RAP_ext_idx);
1673 MEMFREE(RAP_ext_val);
1674 Paso_SparseMatrix_free(R_main);
1675 Paso_SparseMatrix_free(R_couple);
1676
1677 /* Check whether there are empty columns in RAP_couple */
1678 #pragma omp parallel for schedule(static) private(i)
1679 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
1680 /* num of non-empty columns is stored in "k" */
1681 k = 0;
1682 j = RAP_couple_ptr[num_Pmain_cols];
1683 for (i=0; i<j; i++) {
1684 i1 = RAP_couple_idx[i];
1685 if (P_marker[i1]) {
1686 P_marker[i1] = 0;
1687 k++;
1688 }
1689 }
1690
1691 /* empty columns were found; compress them out */
1692 if (k < num_RAPext_cols) {
1693 temp = TMPMEMALLOC(k, index_t);
1694 k = 0;
1695 for (i=0; i<num_RAPext_cols; i++)
1696 if (!P_marker[i]) {
1697 P_marker[i] = k;
1698 temp[k] = global_id_RAP[i];
1699 k++;
1700 }
1701 #pragma omp parallel for schedule(static) private(i, i1)
1702 for (i=0; i<j; i++) {
1703 i1 = RAP_couple_idx[i];
1704 RAP_couple_idx[i] = P_marker[i1];
1705 }
1706 num_RAPext_cols = k;
1707 TMPMEMFREE(global_id_RAP);
1708 global_id_RAP = temp;
1709 }
1710 TMPMEMFREE(P_marker);
1711
1712 /******************************************************/
1713 /* Start to create the coarse level system matrix A_c */
1714 /******************************************************/
1715 /* first, prepare the sender/receiver for the col_connector */
1716 dist = P->pattern->input_distribution->first_component;
1717 recv_len = TMPMEMALLOC(size, dim_t);
1718 send_len = TMPMEMALLOC(size, dim_t);
1719 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
1720 offsetInShared = TMPMEMALLOC(size+1, index_t);
1721 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
1722 memset(recv_len, 0, sizeof(dim_t) * size);
1723 memset(send_len, 0, sizeof(dim_t) * size);
1724 num_neighbors = 0;
1725 offsetInShared[0] = 0;
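     /* global_id_RAP is sorted in ascending order, so one sweep over the
        ownership ranges dist[j] .. dist[j+1]-1 assigns every external
        column to the rank j that owns it; recv_len[j] counts the columns
        to be received from rank j. */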
1726 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
1727 shared[i] = i + num_Pmain_cols;
1728 if (k <= global_id_RAP[i]) {
1729 if (recv_len[j] > 0) {
1730 neighbor[num_neighbors] = j;
1731 num_neighbors ++;
1732 offsetInShared[num_neighbors] = i;
1733 }
1734 while (k <= global_id_RAP[i]) {
1735 j++;
1736 k = dist[j+1];
1737 }
1738 }
1739 recv_len[j] ++;
1740 }
1741 if (recv_len[j] > 0) {
1742 neighbor[num_neighbors] = j;
1743 num_neighbors ++;
1744 offsetInShared[num_neighbors] = i;
1745 }
1746 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1747 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1748
1749 #ifdef ESYS_MPI
1750 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
1751 #endif
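     /* after the all-to-all, send_len[p] is the number of coarse columns
        that rank p will request from this rank (the mirror of recv_len). */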
1752
1753 #ifdef ESYS_MPI
1754 mpi_requests=TMPMEMALLOC(size*2,MPI_Request);
1755 mpi_stati=TMPMEMALLOC(size*2,MPI_Status);
1756 #else
1757 mpi_requests=TMPMEMALLOC(size*2,int);
1758 mpi_stati=TMPMEMALLOC(size*2,int);
1759 #endif
1760 num_neighbors = 0;
1761 j = 0;
1762 offsetInShared[0] = 0;
1763 for (i=0; i<size; i++) {
1764 if (send_len[i] > 0) {
1765 neighbor[num_neighbors] = i;
1766 num_neighbors ++;
1767 j += send_len[i];
1768 offsetInShared[num_neighbors] = j;
1769 }
1770 }
1771 TMPMEMFREE(shared);
1772 shared = TMPMEMALLOC(j, index_t);
1773 for (i=0, j=0; i<num_neighbors; i++) {
1774 k = neighbor[i];
1775 #ifdef ESYS_MPI
1776 MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k,
1777 mpi_info->msg_tag_counter+k,
1778 mpi_info->comm, &mpi_requests[i]);
1779 #endif
1780 j += send_len[k];
1781 }
1782 for (i=0, j=0; i<recv->numNeighbors; i++) {
1783 k = recv->neighbor[i];
1784 #ifdef ESYS_MPI
1785 MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
1786 mpi_info->msg_tag_counter+rank,
1787 mpi_info->comm, &mpi_requests[i+num_neighbors]);
1788 #endif
1789 j += recv_len[k];
1790 }
1791 #ifdef ESYS_MPI
1792 MPI_Waitall(num_neighbors + recv->numNeighbors,
1793 mpi_requests, mpi_stati);
1794 #endif
1795 mpi_info->msg_tag_counter += size;
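     /* every rank has now told the owning ranks which global columns it
        needs; shared[] holds those requests in global numbering and is
        shifted to local indices below. */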
1796
1797 j = offsetInShared[num_neighbors];
1798 offset = dist[rank];
1799 #pragma omp parallel for schedule(static) private(i)
1800 for (i=0; i<j; i++) shared[i] = shared[i] - offset;
1801 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1802 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1803
1804 col_connector = Paso_Connector_alloc(send, recv);
1805 TMPMEMFREE(shared);
1806 Paso_SharedComponents_free(recv);
1807 Paso_SharedComponents_free(send);
1808
1809 /* now, create the row distribution (output_distribution) and the col
1810 distribution (input_distribution) */
1811 input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1812 output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
1813
1814 /* then, prepare the sender/receiver for the row_connector; first, prepare
1815 the information for the sender */
1816 sum = RAP_couple_ptr[num_Pmain_cols];
1817 len = TMPMEMALLOC(size, dim_t);
1818 send_ptr = TMPMEMALLOC(size, index_t*);
1819 send_idx = TMPMEMALLOC(size, index_t*);
1820 #pragma omp parallel for schedule(static) private(i)
1821 for (i=0; i<size; i++) {
1822 send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
1823 send_idx[i] = TMPMEMALLOC(sum, index_t);
1824 memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
1825 }
1826 memset(len, 0, sizeof(dim_t) * size);
1827 recv = col_connector->recv;
1828 sum=0;
1829 for (i_r=0; i_r<num_Pmain_cols; i_r++) {
1830 i1 = RAP_couple_ptr[i_r];
1831 i2 = RAP_couple_ptr[i_r+1];
1832 if (i2 > i1) {
1833 /* then row i_r will be in the sender of row_connector, now check
1834 how many neighbors i_r needs to be sent to */
1835 for (j=i1; j<i2; j++) {
1836 i_c = RAP_couple_idx[j];
1837 /* find out the corresponding neighbor "p" of column i_c */
1838 for (p=0; p<recv->numNeighbors; p++) {
1839 if (i_c < recv->offsetInShared[p+1]) {
1840 k = recv->neighbor[p];
1841 if (send_ptr[k][i_r] == 0) sum++;
1842 send_ptr[k][i_r] ++;
1843 send_idx[k][len[k]] = global_id_RAP[i_c];
1844 len[k] ++;
1845 break;
1846 }
1847 }
1848 }
1849 }
1850 }
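     /* sum now counts the distinct (destination rank, row) pairs, i.e. the
        length of the sender's shared list; len[k] is the number of column
        indices queued for rank k. */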
1851 if (global_id_RAP) {
1852 TMPMEMFREE(global_id_RAP);
1853 global_id_RAP = NULL;
1854 }
1855
1856 /* now allocate the sender */
1857 shared = TMPMEMALLOC(sum, index_t);
1858 memset(send_len, 0, sizeof(dim_t) * size);
1859 num_neighbors=0;
1860 offsetInShared[0] = 0;
1861 for (p=0, k=0; p<size; p++) {
1862 for (i=0; i<num_Pmain_cols; i++) {
1863 if (send_ptr[p][i] > 0) {
1864 shared[k] = i;
1865 k++;
1866 send_ptr[p][send_len[p]] = send_ptr[p][i];
1867 send_len[p]++;
1868 }
1869 }
1870 if (k > offsetInShared[num_neighbors]) {
1871 neighbor[num_neighbors] = p;
1872 num_neighbors ++;
1873 offsetInShared[num_neighbors] = k;
1874 }
1875 }
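     /* send_ptr[p] has been compacted: its first send_len[p] entries give
        the per-row entry counts for the rows listed in shared, in order. */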
1876 send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1877 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1878
1879 /* send/recv the number of rows to be sent from the current proc, i.e.
1880 recover the info for the receiver of row_connector from the sender */
1881 #ifdef ESYS_MPI
1882 MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
1883 #endif
1884 num_neighbors = 0;
1885 offsetInShared[0] = 0;
1886 j = 0;
1887 for (i=0; i<size; i++) {
1888 if (i != rank && recv_len[i] > 0) {
1889 neighbor[num_neighbors] = i;
1890 num_neighbors ++;
1891 j += recv_len[i];
1892 offsetInShared[num_neighbors] = j;
1893 }
1894 }
1895 TMPMEMFREE(shared);
1896 TMPMEMFREE(recv_len);
1897 shared = TMPMEMALLOC(j, index_t);
1898 k = offsetInShared[num_neighbors];
1899 #pragma omp parallel for schedule(static) private(i)
1900 for (i=0; i<k; i++) {
1901 shared[i] = i + num_Pmain_cols;
1902 }
1903 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
1904 neighbor, shared, offsetInShared, 1, 0, mpi_info);
1905 row_connector = Paso_Connector_alloc(send, recv);
1906 TMPMEMFREE(shared);
1907 Paso_SharedComponents_free(recv);
1908 Paso_SharedComponents_free(send);
1909
1910 /* send/recv pattern->ptr for rowCoupleBlock */
1911 num_RAPext_rows = offsetInShared[num_neighbors];
1912 row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
1913 for (p=0; p<num_neighbors; p++) {
1914 j = offsetInShared[p];
1915 i = offsetInShared[p+1];
1916 #ifdef ESYS_MPI
1917 MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
1918 mpi_info->msg_tag_counter+neighbor[p],
1919 mpi_info->comm, &mpi_requests[p]);
1920 #endif
1921 }
1922 send = row_connector->send;
1923 for (p=0; p<send->numNeighbors; p++) {
1924 #ifdef ESYS_MPI
1925 MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
1926 MPI_INT, send->neighbor[p],
1927 mpi_info->msg_tag_counter+rank,
1928 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1929 #endif
1930 }
1931 #ifdef ESYS_MPI
1932 MPI_Waitall(num_neighbors + send->numNeighbors,
1933 mpi_requests, mpi_stati);
1934 #endif
1935 mpi_info->msg_tag_counter += size;
1936 TMPMEMFREE(send_len);
1937
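     /* turn the received per-row counts into CSR-style row pointers by an
        exclusive prefix sum */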
1938 sum = 0;
1939 for (i=0; i<num_RAPext_rows; i++) {
1940 k = row_couple_ptr[i];
1941 row_couple_ptr[i] = sum;
1942 sum += k;
1943 }
1944 row_couple_ptr[num_RAPext_rows] = sum;
1945
1946 /* send/recv pattern->index for rowCoupleBlock */
1947 k = row_couple_ptr[num_RAPext_rows];
1948 row_couple_idx = MEMALLOC(k, index_t);
1949 for (p=0; p<num_neighbors; p++) {
1950 j1 = row_couple_ptr[offsetInShared[p]];
1951 j2 = row_couple_ptr[offsetInShared[p+1]];
1952 #ifdef ESYS_MPI
1953 MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
1954 mpi_info->msg_tag_counter+neighbor[p],
1955 mpi_info->comm, &mpi_requests[p]);
1956 #endif
1957 }
1958 for (p=0; p<send->numNeighbors; p++) {
1959 #ifdef ESYS_MPI
1960 MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
1961 MPI_INT, send->neighbor[p],
1962 mpi_info->msg_tag_counter+rank,
1963 mpi_info->comm, &mpi_requests[p+num_neighbors]);
1964 #endif
1965 }
1966 #ifdef ESYS_MPI
1967 MPI_Waitall(num_neighbors + send->numNeighbors,
1968 mpi_requests, mpi_stati);
1969 #endif
1970 mpi_info->msg_tag_counter += size;
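     /* the received column indices are global; subtract the first column
        owned by this rank to make them local to the coarse mainBlock */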
1971
1972 offset = input_dist->first_component[rank];
1973 k = row_couple_ptr[num_RAPext_rows];
1974 #pragma omp parallel for schedule(static) private(i)
1975 for (i=0; i<k; i++) {
1976 row_couple_idx[i] -= offset;
1977 }
1978 #pragma omp parallel for schedule(static) private(i)
1979 for (i=0; i<size; i++) {
1980 TMPMEMFREE(send_ptr[i]);
1981 TMPMEMFREE(send_idx[i]);
1982 }
1983 TMPMEMFREE(send_ptr);
1984 TMPMEMFREE(send_idx);
1985 TMPMEMFREE(len);
1986 TMPMEMFREE(offsetInShared);
1987 TMPMEMFREE(neighbor);
1988 TMPMEMFREE(mpi_requests);
1989 TMPMEMFREE(mpi_stati);
1990 Esys_MPIInfo_free(mpi_info);
1991
1992 /* Now, we can create pattern for mainBlock and coupleBlock */
1993 main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
1994 num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
1995 col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1996 num_Pmain_cols, num_RAPext_cols,
1997 RAP_couple_ptr, RAP_couple_idx);
1998 row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
1999 num_RAPext_rows, num_Pmain_cols,
2000 row_couple_ptr, row_couple_idx);
2001
2002 /* next, create the system matrix */
2003 pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
2004 output_dist, input_dist, main_pattern, col_couple_pattern,
2005 row_couple_pattern, col_connector, row_connector);
2006 out = Paso_SystemMatrix_alloc(A->type, pattern,
2007 row_block_size, col_block_size, FALSE);
2008
2009 /* finally, fill in the data*/
2010 memcpy(out->mainBlock->val, RAP_main_val,
2011 out->mainBlock->len* sizeof(double));
2012 memcpy(out->col_coupleBlock->val, RAP_couple_val,
2013 out->col_coupleBlock->len * sizeof(double));
2014
2015 /* Clean up */
2016 Paso_SystemMatrixPattern_free(pattern);
2017 Paso_Pattern_free(main_pattern);
2018 Paso_Pattern_free(col_couple_pattern);
2019 Paso_Pattern_free(row_couple_pattern);
2020 Paso_Connector_free(col_connector);
2021 Paso_Connector_free(row_connector);
2022 Paso_Distribution_free(output_dist);
2023 Paso_Distribution_free(input_dist);
2024 TMPMEMFREE(RAP_main_val);
2025 TMPMEMFREE(RAP_couple_val);
2026 if (Esys_noError()) {
2027 return out;
2028 } else {
2029 Paso_SystemMatrix_free(out);
2030 return NULL;
2031 }
2032 }
2033
2034
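 /* Block variant of the coarse-operator construction: the same RAP scheme
    as above, but matrix entries are dense row_block_size x col_block_size
    blocks, so every scalar product becomes a small block-block product.
    R_main and R_couple are obtained as transposes of P's blocks below. */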
2035 Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(
2036 Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
2037 {
2038 Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
2039 Paso_SparseMatrix *R_main=NULL, *R_couple=NULL;
2040 Paso_SystemMatrix *out=NULL;
2041 Paso_SystemMatrixPattern *pattern=NULL;
2042 Paso_Distribution *input_dist=NULL, *output_dist=NULL;
2043 Paso_SharedComponents *send =NULL, *recv=NULL;
2044 Paso_Connector *col_connector=NULL, *row_connector=NULL;
2045 Paso_Pattern *main_pattern=NULL;
2046 Paso_Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL;
2047 const dim_t row_block_size=A->row_block_size;
2048 const dim_t col_block_size=A->col_block_size;
2049 const dim_t block_size = A->block_size;
2050 const double ZERO = 0.0;
2051 double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
2052 double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL;
2053 index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
2054 index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
2055 index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
2056 index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
2057 index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
2058 index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
2059 index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
2060 index_t sum, i, j, k, iptr, irb, icb, ib;
2061 index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
2062 index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
2063 index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
2064 index_t row_marker_ext, *where_p=NULL;
2065 index_t **send_ptr=NULL, **send_idx=NULL;
2066 dim_t p, num_neighbors;
2067 dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
2068 Esys_MPI_rank *neighbor=NULL;
2069 #ifdef ESYS_MPI
2070 MPI_Request* mpi_requests=NULL;
2071 MPI_Status* mpi_stati=NULL;
2072 #else
2073 int *mpi_requests=NULL, *mpi_stati=NULL;
2074 #endif
2075
2076 /* two sparse matrices R_main and R_couple will be generated as the
2077 transposes of P_main and P_col_couple, respectively. Note that
2078 R_couple is actually the row_coupleBlock of R (R=P^T) */
2079 R_main = Paso_SparseMatrix_getTranspose(P->mainBlock);
2080 R_couple = Paso_SparseMatrix_getTranspose(P->col_coupleBlock);
2081
2082 /* generate P_ext, i.e. the portion of P that is stored on neighbor procs
2083 and needed locally for triple matrix product RAP
2084 to be specific, P_ext (on processor k) are group of rows in P, where
2085 the list of rows from processor q is given by
2086 A->col_coupler->connector->send->shared[rPtr...]
2087 rPtr=A->col_coupler->connector->send->OffsetInShared[k]
2088 on q.
2089 P_ext is represented by two sparse matrixes P_ext_main and
2090 P_ext_couple */
2091 Paso_Preconditioner_AMG_extendB(A, P);
2092
2093 /* count the number of cols in P_ext_couple, resize the pattern of
2094 sparse matrix P_ext_couple with new compressed order, and then
2095 build the col id mapping from P->col_coupleBlock to
2096 P_ext_couple */
2097 num_Pmain_cols = P->mainBlock->numCols;
2098 num_Pcouple_cols = P->col_coupleBlock->numCols;
2099 num_Acouple_cols = A->col_coupleBlock->numCols;
2100 num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
2101 sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
2102 offset = P->col_distribution->first_component[rank];
2103 num_Pext_cols = 0;
2104 if (P->global_id) {
2105 /* first count the number of cols "num_Pext_cols" in both P_ext_couple
2106 and P->col_coupleBlock */
2107 iptr = 0;
2108 if (num_Pcouple_cols || sum > 0) {
2109 temp = TMPMEMALLOC(num_Pcouple_cols+sum, index_t);
2110 for (; iptr<sum; iptr++){
2111 temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
2112 }
2113 for (j=0; j<num_Pcouple_cols; j++, iptr++){
2114 temp[iptr] = P->global_id[j];
2115 }
2116 }
2117 if (iptr) {
2118 #ifdef USE_QSORTG
2119 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2120 #else
2121 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2122 #endif
2123 num_Pext_cols = 1;
2124 i = temp[0];
2125 for (j=1; j<iptr; j++) {
2126 if (temp[j] > i) {
2127 i = temp[j];
2128 temp[num_Pext_cols++] = i;
2129 }
2130 }
2131 }
2132
2133 /* resize the pattern of P_ext_couple */
2134 if(num_Pext_cols){
2135 global_id_P = TMPMEMALLOC(num_Pext_cols, index_t);
2136 for (i=0; i<num_Pext_cols; i++)
2137 global_id_P[i] = temp[i];
2138 }
2139 if (num_Pcouple_cols || sum > 0)
2140 TMPMEMFREE(temp);
2141 for (i=0; i<sum; i++) {
2142 where_p = (index_t *)bsearch(
2143 &(P->remote_coupleBlock->pattern->index[i]),
2144 global_id_P, num_Pext_cols,
2145 sizeof(index_t), Paso_comparIndex);
2146 P->remote_coupleBlock->pattern->index[i] =
2147 (index_t)(where_p -global_id_P);
2148 }
2149
2150 /* build the mapping */
2151 if (num_Pcouple_cols) {
2152 Pcouple_to_Pext = TMPMEMALLOC(num_Pcouple_cols, index_t);
2153 iptr = 0;
2154 for (i=0; i<num_Pext_cols; i++)
2155 if (global_id_P[i] == P->global_id[iptr]) {
2156 Pcouple_to_Pext[iptr++] = i;
2157 if (iptr == num_Pcouple_cols) break;
2158 }
2159 }
2160 }
2161
2162 /* alloc and initialize the markers */
2163 sum = num_Pext_cols + num_Pmain_cols;
2164 P_marker = TMPMEMALLOC(sum, index_t);
2165 A_marker = TMPMEMALLOC(num_A_cols, index_t);
2166 #pragma omp parallel for private(i) schedule(static)
2167 for (i=0; i<sum; i++) P_marker[i] = -1;
2168 #pragma omp parallel for private(i) schedule(static)
2169 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
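 /* marker convention used throughout: A_marker[i2] == i_r means the
    product RA[i_r,i2] has already been formed for the current row i_r;
    P_marker[i_c] >= row_marker means RAP[i_r,i_c] already has a slot
    in the current row. */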
2170
2171 /* Now, count the size of RAP_ext. Start with rows in R_couple */
2172 sum = 0;
2173 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2174 row_marker = sum;
2175 /* then, loop over elements in row i_r of R_couple */
2176 j1_ub = R_couple->pattern->ptr[i_r+1];
2177 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2178 i1 = R_couple->pattern->index[j1];
2179 /* then, loop over elements in row i1 of A->col_coupleBlock */
2180 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2181 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2182 i2 = A->col_coupleBlock->pattern->index[j2];
2183
2184 /* check whether entry RA[i_r, i2] has been previously visited.
2185 a new RAP entry is possible only if entry RA[i_r, i2] has not
2186 been visited yet */
2187 if (A_marker[i2] != i_r) {
2188 /* first, mark entry RA[i_r,i2] as visited */
2189 A_marker[i2] = i_r;
2190
2191 /* then loop over elements in row i2 of P_ext_main */
2192 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2193 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2194 i_c = P->row_coupleBlock->pattern->index[j3];
2195
2196 /* check whether entry RAP[i_r,i_c] has been created.
2197 If not yet created, create the entry and increase
2198 the total number of elements in RAP_ext */
2199 if (P_marker[i_c] < row_marker) {
2200 P_marker[i_c] = sum;
2201 sum++;
2202 }
2203 }
2204
2205 /* loop over elements in row i2 of P_ext_couple, do the same */
2206 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2207 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2208 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2209 if (P_marker[i_c] < row_marker) {
2210 P_marker[i_c] = sum;
2211 sum++;
2212 }
2213 }
2214 }
2215 }
2216
2217 /* now loop over elements in row i1 of A->mainBlock, repeat
2218 the process applied above to block A->col_coupleBlock */
2219 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2220 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2221 i2 = A->mainBlock->pattern->index[j2];
2222 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2223 A_marker[i2 + num_Acouple_cols] = i_r;
2224 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2225 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2226 i_c = P->mainBlock->pattern->index[j3];
2227 if (P_marker[i_c] < row_marker) {
2228 P_marker[i_c] = sum;
2229 sum++;
2230 }
2231 }
2232 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2233 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2234 /* note that we need to map the column index in
2235 P->col_coupleBlock back into the column index in
2236 P_ext_couple */
2237 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2238 if (P_marker[i_c] < row_marker) {
2239 P_marker[i_c] = sum;
2240 sum++;
2241 }
2242 }
2243 }
2244 }
2245 }
2246 }
2247
2248 /* Now we have the number of non-zero elements in RAP_ext, allocate
2249 RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2250 RAP_ext_ptr = MEMALLOC(num_Pcouple_cols+1, index_t);
2251 RAP_ext_idx = MEMALLOC(sum, index_t);
2252 RAP_ext_val = MEMALLOC(sum * block_size, double);
2253 RA_val = TMPMEMALLOC(block_size, double);
2254 RAP_val = TMPMEMALLOC(block_size, double);
2255
2256 /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
2257 sum = num_Pext_cols + num_Pmain_cols;
2258 #pragma omp parallel for private(i) schedule(static)
2259 for (i=0; i<sum; i++) P_marker[i] = -1;
2260 #pragma omp parallel for private(i) schedule(static)
2261 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2262 sum = 0;
2263 RAP_ext_ptr[0] = 0;
2264 for (i_r=0; i_r<num_Pcouple_cols; i_r++){
2265 row_marker = sum;
2266 /* then, loop over elements in row i_r of R_couple */
2267 j1_ub = R_couple->pattern->ptr[i_r+1];
2268 for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
2269 i1 = R_couple->pattern->index[j1];
2270 R_val = &(R_couple->val[j1*block_size]);
2271
2272 /* then, loop over elements in row i1 of A->col_coupleBlock */
2273 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2274 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2275 i2 = A->col_coupleBlock->pattern->index[j2];
2276 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2277 for (irb=0; irb<row_block_size; irb++) {
2278 for (icb=0; icb<col_block_size; icb++) {
2279 rtmp = ZERO;
2280 for (ib=0; ib<col_block_size; ib++) {
2281 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2282 }
2283 RA_val[irb+row_block_size*icb]=rtmp;
2284 }
2285 }
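             /* RA_val now holds the dense block product R[i_r,i1]*A[i1,i2];
                blocks are stored column-major, entry (irb,icb) at
                irb + row_block_size*icb */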
2286
2287 /* check whether entry RA[i_r, i2] has been previously visited.
2288 a new RAP entry is possible only if entry RA[i_r, i2] has not
2289 been visited yet */
2290 if (A_marker[i2] != i_r) {
2291 /* first, mark entry RA[i_r,i2] as visited */
2292 A_marker[i2] = i_r;
2293
2294 /* then loop over elements in row i2 of P_ext_main */
2295 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2296 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2297 i_c = P->row_coupleBlock->pattern->index[j3];
2298 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2299 for (irb=0; irb<row_block_size; irb++) {
2300 for (icb=0; icb<col_block_size; icb++) {
2301 rtmp = ZERO;
2302 for (ib=0; ib<col_block_size; ib++) {
2303 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2304 }
2305 RAP_val[irb+row_block_size*icb]=rtmp;
2306 }
2307 }
2308
2309
2310 /* check whether entry RAP[i_r,i_c] has been created.
2311 If not yet created, create the entry and increase
2312 the total number of elements in RAP_ext */
2313 if (P_marker[i_c] < row_marker) {
2314 P_marker[i_c] = sum;
2315 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2316 RAP_ext_idx[sum] = i_c + offset;
2317 sum++;
2318 } else {
2319 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2320 for (ib=0; ib<block_size; ib++) {
2321 temp_val[ib] += RAP_val[ib];
2322 }
2323 }
2324 }
2325
2326 /* loop over elements in row i2 of P_ext_couple, do the same */
2327 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2328 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2329 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2330 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2331 for (irb=0; irb<row_block_size; irb++) {
2332 for (icb=0; icb<col_block_size; icb++) {
2333 rtmp = ZERO;
2334 for (ib=0; ib<col_block_size; ib++) {
2335 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2336 }
2337 RAP_val[irb+row_block_size*icb]=rtmp;
2338 }
2339 }
2340
2341 if (P_marker[i_c] < row_marker) {
2342 P_marker[i_c] = sum;
2343 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2344 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2345 sum++;
2346 } else {
2347 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2348 for (ib=0; ib<block_size; ib++) {
2349 temp_val[ib] += RAP_val[ib];
2350 }
2351 }
2352 }
2353
2354 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2355 Only the contributions are added. */
2356 } else {
2357 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2358 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2359 i_c = P->row_coupleBlock->pattern->index[j3];
2360 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2361 for (irb=0; irb<row_block_size; irb++) {
2362 for (icb=0; icb<col_block_size; icb++) {
2363 rtmp = ZERO;
2364 for (ib=0; ib<col_block_size; ib++) {
2365 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2366 }
2367 RAP_val[irb+row_block_size*icb]=rtmp;
2368 }
2369 }
2370
2371 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2372 for (ib=0; ib<block_size; ib++) {
2373 temp_val[ib] += RAP_val[ib];
2374 }
2375 }
2376 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2377 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2378 i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
2379 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2380 for (irb=0; irb<row_block_size; irb++) {
2381 for (icb=0; icb<col_block_size; icb++) {
2382 rtmp = ZERO;
2383 for (ib=0; ib<col_block_size; ib++) {
2384 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2385 }
2386 RAP_val[irb+row_block_size*icb]=rtmp;
2387 }
2388 }
2389
2390 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2391 for (ib=0; ib<block_size; ib++) {
2392 temp_val[ib] += RAP_val[ib];
2393 }
2394 }
2395 }
2396 }
2397
2398 /* now loop over elements in row i1 of A->mainBlock, repeat
2399 the process applied above to block A->col_coupleBlock */
2400 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2401 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2402 i2 = A->mainBlock->pattern->index[j2];
2403 temp_val = &(A->mainBlock->val[j2*block_size]);
2404 for (irb=0; irb<row_block_size; irb++) {
2405 for (icb=0; icb<col_block_size; icb++) {
2406 rtmp = ZERO;
2407 for (ib=0; ib<col_block_size; ib++) {
2408 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2409 }
2410 RA_val[irb+row_block_size*icb]=rtmp;
2411 }
2412 }
2413
2414 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2415 A_marker[i2 + num_Acouple_cols] = i_r;
2416 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2417 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2418 i_c = P->mainBlock->pattern->index[j3];
2419 temp_val = &(P->mainBlock->val[j3*block_size]);
2420 for (irb=0; irb<row_block_size; irb++) {
2421 for (icb=0; icb<col_block_size; icb++) {
2422 rtmp = ZERO;
2423 for (ib=0; ib<col_block_size; ib++) {
2424 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2425 }
2426 RAP_val[irb+row_block_size*icb]=rtmp;
2427 }
2428 }
2429
2430 if (P_marker[i_c] < row_marker) {
2431 P_marker[i_c] = sum;
2432 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2433 RAP_ext_idx[sum] = i_c + offset;
2434 sum++;
2435 } else {
2436 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2437 for (ib=0; ib<block_size; ib++) {
2438 temp_val[ib] += RAP_val[ib];
2439 }
2440 }
2441 }
2442 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2443 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2444 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2445 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2446 for (irb=0; irb<row_block_size; irb++) {
2447 for (icb=0; icb<col_block_size; icb++) {
2448 rtmp = ZERO;
2449 for (ib=0; ib<col_block_size; ib++) {
2450 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2451 }
2452 RAP_val[irb+row_block_size*icb]=rtmp;
2453 }
2454 }
2455
2456 if (P_marker[i_c] < row_marker) {
2457 P_marker[i_c] = sum;
2458 memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
2459 RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
2460 sum++;
2461 } else {
2462 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2463 for (ib=0; ib<block_size; ib++) {
2464 temp_val[ib] += RAP_val[ib];
2465 }
2466 }
2467 }
2468 } else {
2469 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2470 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2471 i_c = P->mainBlock->pattern->index[j3];
2472 temp_val = &(P->mainBlock->val[j3*block_size]);
2473 for (irb=0; irb<row_block_size; irb++) {
2474 for (icb=0; icb<col_block_size; icb++) {
2475 rtmp = ZERO;
2476 for (ib=0; ib<col_block_size; ib++) {
2477 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2478 }
2479 RAP_val[irb+row_block_size*icb]=rtmp;
2480 }
2481 }
2482
2483 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2484 for (ib=0; ib<block_size; ib++) {
2485 temp_val[ib] += RAP_val[ib];
2486 }
2487 }
2488 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2489 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2490 i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2491 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2492 for (irb=0; irb<row_block_size; irb++) {
2493 for (icb=0; icb<col_block_size; icb++) {
2494 rtmp = ZERO;
2495 for (ib=0; ib<col_block_size; ib++) {
2496 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2497 }
2498 RAP_val[irb+row_block_size*icb]=rtmp;
2499 }
2500 }
2501
2502 temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
2503 for (ib=0; ib<block_size; ib++) {
2504 temp_val[ib] += RAP_val[ib];
2505 }
2506 }
2507 }
2508 }
2509 }
2510 RAP_ext_ptr[i_r+1] = sum;
2511 }
2512 TMPMEMFREE(P_marker);
2513 TMPMEMFREE(Pcouple_to_Pext);
2514
2515 /* Now we have the part of RAP[r,c] whose rows "r" are given by
2516 the column list of P->col_coupleBlock, and whose columns "c"
2517 possibly cover the whole column range of the system matrix P.
2518 This part of the data is to
2519 be passed to neighboring processors, and added to the corresponding
2520 RAP[r,c] entries in the neighboring processors */
2521 Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
2522 &RAP_ext_val, global_id_P, block_size);
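   /* after the exchange, RAP_ext_{ptr,idx,val} hold the rows received back
      from neighboring ranks, with column indices in global numbering;
      they are merged into the local RAP below. */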
2523
2524 num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
2525 sum = RAP_ext_ptr[num_RAPext_rows];
2526 num_RAPext_cols = 0;
2527 if (num_Pext_cols || sum > 0) {
2528 temp = TMPMEMALLOC(num_Pext_cols+sum, index_t);
2529 j1_ub = offset + num_Pmain_cols;
2530 for (i=0, iptr=0; i<sum; i++) {
2531 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
2532 temp[iptr++] = RAP_ext_idx[i]; /* XXX */
2533 }
2534 for (j=0; j<num_Pext_cols; j++, iptr++){
2535 temp[iptr] = global_id_P[j];
2536 }
2537
2538 if (iptr) {
2539 #ifdef USE_QSORTG
2540 qsortG(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2541 #else
2542 qsort(temp, (size_t)iptr, sizeof(index_t), Paso_comparIndex);
2543 #endif
2544 num_RAPext_cols = 1;
2545 i = temp[0];
2546 for (j=1; j<iptr; j++) {
2547 if (temp[j] > i) {
2548 i = temp[j];
2549 temp[num_RAPext_cols++] = i;
2550 }
2551 }
2552 }
2553 }
2554
2555 /* resize the pattern of P_ext_couple */
2556 if(num_RAPext_cols){
2557 global_id_RAP = TMPMEMALLOC(num_RAPext_cols, index_t);
2558 for (i=0; i<num_RAPext_cols; i++)
2559 global_id_RAP[i] = temp[i];
2560 }
2561 if (num_Pext_cols || sum > 0)
2562 TMPMEMFREE(temp);
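   /* remap RAP_ext column indices: locally owned columns become
      0 .. num_Pmain_cols-1, external columns become num_Pmain_cols plus
      their position in the sorted global_id_RAP */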
2563 j1_ub = offset + num_Pmain_cols;
2564 for (i=0; i<sum; i++) {
2565 if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
2566 where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
2567 /*XXX*/ num_RAPext_cols, sizeof(index_t), Paso_comparIndex);
2568 RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
2569 } else
2570 RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
2571 }
2572
2573 /* build the mapping */
2574 if (num_Pcouple_cols) {
2575 Pcouple_to_RAP = TMPMEMALLOC(num_Pcouple_cols, index_t);
2576 iptr = 0;
2577 for (i=0; i<num_RAPext_cols; i++)
2578 if (global_id_RAP[i] == P->global_id[iptr]) {
2579 Pcouple_to_RAP[iptr++] = i;
2580 if (iptr == num_Pcouple_cols) break;
2581 }
2582 }
2583
2584 if (num_Pext_cols) {
2585 Pext_to_RAP = TMPMEMALLOC(num_Pext_cols, index_t);
2586 iptr = 0;
2587 for (i=0; i<num_RAPext_cols; i++)
2588 if (global_id_RAP[i] == global_id_P[iptr]) {
2589 Pext_to_RAP[iptr++] = i;
2590 if (iptr == num_Pext_cols) break;
2591 }
2592 }
2593
2594 if (global_id_P){
2595 TMPMEMFREE(global_id_P);
2596 global_id_P = NULL;
2597 }
2598
2599 /* alloc and initialize the markers */
2600 sum = num_RAPext_cols + num_Pmain_cols;
2601 P_marker = TMPMEMALLOC(sum, index_t);
2602 #pragma omp parallel for private(i) schedule(static)
2603 for (i=0; i<sum; i++) P_marker[i] = -1;
2604 #pragma omp parallel for private(i) schedule(static)
2605 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2606
2607 /* Now, count the size of RAP. Start with rows in R_main */
2608 num_neighbors = P->col_coupler->connector->send->numNeighbors;
2609 offsetInShared = P->col_coupler->connector->send->offsetInShared;
2610 shared = P->col_coupler->connector->send->shared;
2611 i = 0;
2612 j = 0;
2613 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2614 /* Mark the start of row for both main block and couple block */
2615 row_marker = i;
2616 row_marker_ext = j;
2617
2618 /* Mark the diagonal element RAP[i_r, i_r], and other elements
2619 in RAP_ext */
2620 P_marker[i_r] = i;
2621 i++;
2622 for (j1=0; j1<num_neighbors; j1++) {
2623 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2624 if (shared[j2] == i_r) {
2625 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2626 i_c = RAP_ext_idx[k];
2627 if (i_c < num_Pmain_cols) {
2628 if (P_marker[i_c] < row_marker) {
2629 P_marker[i_c] = i;
2630 i++;
2631 }
2632 } else {
2633 if (P_marker[i_c] < row_marker_ext) {
2634 P_marker[i_c] = j;
2635 j++;
2636 }
2637 }
2638 }
2639 break;
2640 }
2641 }
2642 }
2643
2644 /* then, loop over elements in row i_r of R_main */
2645 j1_ub = R_main->pattern->ptr[i_r+1];
2646 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2647 i1 = R_main->pattern->index[j1];
2648
2649 /* then, loop over elements in row i1 of A->col_coupleBlock */
2650 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2651 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2652 i2 = A->col_coupleBlock->pattern->index[j2];
2653
2654 /* check whether entry RA[i_r, i2] has been previously visited.
2655 a new RAP entry is possible only if entry RA[i_r, i2] has not
2656 been visited yet */
2657 if (A_marker[i2] != i_r) {
2658 /* first, mark entry RA[i_r,i2] as visited */
2659 A_marker[i2] = i_r;
2660
2661 /* then loop over elements in row i2 of P_ext_main */
2662 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2663 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2664 i_c = P->row_coupleBlock->pattern->index[j3];
2665
2666 /* check whether entry RAP[i_r,i_c] has been created.
2667 If not yet created, create the entry and increase
2668 the total number of elements in RAP_ext */
2669 if (P_marker[i_c] < row_marker) {
2670 P_marker[i_c] = i;
2671 i++;
2672 }
2673 }
2674
2675 /* loop over elements in row i2 of P_ext_couple, do the same */
2676 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2677 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2678 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2679 if (P_marker[i_c] < row_marker_ext) {
2680 P_marker[i_c] = j;
2681 j++;
2682 }
2683 }
2684 }
2685 }
2686
2687 /* now loop over elements in row i1 of A->mainBlock, repeat
2688 the process applied above to block A->col_coupleBlock */
2689 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2690 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2691 i2 = A->mainBlock->pattern->index[j2];
2692 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2693 A_marker[i2 + num_Acouple_cols] = i_r;
2694 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2695 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2696 i_c = P->mainBlock->pattern->index[j3];
2697 if (P_marker[i_c] < row_marker) {
2698 P_marker[i_c] = i;
2699 i++;
2700 }
2701 }
2702 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2703 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2704 /* note that we need to map the column index in
2705 P->col_coupleBlock back into the column index in
2706 P_ext_couple */
2707 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2708 if (P_marker[i_c] < row_marker_ext) {
2709 P_marker[i_c] = j;
2710 j++;
2711 }
2712 }
2713 }
2714 }
2715 }
2716 }
2717
2718 /* Now we have the number of non-zero elements in RAP_main and RAP_couple.
2719 Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main,
2720 and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for
2721 RAP_couple */
2722 RAP_main_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2723 RAP_main_idx = MEMALLOC(i, index_t);
2724 RAP_main_val = TMPMEMALLOC(i * block_size, double);
2725 RAP_couple_ptr = MEMALLOC(num_Pmain_cols+1, index_t);
2726 RAP_couple_idx = MEMALLOC(j, index_t);
2727 RAP_couple_val = TMPMEMALLOC(j * block_size, double);
2728
2729 RAP_main_ptr[num_Pmain_cols] = i;
2730 RAP_couple_ptr[num_Pmain_cols] = j;
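 /* the symbolic pass above fixed the row sizes; the numeric pass below
    retraces the same traversal, this time copying and accumulating the
    value blocks */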
2731
2732 #pragma omp parallel for private(i) schedule(static)
2733 for (i=0; i<sum; i++) P_marker[i] = -1;
2734 #pragma omp parallel for private(i) schedule(static)
2735 for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
2736
2737 /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows
2738 in R_main */
2739 i = 0;
2740 j = 0;
2741 for (i_r=0; i_r<num_Pmain_cols; i_r++){
2742 /* Mark the start of row for both main block and couple block */
2743 row_marker = i;
2744 row_marker_ext = j;
2745 RAP_main_ptr[i_r] = row_marker;
2746 RAP_couple_ptr[i_r] = row_marker_ext;
2747
2748 /* Mark and set up the diagonal element RAP[i_r, i_r], and elements
2749 in row i_r of RAP_ext */
2750 P_marker[i_r] = i;
2751 RAP_main_val[i] = ZERO;
2752 RAP_main_idx[i] = i_r;
2753 i++;
2754
2755 for (j1=0; j1<num_neighbors; j1++) {
2756 for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) {
2757 if (shared[j2] == i_r) {
2758 for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) {
2759 i_c = RAP_ext_idx[k];
2760 if (i_c < num_Pmain_cols) {
2761 if (P_marker[i_c] < row_marker) {
2762 P_marker[i_c] = i;
2763 memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2764 RAP_main_idx[i] = i_c;
2765 i++;
2766 } else {
2767 temp_val = &(RAP_ext_val[k*block_size]);
2768 RAP_val = &(RAP_main_val[P_marker[i_c]*block_size]);
2769 for (ib=0; ib<block_size; ib++)
2770 RAP_val[ib] += temp_val[ib];
2771 }
2772 } else {
2773 if (P_marker[i_c] < row_marker_ext) {
2774 P_marker[i_c] = j;
2775 memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double));
2776 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2777 j++;
2778 } else {
2779 temp_val = &(RAP_ext_val[k*block_size]);
2780 RAP_val = &(RAP_couple_val[P_marker[i_c]*block_size]);
2781 for (ib=0; ib<block_size; ib++)
2782 RAP_val[ib] += temp_val[ib];
2783 }
2784 }
2785 }
2786 break;
2787 }
2788 }
2789 }
2790
2791 /* then, loop over elements in row i_r of R_main */
2792 j1_ub = R_main->pattern->ptr[i_r+1];
2793 for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){
2794 i1 = R_main->pattern->index[j1];
2795 R_val = &(R_main->val[j1*block_size]);
2796
2797 /* then, loop over elements in row i1 of A->col_coupleBlock */
2798 j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
2799 for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2800 i2 = A->col_coupleBlock->pattern->index[j2];
2801 temp_val = &(A->col_coupleBlock->val[j2*block_size]);
2802 for (irb=0; irb<row_block_size; irb++) {
2803 for (icb=0; icb<col_block_size; icb++) {
2804 rtmp = ZERO;
2805 for (ib=0; ib<col_block_size; ib++) {
2806 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2807 }
2808 RA_val[irb+row_block_size*icb]=rtmp;
2809 }
2810 }
2811
2812
2813 /* check whether entry RA[i_r, i2] has been previously visited.
2814 a new RAP entry is possible only if entry RA[i_r, i2] has not
2815 been visited yet */
2816 if (A_marker[i2] != i_r) {
2817 /* first, mark entry RA[i_r,i2] as visited */
2818 A_marker[i2] = i_r;
2819
2820 /* then loop over elements in row i2 of P_ext_main */
2821 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2822 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2823 i_c = P->row_coupleBlock->pattern->index[j3];
2824 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2825 for (irb=0; irb<row_block_size; irb++) {
2826 for (icb=0; icb<col_block_size; icb++) {
2827 rtmp = ZERO;
2828 for (ib=0; ib<col_block_size; ib++) {
2829 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2830 }
2831 RAP_val[irb+row_block_size*icb]=rtmp;
2832 }
2833 }
2834
2835
2836 /* check whether entry RAP[i_r,i_c] has been created.
2837 If not yet created, create the entry and increase
2838 the total number of elements in RAP_ext */
2839 if (P_marker[i_c] < row_marker) {
2840 P_marker[i_c] = i;
2841 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2842 RAP_main_idx[i] = i_c;
2843 i++;
2844 } else {
2845 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2846 for (ib=0; ib<block_size; ib++) {
2847 temp_val[ib] += RAP_val[ib];
2848 }
2849 }
2850 }
2851
2852 /* loop over elements in row i2 of P_ext_couple, do the same */
2853 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2854 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2855 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2856 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2857 for (irb=0; irb<row_block_size; irb++) {
2858 for (icb=0; icb<col_block_size; icb++) {
2859 rtmp = ZERO;
2860 for (ib=0; ib<col_block_size; ib++) {
2861 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2862 }
2863 RAP_val[irb+row_block_size*icb]=rtmp;
2864 }
2865 }
2866
2867 if (P_marker[i_c] < row_marker_ext) {
2868 P_marker[i_c] = j;
2869 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2870 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2871 j++;
2872 } else {
2873 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2874 for (ib=0; ib<block_size; ib++) {
2875 temp_val[ib] += RAP_val[ib];
2876 }
2877 }
2878 }
2879
2880 /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
2881 Only the contributions are added. */
2882 } else {
2883 j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
2884 for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2885 i_c = P->row_coupleBlock->pattern->index[j3];
2886 temp_val = &(P->row_coupleBlock->val[j3*block_size]);
2887 for (irb=0; irb<row_block_size; irb++) {
2888 for (icb=0; icb<col_block_size; icb++) {
2889 rtmp = ZERO;
2890 for (ib=0; ib<col_block_size; ib++) {
2891 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2892 }
2893 RAP_val[irb+row_block_size*icb]=rtmp;
2894 }
2895 }
2896
2897 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2898 for (ib=0; ib<block_size; ib++) {
2899 temp_val[ib] += RAP_val[ib];
2900 }
2901 }
2902 j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
2903 for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2904 i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2905 temp_val = &(P->remote_coupleBlock->val[j3*block_size]);
2906 for (irb=0; irb<row_block_size; irb++) {
2907 for (icb=0; icb<col_block_size; icb++) {
2908 rtmp = ZERO;
2909 for (ib=0; ib<col_block_size; ib++) {
2910 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2911 }
2912 RAP_val[irb+row_block_size*icb]=rtmp;
2913 }
2914 }
2915
2916 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2917 for (ib=0; ib<block_size; ib++) {
2918 temp_val[ib] += RAP_val[ib];
2919 }
2920 }
2921 }
2922 }
2923
2924 /* now loop over elements in row i1 of A->mainBlock, repeat
2925 the process applied above to block A->col_coupleBlock */
2926 j2_ub = A->mainBlock->pattern->ptr[i1+1];
2927 for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
2928 i2 = A->mainBlock->pattern->index[j2];
2929 temp_val = &(A->mainBlock->val[j2*block_size]);
2930 for (irb=0; irb<row_block_size; irb++) {
2931 for (icb=0; icb<col_block_size; icb++) {
2932 rtmp = ZERO;
2933 for (ib=0; ib<col_block_size; ib++) {
2934 rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2935 }
2936 RA_val[irb+row_block_size*icb]=rtmp;
2937 }
2938 }
2939
2940 if (A_marker[i2 + num_Acouple_cols] != i_r) {
2941 A_marker[i2 + num_Acouple_cols] = i_r;
2942 j3_ub = P->mainBlock->pattern->ptr[i2+1];
2943 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2944 i_c = P->mainBlock->pattern->index[j3];
2945 temp_val = &(P->mainBlock->val[j3*block_size]);
2946 for (irb=0; irb<row_block_size; irb++) {
2947 for (icb=0; icb<col_block_size; icb++) {
2948 rtmp = ZERO;
2949 for (ib=0; ib<col_block_size; ib++) {
2950 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2951 }
2952 RAP_val[irb+row_block_size*icb]=rtmp;
2953 }
2954 }
2955
2956 if (P_marker[i_c] < row_marker) {
2957 P_marker[i_c] = i;
2958 memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double));
2959 RAP_main_idx[i] = i_c;
2960 i++;
2961 } else {
2962 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
2963 for (ib=0; ib<block_size; ib++) {
2964 temp_val[ib] += RAP_val[ib];
2965 }
2966 }
2967 }
2968 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
2969 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
2970 /* note that we need to map the column index in
2971 P->col_coupleBlock back into the column index in
2972 P_ext_couple */
2973 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
2974 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
2975 for (irb=0; irb<row_block_size; irb++) {
2976 for (icb=0; icb<col_block_size; icb++) {
2977 rtmp = ZERO;
2978 for (ib=0; ib<col_block_size; ib++) {
2979 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
2980 }
2981 RAP_val[irb+row_block_size*icb]=rtmp;
2982 }
2983 }
2984
2985 if (P_marker[i_c] < row_marker_ext) {
2986 P_marker[i_c] = j;
2987 memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double));
2988 RAP_couple_idx[j] = i_c - num_Pmain_cols;
2989 j++;
2990 } else {
2991 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
2992 for (ib=0; ib<block_size; ib++) {
2993 temp_val[ib] += RAP_val[ib];
2994 }
2995 }
2996 }
2997
2998 } else {
2999 j3_ub = P->mainBlock->pattern->ptr[i2+1];
3000 for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3001 i_c = P->mainBlock->pattern->index[j3];
3002 temp_val = &(P->mainBlock->val[j3*block_size]);
3003 for (irb=0; irb<row_block_size; irb++) {
3004 for (icb=0; icb<col_block_size; icb++) {
3005 rtmp = ZERO;
3006 for (ib=0; ib<col_block_size; ib++) {
3007 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3008 }
3009 RAP_val[irb+row_block_size*icb]=rtmp;
3010 }
3011 }
3012
3013 temp_val = &(RAP_main_val[P_marker[i_c] * block_size]);
3014 for (ib=0; ib<block_size; ib++) {
3015 temp_val[ib] += RAP_val[ib];
3016 }
3017 }
3018 j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
3019 for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
3020 i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
3021 temp_val = &(P->col_coupleBlock->val[j3*block_size]);
3022 for (irb=0; irb<row_block_size; irb++) {
3023 for (icb=0; icb<col_block_size; icb++) {
3024 rtmp = ZERO;
3025 for (ib=0; ib<col_block_size; ib++) {
3026 rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb];
3027 }
3028 RAP_val[irb+row_block_size*icb]=rtmp;
3029 }
3030 }
3031
3032 temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]);
3033 for (ib=0; ib<block_size; ib++) {
3034 temp_val[ib] += RAP_val[ib];
3035 }
3036 }
3037 }
3038 }
3039 }
3040
3041 /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */
3042 if (i > row_marker) {
3043 offset = i - row_marker;
3044 temp = TMPMEMALLOC(offset, index_t);
3045 for (iptr=0; iptr<offset; iptr++)
3046 temp[iptr] = RAP_main_idx[row_marker+iptr];
3047 if (offset > 0) {
3048 #ifdef USE_QSORTG
3049 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3050 #else
3051 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3052 #endif
3053 }
3054 temp_val = TMPMEMALLOC(offset * block_size, double);
3055 for (iptr=0; iptr<offset; iptr++){
3056 k = P_marker[temp[iptr]];
3057 memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double));
3058 P_marker[temp[iptr]] = iptr + row_marker;
3059 }
3060 for (iptr=0; iptr<offset; iptr++){
3061 RAP_main_idx[row_marker+iptr] = temp[iptr];
3062 memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3063 }
3064 TMPMEMFREE(temp);
3065 TMPMEMFREE(temp_val);
3066 }
3067 if (j > row_marker_ext) {
3068 offset = j - row_marker_ext;
3069 temp = TMPMEMALLOC(offset, index_t);
3070 for (iptr=0; iptr<offset; iptr++)
3071 temp[iptr] = RAP_couple_idx[row_marker_ext+iptr];
3072 if (offset > 0) {
3073 #ifdef USE_QSORTG
3074 qsortG(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3075 #else
3076 qsort(temp, (size_t)offset, sizeof(index_t), Paso_comparIndex);
3077 #endif
3078 }
3079 temp_val = TMPMEMALLOC(offset * block_size, double);
3080 for (iptr=0; iptr<offset; iptr++){
3081 k = P_marker[temp[iptr] + num_Pmain_cols];
3082 memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double));
3083 P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext;
3084 }
3085 for (iptr=0; iptr<offset; iptr++){
3086 RAP_couple_idx[row_marker_ext+iptr] = temp[iptr];
3087 memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double));
3088 }
3089 TMPMEMFREE(temp);
3090 TMPMEMFREE(temp_val);
3091 }
3092 }
3093
3094 TMPMEMFREE(RA_val);
3095 TMPMEMFREE(RAP_val);
3096 TMPMEMFREE(A_marker);
3097 TMPMEMFREE(Pext_to_RAP);
3098 TMPMEMFREE(Pcouple_to_RAP);
3099 MEMFREE(RAP_ext_ptr);
3100 MEMFREE(RAP_ext_idx);
3101 MEMFREE(RAP_ext_val);
3102 Paso_SparseMatrix_free(R_main);
3103 Paso_SparseMatrix_free(R_couple);
3104
3105 /* Check whether there are empty columns in RAP_couple */
3106 #pragma omp parallel for schedule(static) private(i)
3107 for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1;
3108 /* num of non-empty columns is stored in "k" */
3109 k = 0;
3110 j = RAP_couple_ptr[num_Pmain_cols];
3111 for (i=0; i<j; i++) {
3112 i1 = RAP_couple_idx[i];
3113 if (P_marker[i1]) {
3114 P_marker[i1] = 0;
3115 k++;
3116 }
3117 }
3118
3119 /* empty columns were found; compress them out */
3120 if (k < num_RAPext_cols) {
3121 temp = TMPMEMALLOC(k, index_t);
3122 k = 0;
3123 for (i=0; i<num_RAPext_cols; i++)
3124 if (!P_marker[i]) {
3125 P_marker[i] = k;
3126 temp[k] = global_id_RAP[i];
3127 k++;
3128 }
3129 for (i=0; i<j; i++) {
3130 i1 = RAP_couple_idx[i];
3131 RAP_couple_idx[i] = P_marker[i1];
3132 }
3133 num_RAPext_cols = k;
3134 TMPMEMFREE(global_id_RAP);
3135 global_id_RAP = temp;
3136 }
3137 TMPMEMFREE(P_marker);
3138
3139 /******************************************************/
3140 /* Start to create the coarse level system matrix A_c */
3141 /******************************************************/
3142 /* first, prepare the sender/receiver for the col_connector */
3143 dist = P->pattern->input_distribution->first_component;
3144 recv_len = TMPMEMALLOC(size, dim_t);
3145 send_len = TMPMEMALLOC(size, dim_t);
3146 neighbor = TMPMEMALLOC(size, Esys_MPI_rank);
3147 offsetInShared = TMPMEMALLOC(size+1, index_t);
3148 shared = TMPMEMALLOC(num_RAPext_cols, index_t);
3149 memset(recv_len, 0, sizeof(dim_t) * size);
3150 memset(send_len, 0, sizeof(dim_t) * size);
3151 num_neighbors = 0;
3152 offsetInShared[0] = 0;
3153 for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) {
3154 shared[i] = i + num_Pmain_cols;
3155 if (k <= global_id_RAP[i]) {
3156 if (recv_len[j] > 0) {
3157 neighbor[num_neighbors] = j;
3158 num_neighbors ++;
3159 offsetInShared[num_neighbors] = i;
3160 }
3161 while (k <= global_id_RAP[i]) {
3162 j++;
3163 k = dist[j+1];
3164 }
3165 }
3166 recv_len[j] ++;
3167 }
3168 if (recv_len[j] > 0) {
3169 neighbor[num_neighbors] = j;
3170 num_neighbors ++;
3171 offsetInShared[num_neighbors] = i;
3172 }
3173 recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
3174 neighbor, shared, offsetInShared, 1, 0, mpi_info);
3175
3176 #ifdef ESYS_MPI
3177 MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm);
3178 #endif

#ifdef ESYS_MPI
    mpi_requests = TMPMEMALLOC(size*2, MPI_Request);
    mpi_stati = TMPMEMALLOC(size*2, MPI_Status);
#else
    mpi_requests = TMPMEMALLOC(size*2, int);
    mpi_stati = TMPMEMALLOC(size*2, int);
#endif
    num_neighbors = 0;
    j = 0;
    offsetInShared[0] = 0;
    for (i=0; i<size; i++) {
        if (send_len[i] > 0) {
            neighbor[num_neighbors] = i;
            num_neighbors++;
            j += send_len[i];
            offsetInShared[num_neighbors] = j;
        }
    }
    TMPMEMFREE(shared);
    shared = TMPMEMALLOC(j, index_t);
    for (i=0, j=0; i<num_neighbors; i++) {
        k = neighbor[i];
#ifdef ESYS_MPI
        MPI_Irecv(&shared[j], send_len[k], MPI_INT, k,
                  mpi_info->msg_tag_counter+k,
                  mpi_info->comm, &mpi_requests[i]);
#endif
        j += send_len[k];
    }
    for (i=0, j=0; i<recv->numNeighbors; i++) {
        k = recv->neighbor[i];
#ifdef ESYS_MPI
        MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k,
                   mpi_info->msg_tag_counter+rank,
                   mpi_info->comm, &mpi_requests[i+num_neighbors]);
#endif
        j += recv_len[k];
    }
#ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + recv->numNeighbors,
                mpi_requests, mpi_stati);
#endif
    mpi_info->msg_tag_counter += size;
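
    /* Advancing the tag counter by the communicator size keeps the tags
       used here (msg_tag_counter+k, one per sending rank) disjoint from
       those of the next exchange. */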

    j = offsetInShared[num_neighbors];
    offset = dist[rank];
    for (i=0; i<j; i++) shared[i] = shared[i] - offset;
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                       neighbor, shared, offsetInShared, 1, 0, mpi_info);

    col_connector = Paso_Connector_alloc(send, recv);
    TMPMEMFREE(shared);
    Paso_SharedComponents_free(recv);
    Paso_SharedComponents_free(send);
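
    /* Presumably (following the reference-counting convention used
       elsewhere in Paso) Paso_Connector_alloc keeps its own references to
       the two SharedComponents, so the local handles can be dropped here. */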

    /* now, create the row distribution (output_distribution) and the
       column distribution (input_distribution) */
    input_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
    output_dist = Paso_Distribution_alloc(mpi_info, dist, 1, 0);
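
    /* The coarse operator is square, so the same first-component array
       "dist" backs both the row and the column distribution. */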

    /* then, prepare the sender/receiver for the row_connector; first,
       prepare the information for the sender */
    sum = RAP_couple_ptr[num_Pmain_cols];
    len = TMPMEMALLOC(size, dim_t);
    send_ptr = TMPMEMALLOC(size, index_t*);
    send_idx = TMPMEMALLOC(size, index_t*);
    for (i=0; i<size; i++) {
        send_ptr[i] = TMPMEMALLOC(num_Pmain_cols, index_t);
        send_idx[i] = TMPMEMALLOC(sum, index_t);
        memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols);
    }
    memset(len, 0, sizeof(dim_t) * size);
    recv = col_connector->recv;
    sum = 0;
    for (i_r=0; i_r<num_Pmain_cols; i_r++) {
        i1 = RAP_couple_ptr[i_r];
        i2 = RAP_couple_ptr[i_r+1];
        if (i2 > i1) {
            /* then row i_r will be in the sender of row_connector, now
               check how many neighbors row i_r needs to be sent to */
            for (j=i1; j<i2; j++) {
                i_c = RAP_couple_idx[j];
                /* find out the corresponding neighbor "p" of column i_c */
                for (p=0; p<recv->numNeighbors; p++) {
                    if (i_c < recv->offsetInShared[p+1]) {
                        k = recv->neighbor[p];
                        if (send_ptr[k][i_r] == 0) sum++;
                        send_ptr[k][i_r]++;
                        send_idx[k][len[k]] = global_id_RAP[i_c];
                        len[k]++;
                        break;
                    }
                }
            }
        }
    }
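
    /* At this point, for every neighbor rank k:
         send_ptr[k][i_r]         -- number of entries of local row i_r
                                     that couple to columns owned by k;
         send_idx[k][0..len[k]-1] -- the global ids of those columns,
                                     grouped by row;
         sum                      -- total number of distinct
                                     (row, neighbor) pairs, i.e. the length
                                     of the sender's shared list below. */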
    if (global_id_RAP) {
        TMPMEMFREE(global_id_RAP);
        global_id_RAP = NULL;
    }

    /* now allocate the sender */
    shared = TMPMEMALLOC(sum, index_t);
    memset(send_len, 0, sizeof(dim_t) * size);
    num_neighbors = 0;
    offsetInShared[0] = 0;
    for (p=0, k=0; p<size; p++) {
        for (i=0; i<num_Pmain_cols; i++) {
            if (send_ptr[p][i] > 0) {
                shared[k] = i;
                k++;
                send_ptr[p][send_len[p]] = send_ptr[p][i];
                send_len[p]++;
            }
        }
        if (k > offsetInShared[num_neighbors]) {
            neighbor[num_neighbors] = p;
            num_neighbors++;
            offsetInShared[num_neighbors] = k;
        }
    }
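
    /* send_ptr[p] has been compacted in place: its first send_len[p]
       entries now hold the per-row counts for the rows recorded in
       shared[], ready to be shipped below as the ptr-part of the
       rowCoupleBlock pattern. */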
    send = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                       neighbor, shared, offsetInShared, 1, 0, mpi_info);

    /* send/recv the number of rows to be communicated, so that the
       receiver side of row_connector can be recovered from the sender
       information */
#ifdef ESYS_MPI
    MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm);
#endif
    num_neighbors = 0;
    offsetInShared[0] = 0;
    j = 0;
    for (i=0; i<size; i++) {
        if (i != rank && recv_len[i] > 0) {
            neighbor[num_neighbors] = i;
            num_neighbors++;
            j += recv_len[i];
            offsetInShared[num_neighbors] = j;
        }
    }
    TMPMEMFREE(shared);
    TMPMEMFREE(recv_len);
    shared = TMPMEMALLOC(j, index_t);
    k = offsetInShared[num_neighbors];
    for (i=0; i<k; i++) {
        shared[i] = i + num_Pmain_cols;
    }
    recv = Paso_SharedComponents_alloc(num_Pmain_cols, num_neighbors,
                                       neighbor, shared, offsetInShared, 1, 0, mpi_info);
    row_connector = Paso_Connector_alloc(send, recv);
    TMPMEMFREE(shared);
    Paso_SharedComponents_free(recv);
    Paso_SharedComponents_free(send);

    /* send/recv pattern->ptr for rowCoupleBlock */
    num_RAPext_rows = offsetInShared[num_neighbors];
    row_couple_ptr = MEMALLOC(num_RAPext_rows+1, index_t);
    for (p=0; p<num_neighbors; p++) {
        j = offsetInShared[p];
        i = offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p],
                  mpi_info->msg_tag_counter+neighbor[p],
                  mpi_info->comm, &mpi_requests[p]);
#endif
    }
    send = row_connector->send;
    for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
        MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]],
                   MPI_INT, send->neighbor[p],
                   mpi_info->msg_tag_counter+rank,
                   mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + send->numNeighbors,
                mpi_requests, mpi_stati);
#endif
    mpi_info->msg_tag_counter += size;
    TMPMEMFREE(send_len);

    sum = 0;
    for (i=0; i<num_RAPext_rows; i++) {
        k = row_couple_ptr[i];
        row_couple_ptr[i] = sum;
        sum += k;
    }
    row_couple_ptr[num_RAPext_rows] = sum;
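
    /* The loop above converts the received per-row counts into CSR
       offsets via an exclusive prefix sum; e.g. counts {2, 0, 3} become
       offsets {0, 2, 2, 5}. */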

    /* send/recv pattern->index for rowCoupleBlock */
    k = row_couple_ptr[num_RAPext_rows];
    row_couple_idx = MEMALLOC(k, index_t);
    for (p=0; p<num_neighbors; p++) {
        j1 = row_couple_ptr[offsetInShared[p]];
        j2 = row_couple_ptr[offsetInShared[p+1]];
#ifdef ESYS_MPI
        MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p],
                  mpi_info->msg_tag_counter+neighbor[p],
                  mpi_info->comm, &mpi_requests[p]);
#endif
    }
    for (p=0; p<send->numNeighbors; p++) {
#ifdef ESYS_MPI
        MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]],
                   MPI_INT, send->neighbor[p],
                   mpi_info->msg_tag_counter+rank,
                   mpi_info->comm, &mpi_requests[p+num_neighbors]);
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(num_neighbors + send->numNeighbors,
                mpi_requests, mpi_stati);
#endif
    mpi_info->msg_tag_counter += size;

    offset = input_dist->first_component[rank];
    k = row_couple_ptr[num_RAPext_rows];
    for (i=0; i<k; i++) {
        row_couple_idx[i] -= offset;
    }
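
    /* The received indices are global ids of columns owned by this rank,
       so subtracting the first-component offset turns them into local
       column indices for the rowCoupleBlock pattern. */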

    for (i=0; i<size; i++) {
        TMPMEMFREE(send_ptr[i]);
        TMPMEMFREE(send_idx[i]);
    }
    TMPMEMFREE(send_ptr);
    TMPMEMFREE(send_idx);
    TMPMEMFREE(len);
    TMPMEMFREE(offsetInShared);
    TMPMEMFREE(neighbor);
    TMPMEMFREE(mpi_requests);
    TMPMEMFREE(mpi_stati);
    Esys_MPIInfo_free(mpi_info);

    /* Now, we can create the patterns for the mainBlock and the coupleBlocks */
    main_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols,
                                      num_Pmain_cols, RAP_main_ptr, RAP_main_idx);
    col_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                                            num_Pmain_cols, num_RAPext_cols,
                                            RAP_couple_ptr, RAP_couple_idx);
    row_couple_pattern = Paso_Pattern_alloc(MATRIX_FORMAT_DEFAULT,
                                            num_RAPext_rows, num_Pmain_cols,
                                            row_couple_ptr, row_couple_idx);
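
    /* Presumably (matching the convention elsewhere in Paso) the patterns
       take ownership of the ptr/idx arrays handed over here, which is why
       only the val arrays are freed explicitly in the clean-up below. */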

    /* next, create the system matrix */
    pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
                                             output_dist, input_dist,
                                             main_pattern, col_couple_pattern,
                                             row_couple_pattern,
                                             col_connector, row_connector);
    out = Paso_SystemMatrix_alloc(A->type, pattern,
                                  row_block_size, col_block_size, FALSE);

    /* finally, fill in the data */
    memcpy(out->mainBlock->val, RAP_main_val,
           out->mainBlock->len * sizeof(double));
    memcpy(out->col_coupleBlock->val, RAP_couple_val,
           out->col_coupleBlock->len * sizeof(double));

    /* Clean up */
    Paso_SystemMatrixPattern_free(pattern);
    Paso_Pattern_free(main_pattern);
    Paso_Pattern_free(col_couple_pattern);
    Paso_Pattern_free(row_couple_pattern);
    Paso_Connector_free(col_connector);
    Paso_Connector_free(row_connector);
    Paso_Distribution_free(output_dist);
    Paso_Distribution_free(input_dist);
    TMPMEMFREE(RAP_main_val);
    TMPMEMFREE(RAP_couple_val);
    if (Esys_noError()) {
        return out;
    } else {
        Paso_SystemMatrix_free(out);
        return NULL;
    }
}
