/*****************************************************************************
*
* Copyright (c) 2003-2014 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development 2012-2013 by School of Earth Sciences
* Development from 2014 by Centre for Geoscience Computing (GeoComp)
*
*****************************************************************************/


/****************************************************************************/

/* Paso: defines AMG Interpolation */

/****************************************************************************/

/* Author: l.gao@uq.edu.au */

/****************************************************************************/

#include "Paso.h"
#include "SparseMatrix.h"
#include "PasoUtil.h"
#include "Preconditioner.h"

/****************************************************************************

   Methods necessary for AMG preconditioner

   construct n_C x n_C interpolation operator A_C from matrix A
   and prolongation matrix P.

   The coarsening operator A_C is defined by RAP where R=P^T.

*****************************************************************************/
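
/* For reference, a dense sketch of the triple product computed below
   (illustrative only -- the real code works row by row on the distributed
   CSR blocks of A and P): with P of size n x n_C and R = P^T,

       A_C[i][j] = sum_{k=0..n-1} sum_{l=0..n-1} P[k][i] * A[k][l] * P[l][j]

   so A_C is n_C x n_C. The implementation fuses this into a sparse pass,
   accumulating RA = R*A first and then (RA)*P. */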

/* Extend system matrix B with two extra sparse matrices:
       B_ext_main and B_ext_couple
   The combination of these two sparse matrices represents
   the portion of B that is stored on neighbour procs and
   needed locally for the triple matrix product (B^T) A B.

   FOR NOW, we assume that the system matrix B has a NULL
   row_coupleBlock and a NULL remote_coupleBlock. Based on
   this assumption, we link row_coupleBlock to the sparse
   matrix B_ext_main, and remote_coupleBlock to the sparse
   matrix B_ext_couple.

   To be specific, B_ext (on processor k) is the group of rows
   in B, where the list of rows from processor q is given by
       A->col_coupler->connector->send->shared[rPtr...]
       rPtr=A->col_coupler->connector->send->OffsetInShared[k]
   on q. */
void Paso_Preconditioner_AMG_extendB(Paso_SystemMatrix* A, Paso_SystemMatrix* B)
{
    paso::Pattern *pattern_main=NULL, *pattern_couple=NULL;
    paso::Coupler *coupler=NULL;
    paso::SharedComponents_ptr send, recv;
    double *cols=NULL, *send_buf=NULL, *ptr_val=NULL, *send_m=NULL, *send_c=NULL;
    index_t *global_id=NULL, *cols_array=NULL, *ptr_ptr=NULL, *ptr_idx=NULL;
    index_t *ptr_main=NULL, *ptr_couple=NULL, *idx_main=NULL, *idx_couple=NULL;
    index_t *send_idx=NULL, *send_offset=NULL, *recv_buf=NULL, *recv_offset=NULL;
    index_t *idx_m=NULL, *idx_c=NULL;
    index_t i, j, k, m, p, q, j_ub, k_lb, k_ub, m_lb, m_ub, l_m, l_c, i0;
    index_t offset, len, block_size, block_size_size, max_num_cols;
    index_t num_main_cols, num_couple_cols, num_neighbors, row, neighbor;
    dim_t *recv_degree=NULL, *send_degree=NULL;
    dim_t rank=A->mpi_info->rank, size=A->mpi_info->size;

    if (size == 1) return;

    if (B->row_coupleBlock) {
        paso::SparseMatrix_free(B->row_coupleBlock);
        B->row_coupleBlock = NULL;
    }

    if (B->row_coupleBlock || B->remote_coupleBlock) {
        Esys_setError(VALUE_ERROR, "Paso_Preconditioner_AMG_extendB: the link to row_coupleBlock or to remote_coupleBlock has already been set.");
        return;
    }

    /* sending/receiving unknown's global ID */
    num_main_cols = B->mainBlock->numCols;
    cols = new double[num_main_cols];
    offset = B->col_distribution->first_component[rank];
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_main_cols; ++i) cols[i] = offset + i;
    if (B->global_id == NULL) {
        coupler=paso::Coupler_alloc(B->col_coupler->connector, 1);
        paso::Coupler_startCollect(coupler, cols);
    }

    recv_buf = new index_t[size];
    recv_degree = new dim_t[size];
    recv_offset = new index_t[size+1];
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<size; i++){
        recv_buf[i] = 0;
        recv_degree[i] = 1;
        recv_offset[i] = i;
    }
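
    /* Note: recv_degree/recv_offset are initialised so that the
       MPI_Allgatherv below gathers exactly one integer (the local
       number of couple columns) from every rank into recv_buf. */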

    block_size = B->block_size;
    block_size_size = block_size * sizeof(double);
    num_couple_cols = B->col_coupleBlock->numCols;
    send = A->col_coupler->connector->send;
    recv = A->col_coupler->connector->recv;
    num_neighbors = send->numNeighbors;
    p = send->offsetInShared[num_neighbors];
    len = p * B->col_distribution->first_component[size];
    send_buf = new double[len * block_size];
    send_idx = new index_t[len];
    send_offset = new index_t[(p+1)*2];
    send_degree = new dim_t[num_neighbors];
    i = num_main_cols + num_couple_cols;
    send_m = new double[i * block_size];
    send_c = new double[i * block_size];
    idx_m = new index_t[i];
    idx_c = new index_t[i];

    /* waiting for receiving unknown's global ID */
    if (B->global_id == NULL) {
        paso::Coupler_finishCollect(coupler);
        global_id = new index_t[num_couple_cols];
#pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_couple_cols; ++i)
            global_id[i] = coupler->recv_buffer[i];
        B->global_id = global_id;
        paso::Coupler_free(coupler);
    } else
        global_id = B->global_id;

    /* distribute the number of cols in current col_coupleBlock to all ranks */
#ifdef ESYS_MPI
    MPI_Allgatherv(&num_couple_cols, 1, MPI_INT, recv_buf, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
#endif

    /* distribute global_ids of cols to be considered to all ranks */
    len = 0;
    max_num_cols = 0;
    for (i=0; i<size; i++){
        recv_degree[i] = recv_buf[i];
        recv_offset[i] = len;
        len += recv_buf[i];
        if (max_num_cols < recv_buf[i])
            max_num_cols = recv_buf[i];
    }
    recv_offset[size] = len;
    cols_array = new index_t[len];
#ifdef ESYS_MPI
    MPI_Allgatherv(global_id, num_couple_cols, MPI_INT, cols_array, recv_degree, recv_offset, MPI_INT, A->mpi_info->comm);
#endif

    /* first, prepare the ptr_ptr to be received */
    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_ptr = new index_t[(len+1) * 2];
    for (p=0; p<q; p++) {
        row = recv->offsetInShared[p];
        m = recv->offsetInShared[p + 1];
#ifdef ESYS_MPI
        MPI_Irecv(&(ptr_ptr[2*row]), 2 * (m-row), MPI_INT, recv->neighbor[p],
                A->mpi_info->msg_tag_counter+recv->neighbor[p],
                A->mpi_info->comm,
                &(A->col_coupler->mpi_requests[p]));
#endif
    }

    /* now prepare the rows to be sent (the degree, the offset and the data) */
    len = 0;
    i0 = 0;
    for (p=0; p<num_neighbors; p++) {
        i = i0;
        neighbor = send->neighbor[p];
        m_lb = B->col_distribution->first_component[neighbor];
        m_ub = B->col_distribution->first_component[neighbor + 1];
        j_ub = send->offsetInShared[p + 1];
        for (j=send->offsetInShared[p]; j<j_ub; j++) {
            row = send->shared[j];
            l_m = 0;
            l_c = 0;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            k_lb = B->col_coupleBlock->pattern->ptr[row];

            /* check part of col_coupleBlock for data to be passed @row */
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m > offset) break;
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }
            k_lb = k;

            /* check mainBlock for data to be passed @row to sparse
               matrix B_ext_couple */
            k_ub = B->mainBlock->pattern->ptr[row + 1];
            k = B->mainBlock->pattern->ptr[row];
            memcpy(&(send_c[l_c*block_size]), &(B->mainBlock->val[block_size*k]), block_size_size * (k_ub-k));
            for (; k<k_ub; k++) {
                m = B->mainBlock->pattern->index[k] + offset;
                idx_c[l_c] = m;
                l_c++;
            }

            /* check the rest part of col_coupleBlock for data to
               be passed @row to sparse matrix B_ext_couple */
            k = k_lb;
            k_ub = B->col_coupleBlock->pattern->ptr[row + 1];
            for (k=k_lb; k<k_ub; k++) {
                m = global_id[B->col_coupleBlock->pattern->index[k]];
                if (m >= m_lb && m < m_ub) {
                    /* data to be passed to sparse matrix B_ext_main */
                    idx_m[l_m] = m - m_lb;
                    memcpy(&(send_m[l_m*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_m++;
                } else {
                    /* data to be passed to sparse matrix B_ext_couple */
                    idx_c[l_c] = m;
                    memcpy(&(send_c[l_c*block_size]), &(B->col_coupleBlock->val[block_size*k]), block_size_size);
                    l_c++;
                }
            }

            memcpy(&(send_buf[len*block_size]), send_m, block_size_size*l_m);
            memcpy(&(send_idx[len]), idx_m, l_m * sizeof(index_t));
            send_offset[2*i] = l_m;
            len += l_m;
            memcpy(&(send_buf[len*block_size]), send_c, block_size_size*l_c);
            memcpy(&(send_idx[len]), idx_c, l_c * sizeof(index_t));
            send_offset[2*i+1] = l_c;
            len += l_c;
            i++;
        }

        /* sending */
#ifdef ESYS_MPI
        MPI_Issend(&(send_offset[2*i0]), 2*(i-i0), MPI_INT, send->neighbor[p],
                A->mpi_info->msg_tag_counter+rank,
                A->mpi_info->comm,
                &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
#endif
        send_degree[p] = len;
        i0 = i;
    }
    delete[] send_m;
    delete[] send_c;
    delete[] idx_m;
    delete[] idx_c;


    q = recv->numNeighbors;
    len = recv->offsetInShared[q];
    ptr_main = new index_t[(len+1)];
    ptr_couple = new index_t[(len+1)];

#ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

    j = 0;
    k = 0;
    ptr_main[0] = 0;
    ptr_couple[0] = 0;
    for (i=0; i<len; i++) {
        j += ptr_ptr[2*i];
        k += ptr_ptr[2*i+1];
        ptr_main[i+1] = j;
        ptr_couple[i+1] = k;
    }
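
    /* Illustration (an assumed example, not from the code): if two rows
       arrive with interleaved (main,couple) degrees {2,1} and {0,3} in
       ptr_ptr, the running sums above yield ptr_main = {0,2,2} and
       ptr_couple = {0,1,4}, i.e. the CSR offset arrays for the two
       received blocks. */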

    delete[] ptr_ptr;
    idx_main = new index_t[j];
    idx_couple = new index_t[k];
    ptr_idx = new index_t[j+k];
    ptr_val = new double[(j+k) * block_size];

    /* send/receive index array */
    j=0;
    k_ub = 0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
        if (i > 0) {
            k_ub ++;
#ifdef ESYS_MPI
            MPI_Irecv(&(ptr_idx[j]), i, MPI_INT, recv->neighbor[p],
                    A->mpi_info->msg_tag_counter+recv->neighbor[p],
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p]));
#endif
        }
        j += i;
    }

    j=0;
    k_ub = 0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
        if (i > 0){
            k_ub ++;
#ifdef ESYS_MPI
            MPI_Issend(&(send_idx[j]), i, MPI_INT, send->neighbor[p],
                    A->mpi_info->msg_tag_counter+rank,
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
#endif
        }
        j = send_degree[p];
    }

#ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

#pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            idx_main[p] = ptr_idx[m+p];
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            idx_couple[p] = ptr_idx[k+p];
        }
    }
    delete[] ptr_idx;

    /* allocate pattern and sparsematrix for B_ext_main */
    pattern_main = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, num_main_cols, ptr_main, idx_main);
    B->row_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_main, B->row_block_size, B->col_block_size,
                FALSE);
    paso::Pattern_free(pattern_main);

    /* allocate pattern and sparsematrix for B_ext_couple */
    pattern_couple = paso::Pattern_alloc(B->col_coupleBlock->pattern->type,
                len, B->col_distribution->first_component[size],
                ptr_couple, idx_couple);
    B->remote_coupleBlock = paso::SparseMatrix_alloc(B->col_coupleBlock->type,
                pattern_couple, B->row_block_size, B->col_block_size,
                FALSE);
    paso::Pattern_free(pattern_couple);

    /* send/receive value array */
    j=0;
    for (p=0; p<recv->numNeighbors; p++) {
        k = recv->offsetInShared[p];
        m = recv->offsetInShared[p+1];
        i = ptr_main[m] - ptr_main[k] + ptr_couple[m] - ptr_couple[k];
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(ptr_val[j]), i * block_size,
                    MPI_DOUBLE, recv->neighbor[p],
                    A->mpi_info->msg_tag_counter+recv->neighbor[p],
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p]));
#endif
        j += (i * block_size);
    }

    j=0;
    for (p=0; p<num_neighbors; p++) {
        i = send_degree[p] - j;
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Issend(&(send_buf[j*block_size]), i*block_size, MPI_DOUBLE, send->neighbor[p],
                    A->mpi_info->msg_tag_counter+rank,
                    A->mpi_info->comm,
                    &(A->col_coupler->mpi_requests[p+recv->numNeighbors]));
#endif
        j = send_degree[p];
    }

#ifdef ESYS_MPI
    MPI_Waitall(A->col_coupler->connector->send->numNeighbors+A->col_coupler->connector->recv->numNeighbors,
                A->col_coupler->mpi_requests,
                A->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(A->mpi_info), size);

#pragma omp parallel for private(i,j,k,m,p) schedule(static)
    for (i=0; i<len; i++) {
        j = ptr_main[i];
        k = ptr_main[i+1];
        m = ptr_couple[i];
        for (p=j; p<k; p++) {
            memcpy(&(B->row_coupleBlock->val[p*block_size]), &(ptr_val[(m+p)*block_size]), block_size_size);
        }
        j = ptr_couple[i+1];
        for (p=m; p<j; p++) {
            memcpy(&(B->remote_coupleBlock->val[p*block_size]), &(ptr_val[(k+p)*block_size]), block_size_size);
        }
    }
    delete[] ptr_val;


    /* release all temp memory allocation */
    delete[] cols;
    delete[] cols_array;
    delete[] recv_offset;
    delete[] recv_degree;
    delete[] recv_buf;
    delete[] send_buf;
    delete[] send_offset;
    delete[] send_degree;
    delete[] send_idx;
}

/* The sparse matrix (call it T) defined by T(ptr, idx, val) has the same
   number of rows as P->col_coupleBlock->numCols. Now, we need to copy
   blocks of data in T to neighbour processors, given by
       P->col_coupler->connector->recv->neighbor[k]
   where k is in [0, P->col_coupler->connector->recv->numNeighbors).
   The rows to be copied to neighbour processor k are given by the list
       P->col_coupler->connector->recv->offsetInShared[k] ...
       P->col_coupler->connector->recv->offsetInShared[k+1] */
void Paso_Preconditioner_AMG_CopyRemoteData(Paso_SystemMatrix* P,
        index_t **p_ptr, index_t **p_idx, double **p_val,
        index_t *global_id, index_t block_size)
{
    paso::SharedComponents_ptr send, recv;
    index_t send_neighbors, recv_neighbors, send_rows, recv_rows;
    index_t i, j, p, m, n, size;
    index_t *send_degree=NULL, *recv_ptr=NULL, *recv_idx=NULL;
    index_t *ptr=*p_ptr, *idx=*p_idx;
    double *val=*p_val, *recv_val=NULL;
#ifdef ESYS_MPI
    index_t rank = P->mpi_info->rank;
#endif

    size = P->mpi_info->size;
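    /* Note the deliberate swap below: the rows of T live on the side that
       normally *receives* in P's column coupler, so this routine sends
       along the coupler's recv lists and receives along its send lists. */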
    send = P->col_coupler->connector->recv;
    recv = P->col_coupler->connector->send;
    send_neighbors = send->numNeighbors;
    recv_neighbors = recv->numNeighbors;
    send_rows = P->col_coupleBlock->numCols;
    recv_rows = recv->offsetInShared[recv_neighbors];

    send_degree = new index_t[send_rows];
    recv_ptr = new index_t[recv_rows + 1];
#pragma omp parallel for schedule(static) private(i)
    for (i=0; i<send_rows; i++)
        send_degree[i] = ptr[i+1] - ptr[i];

    /* First, send/receive the degree */
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Irecv(&(recv_ptr[m]), n-m, MPI_INT, recv->neighbor[p],
                P->mpi_info->msg_tag_counter + recv->neighbor[p],
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p]));
#endif
    }
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
#ifdef ESYS_MPI
        MPI_Issend(&(send_degree[m]), n-m, MPI_INT, send->neighbor[p],
                P->mpi_info->msg_tag_counter + rank,
                P->mpi_info->comm,
                &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info), size);

    delete[] send_degree;
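    /* Paso_Util_cumsum turns the received per-row degrees stored in
       recv_ptr into CSR offsets in place and returns the total number
       of entries, which becomes the closing offset recv_ptr[recv_rows]. */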
    m = Paso_Util_cumsum(recv_rows, recv_ptr);
    recv_ptr[recv_rows] = m;
    recv_idx = new index_t[m];
    recv_val = new double[m * block_size];

    /* Next, send/receive the index array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Irecv(&(recv_idx[j]), i, MPI_INT, recv->neighbor[p],
                    P->mpi_info->msg_tag_counter + recv->neighbor[p],
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p]));
#endif
        }
        j += i;
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Issend(&(idx[j]), i, MPI_INT, send->neighbor[p],
                    P->mpi_info->msg_tag_counter + rank,
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info), size);

    /* Last, send/receive the data array */
    j = 0;
    for (p=0; p<recv_neighbors; p++) { /* Receiving */
        m = recv->offsetInShared[p];
        n = recv->offsetInShared[p+1];
        i = recv_ptr[n] - recv_ptr[m];
#ifdef ESYS_MPI
        if (i > 0)
            MPI_Irecv(&(recv_val[j]), i*block_size, MPI_DOUBLE, recv->neighbor[p],
                    P->mpi_info->msg_tag_counter + recv->neighbor[p],
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p]));
#endif
        j += (i*block_size);
    }

    j = 0;
    for (p=0; p<send_neighbors; p++) { /* Sending */
        m = send->offsetInShared[p];
        n = send->offsetInShared[p+1];
        i = ptr[n] - ptr[m];
        if (i > 0) {
#ifdef ESYS_MPI
            MPI_Issend(&(val[j]), i * block_size, MPI_DOUBLE, send->neighbor[p],
                    P->mpi_info->msg_tag_counter + rank,
                    P->mpi_info->comm,
                    &(P->col_coupler->mpi_requests[p+recv_neighbors]));
#endif
            j += i * block_size;
        }
    }
#ifdef ESYS_MPI
    MPI_Waitall(send_neighbors+recv_neighbors,
                P->col_coupler->mpi_requests,
                P->col_coupler->mpi_stati);
#endif
    ESYS_MPI_INC_COUNTER(*(P->mpi_info), size);

    /* Clean up and return with received ptr, index and data arrays */
    delete[] ptr;
    delete[] idx;
    delete[] val;
    *p_ptr = recv_ptr;
    *p_idx = recv_idx;
    *p_val = recv_val;
}

Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperator(
        Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R)
{
    Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info);
    paso::SparseMatrix *R_main=NULL, *R_couple=NULL;
    Paso_SystemMatrix *out=NULL;
    paso::SystemMatrixPattern *pattern=NULL;
    paso::Distribution_ptr input_dist, output_dist;
    paso::Connector *col_connector=NULL, *row_connector=NULL;
    paso::Pattern *main_pattern=NULL;
    paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern=NULL;
    const dim_t row_block_size=A->row_block_size;
    const dim_t col_block_size=A->col_block_size;
    const dim_t block_size = A->block_size;
    const dim_t P_block_size = P->block_size;
    const double ZERO = 0.0;
    double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL;
    double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL, *t1_val, *t2_val;
    index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL;
    index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL;
    index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL;
    index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL;
    index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL;
    index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL;
    index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL;
    index_t sum, i, j, k, iptr, irb, icb, ib;
    index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols;
    index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset;
    index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows;
    index_t row_marker_ext, *where_p=NULL;
    index_t **send_ptr=NULL, **send_idx=NULL;
    dim_t p, num_neighbors;
    dim_t *recv_len=NULL, *send_len=NULL, *len=NULL;
    Esys_MPI_rank *neighbor=NULL;
#ifdef ESYS_MPI
    MPI_Request* mpi_requests=NULL;
    MPI_Status* mpi_stati=NULL;
#else
    int *mpi_requests=NULL, *mpi_stati=NULL;
#endif

    /* if (!(P->type & MATRIX_FORMAT_DIAGONAL_BLOCK))
          return Paso_Preconditioner_AMG_buildInterpolationOperatorBlock(A, P, R); */

    /* two sparse matrices R_main and R_couple will be generated, as the
       transpose of P_main and P_col_couple, respectively. Note that
       R_couple is actually the row_coupleBlock of R (R=P^T) */
    R_main = paso::SparseMatrix_getTranspose(P->mainBlock);
    if (size > 1)
        R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock);
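
    /* Note: the triple product below is driven by R_main and R_couple,
       i.e. the transposes of P's local blocks; no globally assembled
       transpose is ever formed. */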

    /* generate P_ext, i.e. the portion of P that is stored on neighbour
       procs and needed locally for the triple matrix product RAP.
       To be specific, P_ext (on processor k) is the group of rows in P,
       where the list of rows from processor q is given by
           A->col_coupler->connector->send->shared[rPtr...]
           rPtr=A->col_coupler->connector->send->OffsetInShared[k]
       on q.
       P_ext is represented by two sparse matrices P_ext_main and
       P_ext_couple */
    Paso_Preconditioner_AMG_extendB(A, P);

    /* count the number of cols in P_ext_couple, resize the pattern of
       sparse matrix P_ext_couple with new compressed order, and then
       build the col id mapping from P->col_coupleBlock to
       P_ext_couple */
    num_Pmain_cols = P->mainBlock->numCols;
    if (size > 1) {
        num_Pcouple_cols = P->col_coupleBlock->numCols;
        num_Acouple_cols = A->col_coupleBlock->numCols;
        sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows];
    } else {
        num_Pcouple_cols = 0;
        num_Acouple_cols = 0;
        sum = 0;
    }
    num_A_cols = A->mainBlock->numCols + num_Acouple_cols;
    offset = P->col_distribution->first_component[rank];
    num_Pext_cols = 0;
    if (P->global_id) {
        /* first count the number of cols "num_Pext_cols" in both P_ext_couple
           and P->col_coupleBlock */
        iptr = 0;
        if (num_Pcouple_cols || sum > 0) {
            temp = new index_t[num_Pcouple_cols+sum];
#pragma omp parallel for lastprivate(iptr) schedule(static)
            for (iptr=0; iptr<sum; iptr++){
                temp[iptr] = P->remote_coupleBlock->pattern->index[iptr];
            }
            for (j=0; j<num_Pcouple_cols; j++, iptr++){
                temp[iptr] = P->global_id[j];
            }
        }
        if (iptr) {
#ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
#else
            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
#endif
            num_Pext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_Pext_cols++] = i;
                }
            }
        }
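
        /* The pass above is a standard sort-then-unique compaction: e.g.
           (an assumed example) ids {9,3,7,3} are sorted to {3,3,7,9} and
           compacted in place to {3,7,9} with num_Pext_cols = 3. */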
        /* resize the pattern of P_ext_couple */
        if (num_Pext_cols) {
            global_id_P = new index_t[num_Pext_cols];
#pragma omp parallel for private(i) schedule(static)
            for (i=0; i<num_Pext_cols; i++)
                global_id_P[i] = temp[i];
        }
        if (num_Pcouple_cols || sum > 0)
            delete[] temp;
#pragma omp parallel for private(i, where_p) schedule(static)
        for (i=0; i<sum; i++) {
            where_p = (index_t *)bsearch(
                        &(P->remote_coupleBlock->pattern->index[i]),
                        global_id_P, num_Pext_cols,
                        sizeof(index_t), paso::comparIndex);
            P->remote_coupleBlock->pattern->index[i] =
                        (index_t)(where_p - global_id_P);
        }

        /* build the mapping */
        if (num_Pcouple_cols) {
            Pcouple_to_Pext = new index_t[num_Pcouple_cols];
            iptr = 0;
            for (i=0; i<num_Pext_cols; i++)
                if (global_id_P[i] == P->global_id[iptr]) {
                    Pcouple_to_Pext[iptr++] = i;
                    if (iptr == num_Pcouple_cols) break;
                }
        }
    }

    /* allocate and initialise the markers */
    sum = num_Pext_cols + num_Pmain_cols;
    P_marker = new index_t[sum];
    A_marker = new index_t[num_A_cols];
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;

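    /* The construction below is the usual two-pass CSR build: a symbolic
       pass that only counts non-zeros per row, then a numeric pass that
       fills indices and values. A_marker[i2] == i_r records that RA[i_r,i2]
       was already visited in the current row, and P_marker[i_c] < row_marker
       means column i_c has not yet been created in the current row. */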
    /* Now, count the size of RAP_ext. Start with rows in R_couple */
    sum = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++){
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_couple->pattern->index[j1];
            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done to block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        /* note that we need to map the column index in
                           P->col_coupleBlock back into the column index in
                           P_ext_couple */
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            sum++;
                        }
                    }
                }
            }
        }
    }

    /* Now we have the number of non-zero elements in RAP_ext, allocate
       RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
    RAP_ext_ptr = new index_t[num_Pcouple_cols+1];
    RAP_ext_idx = new index_t[sum];
    RAP_ext_val = new double[sum * block_size];
    RA_val = new double[block_size];
    RAP_val = new double[block_size];

    /* Fill in the RAP_ext_ptr, RAP_ext_idx, RAP_val */
    sum = num_Pext_cols + num_Pmain_cols;
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;
    sum = 0;
    RAP_ext_ptr[0] = 0;
    for (i_r=0; i_r<num_Pcouple_cols; i_r++){
        row_marker = sum;
        /* then, loop over elements in row i_r of R_couple */
        j1_ub = R_couple->pattern->ptr[i_r+1];
        for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){
            i1 = R_couple->pattern->index[j1];
            R_val = &(R_couple->val[j1*P_block_size]);

            /* then, loop over elements in row i1 of A->col_coupleBlock */
            j2_ub = A->col_coupleBlock->pattern->ptr[i1+1];
            for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->col_coupleBlock->pattern->index[j2];
                temp_val = &(A->col_coupleBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                /* check whether entry RA[i_r, i2] has been previously visited.
                   A new RAP entry is possible only if entry RA[i_r, i2] has
                   not been visited yet */
                if (A_marker[i2] != i_r) {
                    /* first, mark entry RA[i_r,i2] as visited */
                    A_marker[i2] = i_r;

                    /* then loop over elements in row i2 of P_ext_main */
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        /* check whether entry RAP[i_r,i_c] has been created.
                           If not yet created, create the entry and increase
                           the total number of elements in RAP_ext */
                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                    /* loop over elements in row i2 of P_ext_couple, do the same */
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }

                /* If entry RA[i_r, i2] is visited, no new RAP entry is created.
                   Only the contributions are added. */
                } else {
                    j3_ub = P->row_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->row_coupleBlock->pattern->index[j3];
                        temp_val = &(P->row_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols;
                        temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }

            /* now loop over elements in row i1 of A->mainBlock, repeat
               the process we have done to block A->col_coupleBlock */
            j2_ub = A->mainBlock->pattern->ptr[i1+1];
            for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) {
                i2 = A->mainBlock->pattern->index[j2];
                temp_val = &(A->mainBlock->val[j2*block_size]);
                for (irb=0; irb<row_block_size; irb++)
                    for (icb=0; icb<col_block_size; icb++)
                        RA_val[irb+row_block_size*icb] = ZERO;
                for (irb=0; irb<P_block_size; irb++) {
                    rtmp = R_val[irb];
                    for (icb=0; icb<col_block_size; icb++) {
                        RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb];
                    }
                }

                if (A_marker[i2 + num_Acouple_cols] != i_r) {
                    A_marker[i2 + num_Acouple_cols] = i_r;
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = i_c + offset;
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        if (P_marker[i_c] < row_marker) {
                            P_marker[i_c] = sum;
                            memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double));
                            RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols];
                            sum++;
                        } else {
                            temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                            for (ib=0; ib<block_size; ib++) {
                                temp_val[ib] += RAP_val[ib];
                            }
                        }
                    }
                } else {
                    j3_ub = P->mainBlock->pattern->ptr[i2+1];
                    for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = P->mainBlock->pattern->index[j3];
                        temp_val = &(P->mainBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                    j3_ub = P->col_coupleBlock->pattern->ptr[i2+1];
                    for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) {
                        i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols;
                        temp_val = &(P->col_coupleBlock->val[j3*P_block_size]);
                        for (irb=0; irb<row_block_size; irb++)
                            for (icb=0; icb<col_block_size; icb++)
                                RAP_val[irb+row_block_size*icb] = ZERO;
                        for (icb=0; icb<P_block_size; icb++) {
                            rtmp = temp_val[icb];
                            for (irb=0; irb<row_block_size; irb++) {
                                RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp;
                            }
                        }

                        temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]);
                        for (ib=0; ib<block_size; ib++) {
                            temp_val[ib] += RAP_val[ib];
                        }
                    }
                }
            }
        }
        RAP_ext_ptr[i_r+1] = sum;
    }
    delete[] P_marker;
    delete[] Pcouple_to_Pext;

    /* Now we have part of RAP[r,c], where row "r" is the list of rows
       given by the column list of P->col_coupleBlock, and column "c" is
       the list of columns which possibly covers the whole column range
       of the system matrix P. This part of the data is to be passed to
       neighbouring processors, and added to the corresponding RAP[r,c]
       entries in the neighbouring processors */
    Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx,
                &RAP_ext_val, global_id_P, block_size);

    num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors];
    sum = RAP_ext_ptr[num_RAPext_rows];
    num_RAPext_cols = 0;
    if (num_Pext_cols || sum > 0) {
        temp = new index_t[num_Pext_cols+sum];
        j1_ub = offset + num_Pmain_cols;
        for (i=0, iptr=0; i<sum; i++) {
            if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */
                temp[iptr++] = RAP_ext_idx[i]; /* XXX */
        }
        for (j=0; j<num_Pext_cols; j++, iptr++){
            temp[iptr] = global_id_P[j];
        }

        if (iptr) {
#ifdef USE_QSORTG
            qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
#else
            qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex);
#endif
            num_RAPext_cols = 1;
            i = temp[0];
            for (j=1; j<iptr; j++) {
                if (temp[j] > i) {
                    i = temp[j];
                    temp[num_RAPext_cols++] = i;
                }
            }
        }
    }

    /* resize the pattern of P_ext_couple */
    if (num_RAPext_cols) {
        global_id_RAP = new index_t[num_RAPext_cols];
#pragma omp parallel for private(i) schedule(static)
        for (i=0; i<num_RAPext_cols; i++)
            global_id_RAP[i] = temp[i];
    }
    if (num_Pext_cols || sum > 0)
        delete[] temp;
    j1_ub = offset + num_Pmain_cols;
#pragma omp parallel for private(i, where_p) schedule(static)
    for (i=0; i<sum; i++) {
        if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){
            where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP,
                        /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex);
            RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP);
        } else
            RAP_ext_idx[i] = RAP_ext_idx[i] - offset;
    }

    /* build the mapping */
    if (num_Pcouple_cols) {
        Pcouple_to_RAP = new index_t[num_Pcouple_cols];
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == P->global_id[iptr]) {
                Pcouple_to_RAP[iptr++] = i;
                if (iptr == num_Pcouple_cols) break;
            }
    }

    if (num_Pext_cols) {
        Pext_to_RAP = new index_t[num_Pext_cols];
        iptr = 0;
        for (i=0; i<num_RAPext_cols; i++)
            if (global_id_RAP[i] == global_id_P[iptr]) {
                Pext_to_RAP[iptr++] = i;
                if (iptr == num_Pext_cols) break;
            }
    }

    if (global_id_P){
        delete[] global_id_P;
        global_id_P = NULL;
    }

    /* allocate and initialise the markers */
    sum = num_RAPext_cols + num_Pmain_cols;
    P_marker = new index_t[sum];
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<sum; i++) P_marker[i] = -1;
#pragma omp parallel for private(i) schedule(static)
    for (i=0; i<num_A_cols; i++) A_marker[i] = -1;

1175 | /* Now, count the size of RAP. Start with rows in R_main */ |
1176 | num_neighbors = P->col_coupler->connector->send->numNeighbors; |
1177 | offsetInShared = P->col_coupler->connector->send->offsetInShared; |
1178 | shared = P->col_coupler->connector->send->shared; |
1179 | i = 0; |
1180 | j = 0; |
1181 | for (i_r=0; i_r<num_Pmain_cols; i_r++){ |
1182 | /* Mark the start of row for both main block and couple block */ |
1183 | row_marker = i; |
1184 | row_marker_ext = j; |
1185 | |
1186 | /* Mark the diagonal element RAP[i_r, i_r], and other elements |
1187 | in RAP_ext */ |
1188 | P_marker[i_r] = i; |
1189 | i++; |
1190 | for (j1=0; j1<num_neighbors; j1++) { |
1191 | for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) { |
1192 | if (shared[j2] == i_r) { |
1193 | for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) { |
1194 | i_c = RAP_ext_idx[k]; |
1195 | if (i_c < num_Pmain_cols) { |
1196 | if (P_marker[i_c] < row_marker) { |
1197 | P_marker[i_c] = i; |
1198 | i++; |
1199 | } |
1200 | } else { |
1201 | if (P_marker[i_c] < row_marker_ext) { |
1202 | P_marker[i_c] = j; |
1203 | j++; |
1204 | } |
1205 | } |
1206 | } |
1207 | break; |
1208 | } |
1209 | } |
1210 | } |
1211 | |
1212 | /* then, loop over elements in row i_r of R_main */ |
1213 | j1_ub = R_main->pattern->ptr[i_r+1]; |
1214 | for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){ |
1215 | i1 = R_main->pattern->index[j1]; |
1216 | |
1217 | /* then, loop over elements in row i1 of A->col_coupleBlock */ |
1218 | j2_ub = A->col_coupleBlock->pattern->ptr[i1+1]; |
1219 | for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
1220 | i2 = A->col_coupleBlock->pattern->index[j2]; |
1221 | |
1222 | /* check whether entry RA[i_r, i2] has been previously visited. |
1223 | RAP new entry is possible only if entry RA[i_r, i2] has not |
1224 | been visited yet */ |
1225 | if (A_marker[i2] != i_r) { |
1226 | /* first, mark entry RA[i_r,i2] as visited */ |
1227 | A_marker[i2] = i_r; |
1228 | |
1229 | /* then loop over elements in row i2 of P_ext_main */ |
1230 | j3_ub = P->row_coupleBlock->pattern->ptr[i2+1]; |
1231 | for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1232 | i_c = P->row_coupleBlock->pattern->index[j3]; |
1233 | |
1234 | /* check whether entry RAP[i_r,i_c] has been created. |
1235 | If not yet created, create the entry and increase |
1236 | the total number of elements in RAP_ext */ |
1237 | if (P_marker[i_c] < row_marker) { |
1238 | P_marker[i_c] = i; |
1239 | i++; |
1240 | } |
1241 | } |
1242 | |
1243 | /* loop over elements in row i2 of P_ext_couple, do the same */ |
1244 | j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1]; |
1245 | for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1246 | i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
1247 | if (P_marker[i_c] < row_marker_ext) { |
1248 | P_marker[i_c] = j; |
1249 | j++; |
1250 | } |
1251 | } |
1252 | } |
1253 | } |
1254 | |
1255 | /* now loop over elements in row i1 of A->mainBlock, repeat |
1256 | the process we have done to block A->col_coupleBlock */ |
1257 | j2_ub = A->mainBlock->pattern->ptr[i1+1]; |
1258 | for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
1259 | i2 = A->mainBlock->pattern->index[j2]; |
1260 | if (A_marker[i2 + num_Acouple_cols] != i_r) { |
1261 | A_marker[i2 + num_Acouple_cols] = i_r; |
1262 | j3_ub = P->mainBlock->pattern->ptr[i2+1]; |
1263 | for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1264 | i_c = P->mainBlock->pattern->index[j3]; |
1265 | if (P_marker[i_c] < row_marker) { |
1266 | P_marker[i_c] = i; |
1267 | i++; |
1268 | } |
1269 | } |
1270 | j3_ub = P->col_coupleBlock->pattern->ptr[i2+1]; |
1271 | for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1272 | /* note that we need to map the column index in |
1273 | P->col_coupleBlock back into the column index in |
1274 | P_ext_couple */ |
1275 | i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
1276 | if (P_marker[i_c] < row_marker_ext) { |
1277 | P_marker[i_c] = j; |
1278 | j++; |
1279 | } |
1280 | } |
1281 | } |
1282 | } |
1283 | } |
1284 | } |
1285 | |
1286 | /* Now we have the number of non-zero elements in RAP_main and RAP_couple. |
1287 | Allocate RAP_main_ptr, RAP_main_idx and RAP_main_val for RAP_main, |
1288 | and allocate RAP_couple_ptr, RAP_couple_idx and RAP_couple_val for |
1289 | RAP_couple */ |
1290 | RAP_main_ptr = new index_t[num_Pmain_cols+1]; |
1291 | RAP_main_idx = new index_t[i]; |
1292 | RAP_main_val = new double[i * block_size]; |
1293 | RAP_couple_ptr = new index_t[num_Pmain_cols+1]; |
1294 | RAP_couple_idx = new index_t[j]; |
1295 | RAP_couple_val = new double[j * block_size]; |
1296 | |
1297 | RAP_main_ptr[num_Pmain_cols] = i; |
1298 | RAP_couple_ptr[num_Pmain_cols] = j; |
1299 | |
1300 | #pragma omp parallel for private(i) schedule(static) |
1301 | for (i=0; i<sum; i++) P_marker[i] = -1; |
1302 | #pragma omp parallel for private(i) schedule(static) |
1303 | for (i=0; i<num_A_cols; i++) A_marker[i] = -1; |
1304 | |
1305 | /* Now, Fill in the data for RAP_main and RAP_couple. Start with rows |
1306 | in R_main */ |
1307 | i = 0; |
1308 | j = 0; |
1309 | for (i_r=0; i_r<num_Pmain_cols; i_r++){ |
1310 | /* Mark the start of row for both main block and couple block */ |
1311 | row_marker = i; |
1312 | row_marker_ext = j; |
1313 | RAP_main_ptr[i_r] = row_marker; |
1314 | RAP_couple_ptr[i_r] = row_marker_ext; |
1315 | |
1316 | /* Mark and setup the diagonal element RAP[i_r, i_r], and elements |
1317 | in row i_r of RAP_ext */ |
1318 | P_marker[i_r] = i; |
1319 | for (ib=0; ib<block_size; ib++) |
1320 | RAP_main_val[i*block_size+ib] = ZERO; |
1321 | RAP_main_idx[i] = i_r; |
1322 | i++; |
1323 | |
1324 | for (j1=0; j1<num_neighbors; j1++) { |
1325 | for (j2=offsetInShared[j1]; j2<offsetInShared[j1+1]; j2++) { |
1326 | if (shared[j2] == i_r) { |
1327 | for (k=RAP_ext_ptr[j2]; k<RAP_ext_ptr[j2+1]; k++) { |
1328 | i_c = RAP_ext_idx[k]; |
1329 | if (i_c < num_Pmain_cols) { |
1330 | if (P_marker[i_c] < row_marker) { |
1331 | P_marker[i_c] = i; |
1332 | memcpy(&(RAP_main_val[i*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double)); |
1333 | RAP_main_idx[i] = i_c; |
1334 | i++; |
1335 | } else { |
1336 | t1_val = &(RAP_ext_val[k*block_size]); |
1337 | t2_val = &(RAP_main_val[P_marker[i_c]*block_size]); |
1338 | for (ib=0; ib<block_size; ib++) |
1339 | t2_val[ib] += t1_val[ib]; |
1340 | } |
1341 | } else { |
1342 | if (P_marker[i_c] < row_marker_ext) { |
1343 | P_marker[i_c] = j; |
1344 | memcpy(&(RAP_couple_val[j*block_size]), &(RAP_ext_val[k*block_size]), block_size*sizeof(double)); |
1345 | RAP_couple_idx[j] = i_c - num_Pmain_cols; |
1346 | j++; |
1347 | } else { |
1348 | t1_val = &(RAP_ext_val[k*block_size]); |
1349 | t2_val = &(RAP_couple_val[P_marker[i_c]*block_size]); |
1350 | for (ib=0; ib<block_size; ib++) |
1351 | t2_val[ib] += t1_val[ib]; |
1352 | } |
1353 | } |
1354 | } |
1355 | break; |
1356 | } |
1357 | } |
1358 | } |
1359 | |
1360 | /* then, loop over elements in row i_r of R_main */ |
1361 | j1_ub = R_main->pattern->ptr[i_r+1]; |
1362 | for (j1=R_main->pattern->ptr[i_r]; j1<j1_ub; j1++){ |
1363 | i1 = R_main->pattern->index[j1]; |
1364 | R_val = &(R_main->val[j1*P_block_size]); |
1365 | |
1366 | /* then, loop over elements in row i1 of A->col_coupleBlock */ |
1367 | j2_ub = A->col_coupleBlock->pattern->ptr[i1+1]; |
1368 | for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
1369 | i2 = A->col_coupleBlock->pattern->index[j2]; |
1370 | temp_val = &(A->col_coupleBlock->val[j2*block_size]); |
1371 | for (irb=0; irb<row_block_size; irb++) |
1372 | for (icb=0; icb<col_block_size; icb++) |
1373 | RA_val[irb+row_block_size*icb]=ZERO; |
1374 | for (irb=0; irb<P_block_size; irb++) { |
1375 | rtmp=R_val[irb]; |
1376 | for (icb=0; icb<col_block_size; icb++) { |
1377 | RA_val[irb+col_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb]; |
1378 | } |
1379 | } |
1380 | |
1381 | |
1382 | /* check whether entry RA[i_r, i2] has been previously visited. |
1383 | RAP new entry is possible only if entry RA[i_r, i2] has not |
1384 | been visited yet */ |
1385 | if (A_marker[i2] != i_r) { |
1386 | /* first, mark entry RA[i_r,i2] as visited */ |
1387 | A_marker[i2] = i_r; |
1388 | |
1389 | /* then loop over elements in row i2 of P_ext_main */ |
1390 | j3_ub = P->row_coupleBlock->pattern->ptr[i2+1]; |
1391 | for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1392 | i_c = P->row_coupleBlock->pattern->index[j3]; |
1393 | temp_val = &(P->row_coupleBlock->val[j3*P_block_size]); |
1394 | for (irb=0; irb<row_block_size; irb++) |
1395 | for (icb=0; icb<col_block_size; icb++) |
1396 | RAP_val[irb+row_block_size*icb]=ZERO; |
1397 | for (icb=0; icb<P_block_size; icb++) { |
1398 | rtmp=temp_val[icb]; |
1399 | for (irb=0; irb<row_block_size; irb++) { |
1400 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1401 | } |
1402 | } |
1403 | |
1404 | |
1405 | /* check whether entry RAP[i_r,i_c] has been created. |
1406 | If not yet created, create the entry and increase |
1407 | the total number of elements in RAP_ext */ |
1408 | if (P_marker[i_c] < row_marker) { |
1409 | P_marker[i_c] = i; |
1410 | memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double)); |
1411 | RAP_main_idx[i] = i_c; |
1412 | i++; |
1413 | } else { |
1414 | temp_val = &(RAP_main_val[P_marker[i_c] * block_size]); |
1415 | for (ib=0; ib<block_size; ib++) { |
1416 | temp_val[ib] += RAP_val[ib]; |
1417 | } |
1418 | } |
1419 | } |
1420 | |
1421 | /* loop over elements in row i2 of P_ext_couple, do the same */ |
1422 | j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1]; |
1423 | for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1424 | i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
1425 | temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]); |
1426 | for (irb=0; irb<row_block_size; irb++) |
1427 | for (icb=0; icb<col_block_size; icb++) |
1428 | RAP_val[irb+row_block_size*icb]=ZERO; |
1429 | for (icb=0; icb<P_block_size; icb++) { |
1430 | rtmp=temp_val[icb]; |
1431 | for (irb=0; irb<row_block_size; irb++) { |
1432 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1433 | } |
1434 | } |
1435 | |
1436 | if (P_marker[i_c] < row_marker_ext) { |
1437 | P_marker[i_c] = j; |
1438 | memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double)); |
1439 | RAP_couple_idx[j] = i_c - num_Pmain_cols; |
1440 | j++; |
1441 | } else { |
1442 | temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]); |
1443 | for (ib=0; ib<block_size; ib++) { |
1444 | temp_val[ib] += RAP_val[ib]; |
1445 | } |
1446 | } |
1447 | } |
1448 | |
1449 | /* If entry RA[i_r, i2] has already been visited, no new RAP entry
1450 | is created; only the contributions are added. */
1451 | } else { |
1452 | j3_ub = P->row_coupleBlock->pattern->ptr[i2+1]; |
1453 | for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1454 | i_c = P->row_coupleBlock->pattern->index[j3]; |
1455 | temp_val = &(P->row_coupleBlock->val[j3*P_block_size]); |
1456 | for (irb=0; irb<row_block_size; irb++) |
1457 | for (icb=0; icb<col_block_size; icb++) |
1458 | RAP_val[irb+row_block_size*icb]=ZERO; |
1459 | for (icb=0; icb<P_block_size; icb++) { |
1460 | rtmp=temp_val[icb]; |
1461 | for (irb=0; irb<row_block_size; irb++) { |
1462 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1463 | } |
1464 | } |
1465 | |
1466 | temp_val = &(RAP_main_val[P_marker[i_c] * block_size]); |
1467 | for (ib=0; ib<block_size; ib++) { |
1468 | temp_val[ib] += RAP_val[ib]; |
1469 | } |
1470 | } |
1471 | j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1]; |
1472 | for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1473 | i_c = Pext_to_RAP[P->remote_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
1474 | temp_val = &(P->remote_coupleBlock->val[j3*P_block_size]); |
1475 | for (irb=0; irb<row_block_size; irb++) |
1476 | for (icb=0; icb<col_block_size; icb++) |
1477 | RAP_val[irb+row_block_size*icb]=ZERO; |
1478 | for (icb=0; icb<P_block_size; icb++) { |
1479 | rtmp=temp_val[icb]; |
1480 | for (irb=0; irb<row_block_size; irb++) { |
1481 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1482 | } |
1483 | } |
1484 | |
1485 | temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]); |
1486 | for (ib=0; ib<block_size; ib++) { |
1487 | temp_val[ib] += RAP_val[ib]; |
1488 | } |
1489 | } |
1490 | } |
1491 | } |
1492 | |
1493 | /* now loop over elements in row i1 of A->mainBlock, repeat |
1494 | the process we have done to block A->col_coupleBlock */ |
1495 | j2_ub = A->mainBlock->pattern->ptr[i1+1]; |
1496 | for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
1497 | i2 = A->mainBlock->pattern->index[j2]; |
1498 | temp_val = &(A->mainBlock->val[j2*block_size]); |
1499 | for (irb=0; irb<row_block_size; irb++) |
1500 | for (icb=0; icb<col_block_size; icb++) |
1501 | RA_val[irb+row_block_size*icb]=ZERO; |
1502 | for (irb=0; irb<P_block_size; irb++) { |
1503 | rtmp=R_val[irb]; |
1504 | for (icb=0; icb<col_block_size; icb++) { |
1505 | RA_val[irb+row_block_size*icb] += rtmp * temp_val[irb+col_block_size*icb]; |
1506 | } |
1507 | } |
1508 | |
1509 | if (A_marker[i2 + num_Acouple_cols] != i_r) { |
1510 | A_marker[i2 + num_Acouple_cols] = i_r; |
1511 | j3_ub = P->mainBlock->pattern->ptr[i2+1]; |
1512 | for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1513 | i_c = P->mainBlock->pattern->index[j3]; |
1514 | temp_val = &(P->mainBlock->val[j3*P_block_size]); |
1515 | for (irb=0; irb<row_block_size; irb++) |
1516 | for (icb=0; icb<col_block_size; icb++) |
1517 | RAP_val[irb+row_block_size*icb]=ZERO; |
1518 | for (icb=0; icb<P_block_size; icb++) { |
1519 | rtmp=temp_val[icb]; |
1520 | for (irb=0; irb<row_block_size; irb++) { |
1521 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1522 | } |
1523 | } |
1524 | if (P_marker[i_c] < row_marker) { |
1525 | P_marker[i_c] = i; |
1526 | memcpy(&(RAP_main_val[i*block_size]), RAP_val, block_size*sizeof(double)); |
1527 | RAP_main_idx[i] = i_c; |
1528 | i++; |
1529 | } else { |
1530 | temp_val = &(RAP_main_val[P_marker[i_c] * block_size]); |
1531 | for (ib=0; ib<block_size; ib++) { |
1532 | temp_val[ib] += RAP_val[ib]; |
1533 | } |
1534 | } |
1535 | } |
1536 | j3_ub = P->col_coupleBlock->pattern->ptr[i2+1]; |
1537 | for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1538 | /* note that we need to map the column index in |
1539 | P->col_coupleBlock back into the column index in |
1540 | P_ext_couple */ |
1541 | i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
1542 | temp_val = &(P->col_coupleBlock->val[j3*P_block_size]); |
1543 | for (irb=0; irb<row_block_size; irb++) |
1544 | for (icb=0; icb<col_block_size; icb++) |
1545 | RAP_val[irb+row_block_size*icb]=ZERO; |
1546 | for (icb=0; icb<P_block_size; icb++) { |
1547 | rtmp=temp_val[icb]; |
1548 | for (irb=0; irb<row_block_size; irb++) { |
1549 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1550 | } |
1551 | } |
1552 | |
1553 | if (P_marker[i_c] < row_marker_ext) { |
1554 | P_marker[i_c] = j; |
1555 | memcpy(&(RAP_couple_val[j*block_size]), RAP_val, block_size*sizeof(double)); |
1556 | RAP_couple_idx[j] = i_c - num_Pmain_cols; |
1557 | j++; |
1558 | } else { |
1559 | temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]); |
1560 | for (ib=0; ib<block_size; ib++) { |
1561 | temp_val[ib] += RAP_val[ib]; |
1562 | } |
1563 | } |
1564 | } |
1565 | |
1566 | } else { |
1567 | j3_ub = P->mainBlock->pattern->ptr[i2+1]; |
1568 | for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1569 | i_c = P->mainBlock->pattern->index[j3]; |
1570 | temp_val = &(P->mainBlock->val[j3*P_block_size]); |
1571 | for (irb=0; irb<row_block_size; irb++) |
1572 | for (icb=0; icb<col_block_size; icb++) |
1573 | RAP_val[irb+row_block_size*icb]=ZERO; |
1574 | for (icb=0; icb<P_block_size; icb++) { |
1575 | rtmp=temp_val[icb]; |
1576 | for (irb=0; irb<row_block_size; irb++) { |
1577 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1578 | } |
1579 | } |
1580 | |
1581 | temp_val = &(RAP_main_val[P_marker[i_c] * block_size]); |
1582 | for (ib=0; ib<block_size; ib++) { |
1583 | temp_val[ib] += RAP_val[ib]; |
1584 | } |
1585 | } |
1586 | j3_ub = P->col_coupleBlock->pattern->ptr[i2+1]; |
1587 | for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
1588 | i_c = Pcouple_to_RAP[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
1589 | temp_val = &(P->col_coupleBlock->val[j3*P_block_size]); |
1590 | for (irb=0; irb<row_block_size; irb++) |
1591 | for (icb=0; icb<col_block_size; icb++) |
1592 | RAP_val[irb+row_block_size*icb]=ZERO; |
1593 | for (icb=0; icb<P_block_size; icb++) { |
1594 | rtmp = temp_val[icb]; |
1595 | for (irb=0; irb<row_block_size; irb++) { |
1596 | RAP_val[irb+row_block_size*icb] += RA_val[irb+row_block_size*icb] * rtmp; |
1597 | } |
1598 | } |
1599 | |
1600 | temp_val = &(RAP_couple_val[P_marker[i_c] * block_size]); |
1601 | for (ib=0; ib<block_size; ib++) { |
1602 | temp_val[ib] += RAP_val[ib]; |
1603 | } |
1604 | } |
1605 | } |
1606 | } |
1607 | } |
1608 | |
1609 | /* sort RAP_XXXX_idx and reorder RAP_XXXX_val accordingly */ |
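/* the entries appended to row i_r since row_marker are in insertion
   order, but the pattern needs ascending column indices within each
   row. So the new indices are sorted into a scratch array and the
   corresponding value blocks are permuted to match; P_marker is
   updated so it keeps pointing at the final position of each
   column's value block. */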
1610 | if (i > row_marker) { |
1611 | offset = i - row_marker; |
1612 | temp = new index_t[offset]; |
1613 | #pragma omp parallel for schedule(static) private(iptr) |
1614 | for (iptr=0; iptr<offset; iptr++) |
1615 | temp[iptr] = RAP_main_idx[row_marker+iptr]; |
1616 | if (offset > 0) { |
1617 | #ifdef USE_QSORTG |
1618 | qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex); |
1619 | #else |
1620 | qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex); |
1621 | #endif |
1622 | } |
1623 | temp_val = new double[offset * block_size]; |
1624 | #pragma omp parallel for schedule(static) private(iptr,k) |
1625 | for (iptr=0; iptr<offset; iptr++){ |
1626 | k = P_marker[temp[iptr]]; |
1627 | memcpy(&(temp_val[iptr*block_size]), &(RAP_main_val[k*block_size]), block_size*sizeof(double)); |
1628 | P_marker[temp[iptr]] = iptr + row_marker; |
1629 | } |
1630 | #pragma omp parallel for schedule(static) private(iptr) |
1631 | for (iptr=0; iptr<offset; iptr++){ |
1632 | RAP_main_idx[row_marker+iptr] = temp[iptr]; |
1633 | memcpy(&(RAP_main_val[(row_marker+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double)); |
1634 | } |
1635 | delete[] temp; |
1636 | delete[] temp_val; |
1637 | } |
1638 | if (j > row_marker_ext) { |
1639 | offset = j - row_marker_ext; |
1640 | temp = new index_t[offset]; |
1641 | #pragma omp parallel for schedule(static) private(iptr) |
1642 | for (iptr=0; iptr<offset; iptr++) |
1643 | temp[iptr] = RAP_couple_idx[row_marker_ext+iptr]; |
1644 | if (offset > 0) { |
1645 | #ifdef USE_QSORTG |
1646 | qsortG(temp, (size_t)offset, sizeof(index_t), paso::comparIndex); |
1647 | #else |
1648 | qsort(temp, (size_t)offset, sizeof(index_t), paso::comparIndex); |
1649 | #endif |
1650 | } |
1651 | temp_val = new double[offset * block_size]; |
1652 | #pragma omp parallel for schedule(static) private(iptr, k) |
1653 | for (iptr=0; iptr<offset; iptr++){ |
1654 | k = P_marker[temp[iptr] + num_Pmain_cols]; |
1655 | memcpy(&(temp_val[iptr*block_size]), &(RAP_couple_val[k*block_size]), block_size*sizeof(double)); |
1656 | P_marker[temp[iptr] + num_Pmain_cols] = iptr + row_marker_ext; |
1657 | } |
1658 | #pragma omp parallel for schedule(static) private(iptr) |
1659 | for (iptr=0; iptr<offset; iptr++){ |
1660 | RAP_couple_idx[row_marker_ext+iptr] = temp[iptr]; |
1661 | memcpy(&(RAP_couple_val[(row_marker_ext+iptr)*block_size]), &(temp_val[iptr*block_size]), block_size*sizeof(double)); |
1662 | } |
1663 | delete[] temp; |
1664 | delete[] temp_val; |
1665 | } |
1666 | } |
1667 | |
1668 | delete[] RA_val; |
1669 | delete[] RAP_val; |
1670 | delete[] A_marker; |
1671 | delete[] Pext_to_RAP; |
1672 | delete[] Pcouple_to_RAP; |
1673 | delete[] RAP_ext_ptr; |
1674 | delete[] RAP_ext_idx; |
1675 | delete[] RAP_ext_val; |
1676 | paso::SparseMatrix_free(R_main); |
1677 | paso::SparseMatrix_free(R_couple); |
1678 | |
1679 | /* Check whether there are empty columns in RAP_couple */ |
1680 | #pragma omp parallel for schedule(static) private(i) |
1681 | for (i=0; i<num_RAPext_cols; i++) P_marker[i] = 1; |
1682 | /* the number of non-empty columns is stored in "k" */
1683 | k = 0; |
1684 | j = RAP_couple_ptr[num_Pmain_cols]; |
1685 | for (i=0; i<j; i++) { |
1686 | i1 = RAP_couple_idx[i]; |
1687 | if (P_marker[i1]) { |
1688 | P_marker[i1] = 0; |
1689 | k++; |
1690 | } |
1691 | } |
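/* a tiny example of the compression below (values illustrative only):
   if num_RAPext_cols==4 with global ids {7,12,15,21} but id 12 is
   never referenced by RAP_couple_idx, the remaining columns are
   renumbered 0,1,2 and global_id_RAP shrinks to {7,15,21}. */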
1692 | |
1693 | /* empty columns were found, compress them out */
1694 | if (k < num_RAPext_cols) { |
1695 | temp = new index_t[k]; |
1696 | k = 0; |
1697 | for (i=0; i<num_RAPext_cols; i++) |
1698 | if (!P_marker[i]) { |
1699 | P_marker[i] = k; |
1700 | temp[k] = global_id_RAP[i]; |
1701 | k++; |
1702 | } |
1703 | #pragma omp parallel for schedule(static) private(i, i1) |
1704 | for (i=0; i<j; i++) { |
1705 | i1 = RAP_couple_idx[i]; |
1706 | RAP_couple_idx[i] = P_marker[i1]; |
1707 | } |
1708 | num_RAPext_cols = k; |
1709 | delete[] global_id_RAP; |
1710 | global_id_RAP = temp; |
1711 | } |
1712 | delete[] P_marker; |
1713 | |
1714 | /******************************************************/ |
1715 | /* Start to create the coarse level System Matrix A_c */ |
1716 | /******************************************************/ |
1717 | /* first, prepare the sender/receiver for the col_connector */ |
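/* the columns of RAP_couple carry global ids, sorted ascending in
   global_id_RAP; walking them against the cumulative distribution
   "dist" assigns each column to the rank q with dist[q] <= id <
   dist[q+1], and the consecutive runs directly yield the neighbours
   and receive lengths of the column coupler. */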
1718 | dist = P->pattern->input_distribution->first_component; |
1719 | recv_len = new dim_t[size]; |
1720 | send_len = new dim_t[size]; |
1721 | neighbor = new Esys_MPI_rank[size]; |
1722 | offsetInShared = new index_t[size+1]; |
1723 | shared = new index_t[num_RAPext_cols]; |
1724 | memset(recv_len, 0, sizeof(dim_t) * size); |
1725 | memset(send_len, 0, sizeof(dim_t) * size); |
1726 | num_neighbors = 0; |
1727 | offsetInShared[0] = 0; |
1728 | for (i=0, j=0, k=dist[j+1]; i<num_RAPext_cols; i++) { |
1729 | shared[i] = i + num_Pmain_cols; |
1730 | if (k <= global_id_RAP[i]) { |
1731 | if (recv_len[j] > 0) { |
1732 | neighbor[num_neighbors] = j; |
1733 | num_neighbors ++; |
1734 | offsetInShared[num_neighbors] = i; |
1735 | } |
1736 | while (k <= global_id_RAP[i]) { |
1737 | j++; |
1738 | k = dist[j+1]; |
1739 | } |
1740 | } |
1741 | recv_len[j] ++; |
1742 | } |
1743 | if (recv_len[j] > 0) { |
1744 | neighbor[num_neighbors] = j; |
1745 | num_neighbors ++; |
1746 | offsetInShared[num_neighbors] = i; |
1747 | } |
1748 | |
1749 | paso::SharedComponents_ptr recv(new paso::SharedComponents( |
1750 | num_Pmain_cols, num_neighbors, neighbor, shared, |
1751 | offsetInShared, 1, 0, mpi_info)); |
1752 | |
1753 | #ifdef ESYS_MPI |
1754 | MPI_Alltoall(recv_len, 1, MPI_INT, send_len, 1, MPI_INT, mpi_info->comm); |
1755 | #endif |
1756 | |
1757 | #ifdef ESYS_MPI |
1758 | mpi_requests=new MPI_Request[size*2]; |
1759 | mpi_stati=new MPI_Status[size*2]; |
1760 | #else |
1761 | mpi_requests=new int[size*2]; |
1762 | mpi_stati=new int[size*2]; |
1763 | #endif |
1764 | num_neighbors = 0; |
1765 | j = 0; |
1766 | offsetInShared[0] = 0; |
1767 | for (i=0; i<size; i++) { |
1768 | if (send_len[i] > 0) { |
1769 | neighbor[num_neighbors] = i; |
1770 | num_neighbors ++; |
1771 | j += send_len[i]; |
1772 | offsetInShared[num_neighbors] = j; |
1773 | } |
1774 | } |
1775 | delete[] shared; |
1776 | shared = new index_t[j]; |
1777 | for (i=0, j=0; i<num_neighbors; i++) { |
1778 | k = neighbor[i]; |
1779 | #ifdef ESYS_MPI |
1780 | MPI_Irecv(&shared[j], send_len[k] , MPI_INT, k, |
1781 | mpi_info->msg_tag_counter+k, |
1782 | mpi_info->comm, &mpi_requests[i]); |
1783 | #endif |
1784 | j += send_len[k]; |
1785 | } |
1786 | for (i=0, j=0; i<recv->numNeighbors; i++) { |
1787 | k = recv->neighbor[i]; |
1788 | #ifdef ESYS_MPI |
1789 | MPI_Issend(&(global_id_RAP[j]), recv_len[k], MPI_INT, k, |
1790 | mpi_info->msg_tag_counter+rank, |
1791 | mpi_info->comm, &mpi_requests[i+num_neighbors]); |
1792 | #endif |
1793 | j += recv_len[k]; |
1794 | } |
1795 | #ifdef ESYS_MPI |
1796 | MPI_Waitall(num_neighbors + recv->numNeighbors, |
1797 | mpi_requests, mpi_stati); |
1798 | #endif |
1799 | ESYS_MPI_INC_COUNTER(*mpi_info, size); |
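/* the exchange above used tags msg_tag_counter+rank on the send side
   and msg_tag_counter+neighbour on the receive side; bumping the
   counter by "size" guarantees fresh, non-overlapping tags for the
   next round of messages. */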
1800 | |
1801 | j = offsetInShared[num_neighbors]; |
1802 | offset = dist[rank]; |
1803 | #pragma omp parallel for schedule(static) private(i) |
1804 | for (i=0; i<j; i++) shared[i] = shared[i] - offset; |
1805 | |
1806 | paso::SharedComponents_ptr send(new paso::SharedComponents( |
1807 | num_Pmain_cols, num_neighbors, neighbor, shared, |
1808 | offsetInShared, 1, 0, mpi_info)); |
1809 | |
1810 | col_connector = paso::Connector_alloc(send, recv); |
1811 | delete[] shared; |
1812 | |
1813 | /* now, create the row distribution (output_distribution) and the
1814 | column distribution (input_distribution) */
1815 | input_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0)); |
1816 | output_dist.reset(new paso::Distribution(mpi_info, dist, 1, 0)); |
1817 | |
1818 | /* then, prepare the sender/receiver for the row_connector, first, prepare |
1819 | the information for sender */ |
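/* send_ptr[p][i] will count the entries of local row i that must go
   to neighbour p, and send_idx[p] collects the matching global column
   ids; "sum" counts the distinct (row, neighbour) pairs, i.e. the
   total length of the sender's shared list. */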
1820 | sum = RAP_couple_ptr[num_Pmain_cols]; |
1821 | len = new dim_t[size]; |
1822 | send_ptr = new index_t*[size]; |
1823 | send_idx = new index_t*[size]; |
1824 | #pragma omp parallel for schedule(static) private(i) |
1825 | for (i=0; i<size; i++) { |
1826 | send_ptr[i] = new index_t[num_Pmain_cols]; |
1827 | send_idx[i] = new index_t[sum]; |
1828 | memset(send_ptr[i], 0, sizeof(index_t) * num_Pmain_cols); |
1829 | } |
1830 | memset(len, 0, sizeof(dim_t) * size); |
1831 | recv = col_connector->recv; |
1832 | sum=0; |
1833 | for (i_r=0; i_r<num_Pmain_cols; i_r++) { |
1834 | i1 = RAP_couple_ptr[i_r]; |
1835 | i2 = RAP_couple_ptr[i_r+1]; |
1836 | if (i2 > i1) { |
1837 | /* then row i_r will be in the sender of row_connector, now check |
1838 | how many neighbours i_r needs to be sent to */
1839 | for (j=i1; j<i2; j++) { |
1840 | i_c = RAP_couple_idx[j]; |
1841 | /* find out the corresponding neighbor "p" of column i_c */ |
1842 | for (p=0; p<recv->numNeighbors; p++) { |
1843 | if (i_c < recv->offsetInShared[p+1]) { |
1844 | k = recv->neighbor[p]; |
1845 | if (send_ptr[k][i_r] == 0) sum++; |
1846 | send_ptr[k][i_r] ++; |
1847 | send_idx[k][len[k]] = global_id_RAP[i_c]; |
1848 | len[k] ++; |
1849 | break; |
1850 | } |
1851 | } |
1852 | } |
1853 | } |
1854 | } |
1855 | if (global_id_RAP) { |
1856 | delete[] global_id_RAP; |
1857 | global_id_RAP = NULL; |
1858 | } |
1859 | |
1860 | /* now allocate the sender */ |
1861 | shared = new index_t[sum]; |
1862 | memset(send_len, 0, sizeof(dim_t) * size); |
1863 | num_neighbors=0; |
1864 | offsetInShared[0] = 0; |
1865 | for (p=0, k=0; p<size; p++) { |
1866 | for (i=0; i<num_Pmain_cols; i++) { |
1867 | if (send_ptr[p][i] > 0) { |
1868 | shared[k] = i; |
1869 | k++; |
1870 | send_ptr[p][send_len[p]] = send_ptr[p][i]; |
1871 | send_len[p]++; |
1872 | } |
1873 | } |
1874 | if (k > offsetInShared[num_neighbors]) { |
1875 | neighbor[num_neighbors] = p; |
1876 | num_neighbors ++; |
1877 | offsetInShared[num_neighbors] = k; |
1878 | } |
1879 | } |
1880 | send.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors, |
1881 | neighbor, shared, offsetInShared, 1, 0, mpi_info)); |
1882 | |
1883 | /* send/recv the number of rows to be sent from the current proc,
1884 | i.e. recover the info for the receiver of row_connector from the sender */
1885 | #ifdef ESYS_MPI |
1886 | MPI_Alltoall(send_len, 1, MPI_INT, recv_len, 1, MPI_INT, mpi_info->comm); |
1887 | #endif |
1888 | num_neighbors = 0; |
1889 | offsetInShared[0] = 0; |
1890 | j = 0; |
1891 | for (i=0; i<size; i++) { |
1892 | if (i != rank && recv_len[i] > 0) { |
1893 | neighbor[num_neighbors] = i; |
1894 | num_neighbors ++; |
1895 | j += recv_len[i]; |
1896 | offsetInShared[num_neighbors] = j; |
1897 | } |
1898 | } |
1899 | delete[] shared; |
1900 | delete[] recv_len; |
1901 | shared = new index_t[j]; |
1902 | k = offsetInShared[num_neighbors]; |
1903 | #pragma omp parallel for schedule(static) private(i) |
1904 | for (i=0; i<k; i++) { |
1905 | shared[i] = i + num_Pmain_cols; |
1906 | } |
1907 | recv.reset(new paso::SharedComponents(num_Pmain_cols, num_neighbors, |
1908 | neighbor, shared, offsetInShared, 1, 0, mpi_info)); |
1909 | row_connector = paso::Connector_alloc(send, recv); |
1910 | delete[] shared; |
1911 | |
1912 | /* send/recv pattern->ptr for rowCoupleBlock */ |
1913 | num_RAPext_rows = offsetInShared[num_neighbors]; |
1914 | row_couple_ptr = new index_t[num_RAPext_rows+1]; |
1915 | for (p=0; p<num_neighbors; p++) { |
1916 | j = offsetInShared[p]; |
1917 | i = offsetInShared[p+1]; |
1918 | #ifdef ESYS_MPI |
1919 | MPI_Irecv(&(row_couple_ptr[j]), i-j, MPI_INT, neighbor[p], |
1920 | mpi_info->msg_tag_counter+neighbor[p], |
1921 | mpi_info->comm, &mpi_requests[p]); |
1922 | #endif |
1923 | } |
1924 | send = row_connector->send; |
1925 | for (p=0; p<send->numNeighbors; p++) { |
1926 | #ifdef ESYS_MPI |
1927 | MPI_Issend(send_ptr[send->neighbor[p]], send_len[send->neighbor[p]], |
1928 | MPI_INT, send->neighbor[p], |
1929 | mpi_info->msg_tag_counter+rank, |
1930 | mpi_info->comm, &mpi_requests[p+num_neighbors]); |
1931 | #endif |
1932 | } |
1933 | #ifdef ESYS_MPI |
1934 | MPI_Waitall(num_neighbors + send->numNeighbors, |
1935 | mpi_requests, mpi_stati); |
1936 | #endif |
1937 | ESYS_MPI_INC_COUNTER(*mpi_info, size); |
1938 | delete[] send_len; |
1939 | |
1940 | sum = 0; |
1941 | for (i=0; i<num_RAPext_rows; i++) { |
1942 | k = row_couple_ptr[i]; |
1943 | row_couple_ptr[i] = sum; |
1944 | sum += k; |
1945 | } |
1946 | row_couple_ptr[num_RAPext_rows] = sum; |
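/* e.g. received per-row counts {2,0,3} become offsets {0,2,2,5}: the
   classic exclusive prefix sum that turns row lengths into CSR row
   pointers. */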
1947 | |
1948 | /* send/recv pattern->index for rowCoupleBlock */ |
1949 | k = row_couple_ptr[num_RAPext_rows]; |
1950 | row_couple_idx = new index_t[k]; |
1951 | for (p=0; p<num_neighbors; p++) { |
1952 | j1 = row_couple_ptr[offsetInShared[p]]; |
1953 | j2 = row_couple_ptr[offsetInShared[p+1]]; |
1954 | #ifdef ESYS_MPI |
1955 | MPI_Irecv(&(row_couple_idx[j1]), j2-j1, MPI_INT, neighbor[p], |
1956 | mpi_info->msg_tag_counter+neighbor[p], |
1957 | mpi_info->comm, &mpi_requests[p]); |
1958 | #endif |
1959 | } |
1960 | for (p=0; p<send->numNeighbors; p++) { |
1961 | #ifdef ESYS_MPI |
1962 | MPI_Issend(send_idx[send->neighbor[p]], len[send->neighbor[p]], |
1963 | MPI_INT, send->neighbor[p], |
1964 | mpi_info->msg_tag_counter+rank, |
1965 | mpi_info->comm, &mpi_requests[p+num_neighbors]); |
1966 | #endif |
1967 | } |
1968 | #ifdef ESYS_MPI |
1969 | MPI_Waitall(num_neighbors + send->numNeighbors, |
1970 | mpi_requests, mpi_stati); |
1971 | #endif |
1972 | ESYS_MPI_INC_COUNTER(*mpi_info, size); |
1973 | |
1974 | offset = input_dist->first_component[rank]; |
1975 | k = row_couple_ptr[num_RAPext_rows]; |
1976 | #pragma omp parallel for schedule(static) private(i) |
1977 | for (i=0; i<k; i++) { |
1978 | row_couple_idx[i] -= offset; |
1979 | } |
1980 | #pragma omp parallel for schedule(static) private(i) |
1981 | for (i=0; i<size; i++) { |
1982 | delete[] send_ptr[i]; |
1983 | delete[] send_idx[i]; |
1984 | } |
1985 | delete[] send_ptr; |
1986 | delete[] send_idx; |
1987 | delete[] len; |
1988 | delete[] offsetInShared; |
1989 | delete[] neighbor; |
1990 | delete[] mpi_requests; |
1991 | delete[] mpi_stati; |
1992 | Esys_MPIInfo_free(mpi_info); |
1993 | |
1994 | /* Now, we can create pattern for mainBlock and coupleBlock */ |
1995 | main_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT, num_Pmain_cols, |
1996 | num_Pmain_cols, RAP_main_ptr, RAP_main_idx); |
1997 | col_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT, |
1998 | num_Pmain_cols, num_RAPext_cols, |
1999 | RAP_couple_ptr, RAP_couple_idx); |
2000 | row_couple_pattern = paso::Pattern_alloc(MATRIX_FORMAT_DEFAULT, |
2001 | num_RAPext_rows, num_Pmain_cols, |
2002 | row_couple_ptr, row_couple_idx); |
2003 | |
2004 | /* next, create the system matrix */ |
2005 | pattern = new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT, |
2006 | output_dist, input_dist, main_pattern, col_couple_pattern, |
2007 | row_couple_pattern, col_connector, row_connector); |
2008 | out = Paso_SystemMatrix_alloc(A->type, pattern, |
2009 | row_block_size, col_block_size, FALSE); |
2010 | |
2011 | /* finally, fill in the data*/ |
2012 | memcpy(out->mainBlock->val, RAP_main_val, |
2013 | out->mainBlock->len* sizeof(double)); |
2014 | memcpy(out->col_coupleBlock->val, RAP_couple_val, |
2015 | out->col_coupleBlock->len * sizeof(double)); |
2016 | |
2017 | /* Clean up */ |
2018 | paso::SystemMatrixPattern_free(pattern); |
2019 | paso::Pattern_free(main_pattern); |
2020 | paso::Pattern_free(col_couple_pattern); |
2021 | paso::Pattern_free(row_couple_pattern); |
2022 | paso::Connector_free(col_connector); |
2023 | paso::Connector_free(row_connector); |
2024 | delete[] RAP_main_val; |
2025 | delete[] RAP_couple_val; |
2026 | if (Esys_noError()) { |
2027 | return out; |
2028 | } else { |
2029 | Paso_SystemMatrix_free(out); |
2030 | return NULL; |
2031 | } |
2032 | } |
2033 | |
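/* A minimal illustrative sketch (hypothetical helper, not part of the
   original code and not used by the routines in this file): the
   numeric kernel both interpolation builders apply per pattern entry
   is a dense triple product on column-major blocks,
   RAP_block += R_block * A_block * P_block,
   where entry (ir,ic) of a block is stored at ir + n*ic. Assuming
   square blocks of order n, it reads as follows. */
static void RAP_dense_block_sketch(const double* R_block,
                                   const double* A_block,
                                   const double* P_block,
                                   double* RAP_block, dim_t n)
{
    double* RA = new double[n*n];
    /* RA = R_block * A_block */
    for (dim_t ic=0; ic<n; ic++) {
        for (dim_t ir=0; ir<n; ir++) {
            double rtmp = 0.;
            for (dim_t ib=0; ib<n; ib++)
                rtmp += R_block[ir+n*ib] * A_block[ib+n*ic];
            RA[ir+n*ic] = rtmp;
        }
    }
    /* accumulate RAP_block += RA * P_block */
    for (dim_t ic=0; ic<n; ic++) {
        for (dim_t ir=0; ir<n; ir++) {
            double rtmp = 0.;
            for (dim_t ib=0; ib<n; ib++)
                rtmp += RA[ir+n*ib] * P_block[ib+n*ic];
            RAP_block[ir+n*ic] += rtmp;
        }
    }
    delete[] RA;
}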
2034 | |
2035 | Paso_SystemMatrix* Paso_Preconditioner_AMG_buildInterpolationOperatorBlock( |
2036 | Paso_SystemMatrix* A, Paso_SystemMatrix* P, Paso_SystemMatrix* R) |
2037 | { |
2038 | Esys_MPIInfo *mpi_info=Esys_MPIInfo_getReference(A->mpi_info); |
2039 | paso::SparseMatrix *R_main=NULL, *R_couple=NULL; |
2040 | Paso_SystemMatrix *out=NULL; |
2041 | paso::SystemMatrixPattern *pattern=NULL; |
2042 | paso::Distribution_ptr input_dist, output_dist; |
2043 | paso::SharedComponents_ptr send, recv; |
2044 | paso::Connector *col_connector=NULL, *row_connector=NULL; |
2045 | paso::Pattern *main_pattern=NULL; |
2046 | paso::Pattern *col_couple_pattern=NULL, *row_couple_pattern =NULL; |
2047 | const dim_t row_block_size=A->row_block_size; |
2048 | const dim_t col_block_size=A->col_block_size; |
2049 | const dim_t block_size = A->block_size; |
2050 | const double ZERO = 0.0; |
2051 | double *RAP_main_val=NULL, *RAP_couple_val=NULL, *RAP_ext_val=NULL; |
2052 | double rtmp, *RAP_val, *RA_val, *R_val, *temp_val=NULL; |
2053 | index_t size=mpi_info->size, rank=mpi_info->rank, *dist=NULL; |
2054 | index_t *RAP_main_ptr=NULL, *RAP_couple_ptr=NULL, *RAP_ext_ptr=NULL; |
2055 | index_t *RAP_main_idx=NULL, *RAP_couple_idx=NULL, *RAP_ext_idx=NULL; |
2056 | index_t *offsetInShared=NULL, *row_couple_ptr=NULL, *row_couple_idx=NULL; |
2057 | index_t *Pcouple_to_Pext=NULL, *Pext_to_RAP=NULL, *Pcouple_to_RAP=NULL; |
2058 | index_t *temp=NULL, *global_id_P=NULL, *global_id_RAP=NULL; |
2059 | index_t *shared=NULL, *P_marker=NULL, *A_marker=NULL; |
2060 | index_t sum, i, j, k, iptr, irb, icb, ib; |
2061 | index_t num_Pmain_cols, num_Pcouple_cols, num_Pext_cols; |
2062 | index_t num_A_cols, row_marker, num_RAPext_cols, num_Acouple_cols, offset; |
2063 | index_t i_r, i_c, i1, i2, j1, j1_ub, j2, j2_ub, j3, j3_ub, num_RAPext_rows; |
2064 | index_t row_marker_ext, *where_p=NULL; |
2065 | index_t **send_ptr=NULL, **send_idx=NULL; |
2066 | dim_t p, num_neighbors; |
2067 | dim_t *recv_len=NULL, *send_len=NULL, *len=NULL; |
2068 | Esys_MPI_rank *neighbor=NULL; |
2069 | #ifdef ESYS_MPI |
2070 | MPI_Request* mpi_requests=NULL; |
2071 | MPI_Status* mpi_stati=NULL; |
2072 | #else |
2073 | int *mpi_requests=NULL, *mpi_stati=NULL; |
2074 | #endif |
2075 | |
2076 | /* two sparse matrices R_main and R_couple will be generated, as the
2077 | transposes of P_main and P_col_couple, respectively. Note that
2078 | R_couple is actually the row_coupleBlock of R (R=P^T) */
2079 | R_main = paso::SparseMatrix_getTranspose(P->mainBlock); |
2080 | R_couple = paso::SparseMatrix_getTranspose(P->col_coupleBlock); |
2081 | |
2082 | /* generate P_ext, i.e. portion of P that is stored on neighbor procs |
2083 | and needed locally for triple matrix product RAP |
2084 | to be specific, P_ext (on processor k) are group of rows in P, where |
2085 | the list of rows from processor q is given by |
2086 | A->col_coupler->connector->send->shared[rPtr...] |
2087 | rPtr=A->col_coupler->connector->send->OffsetInShared[k] |
2088 | on q. |
2089 | P_ext is represented by two sparse matrices P_ext_main and |
2090 | P_ext_couple */ |
2091 | Paso_Preconditioner_AMG_extendB(A, P); |
2092 | |
2093 | /* count the number of cols in P_ext_couple, resize the pattern of |
2094 | sparse matrix P_ext_couple with new compressed order, and then |
2095 | build the col id mapping from P->col_coupleBlock to |
2096 | P_ext_couple */ |
2097 | num_Pmain_cols = P->mainBlock->numCols; |
2098 | num_Pcouple_cols = P->col_coupleBlock->numCols; |
2099 | num_Acouple_cols = A->col_coupleBlock->numCols; |
2100 | num_A_cols = A->mainBlock->numCols + num_Acouple_cols; |
2101 | sum = P->remote_coupleBlock->pattern->ptr[P->remote_coupleBlock->numRows]; |
2102 | offset = P->col_distribution->first_component[rank]; |
2103 | num_Pext_cols = 0; |
2104 | if (P->global_id) { |
2105 | /* first count the number of cols "num_Pext_cols" in both P_ext_couple |
2106 | and P->col_coupleBlock */ |
2107 | iptr = 0; |
2108 | if (num_Pcouple_cols || sum > 0) { |
2109 | temp = new index_t[num_Pcouple_cols+sum]; |
2110 | for (; iptr<sum; iptr++){ |
2111 | temp[iptr] = P->remote_coupleBlock->pattern->index[iptr]; |
2112 | } |
2113 | for (j=0; j<num_Pcouple_cols; j++, iptr++){ |
2114 | temp[iptr] = P->global_id[j]; |
2115 | } |
2116 | } |
2117 | if (iptr) { |
2118 | #ifdef USE_QSORTG |
2119 | qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex); |
2120 | #else |
2121 | qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex); |
2122 | #endif |
2123 | num_Pext_cols = 1; |
2124 | i = temp[0]; |
2125 | for (j=1; j<iptr; j++) { |
2126 | if (temp[j] > i) { |
2127 | i = temp[j]; |
2128 | temp[num_Pext_cols++] = i; |
2129 | } |
2130 | } |
2131 | } |
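/* e.g. a merged, sorted id list {3,3,7,9,9} collapses to {3,7,9} and
   num_Pext_cols becomes 3; temp now holds each distinct global column
   id exactly once, in ascending order. */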
2132 | |
2133 | /* resize the pattern of P_ext_couple */ |
2134 | if(num_Pext_cols){ |
2135 | global_id_P = new index_t[num_Pext_cols]; |
2136 | for (i=0; i<num_Pext_cols; i++) |
2137 | global_id_P[i] = temp[i]; |
2138 | } |
2139 | if (num_Pcouple_cols || sum > 0) |
2140 | delete[] temp; |
2141 | for (i=0; i<sum; i++) { |
2142 | where_p = (index_t *)bsearch( |
2143 | &(P->remote_coupleBlock->pattern->index[i]), |
2144 | global_id_P, num_Pext_cols, |
2145 | sizeof(index_t), paso::comparIndex); |
2146 | P->remote_coupleBlock->pattern->index[i] = |
2147 | (index_t)(where_p -global_id_P); |
2148 | } |
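/* after this loop the remote_coupleBlock pattern no longer stores
   global column ids but positions within global_id_P, i.e. a compact
   local numbering 0..num_Pext_cols-1. */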
2149 | |
2150 | /* build the mapping */ |
2151 | if (num_Pcouple_cols) { |
2152 | Pcouple_to_Pext = new index_t[num_Pcouple_cols]; |
2153 | iptr = 0; |
2154 | for (i=0; i<num_Pext_cols; i++) |
2155 | if (global_id_P[i] == P->global_id[iptr]) { |
2156 | Pcouple_to_Pext[iptr++] = i; |
2157 | if (iptr == num_Pcouple_cols) break; |
2158 | } |
2159 | } |
2160 | } |
2161 | |
2162 | /* allocate and initialise the markers */
2163 | sum = num_Pext_cols + num_Pmain_cols; |
2164 | P_marker = new index_t[sum]; |
2165 | A_marker = new index_t[num_A_cols]; |
2166 | #pragma omp parallel for private(i) schedule(static) |
2167 | for (i=0; i<sum; i++) P_marker[i] = -1; |
2168 | #pragma omp parallel for private(i) schedule(static) |
2169 | for (i=0; i<num_A_cols; i++) A_marker[i] = -1; |
2170 | |
2171 | /* Now, count the size of RAP_ext. Start with rows in R_couple */ |
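/* the product is formed in two passes: this symbolic pass only
   counts, via the markers, how many distinct coarse columns each row
   of RAP_ext touches; the numeric pass further below allocates the
   arrays and fills in the actual block values. */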
2172 | sum = 0; |
2173 | for (i_r=0; i_r<num_Pcouple_cols; i_r++){ |
2174 | row_marker = sum; |
2175 | /* then, loop over elements in row i_r of R_couple */ |
2176 | j1_ub = R_couple->pattern->ptr[i_r+1]; |
2177 | for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){ |
2178 | i1 = R_couple->pattern->index[j1]; |
2179 | /* then, loop over elements in row i1 of A->col_coupleBlock */ |
2180 | j2_ub = A->col_coupleBlock->pattern->ptr[i1+1]; |
2181 | for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
2182 | i2 = A->col_coupleBlock->pattern->index[j2]; |
2183 | |
2184 | /* check whether entry RA[i_r, i2] has been previously visited. |
2185 | A new RAP entry is possible only if entry RA[i_r, i2] has not
2186 | been visited yet */ |
2187 | if (A_marker[i2] != i_r) { |
2188 | /* first, mark entry RA[i_r,i2] as visited */ |
2189 | A_marker[i2] = i_r; |
2190 | |
2191 | /* then loop over elements in row i2 of P_ext_main */ |
2192 | j3_ub = P->row_coupleBlock->pattern->ptr[i2+1]; |
2193 | for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2194 | i_c = P->row_coupleBlock->pattern->index[j3]; |
2195 | |
2196 | /* check whether entry RAP[i_r,i_c] has been created. |
2197 | If not yet created, create the entry and increase |
2198 | the total number of elements in RAP_ext */ |
2199 | if (P_marker[i_c] < row_marker) { |
2200 | P_marker[i_c] = sum; |
2201 | sum++; |
2202 | } |
2203 | } |
2204 | |
2205 | /* loop over elements in row i2 of P_ext_couple, do the same */ |
2206 | j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1]; |
2207 | for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2208 | i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols; |
2209 | if (P_marker[i_c] < row_marker) { |
2210 | P_marker[i_c] = sum; |
2211 | sum++; |
2212 | } |
2213 | } |
2214 | } |
2215 | } |
2216 | |
2217 | /* now loop over elements in row i1 of A->mainBlock, repeat |
2218 | the process we have done to block A->col_coupleBlock */ |
2219 | j2_ub = A->mainBlock->pattern->ptr[i1+1]; |
2220 | for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
2221 | i2 = A->mainBlock->pattern->index[j2]; |
2222 | if (A_marker[i2 + num_Acouple_cols] != i_r) { |
2223 | A_marker[i2 + num_Acouple_cols] = i_r; |
2224 | j3_ub = P->mainBlock->pattern->ptr[i2+1]; |
2225 | for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2226 | i_c = P->mainBlock->pattern->index[j3]; |
2227 | if (P_marker[i_c] < row_marker) { |
2228 | P_marker[i_c] = sum; |
2229 | sum++; |
2230 | } |
2231 | } |
2232 | j3_ub = P->col_coupleBlock->pattern->ptr[i2+1]; |
2233 | for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2234 | /* note that we need to map the column index in |
2235 | P->col_coupleBlock back into the column index in |
2236 | P_ext_couple */ |
2237 | i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
2238 | if (P_marker[i_c] < row_marker) { |
2239 | P_marker[i_c] = sum; |
2240 | sum++; |
2241 | } |
2242 | } |
2243 | } |
2244 | } |
2245 | } |
2246 | } |
2247 | |
2248 | /* Now that we have the number of non-zero elements in RAP_ext, allocate
2249 | RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2250 | RAP_ext_ptr = new index_t[num_Pcouple_cols+1]; |
2251 | RAP_ext_idx = new index_t[sum]; |
2252 | RAP_ext_val = new double[sum * block_size]; |
2253 | RA_val = new double[block_size]; |
2254 | RAP_val = new double[block_size]; |
2255 | |
2256 | /* Fill in RAP_ext_ptr, RAP_ext_idx and RAP_ext_val */
2257 | sum = num_Pext_cols + num_Pmain_cols; |
2258 | #pragma omp parallel for private(i) schedule(static) |
2259 | for (i=0; i<sum; i++) P_marker[i] = -1; |
2260 | #pragma omp parallel for private(i) schedule(static) |
2261 | for (i=0; i<num_A_cols; i++) A_marker[i] = -1; |
2262 | sum = 0; |
2263 | RAP_ext_ptr[0] = 0; |
2264 | for (i_r=0; i_r<num_Pcouple_cols; i_r++){ |
2265 | row_marker = sum; |
2266 | /* then, loop over elements in row i_r of R_couple */ |
2267 | j1_ub = R_couple->pattern->ptr[i_r+1]; |
2268 | for (j1=R_couple->pattern->ptr[i_r]; j1<j1_ub; j1++){ |
2269 | i1 = R_couple->pattern->index[j1]; |
2270 | R_val = &(R_couple->val[j1*block_size]); |
2271 | |
2272 | /* then, loop over elements in row i1 of A->col_coupleBlock */ |
2273 | j2_ub = A->col_coupleBlock->pattern->ptr[i1+1]; |
2274 | for (j2=A->col_coupleBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
2275 | i2 = A->col_coupleBlock->pattern->index[j2]; |
2276 | temp_val = &(A->col_coupleBlock->val[j2*block_size]); |
2277 | for (irb=0; irb<row_block_size; irb++) { |
2278 | for (icb=0; icb<col_block_size; icb++) { |
2279 | rtmp = ZERO; |
2280 | for (ib=0; ib<col_block_size; ib++) { |
2281 | rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2282 | } |
2283 | RA_val[irb+row_block_size*icb]=rtmp; |
2284 | } |
2285 | } |
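/* RA_val now holds the full dense block product R[i_r,i1]*A[i1,i2]
   (column-major, entry (irb,icb) at irb+row_block_size*icb), computed
   as in the sketch above; it is reused against every block of P in
   row i2 below. */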
2286 | |
2287 | /* check whether entry RA[i_r, i2] has been previously visited. |
2288 | A new RAP entry is possible only if entry RA[i_r, i2] has not
2289 | been visited yet */ |
2290 | if (A_marker[i2] != i_r) { |
2291 | /* first, mark entry RA[i_r,i2] as visited */ |
2292 | A_marker[i2] = i_r; |
2293 | |
2294 | /* then loop over elements in row i2 of P_ext_main */ |
2295 | j3_ub = P->row_coupleBlock->pattern->ptr[i2+1]; |
2296 | for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2297 | i_c = P->row_coupleBlock->pattern->index[j3]; |
2298 | temp_val = &(P->row_coupleBlock->val[j3*block_size]); |
2299 | for (irb=0; irb<row_block_size; irb++) { |
2300 | for (icb=0; icb<col_block_size; icb++) { |
2301 | rtmp = ZERO; |
2302 | for (ib=0; ib<col_block_size; ib++) { |
2303 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2304 | } |
2305 | RAP_val[irb+row_block_size*icb]=rtmp; |
2306 | } |
2307 | } |
2308 | |
2309 | |
2310 | /* check whether entry RAP[i_r,i_c] has been created. |
2311 | If not yet created, create the entry and increase |
2312 | the total number of elements in RAP_ext */ |
2313 | if (P_marker[i_c] < row_marker) { |
2314 | P_marker[i_c] = sum; |
2315 | memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double)); |
2316 | RAP_ext_idx[sum] = i_c + offset; |
2317 | sum++; |
2318 | } else { |
2319 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2320 | for (ib=0; ib<block_size; ib++) { |
2321 | temp_val[ib] += RAP_val[ib]; |
2322 | } |
2323 | } |
2324 | } |
2325 | |
2326 | /* loop over elements in row i2 of P_ext_couple, do the same */ |
2327 | j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1]; |
2328 | for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2329 | i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols; |
2330 | temp_val = &(P->remote_coupleBlock->val[j3*block_size]); |
2331 | for (irb=0; irb<row_block_size; irb++) { |
2332 | for (icb=0; icb<col_block_size; icb++) { |
2333 | rtmp = ZERO; |
2334 | for (ib=0; ib<col_block_size; ib++) { |
2335 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2336 | } |
2337 | RAP_val[irb+row_block_size*icb]=rtmp; |
2338 | } |
2339 | } |
2340 | |
2341 | if (P_marker[i_c] < row_marker) { |
2342 | P_marker[i_c] = sum; |
2343 | memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double)); |
2344 | RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols]; |
2345 | sum++; |
2346 | } else { |
2347 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2348 | for (ib=0; ib<block_size; ib++) { |
2349 | temp_val[ib] += RAP_val[ib]; |
2350 | } |
2351 | } |
2352 | } |
2353 | |
2354 | /* If entry RA[i_r, i2] has already been visited, no new RAP entry
2355 | is created; only the contributions are added. */
2356 | } else { |
2357 | j3_ub = P->row_coupleBlock->pattern->ptr[i2+1]; |
2358 | for (j3=P->row_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2359 | i_c = P->row_coupleBlock->pattern->index[j3]; |
2360 | temp_val = &(P->row_coupleBlock->val[j3*block_size]); |
2361 | for (irb=0; irb<row_block_size; irb++) { |
2362 | for (icb=0; icb<col_block_size; icb++) { |
2363 | rtmp = ZERO; |
2364 | for (ib=0; ib<col_block_size; ib++) { |
2365 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2366 | } |
2367 | RAP_val[irb+row_block_size*icb]=rtmp; |
2368 | } |
2369 | } |
2370 | |
2371 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2372 | for (ib=0; ib<block_size; ib++) { |
2373 | temp_val[ib] += RAP_val[ib]; |
2374 | } |
2375 | } |
2376 | j3_ub = P->remote_coupleBlock->pattern->ptr[i2+1]; |
2377 | for (j3=P->remote_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2378 | i_c = P->remote_coupleBlock->pattern->index[j3] + num_Pmain_cols; |
2379 | temp_val = &(P->remote_coupleBlock->val[j3*block_size]); |
2380 | for (irb=0; irb<row_block_size; irb++) { |
2381 | for (icb=0; icb<col_block_size; icb++) { |
2382 | rtmp = ZERO; |
2383 | for (ib=0; ib<col_block_size; ib++) { |
2384 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2385 | } |
2386 | RAP_val[irb+row_block_size*icb]=rtmp; |
2387 | } |
2388 | } |
2389 | |
2390 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2391 | for (ib=0; ib<block_size; ib++) { |
2392 | temp_val[ib] += RAP_val[ib]; |
2393 | } |
2394 | } |
2395 | } |
2396 | } |
2397 | |
2398 | /* now loop over elements in row i1 of A->mainBlock, repeat |
2399 | the process we have done to block A->col_coupleBlock */ |
2400 | j2_ub = A->mainBlock->pattern->ptr[i1+1]; |
2401 | for (j2=A->mainBlock->pattern->ptr[i1]; j2<j2_ub; j2++) { |
2402 | i2 = A->mainBlock->pattern->index[j2]; |
2403 | temp_val = &(A->mainBlock->val[j2*block_size]); |
2404 | for (irb=0; irb<row_block_size; irb++) { |
2405 | for (icb=0; icb<col_block_size; icb++) { |
2406 | rtmp = ZERO; |
2407 | for (ib=0; ib<col_block_size; ib++) { |
2408 | rtmp+= R_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2409 | } |
2410 | RA_val[irb+row_block_size*icb]=rtmp; |
2411 | } |
2412 | } |
2413 | |
2414 | if (A_marker[i2 + num_Acouple_cols] != i_r) { |
2415 | A_marker[i2 + num_Acouple_cols] = i_r; |
2416 | j3_ub = P->mainBlock->pattern->ptr[i2+1]; |
2417 | for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2418 | i_c = P->mainBlock->pattern->index[j3]; |
2419 | temp_val = &(P->mainBlock->val[j3*block_size]); |
2420 | for (irb=0; irb<row_block_size; irb++) { |
2421 | for (icb=0; icb<col_block_size; icb++) { |
2422 | rtmp = ZERO; |
2423 | for (ib=0; ib<col_block_size; ib++) { |
2424 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2425 | } |
2426 | RAP_val[irb+row_block_size*icb]=rtmp; |
2427 | } |
2428 | } |
2429 | |
2430 | if (P_marker[i_c] < row_marker) { |
2431 | P_marker[i_c] = sum; |
2432 | memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double)); |
2433 | RAP_ext_idx[sum] = i_c + offset; |
2434 | sum++; |
2435 | } else { |
2436 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2437 | for (ib=0; ib<block_size; ib++) { |
2438 | temp_val[ib] += RAP_val[ib]; |
2439 | } |
2440 | } |
2441 | } |
2442 | j3_ub = P->col_coupleBlock->pattern->ptr[i2+1]; |
2443 | for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2444 | i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
2445 | temp_val = &(P->col_coupleBlock->val[j3*block_size]); |
2446 | for (irb=0; irb<row_block_size; irb++) { |
2447 | for (icb=0; icb<col_block_size; icb++) { |
2448 | rtmp = ZERO; |
2449 | for (ib=0; ib<col_block_size; ib++) { |
2450 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2451 | } |
2452 | RAP_val[irb+row_block_size*icb]=rtmp; |
2453 | } |
2454 | } |
2455 | |
2456 | if (P_marker[i_c] < row_marker) { |
2457 | P_marker[i_c] = sum; |
2458 | memcpy(&(RAP_ext_val[sum*block_size]), RAP_val, block_size*sizeof(double)); |
2459 | RAP_ext_idx[sum] = global_id_P[i_c - num_Pmain_cols]; |
2460 | sum++; |
2461 | } else { |
2462 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2463 | for (ib=0; ib<block_size; ib++) { |
2464 | temp_val[ib] += RAP_val[ib]; |
2465 | } |
2466 | } |
2467 | } |
2468 | } else { |
2469 | j3_ub = P->mainBlock->pattern->ptr[i2+1]; |
2470 | for (j3=P->mainBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2471 | i_c = P->mainBlock->pattern->index[j3]; |
2472 | temp_val = &(P->mainBlock->val[j3*block_size]); |
2473 | for (irb=0; irb<row_block_size; irb++) { |
2474 | for (icb=0; icb<col_block_size; icb++) { |
2475 | rtmp = ZERO; |
2476 | for (ib=0; ib<col_block_size; ib++) { |
2477 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2478 | } |
2479 | RAP_val[irb+row_block_size*icb]=rtmp; |
2480 | } |
2481 | } |
2482 | |
2483 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2484 | for (ib=0; ib<block_size; ib++) { |
2485 | temp_val[ib] += RAP_val[ib]; |
2486 | } |
2487 | } |
2488 | j3_ub = P->col_coupleBlock->pattern->ptr[i2+1]; |
2489 | for (j3=P->col_coupleBlock->pattern->ptr[i2]; j3<j3_ub; j3++) { |
2490 | i_c = Pcouple_to_Pext[P->col_coupleBlock->pattern->index[j3]] + num_Pmain_cols; |
2491 | temp_val = &(P->col_coupleBlock->val[j3*block_size]); |
2492 | for (irb=0; irb<row_block_size; irb++) { |
2493 | for (icb=0; icb<col_block_size; icb++) { |
2494 | rtmp = ZERO; |
2495 | for (ib=0; ib<col_block_size; ib++) { |
2496 | rtmp+= RA_val[irb+row_block_size*ib]*temp_val[ib+col_block_size*icb]; |
2497 | } |
2498 | RAP_val[irb+row_block_size*icb]=rtmp; |
2499 | } |
2500 | } |
2501 | |
2502 | temp_val = &(RAP_ext_val[P_marker[i_c] * block_size]); |
2503 | for (ib=0; ib<block_size; ib++) { |
2504 | temp_val[ib] += RAP_val[ib]; |
2505 | } |
2506 | } |
2507 | } |
2508 | } |
2509 | } |
2510 | RAP_ext_ptr[i_r+1] = sum; |
2511 | } |
2512 | delete[] P_marker; |
2513 | delete[] Pcouple_to_Pext; |
2514 | |
2515 | /* Now we have the part of RAP[r,c] whose rows "r" are given by the
2516 | column list of P->col_coupleBlock, and whose columns "c" may cover
2517 | the whole column range of the system matrix P. This part of the
2518 | data is passed to the neighbouring processors and added to the
2519 | corresponding RAP[r,c] entries there */
2521 | Paso_Preconditioner_AMG_CopyRemoteData(P, &RAP_ext_ptr, &RAP_ext_idx, |
2522 | &RAP_ext_val, global_id_P, block_size); |
2523 | |
2524 | num_RAPext_rows = P->col_coupler->connector->send->offsetInShared[P->col_coupler->connector->send->numNeighbors]; |
2525 | sum = RAP_ext_ptr[num_RAPext_rows]; |
2526 | num_RAPext_cols = 0; |
2527 | if (num_Pext_cols || sum > 0) { |
2528 | temp = new index_t[num_Pext_cols+sum]; |
2529 | j1_ub = offset + num_Pmain_cols; |
2530 | for (i=0, iptr=0; i<sum; i++) { |
2531 | if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub) /* XXX */ |
2532 | temp[iptr++] = RAP_ext_idx[i]; /* XXX */ |
2533 | } |
2534 | for (j=0; j<num_Pext_cols; j++, iptr++){ |
2535 | temp[iptr] = global_id_P[j]; |
2536 | } |
2537 | |
2538 | if (iptr) { |
2539 | #ifdef USE_QSORTG |
2540 | qsortG(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex); |
2541 | #else |
2542 | qsort(temp, (size_t)iptr, sizeof(index_t), paso::comparIndex); |
2543 | #endif |
2544 | num_RAPext_cols = 1; |
2545 | i = temp[0]; |
2546 | for (j=1; j<iptr; j++) { |
2547 | if (temp[j] > i) { |
2548 | i = temp[j]; |
2549 | temp[num_RAPext_cols++] = i; |
2550 | } |
2551 | } |
2552 | } |
2553 | } |
2554 | |
2555 | /* resize the pattern of P_ext_couple */ |
2556 | if(num_RAPext_cols){ |
2557 | global_id_RAP = new index_t[num_RAPext_cols]; |
2558 | for (i=0; i<num_RAPext_cols; i++) |
2559 | global_id_RAP[i] = temp[i]; |
2560 | } |
2561 | if (num_Pext_cols || sum > 0) |
2562 | delete[] temp; |
2563 | j1_ub = offset + num_Pmain_cols; |
2564 | for (i=0; i<sum; i++) { |
2565 | if (RAP_ext_idx[i] < offset || RAP_ext_idx[i] >= j1_ub){ |
2566 | where_p = (index_t *) bsearch(&(RAP_ext_idx[i]), global_id_RAP, |
2567 | /*XXX*/ num_RAPext_cols, sizeof(index_t), paso::comparIndex); |
2568 | RAP_ext_idx[i] = num_Pmain_cols + (index_t)(where_p - global_id_RAP); |
2569 | } else |
2570 | RAP_ext_idx[i] = RAP_ext_idx[i] - offset;