/[escript]/trunk/paso/src/AMG.c
ViewVC logotype

Contents of /trunk/paso/src/AMG.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3911 - (show annotations)
Thu Jun 14 01:01:03 2012 UTC (7 years, 3 months ago) by jfenwick
File MIME type: text/plain
File size: 29372 byte(s)
Copyright changes
1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2012 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: AMG preconditioner (local version) */
18
19 /**************************************************************/
20
21 /* Author: artak@uq.edu.au, l.gross@uq.edu.au */
22
23 /**************************************************************/
24
25 #define SHOW_TIMING FALSE
26 #define MY_DEBUG 0
27 #define MY_DEBUG1 1
28
29 #include "Paso.h"
30 #include "Preconditioner.h"
31 #include "Options.h"
32 #include "PasoUtil.h"
33 #include "UMFPACK.h"
34 #include "MKL.h"
35 #include<stdio.h>
36
37
38 /**************************************************************/
39
40 /* free all memory used by AMG */
41
42 void Paso_Preconditioner_AMG_free(Paso_Preconditioner_AMG * in) {
43 if (in!=NULL) {
44 Paso_Preconditioner_Smoother_free(in->Smoother);
45 Paso_SystemMatrix_free(in->P);
46 Paso_SystemMatrix_free(in->R);
47 Paso_SystemMatrix_free(in->A_C);
48 Paso_Preconditioner_AMG_free(in->AMG_C);
49 MEMFREE(in->r);
50 MEMFREE(in->x_C);
51 MEMFREE(in->b_C);
52 Paso_MergedSolver_free(in->merged_solver);
53
54 MEMFREE(in);
55 }
56 }
57
58 index_t Paso_Preconditioner_AMG_getMaxLevel(const Paso_Preconditioner_AMG * in) {
59 if (in->AMG_C == NULL) {
60 return in->level;
61 } else {
62 return Paso_Preconditioner_AMG_getMaxLevel(in->AMG_C);
63 }
64 }
65 double Paso_Preconditioner_AMG_getCoarseLevelSparsity(const Paso_Preconditioner_AMG * in) {
66 if (in->AMG_C == NULL) {
67 if (in->A_C == NULL) {
68 return 1.;
69 } else {
70 return Paso_SystemMatrix_getSparsity(in->A_C);
71 }
72 } else {
73 return Paso_Preconditioner_AMG_getCoarseLevelSparsity(in->AMG_C);
74 }
75 }
76 dim_t Paso_Preconditioner_AMG_getNumCoarseUnknwons(const Paso_Preconditioner_AMG * in) {
77 if (in->AMG_C == NULL) {
78 if (in->A_C == NULL) {
79 return 0;
80 } else {
81 return Paso_SystemMatrix_getTotalNumRows(in->A_C);
82 }
83 } else {
84 return Paso_Preconditioner_AMG_getNumCoarseUnknwons(in->AMG_C);
85 }
86 }
87 /*****************************************************************
88
89 constructs AMG
90
91 ******************************************************************/
92 Paso_Preconditioner_AMG* Paso_Preconditioner_AMG_alloc(Paso_SystemMatrix *A_p,dim_t level,Paso_Options* options) {
93
94 Paso_Preconditioner_AMG* out=NULL;
95 Paso_SystemMatrix *A_C=NULL;
96 bool_t verbose=options->verbose;
97
98 const dim_t my_n=A_p->mainBlock->numRows;
99 const dim_t overlap_n=A_p->row_coupleBlock->numRows;
100
101 const dim_t n = my_n + overlap_n;
102
103 const dim_t n_block=A_p->row_block_size;
104 index_t* F_marker=NULL, *counter=NULL, *mask_C=NULL, *rows_in_F;
105 dim_t i, my_n_F, my_n_C, n_C, F_flag, *F_set=NULL, global_n_C=0, global_n_F=0, n_F;
106 double time0=0;
107 const double theta = options->coarsening_threshold;
108 const double tau = options->diagonal_dominance_threshold;
109 const double sparsity=Paso_SystemMatrix_getSparsity(A_p);
110 const dim_t global_n=Paso_SystemMatrix_getGlobalNumRows(A_p);
111
112
113 /*
114 is the input matrix A suitable for coarsening?
115
116 */
117 if ( (sparsity >= options->min_coarse_sparsity) ||
118 (global_n <= options->min_coarse_matrix_size) ||
119 (level > options->level_max) ) {
120
121 if (verbose) {
122 /*
123 print stopping condition:
124 - 'SPAR' = min_coarse_matrix_sparsity exceeded
125 - 'SIZE' = min_coarse_matrix_size exceeded
126 - 'LEVEL' = level_max exceeded
127 */
128 printf("Paso_Preconditioner: AMG: termination of coarsening by ");
129
130 if (sparsity >= options->min_coarse_sparsity)
131 printf("SPAR");
132
133 if (global_n <= options->min_coarse_matrix_size)
134 printf("SIZE");
135
136 if (level > options->level_max)
137 printf("LEVEL");
138
139 printf("\n");
140
141 printf("Paso_Preconditioner: AMG level %d (limit = %d) stopped. sparsity = %e (limit = %e), unknowns = %d (limit = %d)\n",
142 level, options->level_max, sparsity, options->min_coarse_sparsity, global_n, options->min_coarse_matrix_size);
143
144 }
145
146 return NULL;
147 } else {
148 /* Start Coarsening : */
149
150 /* this is the table for strong connections combining mainBlock, col_coupleBlock and row_coupleBlock */
151 const dim_t len_S=A_p->mainBlock->pattern->len + A_p->col_coupleBlock->pattern->len + A_p->row_coupleBlock->pattern->len + A_p->row_coupleBlock->numRows * A_p->col_coupleBlock->numCols;
152
153 dim_t* degree_S=TMPMEMALLOC(n, dim_t);
154 index_t *offset_S=TMPMEMALLOC(n, index_t);
155 index_t *S=TMPMEMALLOC(len_S, index_t);
156 dim_t* degree_ST=TMPMEMALLOC(n, dim_t);
157 index_t *offset_ST=TMPMEMALLOC(n, index_t);
158 index_t *ST=TMPMEMALLOC(len_S, index_t);
159
160
161 F_marker=TMPMEMALLOC(n,index_t);
162 counter=TMPMEMALLOC(n,index_t);
163
164 if ( !( Esys_checkPtr(F_marker) || Esys_checkPtr(counter) || Esys_checkPtr(degree_S) || Esys_checkPtr(offset_S) || Esys_checkPtr(S)
165 || Esys_checkPtr(degree_ST) || Esys_checkPtr(offset_ST) || Esys_checkPtr(ST) ) ) {
166 /*
167 make sure that corresponding values in the row_coupleBlock and col_coupleBlock are identical
168 */
169 Paso_SystemMatrix_copyColCoupleBlock(A_p);
170 Paso_SystemMatrix_copyRemoteCoupleBlock(A_p, FALSE);
171
172 /*
173 set splitting of unknows:
174
175 */
176 time0=Esys_timer();
177 if (n_block>1) {
178 Paso_Preconditioner_AMG_setStrongConnections_Block(A_p, degree_S, offset_S, S, theta,tau);
179 } else {
180 Paso_Preconditioner_AMG_setStrongConnections(A_p, degree_S, offset_S, S, theta,tau);
181 }
182 Paso_Preconditioner_AMG_transposeStrongConnections(n, degree_S, offset_S, S, n, degree_ST, offset_ST, ST);
183 /* Paso_SystemMatrix_extendedRowsForST(A_p, degree_ST, offset_ST, ST);
184 */
185
186 Paso_Preconditioner_AMG_CIJPCoarsening(n,my_n,F_marker,
187 degree_S, offset_S, S, degree_ST, offset_ST, ST,
188 A_p->col_coupler->connector,A_p->col_distribution);
189
190
191 /* in BoomerAMG if interpolation is used FF connectivity is required */
192 /*MPI:
193 if (options->interpolation_method == PASO_CLASSIC_INTERPOLATION_WITH_FF_COUPLING)
194 Paso_Preconditioner_AMG_enforceFFConnectivity(n, A_p->pattern->ptr, degree_S, S, F_marker);
195 */
196
197 options->coarsening_selection_time=Esys_timer()-time0 + MAX(0, options->coarsening_selection_time);
198 if (Esys_noError() ) {
199 #pragma omp parallel for private(i) schedule(static)
200 for (i = 0; i < n; ++i) F_marker[i]=(F_marker[i] == PASO_AMG_IN_F);
201
202 /*
203 count number of unkowns to be eliminated:
204 */
205 my_n_F=Paso_Util_cumsum_maskedTrue(my_n,counter, F_marker);
206 n_F=Paso_Util_cumsum_maskedTrue(n,counter, F_marker);
207 /* collect my_n_F values on all processes, a direct solver should
208 be used if any my_n_F value is 0 */
209 F_set = TMPMEMALLOC(A_p->mpi_info->size, dim_t);
210 #ifdef ESYS_MPI
211 MPI_Allgather(&my_n_F, 1, MPI_INT, F_set, 1, MPI_INT, A_p->mpi_info->comm);
212 #endif
213 global_n_F=0;
214 F_flag = 1;
215 for (i=0; i<A_p->mpi_info->size; i++) {
216 global_n_F+=F_set[i];
217 if (F_set[i] == 0) F_flag = 0;
218 }
219 TMPMEMFREE(F_set);
220
221 my_n_C=my_n-my_n_F;
222 global_n_C=global_n-global_n_F;
223 if (verbose) printf("Paso_Preconditioner: AMG (non-local) level %d: %d unknowns are flagged for elimination. %d left.\n",level,global_n_F,global_n_C);
224
225
226 /* if ( n_F == 0 ) { is a nasty case. a direct solver should be used, return NULL */
227 if (F_flag == 0) {
228 out = NULL;
229 } else {
230 out=MEMALLOC(1,Paso_Preconditioner_AMG);
231 if (! Esys_checkPtr(out)) {
232 out->level = level;
233 out->A_C = NULL;
234 out->P = NULL;
235 out->R = NULL;
236 out->post_sweeps = options->post_sweeps;
237 out->pre_sweeps = options->pre_sweeps;
238 out->r = NULL;
239 out->x_C = NULL;
240 out->b_C = NULL;
241 out->AMG_C = NULL;
242 out->Smoother=NULL;
243 out->merged_solver=NULL;
244 }
245 mask_C=TMPMEMALLOC(n,index_t);
246 rows_in_F=TMPMEMALLOC(n_F,index_t);
247 Esys_checkPtr(mask_C);
248 Esys_checkPtr(rows_in_F);
249 if ( Esys_noError() ) {
250
251 out->Smoother = Paso_Preconditioner_Smoother_alloc(A_p, (options->smoother == PASO_JACOBI), 0, verbose);
252
253 if (global_n_C != 0) {
254 /* create mask of C nodes with value >-1, gives new id */
255 n_C=Paso_Util_cumsum_maskedFalse(n, mask_C, F_marker);
256 /* if nothing has been removed we have a diagonal dominant matrix and we just run a few steps of the smoother */
257
258 /* allocate helpers :*/
259 out->x_C=MEMALLOC(n_block*my_n_C,double);
260 out->b_C=MEMALLOC(n_block*my_n_C,double);
261 out->r=MEMALLOC(n_block*my_n,double);
262
263 Esys_checkPtr(out->r);
264 Esys_checkPtr(out->x_C);
265 Esys_checkPtr(out->b_C);
266
267 if ( Esys_noError() ) {
268 /* creates index for F:*/
269 #pragma omp parallel private(i)
270 {
271 #pragma omp for schedule(static)
272 for (i = 0; i < n; ++i) {
273 if (F_marker[i]) rows_in_F[counter[i]]=i;
274 }
275 }
276 /*
277 get Prolongation :
278 */
279
280 time0=Esys_timer();
281
282 out->P=Paso_Preconditioner_AMG_getProlongation(A_p,offset_S, degree_S,S,n_C,mask_C, options->interpolation_method);
283
284 }
285
286 /*
287 construct Restriction operator as transposed of Prolongation operator:
288 */
289
290 if ( Esys_noError()) {
291 time0=Esys_timer();
292
293 out->R=Paso_Preconditioner_AMG_getRestriction(out->P);
294
295 if (SHOW_TIMING) printf("timing: level %d: Paso_SystemMatrix_getTranspose: %e\n",level,Esys_timer()-time0);
296 }
297 /*
298 construct coarse level matrix:
299 */
300 if ( Esys_noError()) {
301 time0=Esys_timer();
302
303 A_C = Paso_Preconditioner_AMG_buildInterpolationOperator(A_p, out->P, out->R);
304
305 if (SHOW_TIMING) printf("timing: level %d : construct coarse matrix: %e\n",level,Esys_timer()-time0);
306 }
307
308 /*
309 constructe courser level:
310
311 */
312 if ( Esys_noError()) {
313 out->AMG_C=Paso_Preconditioner_AMG_alloc(A_C,level+1,options);
314 }
315
316 if ( Esys_noError()) {
317 out->A_C=A_C;
318 if ( out->AMG_C == NULL ) {
319 /* merge the system matrix into 1 rank when
320 it's not suitable coarsening due to the
321 total number of unknowns are too small */
322 out->merged_solver= Paso_MergedSolver_alloc(A_C, options);
323 }
324 }
325 }
326 }
327 TMPMEMFREE(mask_C);
328 TMPMEMFREE(rows_in_F);
329 }
330 }
331
332 }
333 TMPMEMFREE(counter);
334 TMPMEMFREE(F_marker);
335 TMPMEMFREE(degree_S);
336 TMPMEMFREE(offset_S);
337 TMPMEMFREE(S);
338 TMPMEMFREE(degree_ST);
339 TMPMEMFREE(offset_ST);
340 TMPMEMFREE(ST);
341
342 }
343
344 if (Esys_noError()) {
345 return out;
346 } else {
347 Paso_Preconditioner_AMG_free(out);
348 return NULL;
349 }
350 }
351
352
353 void Paso_Preconditioner_AMG_solve(Paso_SystemMatrix* A, Paso_Preconditioner_AMG * amg, double * x, double * b) {
354 const dim_t n = A->mainBlock->numRows * A->mainBlock->row_block_size;
355 double time0=0;
356 const dim_t post_sweeps=amg->post_sweeps;
357 const dim_t pre_sweeps=amg->pre_sweeps;
358
359 /* presmoothing */
360 time0=Esys_timer();
361 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, pre_sweeps, FALSE);
362
363 time0=Esys_timer()-time0;
364 if (SHOW_TIMING) printf("timing: level %d: Presmoothing: %e\n",amg->level, time0);
365 /* end of presmoothing */
366
367 time0=Esys_timer();
368
369 Paso_Copy(n, amg->r, b); /* r <- b */
370 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(-1.,A,x,1.,amg->r); /*r=r-Ax*/
371 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->R,amg->r,0.,amg->b_C); /* b_c = R*r */
372
373 time0=Esys_timer()-time0;
374 /* coarse level solve */
375 if ( amg->AMG_C == NULL) {
376 time0=Esys_timer();
377 /* A_C is the coarsest level */
378 Paso_MergedSolver_solve(amg->merged_solver,amg->x_C, amg->b_C);
379 if (SHOW_TIMING) printf("timing: level %d: DIRECT SOLVER: %e\n",amg->level,Esys_timer()-time0);
380 } else {
381 Paso_Preconditioner_AMG_solve(amg->A_C, amg->AMG_C,amg->x_C,amg->b_C); /* x_C=AMG(b_C) */
382 }
383
384 time0=time0+Esys_timer();
385 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(1.,amg->P,amg->x_C,1.,x); /* x = x + P*x_c */
386
387 /*postsmoothing*/
388
389 /*solve Ax=b with initial guess x */
390 time0=Esys_timer();
391 Paso_Preconditioner_Smoother_solve(A, amg->Smoother, x, b, post_sweeps, TRUE);
392 time0=Esys_timer()-time0;
393 if (SHOW_TIMING) printf("timing: level %d: Postsmoothing: %e\n",amg->level,time0);
394 return;
395 }
396
397 /* theta = threshold for strong connections */
398 /* tau = threshold for diagonal dominance */
399
400 /*S_i={j \in N_i; i strongly coupled to j}
401
402 in the sense that |A_{ij}| >= theta * max_k |A_{ik}|
403 */
404
405 void Paso_Preconditioner_AMG_setStrongConnections(Paso_SystemMatrix* A,
406 dim_t *degree_S, index_t *offset_S, index_t *S,
407 const double theta, const double tau)
408 {
409
410 const dim_t my_n=A->mainBlock->numRows;
411 const dim_t overlap_n=A->row_coupleBlock->numRows;
412
413 index_t iptr, i;
414 double *threshold_p=NULL;
415
416 threshold_p = TMPMEMALLOC(2*my_n, double);
417
418 #pragma omp parallel for private(i,iptr) schedule(static)
419 for (i=0;i<my_n;++i) {
420
421 register double max_offdiagonal = 0.;
422 register double sum_row=0;
423 register double main_row=0;
424 register dim_t kdeg=0;
425 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
426
427
428 /* collect information for row i: */
429 #pragma ivdep
430 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
431 register index_t j=A->mainBlock->pattern->index[iptr];
432 register double fnorm=ABS(A->mainBlock->val[iptr]);
433 if( j != i) {
434 max_offdiagonal = MAX(max_offdiagonal,fnorm);
435 sum_row+=fnorm;
436 } else {
437 main_row=fnorm;
438 }
439
440 }
441
442 #pragma ivdep
443 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
444 register double fnorm=ABS(A->col_coupleBlock->val[iptr]);
445
446 max_offdiagonal = MAX(max_offdiagonal,fnorm);
447 sum_row+=fnorm;
448 }
449
450 /* inspect row i: */
451 {
452 const double threshold = theta*max_offdiagonal;
453 threshold_p[2*i+1]=threshold;
454 if (tau*main_row < sum_row) { /* no diagonal dominance */
455 threshold_p[2*i]=1;
456 #pragma ivdep
457 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
458 register index_t j=A->mainBlock->pattern->index[iptr];
459 if(ABS(A->mainBlock->val[iptr])>threshold && i!=j) {
460 S[koffset+kdeg] = j;
461 kdeg++;
462 }
463 }
464 #pragma ivdep
465 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
466 register index_t j=A->col_coupleBlock->pattern->index[iptr];
467 if(ABS(A->col_coupleBlock->val[iptr])>threshold) {
468 S[koffset+kdeg] = j + my_n;
469 kdeg++;
470 }
471 }
472 } else {
473 threshold_p[2*i]=-1;
474 }
475 }
476 offset_S[i]=koffset;
477 degree_S[i]=kdeg;
478 }
479
480 /* now we need to distribute the threshold and the diagonal dominance indicator */
481 if (A->mpi_info->size > 1) {
482
483 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
484 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
485
486 double *remote_threshold=NULL;
487
488 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
489 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
490 Paso_Coupler_finishCollect(threshold_coupler);
491 remote_threshold=threshold_coupler->recv_buffer;
492
493 #pragma omp parallel for private(i,iptr) schedule(static)
494 for (i=0; i<overlap_n; i++) {
495 const double threshold = remote_threshold[2*i+1];
496 register dim_t kdeg=0;
497 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
498 if (remote_threshold[2*i]>0) {
499 #pragma ivdep
500 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
501 register index_t j=A->row_coupleBlock->pattern->index[iptr];
502 if(ABS(A->row_coupleBlock->val[iptr])>threshold) {
503 S[koffset+kdeg] = j ;
504 kdeg++;
505 }
506 }
507
508 #pragma ivdep
509 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; iptr++) {
510 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
511 if(ABS(A->remote_coupleBlock->val[iptr])>threshold && i!=j) {
512 S[koffset+kdeg] = j + my_n;
513 kdeg++;
514 }
515 }
516 }
517 offset_S[i+my_n]=koffset;
518 degree_S[i+my_n]=kdeg;
519 }
520
521 Paso_Coupler_free(threshold_coupler);
522 }
523 TMPMEMFREE(threshold_p);
524 }
525
526 /* theta = threshold for strong connections */
527 /* tau = threshold for diagonal dominance */
528 /*S_i={j \in N_i; i strongly coupled to j}
529
530 in the sense that |A_{ij}|_F >= theta * max_k |A_{ik}|_F
531 */
532 void Paso_Preconditioner_AMG_setStrongConnections_Block(Paso_SystemMatrix* A,
533 dim_t *degree_S, index_t *offset_S, index_t *S,
534 const double theta, const double tau)
535
536 {
537 const dim_t block_size=A->block_size;
538 const dim_t my_n=A->mainBlock->numRows;
539 const dim_t overlap_n=A->row_coupleBlock->numRows;
540
541 index_t iptr, i, bi;
542 double *threshold_p=NULL;
543
544
545 threshold_p = TMPMEMALLOC(2*my_n, double);
546
547 #pragma omp parallel private(i,iptr,bi)
548 {
549
550 dim_t max_deg=0;
551 double *rtmp=NULL;
552
553 #pragma omp for schedule(static)
554 for (i=0;i<my_n;++i) max_deg=MAX(max_deg, A->mainBlock->pattern->ptr[i+1]-A->mainBlock->pattern->ptr[i]
555 +A->col_coupleBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i]);
556
557 rtmp=TMPMEMALLOC(max_deg, double);
558
559 #pragma omp for schedule(static)
560 for (i=0;i<my_n;++i) {
561 register double max_offdiagonal = 0.;
562 register double sum_row=0;
563 register double main_row=0;
564 register index_t rtmp_offset=-A->mainBlock->pattern->ptr[i];
565 register dim_t kdeg=0;
566 register const index_t koffset=A->mainBlock->pattern->ptr[i]+A->col_coupleBlock->pattern->ptr[i];
567
568 /* collect information for row i: */
569 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
570 register index_t j=A->mainBlock->pattern->index[iptr];
571 register double fnorm=0;
572 #pragma ivdep
573 for(bi=0;bi<block_size;++bi) {
574 register double rtmp2= A->mainBlock->val[iptr*block_size+bi];
575 fnorm+=rtmp2*rtmp2;
576 }
577 fnorm=sqrt(fnorm);
578 rtmp[iptr+rtmp_offset]=fnorm;
579
580 if( j != i) {
581 max_offdiagonal = MAX(max_offdiagonal,fnorm);
582 sum_row+=fnorm;
583 } else {
584 main_row=fnorm;
585 }
586
587 }
588
589 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
590 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
591 register double fnorm=0;
592 #pragma ivdep
593 for(bi=0;bi<block_size;++bi) {
594 register double rtmp2 = A->col_coupleBlock->val[iptr*block_size+bi];
595 fnorm+=rtmp2*rtmp2;
596 }
597 fnorm=sqrt(fnorm);
598
599 rtmp[iptr+rtmp_offset]=fnorm;
600 max_offdiagonal = MAX(max_offdiagonal,fnorm);
601 sum_row+=fnorm;
602 }
603
604
605 /* inspect row i: */
606 {
607 const double threshold = theta*max_offdiagonal;
608 rtmp_offset=-A->mainBlock->pattern->ptr[i];
609
610 threshold_p[2*i+1]=threshold;
611 if (tau*main_row < sum_row) { /* no diagonal dominance */
612 threshold_p[2*i]=1;
613 #pragma ivdep
614 for (iptr=A->mainBlock->pattern->ptr[i];iptr<A->mainBlock->pattern->ptr[i+1]; ++iptr) {
615 register index_t j=A->mainBlock->pattern->index[iptr];
616 if(rtmp[iptr+rtmp_offset] > threshold && i!=j) {
617 S[koffset+kdeg] = j;
618 kdeg++;
619 }
620 }
621 rtmp_offset+=A->mainBlock->pattern->ptr[i+1]-A->col_coupleBlock->pattern->ptr[i];
622 #pragma ivdep
623 for (iptr=A->col_coupleBlock->pattern->ptr[i];iptr<A->col_coupleBlock->pattern->ptr[i+1]; ++iptr) {
624 register index_t j=A->col_coupleBlock->pattern->index[iptr];
625 if( rtmp[iptr+rtmp_offset] >threshold) {
626 S[koffset+kdeg] = j + my_n;
627 kdeg++;
628 }
629 }
630 } else {
631 threshold_p[2*i]=-1;
632 }
633 }
634 offset_S[i]=koffset;
635 degree_S[i]=kdeg;
636 }
637 TMPMEMFREE(rtmp);
638 }
639 /* now we need to distribute the threshold and the diagonal dominance indicator */
640 if (A->mpi_info->size > 1) {
641
642 const index_t koffset_0=A->mainBlock->pattern->ptr[my_n]+A->col_coupleBlock->pattern->ptr[my_n]
643 -A->mainBlock->pattern->ptr[0]-A->col_coupleBlock->pattern->ptr[0];
644
645 double *remote_threshold=NULL;
646
647 Paso_Coupler* threshold_coupler=Paso_Coupler_alloc(A->row_coupler->connector ,2);
648 Paso_Coupler_startCollect(threshold_coupler,threshold_p);
649 Paso_Coupler_finishCollect(threshold_coupler);
650 remote_threshold=threshold_coupler->recv_buffer;
651
652 #pragma omp parallel for private(i,iptr) schedule(static)
653 for (i=0; i<overlap_n; i++) {
654
655 const double threshold2 = remote_threshold[2*i+1]*remote_threshold[2*i+1];
656 register dim_t kdeg=0;
657 register const index_t koffset=koffset_0+A->row_coupleBlock->pattern->ptr[i]+A->remote_coupleBlock->pattern->ptr[i];
658 if (remote_threshold[2*i]>0) {
659 #pragma ivdep
660 for (iptr=A->row_coupleBlock->pattern->ptr[i];iptr<A->row_coupleBlock->pattern->ptr[i+1]; ++iptr) {
661 register index_t j=A->row_coupleBlock->pattern->index[iptr];
662 register double fnorm2=0;
663 #pragma ivdepremote_threshold[2*i]
664 for(bi=0;bi<block_size;++bi) {
665 register double rtmp2 = A->row_coupleBlock->val[iptr*block_size+bi];
666 fnorm2+=rtmp2*rtmp2;
667 }
668
669 if(fnorm2 > threshold2 ) {
670 S[koffset+kdeg] = j ;
671 kdeg++;
672 }
673 }
674
675 #pragma ivdep
676 for (iptr=A->remote_coupleBlock->pattern->ptr[i];iptr<A->remote_coupleBlock->pattern->ptr[i+1]; ++iptr) {
677 register index_t j=A->remote_coupleBlock->pattern->index[iptr];
678 register double fnorm2=0;
679 #pragma ivdepremote_threshold[2*i]
680 for(bi=0;bi<block_size;++bi) {
681 register double rtmp2 = A->remote_coupleBlock->val[iptr*block_size+bi];
682 fnorm2+=rtmp2*rtmp2;
683 }
684 if(fnorm2 > threshold2 && i != j) {
685 S[koffset+kdeg] = j + my_n;
686 kdeg++;
687 }
688 }
689
690 }
691 offset_S[i+my_n]=koffset;
692 degree_S[i+my_n]=kdeg;
693 }
694 Paso_Coupler_free(threshold_coupler);
695 }
696 TMPMEMFREE(threshold_p);
697 }
698
699 void Paso_Preconditioner_AMG_transposeStrongConnections(const dim_t n, const dim_t* degree_S, const index_t* offset_S, const index_t* S,
700 const dim_t nT, dim_t* degree_ST, index_t* offset_ST,index_t* ST)
701 {
702 index_t i, j;
703 dim_t p;
704 dim_t len=0;
705 #pragma omp parallel for private(i) schedule(static)
706 for (i=0; i<nT ;++i) {
707 degree_ST[i]=0;
708 }
709 for (i=0; i<n ;++i) {
710 for (p=0; p<degree_S[i]; ++p) degree_ST[ S[offset_S[i]+p] ]++;
711 }
712 for (i=0; i<nT ;++i) {
713 offset_ST[i]=len;
714 len+=degree_ST[i];
715 degree_ST[i]=0;
716 }
717 for (i=0; i<n ;++i) {
718 for (p=0; p<degree_S[i]; ++p) {
719 j=S[offset_S[i]+p];
720 ST[offset_ST[j]+degree_ST[j]]=i;
721 degree_ST[j]++;
722 }
723 }
724 }
725
726 int compareindex(const void *a, const void *b)
727 {
728 return (*(int *)a - *(int *)b);
729 }
730
731 void Paso_Preconditioner_AMG_CIJPCoarsening(const dim_t n, const dim_t my_n, index_t*split_marker,
732 const dim_t* degree_S, const index_t* offset_S, const index_t* S,
733 const dim_t* degree_ST, const index_t* offset_ST, const index_t* ST,
734 Paso_Connector* col_connector, Paso_Distribution* col_dist)
735 {
736 dim_t i, numUndefined, iter=0;
737 index_t iptr, jptr, kptr;
738 double *random=NULL, *w=NULL, *Status=NULL;
739 index_t * ST_flag=NULL;
740
741 Paso_Coupler* w_coupler=Paso_Coupler_alloc(col_connector ,1);
742
743 w=TMPMEMALLOC(n, double);
744 Status=TMPMEMALLOC(n, double);
745 random = Paso_Distribution_createRandomVector(col_dist,1);
746 ST_flag=TMPMEMALLOC(offset_ST[n-1]+ degree_ST[n-1], index_t);
747
748 #pragma omp parallel for private(i)
749 for (i=0; i< my_n; ++i) {
750 w[i]=degree_ST[i]+random[i];
751 if (degree_ST[i] < 1) {
752 Status[i]=-100; /* F point */
753 } else {
754 Status[i]=1; /* status undefined */
755 }
756 }
757
758 #pragma omp parallel for private(i, iptr)
759 for (i=0; i< n; ++i) {
760 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
761 ST_flag[offset_ST[i]+iptr]=1;
762 }
763 }
764
765
766 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
767 /* printf(" coarsening loop start: num of undefined rows = %d \n",numUndefined); */
768 iter=0;
769 while (numUndefined > 0) {
770 Paso_Coupler_fillOverlap(n, w, w_coupler);
771
772 /* calculate the maximum value of neighbours following active strong connections:
773 w2[i]=MAX(w[k]) with k in ST[i] or S[i] and (i,k) connection is still active */
774 #pragma omp parallel for private(i, iptr)
775 for (i=0; i<my_n; ++i) {
776 if (Status[i]>0) { /* status is still undefined */
777
778 register bool_t inD=TRUE;
779 const double wi=w[i];
780
781 for( iptr =0 ; iptr < degree_S[i]; ++iptr) {
782 const index_t k=S[offset_S[i]+iptr];
783 const index_t* start_p = &ST[offset_ST[k]];
784 const index_t* where_p=(index_t*)bsearch(&i, start_p, degree_ST[k], sizeof(index_t), Paso_comparIndex);
785
786 if (ST_flag[offset_ST[k] + (index_t)(where_p-start_p)]>0) {
787 if (wi <= w[k] ) {
788 inD=FALSE;
789 break;
790 }
791 }
792
793 }
794
795 if (inD) {
796 for( iptr =0 ; iptr < degree_ST[i]; ++iptr) {
797 const index_t k=ST[offset_ST[i]+iptr];
798 if ( ST_flag[offset_ST[i]+iptr] > 0 ) {
799 if (wi <= w[k] ) {
800 inD=FALSE;
801 break;
802 }
803 }
804 }
805 }
806 if (inD) {
807 Status[i]=0.; /* is in D */
808 }
809 }
810
811 }
812
813 Paso_Coupler_fillOverlap(n, Status, w_coupler);
814
815
816 /* remove connection to D points :
817
818 for each i in D:
819 for each j in S_i:
820 w[j]--
821 ST_tag[j,i]=-1
822 for each j in ST[i]:
823 ST_tag[i,j]=-1
824 for each k in ST[j]:
825 if k in ST[i]:
826 w[j]--;
827 ST_tag[j,k]=-1
828
829 */
830 /* w is updated for local rows only */
831 {
832 #pragma omp parallel for private(i, jptr)
833 for (i=0; i< my_n; ++i) {
834
835 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
836 const index_t j=ST[offset_ST[i]+jptr];
837 if ( (Status[j] == 0.) && (ST_flag[offset_ST[i]+jptr]>0) ) {
838 w[i]--;
839 ST_flag[offset_ST[i]+jptr]=-1;
840 }
841 }
842
843 }
844 #pragma omp parallel for private(i, jptr)
845 for (i=my_n; i< n; ++i) {
846 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
847 const index_t j = ST[offset_ST[i]+jptr];
848 if ( Status[j] == 0. ) ST_flag[offset_ST[i]+jptr]=-1;
849 }
850 }
851
852
853 for (i=0; i< n; ++i) {
854 if ( Status[i] == 0. ) {
855
856 const index_t* start_p = &ST[offset_ST[i]];
857
858 for (jptr=0; jptr<degree_ST[i]; ++jptr) {
859 const index_t j=ST[offset_ST[i]+jptr];
860 ST_flag[offset_ST[i]+jptr]=-1;
861 for (kptr=0; kptr<degree_ST[j]; ++kptr) {
862 const index_t k=ST[offset_ST[j]+kptr];
863 if (NULL != bsearch(&k, start_p, degree_ST[i], sizeof(index_t), Paso_comparIndex) ) { /* k in ST[i] ? */
864 if (ST_flag[offset_ST[j]+kptr] >0) {
865 if (j< my_n ) {
866 w[j]--;
867 }
868 ST_flag[offset_ST[j]+kptr]=-1;
869 }
870 }
871 }
872 }
873 }
874 }
875 }
876
877 /* adjust status */
878 #pragma omp parallel for private(i)
879 for (i=0; i< my_n; ++i) {
880 if ( Status[i] == 0. ) {
881 Status[i] = -10; /* this is now a C point */
882 } else if (Status[i] == 1. && w[i]<1.) {
883 Status[i] = -100; /* this is now a F point */
884 }
885 }
886
887 i = numUndefined;
888 numUndefined = Paso_Distribution_numPositives(Status, col_dist, 1 );
889 if (numUndefined == i) {
890 Esys_setError(SYSTEM_ERROR, "Can NOT reduce numUndefined.");
891 return;
892 }
893
894 iter++;
895 /* printf(" coarsening loop %d: num of undefined rows = %d \n",iter, numUndefined); */
896
897 } /* end of while loop */
898
899 /* map to output :*/
900 Paso_Coupler_fillOverlap(n, Status, w_coupler);
901 #pragma omp parallel for private(i)
902 for (i=0; i< n; ++i) {
903 if (Status[i] > -50.) {
904 split_marker[i]=PASO_AMG_IN_C;
905 } else {
906 split_marker[i]=PASO_AMG_IN_F;
907 }
908 }
909 /* clean up : */
910 Paso_Coupler_free(w_coupler);
911 TMPMEMFREE(random);
912 TMPMEMFREE(w);
913 TMPMEMFREE(Status);
914 TMPMEMFREE(ST_flag);
915
916 return;
917 }
918

  ViewVC Help
Powered by ViewVC 1.1.26