/[escript]/trunk/paso/src/Smoother.c
ViewVC logotype

Contents of /trunk/paso/src/Smoother.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3094 - (show annotations)
Fri Aug 13 08:38:06 2010 UTC (8 years, 8 months ago) by gross
Original Path: trunk/paso/src/Solver_GS.c
File MIME type: text/plain
File size: 19913 byte(s)
The MPI and sequational GAUSS_SEIDEL have been merged.
The couring and main diagonal pointer is now manged by the patternm which means that they are calculated once only even if the preconditioner is deleted.



1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: GS preconditioner with reordering */
18
19 /**************************************************************/
20
21 /* Copyrights by ACcESS Australia 2003,2004,2005,2006,2007,2008 */
22 /* Author: artak@uq.edu.au */
23
24 /**************************************************************/
25
26 #include "Paso.h"
27 #include "Solver.h"
28 #include "PasoUtil.h"
29
30 #include <stdio.h>
31
32
33 /**************************************************************/
34
35 /* free all memory used by GS */
36
37 void Paso_Solver_GS_free(Paso_Solver_GS * in) {
38 if (in!=NULL) {
39 Paso_Solver_LocalGS_free(in->localGS);
40 MEMFREE(in);
41 }
42 }
43 void Paso_Solver_LocalGS_free(Paso_Solver_LocalGS * in) {
44 if (in!=NULL) {
45 MEMFREE(in->diag);
46 MEMFREE(in->pivot);
47 MEMFREE(in);
48 }
49 }
50 /**************************************************************/
51
52 /* constructs the symmetric Gauss-Seidel preconditioner
53
54 */
55 Paso_Solver_GS* Paso_Solver_getGS(Paso_SystemMatrix * A_p, dim_t sweeps, bool_t is_local, bool_t verbose)
56 {
57
58 /* allocations: */
59 Paso_Solver_GS* out=MEMALLOC(1,Paso_Solver_GS);
60 if (! Paso_checkPtr(out)) {
61 out->localGS=Paso_Solver_getLocalGS(A_p->mainBlock,sweeps,verbose);
62 out->is_local=is_local;
63 }
64 if (Paso_MPIInfo_noError(A_p->mpi_info)) {
65 return out;
66 } else {
67 Paso_Solver_GS_free(out);
68 return NULL;
69 }
70 }
71 Paso_Solver_LocalGS* Paso_Solver_getLocalGS(Paso_SparseMatrix * A_p, dim_t sweeps, bool_t verbose) {
72
73 dim_t n=A_p->numRows;
74 dim_t n_block=A_p->row_block_size;
75 dim_t block_size=A_p->block_size;
76
77 double time0=Paso_timer();
78 /* allocations: */
79 Paso_Solver_LocalGS* out=MEMALLOC(1,Paso_Solver_LocalGS);
80 if (! Paso_checkPtr(out)) {
81
82 out->diag=MEMALLOC( ((size_t) n) * ((size_t) block_size),double);
83 out->pivot=MEMALLOC( ((size_t) n) * ((size_t) n_block), index_t);
84 out->sweeps=sweeps;
85
86 if ( ! ( Paso_checkPtr(out->diag) || Paso_checkPtr(out->pivot) ) ) {
87 Paso_SparseMatrix_invMain(A_p, out->diag, out->pivot );
88 }
89
90 }
91 time0=Paso_timer()-time0;
92
93 if (Paso_noError()) {
94 if (verbose) printf("timing: Gauss-Seidel preparation: elemination : %e\n",time0);
95 return out;
96 } else {
97 Paso_Solver_LocalGS_free(out);
98 return NULL;
99 }
100 }
101
102 /**************************************************************/
103
104 /* Apply Gauss-Seidel
105
106 in fact it solves Ax=b in two steps:
107
108 step1: among different MPI ranks, we use block Jacobi
109
110 x{k} = x{k-1} + D{-1}(b-A*x{k-1})
111
112 => D*x{k} = b - (E+F)x{k-1}
113
114 where matrix D is (let p be the number of nodes):
115
116 --------------------
117 |A1| | | ... | |
118 --------------------
119 | |A2| | ... | |
120 --------------------
121 | | |A3| ... | |
122 --------------------
123 | ... |
124 --------------------
125 | | | | ... |Ap|
126 --------------------
127
128 and Ai (i \in [1,p]) represents the mainBlock of matrix
129 A on rank i. Matrix (E+F) is represented as the coupleBlock
130 of matrix A on each rank (annotated as ACi).
131
132 Therefore, step1 can be turned into the following for rank i:
133
134 => Ai * x{k} = b - ACi * x{k-1}
135
136 where both x{k} and b are the segment of x and b on node i,
137 and x{k-1} is the old segment values of x on all other nodes.
138
139 step2: inside rank i, we use Gauss-Seidel
140
141 let b'= b - ACi * x{k-1} we have Ai * x{k} = b' for rank i
142 by using symetrix Gauss-Seidel,
143
144 this can be solved in a forward phase and a backward phase:
145
146 forward phase: x{m} = diag(Ai){-1} (b' - E*x{m} - F*x{m-1})
147 backward phase: x{m+1} = diag(Ai){-1} (b' - F*{m+1} - E*x{m})
148
149 */
150
151 void Paso_Solver_solveGS(Paso_SystemMatrix* A_p, Paso_Solver_GS * gs, double * x, const double * b)
152 {
153 register dim_t i;
154 const dim_t n=A_p->mainBlock->numRows;
155 const dim_t n_block=A_p->mainBlock->row_block_size;
156 double *remote_x=NULL;
157 const dim_t sweeps_ref=gs->localGS->sweeps;
158 dim_t sweeps=gs->localGS->sweeps;
159 const bool_t remote = (!gs->is_local) && (A_p->mpi_info->size > 1);
160 double *new_b = NULL;
161
162 if (remote) {
163 new_b=TMPMEMALLOC(n*n_block,double);
164 gs->localGS->sweeps=(dim_t) ceil( sqrt(DBLE(gs->localGS->sweeps)) );
165 }
166
167
168 Paso_Solver_solveLocalGS(A_p->mainBlock,gs->localGS,x,b);
169 sweeps-=gs->localGS->sweeps;
170
171 while (sweeps > 0 && remote ) {
172 Paso_SystemMatrix_startCollect(A_p,x);
173 /* calculate new_b = b - ACi * x{k-1}, where x{k-1} are remote
174 value of x, which requires MPI communications */
175 #pragma omp parallel for private(i) schedule(static)
176 for (i=0;i<n*n_block;++i) new_b[i]=b[i];
177
178 remote_x=Paso_SystemMatrix_finishCollect(A_p);
179 /*new_b = (-1) * AC * x + 1. * new_b */
180 Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(DBLE(-1),A_p->col_coupleBlock,remote_x,DBLE(1), new_b);
181
182 Paso_Solver_solveLocalGS(A_p->mainBlock,gs->localGS,x,new_b);
183 sweeps-=gs->localGS->sweeps;
184 }
185 TMPMEMFREE(new_b);
186 gs->localGS->sweeps=sweeps_ref;
187 return;
188 }
189
190 void Paso_Solver_solveLocalGS(Paso_SparseMatrix* A, Paso_Solver_LocalGS * gs, double * x, const double * b)
191 {
192 dim_t i;
193 #ifdef _OPENMP
194 const dim_t nt=omp_get_max_threads();
195 #else
196 const dim_t nt = 1;
197 #endif
198 gs->sweeps=MAX(gs->sweeps,1);
199
200 for (i =0 ; i<gs->sweeps; i++) {
201 if (nt > 1) {
202 Paso_Solver_solveLocalGS_sequential(A,gs,x,b);
203 } else {
204 Paso_Solver_solveLocalGS_colored(A,gs,x,b);
205 /* Paso_Solver_solveLocalGS_tiled(A,gs,x,b); LIN: ADD YOUR STUFF */
206 }
207 }
208 }
209
210 /*
211
212 applies symmetric Gauss Seidel with coloring = (U+D)^{-1}*D* (L+D)^{-1}
213
214 */
215
216
217 void Paso_Solver_solveLocalGS_colored(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x, const double * b)
218 {
219 const dim_t n=A_p->numRows;
220 const dim_t n_block=A_p->row_block_size;
221 const double *diag = gs->diag;
222 /* const index_t* pivot = gs->pivot;
223 const dim_t block_size=A_p->block_size; use for block size >3*/
224
225 register dim_t i,k;
226 register index_t color,iptr_ik, mm;
227 register double A11,A12,A21,A22,A13,A23,A33,A32,A31,S1,S2,S3,R1,R2,R3;
228
229 const index_t* coloring = Paso_Pattern_borrowColoringPointer(A_p->pattern);
230 const dim_t num_colors = Paso_Pattern_getNumColors(A_p->pattern);
231 const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
232
233 /* forward substitution */
234
235 /* color = 0 */
236 if (n_block==1) {
237 #pragma omp parallel for schedule(static) private(i,S1, A11)
238 for (i = 0; i < n; ++i) {
239 if (coloring[i]==0) {
240 /* x_i=x_i-a_ik*x_k */
241 S1=b[i];
242 A11=diag[i];
243 x[i]=A11*S1;
244 }
245 }
246 } else if (n_block==2) {
247 #pragma omp parallel for schedule(static) private(i,S1,S2,A11,A21,A12,A22)
248 for (i = 0; i < n; ++i) {
249 if (coloring[i]== 0 ) {
250 /* x_i=x_i-a_ik*x_k */
251 S1=b[2*i];
252 S2=b[2*i+1];
253
254 A11=diag[i*4];
255 A12=diag[i*4+2];
256 A21=diag[i*4+1];
257 A22=diag[i*4+3];
258
259 x[2*i ]=A11 * S1 + A12 * S2;
260 x[2*i+1]=A21 * S1 + A22 * S2;
261
262 }
263 }
264 } else if (n_block==3) {
265 #pragma omp parallel for schedule(static) private(i,S1,S2,S3,A11,A21,A31,A12,A22,A32,A13,A23,A33)
266 for (i = 0; i < n; ++i) {
267 if (coloring[i]==0) {
268 /* x_i=x_i-a_ik*x_k */
269 S1=b[3*i];
270 S2=b[3*i+1];
271 S3=b[3*i+2];
272 A11=diag[i*9 ];
273 A21=diag[i*9+1];
274 A31=diag[i*9+2];
275 A12=diag[i*9+3];
276 A22=diag[i*9+4];
277 A32=diag[i*9+5];
278 A13=diag[i*9+6];
279 A23=diag[i*9+7];
280 A33=diag[i*9+8];
281 x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
282 x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
283 x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
284 }
285 }
286 } /* add block size >3 */
287
288 for (color=1;color<num_colors;++color) {
289 if (n_block==1) {
290 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1, A11,R1)
291 for (i = 0; i < n; ++i) {
292 if (coloring[i]==color) {
293 /* x_i=x_i-a_ik*x_k */
294 S1=b[i];
295 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
296 k=A_p->pattern->index[iptr_ik];
297 if (coloring[k]<color) {
298 R1=x[k];
299 S1-=A_p->val[iptr_ik]*R1;
300 }
301 }
302 A11=diag[i];
303 x[i]=A11*S1;
304 }
305 }
306 } else if (n_block==2) {
307 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2,A11,A21,A12,A22)
308 for (i = 0; i < n; ++i) {
309 if (coloring[i]==color) {
310 /* x_i=x_i-a_ik*x_k */
311 S1=b[2*i];
312 S2=b[2*i+1];
313 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
314 k=A_p->pattern->index[iptr_ik];
315 if (coloring[k]<color) {
316 R1=x[2*k];
317 R2=x[2*k+1];
318 S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
319 S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
320 }
321 }
322 A11=diag[i*4];
323 A12=diag[i*4+2];
324 A21=diag[i*4+1];
325 A22=diag[i*4+3];
326
327 x[2*i ]=A11 * S1 + A12 * S2;
328 x[2*i+1]=A21 * S1 + A22 * S2;
329
330 }
331
332 }
333 } else if (n_block==3) {
334 #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3,A11,A21,A31,A12,A22,A32,A13,A23,A33)
335 for (i = 0; i < n; ++i) {
336 if (coloring[i]==color) {
337 /* x_i=x_i-a_ik*x_k */
338 S1=b[3*i];
339 S2=b[3*i+1];
340 S3=b[3*i+2];
341 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
342 k=A_p->pattern->index[iptr_ik];
343 if (coloring[k]<color) {
344 R1=x[3*k];
345 R2=x[3*k+1];
346 R3=x[3*k+2];
347 S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
348 S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
349 S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
350 }
351 }
352 A11=diag[i*9 ];
353 A21=diag[i*9+1];
354 A31=diag[i*9+2];
355 A12=diag[i*9+3];
356 A22=diag[i*9+4];
357 A32=diag[i*9+5];
358 A13=diag[i*9+6];
359 A23=diag[i*9+7];
360 A33=diag[i*9+8];
361 x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
362 x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
363 x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
364 }
365 }
366 } /* add block size >3 */
367 } /* end of coloring loop */
368
369
370 /* backward substitution */
371 for (color=(num_colors)-2 ;color>-1;--color) { /* Note: color=(num_colors)-1 is not required */
372 if (n_block==1) {
373 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k,S1,R1)
374 for (i = 0; i < n; ++i) {
375 if (coloring[i]==color) {
376
377 mm=ptr_main[i];
378 R1=x[i];
379 A11=A_p->val[mm];
380 S1 = A11 * R1;
381
382 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
383 k=A_p->pattern->index[iptr_ik];
384 if (coloring[k]>color) {
385 R1=x[k];
386 S1-=A_p->val[iptr_ik]*R1;
387 }
388 }
389
390 A11=diag[i];
391 x[i]=A11*S1;
392
393 }
394 }
395 } else if (n_block==2) {
396 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k,S1,S2,R1,R2,A11,A21,A12,A22)
397 for (i = 0; i < n; ++i) {
398 if (coloring[i]==color) {
399
400 mm=ptr_main[i];
401
402 R1=x[2*i];
403 R2=x[2*i+1];
404
405 A11=A_p->val[mm*4 ];
406 A21=A_p->val[mm*4+1];
407 A12=A_p->val[mm*4+2];
408 A22=A_p->val[mm*4+3];
409
410 S1 = A11 * R1 + A12 * R2;
411 S2 = A21 * R1 + A22 * R2;
412
413 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
414 k=A_p->pattern->index[iptr_ik];
415 if (coloring[k]>color) {
416 R1=x[2*k];
417 R2=x[2*k+1];
418 S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
419 S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
420 }
421 }
422
423 A11=diag[i*4];
424 A12=diag[i*4+2];
425 A21=diag[i*4+1];
426 A22=diag[i*4+3];
427
428 x[2*i ]=A11 * S1 + A12 * S2;
429 x[2*i+1]=A21 * S1 + A22 * S2;
430
431 }
432 }
433 } else if (n_block==3) {
434 #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k,S1,S2,S3,R1,R2,R3,A11,A21,A31,A12,A22,A32,A13,A23,A33)
435 for (i = 0; i < n; ++i) {
436 if (coloring[i]==color) {
437
438 mm=ptr_main[i];
439 R1=x[3*i];
440 R2=x[3*i+1];
441 R3=x[3*i+2];
442
443 A11=A_p->val[mm*9 ];
444 A21=A_p->val[mm*9+1];
445 A31=A_p->val[mm*9+2];
446 A12=A_p->val[mm*9+3];
447 A22=A_p->val[mm*9+4];
448 A32=A_p->val[mm*9+5];
449 A13=A_p->val[mm*9+6];
450 A23=A_p->val[mm*9+7];
451 A33=A_p->val[mm*9+8];
452
453 S1 =A11 * R1 + A12 * R2 + A13 * R3;
454 S2 =A21 * R1 + A22 * R2 + A23 * R3;
455 S3 =A31 * R1 + A32 * R2 + A33 * R3;
456
457 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
458 k=A_p->pattern->index[iptr_ik];
459 if (coloring[k]>color) {
460 R1=x[3*k];
461 R2=x[3*k+1];
462 R3=x[3*k+2];
463 S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
464 S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
465 S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
466 }
467 }
468
469 A11=diag[i*9 ];
470 A21=diag[i*9+1];
471 A31=diag[i*9+2];
472 A12=diag[i*9+3];
473 A22=diag[i*9+4];
474 A32=diag[i*9+5];
475 A13=diag[i*9+6];
476 A23=diag[i*9+7];
477 A33=diag[i*9+8];
478
479 x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
480 x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
481 x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
482
483 }
484 }
485 } /* add block size >3 */
486 }
487 return;
488 }
489
490 void Paso_Solver_solveLocalGS_sequential(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x, const double * b)
491 {
492 const dim_t n=A_p->numRows;
493 const dim_t n_block=A_p->row_block_size;
494 const double *diag = gs->diag;
495 /* const index_t* pivot = gs->pivot;
496 const dim_t block_size=A_p->block_size; use for block size >3*/
497
498 register dim_t i,k;
499 register index_t iptr_ik, mm;
500 register double A11,A12,A21,A22,A13,A23,A33,A32,A31,S1,S2,S3,R1,R2,R3;
501
502 const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
503
504 /* forward substitution */
505
506 if (n_block==1) {
507 for (i = 0; i < n; ++i) {
508 /* x_i=x_i-a_ik*x_k */
509 S1=b[i];
510 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
511 k=A_p->pattern->index[iptr_ik];
512 if (k<i) {
513 R1=x[k];
514 S1-=A_p->val[iptr_ik]*R1;
515 } else {
516 break; /* index is ordered */
517 }
518 }
519 A11=diag[i];
520 x[i]=A11*S1;
521 }
522 } else if (n_block==2) {
523 for (i = 0; i < n; ++i) {
524 S1=b[2*i];
525 S2=b[2*i+1];
526 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
527 k=A_p->pattern->index[iptr_ik];
528 if (k<i) {
529 R1=x[2*k];
530 R2=x[2*k+1];
531 S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
532 S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
533 } else {
534 break; /* index is ordered */
535 }
536 }
537 A11=diag[i*4];
538 A12=diag[i*4+2];
539 A21=diag[i*4+1];
540 A22=diag[i*4+3];
541
542 x[2*i ]=A11 * S1 + A12 * S2;
543 x[2*i+1]=A21 * S1 + A22 * S2;
544
545 }
546 } else if (n_block==3) {
547 for (i = 0; i < n; ++i) {
548 /* x_i=x_i-a_ik*x_k */
549 S1=b[3*i];
550 S2=b[3*i+1];
551 S3=b[3*i+2];
552 for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
553 k=A_p->pattern->index[iptr_ik];
554 if ( k<i ) {
555 R1=x[3*k];
556 R2=x[3*k+1];
557 R3=x[3*k+2];
558 S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
559 S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
560 S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
561 } else {
562 break; /* index is ordered */
563 }
564 }
565 A11=diag[i*9 ];
566 A21=diag[i*9+1];
567 A31=diag[i*9+2];
568 A12=diag[i*9+3];
569 A22=diag[i*9+4];
570 A32=diag[i*9+5];
571 A13=diag[i*9+6];
572 A23=diag[i*9+7];
573 A33=diag[i*9+8];
574 x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
575 x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
576 x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
577 }
578
579 } /* add block size >3 */
580
581
582
583 /* backward substitution */
584
585 if (n_block==1) {
586 for (i = n-2; i > -1; ++i) {
587
588 mm=ptr_main[i];
589 R1=x[i];
590 A11=A_p->val[mm];
591 S1 = A11 * R1;
592
593 for (iptr_ik=A_p->pattern->ptr[i+1]-1; iptr_ik > A_p->pattern->ptr[i]-1; ++iptr_ik) {
594 k=A_p->pattern->index[iptr_ik];
595 if (k > i) {
596 R1=x[k];
597 S1-=A_p->val[iptr_ik]*R1;
598 } else {
599 break ;
600 }
601 }
602
603 A11=diag[i];
604 x[i]=A11*S1;
605
606 }
607 } else if (n_block==2) {
608 for (i = n-2; i > -1; ++i) {
609
610 mm=ptr_main[i];
611
612 R1=x[2*i];
613 R2=x[2*i+1];
614
615 A11=A_p->val[mm*4 ];
616 A21=A_p->val[mm*4+1];
617 A12=A_p->val[mm*4+2];
618 A22=A_p->val[mm*4+3];
619
620 S1 = A11 * R1 + A12 * R2;
621 S2 = A21 * R1 + A22 * R2;
622
623 for (iptr_ik=A_p->pattern->ptr[i+1]-1; iptr_ik > A_p->pattern->ptr[i]-1; ++iptr_ik) {
624 k=A_p->pattern->index[iptr_ik];
625 if (k > i) {
626 R1=x[2*k];
627 R2=x[2*k+1];
628 S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
629 S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
630 } else {
631 break ;
632 }
633 }
634
635 A11=diag[i*4];
636 A12=diag[i*4+2];
637 A21=diag[i*4+1];
638 A22=diag[i*4+3];
639
640 x[2*i ]=A11 * S1 + A12 * S2;
641 x[2*i+1]=A21 * S1 + A22 * S2;
642
643
644 }
645 } else if (n_block==3) {
646 for (i = n-2; i > -1; ++i) {
647
648 mm=ptr_main[i];
649 R1=x[3*i];
650 R2=x[3*i+1];
651 R3=x[3*i+2];
652
653 A11=A_p->val[mm*9 ];
654 A21=A_p->val[mm*9+1];
655 A31=A_p->val[mm*9+2];
656 A12=A_p->val[mm*9+3];
657 A22=A_p->val[mm*9+4];
658 A32=A_p->val[mm*9+5];
659 A13=A_p->val[mm*9+6];
660 A23=A_p->val[mm*9+7];
661 A33=A_p->val[mm*9+8];
662
663 S1 =A11 * R1 + A12 * R2 + A13 * R3;
664 S2 =A21 * R1 + A22 * R2 + A23 * R3;
665 S3 =A31 * R1 + A32 * R2 + A33 * R3;
666
667 for (iptr_ik=A_p->pattern->ptr[i+1]-1; iptr_ik > A_p->pattern->ptr[i]-1; ++iptr_ik) {
668 k=A_p->pattern->index[iptr_ik];
669 if (k > i) {
670 R1=x[3*k];
671 R2=x[3*k+1];
672 R3=x[3*k+2];
673 S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
674 S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
675 S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
676 } else {
677 break ;
678 }
679 }
680
681 A11=diag[i*9 ];
682 A21=diag[i*9+1];
683 A31=diag[i*9+2];
684 A12=diag[i*9+3];
685 A22=diag[i*9+4];
686 A32=diag[i*9+5];
687 A13=diag[i*9+6];
688 A23=diag[i*9+7];
689 A33=diag[i*9+8];
690
691 x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
692 x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
693 x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
694
695 }
696
697 } /* add block size >3 */
698
699 return;
700 }
701

  ViewVC Help
Powered by ViewVC 1.1.26