/[escript]/trunk/paso/src/Smoother.c
ViewVC logotype

Annotation of /trunk/paso/src/Smoother.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3094 - (hide annotations)
Fri Aug 13 08:38:06 2010 UTC (8 years, 5 months ago) by gross
Original Path: trunk/paso/src/Solver_GS.c
File MIME type: text/plain
File size: 19913 byte(s)
The MPI and sequational GAUSS_SEIDEL have been merged.
The couring and main diagonal pointer is now manged by the patternm which means that they are calculated once only even if the preconditioner is deleted.



1 jfenwick 1851
2     /*******************************************************
3     *
4 jfenwick 2881 * Copyright (c) 2003-2010 by University of Queensland
5 jfenwick 1851 * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13    
14    
15     /**************************************************************/
16    
17     /* Paso: GS preconditioner with reordering */
18    
19     /**************************************************************/
20    
21     /* Copyrights by ACcESS Australia 2003,2004,2005,2006,2007,2008 */
22     /* Author: artak@uq.edu.au */
23    
24     /**************************************************************/
25    
26     #include "Paso.h"
27     #include "Solver.h"
28     #include "PasoUtil.h"
29    
30     #include <stdio.h>
31    
32 gross 3094
33 jfenwick 1851 /**************************************************************/
34    
35     /* free all memory used by GS */
36    
37     void Paso_Solver_GS_free(Paso_Solver_GS * in) {
38     if (in!=NULL) {
39 gross 3094 Paso_Solver_LocalGS_free(in->localGS);
40 jfenwick 1851 MEMFREE(in);
41     }
42     }
43 gross 3094 void Paso_Solver_LocalGS_free(Paso_Solver_LocalGS * in) {
44     if (in!=NULL) {
45     MEMFREE(in->diag);
46     MEMFREE(in->pivot);
47     MEMFREE(in);
48     }
49     }
50 jfenwick 1851 /**************************************************************/
51    
52 gross 3094 /* constructs the symmetric Gauss-Seidel preconditioner
53 jfenwick 1851
54     */
55 gross 3094 Paso_Solver_GS* Paso_Solver_getGS(Paso_SystemMatrix * A_p, dim_t sweeps, bool_t is_local, bool_t verbose)
56     {
57    
58 jfenwick 1851 /* allocations: */
59     Paso_Solver_GS* out=MEMALLOC(1,Paso_Solver_GS);
60 gross 3094 if (! Paso_checkPtr(out)) {
61     out->localGS=Paso_Solver_getLocalGS(A_p->mainBlock,sweeps,verbose);
62     out->is_local=is_local;
63 jfenwick 1851 }
64 gross 3094 if (Paso_MPIInfo_noError(A_p->mpi_info)) {
65 jfenwick 1851 return out;
66 gross 3094 } else {
67 jfenwick 1851 Paso_Solver_GS_free(out);
68     return NULL;
69     }
70     }
71 gross 3094 Paso_Solver_LocalGS* Paso_Solver_getLocalGS(Paso_SparseMatrix * A_p, dim_t sweeps, bool_t verbose) {
72    
73     dim_t n=A_p->numRows;
74     dim_t n_block=A_p->row_block_size;
75     dim_t block_size=A_p->block_size;
76    
77     double time0=Paso_timer();
78     /* allocations: */
79     Paso_Solver_LocalGS* out=MEMALLOC(1,Paso_Solver_LocalGS);
80     if (! Paso_checkPtr(out)) {
81    
82     out->diag=MEMALLOC( ((size_t) n) * ((size_t) block_size),double);
83     out->pivot=MEMALLOC( ((size_t) n) * ((size_t) n_block), index_t);
84     out->sweeps=sweeps;
85    
86     if ( ! ( Paso_checkPtr(out->diag) || Paso_checkPtr(out->pivot) ) ) {
87     Paso_SparseMatrix_invMain(A_p, out->diag, out->pivot );
88     }
89    
90     }
91     time0=Paso_timer()-time0;
92    
93     if (Paso_noError()) {
94     if (verbose) printf("timing: Gauss-Seidel preparation: elemination : %e\n",time0);
95     return out;
96     } else {
97     Paso_Solver_LocalGS_free(out);
98     return NULL;
99     }
100     }
101 jfenwick 1851
102     /**************************************************************/
103    
104 gross 3094 /* Apply Gauss-Seidel
105 jfenwick 1851
106 gross 3094 in fact it solves Ax=b in two steps:
107    
108     step1: among different MPI ranks, we use block Jacobi
109 jfenwick 1851
110 gross 3094 x{k} = x{k-1} + D{-1}(b-A*x{k-1})
111 jfenwick 1851
112 gross 3094 => D*x{k} = b - (E+F)x{k-1}
113    
114     where matrix D is (let p be the number of nodes):
115    
116     --------------------
117     |A1| | | ... | |
118     --------------------
119     | |A2| | ... | |
120     --------------------
121     | | |A3| ... | |
122     --------------------
123     | ... |
124     --------------------
125     | | | | ... |Ap|
126     --------------------
127    
128     and Ai (i \in [1,p]) represents the mainBlock of matrix
129     A on rank i. Matrix (E+F) is represented as the coupleBlock
130     of matrix A on each rank (annotated as ACi).
131    
132     Therefore, step1 can be turned into the following for rank i:
133    
134     => Ai * x{k} = b - ACi * x{k-1}
135    
136     where both x{k} and b are the segment of x and b on node i,
137     and x{k-1} is the old segment values of x on all other nodes.
138    
139     step2: inside rank i, we use Gauss-Seidel
140    
141     let b'= b - ACi * x{k-1} we have Ai * x{k} = b' for rank i
142     by using symetrix Gauss-Seidel,
143    
144     this can be solved in a forward phase and a backward phase:
145    
146     forward phase: x{m} = diag(Ai){-1} (b' - E*x{m} - F*x{m-1})
147     backward phase: x{m+1} = diag(Ai){-1} (b' - F*{m+1} - E*x{m})
148    
149 jfenwick 1851 */
150    
151 gross 3094 void Paso_Solver_solveGS(Paso_SystemMatrix* A_p, Paso_Solver_GS * gs, double * x, const double * b)
152     {
153     register dim_t i;
154     const dim_t n=A_p->mainBlock->numRows;
155     const dim_t n_block=A_p->mainBlock->row_block_size;
156     double *remote_x=NULL;
157     const dim_t sweeps_ref=gs->localGS->sweeps;
158     dim_t sweeps=gs->localGS->sweeps;
159     const bool_t remote = (!gs->is_local) && (A_p->mpi_info->size > 1);
160     double *new_b = NULL;
161    
162     if (remote) {
163     new_b=TMPMEMALLOC(n*n_block,double);
164     gs->localGS->sweeps=(dim_t) ceil( sqrt(DBLE(gs->localGS->sweeps)) );
165     }
166 jfenwick 1851
167 gross 3094
168     Paso_Solver_solveLocalGS(A_p->mainBlock,gs->localGS,x,b);
169     sweeps-=gs->localGS->sweeps;
170    
171     while (sweeps > 0 && remote ) {
172     Paso_SystemMatrix_startCollect(A_p,x);
173     /* calculate new_b = b - ACi * x{k-1}, where x{k-1} are remote
174     value of x, which requires MPI communications */
175     #pragma omp parallel for private(i) schedule(static)
176     for (i=0;i<n*n_block;++i) new_b[i]=b[i];
177    
178     remote_x=Paso_SystemMatrix_finishCollect(A_p);
179     /*new_b = (-1) * AC * x + 1. * new_b */
180     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(DBLE(-1),A_p->col_coupleBlock,remote_x,DBLE(1), new_b);
181    
182     Paso_Solver_solveLocalGS(A_p->mainBlock,gs->localGS,x,new_b);
183     sweeps-=gs->localGS->sweeps;
184     }
185     TMPMEMFREE(new_b);
186     gs->localGS->sweeps=sweeps_ref;
187     return;
188     }
189 jfenwick 1851
190 gross 3094 void Paso_Solver_solveLocalGS(Paso_SparseMatrix* A, Paso_Solver_LocalGS * gs, double * x, const double * b)
191     {
192     dim_t i;
193     #ifdef _OPENMP
194     const dim_t nt=omp_get_max_threads();
195     #else
196     const dim_t nt = 1;
197     #endif
198     gs->sweeps=MAX(gs->sweeps,1);
199    
200     for (i =0 ; i<gs->sweeps; i++) {
201     if (nt > 1) {
202     Paso_Solver_solveLocalGS_sequential(A,gs,x,b);
203     } else {
204     Paso_Solver_solveLocalGS_colored(A,gs,x,b);
205     /* Paso_Solver_solveLocalGS_tiled(A,gs,x,b); LIN: ADD YOUR STUFF */
206     }
207     }
208 jfenwick 1851 }
209    
210 gross 3094 /*
211    
212     applies symmetric Gauss Seidel with coloring = (U+D)^{-1}*D* (L+D)^{-1}
213    
214     */
215    
216    
217     void Paso_Solver_solveLocalGS_colored(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x, const double * b)
218     {
219     const dim_t n=A_p->numRows;
220     const dim_t n_block=A_p->row_block_size;
221     const double *diag = gs->diag;
222     /* const index_t* pivot = gs->pivot;
223     const dim_t block_size=A_p->block_size; use for block size >3*/
224    
225     register dim_t i,k;
226     register index_t color,iptr_ik, mm;
227     register double A11,A12,A21,A22,A13,A23,A33,A32,A31,S1,S2,S3,R1,R2,R3;
228    
229     const index_t* coloring = Paso_Pattern_borrowColoringPointer(A_p->pattern);
230     const dim_t num_colors = Paso_Pattern_getNumColors(A_p->pattern);
231     const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
232    
233     /* forward substitution */
234    
235     /* color = 0 */
236     if (n_block==1) {
237     #pragma omp parallel for schedule(static) private(i,S1, A11)
238     for (i = 0; i < n; ++i) {
239     if (coloring[i]==0) {
240     /* x_i=x_i-a_ik*x_k */
241     S1=b[i];
242     A11=diag[i];
243     x[i]=A11*S1;
244     }
245     }
246     } else if (n_block==2) {
247     #pragma omp parallel for schedule(static) private(i,S1,S2,A11,A21,A12,A22)
248     for (i = 0; i < n; ++i) {
249     if (coloring[i]== 0 ) {
250     /* x_i=x_i-a_ik*x_k */
251     S1=b[2*i];
252     S2=b[2*i+1];
253    
254     A11=diag[i*4];
255     A12=diag[i*4+2];
256     A21=diag[i*4+1];
257     A22=diag[i*4+3];
258    
259     x[2*i ]=A11 * S1 + A12 * S2;
260     x[2*i+1]=A21 * S1 + A22 * S2;
261    
262     }
263     }
264     } else if (n_block==3) {
265     #pragma omp parallel for schedule(static) private(i,S1,S2,S3,A11,A21,A31,A12,A22,A32,A13,A23,A33)
266     for (i = 0; i < n; ++i) {
267     if (coloring[i]==0) {
268     /* x_i=x_i-a_ik*x_k */
269     S1=b[3*i];
270     S2=b[3*i+1];
271     S3=b[3*i+2];
272     A11=diag[i*9 ];
273     A21=diag[i*9+1];
274     A31=diag[i*9+2];
275     A12=diag[i*9+3];
276     A22=diag[i*9+4];
277     A32=diag[i*9+5];
278     A13=diag[i*9+6];
279     A23=diag[i*9+7];
280     A33=diag[i*9+8];
281     x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
282     x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
283     x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
284     }
285     }
286     } /* add block size >3 */
287    
288     for (color=1;color<num_colors;++color) {
289     if (n_block==1) {
290     #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1, A11,R1)
291     for (i = 0; i < n; ++i) {
292     if (coloring[i]==color) {
293     /* x_i=x_i-a_ik*x_k */
294     S1=b[i];
295     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
296     k=A_p->pattern->index[iptr_ik];
297     if (coloring[k]<color) {
298     R1=x[k];
299     S1-=A_p->val[iptr_ik]*R1;
300     }
301     }
302     A11=diag[i];
303     x[i]=A11*S1;
304     }
305     }
306     } else if (n_block==2) {
307     #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,R1,R2,A11,A21,A12,A22)
308     for (i = 0; i < n; ++i) {
309     if (coloring[i]==color) {
310     /* x_i=x_i-a_ik*x_k */
311     S1=b[2*i];
312     S2=b[2*i+1];
313     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
314     k=A_p->pattern->index[iptr_ik];
315     if (coloring[k]<color) {
316     R1=x[2*k];
317     R2=x[2*k+1];
318     S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
319     S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
320     }
321     }
322     A11=diag[i*4];
323     A12=diag[i*4+2];
324     A21=diag[i*4+1];
325     A22=diag[i*4+3];
326    
327     x[2*i ]=A11 * S1 + A12 * S2;
328     x[2*i+1]=A21 * S1 + A22 * S2;
329    
330     }
331    
332     }
333     } else if (n_block==3) {
334     #pragma omp parallel for schedule(static) private(i,iptr_ik,k,S1,S2,S3,R1,R2,R3,A11,A21,A31,A12,A22,A32,A13,A23,A33)
335     for (i = 0; i < n; ++i) {
336     if (coloring[i]==color) {
337     /* x_i=x_i-a_ik*x_k */
338     S1=b[3*i];
339     S2=b[3*i+1];
340     S3=b[3*i+2];
341     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
342     k=A_p->pattern->index[iptr_ik];
343     if (coloring[k]<color) {
344     R1=x[3*k];
345     R2=x[3*k+1];
346     R3=x[3*k+2];
347     S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
348     S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
349     S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
350     }
351     }
352     A11=diag[i*9 ];
353     A21=diag[i*9+1];
354     A31=diag[i*9+2];
355     A12=diag[i*9+3];
356     A22=diag[i*9+4];
357     A32=diag[i*9+5];
358     A13=diag[i*9+6];
359     A23=diag[i*9+7];
360     A33=diag[i*9+8];
361     x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
362     x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
363     x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
364     }
365     }
366     } /* add block size >3 */
367     } /* end of coloring loop */
368    
369    
370     /* backward substitution */
371     for (color=(num_colors)-2 ;color>-1;--color) { /* Note: color=(num_colors)-1 is not required */
372     if (n_block==1) {
373     #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k,S1,R1)
374     for (i = 0; i < n; ++i) {
375     if (coloring[i]==color) {
376    
377     mm=ptr_main[i];
378     R1=x[i];
379     A11=A_p->val[mm];
380     S1 = A11 * R1;
381    
382     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
383     k=A_p->pattern->index[iptr_ik];
384     if (coloring[k]>color) {
385     R1=x[k];
386     S1-=A_p->val[iptr_ik]*R1;
387     }
388     }
389    
390     A11=diag[i];
391     x[i]=A11*S1;
392    
393     }
394     }
395     } else if (n_block==2) {
396     #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k,S1,S2,R1,R2,A11,A21,A12,A22)
397     for (i = 0; i < n; ++i) {
398     if (coloring[i]==color) {
399    
400     mm=ptr_main[i];
401    
402     R1=x[2*i];
403     R2=x[2*i+1];
404    
405     A11=A_p->val[mm*4 ];
406     A21=A_p->val[mm*4+1];
407     A12=A_p->val[mm*4+2];
408     A22=A_p->val[mm*4+3];
409    
410     S1 = A11 * R1 + A12 * R2;
411     S2 = A21 * R1 + A22 * R2;
412    
413     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
414     k=A_p->pattern->index[iptr_ik];
415     if (coloring[k]>color) {
416     R1=x[2*k];
417     R2=x[2*k+1];
418     S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
419     S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
420     }
421     }
422    
423     A11=diag[i*4];
424     A12=diag[i*4+2];
425     A21=diag[i*4+1];
426     A22=diag[i*4+3];
427    
428     x[2*i ]=A11 * S1 + A12 * S2;
429     x[2*i+1]=A21 * S1 + A22 * S2;
430    
431     }
432     }
433     } else if (n_block==3) {
434     #pragma omp parallel for schedule(static) private(mm, i,iptr_ik,k,S1,S2,S3,R1,R2,R3,A11,A21,A31,A12,A22,A32,A13,A23,A33)
435     for (i = 0; i < n; ++i) {
436     if (coloring[i]==color) {
437    
438     mm=ptr_main[i];
439     R1=x[3*i];
440     R2=x[3*i+1];
441     R3=x[3*i+2];
442    
443     A11=A_p->val[mm*9 ];
444     A21=A_p->val[mm*9+1];
445     A31=A_p->val[mm*9+2];
446     A12=A_p->val[mm*9+3];
447     A22=A_p->val[mm*9+4];
448     A32=A_p->val[mm*9+5];
449     A13=A_p->val[mm*9+6];
450     A23=A_p->val[mm*9+7];
451     A33=A_p->val[mm*9+8];
452    
453     S1 =A11 * R1 + A12 * R2 + A13 * R3;
454     S2 =A21 * R1 + A22 * R2 + A23 * R3;
455     S3 =A31 * R1 + A32 * R2 + A33 * R3;
456    
457     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
458     k=A_p->pattern->index[iptr_ik];
459     if (coloring[k]>color) {
460     R1=x[3*k];
461     R2=x[3*k+1];
462     R3=x[3*k+2];
463     S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
464     S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
465     S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
466     }
467     }
468    
469     A11=diag[i*9 ];
470     A21=diag[i*9+1];
471     A31=diag[i*9+2];
472     A12=diag[i*9+3];
473     A22=diag[i*9+4];
474     A32=diag[i*9+5];
475     A13=diag[i*9+6];
476     A23=diag[i*9+7];
477     A33=diag[i*9+8];
478    
479     x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
480     x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
481     x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
482    
483     }
484     }
485     } /* add block size >3 */
486     }
487     return;
488     }
489    
490     void Paso_Solver_solveLocalGS_sequential(Paso_SparseMatrix* A_p, Paso_Solver_LocalGS * gs, double * x, const double * b)
491     {
492     const dim_t n=A_p->numRows;
493     const dim_t n_block=A_p->row_block_size;
494     const double *diag = gs->diag;
495     /* const index_t* pivot = gs->pivot;
496     const dim_t block_size=A_p->block_size; use for block size >3*/
497    
498     register dim_t i,k;
499     register index_t iptr_ik, mm;
500     register double A11,A12,A21,A22,A13,A23,A33,A32,A31,S1,S2,S3,R1,R2,R3;
501    
502     const index_t* ptr_main = Paso_SparseMatrix_borrowMainDiagonalPointer(A_p);
503    
504     /* forward substitution */
505    
506     if (n_block==1) {
507     for (i = 0; i < n; ++i) {
508     /* x_i=x_i-a_ik*x_k */
509     S1=b[i];
510     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
511     k=A_p->pattern->index[iptr_ik];
512     if (k<i) {
513     R1=x[k];
514     S1-=A_p->val[iptr_ik]*R1;
515     } else {
516     break; /* index is ordered */
517     }
518     }
519     A11=diag[i];
520     x[i]=A11*S1;
521     }
522     } else if (n_block==2) {
523     for (i = 0; i < n; ++i) {
524     S1=b[2*i];
525     S2=b[2*i+1];
526     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
527     k=A_p->pattern->index[iptr_ik];
528     if (k<i) {
529     R1=x[2*k];
530     R2=x[2*k+1];
531     S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
532     S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
533     } else {
534     break; /* index is ordered */
535     }
536     }
537     A11=diag[i*4];
538     A12=diag[i*4+2];
539     A21=diag[i*4+1];
540     A22=diag[i*4+3];
541    
542     x[2*i ]=A11 * S1 + A12 * S2;
543     x[2*i+1]=A21 * S1 + A22 * S2;
544    
545     }
546     } else if (n_block==3) {
547     for (i = 0; i < n; ++i) {
548     /* x_i=x_i-a_ik*x_k */
549     S1=b[3*i];
550     S2=b[3*i+1];
551     S3=b[3*i+2];
552     for (iptr_ik=A_p->pattern->ptr[i];iptr_ik<A_p->pattern->ptr[i+1]; ++iptr_ik) {
553     k=A_p->pattern->index[iptr_ik];
554     if ( k<i ) {
555     R1=x[3*k];
556     R2=x[3*k+1];
557     R3=x[3*k+2];
558     S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
559     S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
560     S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
561     } else {
562     break; /* index is ordered */
563     }
564     }
565     A11=diag[i*9 ];
566     A21=diag[i*9+1];
567     A31=diag[i*9+2];
568     A12=diag[i*9+3];
569     A22=diag[i*9+4];
570     A32=diag[i*9+5];
571     A13=diag[i*9+6];
572     A23=diag[i*9+7];
573     A33=diag[i*9+8];
574     x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
575     x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
576     x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
577     }
578    
579     } /* add block size >3 */
580    
581    
582    
583     /* backward substitution */
584    
585     if (n_block==1) {
586     for (i = n-2; i > -1; ++i) {
587    
588     mm=ptr_main[i];
589     R1=x[i];
590     A11=A_p->val[mm];
591     S1 = A11 * R1;
592    
593     for (iptr_ik=A_p->pattern->ptr[i+1]-1; iptr_ik > A_p->pattern->ptr[i]-1; ++iptr_ik) {
594     k=A_p->pattern->index[iptr_ik];
595     if (k > i) {
596     R1=x[k];
597     S1-=A_p->val[iptr_ik]*R1;
598     } else {
599     break ;
600     }
601     }
602    
603     A11=diag[i];
604     x[i]=A11*S1;
605    
606     }
607     } else if (n_block==2) {
608     for (i = n-2; i > -1; ++i) {
609    
610     mm=ptr_main[i];
611    
612     R1=x[2*i];
613     R2=x[2*i+1];
614    
615     A11=A_p->val[mm*4 ];
616     A21=A_p->val[mm*4+1];
617     A12=A_p->val[mm*4+2];
618     A22=A_p->val[mm*4+3];
619    
620     S1 = A11 * R1 + A12 * R2;
621     S2 = A21 * R1 + A22 * R2;
622    
623     for (iptr_ik=A_p->pattern->ptr[i+1]-1; iptr_ik > A_p->pattern->ptr[i]-1; ++iptr_ik) {
624     k=A_p->pattern->index[iptr_ik];
625     if (k > i) {
626     R1=x[2*k];
627     R2=x[2*k+1];
628     S1-=A_p->val[4*iptr_ik ]*R1+A_p->val[4*iptr_ik+2]*R2;
629     S2-=A_p->val[4*iptr_ik+1]*R1+A_p->val[4*iptr_ik+3]*R2;
630     } else {
631     break ;
632     }
633     }
634    
635     A11=diag[i*4];
636     A12=diag[i*4+2];
637     A21=diag[i*4+1];
638     A22=diag[i*4+3];
639    
640     x[2*i ]=A11 * S1 + A12 * S2;
641     x[2*i+1]=A21 * S1 + A22 * S2;
642    
643    
644     }
645     } else if (n_block==3) {
646     for (i = n-2; i > -1; ++i) {
647    
648     mm=ptr_main[i];
649     R1=x[3*i];
650     R2=x[3*i+1];
651     R3=x[3*i+2];
652    
653     A11=A_p->val[mm*9 ];
654     A21=A_p->val[mm*9+1];
655     A31=A_p->val[mm*9+2];
656     A12=A_p->val[mm*9+3];
657     A22=A_p->val[mm*9+4];
658     A32=A_p->val[mm*9+5];
659     A13=A_p->val[mm*9+6];
660     A23=A_p->val[mm*9+7];
661     A33=A_p->val[mm*9+8];
662    
663     S1 =A11 * R1 + A12 * R2 + A13 * R3;
664     S2 =A21 * R1 + A22 * R2 + A23 * R3;
665     S3 =A31 * R1 + A32 * R2 + A33 * R3;
666    
667     for (iptr_ik=A_p->pattern->ptr[i+1]-1; iptr_ik > A_p->pattern->ptr[i]-1; ++iptr_ik) {
668     k=A_p->pattern->index[iptr_ik];
669     if (k > i) {
670     R1=x[3*k];
671     R2=x[3*k+1];
672     R3=x[3*k+2];
673     S1-=A_p->val[9*iptr_ik ]*R1+A_p->val[9*iptr_ik+3]*R2+A_p->val[9*iptr_ik+6]*R3;
674     S2-=A_p->val[9*iptr_ik+1]*R1+A_p->val[9*iptr_ik+4]*R2+A_p->val[9*iptr_ik+7]*R3;
675     S3-=A_p->val[9*iptr_ik+2]*R1+A_p->val[9*iptr_ik+5]*R2+A_p->val[9*iptr_ik+8]*R3;
676     } else {
677     break ;
678     }
679     }
680    
681     A11=diag[i*9 ];
682     A21=diag[i*9+1];
683     A31=diag[i*9+2];
684     A12=diag[i*9+3];
685     A22=diag[i*9+4];
686     A32=diag[i*9+5];
687     A13=diag[i*9+6];
688     A23=diag[i*9+7];
689     A33=diag[i*9+8];
690    
691     x[3*i ]=A11 * S1 + A12 * S2 + A13 * S3;
692     x[3*i+1]=A21 * S1 + A22 * S2 + A23 * S3;
693     x[3*i+2]=A31 * S1 + A32 * S2 + A33 * S3;
694    
695     }
696    
697     } /* add block size >3 */
698    
699     return;
700     }
701    

  ViewVC Help
Powered by ViewVC 1.1.26