/[escript]/temp_trunk_copy/paso/src/SystemMatrix_MatrixVector.c
ViewVC logotype

Annotation of /temp_trunk_copy/paso/src/SystemMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 512 - (hide annotations)
Fri Feb 10 07:04:14 2006 UTC (14 years ago) by gross
Original Path: trunk/paso/src/SystemMatrix_MatrixVector.c
File MIME type: text/plain
File size: 13131 byte(s)
bug in parallelization fixed
1 jgs 150 /* $Id$ */
2    
3     /**************************************************************/
4    
5     /* Paso: matrix vector product with sparse matrix */
6    
7     /**************************************************************/
8    
9     /* Copyrights by ACcESS Australia 2003,2004,2005 */
10     /* Author: gross@access.edu.au */
11    
12     /**************************************************************/
13    
14     #include "Paso.h"
15     #include "SystemMatrix.h"
16    
17     /**************************************************************************/
18    
19     /* raw scaled vector update operation: out = alpha * A * in + beta * out */
20    
21     /* has to be called within a parallel region */
22     /* barrier synconization is performed to make sure that the input vector available */
23    
24     void Paso_SystemMatrix_MatrixVector(double alpha,
25     Paso_SystemMatrix* A,
26     double* in,
27     double beta,
28     double* out) {
29    
30 gross 415 if (A->type & MATRIX_FORMAT_CSC) {
31     if (A->type & MATRIX_FORMAT_OFFSET1) {
32     Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(alpha,A,in,beta,out);
33     } else {
34     Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(alpha,A,in,beta,out);
35     }
36     } else {
37     if (A->type & MATRIX_FORMAT_OFFSET1) {
38     Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(alpha,A,in,beta,out);
39     } else {
40     Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(alpha,A,in,beta,out);
41     }
42     }
43     return;
44     }
45    
46     void Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(double alpha,
47     Paso_SystemMatrix* A,
48     double* in,
49     double beta,
50     double* out) {
51    
52 jgs 150 register index_t ir,icol,iptr,icb,irb,irow,ic;
53     register double reg,reg1,reg2,reg3;
54     #pragma omp barrier
55    
56     if (ABS(beta)>0.) {
57     #pragma omp for private(irow) schedule(static)
58     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
59     out[irow] *= beta;
60     } else {
61     #pragma omp for private(irow) schedule(static)
62     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
63     out[irow] = 0;
64     }
65    
66     /* do the operation: */
67     if (ABS(alpha)>0) {
68     if (A ->col_block_size==1 && A->row_block_size ==1) {
69 gross 415 /* TODO: parallelize (good luck!) */
70     #pragma omp single
71     for (icol=0;icol< A->pattern->n_ptr;++icol) {
72     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
73     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
74 jgs 150 }
75     }
76 gross 415 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
77     /* TODO: parallelize */
78     #pragma omp single
79     for (ic=0;ic< A->pattern->n_ptr;ic++) {
80     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
81     ic=2*(A->pattern->index[iptr]);
82     out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
83     out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
84     }
85     }
86     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
87     /* TODO: parallelize */
88     #pragma omp single
89     for (ic=0;ic< A->pattern->n_ptr;ic++) {
90     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
91     ir=3*(A->pattern->index[iptr]);
92     out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
93     out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
94     out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
95     }
96     }
97     } else {
98     /* TODO: parallelize */
99     #pragma omp single
100     for (ic=0;ic< A->pattern->n_ptr;ic++) {
101     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
102     for (irb=0;irb< A->row_block_size;irb++) {
103     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
104     for (icb=0;icb< A->col_block_size;icb++) {
105     icol=icb+A->col_block_size*ic;
106     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
107     }
108     }
109     }
110     }
111     }
112     }
113     return;
114     }
115    
116     void Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(double alpha,
117     Paso_SystemMatrix* A,
118     double* in,
119     double beta,
120     double* out) {
121    
122     register index_t ir,icol,iptr,icb,irb,irow,ic;
123     register double reg,reg1,reg2,reg3;
124     #pragma omp barrier
125    
126     if (ABS(beta)>0.) {
127     #pragma omp for private(irow) schedule(static)
128     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
129     out[irow] *= beta;
130     } else {
131     #pragma omp for private(irow) schedule(static)
132     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
133     out[irow] = 0;
134     }
135    
136     /* do the operation: */
137     if (ABS(alpha)>0) {
138     if (A ->col_block_size==1 && A->row_block_size ==1) {
139 jgs 150 /* TODO: parallelize (good luck!) */
140     #pragma omp single
141     for (icol=0;icol< A->pattern->n_ptr;++icol) {
142 gross 415 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
143     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
144 jgs 150 }
145     }
146     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
147 gross 415 /* TODO: parallelize */
148     #pragma omp single
149     for (ic=0;ic< A->pattern->n_ptr;ic++) {
150     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
151     ic=2*(A->pattern->index[iptr]-1);
152     out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
153     out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
154     }
155     }
156     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
157     /* TODO: parallelize */
158     #pragma omp single
159     for (ic=0;ic< A->pattern->n_ptr;ic++) {
160     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
161     ir=3*(A->pattern->index[iptr]-1);
162     out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
163     out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
164     out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
165     }
166     }
167     } else {
168     /* TODO: parallelize */
169     #pragma omp single
170     for (ic=0;ic< A->pattern->n_ptr;ic++) {
171     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
172     for (irb=0;irb< A->row_block_size;irb++) {
173     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
174     for (icb=0;icb< A->col_block_size;icb++) {
175     icol=icb+A->col_block_size*ic;
176     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
177     }
178     }
179     }
180     }
181     }
182     }
183     return;
184     }
185    
186     void Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(double alpha,
187     Paso_SystemMatrix* A,
188     double* in,
189     double beta,
190     double* out) {
191    
192 gross 495 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
193     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
194 gross 415 #pragma omp barrier
195     if (ABS(beta)>0.) {
196     #pragma omp for private(irow) schedule(static)
197     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
198     out[irow] *= beta;
199     } else {
200     #pragma omp for private(irow) schedule(static)
201     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
202     out[irow] = 0;
203     }
204     /* do the operation: */
205     if (ABS(alpha)>0) {
206     if (A ->col_block_size==1 && A->row_block_size ==1) {
207     #pragma omp for private(irow,iptr,reg) schedule(static)
208     for (irow=0;irow< A->pattern->n_ptr;++irow) {
209     reg=0.;
210     for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) {
211     reg += A->val[iptr] * in[A->pattern->index[iptr]];
212     }
213     out[irow] += alpha * reg;
214     }
215     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
216 gross 512 #pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static)
217 jgs 150 for (ir=0;ir< A->pattern->n_ptr;ir++) {
218     reg1=0.;
219     reg2=0.;
220 gross 415 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
221     ic=2*(A->pattern->index[iptr]);
222 gross 495 Aiptr=iptr*4;
223     in1=in[ic];
224     in2=in[1+ic];
225     A00=A->val[Aiptr ];
226     A10=A->val[Aiptr+1];
227     A01=A->val[Aiptr+2];
228     A11=A->val[Aiptr+3];
229     reg1 += A00*in1 + A01*in2;
230     reg2 += A10*in1 + A11*in2;
231 jgs 150 }
232     out[ 2*ir] += alpha * reg1;
233     out[1+2*ir] += alpha * reg2;
234     }
235     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
236 gross 512 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static)
237 jgs 150 for (ir=0;ir< A->pattern->n_ptr;ir++) {
238     reg1=0.;
239     reg2=0.;
240     reg3=0.;
241 gross 415 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
242     ic=3*(A->pattern->index[iptr]);
243 gross 495 Aiptr=iptr*9;
244     in1=in[ic];
245     in2=in[1+ic];
246     in3=in[2+ic];
247     A00=A->val[Aiptr ];
248     A10=A->val[Aiptr+1];
249     A20=A->val[Aiptr+2];
250     A01=A->val[Aiptr+3];
251     A11=A->val[Aiptr+4];
252     A21=A->val[Aiptr+5];
253     A02=A->val[Aiptr+6];
254     A12=A->val[Aiptr+7];
255     A22=A->val[Aiptr+8];
256     reg1 += A00*in1 + A01*in2 + A02*in3;
257     reg2 += A10*in1 + A11*in2 + A12*in3;
258     reg3 += A20*in1 + A21*in2 + A22*in3;
259 jgs 150 }
260     out[ 3*ir] += alpha * reg1;
261     out[1+3*ir] += alpha * reg2;
262     out[2+3*ir] += alpha * reg3;
263     }
264     } else {
265     #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
266     for (ir=0;ir< A->pattern->n_ptr;ir++) {
267 gross 415 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
268 jgs 150 for (irb=0;irb< A->row_block_size;irb++) {
269     irow=irb+A->row_block_size*ir;
270     reg=0.;
271     for (icb=0;icb< A->col_block_size;icb++) {
272 gross 415 icol=icb+A->col_block_size*(A->pattern->index[iptr]);
273 jgs 150 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
274     }
275     out[irow] += alpha * reg;
276     }
277     }
278     }
279 gross 415 }
280     }
281     return;
282     }
283    
284     void Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(double alpha,
285     Paso_SystemMatrix* A,
286     double* in,
287     double beta,
288     double* out) {
289    
290     register index_t ir,icol,iptr,icb,irb,irow,ic;
291     register double reg,reg1,reg2,reg3;
292     #pragma omp barrier
293    
294     if (ABS(beta)>0.) {
295     #pragma omp for private(irow) schedule(static)
296     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
297     out[irow] *= beta;
298     } else {
299     #pragma omp for private(irow) schedule(static)
300     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
301     out[irow] = 0;
302     }
303     /* do the operation: */
304     if (ABS(alpha)>0) {
305     if (A ->col_block_size==1 && A->row_block_size ==1) {
306     #pragma omp for private(irow,iptr,reg) schedule(static)
307     for (irow=0;irow< A->pattern->n_ptr;++irow) {
308     reg=0.;
309     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
310     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
311     }
312     out[irow] += alpha * reg;
313     }
314     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
315 gross 512 #pragma omp for private(ir,reg1,reg2,iptr,ic) schedule(static)
316 gross 415 for (ir=0;ir< A->pattern->n_ptr;ir++) {
317     reg1=0.;
318     reg2=0.;
319     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
320     ic=2*(A->pattern->index[iptr]-1);
321     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
322     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
323     }
324     out[ 2*ir] += alpha * reg1;
325     out[1+2*ir] += alpha * reg2;
326     }
327     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
328 gross 512 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
329 gross 415 for (ir=0;ir< A->pattern->n_ptr;ir++) {
330     reg1=0.;
331     reg2=0.;
332     reg3=0.;
333     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
334     ic=3*(A->pattern->index[iptr]-1);
335     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
336     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
337     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
338     }
339     out[ 3*ir] += alpha * reg1;
340     out[1+3*ir] += alpha * reg2;
341     out[2+3*ir] += alpha * reg3;
342     }
343     } else {
344     #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
345     for (ir=0;ir< A->pattern->n_ptr;ir++) {
346     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
347 jgs 150 for (irb=0;irb< A->row_block_size;irb++) {
348 gross 415 irow=irb+A->row_block_size*ir;
349     reg=0.;
350 jgs 150 for (icb=0;icb< A->col_block_size;icb++) {
351 gross 415 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
352     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
353 jgs 150 }
354 gross 415 out[irow] += alpha * reg;
355 jgs 150 }
356     }
357     }
358     }
359     }
360     return;
361     }
362     /*
363     * $Log$
364     * Revision 1.2 2005/09/15 03:44:39 jgs
365     * Merge of development branch dev-02 back to main trunk on 2005-09-15
366     *
367     * Revision 1.1.2.1 2005/09/05 06:29:47 gross
368     * These files have been extracted from finley to define a stand alone libray for iterative
369     * linear solvers on the ALTIX. main entry through Paso_solve. this version compiles but
370     * has not been tested yet.
371     *
372     *
373     */

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26