/[escript]/trunk/paso/src/SystemMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SystemMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1098 - (hide annotations)
Mon Apr 16 23:15:23 2007 UTC (12 years, 10 months ago) by gross
File MIME type: text/plain
File size: 14129 byte(s)
add a few #pragma ivdep which should speed up MV but cannot confiirm this on Pentium




1 jgs 150 /* $Id$ */
2    
3 dhawcroft 631
4     /*
5     ********************************************************************************
6 dhawcroft 633 * Copyright 2006 by ACcESS MNRF *
7 dhawcroft 631 * *
8     * http://www.access.edu.au *
9     * Primary Business: Queensland, Australia *
10     * Licensed under the Open Software License version 3.0 *
11     * http://www.opensource.org/licenses/osl-3.0.php *
12     ********************************************************************************
13     */
14    
15 jgs 150 /**************************************************************/
16    
17     /* Paso: matrix vector product with sparse matrix */
18    
19     /**************************************************************/
20    
21     /* Copyrights by ACcESS Australia 2003,2004,2005 */
22     /* Author: gross@access.edu.au */
23    
24     /**************************************************************/
25    
26     #include "Paso.h"
27     #include "SystemMatrix.h"
28    
29     /**************************************************************************/
30    
31     /* raw scaled vector update operation: out = alpha * A * in + beta * out */
32    
33     /* has to be called within a parallel region */
34     /* barrier synconization is performed to make sure that the input vector available */
35    
36     void Paso_SystemMatrix_MatrixVector(double alpha,
37     Paso_SystemMatrix* A,
38     double* in,
39     double beta,
40     double* out) {
41    
42 gross 415 if (A->type & MATRIX_FORMAT_CSC) {
43     if (A->type & MATRIX_FORMAT_OFFSET1) {
44     Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(alpha,A,in,beta,out);
45     } else {
46     Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(alpha,A,in,beta,out);
47     }
48     } else {
49     if (A->type & MATRIX_FORMAT_OFFSET1) {
50     Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(alpha,A,in,beta,out);
51     } else {
52     Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(alpha,A,in,beta,out);
53     }
54     }
55     return;
56     }
57    
58     void Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(double alpha,
59     Paso_SystemMatrix* A,
60     double* in,
61     double beta,
62     double* out) {
63    
64 jgs 150 register index_t ir,icol,iptr,icb,irb,irow,ic;
65     register double reg,reg1,reg2,reg3;
66     #pragma omp barrier
67    
68     if (ABS(beta)>0.) {
69     #pragma omp for private(irow) schedule(static)
70     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
71     out[irow] *= beta;
72     } else {
73     #pragma omp for private(irow) schedule(static)
74     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
75     out[irow] = 0;
76     }
77    
78     /* do the operation: */
79     if (ABS(alpha)>0) {
80     if (A ->col_block_size==1 && A->row_block_size ==1) {
81 gross 415 /* TODO: parallelize (good luck!) */
82     #pragma omp single
83     for (icol=0;icol< A->pattern->n_ptr;++icol) {
84 gross 1098 #pragma ivdep
85 gross 415 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
86     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
87 jgs 150 }
88     }
89 gross 415 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
90     /* TODO: parallelize */
91     #pragma omp single
92     for (ic=0;ic< A->pattern->n_ptr;ic++) {
93 gross 1098 #pragma ivdep
94 gross 415 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
95     ic=2*(A->pattern->index[iptr]);
96     out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
97     out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
98     }
99     }
100     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
101     /* TODO: parallelize */
102     #pragma omp single
103     for (ic=0;ic< A->pattern->n_ptr;ic++) {
104 gross 1098 #pragma ivdep
105 gross 415 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
106     ir=3*(A->pattern->index[iptr]);
107     out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
108     out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
109     out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
110     }
111     }
112     } else {
113     /* TODO: parallelize */
114     #pragma omp single
115     for (ic=0;ic< A->pattern->n_ptr;ic++) {
116     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
117     for (irb=0;irb< A->row_block_size;irb++) {
118     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
119 gross 1098 #pragma ivdep
120 gross 415 for (icb=0;icb< A->col_block_size;icb++) {
121     icol=icb+A->col_block_size*ic;
122     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
123     }
124     }
125     }
126     }
127     }
128     }
129     return;
130     }
131    
132     void Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(double alpha,
133     Paso_SystemMatrix* A,
134     double* in,
135     double beta,
136     double* out) {
137    
138     register index_t ir,icol,iptr,icb,irb,irow,ic;
139     register double reg,reg1,reg2,reg3;
140     #pragma omp barrier
141    
142     if (ABS(beta)>0.) {
143     #pragma omp for private(irow) schedule(static)
144     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
145     out[irow] *= beta;
146     } else {
147     #pragma omp for private(irow) schedule(static)
148     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
149     out[irow] = 0;
150     }
151    
152     /* do the operation: */
153     if (ABS(alpha)>0) {
154     if (A ->col_block_size==1 && A->row_block_size ==1) {
155 jgs 150 /* TODO: parallelize (good luck!) */
156     #pragma omp single
157     for (icol=0;icol< A->pattern->n_ptr;++icol) {
158 gross 1098 #pragma ivdep
159 gross 415 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
160     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
161 jgs 150 }
162     }
163     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
164 gross 415 /* TODO: parallelize */
165     #pragma omp single
166     for (ic=0;ic< A->pattern->n_ptr;ic++) {
167     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
168     ic=2*(A->pattern->index[iptr]-1);
169     out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
170     out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
171     }
172     }
173     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
174     /* TODO: parallelize */
175     #pragma omp single
176     for (ic=0;ic< A->pattern->n_ptr;ic++) {
177 gross 1098 #pragma ivdep
178 gross 415 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
179     ir=3*(A->pattern->index[iptr]-1);
180     out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
181     out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
182     out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
183     }
184     }
185     } else {
186     /* TODO: parallelize */
187     #pragma omp single
188     for (ic=0;ic< A->pattern->n_ptr;ic++) {
189     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
190     for (irb=0;irb< A->row_block_size;irb++) {
191     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
192 gross 1098 #pragma ivdep
193 gross 415 for (icb=0;icb< A->col_block_size;icb++) {
194     icol=icb+A->col_block_size*ic;
195     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
196     }
197     }
198     }
199     }
200     }
201     }
202     return;
203     }
204    
205     void Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(double alpha,
206     Paso_SystemMatrix* A,
207     double* in,
208     double beta,
209     double* out) {
210    
211 gross 495 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
212     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
213 gross 415 #pragma omp barrier
214     if (ABS(beta)>0.) {
215     #pragma omp for private(irow) schedule(static)
216     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
217     out[irow] *= beta;
218     } else {
219     #pragma omp for private(irow) schedule(static)
220     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
221     out[irow] = 0;
222     }
223     /* do the operation: */
224     if (ABS(alpha)>0) {
225     if (A ->col_block_size==1 && A->row_block_size ==1) {
226     #pragma omp for private(irow,iptr,reg) schedule(static)
227     for (irow=0;irow< A->pattern->n_ptr;++irow) {
228     reg=0.;
229 gross 1098 #pragma ivdep
230 gross 415 for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) {
231     reg += A->val[iptr] * in[A->pattern->index[iptr]];
232     }
233     out[irow] += alpha * reg;
234     }
235     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
236 gross 512 #pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static)
237 jgs 150 for (ir=0;ir< A->pattern->n_ptr;ir++) {
238     reg1=0.;
239     reg2=0.;
240 gross 415 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
241     ic=2*(A->pattern->index[iptr]);
242 gross 495 Aiptr=iptr*4;
243     in1=in[ic];
244     in2=in[1+ic];
245     A00=A->val[Aiptr ];
246     A10=A->val[Aiptr+1];
247     A01=A->val[Aiptr+2];
248     A11=A->val[Aiptr+3];
249     reg1 += A00*in1 + A01*in2;
250     reg2 += A10*in1 + A11*in2;
251 jgs 150 }
252     out[ 2*ir] += alpha * reg1;
253     out[1+2*ir] += alpha * reg2;
254     }
255     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
256 gross 512 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static)
257 jgs 150 for (ir=0;ir< A->pattern->n_ptr;ir++) {
258     reg1=0.;
259     reg2=0.;
260     reg3=0.;
261 gross 1098 #pragma ivdep
262 gross 415 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
263     ic=3*(A->pattern->index[iptr]);
264 gross 495 Aiptr=iptr*9;
265     in1=in[ic];
266     in2=in[1+ic];
267     in3=in[2+ic];
268     A00=A->val[Aiptr ];
269     A10=A->val[Aiptr+1];
270     A20=A->val[Aiptr+2];
271     A01=A->val[Aiptr+3];
272     A11=A->val[Aiptr+4];
273     A21=A->val[Aiptr+5];
274     A02=A->val[Aiptr+6];
275     A12=A->val[Aiptr+7];
276     A22=A->val[Aiptr+8];
277     reg1 += A00*in1 + A01*in2 + A02*in3;
278     reg2 += A10*in1 + A11*in2 + A12*in3;
279     reg3 += A20*in1 + A21*in2 + A22*in3;
280 jgs 150 }
281     out[ 3*ir] += alpha * reg1;
282     out[1+3*ir] += alpha * reg2;
283     out[2+3*ir] += alpha * reg3;
284     }
285     } else {
286     #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
287     for (ir=0;ir< A->pattern->n_ptr;ir++) {
288 gross 415 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
289 jgs 150 for (irb=0;irb< A->row_block_size;irb++) {
290     irow=irb+A->row_block_size*ir;
291     reg=0.;
292 gross 1098 #pragma ivdep
293 jgs 150 for (icb=0;icb< A->col_block_size;icb++) {
294 gross 415 icol=icb+A->col_block_size*(A->pattern->index[iptr]);
295 jgs 150 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
296     }
297     out[irow] += alpha * reg;
298     }
299     }
300     }
301 gross 415 }
302     }
303     return;
304     }
305    
306     void Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(double alpha,
307     Paso_SystemMatrix* A,
308     double* in,
309     double beta,
310     double* out) {
311    
312     register index_t ir,icol,iptr,icb,irb,irow,ic;
313     register double reg,reg1,reg2,reg3;
314     #pragma omp barrier
315    
316     if (ABS(beta)>0.) {
317     #pragma omp for private(irow) schedule(static)
318     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
319     out[irow] *= beta;
320     } else {
321     #pragma omp for private(irow) schedule(static)
322     for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
323     out[irow] = 0;
324     }
325     /* do the operation: */
326     if (ABS(alpha)>0) {
327     if (A ->col_block_size==1 && A->row_block_size ==1) {
328     #pragma omp for private(irow,iptr,reg) schedule(static)
329     for (irow=0;irow< A->pattern->n_ptr;++irow) {
330     reg=0.;
331 gross 1098 #pragma ivdep
332 gross 415 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
333     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
334     }
335     out[irow] += alpha * reg;
336     }
337     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
338 gross 512 #pragma omp for private(ir,reg1,reg2,iptr,ic) schedule(static)
339 gross 415 for (ir=0;ir< A->pattern->n_ptr;ir++) {
340     reg1=0.;
341     reg2=0.;
342 gross 1098 #pragma ivdep
343 gross 415 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
344     ic=2*(A->pattern->index[iptr]-1);
345     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
346     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
347     }
348     out[ 2*ir] += alpha * reg1;
349     out[1+2*ir] += alpha * reg2;
350     }
351     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
352 gross 512 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
353 gross 415 for (ir=0;ir< A->pattern->n_ptr;ir++) {
354     reg1=0.;
355     reg2=0.;
356     reg3=0.;
357 gross 1098 #pragma ivdep
358 gross 415 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
359     ic=3*(A->pattern->index[iptr]-1);
360     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
361     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
362     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
363     }
364     out[ 3*ir] += alpha * reg1;
365     out[1+3*ir] += alpha * reg2;
366     out[2+3*ir] += alpha * reg3;
367     }
368     } else {
369     #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
370     for (ir=0;ir< A->pattern->n_ptr;ir++) {
371     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
372 jgs 150 for (irb=0;irb< A->row_block_size;irb++) {
373 gross 415 irow=irb+A->row_block_size*ir;
374     reg=0.;
375 gross 1098 #pragma ivdep
376 jgs 150 for (icb=0;icb< A->col_block_size;icb++) {
377 gross 415 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
378     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
379 jgs 150 }
380 gross 415 out[irow] += alpha * reg;
381 jgs 150 }
382     }
383     }
384     }
385     }
386     return;
387     }
388     /*
389     * $Log$
390     * Revision 1.2 2005/09/15 03:44:39 jgs
391     * Merge of development branch dev-02 back to main trunk on 2005-09-15
392     *
393     * Revision 1.1.2.1 2005/09/05 06:29:47 gross
394     * These files have been extracted from finley to define a stand alone libray for iterative
395     * linear solvers on the ALTIX. main entry through Paso_solve. this version compiles but
396     * has not been tested yet.
397     *
398     *
399     */

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26