/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1556 - (hide annotations)
Mon May 12 00:54:58 2008 UTC (12 years, 5 months ago) by gross
File MIME type: text/plain
File size: 13501 byte(s)
Modification to allow mixed mode execution. 
In order to keep the code portable accross platform all MPI calls within
parallel regions have been moved. 


1 ksteube 1315
2     /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3    
4     /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15    
16     /**************************************************************/
17    
18     /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19    
20     /**************************************************************/
21    
22     /* Author: gross@access.edu.au */
23    
24     /**************************************************************/
25    
26     #include "SparseMatrix.h"
27    
28     /* CSC format with offset 0*/
29     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha,
30     Paso_SparseMatrix* A,
31     double* in,
32     double beta,
33     double* out) {
34    
35     register index_t ir,icol,iptr,icb,irb,irow,ic;
36     register double reg,reg1,reg2,reg3;
37    
38     if (ABS(beta)>0.) {
39     if (beta != 1.) {
40 gross 1556 #pragma omp parallel for private(irow) schedule(static)
41 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
42     out[irow] *= beta;
43     }
44     } else {
45 gross 1556 #pragma omp parallel for private(irow) schedule(static)
46 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
47     out[irow] = 0;
48     }
49     if (Paso_Pattern_isEmpty(A->pattern)) return;
50     /* do the operation: */
51     if (ABS(alpha)>0) {
52     if (A ->col_block_size==1 && A->row_block_size ==1) {
53     /* TODO: parallelize (good luck!) */
54     for (icol=0;icol< A->pattern->numOutput;++icol) {
55     #pragma ivdep
56     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
57     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
58     }
59     }
60     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
61     /* TODO: parallelize */
62     for (ic=0;ic< A->pattern->numOutput;ic++) {
63     #pragma ivdep
64     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
65 gross 1511 ir=2*(A->pattern->index[iptr]);
66     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
67     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
68 ksteube 1315 }
69     }
70     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
71     /* TODO: parallelize */
72     for (ic=0;ic< A->pattern->numOutput;ic++) {
73     #pragma ivdep
74     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
75     ir=3*(A->pattern->index[iptr]);
76 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
77     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
78     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
79 ksteube 1315 }
80     }
81     } else {
82     /* TODO: parallelize */
83     for (ic=0;ic< A->pattern->numOutput;ic++) {
84     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
85     for (irb=0;irb< A->row_block_size;irb++) {
86     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
87     #pragma ivdep
88     for (icb=0;icb< A->col_block_size;icb++) {
89     icol=icb+A->col_block_size*ic;
90     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
91     }
92     }
93     }
94     }
95     }
96     }
97     return;
98     }
99    
100     /* CSC format with offset 1*/
101     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha,
102     Paso_SparseMatrix* A,
103     double* in,
104     double beta,
105     double* out) {
106    
107     register index_t ir,icol,iptr,icb,irb,irow,ic;
108     register double reg,reg1,reg2,reg3;
109     if (ABS(beta)>0.) {
110     if (beta != 1.) {
111 gross 1556 #pragma omp parallel for private(irow) schedule(static)
112 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
113     out[irow] *= beta;
114     }
115     }
116     } else {
117 gross 1556 #pragma omp parallel for private(irow) schedule(static)
118 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
119     out[irow] = 0;
120     }
121    
122     /* do the operation: */
123     if (ABS(alpha)>0) {
124     if (A ->col_block_size==1 && A->row_block_size ==1) {
125     /* TODO: parallelize (good luck!) */
126     for (icol=0;icol< A->pattern->numOutput;++icol) {
127     #pragma ivdep
128     for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
129     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
130     }
131     }
132     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
133     /* TODO: parallelize */
134     for (ic=0;ic< A->pattern->numOutput;ic++) {
135     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
136 gross 1511 ir=2*(A->pattern->index[iptr]-1);
137     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
138     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
139 ksteube 1315 }
140     }
141     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
142     /* TODO: parallelize */
143     for (ic=0;ic< A->pattern->numOutput;ic++) {
144     #pragma ivdep
145     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
146     ir=3*(A->pattern->index[iptr]-1);
147 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
148     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
149     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
150 ksteube 1315 }
151     }
152     } else {
153     /* TODO: parallelize */
154     for (ic=0;ic< A->pattern->numOutput;ic++) {
155     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
156     for (irb=0;irb< A->row_block_size;irb++) {
157     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
158     #pragma ivdep
159     for (icb=0;icb< A->col_block_size;icb++) {
160     icol=icb+A->col_block_size*ic;
161     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
162     }
163     }
164     }
165     }
166     }
167     }
168     return;
169     }
170     /* CSR format with offset 0*/
171     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha,
172     Paso_SparseMatrix* A,
173     double* in,
174     double beta,
175     double* out)
176     {
177     register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
178     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
179     if (ABS(beta)>0.) {
180     if (beta != 1.) {
181 gross 1556 #pragma omp parallel for private(irow) schedule(static)
182 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
183     out[irow] *= beta;
184     }
185     } else {
186 gross 1556 #pragma omp parallel for private(irow) schedule(static)
187 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
188     out[irow] = 0;
189     }
190     if (ABS(alpha)>0) {
191     if (A ->col_block_size==1 && A->row_block_size ==1) {
192 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
193 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
194     reg=0.;
195     #pragma ivdep
196     for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) {
197     reg += A->val[iptr] * in[A->pattern->index[iptr]];
198     }
199     out[irow] += alpha * reg;
200     }
201     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
202 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static)
203 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
204 gross 1556 reg1=0.;
205     reg2=0.;
206     #pragma ivdep
207 ksteube 1315 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
208     ic=2*(A->pattern->index[iptr]);
209     Aiptr=iptr*4;
210     in1=in[ic];
211     in2=in[1+ic];
212     A00=A->val[Aiptr ];
213     A10=A->val[Aiptr+1];
214     A01=A->val[Aiptr+2];
215     A11=A->val[Aiptr+3];
216     reg1 += A00*in1 + A01*in2;
217     reg2 += A10*in1 + A11*in2;
218     }
219     out[ 2*ir] += alpha * reg1;
220     out[1+2*ir] += alpha * reg2;
221     }
222     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
223 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static)
224 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
225 gross 1556 reg1=0.;
226     reg2=0.;
227     reg3=0.;
228     #pragma ivdep
229 ksteube 1315 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
230     ic=3*(A->pattern->index[iptr]);
231     Aiptr=iptr*9;
232     in1=in[ic];
233     in2=in[1+ic];
234     in3=in[2+ic];
235     A00=A->val[Aiptr ];
236     A10=A->val[Aiptr+1];
237     A20=A->val[Aiptr+2];
238     A01=A->val[Aiptr+3];
239     A11=A->val[Aiptr+4];
240     A21=A->val[Aiptr+5];
241     A02=A->val[Aiptr+6];
242     A12=A->val[Aiptr+7];
243     A22=A->val[Aiptr+8];
244     reg1 += A00*in1 + A01*in2 + A02*in3;
245     reg2 += A10*in1 + A11*in2 + A12*in3;
246     reg3 += A20*in1 + A21*in2 + A22*in3;
247     }
248     out[ 3*ir] += alpha * reg1;
249     out[1+3*ir] += alpha * reg2;
250     out[2+3*ir] += alpha * reg3;
251     }
252     } else {
253 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
254 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
255     for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
256     for (irb=0;irb< A->row_block_size;irb++) {
257     irow=irb+A->row_block_size*ir;
258 gross 1556 reg=0.;
259     #pragma ivdep
260 ksteube 1315 for (icb=0;icb< A->col_block_size;icb++) {
261     icol=icb+A->col_block_size*(A->pattern->index[iptr]);
262     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
263     }
264     out[irow] += alpha * reg;
265     }
266     }
267     }
268     }
269     }
270     return;
271     }
272     /* CSR format with offset 1*/
273     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha,
274     Paso_SparseMatrix* A,
275     double* in,
276     double beta,
277     double* out) {
278    
279     register index_t ir,icol,iptr,icb,irb,irow,ic;
280     register double reg,reg1,reg2,reg3;
281     if (ABS(beta)>0.) {
282     if (beta != 1.) {
283 gross 1556 #pragma omp parallel for private(irow) schedule(static)
284 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
285     out[irow] *= beta;
286     }
287     } else {
288 gross 1556 #pragma omp parallel for private(irow) schedule(static)
289 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
290     out[irow] = 0;
291     }
292     /* do the operation: */
293     if (ABS(alpha)>0) {
294     if (A ->col_block_size==1 && A->row_block_size ==1) {
295 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
296 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
297     reg=0.;
298     #pragma ivdep
299     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
300     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
301     }
302     out[irow] += alpha * reg;
303     }
304     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
305 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
306 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
307     reg1=0.;
308     reg2=0.;
309     #pragma ivdep
310     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
311     ic=2*(A->pattern->index[iptr]-1);
312     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
313     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
314     }
315     out[ 2*ir] += alpha * reg1;
316     out[1+2*ir] += alpha * reg2;
317     }
318     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
319 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
320 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
321     reg1=0.;
322     reg2=0.;
323     reg3=0.;
324     #pragma ivdep
325     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
326     ic=3*(A->pattern->index[iptr]-1);
327     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
328     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
329     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
330     }
331     out[ 3*ir] += alpha * reg1;
332     out[1+3*ir] += alpha * reg2;
333     out[2+3*ir] += alpha * reg3;
334     }
335     } else {
336 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
337 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
338     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
339     for (irb=0;irb< A->row_block_size;irb++) {
340     irow=irb+A->row_block_size*ir;
341     reg=0.;
342     #pragma ivdep
343     for (icb=0;icb< A->col_block_size;icb++) {
344     icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
345     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
346     }
347     out[irow] += alpha * reg;
348     }
349     }
350     }
351     }
352     }
353     return;
354     }

  ViewVC Help
Powered by ViewVC 1.1.26