/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1642 - (hide annotations)
Tue Jul 15 05:21:57 2008 UTC (11 years, 4 months ago) by phornby
File MIME type: text/plain
File size: 15315 byte(s)
make the 1st arg of the ...._call(const double alpha,....
consistent between the .h and .c file. The const is not really needed.
1 ksteube 1315
2     /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3    
4     /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15    
16     /**************************************************************/
17    
18     /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19    
20     /**************************************************************/
21    
22     /* Author: gross@access.edu.au */
23    
24     /**************************************************************/
25    
26     #include "SparseMatrix.h"
27 ksteube 1637 #ifdef _OPENMP
28     #include <omp.h>
29     #endif
30 ksteube 1315
31     /* CSC format with offset 0*/
32 gross 1639 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(const double alpha,
33     const Paso_SparseMatrix* A,
34     const double* in,
35     const double beta,
36 ksteube 1315 double* out) {
37    
38     register index_t ir,icol,iptr,icb,irb,irow,ic;
39    
40     if (ABS(beta)>0.) {
41     if (beta != 1.) {
42 gross 1556 #pragma omp parallel for private(irow) schedule(static)
43 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
44     out[irow] *= beta;
45     }
46     } else {
47 gross 1556 #pragma omp parallel for private(irow) schedule(static)
48 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
49     out[irow] = 0;
50     }
51     if (Paso_Pattern_isEmpty(A->pattern)) return;
52     /* do the operation: */
53     if (ABS(alpha)>0) {
54     if (A ->col_block_size==1 && A->row_block_size ==1) {
55     /* TODO: parallelize (good luck!) */
56     for (icol=0;icol< A->pattern->numOutput;++icol) {
57     #pragma ivdep
58     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
59     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
60     }
61     }
62     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
63     /* TODO: parallelize */
64     for (ic=0;ic< A->pattern->numOutput;ic++) {
65     #pragma ivdep
66     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
67 gross 1511 ir=2*(A->pattern->index[iptr]);
68     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
69     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
70 ksteube 1315 }
71     }
72     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
73     /* TODO: parallelize */
74     for (ic=0;ic< A->pattern->numOutput;ic++) {
75     #pragma ivdep
76     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
77     ir=3*(A->pattern->index[iptr]);
78 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
79     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
80     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
81 ksteube 1315 }
82     }
83     } else {
84     /* TODO: parallelize */
85     for (ic=0;ic< A->pattern->numOutput;ic++) {
86     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
87     for (irb=0;irb< A->row_block_size;irb++) {
88     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
89     #pragma ivdep
90     for (icb=0;icb< A->col_block_size;icb++) {
91     icol=icb+A->col_block_size*ic;
92     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
93     }
94     }
95     }
96     }
97     }
98     }
99     return;
100     }
101    
102     /* CSC format with offset 1*/
103 gross 1639 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(const double alpha,
104     const Paso_SparseMatrix* A,
105     const double* in,
106     const double beta,
107 ksteube 1315 double* out) {
108    
109     register index_t ir,icol,iptr,icb,irb,irow,ic;
110 phornby 1628
111 ksteube 1315 if (ABS(beta)>0.) {
112     if (beta != 1.) {
113 gross 1556 #pragma omp parallel for private(irow) schedule(static)
114 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
115     out[irow] *= beta;
116     }
117     }
118     } else {
119 gross 1556 #pragma omp parallel for private(irow) schedule(static)
120 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
121     out[irow] = 0;
122     }
123    
124     /* do the operation: */
125     if (ABS(alpha)>0) {
126     if (A ->col_block_size==1 && A->row_block_size ==1) {
127     /* TODO: parallelize (good luck!) */
128     for (icol=0;icol< A->pattern->numOutput;++icol) {
129     #pragma ivdep
130     for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
131     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
132     }
133     }
134     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
135     /* TODO: parallelize */
136     for (ic=0;ic< A->pattern->numOutput;ic++) {
137     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
138 gross 1511 ir=2*(A->pattern->index[iptr]-1);
139     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
140     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
141 ksteube 1315 }
142     }
143     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
144     /* TODO: parallelize */
145     for (ic=0;ic< A->pattern->numOutput;ic++) {
146     #pragma ivdep
147     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
148     ir=3*(A->pattern->index[iptr]-1);
149 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
150     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
151     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
152 ksteube 1315 }
153     }
154     } else {
155     /* TODO: parallelize */
156     for (ic=0;ic< A->pattern->numOutput;ic++) {
157     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
158     for (irb=0;irb< A->row_block_size;irb++) {
159     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
160     #pragma ivdep
161     for (icb=0;icb< A->col_block_size;icb++) {
162     icol=icb+A->col_block_size*ic;
163     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
164     }
165     }
166     }
167     }
168     }
169     }
170     return;
171     }
172     /* CSR format with offset 1*/
173 gross 1639 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(const double alpha,
174     const Paso_SparseMatrix* A,
175     const double* in,
176     const double beta,
177 ksteube 1315 double* out) {
178    
179     register index_t ir,icol,iptr,icb,irb,irow,ic;
180     register double reg,reg1,reg2,reg3;
181     if (ABS(beta)>0.) {
182     if (beta != 1.) {
183 gross 1556 #pragma omp parallel for private(irow) schedule(static)
184 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
185     out[irow] *= beta;
186     }
187     } else {
188 gross 1556 #pragma omp parallel for private(irow) schedule(static)
189 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
190     out[irow] = 0;
191     }
192     /* do the operation: */
193     if (ABS(alpha)>0) {
194     if (A ->col_block_size==1 && A->row_block_size ==1) {
195 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
196 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
197     reg=0.;
198     #pragma ivdep
199     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
200     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
201     }
202     out[irow] += alpha * reg;
203     }
204     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
205 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
206 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
207     reg1=0.;
208     reg2=0.;
209     #pragma ivdep
210     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
211     ic=2*(A->pattern->index[iptr]-1);
212     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
213     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
214     }
215     out[ 2*ir] += alpha * reg1;
216     out[1+2*ir] += alpha * reg2;
217     }
218     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
219 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
220 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
221     reg1=0.;
222     reg2=0.;
223     reg3=0.;
224     #pragma ivdep
225     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
226     ic=3*(A->pattern->index[iptr]-1);
227     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
228     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
229     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
230     }
231     out[ 3*ir] += alpha * reg1;
232     out[1+3*ir] += alpha * reg2;
233     out[2+3*ir] += alpha * reg3;
234     }
235     } else {
236 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
237 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
238     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
239     for (irb=0;irb< A->row_block_size;irb++) {
240     irow=irb+A->row_block_size*ir;
241     reg=0.;
242     #pragma ivdep
243     for (icb=0;icb< A->col_block_size;icb++) {
244     icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
245     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
246     }
247     out[irow] += alpha * reg;
248     }
249     }
250     }
251     }
252     }
253     return;
254     }
255 gross 1564 /* CSR format with offset 0*/
256 gross 1639 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(const double alpha,
257     const Paso_SparseMatrix* A,
258     const double* in,
259     const double beta,
260 gross 1564 double* out)
261     {
262 gross 1570 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
263 gross 1564
264 ksteube 1637 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
265 gross 1570 #define USE_DYNAMIC_SCHEDULING
266     #endif
267    
268 gross 1564 char* chksz_chr=NULL;
269 gross 1570 dim_t chunk_size=1;
270 gross 1564 dim_t nrow=A->numRows;
271 phornby 1642 dim_t np=1, len, rest, irow, local_n, p;
272    
273     #ifdef USE_DYNAMIC_SCHEDULING
274     dim_t n_chunks;
275     #endif
276    
277 ksteube 1637 #ifdef _OPENMP
278 gross 1564 np=omp_get_max_threads();
279 ksteube 1637 #endif
280 gross 1570 #ifdef USE_DYNAMIC_SCHEDULING
281 gross 1564 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
282     if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
283 gross 1570 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
284     n_chunks=nrow/chunk_size;
285     if (n_chunks*chunk_size<nrow) n_chunks+=1;
286     #else
287     len=nrow/np;
288     rest=nrow-len*np;
289 gross 1564 #endif
290    
291 gross 1571 #pragma omp parallel private(irow, p, local_n)
292 gross 1570 {
293     #ifdef USE_DYNAMIC_SCHEDULING
294 gross 1571 #pragma omp for schedule(dynamic,1)
295 gross 1570 for (p=0; p<n_chunks;p++) {
296     irow=chunk_size*p;
297     local_n=MIN(chunk_size,nrow-chunk_size*p);
298     #else
299 gross 1571 #pragma omp for schedule(static)
300 gross 1570 for (p=0; p<np;p++) {
301     irow=len*p+MIN(p,rest);
302     local_n=len+(p<rest ? 1 :0 );
303     #endif
304     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
305     local_n,
306     A->row_block_size,
307     A->col_block_size,
308     &(A->pattern->ptr[irow]),
309     A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
310     #ifdef USE_DYNAMIC_SCHEDULING
311     }
312     #else
313     }
314     #endif
315 gross 1564 }
316     }
317     /* CSR format with offset 0*/
318 gross 1639 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(const double alpha,
319     const dim_t nRows,
320     const dim_t row_block_size,
321     const dim_t col_block_size,
322     const index_t* ptr,
323     const index_t* index,
324     const double* val,
325     const double* in,
326     const double beta,
327 gross 1564 double* out)
328     {
329     register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
330     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
331     dim_t block_size;
332     if (ABS(beta)>0.) {
333     if (beta != 1.) {
334     for (irow=0;irow < nRows * row_block_size;irow++)
335     out[irow] *= beta;
336     }
337     } else {
338     for (irow=0;irow < nRows * row_block_size;irow++)
339     out[irow] = 0;
340     }
341     if (ABS(alpha)>0) {
342     if (col_block_size==1 && row_block_size ==1) {
343     for (irow=0;irow< nRows;++irow) {
344     reg=0.;
345     #pragma ivdep
346     for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
347     reg += val[iptr] * in[index[iptr]];
348     }
349     out[irow] += alpha * reg;
350     }
351     } else if (col_block_size==2 && row_block_size ==2) {
352     for (ir=0;ir< nRows;ir++) {
353     reg1=0.;
354     reg2=0.;
355     #pragma ivdep
356     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
357     ic=2*index[iptr];
358     Aiptr=iptr*4;
359     in1=in[ic];
360     in2=in[1+ic];
361     A00=val[Aiptr ];
362     A10=val[Aiptr+1];
363     A01=val[Aiptr+2];
364     A11=val[Aiptr+3];
365     reg1 += A00*in1 + A01*in2;
366     reg2 += A10*in1 + A11*in2;
367     }
368     out[ 2*ir] += alpha * reg1;
369     out[1+2*ir] += alpha * reg2;
370     }
371     } else if (col_block_size==3 && row_block_size ==3) {
372     for (ir=0;ir< nRows;ir++) {
373     reg1=0.;
374     reg2=0.;
375     reg3=0.;
376     #pragma ivdep
377     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
378     ic=3*index[iptr];
379     Aiptr=iptr*9;
380     in1=in[ic];
381     in2=in[1+ic];
382     in3=in[2+ic];
383     A00=val[Aiptr ];
384     A10=val[Aiptr+1];
385     A20=val[Aiptr+2];
386     A01=val[Aiptr+3];
387     A11=val[Aiptr+4];
388     A21=val[Aiptr+5];
389     A02=val[Aiptr+6];
390     A12=val[Aiptr+7];
391     A22=val[Aiptr+8];
392     reg1 += A00*in1 + A01*in2 + A02*in3;
393     reg2 += A10*in1 + A11*in2 + A12*in3;
394     reg3 += A20*in1 + A21*in2 + A22*in3;
395     }
396     out[ 3*ir] += alpha * reg1;
397     out[1+3*ir] += alpha * reg2;
398     out[2+3*ir] += alpha * reg3;
399     }
400     } else {
401     block_size=col_block_size*row_block_size;
402     for (ir=0;ir< nRows;ir++) {
403     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
404     for (irb=0;irb< row_block_size;irb++) {
405     irow=irb+row_block_size*ir;
406     reg=0.;
407     #pragma ivdep
408     for (icb=0;icb< col_block_size;icb++) {
409     icol=icb+col_block_size*index[iptr];
410     reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
411     }
412     out[irow] += alpha * reg;
413     }
414     }
415     }
416     }
417     }
418     return;
419     }

  ViewVC Help
Powered by ViewVC 1.1.26