/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1981 - (hide annotations)
Thu Nov 6 05:27:33 2008 UTC (10 years, 11 months ago) by jfenwick
File MIME type: text/plain
File size: 15254 byte(s)
More warning removal.

1 ksteube 1315
2     /*******************************************************
3 ksteube 1811 *
4     * Copyright (c) 2003-2008 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13 ksteube 1315
14 ksteube 1811
15 ksteube 1315 /**************************************************************/
16    
17     /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
18    
19     /**************************************************************/
20    
21     /* Author: gross@access.edu.au */
22    
23     /**************************************************************/
24    
25     #include "SparseMatrix.h"
26 ksteube 1637 #ifdef _OPENMP
27     #include <omp.h>
28     #endif
29 ksteube 1315
30     /* CSC format with offset 0*/
31 gross 1639 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(const double alpha,
32     const Paso_SparseMatrix* A,
33     const double* in,
34     const double beta,
35 ksteube 1315 double* out) {
36    
37     register index_t ir,icol,iptr,icb,irb,irow,ic;
38    
39     if (ABS(beta)>0.) {
40     if (beta != 1.) {
41 gross 1556 #pragma omp parallel for private(irow) schedule(static)
42 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
43     out[irow] *= beta;
44     }
45     } else {
46 gross 1556 #pragma omp parallel for private(irow) schedule(static)
47 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
48     out[irow] = 0;
49     }
50     if (Paso_Pattern_isEmpty(A->pattern)) return;
51     /* do the operation: */
52     if (ABS(alpha)>0) {
53     if (A ->col_block_size==1 && A->row_block_size ==1) {
54     /* TODO: parallelize (good luck!) */
55     for (icol=0;icol< A->pattern->numOutput;++icol) {
56     #pragma ivdep
57     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
58     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
59     }
60     }
61     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
62     /* TODO: parallelize */
63     for (ic=0;ic< A->pattern->numOutput;ic++) {
64     #pragma ivdep
65     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
66 gross 1511 ir=2*(A->pattern->index[iptr]);
67     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
68     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
69 ksteube 1315 }
70     }
71     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
72     /* TODO: parallelize */
73     for (ic=0;ic< A->pattern->numOutput;ic++) {
74     #pragma ivdep
75     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
76     ir=3*(A->pattern->index[iptr]);
77 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
78     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
79     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
80 ksteube 1315 }
81     }
82     } else {
83     /* TODO: parallelize */
84     for (ic=0;ic< A->pattern->numOutput;ic++) {
85     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
86     for (irb=0;irb< A->row_block_size;irb++) {
87     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
88     #pragma ivdep
89     for (icb=0;icb< A->col_block_size;icb++) {
90     icol=icb+A->col_block_size*ic;
91     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
92     }
93     }
94     }
95     }
96     }
97     }
98     return;
99     }
100    
101     /* CSC format with offset 1*/
102 gross 1639 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(const double alpha,
103     const Paso_SparseMatrix* A,
104     const double* in,
105     const double beta,
106 ksteube 1315 double* out) {
107    
108     register index_t ir,icol,iptr,icb,irb,irow,ic;
109 phornby 1628
110 ksteube 1315 if (ABS(beta)>0.) {
111     if (beta != 1.) {
112 gross 1556 #pragma omp parallel for private(irow) schedule(static)
113 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
114     out[irow] *= beta;
115     }
116     }
117     } else {
118 gross 1556 #pragma omp parallel for private(irow) schedule(static)
119 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
120     out[irow] = 0;
121     }
122    
123     /* do the operation: */
124     if (ABS(alpha)>0) {
125     if (A ->col_block_size==1 && A->row_block_size ==1) {
126     /* TODO: parallelize (good luck!) */
127     for (icol=0;icol< A->pattern->numOutput;++icol) {
128     #pragma ivdep
129     for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
130     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
131     }
132     }
133     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
134     /* TODO: parallelize */
135     for (ic=0;ic< A->pattern->numOutput;ic++) {
136     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
137 gross 1511 ir=2*(A->pattern->index[iptr]-1);
138     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
139     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
140 ksteube 1315 }
141     }
142     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
143     /* TODO: parallelize */
144     for (ic=0;ic< A->pattern->numOutput;ic++) {
145     #pragma ivdep
146     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
147     ir=3*(A->pattern->index[iptr]-1);
148 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
149     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
150     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
151 ksteube 1315 }
152     }
153     } else {
154     /* TODO: parallelize */
155     for (ic=0;ic< A->pattern->numOutput;ic++) {
156     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
157     for (irb=0;irb< A->row_block_size;irb++) {
158     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
159     #pragma ivdep
160     for (icb=0;icb< A->col_block_size;icb++) {
161     icol=icb+A->col_block_size*ic;
162     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
163     }
164     }
165     }
166     }
167     }
168     }
169     return;
170     }
171     /* CSR format with offset 1*/
172 gross 1639 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(const double alpha,
173     const Paso_SparseMatrix* A,
174     const double* in,
175     const double beta,
176 ksteube 1315 double* out) {
177    
178     register index_t ir,icol,iptr,icb,irb,irow,ic;
179     register double reg,reg1,reg2,reg3;
180     if (ABS(beta)>0.) {
181     if (beta != 1.) {
182 gross 1556 #pragma omp parallel for private(irow) schedule(static)
183 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
184     out[irow] *= beta;
185     }
186     } else {
187 gross 1556 #pragma omp parallel for private(irow) schedule(static)
188 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
189     out[irow] = 0;
190     }
191     /* do the operation: */
192     if (ABS(alpha)>0) {
193     if (A ->col_block_size==1 && A->row_block_size ==1) {
194 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
195 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
196     reg=0.;
197     #pragma ivdep
198     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
199     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
200     }
201     out[irow] += alpha * reg;
202     }
203     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
204 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
205 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
206     reg1=0.;
207     reg2=0.;
208     #pragma ivdep
209     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
210     ic=2*(A->pattern->index[iptr]-1);
211     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
212     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
213     }
214     out[ 2*ir] += alpha * reg1;
215     out[1+2*ir] += alpha * reg2;
216     }
217     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
218 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
219 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
220     reg1=0.;
221     reg2=0.;
222     reg3=0.;
223     #pragma ivdep
224     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
225     ic=3*(A->pattern->index[iptr]-1);
226     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
227     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
228     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
229     }
230     out[ 3*ir] += alpha * reg1;
231     out[1+3*ir] += alpha * reg2;
232     out[2+3*ir] += alpha * reg3;
233     }
234     } else {
235 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
236 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
237     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
238     for (irb=0;irb< A->row_block_size;irb++) {
239     irow=irb+A->row_block_size*ir;
240     reg=0.;
241     #pragma ivdep
242     for (icb=0;icb< A->col_block_size;icb++) {
243     icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
244     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
245     }
246     out[irow] += alpha * reg;
247     }
248     }
249     }
250     }
251     }
252     return;
253     }
254 gross 1564 /* CSR format with offset 0*/
255 gross 1639 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(const double alpha,
256     const Paso_SparseMatrix* A,
257     const double* in,
258     const double beta,
259 gross 1564 double* out)
260     {
261 gross 1570 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
262 gross 1564
263 ksteube 1637 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
264 gross 1570 #define USE_DYNAMIC_SCHEDULING
265     #endif
266    
267 jfenwick 1981 #ifdef USE_DYNAMIC_SCHEDULING
268 gross 1564 char* chksz_chr=NULL;
269 gross 1570 dim_t chunk_size=1;
270 jfenwick 1981 #endif
271    
272 gross 1564 dim_t nrow=A->numRows;
273 phornby 1642 dim_t np=1, len, rest, irow, local_n, p;
274    
275     #ifdef USE_DYNAMIC_SCHEDULING
276     dim_t n_chunks;
277     #endif
278    
279 ksteube 1637 #ifdef _OPENMP
280 gross 1564 np=omp_get_max_threads();
281 ksteube 1637 #endif
282 gross 1570 #ifdef USE_DYNAMIC_SCHEDULING
283 gross 1564 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
284     if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
285 gross 1570 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
286     n_chunks=nrow/chunk_size;
287     if (n_chunks*chunk_size<nrow) n_chunks+=1;
288     #else
289     len=nrow/np;
290     rest=nrow-len*np;
291 gross 1564 #endif
292    
293 gross 1571 #pragma omp parallel private(irow, p, local_n)
294 gross 1570 {
295     #ifdef USE_DYNAMIC_SCHEDULING
296 gross 1571 #pragma omp for schedule(dynamic,1)
297 gross 1570 for (p=0; p<n_chunks;p++) {
298     irow=chunk_size*p;
299     local_n=MIN(chunk_size,nrow-chunk_size*p);
300     #else
301 gross 1571 #pragma omp for schedule(static)
302 gross 1570 for (p=0; p<np;p++) {
303     irow=len*p+MIN(p,rest);
304     local_n=len+(p<rest ? 1 :0 );
305     #endif
306     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
307     local_n,
308     A->row_block_size,
309     A->col_block_size,
310     &(A->pattern->ptr[irow]),
311     A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
312     #ifdef USE_DYNAMIC_SCHEDULING
313     }
314     #else
315     }
316     #endif
317 gross 1564 }
318     }
319     /* CSR format with offset 0*/
320 gross 1639 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(const double alpha,
321     const dim_t nRows,
322     const dim_t row_block_size,
323     const dim_t col_block_size,
324     const index_t* ptr,
325     const index_t* index,
326     const double* val,
327     const double* in,
328     const double beta,
329 gross 1564 double* out)
330     {
331     register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
332     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
333     dim_t block_size;
334     if (ABS(beta)>0.) {
335     if (beta != 1.) {
336     for (irow=0;irow < nRows * row_block_size;irow++)
337     out[irow] *= beta;
338     }
339     } else {
340     for (irow=0;irow < nRows * row_block_size;irow++)
341     out[irow] = 0;
342     }
343     if (ABS(alpha)>0) {
344     if (col_block_size==1 && row_block_size ==1) {
345     for (irow=0;irow< nRows;++irow) {
346     reg=0.;
347     #pragma ivdep
348     for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
349     reg += val[iptr] * in[index[iptr]];
350     }
351     out[irow] += alpha * reg;
352     }
353     } else if (col_block_size==2 && row_block_size ==2) {
354     for (ir=0;ir< nRows;ir++) {
355     reg1=0.;
356     reg2=0.;
357     #pragma ivdep
358     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
359     ic=2*index[iptr];
360     Aiptr=iptr*4;
361     in1=in[ic];
362     in2=in[1+ic];
363     A00=val[Aiptr ];
364     A10=val[Aiptr+1];
365     A01=val[Aiptr+2];
366     A11=val[Aiptr+3];
367     reg1 += A00*in1 + A01*in2;
368     reg2 += A10*in1 + A11*in2;
369     }
370     out[ 2*ir] += alpha * reg1;
371     out[1+2*ir] += alpha * reg2;
372     }
373     } else if (col_block_size==3 && row_block_size ==3) {
374     for (ir=0;ir< nRows;ir++) {
375     reg1=0.;
376     reg2=0.;
377     reg3=0.;
378     #pragma ivdep
379     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
380     ic=3*index[iptr];
381     Aiptr=iptr*9;
382     in1=in[ic];
383     in2=in[1+ic];
384     in3=in[2+ic];
385     A00=val[Aiptr ];
386     A10=val[Aiptr+1];
387     A20=val[Aiptr+2];
388     A01=val[Aiptr+3];
389     A11=val[Aiptr+4];
390     A21=val[Aiptr+5];
391     A02=val[Aiptr+6];
392     A12=val[Aiptr+7];
393     A22=val[Aiptr+8];
394     reg1 += A00*in1 + A01*in2 + A02*in3;
395     reg2 += A10*in1 + A11*in2 + A12*in3;
396     reg3 += A20*in1 + A21*in2 + A22*in3;
397     }
398     out[ 3*ir] += alpha * reg1;
399     out[1+3*ir] += alpha * reg2;
400     out[2+3*ir] += alpha * reg3;
401     }
402     } else {
403     block_size=col_block_size*row_block_size;
404     for (ir=0;ir< nRows;ir++) {
405     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
406     for (irb=0;irb< row_block_size;irb++) {
407     irow=irb+row_block_size*ir;
408     reg=0.;
409     #pragma ivdep
410     for (icb=0;icb< col_block_size;icb++) {
411     icol=icb+col_block_size*index[iptr];
412     reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
413     }
414     out[irow] += alpha * reg;
415     }
416     }
417     }
418     }
419     }
420     return;
421     }

  ViewVC Help
Powered by ViewVC 1.1.26