/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1637 - (hide annotations)
Mon Jul 14 05:34:59 2008 UTC (13 years ago) by ksteube
File MIME type: text/plain
File size: 15116 byte(s)
Resolved some compiler warnings
Changed blocktimer to not use strdup, intead malloc and strcpy

1 ksteube 1315
2     /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3    
4     /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15    
16     /**************************************************************/
17    
18     /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19    
20     /**************************************************************/
21    
22     /* Author: gross@access.edu.au */
23    
24     /**************************************************************/
25    
26     #include "SparseMatrix.h"
27 ksteube 1637 #ifdef _OPENMP
28     #include <omp.h>
29     #endif
30 ksteube 1315
31     /* CSC format with offset 0*/
32     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha,
33     Paso_SparseMatrix* A,
34     double* in,
35     double beta,
36     double* out) {
37    
38     register index_t ir,icol,iptr,icb,irb,irow,ic;
39    
40     if (ABS(beta)>0.) {
41     if (beta != 1.) {
42 gross 1556 #pragma omp parallel for private(irow) schedule(static)
43 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
44     out[irow] *= beta;
45     }
46     } else {
47 gross 1556 #pragma omp parallel for private(irow) schedule(static)
48 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
49     out[irow] = 0;
50     }
51     if (Paso_Pattern_isEmpty(A->pattern)) return;
52     /* do the operation: */
53     if (ABS(alpha)>0) {
54     if (A ->col_block_size==1 && A->row_block_size ==1) {
55     /* TODO: parallelize (good luck!) */
56     for (icol=0;icol< A->pattern->numOutput;++icol) {
57     #pragma ivdep
58     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
59     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
60     }
61     }
62     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
63     /* TODO: parallelize */
64     for (ic=0;ic< A->pattern->numOutput;ic++) {
65     #pragma ivdep
66     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
67 gross 1511 ir=2*(A->pattern->index[iptr]);
68     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
69     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
70 ksteube 1315 }
71     }
72     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
73     /* TODO: parallelize */
74     for (ic=0;ic< A->pattern->numOutput;ic++) {
75     #pragma ivdep
76     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
77     ir=3*(A->pattern->index[iptr]);
78 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
79     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
80     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
81 ksteube 1315 }
82     }
83     } else {
84     /* TODO: parallelize */
85     for (ic=0;ic< A->pattern->numOutput;ic++) {
86     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
87     for (irb=0;irb< A->row_block_size;irb++) {
88     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
89     #pragma ivdep
90     for (icb=0;icb< A->col_block_size;icb++) {
91     icol=icb+A->col_block_size*ic;
92     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
93     }
94     }
95     }
96     }
97     }
98     }
99     return;
100     }
101    
102     /* CSC format with offset 1*/
103     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha,
104     Paso_SparseMatrix* A,
105     double* in,
106     double beta,
107     double* out) {
108    
109     register index_t ir,icol,iptr,icb,irb,irow,ic;
110 phornby 1628
111 ksteube 1315 if (ABS(beta)>0.) {
112     if (beta != 1.) {
113 gross 1556 #pragma omp parallel for private(irow) schedule(static)
114 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
115     out[irow] *= beta;
116     }
117     }
118     } else {
119 gross 1556 #pragma omp parallel for private(irow) schedule(static)
120 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
121     out[irow] = 0;
122     }
123    
124     /* do the operation: */
125     if (ABS(alpha)>0) {
126     if (A ->col_block_size==1 && A->row_block_size ==1) {
127     /* TODO: parallelize (good luck!) */
128     for (icol=0;icol< A->pattern->numOutput;++icol) {
129     #pragma ivdep
130     for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
131     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
132     }
133     }
134     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
135     /* TODO: parallelize */
136     for (ic=0;ic< A->pattern->numOutput;ic++) {
137     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
138 gross 1511 ir=2*(A->pattern->index[iptr]-1);
139     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
140     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
141 ksteube 1315 }
142     }
143     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
144     /* TODO: parallelize */
145     for (ic=0;ic< A->pattern->numOutput;ic++) {
146     #pragma ivdep
147     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
148     ir=3*(A->pattern->index[iptr]-1);
149 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
150     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
151     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
152 ksteube 1315 }
153     }
154     } else {
155     /* TODO: parallelize */
156     for (ic=0;ic< A->pattern->numOutput;ic++) {
157     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
158     for (irb=0;irb< A->row_block_size;irb++) {
159     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
160     #pragma ivdep
161     for (icb=0;icb< A->col_block_size;icb++) {
162     icol=icb+A->col_block_size*ic;
163     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
164     }
165     }
166     }
167     }
168     }
169     }
170     return;
171     }
172     /* CSR format with offset 1*/
173     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha,
174     Paso_SparseMatrix* A,
175     double* in,
176     double beta,
177     double* out) {
178    
179     register index_t ir,icol,iptr,icb,irb,irow,ic;
180     register double reg,reg1,reg2,reg3;
181     if (ABS(beta)>0.) {
182     if (beta != 1.) {
183 gross 1556 #pragma omp parallel for private(irow) schedule(static)
184 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
185     out[irow] *= beta;
186     }
187     } else {
188 gross 1556 #pragma omp parallel for private(irow) schedule(static)
189 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
190     out[irow] = 0;
191     }
192     /* do the operation: */
193     if (ABS(alpha)>0) {
194     if (A ->col_block_size==1 && A->row_block_size ==1) {
195 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
196 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
197     reg=0.;
198     #pragma ivdep
199     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
200     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
201     }
202     out[irow] += alpha * reg;
203     }
204     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
205 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
206 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
207     reg1=0.;
208     reg2=0.;
209     #pragma ivdep
210     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
211     ic=2*(A->pattern->index[iptr]-1);
212     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
213     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
214     }
215     out[ 2*ir] += alpha * reg1;
216     out[1+2*ir] += alpha * reg2;
217     }
218     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
219 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
220 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
221     reg1=0.;
222     reg2=0.;
223     reg3=0.;
224     #pragma ivdep
225     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
226     ic=3*(A->pattern->index[iptr]-1);
227     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
228     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
229     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
230     }
231     out[ 3*ir] += alpha * reg1;
232     out[1+3*ir] += alpha * reg2;
233     out[2+3*ir] += alpha * reg3;
234     }
235     } else {
236 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
237 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
238     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
239     for (irb=0;irb< A->row_block_size;irb++) {
240     irow=irb+A->row_block_size*ir;
241     reg=0.;
242     #pragma ivdep
243     for (icb=0;icb< A->col_block_size;icb++) {
244     icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
245     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
246     }
247     out[irow] += alpha * reg;
248     }
249     }
250     }
251     }
252     }
253     return;
254     }
255 gross 1564 /* CSR format with offset 0*/
256     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha,
257     Paso_SparseMatrix* A,
258     double* in,
259     double beta,
260     double* out)
261     {
262 gross 1570 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
263 gross 1564
264 ksteube 1637 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
265 gross 1570 #define USE_DYNAMIC_SCHEDULING
266     #endif
267    
268 gross 1564 char* chksz_chr=NULL;
269 gross 1570 dim_t chunk_size=1;
270 gross 1564 dim_t nrow=A->numRows;
271 ksteube 1637 dim_t np=1, len, rest, irow, local_n, p, n_chunks;
272     #ifdef _OPENMP
273 gross 1564 np=omp_get_max_threads();
274 ksteube 1637 #endif
275 gross 1570 #ifdef USE_DYNAMIC_SCHEDULING
276 gross 1564 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
277     if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
278 gross 1570 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
279     n_chunks=nrow/chunk_size;
280     if (n_chunks*chunk_size<nrow) n_chunks+=1;
281     #else
282     len=nrow/np;
283     rest=nrow-len*np;
284 gross 1564 #endif
285    
286 gross 1571 #pragma omp parallel private(irow, p, local_n)
287 gross 1570 {
288     #ifdef USE_DYNAMIC_SCHEDULING
289 gross 1571 #pragma omp for schedule(dynamic,1)
290 gross 1570 for (p=0; p<n_chunks;p++) {
291     irow=chunk_size*p;
292     local_n=MIN(chunk_size,nrow-chunk_size*p);
293     #else
294 gross 1571 #pragma omp for schedule(static)
295 gross 1570 for (p=0; p<np;p++) {
296     irow=len*p+MIN(p,rest);
297     local_n=len+(p<rest ? 1 :0 );
298     #endif
299     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
300     local_n,
301     A->row_block_size,
302     A->col_block_size,
303     &(A->pattern->ptr[irow]),
304     A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
305     #ifdef USE_DYNAMIC_SCHEDULING
306     }
307     #else
308     }
309     #endif
310 gross 1564 }
311     }
312     /* CSR format with offset 0*/
313     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha,
314     dim_t nRows,
315     dim_t row_block_size,
316     dim_t col_block_size,
317     index_t* ptr,
318     index_t* index,
319     double* val,
320     double* in,
321     double beta,
322     double* out)
323     {
324     register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
325     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
326     dim_t block_size;
327     if (ABS(beta)>0.) {
328     if (beta != 1.) {
329     for (irow=0;irow < nRows * row_block_size;irow++)
330     out[irow] *= beta;
331     }
332     } else {
333     for (irow=0;irow < nRows * row_block_size;irow++)
334     out[irow] = 0;
335     }
336     if (ABS(alpha)>0) {
337     if (col_block_size==1 && row_block_size ==1) {
338     for (irow=0;irow< nRows;++irow) {
339     reg=0.;
340     #pragma ivdep
341     for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
342     reg += val[iptr] * in[index[iptr]];
343     }
344     out[irow] += alpha * reg;
345     }
346     } else if (col_block_size==2 && row_block_size ==2) {
347     for (ir=0;ir< nRows;ir++) {
348     reg1=0.;
349     reg2=0.;
350     #pragma ivdep
351     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
352     ic=2*index[iptr];
353     Aiptr=iptr*4;
354     in1=in[ic];
355     in2=in[1+ic];
356     A00=val[Aiptr ];
357     A10=val[Aiptr+1];
358     A01=val[Aiptr+2];
359     A11=val[Aiptr+3];
360     reg1 += A00*in1 + A01*in2;
361     reg2 += A10*in1 + A11*in2;
362     }
363     out[ 2*ir] += alpha * reg1;
364     out[1+2*ir] += alpha * reg2;
365     }
366     } else if (col_block_size==3 && row_block_size ==3) {
367     for (ir=0;ir< nRows;ir++) {
368     reg1=0.;
369     reg2=0.;
370     reg3=0.;
371     #pragma ivdep
372     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
373     ic=3*index[iptr];
374     Aiptr=iptr*9;
375     in1=in[ic];
376     in2=in[1+ic];
377     in3=in[2+ic];
378     A00=val[Aiptr ];
379     A10=val[Aiptr+1];
380     A20=val[Aiptr+2];
381     A01=val[Aiptr+3];
382     A11=val[Aiptr+4];
383     A21=val[Aiptr+5];
384     A02=val[Aiptr+6];
385     A12=val[Aiptr+7];
386     A22=val[Aiptr+8];
387     reg1 += A00*in1 + A01*in2 + A02*in3;
388     reg2 += A10*in1 + A11*in2 + A12*in3;
389     reg3 += A20*in1 + A21*in2 + A22*in3;
390     }
391     out[ 3*ir] += alpha * reg1;
392     out[1+3*ir] += alpha * reg2;
393     out[2+3*ir] += alpha * reg3;
394     }
395     } else {
396     block_size=col_block_size*row_block_size;
397     for (ir=0;ir< nRows;ir++) {
398     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
399     for (irb=0;irb< row_block_size;irb++) {
400     irow=irb+row_block_size*ir;
401     reg=0.;
402     #pragma ivdep
403     for (icb=0;icb< col_block_size;icb++) {
404     icol=icb+col_block_size*index[iptr];
405     reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
406     }
407     out[irow] += alpha * reg;
408     }
409     }
410     }
411     }
412     }
413     return;
414     }

  ViewVC Help
Powered by ViewVC 1.1.26