/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1564 - (hide annotations)
Thu May 22 09:31:33 2008 UTC (11 years, 6 months ago) by gross
File MIME type: text/plain
File size: 16092 byte(s)
some openmp dynamic scheduling for MVM.
1 ksteube 1315
2     /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3    
4     /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15    
16     /**************************************************************/
17    
18     /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19    
20     /**************************************************************/
21    
22     /* Author: gross@access.edu.au */
23    
24     /**************************************************************/
25    
26     #include "SparseMatrix.h"
27    
28     /* CSC format with offset 0*/
29     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha,
30     Paso_SparseMatrix* A,
31     double* in,
32     double beta,
33     double* out) {
34    
35     register index_t ir,icol,iptr,icb,irb,irow,ic;
36     register double reg,reg1,reg2,reg3;
37    
38     if (ABS(beta)>0.) {
39     if (beta != 1.) {
40 gross 1556 #pragma omp parallel for private(irow) schedule(static)
41 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
42     out[irow] *= beta;
43     }
44     } else {
45 gross 1556 #pragma omp parallel for private(irow) schedule(static)
46 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
47     out[irow] = 0;
48     }
49     if (Paso_Pattern_isEmpty(A->pattern)) return;
50     /* do the operation: */
51     if (ABS(alpha)>0) {
52     if (A ->col_block_size==1 && A->row_block_size ==1) {
53     /* TODO: parallelize (good luck!) */
54     for (icol=0;icol< A->pattern->numOutput;++icol) {
55     #pragma ivdep
56     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
57     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
58     }
59     }
60     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
61     /* TODO: parallelize */
62     for (ic=0;ic< A->pattern->numOutput;ic++) {
63     #pragma ivdep
64     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
65 gross 1511 ir=2*(A->pattern->index[iptr]);
66     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
67     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
68 ksteube 1315 }
69     }
70     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
71     /* TODO: parallelize */
72     for (ic=0;ic< A->pattern->numOutput;ic++) {
73     #pragma ivdep
74     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
75     ir=3*(A->pattern->index[iptr]);
76 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
77     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
78     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
79 ksteube 1315 }
80     }
81     } else {
82     /* TODO: parallelize */
83     for (ic=0;ic< A->pattern->numOutput;ic++) {
84     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
85     for (irb=0;irb< A->row_block_size;irb++) {
86     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
87     #pragma ivdep
88     for (icb=0;icb< A->col_block_size;icb++) {
89     icol=icb+A->col_block_size*ic;
90     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
91     }
92     }
93     }
94     }
95     }
96     }
97     return;
98     }
99    
100     /* CSC format with offset 1*/
101     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha,
102     Paso_SparseMatrix* A,
103     double* in,
104     double beta,
105     double* out) {
106    
107     register index_t ir,icol,iptr,icb,irb,irow,ic;
108     register double reg,reg1,reg2,reg3;
109     if (ABS(beta)>0.) {
110     if (beta != 1.) {
111 gross 1556 #pragma omp parallel for private(irow) schedule(static)
112 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
113     out[irow] *= beta;
114     }
115     }
116     } else {
117 gross 1556 #pragma omp parallel for private(irow) schedule(static)
118 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
119     out[irow] = 0;
120     }
121    
122     /* do the operation: */
123     if (ABS(alpha)>0) {
124     if (A ->col_block_size==1 && A->row_block_size ==1) {
125     /* TODO: parallelize (good luck!) */
126     for (icol=0;icol< A->pattern->numOutput;++icol) {
127     #pragma ivdep
128     for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
129     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
130     }
131     }
132     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
133     /* TODO: parallelize */
134     for (ic=0;ic< A->pattern->numOutput;ic++) {
135     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
136 gross 1511 ir=2*(A->pattern->index[iptr]-1);
137     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
138     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
139 ksteube 1315 }
140     }
141     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
142     /* TODO: parallelize */
143     for (ic=0;ic< A->pattern->numOutput;ic++) {
144     #pragma ivdep
145     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
146     ir=3*(A->pattern->index[iptr]-1);
147 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
148     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
149     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
150 ksteube 1315 }
151     }
152     } else {
153     /* TODO: parallelize */
154     for (ic=0;ic< A->pattern->numOutput;ic++) {
155     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
156     for (irb=0;irb< A->row_block_size;irb++) {
157     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
158     #pragma ivdep
159     for (icb=0;icb< A->col_block_size;icb++) {
160     icol=icb+A->col_block_size*ic;
161     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
162     }
163     }
164     }
165     }
166     }
167     }
168     return;
169     }
170     /* CSR format with offset 1*/
171     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha,
172     Paso_SparseMatrix* A,
173     double* in,
174     double beta,
175     double* out) {
176    
177     register index_t ir,icol,iptr,icb,irb,irow,ic;
178     register double reg,reg1,reg2,reg3;
179     if (ABS(beta)>0.) {
180     if (beta != 1.) {
181 gross 1556 #pragma omp parallel for private(irow) schedule(static)
182 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
183     out[irow] *= beta;
184     }
185     } else {
186 gross 1556 #pragma omp parallel for private(irow) schedule(static)
187 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
188     out[irow] = 0;
189     }
190     /* do the operation: */
191     if (ABS(alpha)>0) {
192     if (A ->col_block_size==1 && A->row_block_size ==1) {
193 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
194 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
195     reg=0.;
196     #pragma ivdep
197     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
198     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
199     }
200     out[irow] += alpha * reg;
201     }
202     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
203 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
204 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
205     reg1=0.;
206     reg2=0.;
207     #pragma ivdep
208     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
209     ic=2*(A->pattern->index[iptr]-1);
210     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
211     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
212     }
213     out[ 2*ir] += alpha * reg1;
214     out[1+2*ir] += alpha * reg2;
215     }
216     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
217 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
218 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
219     reg1=0.;
220     reg2=0.;
221     reg3=0.;
222     #pragma ivdep
223     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
224     ic=3*(A->pattern->index[iptr]-1);
225     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
226     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
227     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
228     }
229     out[ 3*ir] += alpha * reg1;
230     out[1+3*ir] += alpha * reg2;
231     out[2+3*ir] += alpha * reg3;
232     }
233     } else {
234 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
235 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
236     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
237     for (irb=0;irb< A->row_block_size;irb++) {
238     irow=irb+A->row_block_size*ir;
239     reg=0.;
240     #pragma ivdep
241     for (icb=0;icb< A->col_block_size;icb++) {
242     icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
243     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
244     }
245     out[irow] += alpha * reg;
246     }
247     }
248     }
249     }
250     }
251     return;
252     }
253 gross 1564 /* CSR format with offset 0*/
254     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha,
255     Paso_SparseMatrix* A,
256     double* in,
257     double beta,
258     double* out)
259     {
260     /* #define PASO_DYNAMIC_SCHEDULING_MVM */
261    
262     char* chksz_chr=NULL;
263     dim_t chunk_size=-1;
264     dim_t nrow=A->numRows;
265     dim_t np, len, rest, irow, local_n, p, n_chunks;
266     np=omp_get_max_threads();
267     #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP
268     chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
269     if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
270     #endif
271    
272     if (chunk_size<1 || np <=1) {
273     #pragma omp parallel private(irow, len, rest, local_n)
274     {
275     len=nrow/np;
276     rest=nrow-len*np;
277     #pragma omp for private(p) schedule(static)
278     for (p=0; p<np;p++) {
279     irow=len*p+MIN(p,rest);
280     local_n=len+(p<rest ? 1 :0 );
281     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
282     local_n,
283     A->row_block_size,
284     A->col_block_size,
285     &(A->pattern->ptr[irow]),
286     A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
287     }
288     }
289     } else {
290     #pragma omp parallel private(n_chunks,irow,local_n)
291     {
292     n_chunks=nrow/chunk_size;
293     if (n_chunks*chunk_size<nrow) n_chunks+=1;
294     #pragma omp for private(p) schedule(dynamic,1)
295     for (p=0; p<n_chunks;p++) {
296     irow=chunk_size*p;
297     local_n=MIN(chunk_size,nrow-chunk_size*p);
298     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
299     local_n,
300     A->row_block_size,
301     A->col_block_size,
302     &(A->pattern->ptr[irow]),
303     A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
304     }
305     }
306     }
307     }
308     /* CSR format with offset 0*/
309     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha,
310     dim_t nRows,
311     dim_t row_block_size,
312     dim_t col_block_size,
313     index_t* ptr,
314     index_t* index,
315     double* val,
316     double* in,
317     double beta,
318     double* out)
319     {
320     register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
321     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
322     dim_t block_size;
323     if (ABS(beta)>0.) {
324     if (beta != 1.) {
325     #pragma omp for private(irow) schedule(static)
326     for (irow=0;irow < nRows * row_block_size;irow++)
327     out[irow] *= beta;
328     }
329     } else {
330     #pragma omp for private(irow) schedule(static)
331     for (irow=0;irow < nRows * row_block_size;irow++)
332     out[irow] = 0;
333     }
334     if (ABS(alpha)>0) {
335     if (col_block_size==1 && row_block_size ==1) {
336     #pragma omp for private(irow,iptr,reg) schedule(static)
337     for (irow=0;irow< nRows;++irow) {
338     reg=0.;
339     #pragma ivdep
340     for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
341     reg += val[iptr] * in[index[iptr]];
342     }
343     out[irow] += alpha * reg;
344     }
345     } else if (col_block_size==2 && row_block_size ==2) {
346     #pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static)
347     for (ir=0;ir< nRows;ir++) {
348     reg1=0.;
349     reg2=0.;
350     #pragma ivdep
351     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
352     ic=2*index[iptr];
353     Aiptr=iptr*4;
354     in1=in[ic];
355     in2=in[1+ic];
356     A00=val[Aiptr ];
357     A10=val[Aiptr+1];
358     A01=val[Aiptr+2];
359     A11=val[Aiptr+3];
360     reg1 += A00*in1 + A01*in2;
361     reg2 += A10*in1 + A11*in2;
362     }
363     out[ 2*ir] += alpha * reg1;
364     out[1+2*ir] += alpha * reg2;
365     }
366     } else if (col_block_size==3 && row_block_size ==3) {
367     #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static)
368     for (ir=0;ir< nRows;ir++) {
369     reg1=0.;
370     reg2=0.;
371     reg3=0.;
372     #pragma ivdep
373     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
374     ic=3*index[iptr];
375     Aiptr=iptr*9;
376     in1=in[ic];
377     in2=in[1+ic];
378     in3=in[2+ic];
379     A00=val[Aiptr ];
380     A10=val[Aiptr+1];
381     A20=val[Aiptr+2];
382     A01=val[Aiptr+3];
383     A11=val[Aiptr+4];
384     A21=val[Aiptr+5];
385     A02=val[Aiptr+6];
386     A12=val[Aiptr+7];
387     A22=val[Aiptr+8];
388     reg1 += A00*in1 + A01*in2 + A02*in3;
389     reg2 += A10*in1 + A11*in2 + A12*in3;
390     reg3 += A20*in1 + A21*in2 + A22*in3;
391     }
392     out[ 3*ir] += alpha * reg1;
393     out[1+3*ir] += alpha * reg2;
394     out[2+3*ir] += alpha * reg3;
395     }
396     } else {
397     block_size=col_block_size*row_block_size;
398     #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
399     for (ir=0;ir< nRows;ir++) {
400     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
401     for (irb=0;irb< row_block_size;irb++) {
402     irow=irb+row_block_size*ir;
403     reg=0.;
404     #pragma ivdep
405     for (icb=0;icb< col_block_size;icb++) {
406     icol=icb+col_block_size*index[iptr];
407     reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
408     }
409     out[irow] += alpha * reg;
410     }
411     }
412     }
413     }
414     }
415     return;
416     }

  ViewVC Help
Powered by ViewVC 1.1.26