/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1570 - (hide annotations)
Sat May 24 21:31:04 2008 UTC (11 years, 5 months ago) by gross
File MIME type: text/plain
File size: 15156 byte(s)
modifications to PCG to support dynamic scheduling
1 ksteube 1315
2     /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3    
4     /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15    
16     /**************************************************************/
17    
18     /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19    
20     /**************************************************************/
21    
22     /* Author: gross@access.edu.au */
23    
24     /**************************************************************/
25    
26     #include "SparseMatrix.h"
27    
28     /* CSC format with offset 0*/
29     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha,
30     Paso_SparseMatrix* A,
31     double* in,
32     double beta,
33     double* out) {
34    
35     register index_t ir,icol,iptr,icb,irb,irow,ic;
36     register double reg,reg1,reg2,reg3;
37    
38     if (ABS(beta)>0.) {
39     if (beta != 1.) {
40 gross 1556 #pragma omp parallel for private(irow) schedule(static)
41 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
42     out[irow] *= beta;
43     }
44     } else {
45 gross 1556 #pragma omp parallel for private(irow) schedule(static)
46 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
47     out[irow] = 0;
48     }
49     if (Paso_Pattern_isEmpty(A->pattern)) return;
50     /* do the operation: */
51     if (ABS(alpha)>0) {
52     if (A ->col_block_size==1 && A->row_block_size ==1) {
53     /* TODO: parallelize (good luck!) */
54     for (icol=0;icol< A->pattern->numOutput;++icol) {
55     #pragma ivdep
56     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
57     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
58     }
59     }
60     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
61     /* TODO: parallelize */
62     for (ic=0;ic< A->pattern->numOutput;ic++) {
63     #pragma ivdep
64     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
65 gross 1511 ir=2*(A->pattern->index[iptr]);
66     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
67     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
68 ksteube 1315 }
69     }
70     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
71     /* TODO: parallelize */
72     for (ic=0;ic< A->pattern->numOutput;ic++) {
73     #pragma ivdep
74     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
75     ir=3*(A->pattern->index[iptr]);
76 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
77     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
78     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
79 ksteube 1315 }
80     }
81     } else {
82     /* TODO: parallelize */
83     for (ic=0;ic< A->pattern->numOutput;ic++) {
84     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
85     for (irb=0;irb< A->row_block_size;irb++) {
86     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
87     #pragma ivdep
88     for (icb=0;icb< A->col_block_size;icb++) {
89     icol=icb+A->col_block_size*ic;
90     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
91     }
92     }
93     }
94     }
95     }
96     }
97     return;
98     }
99    
100     /* CSC format with offset 1*/
101     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha,
102     Paso_SparseMatrix* A,
103     double* in,
104     double beta,
105     double* out) {
106    
107     register index_t ir,icol,iptr,icb,irb,irow,ic;
108     register double reg,reg1,reg2,reg3;
109     if (ABS(beta)>0.) {
110     if (beta != 1.) {
111 gross 1556 #pragma omp parallel for private(irow) schedule(static)
112 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
113     out[irow] *= beta;
114     }
115     }
116     } else {
117 gross 1556 #pragma omp parallel for private(irow) schedule(static)
118 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
119     out[irow] = 0;
120     }
121    
122     /* do the operation: */
123     if (ABS(alpha)>0) {
124     if (A ->col_block_size==1 && A->row_block_size ==1) {
125     /* TODO: parallelize (good luck!) */
126     for (icol=0;icol< A->pattern->numOutput;++icol) {
127     #pragma ivdep
128     for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
129     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
130     }
131     }
132     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
133     /* TODO: parallelize */
134     for (ic=0;ic< A->pattern->numOutput;ic++) {
135     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
136 gross 1511 ir=2*(A->pattern->index[iptr]-1);
137     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
138     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
139 ksteube 1315 }
140     }
141     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
142     /* TODO: parallelize */
143     for (ic=0;ic< A->pattern->numOutput;ic++) {
144     #pragma ivdep
145     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
146     ir=3*(A->pattern->index[iptr]-1);
147 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
148     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
149     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
150 ksteube 1315 }
151     }
152     } else {
153     /* TODO: parallelize */
154     for (ic=0;ic< A->pattern->numOutput;ic++) {
155     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
156     for (irb=0;irb< A->row_block_size;irb++) {
157     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
158     #pragma ivdep
159     for (icb=0;icb< A->col_block_size;icb++) {
160     icol=icb+A->col_block_size*ic;
161     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
162     }
163     }
164     }
165     }
166     }
167     }
168     return;
169     }
170     /* CSR format with offset 1*/
171     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha,
172     Paso_SparseMatrix* A,
173     double* in,
174     double beta,
175     double* out) {
176    
177     register index_t ir,icol,iptr,icb,irb,irow,ic;
178     register double reg,reg1,reg2,reg3;
179     if (ABS(beta)>0.) {
180     if (beta != 1.) {
181 gross 1556 #pragma omp parallel for private(irow) schedule(static)
182 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
183     out[irow] *= beta;
184     }
185     } else {
186 gross 1556 #pragma omp parallel for private(irow) schedule(static)
187 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
188     out[irow] = 0;
189     }
190     /* do the operation: */
191     if (ABS(alpha)>0) {
192     if (A ->col_block_size==1 && A->row_block_size ==1) {
193 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
194 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
195     reg=0.;
196     #pragma ivdep
197     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
198     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
199     }
200     out[irow] += alpha * reg;
201     }
202     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
203 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
204 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
205     reg1=0.;
206     reg2=0.;
207     #pragma ivdep
208     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
209     ic=2*(A->pattern->index[iptr]-1);
210     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
211     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
212     }
213     out[ 2*ir] += alpha * reg1;
214     out[1+2*ir] += alpha * reg2;
215     }
216     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
217 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
218 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
219     reg1=0.;
220     reg2=0.;
221     reg3=0.;
222     #pragma ivdep
223     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
224     ic=3*(A->pattern->index[iptr]-1);
225     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
226     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
227     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
228     }
229     out[ 3*ir] += alpha * reg1;
230     out[1+3*ir] += alpha * reg2;
231     out[2+3*ir] += alpha * reg3;
232     }
233     } else {
234 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
235 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
236     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
237     for (irb=0;irb< A->row_block_size;irb++) {
238     irow=irb+A->row_block_size*ir;
239     reg=0.;
240     #pragma ivdep
241     for (icb=0;icb< A->col_block_size;icb++) {
242     icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
243     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
244     }
245     out[irow] += alpha * reg;
246     }
247     }
248     }
249     }
250     }
251     return;
252     }
253 gross 1564 /* CSR format with offset 0*/
254     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha,
255     Paso_SparseMatrix* A,
256     double* in,
257     double beta,
258     double* out)
259     {
260 gross 1570 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
261 gross 1564
262 gross 1570 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP
263     #define USE_DYNAMIC_SCHEDULING
264     #endif
265    
266 gross 1564 char* chksz_chr=NULL;
267 gross 1570 dim_t chunk_size=1;
268 gross 1564 dim_t nrow=A->numRows;
269     dim_t np, len, rest, irow, local_n, p, n_chunks;
270     np=omp_get_max_threads();
271 gross 1570 #ifdef USE_DYNAMIC_SCHEDULING
272 gross 1564 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
273     if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
274 gross 1570 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
275     n_chunks=nrow/chunk_size;
276     if (n_chunks*chunk_size<nrow) n_chunks+=1;
277     #else
278     len=nrow/np;
279     rest=nrow-len*np;
280 gross 1564 #endif
281    
282 gross 1570 #pragma omp parallel private(irow, len, p, local_n)
283     {
284     #ifdef USE_DYNAMIC_SCHEDULING
285     #pragma omp for private(p) schedule(dynamic,1)
286     for (p=0; p<n_chunks;p++) {
287     irow=chunk_size*p;
288     local_n=MIN(chunk_size,nrow-chunk_size*p);
289     #else
290     #pragma omp for private(p) schedule(static)
291     for (p=0; p<np;p++) {
292     irow=len*p+MIN(p,rest);
293     local_n=len+(p<rest ? 1 :0 );
294     #endif
295     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
296     local_n,
297     A->row_block_size,
298     A->col_block_size,
299     &(A->pattern->ptr[irow]),
300     A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
301     #ifdef USE_DYNAMIC_SCHEDULING
302     }
303     #else
304     }
305     #endif
306 gross 1564 }
307     }
308     /* CSR format with offset 0*/
309     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha,
310     dim_t nRows,
311     dim_t row_block_size,
312     dim_t col_block_size,
313     index_t* ptr,
314     index_t* index,
315     double* val,
316     double* in,
317     double beta,
318     double* out)
319     {
320     register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
321     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
322     dim_t block_size;
323     if (ABS(beta)>0.) {
324     if (beta != 1.) {
325     for (irow=0;irow < nRows * row_block_size;irow++)
326     out[irow] *= beta;
327     }
328     } else {
329     for (irow=0;irow < nRows * row_block_size;irow++)
330     out[irow] = 0;
331     }
332     if (ABS(alpha)>0) {
333     if (col_block_size==1 && row_block_size ==1) {
334     for (irow=0;irow< nRows;++irow) {
335     reg=0.;
336     #pragma ivdep
337     for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
338     reg += val[iptr] * in[index[iptr]];
339     }
340     out[irow] += alpha * reg;
341     }
342     } else if (col_block_size==2 && row_block_size ==2) {
343     for (ir=0;ir< nRows;ir++) {
344     reg1=0.;
345     reg2=0.;
346     #pragma ivdep
347     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
348     ic=2*index[iptr];
349     Aiptr=iptr*4;
350     in1=in[ic];
351     in2=in[1+ic];
352     A00=val[Aiptr ];
353     A10=val[Aiptr+1];
354     A01=val[Aiptr+2];
355     A11=val[Aiptr+3];
356     reg1 += A00*in1 + A01*in2;
357     reg2 += A10*in1 + A11*in2;
358     }
359     out[ 2*ir] += alpha * reg1;
360     out[1+2*ir] += alpha * reg2;
361     }
362     } else if (col_block_size==3 && row_block_size ==3) {
363     for (ir=0;ir< nRows;ir++) {
364     reg1=0.;
365     reg2=0.;
366     reg3=0.;
367     #pragma ivdep
368     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
369     ic=3*index[iptr];
370     Aiptr=iptr*9;
371     in1=in[ic];
372     in2=in[1+ic];
373     in3=in[2+ic];
374     A00=val[Aiptr ];
375     A10=val[Aiptr+1];
376     A20=val[Aiptr+2];
377     A01=val[Aiptr+3];
378     A11=val[Aiptr+4];
379     A21=val[Aiptr+5];
380     A02=val[Aiptr+6];
381     A12=val[Aiptr+7];
382     A22=val[Aiptr+8];
383     reg1 += A00*in1 + A01*in2 + A02*in3;
384     reg2 += A10*in1 + A11*in2 + A12*in3;
385     reg3 += A20*in1 + A21*in2 + A22*in3;
386     }
387     out[ 3*ir] += alpha * reg1;
388     out[1+3*ir] += alpha * reg2;
389     out[2+3*ir] += alpha * reg3;
390     }
391     } else {
392     block_size=col_block_size*row_block_size;
393     for (ir=0;ir< nRows;ir++) {
394     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
395     for (irb=0;irb< row_block_size;irb++) {
396     irow=irb+row_block_size*ir;
397     reg=0.;
398     #pragma ivdep
399     for (icb=0;icb< col_block_size;icb++) {
400     icol=icb+col_block_size*index[iptr];
401     reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
402     }
403     out[irow] += alpha * reg;
404     }
405     }
406     }
407     }
408     }
409     return;
410     }

  ViewVC Help
Powered by ViewVC 1.1.26