/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Contents of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1642 - (show annotations)
Tue Jul 15 05:21:57 2008 UTC (11 years, 4 months ago) by phornby
File MIME type: text/plain
File size: 15315 byte(s)
make the 1st arg of the ...._call(const double alpha,....
consistent between the .h and .c file. The const is not really needed.
1
2 /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3
4 /*******************************************************
5 *
6 * Copyright 2003-2007 by ACceSS MNRF
7 * Copyright 2007 by University of Queensland
8 *
9 * http://esscc.uq.edu.au
10 * Primary Business: Queensland, Australia
11 * Licensed under the Open Software License version 3.0
12 * http://www.opensource.org/licenses/osl-3.0.php
13 *
14 *******************************************************/
15
16 /**************************************************************/
17
18 /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19
20 /**************************************************************/
21
22 /* Author: gross@access.edu.au */
23
24 /**************************************************************/
25
26 #include "SparseMatrix.h"
27 #ifdef _OPENMP
28 #include <omp.h>
29 #endif
30
31 /* CSC format with offset 0*/
32 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(const double alpha,
33 const Paso_SparseMatrix* A,
34 const double* in,
35 const double beta,
36 double* out) {
37
38 register index_t ir,icol,iptr,icb,irb,irow,ic;
39
40 if (ABS(beta)>0.) {
41 if (beta != 1.) {
42 #pragma omp parallel for private(irow) schedule(static)
43 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
44 out[irow] *= beta;
45 }
46 } else {
47 #pragma omp parallel for private(irow) schedule(static)
48 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
49 out[irow] = 0;
50 }
51 if (Paso_Pattern_isEmpty(A->pattern)) return;
52 /* do the operation: */
53 if (ABS(alpha)>0) {
54 if (A ->col_block_size==1 && A->row_block_size ==1) {
55 /* TODO: parallelize (good luck!) */
56 for (icol=0;icol< A->pattern->numOutput;++icol) {
57 #pragma ivdep
58 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
59 out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
60 }
61 }
62 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
63 /* TODO: parallelize */
64 for (ic=0;ic< A->pattern->numOutput;ic++) {
65 #pragma ivdep
66 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
67 ir=2*(A->pattern->index[iptr]);
68 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
69 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
70 }
71 }
72 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
73 /* TODO: parallelize */
74 for (ic=0;ic< A->pattern->numOutput;ic++) {
75 #pragma ivdep
76 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
77 ir=3*(A->pattern->index[iptr]);
78 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
79 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
80 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
81 }
82 }
83 } else {
84 /* TODO: parallelize */
85 for (ic=0;ic< A->pattern->numOutput;ic++) {
86 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
87 for (irb=0;irb< A->row_block_size;irb++) {
88 irow=irb+A->row_block_size*(A->pattern->index[iptr]);
89 #pragma ivdep
90 for (icb=0;icb< A->col_block_size;icb++) {
91 icol=icb+A->col_block_size*ic;
92 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
93 }
94 }
95 }
96 }
97 }
98 }
99 return;
100 }
101
102 /* CSC format with offset 1*/
103 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(const double alpha,
104 const Paso_SparseMatrix* A,
105 const double* in,
106 const double beta,
107 double* out) {
108
109 register index_t ir,icol,iptr,icb,irb,irow,ic;
110
111 if (ABS(beta)>0.) {
112 if (beta != 1.) {
113 #pragma omp parallel for private(irow) schedule(static)
114 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
115 out[irow] *= beta;
116 }
117 }
118 } else {
119 #pragma omp parallel for private(irow) schedule(static)
120 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
121 out[irow] = 0;
122 }
123
124 /* do the operation: */
125 if (ABS(alpha)>0) {
126 if (A ->col_block_size==1 && A->row_block_size ==1) {
127 /* TODO: parallelize (good luck!) */
128 for (icol=0;icol< A->pattern->numOutput;++icol) {
129 #pragma ivdep
130 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
131 out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
132 }
133 }
134 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
135 /* TODO: parallelize */
136 for (ic=0;ic< A->pattern->numOutput;ic++) {
137 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
138 ir=2*(A->pattern->index[iptr]-1);
139 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
140 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
141 }
142 }
143 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
144 /* TODO: parallelize */
145 for (ic=0;ic< A->pattern->numOutput;ic++) {
146 #pragma ivdep
147 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
148 ir=3*(A->pattern->index[iptr]-1);
149 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
150 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
151 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
152 }
153 }
154 } else {
155 /* TODO: parallelize */
156 for (ic=0;ic< A->pattern->numOutput;ic++) {
157 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
158 for (irb=0;irb< A->row_block_size;irb++) {
159 irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
160 #pragma ivdep
161 for (icb=0;icb< A->col_block_size;icb++) {
162 icol=icb+A->col_block_size*ic;
163 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
164 }
165 }
166 }
167 }
168 }
169 }
170 return;
171 }
172 /* CSR format with offset 1*/
173 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(const double alpha,
174 const Paso_SparseMatrix* A,
175 const double* in,
176 const double beta,
177 double* out) {
178
179 register index_t ir,icol,iptr,icb,irb,irow,ic;
180 register double reg,reg1,reg2,reg3;
181 if (ABS(beta)>0.) {
182 if (beta != 1.) {
183 #pragma omp parallel for private(irow) schedule(static)
184 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
185 out[irow] *= beta;
186 }
187 } else {
188 #pragma omp parallel for private(irow) schedule(static)
189 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
190 out[irow] = 0;
191 }
192 /* do the operation: */
193 if (ABS(alpha)>0) {
194 if (A ->col_block_size==1 && A->row_block_size ==1) {
195 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
196 for (irow=0;irow< A->pattern->numOutput;++irow) {
197 reg=0.;
198 #pragma ivdep
199 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
200 reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
201 }
202 out[irow] += alpha * reg;
203 }
204 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
205 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
206 for (ir=0;ir< A->pattern->numOutput;ir++) {
207 reg1=0.;
208 reg2=0.;
209 #pragma ivdep
210 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
211 ic=2*(A->pattern->index[iptr]-1);
212 reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
213 reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
214 }
215 out[ 2*ir] += alpha * reg1;
216 out[1+2*ir] += alpha * reg2;
217 }
218 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
219 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
220 for (ir=0;ir< A->pattern->numOutput;ir++) {
221 reg1=0.;
222 reg2=0.;
223 reg3=0.;
224 #pragma ivdep
225 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
226 ic=3*(A->pattern->index[iptr]-1);
227 reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
228 reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
229 reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
230 }
231 out[ 3*ir] += alpha * reg1;
232 out[1+3*ir] += alpha * reg2;
233 out[2+3*ir] += alpha * reg3;
234 }
235 } else {
236 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
237 for (ir=0;ir< A->pattern->numOutput;ir++) {
238 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
239 for (irb=0;irb< A->row_block_size;irb++) {
240 irow=irb+A->row_block_size*ir;
241 reg=0.;
242 #pragma ivdep
243 for (icb=0;icb< A->col_block_size;icb++) {
244 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
245 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
246 }
247 out[irow] += alpha * reg;
248 }
249 }
250 }
251 }
252 }
253 return;
254 }
255 /* CSR format with offset 0*/
256 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(const double alpha,
257 const Paso_SparseMatrix* A,
258 const double* in,
259 const double beta,
260 double* out)
261 {
262 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
263
264 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
265 #define USE_DYNAMIC_SCHEDULING
266 #endif
267
268 char* chksz_chr=NULL;
269 dim_t chunk_size=1;
270 dim_t nrow=A->numRows;
271 dim_t np=1, len, rest, irow, local_n, p;
272
273 #ifdef USE_DYNAMIC_SCHEDULING
274 dim_t n_chunks;
275 #endif
276
277 #ifdef _OPENMP
278 np=omp_get_max_threads();
279 #endif
280 #ifdef USE_DYNAMIC_SCHEDULING
281 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
282 if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
283 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
284 n_chunks=nrow/chunk_size;
285 if (n_chunks*chunk_size<nrow) n_chunks+=1;
286 #else
287 len=nrow/np;
288 rest=nrow-len*np;
289 #endif
290
291 #pragma omp parallel private(irow, p, local_n)
292 {
293 #ifdef USE_DYNAMIC_SCHEDULING
294 #pragma omp for schedule(dynamic,1)
295 for (p=0; p<n_chunks;p++) {
296 irow=chunk_size*p;
297 local_n=MIN(chunk_size,nrow-chunk_size*p);
298 #else
299 #pragma omp for schedule(static)
300 for (p=0; p<np;p++) {
301 irow=len*p+MIN(p,rest);
302 local_n=len+(p<rest ? 1 :0 );
303 #endif
304 Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
305 local_n,
306 A->row_block_size,
307 A->col_block_size,
308 &(A->pattern->ptr[irow]),
309 A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
310 #ifdef USE_DYNAMIC_SCHEDULING
311 }
312 #else
313 }
314 #endif
315 }
316 }
317 /* CSR format with offset 0*/
318 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(const double alpha,
319 const dim_t nRows,
320 const dim_t row_block_size,
321 const dim_t col_block_size,
322 const index_t* ptr,
323 const index_t* index,
324 const double* val,
325 const double* in,
326 const double beta,
327 double* out)
328 {
329 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
330 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
331 dim_t block_size;
332 if (ABS(beta)>0.) {
333 if (beta != 1.) {
334 for (irow=0;irow < nRows * row_block_size;irow++)
335 out[irow] *= beta;
336 }
337 } else {
338 for (irow=0;irow < nRows * row_block_size;irow++)
339 out[irow] = 0;
340 }
341 if (ABS(alpha)>0) {
342 if (col_block_size==1 && row_block_size ==1) {
343 for (irow=0;irow< nRows;++irow) {
344 reg=0.;
345 #pragma ivdep
346 for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
347 reg += val[iptr] * in[index[iptr]];
348 }
349 out[irow] += alpha * reg;
350 }
351 } else if (col_block_size==2 && row_block_size ==2) {
352 for (ir=0;ir< nRows;ir++) {
353 reg1=0.;
354 reg2=0.;
355 #pragma ivdep
356 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
357 ic=2*index[iptr];
358 Aiptr=iptr*4;
359 in1=in[ic];
360 in2=in[1+ic];
361 A00=val[Aiptr ];
362 A10=val[Aiptr+1];
363 A01=val[Aiptr+2];
364 A11=val[Aiptr+3];
365 reg1 += A00*in1 + A01*in2;
366 reg2 += A10*in1 + A11*in2;
367 }
368 out[ 2*ir] += alpha * reg1;
369 out[1+2*ir] += alpha * reg2;
370 }
371 } else if (col_block_size==3 && row_block_size ==3) {
372 for (ir=0;ir< nRows;ir++) {
373 reg1=0.;
374 reg2=0.;
375 reg3=0.;
376 #pragma ivdep
377 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
378 ic=3*index[iptr];
379 Aiptr=iptr*9;
380 in1=in[ic];
381 in2=in[1+ic];
382 in3=in[2+ic];
383 A00=val[Aiptr ];
384 A10=val[Aiptr+1];
385 A20=val[Aiptr+2];
386 A01=val[Aiptr+3];
387 A11=val[Aiptr+4];
388 A21=val[Aiptr+5];
389 A02=val[Aiptr+6];
390 A12=val[Aiptr+7];
391 A22=val[Aiptr+8];
392 reg1 += A00*in1 + A01*in2 + A02*in3;
393 reg2 += A10*in1 + A11*in2 + A12*in3;
394 reg3 += A20*in1 + A21*in2 + A22*in3;
395 }
396 out[ 3*ir] += alpha * reg1;
397 out[1+3*ir] += alpha * reg2;
398 out[2+3*ir] += alpha * reg3;
399 }
400 } else {
401 block_size=col_block_size*row_block_size;
402 for (ir=0;ir< nRows;ir++) {
403 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
404 for (irb=0;irb< row_block_size;irb++) {
405 irow=irb+row_block_size*ir;
406 reg=0.;
407 #pragma ivdep
408 for (icb=0;icb< col_block_size;icb++) {
409 icol=icb+col_block_size*index[iptr];
410 reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
411 }
412 out[irow] += alpha * reg;
413 }
414 }
415 }
416 }
417 }
418 return;
419 }

  ViewVC Help
Powered by ViewVC 1.1.26