/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Contents of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1981 - (show annotations)
Thu Nov 6 05:27:33 2008 UTC (10 years, 11 months ago) by jfenwick
File MIME type: text/plain
File size: 15254 byte(s)
More warning removal.

1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2008 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
18
19 /**************************************************************/
20
21 /* Author: gross@access.edu.au */
22
23 /**************************************************************/
24
25 #include "SparseMatrix.h"
26 #ifdef _OPENMP
27 #include <omp.h>
28 #endif
29
30 /* CSC format with offset 0*/
31 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(const double alpha,
32 const Paso_SparseMatrix* A,
33 const double* in,
34 const double beta,
35 double* out) {
36
37 register index_t ir,icol,iptr,icb,irb,irow,ic;
38
39 if (ABS(beta)>0.) {
40 if (beta != 1.) {
41 #pragma omp parallel for private(irow) schedule(static)
42 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
43 out[irow] *= beta;
44 }
45 } else {
46 #pragma omp parallel for private(irow) schedule(static)
47 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
48 out[irow] = 0;
49 }
50 if (Paso_Pattern_isEmpty(A->pattern)) return;
51 /* do the operation: */
52 if (ABS(alpha)>0) {
53 if (A ->col_block_size==1 && A->row_block_size ==1) {
54 /* TODO: parallelize (good luck!) */
55 for (icol=0;icol< A->pattern->numOutput;++icol) {
56 #pragma ivdep
57 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
58 out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
59 }
60 }
61 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
62 /* TODO: parallelize */
63 for (ic=0;ic< A->pattern->numOutput;ic++) {
64 #pragma ivdep
65 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
66 ir=2*(A->pattern->index[iptr]);
67 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
68 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
69 }
70 }
71 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
72 /* TODO: parallelize */
73 for (ic=0;ic< A->pattern->numOutput;ic++) {
74 #pragma ivdep
75 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
76 ir=3*(A->pattern->index[iptr]);
77 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
78 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
79 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
80 }
81 }
82 } else {
83 /* TODO: parallelize */
84 for (ic=0;ic< A->pattern->numOutput;ic++) {
85 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
86 for (irb=0;irb< A->row_block_size;irb++) {
87 irow=irb+A->row_block_size*(A->pattern->index[iptr]);
88 #pragma ivdep
89 for (icb=0;icb< A->col_block_size;icb++) {
90 icol=icb+A->col_block_size*ic;
91 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
92 }
93 }
94 }
95 }
96 }
97 }
98 return;
99 }
100
101 /* CSC format with offset 1*/
102 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(const double alpha,
103 const Paso_SparseMatrix* A,
104 const double* in,
105 const double beta,
106 double* out) {
107
108 register index_t ir,icol,iptr,icb,irb,irow,ic;
109
110 if (ABS(beta)>0.) {
111 if (beta != 1.) {
112 #pragma omp parallel for private(irow) schedule(static)
113 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
114 out[irow] *= beta;
115 }
116 }
117 } else {
118 #pragma omp parallel for private(irow) schedule(static)
119 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
120 out[irow] = 0;
121 }
122
123 /* do the operation: */
124 if (ABS(alpha)>0) {
125 if (A ->col_block_size==1 && A->row_block_size ==1) {
126 /* TODO: parallelize (good luck!) */
127 for (icol=0;icol< A->pattern->numOutput;++icol) {
128 #pragma ivdep
129 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
130 out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
131 }
132 }
133 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
134 /* TODO: parallelize */
135 for (ic=0;ic< A->pattern->numOutput;ic++) {
136 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
137 ir=2*(A->pattern->index[iptr]-1);
138 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
139 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
140 }
141 }
142 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
143 /* TODO: parallelize */
144 for (ic=0;ic< A->pattern->numOutput;ic++) {
145 #pragma ivdep
146 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
147 ir=3*(A->pattern->index[iptr]-1);
148 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
149 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
150 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
151 }
152 }
153 } else {
154 /* TODO: parallelize */
155 for (ic=0;ic< A->pattern->numOutput;ic++) {
156 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
157 for (irb=0;irb< A->row_block_size;irb++) {
158 irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
159 #pragma ivdep
160 for (icb=0;icb< A->col_block_size;icb++) {
161 icol=icb+A->col_block_size*ic;
162 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
163 }
164 }
165 }
166 }
167 }
168 }
169 return;
170 }
171 /* CSR format with offset 1*/
172 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(const double alpha,
173 const Paso_SparseMatrix* A,
174 const double* in,
175 const double beta,
176 double* out) {
177
178 register index_t ir,icol,iptr,icb,irb,irow,ic;
179 register double reg,reg1,reg2,reg3;
180 if (ABS(beta)>0.) {
181 if (beta != 1.) {
182 #pragma omp parallel for private(irow) schedule(static)
183 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
184 out[irow] *= beta;
185 }
186 } else {
187 #pragma omp parallel for private(irow) schedule(static)
188 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
189 out[irow] = 0;
190 }
191 /* do the operation: */
192 if (ABS(alpha)>0) {
193 if (A ->col_block_size==1 && A->row_block_size ==1) {
194 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
195 for (irow=0;irow< A->pattern->numOutput;++irow) {
196 reg=0.;
197 #pragma ivdep
198 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
199 reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
200 }
201 out[irow] += alpha * reg;
202 }
203 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
204 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
205 for (ir=0;ir< A->pattern->numOutput;ir++) {
206 reg1=0.;
207 reg2=0.;
208 #pragma ivdep
209 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
210 ic=2*(A->pattern->index[iptr]-1);
211 reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
212 reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
213 }
214 out[ 2*ir] += alpha * reg1;
215 out[1+2*ir] += alpha * reg2;
216 }
217 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
218 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
219 for (ir=0;ir< A->pattern->numOutput;ir++) {
220 reg1=0.;
221 reg2=0.;
222 reg3=0.;
223 #pragma ivdep
224 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
225 ic=3*(A->pattern->index[iptr]-1);
226 reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
227 reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
228 reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
229 }
230 out[ 3*ir] += alpha * reg1;
231 out[1+3*ir] += alpha * reg2;
232 out[2+3*ir] += alpha * reg3;
233 }
234 } else {
235 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
236 for (ir=0;ir< A->pattern->numOutput;ir++) {
237 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
238 for (irb=0;irb< A->row_block_size;irb++) {
239 irow=irb+A->row_block_size*ir;
240 reg=0.;
241 #pragma ivdep
242 for (icb=0;icb< A->col_block_size;icb++) {
243 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
244 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
245 }
246 out[irow] += alpha * reg;
247 }
248 }
249 }
250 }
251 }
252 return;
253 }
254 /* CSR format with offset 0*/
255 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(const double alpha,
256 const Paso_SparseMatrix* A,
257 const double* in,
258 const double beta,
259 double* out)
260 {
261 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
262
263 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
264 #define USE_DYNAMIC_SCHEDULING
265 #endif
266
267 #ifdef USE_DYNAMIC_SCHEDULING
268 char* chksz_chr=NULL;
269 dim_t chunk_size=1;
270 #endif
271
272 dim_t nrow=A->numRows;
273 dim_t np=1, len, rest, irow, local_n, p;
274
275 #ifdef USE_DYNAMIC_SCHEDULING
276 dim_t n_chunks;
277 #endif
278
279 #ifdef _OPENMP
280 np=omp_get_max_threads();
281 #endif
282 #ifdef USE_DYNAMIC_SCHEDULING
283 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
284 if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
285 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
286 n_chunks=nrow/chunk_size;
287 if (n_chunks*chunk_size<nrow) n_chunks+=1;
288 #else
289 len=nrow/np;
290 rest=nrow-len*np;
291 #endif
292
293 #pragma omp parallel private(irow, p, local_n)
294 {
295 #ifdef USE_DYNAMIC_SCHEDULING
296 #pragma omp for schedule(dynamic,1)
297 for (p=0; p<n_chunks;p++) {
298 irow=chunk_size*p;
299 local_n=MIN(chunk_size,nrow-chunk_size*p);
300 #else
301 #pragma omp for schedule(static)
302 for (p=0; p<np;p++) {
303 irow=len*p+MIN(p,rest);
304 local_n=len+(p<rest ? 1 :0 );
305 #endif
306 Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
307 local_n,
308 A->row_block_size,
309 A->col_block_size,
310 &(A->pattern->ptr[irow]),
311 A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
312 #ifdef USE_DYNAMIC_SCHEDULING
313 }
314 #else
315 }
316 #endif
317 }
318 }
319 /* CSR format with offset 0*/
320 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(const double alpha,
321 const dim_t nRows,
322 const dim_t row_block_size,
323 const dim_t col_block_size,
324 const index_t* ptr,
325 const index_t* index,
326 const double* val,
327 const double* in,
328 const double beta,
329 double* out)
330 {
331 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
332 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
333 dim_t block_size;
334 if (ABS(beta)>0.) {
335 if (beta != 1.) {
336 for (irow=0;irow < nRows * row_block_size;irow++)
337 out[irow] *= beta;
338 }
339 } else {
340 for (irow=0;irow < nRows * row_block_size;irow++)
341 out[irow] = 0;
342 }
343 if (ABS(alpha)>0) {
344 if (col_block_size==1 && row_block_size ==1) {
345 for (irow=0;irow< nRows;++irow) {
346 reg=0.;
347 #pragma ivdep
348 for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
349 reg += val[iptr] * in[index[iptr]];
350 }
351 out[irow] += alpha * reg;
352 }
353 } else if (col_block_size==2 && row_block_size ==2) {
354 for (ir=0;ir< nRows;ir++) {
355 reg1=0.;
356 reg2=0.;
357 #pragma ivdep
358 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
359 ic=2*index[iptr];
360 Aiptr=iptr*4;
361 in1=in[ic];
362 in2=in[1+ic];
363 A00=val[Aiptr ];
364 A10=val[Aiptr+1];
365 A01=val[Aiptr+2];
366 A11=val[Aiptr+3];
367 reg1 += A00*in1 + A01*in2;
368 reg2 += A10*in1 + A11*in2;
369 }
370 out[ 2*ir] += alpha * reg1;
371 out[1+2*ir] += alpha * reg2;
372 }
373 } else if (col_block_size==3 && row_block_size ==3) {
374 for (ir=0;ir< nRows;ir++) {
375 reg1=0.;
376 reg2=0.;
377 reg3=0.;
378 #pragma ivdep
379 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
380 ic=3*index[iptr];
381 Aiptr=iptr*9;
382 in1=in[ic];
383 in2=in[1+ic];
384 in3=in[2+ic];
385 A00=val[Aiptr ];
386 A10=val[Aiptr+1];
387 A20=val[Aiptr+2];
388 A01=val[Aiptr+3];
389 A11=val[Aiptr+4];
390 A21=val[Aiptr+5];
391 A02=val[Aiptr+6];
392 A12=val[Aiptr+7];
393 A22=val[Aiptr+8];
394 reg1 += A00*in1 + A01*in2 + A02*in3;
395 reg2 += A10*in1 + A11*in2 + A12*in3;
396 reg3 += A20*in1 + A21*in2 + A22*in3;
397 }
398 out[ 3*ir] += alpha * reg1;
399 out[1+3*ir] += alpha * reg2;
400 out[2+3*ir] += alpha * reg3;
401 }
402 } else {
403 block_size=col_block_size*row_block_size;
404 for (ir=0;ir< nRows;ir++) {
405 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
406 for (irb=0;irb< row_block_size;irb++) {
407 irow=irb+row_block_size*ir;
408 reg=0.;
409 #pragma ivdep
410 for (icb=0;icb< col_block_size;icb++) {
411 icol=icb+col_block_size*index[iptr];
412 reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
413 }
414 out[irow] += alpha * reg;
415 }
416 }
417 }
418 }
419 }
420 return;
421 }

  ViewVC Help
Powered by ViewVC 1.1.26