/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Contents of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1811 - (show annotations)
Thu Sep 25 23:11:13 2008 UTC (11 years ago) by ksteube
File MIME type: text/plain
File size: 15216 byte(s)
Copyright updated in all files

1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2008 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
18
19 /**************************************************************/
20
21 /* Author: gross@access.edu.au */
22
23 /**************************************************************/
24
25 #include "SparseMatrix.h"
26 #ifdef _OPENMP
27 #include <omp.h>
28 #endif
29
30 /* CSC format with offset 0*/
31 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(const double alpha,
32 const Paso_SparseMatrix* A,
33 const double* in,
34 const double beta,
35 double* out) {
36
37 register index_t ir,icol,iptr,icb,irb,irow,ic;
38
39 if (ABS(beta)>0.) {
40 if (beta != 1.) {
41 #pragma omp parallel for private(irow) schedule(static)
42 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
43 out[irow] *= beta;
44 }
45 } else {
46 #pragma omp parallel for private(irow) schedule(static)
47 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
48 out[irow] = 0;
49 }
50 if (Paso_Pattern_isEmpty(A->pattern)) return;
51 /* do the operation: */
52 if (ABS(alpha)>0) {
53 if (A ->col_block_size==1 && A->row_block_size ==1) {
54 /* TODO: parallelize (good luck!) */
55 for (icol=0;icol< A->pattern->numOutput;++icol) {
56 #pragma ivdep
57 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
58 out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
59 }
60 }
61 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
62 /* TODO: parallelize */
63 for (ic=0;ic< A->pattern->numOutput;ic++) {
64 #pragma ivdep
65 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
66 ir=2*(A->pattern->index[iptr]);
67 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
68 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
69 }
70 }
71 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
72 /* TODO: parallelize */
73 for (ic=0;ic< A->pattern->numOutput;ic++) {
74 #pragma ivdep
75 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
76 ir=3*(A->pattern->index[iptr]);
77 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
78 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
79 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
80 }
81 }
82 } else {
83 /* TODO: parallelize */
84 for (ic=0;ic< A->pattern->numOutput;ic++) {
85 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
86 for (irb=0;irb< A->row_block_size;irb++) {
87 irow=irb+A->row_block_size*(A->pattern->index[iptr]);
88 #pragma ivdep
89 for (icb=0;icb< A->col_block_size;icb++) {
90 icol=icb+A->col_block_size*ic;
91 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
92 }
93 }
94 }
95 }
96 }
97 }
98 return;
99 }
100
101 /* CSC format with offset 1*/
102 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(const double alpha,
103 const Paso_SparseMatrix* A,
104 const double* in,
105 const double beta,
106 double* out) {
107
108 register index_t ir,icol,iptr,icb,irb,irow,ic;
109
110 if (ABS(beta)>0.) {
111 if (beta != 1.) {
112 #pragma omp parallel for private(irow) schedule(static)
113 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
114 out[irow] *= beta;
115 }
116 }
117 } else {
118 #pragma omp parallel for private(irow) schedule(static)
119 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
120 out[irow] = 0;
121 }
122
123 /* do the operation: */
124 if (ABS(alpha)>0) {
125 if (A ->col_block_size==1 && A->row_block_size ==1) {
126 /* TODO: parallelize (good luck!) */
127 for (icol=0;icol< A->pattern->numOutput;++icol) {
128 #pragma ivdep
129 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
130 out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
131 }
132 }
133 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
134 /* TODO: parallelize */
135 for (ic=0;ic< A->pattern->numOutput;ic++) {
136 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
137 ir=2*(A->pattern->index[iptr]-1);
138 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
139 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
140 }
141 }
142 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
143 /* TODO: parallelize */
144 for (ic=0;ic< A->pattern->numOutput;ic++) {
145 #pragma ivdep
146 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
147 ir=3*(A->pattern->index[iptr]-1);
148 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
149 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
150 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
151 }
152 }
153 } else {
154 /* TODO: parallelize */
155 for (ic=0;ic< A->pattern->numOutput;ic++) {
156 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
157 for (irb=0;irb< A->row_block_size;irb++) {
158 irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
159 #pragma ivdep
160 for (icb=0;icb< A->col_block_size;icb++) {
161 icol=icb+A->col_block_size*ic;
162 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
163 }
164 }
165 }
166 }
167 }
168 }
169 return;
170 }
171 /* CSR format with offset 1*/
172 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(const double alpha,
173 const Paso_SparseMatrix* A,
174 const double* in,
175 const double beta,
176 double* out) {
177
178 register index_t ir,icol,iptr,icb,irb,irow,ic;
179 register double reg,reg1,reg2,reg3;
180 if (ABS(beta)>0.) {
181 if (beta != 1.) {
182 #pragma omp parallel for private(irow) schedule(static)
183 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
184 out[irow] *= beta;
185 }
186 } else {
187 #pragma omp parallel for private(irow) schedule(static)
188 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
189 out[irow] = 0;
190 }
191 /* do the operation: */
192 if (ABS(alpha)>0) {
193 if (A ->col_block_size==1 && A->row_block_size ==1) {
194 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
195 for (irow=0;irow< A->pattern->numOutput;++irow) {
196 reg=0.;
197 #pragma ivdep
198 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
199 reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
200 }
201 out[irow] += alpha * reg;
202 }
203 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
204 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
205 for (ir=0;ir< A->pattern->numOutput;ir++) {
206 reg1=0.;
207 reg2=0.;
208 #pragma ivdep
209 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
210 ic=2*(A->pattern->index[iptr]-1);
211 reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
212 reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
213 }
214 out[ 2*ir] += alpha * reg1;
215 out[1+2*ir] += alpha * reg2;
216 }
217 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
218 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
219 for (ir=0;ir< A->pattern->numOutput;ir++) {
220 reg1=0.;
221 reg2=0.;
222 reg3=0.;
223 #pragma ivdep
224 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
225 ic=3*(A->pattern->index[iptr]-1);
226 reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
227 reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
228 reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
229 }
230 out[ 3*ir] += alpha * reg1;
231 out[1+3*ir] += alpha * reg2;
232 out[2+3*ir] += alpha * reg3;
233 }
234 } else {
235 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
236 for (ir=0;ir< A->pattern->numOutput;ir++) {
237 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
238 for (irb=0;irb< A->row_block_size;irb++) {
239 irow=irb+A->row_block_size*ir;
240 reg=0.;
241 #pragma ivdep
242 for (icb=0;icb< A->col_block_size;icb++) {
243 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
244 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
245 }
246 out[irow] += alpha * reg;
247 }
248 }
249 }
250 }
251 }
252 return;
253 }
254 /* CSR format with offset 0*/
255 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(const double alpha,
256 const Paso_SparseMatrix* A,
257 const double* in,
258 const double beta,
259 double* out)
260 {
261 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
262
263 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP
264 #define USE_DYNAMIC_SCHEDULING
265 #endif
266
267 char* chksz_chr=NULL;
268 dim_t chunk_size=1;
269 dim_t nrow=A->numRows;
270 dim_t np=1, len, rest, irow, local_n, p;
271
272 #ifdef USE_DYNAMIC_SCHEDULING
273 dim_t n_chunks;
274 #endif
275
276 #ifdef _OPENMP
277 np=omp_get_max_threads();
278 #endif
279 #ifdef USE_DYNAMIC_SCHEDULING
280 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
281 if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
282 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
283 n_chunks=nrow/chunk_size;
284 if (n_chunks*chunk_size<nrow) n_chunks+=1;
285 #else
286 len=nrow/np;
287 rest=nrow-len*np;
288 #endif
289
290 #pragma omp parallel private(irow, p, local_n)
291 {
292 #ifdef USE_DYNAMIC_SCHEDULING
293 #pragma omp for schedule(dynamic,1)
294 for (p=0; p<n_chunks;p++) {
295 irow=chunk_size*p;
296 local_n=MIN(chunk_size,nrow-chunk_size*p);
297 #else
298 #pragma omp for schedule(static)
299 for (p=0; p<np;p++) {
300 irow=len*p+MIN(p,rest);
301 local_n=len+(p<rest ? 1 :0 );
302 #endif
303 Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
304 local_n,
305 A->row_block_size,
306 A->col_block_size,
307 &(A->pattern->ptr[irow]),
308 A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
309 #ifdef USE_DYNAMIC_SCHEDULING
310 }
311 #else
312 }
313 #endif
314 }
315 }
316 /* CSR format with offset 0*/
317 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(const double alpha,
318 const dim_t nRows,
319 const dim_t row_block_size,
320 const dim_t col_block_size,
321 const index_t* ptr,
322 const index_t* index,
323 const double* val,
324 const double* in,
325 const double beta,
326 double* out)
327 {
328 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
329 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
330 dim_t block_size;
331 if (ABS(beta)>0.) {
332 if (beta != 1.) {
333 for (irow=0;irow < nRows * row_block_size;irow++)
334 out[irow] *= beta;
335 }
336 } else {
337 for (irow=0;irow < nRows * row_block_size;irow++)
338 out[irow] = 0;
339 }
340 if (ABS(alpha)>0) {
341 if (col_block_size==1 && row_block_size ==1) {
342 for (irow=0;irow< nRows;++irow) {
343 reg=0.;
344 #pragma ivdep
345 for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
346 reg += val[iptr] * in[index[iptr]];
347 }
348 out[irow] += alpha * reg;
349 }
350 } else if (col_block_size==2 && row_block_size ==2) {
351 for (ir=0;ir< nRows;ir++) {
352 reg1=0.;
353 reg2=0.;
354 #pragma ivdep
355 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
356 ic=2*index[iptr];
357 Aiptr=iptr*4;
358 in1=in[ic];
359 in2=in[1+ic];
360 A00=val[Aiptr ];
361 A10=val[Aiptr+1];
362 A01=val[Aiptr+2];
363 A11=val[Aiptr+3];
364 reg1 += A00*in1 + A01*in2;
365 reg2 += A10*in1 + A11*in2;
366 }
367 out[ 2*ir] += alpha * reg1;
368 out[1+2*ir] += alpha * reg2;
369 }
370 } else if (col_block_size==3 && row_block_size ==3) {
371 for (ir=0;ir< nRows;ir++) {
372 reg1=0.;
373 reg2=0.;
374 reg3=0.;
375 #pragma ivdep
376 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
377 ic=3*index[iptr];
378 Aiptr=iptr*9;
379 in1=in[ic];
380 in2=in[1+ic];
381 in3=in[2+ic];
382 A00=val[Aiptr ];
383 A10=val[Aiptr+1];
384 A20=val[Aiptr+2];
385 A01=val[Aiptr+3];
386 A11=val[Aiptr+4];
387 A21=val[Aiptr+5];
388 A02=val[Aiptr+6];
389 A12=val[Aiptr+7];
390 A22=val[Aiptr+8];
391 reg1 += A00*in1 + A01*in2 + A02*in3;
392 reg2 += A10*in1 + A11*in2 + A12*in3;
393 reg3 += A20*in1 + A21*in2 + A22*in3;
394 }
395 out[ 3*ir] += alpha * reg1;
396 out[1+3*ir] += alpha * reg2;
397 out[2+3*ir] += alpha * reg3;
398 }
399 } else {
400 block_size=col_block_size*row_block_size;
401 for (ir=0;ir< nRows;ir++) {
402 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
403 for (irb=0;irb< row_block_size;irb++) {
404 irow=irb+row_block_size*ir;
405 reg=0.;
406 #pragma ivdep
407 for (icb=0;icb< col_block_size;icb++) {
408 icol=icb+col_block_size*index[iptr];
409 reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
410 }
411 out[irow] += alpha * reg;
412 }
413 }
414 }
415 }
416 }
417 return;
418 }

  ViewVC Help
Powered by ViewVC 1.1.26