1 |
|
2 |
/* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */ |
3 |
|
4 |
/******************************************************* |
5 |
* |
6 |
* Copyright 2003-2007 by ACceSS MNRF |
7 |
* Copyright 2007 by University of Queensland |
8 |
* |
9 |
* http://esscc.uq.edu.au |
10 |
* Primary Business: Queensland, Australia |
11 |
* Licensed under the Open Software License version 3.0 |
12 |
* http://www.opensource.org/licenses/osl-3.0.php |
13 |
* |
14 |
*******************************************************/ |
15 |
|
16 |
/**************************************************************/ |
17 |
|
18 |
/* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */ |
19 |
|
20 |
/**************************************************************/ |
21 |
|
22 |
/* Author: gross@access.edu.au */ |
23 |
|
24 |
/**************************************************************/ |
25 |
|
26 |
#include "SparseMatrix.h" |
27 |
|
28 |
/* CSC format with offset 0*/ |
29 |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha, |
30 |
Paso_SparseMatrix* A, |
31 |
double* in, |
32 |
double beta, |
33 |
double* out) { |
34 |
|
35 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
36 |
register double reg,reg1,reg2,reg3; |
37 |
|
38 |
if (ABS(beta)>0.) { |
39 |
if (beta != 1.) { |
40 |
#pragma omp parallel for private(irow) schedule(static) |
41 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
42 |
out[irow] *= beta; |
43 |
} |
44 |
} else { |
45 |
#pragma omp parallel for private(irow) schedule(static) |
46 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
47 |
out[irow] = 0; |
48 |
} |
49 |
if (Paso_Pattern_isEmpty(A->pattern)) return; |
50 |
/* do the operation: */ |
51 |
if (ABS(alpha)>0) { |
52 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
53 |
/* TODO: parallelize (good luck!) */ |
54 |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
55 |
#pragma ivdep |
56 |
for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) { |
57 |
out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol]; |
58 |
} |
59 |
} |
60 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
61 |
/* TODO: parallelize */ |
62 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
63 |
#pragma ivdep |
64 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
65 |
ir=2*(A->pattern->index[iptr]); |
66 |
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
67 |
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
68 |
} |
69 |
} |
70 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
71 |
/* TODO: parallelize */ |
72 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
73 |
#pragma ivdep |
74 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
75 |
ir=3*(A->pattern->index[iptr]); |
76 |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
77 |
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
78 |
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
79 |
} |
80 |
} |
81 |
} else { |
82 |
/* TODO: parallelize */ |
83 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
84 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
85 |
for (irb=0;irb< A->row_block_size;irb++) { |
86 |
irow=irb+A->row_block_size*(A->pattern->index[iptr]); |
87 |
#pragma ivdep |
88 |
for (icb=0;icb< A->col_block_size;icb++) { |
89 |
icol=icb+A->col_block_size*ic; |
90 |
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
91 |
} |
92 |
} |
93 |
} |
94 |
} |
95 |
} |
96 |
} |
97 |
return; |
98 |
} |
99 |
|
100 |
/* CSC format with offset 1*/ |
101 |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha, |
102 |
Paso_SparseMatrix* A, |
103 |
double* in, |
104 |
double beta, |
105 |
double* out) { |
106 |
|
107 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
108 |
register double reg,reg1,reg2,reg3; |
109 |
if (ABS(beta)>0.) { |
110 |
if (beta != 1.) { |
111 |
#pragma omp parallel for private(irow) schedule(static) |
112 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) { |
113 |
out[irow] *= beta; |
114 |
} |
115 |
} |
116 |
} else { |
117 |
#pragma omp parallel for private(irow) schedule(static) |
118 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
119 |
out[irow] = 0; |
120 |
} |
121 |
|
122 |
/* do the operation: */ |
123 |
if (ABS(alpha)>0) { |
124 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
125 |
/* TODO: parallelize (good luck!) */ |
126 |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
127 |
#pragma ivdep |
128 |
for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) { |
129 |
out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol]; |
130 |
} |
131 |
} |
132 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
133 |
/* TODO: parallelize */ |
134 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
135 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
136 |
ir=2*(A->pattern->index[iptr]-1); |
137 |
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
138 |
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
139 |
} |
140 |
} |
141 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
142 |
/* TODO: parallelize */ |
143 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
144 |
#pragma ivdep |
145 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
146 |
ir=3*(A->pattern->index[iptr]-1); |
147 |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
148 |
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
149 |
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
150 |
} |
151 |
} |
152 |
} else { |
153 |
/* TODO: parallelize */ |
154 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
155 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
156 |
for (irb=0;irb< A->row_block_size;irb++) { |
157 |
irow=irb+A->row_block_size*(A->pattern->index[iptr]-1); |
158 |
#pragma ivdep |
159 |
for (icb=0;icb< A->col_block_size;icb++) { |
160 |
icol=icb+A->col_block_size*ic; |
161 |
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
162 |
} |
163 |
} |
164 |
} |
165 |
} |
166 |
} |
167 |
} |
168 |
return; |
169 |
} |
170 |
/* CSR format with offset 1*/ |
171 |
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha, |
172 |
Paso_SparseMatrix* A, |
173 |
double* in, |
174 |
double beta, |
175 |
double* out) { |
176 |
|
177 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
178 |
register double reg,reg1,reg2,reg3; |
179 |
if (ABS(beta)>0.) { |
180 |
if (beta != 1.) { |
181 |
#pragma omp parallel for private(irow) schedule(static) |
182 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
183 |
out[irow] *= beta; |
184 |
} |
185 |
} else { |
186 |
#pragma omp parallel for private(irow) schedule(static) |
187 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
188 |
out[irow] = 0; |
189 |
} |
190 |
/* do the operation: */ |
191 |
if (ABS(alpha)>0) { |
192 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
193 |
#pragma omp parallel for private(irow,iptr,reg) schedule(static) |
194 |
for (irow=0;irow< A->pattern->numOutput;++irow) { |
195 |
reg=0.; |
196 |
#pragma ivdep |
197 |
for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) { |
198 |
reg += A->val[iptr] * in[A->pattern->index[iptr]-1]; |
199 |
} |
200 |
out[irow] += alpha * reg; |
201 |
} |
202 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
203 |
#pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static) |
204 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
205 |
reg1=0.; |
206 |
reg2=0.; |
207 |
#pragma ivdep |
208 |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
209 |
ic=2*(A->pattern->index[iptr]-1); |
210 |
reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic]; |
211 |
reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic]; |
212 |
} |
213 |
out[ 2*ir] += alpha * reg1; |
214 |
out[1+2*ir] += alpha * reg2; |
215 |
} |
216 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
217 |
#pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static) |
218 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
219 |
reg1=0.; |
220 |
reg2=0.; |
221 |
reg3=0.; |
222 |
#pragma ivdep |
223 |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
224 |
ic=3*(A->pattern->index[iptr]-1); |
225 |
reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic]; |
226 |
reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic]; |
227 |
reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic]; |
228 |
} |
229 |
out[ 3*ir] += alpha * reg1; |
230 |
out[1+3*ir] += alpha * reg2; |
231 |
out[2+3*ir] += alpha * reg3; |
232 |
} |
233 |
} else { |
234 |
#pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
235 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
236 |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
237 |
for (irb=0;irb< A->row_block_size;irb++) { |
238 |
irow=irb+A->row_block_size*ir; |
239 |
reg=0.; |
240 |
#pragma ivdep |
241 |
for (icb=0;icb< A->col_block_size;icb++) { |
242 |
icol=icb+A->col_block_size*(A->pattern->index[iptr]-1); |
243 |
reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
244 |
} |
245 |
out[irow] += alpha * reg; |
246 |
} |
247 |
} |
248 |
} |
249 |
} |
250 |
} |
251 |
return; |
252 |
} |
253 |
/* CSR format with offset 0*/ |
254 |
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha, |
255 |
Paso_SparseMatrix* A, |
256 |
double* in, |
257 |
double beta, |
258 |
double* out) |
259 |
{ |
260 |
/*#define PASO_DYNAMIC_SCHEDULING_MVM */ |
261 |
|
262 |
#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP |
263 |
#define USE_DYNAMIC_SCHEDULING |
264 |
#endif |
265 |
|
266 |
char* chksz_chr=NULL; |
267 |
dim_t chunk_size=1; |
268 |
dim_t nrow=A->numRows; |
269 |
dim_t np, len, rest, irow, local_n, p, n_chunks; |
270 |
np=omp_get_max_threads(); |
271 |
#ifdef USE_DYNAMIC_SCHEDULING |
272 |
chksz_chr=getenv("PASO_CHUNK_SIZE_MVM"); |
273 |
if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size); |
274 |
chunk_size=MIN(MAX(1,chunk_size),nrow/np); |
275 |
n_chunks=nrow/chunk_size; |
276 |
if (n_chunks*chunk_size<nrow) n_chunks+=1; |
277 |
#else |
278 |
len=nrow/np; |
279 |
rest=nrow-len*np; |
280 |
#endif |
281 |
|
282 |
#pragma omp parallel private(irow, p, local_n) |
283 |
{ |
284 |
#ifdef USE_DYNAMIC_SCHEDULING |
285 |
#pragma omp for schedule(dynamic,1) |
286 |
for (p=0; p<n_chunks;p++) { |
287 |
irow=chunk_size*p; |
288 |
local_n=MIN(chunk_size,nrow-chunk_size*p); |
289 |
#else |
290 |
#pragma omp for schedule(static) |
291 |
for (p=0; p<np;p++) { |
292 |
irow=len*p+MIN(p,rest); |
293 |
local_n=len+(p<rest ? 1 :0 ); |
294 |
#endif |
295 |
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha, |
296 |
local_n, |
297 |
A->row_block_size, |
298 |
A->col_block_size, |
299 |
&(A->pattern->ptr[irow]), |
300 |
A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]); |
301 |
#ifdef USE_DYNAMIC_SCHEDULING |
302 |
} |
303 |
#else |
304 |
} |
305 |
#endif |
306 |
} |
307 |
} |
308 |
/* CSR format with offset 0*/ |
309 |
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha, |
310 |
dim_t nRows, |
311 |
dim_t row_block_size, |
312 |
dim_t col_block_size, |
313 |
index_t* ptr, |
314 |
index_t* index, |
315 |
double* val, |
316 |
double* in, |
317 |
double beta, |
318 |
double* out) |
319 |
{ |
320 |
register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr; |
321 |
register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22; |
322 |
dim_t block_size; |
323 |
if (ABS(beta)>0.) { |
324 |
if (beta != 1.) { |
325 |
for (irow=0;irow < nRows * row_block_size;irow++) |
326 |
out[irow] *= beta; |
327 |
} |
328 |
} else { |
329 |
for (irow=0;irow < nRows * row_block_size;irow++) |
330 |
out[irow] = 0; |
331 |
} |
332 |
if (ABS(alpha)>0) { |
333 |
if (col_block_size==1 && row_block_size ==1) { |
334 |
for (irow=0;irow< nRows;++irow) { |
335 |
reg=0.; |
336 |
#pragma ivdep |
337 |
for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) { |
338 |
reg += val[iptr] * in[index[iptr]]; |
339 |
} |
340 |
out[irow] += alpha * reg; |
341 |
} |
342 |
} else if (col_block_size==2 && row_block_size ==2) { |
343 |
for (ir=0;ir< nRows;ir++) { |
344 |
reg1=0.; |
345 |
reg2=0.; |
346 |
#pragma ivdep |
347 |
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
348 |
ic=2*index[iptr]; |
349 |
Aiptr=iptr*4; |
350 |
in1=in[ic]; |
351 |
in2=in[1+ic]; |
352 |
A00=val[Aiptr ]; |
353 |
A10=val[Aiptr+1]; |
354 |
A01=val[Aiptr+2]; |
355 |
A11=val[Aiptr+3]; |
356 |
reg1 += A00*in1 + A01*in2; |
357 |
reg2 += A10*in1 + A11*in2; |
358 |
} |
359 |
out[ 2*ir] += alpha * reg1; |
360 |
out[1+2*ir] += alpha * reg2; |
361 |
} |
362 |
} else if (col_block_size==3 && row_block_size ==3) { |
363 |
for (ir=0;ir< nRows;ir++) { |
364 |
reg1=0.; |
365 |
reg2=0.; |
366 |
reg3=0.; |
367 |
#pragma ivdep |
368 |
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
369 |
ic=3*index[iptr]; |
370 |
Aiptr=iptr*9; |
371 |
in1=in[ic]; |
372 |
in2=in[1+ic]; |
373 |
in3=in[2+ic]; |
374 |
A00=val[Aiptr ]; |
375 |
A10=val[Aiptr+1]; |
376 |
A20=val[Aiptr+2]; |
377 |
A01=val[Aiptr+3]; |
378 |
A11=val[Aiptr+4]; |
379 |
A21=val[Aiptr+5]; |
380 |
A02=val[Aiptr+6]; |
381 |
A12=val[Aiptr+7]; |
382 |
A22=val[Aiptr+8]; |
383 |
reg1 += A00*in1 + A01*in2 + A02*in3; |
384 |
reg2 += A10*in1 + A11*in2 + A12*in3; |
385 |
reg3 += A20*in1 + A21*in2 + A22*in3; |
386 |
} |
387 |
out[ 3*ir] += alpha * reg1; |
388 |
out[1+3*ir] += alpha * reg2; |
389 |
out[2+3*ir] += alpha * reg3; |
390 |
} |
391 |
} else { |
392 |
block_size=col_block_size*row_block_size; |
393 |
for (ir=0;ir< nRows;ir++) { |
394 |
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
395 |
for (irb=0;irb< row_block_size;irb++) { |
396 |
irow=irb+row_block_size*ir; |
397 |
reg=0.; |
398 |
#pragma ivdep |
399 |
for (icb=0;icb< col_block_size;icb++) { |
400 |
icol=icb+col_block_size*index[iptr]; |
401 |
reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol]; |
402 |
} |
403 |
out[irow] += alpha * reg; |
404 |
} |
405 |
} |
406 |
} |
407 |
} |
408 |
} |
409 |
return; |
410 |
} |