257 |
double beta, |
double beta, |
258 |
double* out) |
double* out) |
259 |
{ |
{ |
260 |
#define PASO_DYNAMIC_SCHEDULING_MVM |
/*#define PASO_DYNAMIC_SCHEDULING_MVM */ |
261 |
|
|
262 |
|
#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP |
263 |
|
#define USE_DYNAMIC_SCHEDULING |
264 |
|
#endif |
265 |
|
|
266 |
char* chksz_chr=NULL; |
char* chksz_chr=NULL; |
267 |
dim_t chunk_size=-1; |
dim_t chunk_size=1; |
268 |
dim_t nrow=A->numRows; |
dim_t nrow=A->numRows; |
269 |
dim_t np, len, rest, irow, local_n, p, n_chunks; |
dim_t np, len, rest, irow, local_n, p, n_chunks; |
270 |
np=omp_get_max_threads(); |
np=omp_get_max_threads(); |
271 |
#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP |
#ifdef USE_DYNAMIC_SCHEDULING |
272 |
chksz_chr=getenv("PASO_CHUNK_SIZE_MVM"); |
chksz_chr=getenv("PASO_CHUNK_SIZE_MVM"); |
273 |
if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size); |
if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size); |
274 |
|
chunk_size=MIN(MAX(1,chunk_size),nrow/np); |
275 |
|
n_chunks=nrow/chunk_size; |
276 |
|
if (n_chunks*chunk_size<nrow) n_chunks+=1; |
277 |
|
#else |
278 |
|
len=nrow/np; |
279 |
|
rest=nrow-len*np; |
280 |
#endif |
#endif |
281 |
|
|
282 |
if (chunk_size<1 || np <=1) { |
#pragma omp parallel private(irow, len, p, local_n) |
283 |
#pragma omp parallel private(irow, len, rest, local_n) |
{ |
284 |
{ |
#ifdef USE_DYNAMIC_SCHEDULING |
285 |
len=nrow/np; |
#pragma omp for private(p) schedule(dynamic,1) |
286 |
rest=nrow-len*np; |
for (p=0; p<n_chunks;p++) { |
287 |
#pragma omp for private(p) schedule(static) |
irow=chunk_size*p; |
288 |
for (p=0; p<np;p++) { |
local_n=MIN(chunk_size,nrow-chunk_size*p); |
289 |
irow=len*p+MIN(p,rest); |
#else |
290 |
local_n=len+(p<rest ? 1 :0 ); |
#pragma omp for private(p) schedule(static) |
291 |
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha, |
for (p=0; p<np;p++) { |
292 |
local_n, |
irow=len*p+MIN(p,rest); |
293 |
A->row_block_size, |
local_n=len+(p<rest ? 1 :0 ); |
294 |
A->col_block_size, |
#endif |
295 |
&(A->pattern->ptr[irow]), |
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha, |
296 |
A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]); |
local_n, |
297 |
} |
A->row_block_size, |
298 |
} |
A->col_block_size, |
299 |
} else { |
&(A->pattern->ptr[irow]), |
300 |
#pragma omp parallel private(n_chunks,irow,local_n) |
A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]); |
301 |
{ |
#ifdef USE_DYNAMIC_SCHEDULING |
302 |
n_chunks=nrow/chunk_size; |
} |
303 |
if (n_chunks*chunk_size<nrow) n_chunks+=1; |
#else |
304 |
#pragma omp for private(p) schedule(dynamic,1) |
} |
305 |
for (p=0; p<n_chunks;p++) { |
#endif |
|
irow=chunk_size*p; |
|
|
local_n=MIN(chunk_size,nrow-chunk_size*p); |
|
|
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha, |
|
|
local_n, |
|
|
A->row_block_size, |
|
|
A->col_block_size, |
|
|
&(A->pattern->ptr[irow]), |
|
|
A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]); |
|
|
} |
|
|
} |
|
306 |
} |
} |
307 |
} |
} |
308 |
/* CSR format with offset 0*/ |
/* CSR format with offset 0*/ |