257 |
double beta, |
double beta, |
258 |
double* out) |
double* out) |
259 |
{ |
{ |
260 |
/* #define PASO_DYNAMIC_SCHEDULING_MVM */ |
#define PASO_DYNAMIC_SCHEDULING_MVM |
261 |
|
|
262 |
char* chksz_chr=NULL; |
char* chksz_chr=NULL; |
263 |
dim_t chunk_size=-1; |
dim_t chunk_size=-1; |
322 |
dim_t block_size; |
dim_t block_size; |
323 |
if (ABS(beta)>0.) { |
if (ABS(beta)>0.) { |
324 |
if (beta != 1.) { |
if (beta != 1.) { |
|
#pragma omp for private(irow) schedule(static) |
|
325 |
for (irow=0;irow < nRows * row_block_size;irow++) |
for (irow=0;irow < nRows * row_block_size;irow++) |
326 |
out[irow] *= beta; |
out[irow] *= beta; |
327 |
} |
} |
328 |
} else { |
} else { |
|
#pragma omp for private(irow) schedule(static) |
|
329 |
for (irow=0;irow < nRows * row_block_size;irow++) |
for (irow=0;irow < nRows * row_block_size;irow++) |
330 |
out[irow] = 0; |
out[irow] = 0; |
331 |
} |
} |
332 |
if (ABS(alpha)>0) { |
if (ABS(alpha)>0) { |
333 |
if (col_block_size==1 && row_block_size ==1) { |
if (col_block_size==1 && row_block_size ==1) { |
|
#pragma omp for private(irow,iptr,reg) schedule(static) |
|
334 |
for (irow=0;irow< nRows;++irow) { |
for (irow=0;irow< nRows;++irow) { |
335 |
reg=0.; |
reg=0.; |
336 |
#pragma ivdep |
#pragma ivdep |
340 |
out[irow] += alpha * reg; |
out[irow] += alpha * reg; |
341 |
} |
} |
342 |
} else if (col_block_size==2 && row_block_size ==2) { |
} else if (col_block_size==2 && row_block_size ==2) { |
|
#pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static) |
|
343 |
for (ir=0;ir< nRows;ir++) { |
for (ir=0;ir< nRows;ir++) { |
344 |
reg1=0.; |
reg1=0.; |
345 |
reg2=0.; |
reg2=0.; |
360 |
out[1+2*ir] += alpha * reg2; |
out[1+2*ir] += alpha * reg2; |
361 |
} |
} |
362 |
} else if (col_block_size==3 && row_block_size ==3) { |
} else if (col_block_size==3 && row_block_size ==3) { |
|
#pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static) |
|
363 |
for (ir=0;ir< nRows;ir++) { |
for (ir=0;ir< nRows;ir++) { |
364 |
reg1=0.; |
reg1=0.; |
365 |
reg2=0.; |
reg2=0.; |
390 |
} |
} |
391 |
} else { |
} else { |
392 |
block_size=col_block_size*row_block_size; |
block_size=col_block_size*row_block_size; |
|
#pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
|
393 |
for (ir=0;ir< nRows;ir++) { |
for (ir=0;ir< nRows;ir++) { |
394 |
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
395 |
for (irb=0;irb< row_block_size;irb++) { |
for (irb=0;irb< row_block_size;irb++) { |