1 |
|
|
|
/* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */ |
|
|
|
|
2 |
/******************************************************* |
/******************************************************* |
3 |
* |
* |
4 |
* Copyright 2003-2007 by ACceSS MNRF |
* Copyright (c) 2003-2008 by University of Queensland |
5 |
* Copyright 2007 by University of Queensland |
* Earth Systems Science Computational Center (ESSCC) |
6 |
* |
* http://www.uq.edu.au/esscc |
7 |
* http://esscc.uq.edu.au |
* |
8 |
* Primary Business: Queensland, Australia |
* Primary Business: Queensland, Australia |
9 |
* Licensed under the Open Software License version 3.0 |
* Licensed under the Open Software License version 3.0 |
10 |
* http://www.opensource.org/licenses/osl-3.0.php |
* http://www.opensource.org/licenses/osl-3.0.php |
11 |
* |
* |
12 |
*******************************************************/ |
*******************************************************/ |
13 |
|
|
14 |
|
|
15 |
/**************************************************************/ |
/**************************************************************/ |
16 |
|
|
23 |
/**************************************************************/ |
/**************************************************************/ |
24 |
|
|
25 |
#include "SparseMatrix.h" |
#include "SparseMatrix.h" |
26 |
|
#ifdef _OPENMP |
27 |
|
#include <omp.h> |
28 |
|
#endif |
29 |
|
|
30 |
/* CSC format with offset 0*/ |
/* CSC format with offset 0*/ |
31 |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha, |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(const double alpha, |
32 |
Paso_SparseMatrix* A, |
const Paso_SparseMatrix* A, |
33 |
double* in, |
const double* in, |
34 |
double beta, |
const double beta, |
35 |
double* out) { |
double* out) { |
36 |
|
|
37 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
|
register double reg,reg1,reg2,reg3; |
|
|
#pragma omp barrier |
|
38 |
|
|
39 |
if (ABS(beta)>0.) { |
if (ABS(beta)>0.) { |
40 |
if (beta != 1.) { |
if (beta != 1.) { |
41 |
#pragma omp for private(irow) schedule(static) |
#pragma omp parallel for private(irow) schedule(static) |
42 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
43 |
out[irow] *= beta; |
out[irow] *= beta; |
44 |
} |
} |
45 |
} else { |
} else { |
46 |
#pragma omp for private(irow) schedule(static) |
#pragma omp parallel for private(irow) schedule(static) |
47 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
48 |
out[irow] = 0; |
out[irow] = 0; |
49 |
} |
} |
52 |
if (ABS(alpha)>0) { |
if (ABS(alpha)>0) { |
53 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
54 |
/* TODO: parallelize (good luck!) */ |
/* TODO: parallelize (good luck!) */ |
|
#pragma omp single |
|
55 |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
56 |
#pragma ivdep |
#pragma ivdep |
57 |
for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) { |
for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) { |
60 |
} |
} |
61 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
62 |
/* TODO: parallelize */ |
/* TODO: parallelize */ |
|
#pragma omp single |
|
63 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
64 |
#pragma ivdep |
#pragma ivdep |
65 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
66 |
ic=2*(A->pattern->index[iptr]); |
ir=2*(A->pattern->index[iptr]); |
67 |
out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
68 |
out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
69 |
} |
} |
70 |
} |
} |
71 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
72 |
/* TODO: parallelize */ |
/* TODO: parallelize */ |
|
#pragma omp single |
|
73 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
74 |
#pragma ivdep |
#pragma ivdep |
75 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
76 |
ir=3*(A->pattern->index[iptr]); |
ir=3*(A->pattern->index[iptr]); |
77 |
out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
78 |
out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
79 |
out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
80 |
} |
} |
81 |
} |
} |
82 |
} else { |
} else { |
83 |
/* TODO: parallelize */ |
/* TODO: parallelize */ |
|
#pragma omp single |
|
84 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
85 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
86 |
for (irb=0;irb< A->row_block_size;irb++) { |
for (irb=0;irb< A->row_block_size;irb++) { |
99 |
} |
} |
100 |
|
|
101 |
/* CSC format with offset 1*/ |
/* CSC format with offset 1*/ |
102 |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha, |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(const double alpha, |
103 |
Paso_SparseMatrix* A, |
const Paso_SparseMatrix* A, |
104 |
double* in, |
const double* in, |
105 |
double beta, |
const double beta, |
106 |
double* out) { |
double* out) { |
107 |
|
|
108 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
|
register double reg,reg1,reg2,reg3; |
|
|
#pragma omp barrier |
|
109 |
|
|
110 |
if (ABS(beta)>0.) { |
if (ABS(beta)>0.) { |
111 |
if (beta != 1.) { |
if (beta != 1.) { |
112 |
#pragma omp for private(irow) schedule(static) |
#pragma omp parallel for private(irow) schedule(static) |
113 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) { |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) { |
114 |
out[irow] *= beta; |
out[irow] *= beta; |
115 |
} |
} |
116 |
} |
} |
117 |
} else { |
} else { |
118 |
#pragma omp for private(irow) schedule(static) |
#pragma omp parallel for private(irow) schedule(static) |
119 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
120 |
out[irow] = 0; |
out[irow] = 0; |
121 |
} |
} |
124 |
if (ABS(alpha)>0) { |
if (ABS(alpha)>0) { |
125 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
126 |
/* TODO: parallelize (good luck!) */ |
/* TODO: parallelize (good luck!) */ |
|
#pragma omp single |
|
127 |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
128 |
#pragma ivdep |
#pragma ivdep |
129 |
for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) { |
for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) { |
132 |
} |
} |
133 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
134 |
/* TODO: parallelize */ |
/* TODO: parallelize */ |
|
#pragma omp single |
|
135 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
136 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
137 |
ic=2*(A->pattern->index[iptr]-1); |
ir=2*(A->pattern->index[iptr]-1); |
138 |
out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
139 |
out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
140 |
} |
} |
141 |
} |
} |
142 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
143 |
/* TODO: parallelize */ |
/* TODO: parallelize */ |
|
#pragma omp single |
|
144 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
145 |
#pragma ivdep |
#pragma ivdep |
146 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
147 |
ir=3*(A->pattern->index[iptr]-1); |
ir=3*(A->pattern->index[iptr]-1); |
148 |
out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
149 |
out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
150 |
out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
151 |
} |
} |
152 |
} |
} |
153 |
} else { |
} else { |
154 |
/* TODO: parallelize */ |
/* TODO: parallelize */ |
|
#pragma omp single |
|
155 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
156 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
157 |
for (irb=0;irb< A->row_block_size;irb++) { |
for (irb=0;irb< A->row_block_size;irb++) { |
168 |
} |
} |
169 |
return; |
return; |
170 |
} |
} |
|
/* CSR format with offset 0*/ |
|
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha, |
|
|
Paso_SparseMatrix* A, |
|
|
double* in, |
|
|
double beta, |
|
|
double* out) |
|
|
{ |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr; |
|
|
register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22; |
|
|
if (ABS(beta)>0.) { |
|
|
if (beta != 1.) { |
|
|
#pragma omp for private(irow) schedule(static) |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
|
|
out[irow] *= beta; |
|
|
} |
|
|
} else { |
|
|
#pragma omp for private(irow) schedule(static) |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
|
|
out[irow] = 0; |
|
|
} |
|
|
if (ABS(alpha)>0) { |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
|
|
#pragma omp for private(irow,iptr,reg) schedule(static) |
|
|
for (irow=0;irow< A->pattern->numOutput;++irow) { |
|
|
reg=0.; |
|
|
#pragma ivdep |
|
|
for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) { |
|
|
reg += A->val[iptr] * in[A->pattern->index[iptr]]; |
|
|
} |
|
|
out[irow] += alpha * reg; |
|
|
} |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
|
|
#pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static) |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
|
|
reg1=0.; |
|
|
reg2=0.; |
|
|
#pragma ivdep |
|
|
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
|
|
ic=2*(A->pattern->index[iptr]); |
|
|
Aiptr=iptr*4; |
|
|
in1=in[ic]; |
|
|
in2=in[1+ic]; |
|
|
A00=A->val[Aiptr ]; |
|
|
A10=A->val[Aiptr+1]; |
|
|
A01=A->val[Aiptr+2]; |
|
|
A11=A->val[Aiptr+3]; |
|
|
reg1 += A00*in1 + A01*in2; |
|
|
reg2 += A10*in1 + A11*in2; |
|
|
} |
|
|
out[ 2*ir] += alpha * reg1; |
|
|
out[1+2*ir] += alpha * reg2; |
|
|
} |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
|
|
#pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static) |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
|
|
reg1=0.; |
|
|
reg2=0.; |
|
|
reg3=0.; |
|
|
#pragma ivdep |
|
|
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
|
|
ic=3*(A->pattern->index[iptr]); |
|
|
Aiptr=iptr*9; |
|
|
in1=in[ic]; |
|
|
in2=in[1+ic]; |
|
|
in3=in[2+ic]; |
|
|
A00=A->val[Aiptr ]; |
|
|
A10=A->val[Aiptr+1]; |
|
|
A20=A->val[Aiptr+2]; |
|
|
A01=A->val[Aiptr+3]; |
|
|
A11=A->val[Aiptr+4]; |
|
|
A21=A->val[Aiptr+5]; |
|
|
A02=A->val[Aiptr+6]; |
|
|
A12=A->val[Aiptr+7]; |
|
|
A22=A->val[Aiptr+8]; |
|
|
reg1 += A00*in1 + A01*in2 + A02*in3; |
|
|
reg2 += A10*in1 + A11*in2 + A12*in3; |
|
|
reg3 += A20*in1 + A21*in2 + A22*in3; |
|
|
} |
|
|
out[ 3*ir] += alpha * reg1; |
|
|
out[1+3*ir] += alpha * reg2; |
|
|
out[2+3*ir] += alpha * reg3; |
|
|
} |
|
|
} else { |
|
|
#pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
|
|
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
|
|
irow=irb+A->row_block_size*ir; |
|
|
reg=0.; |
|
|
#pragma ivdep |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
|
|
icol=icb+A->col_block_size*(A->pattern->index[iptr]); |
|
|
reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
|
|
} |
|
|
out[irow] += alpha * reg; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
return; |
|
|
} |
|
171 |
/* CSR format with offset 1*/ |
/* CSR format with offset 1*/ |
172 |
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha, |
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(const double alpha, |
173 |
Paso_SparseMatrix* A, |
const Paso_SparseMatrix* A, |
174 |
double* in, |
const double* in, |
175 |
double beta, |
const double beta, |
176 |
double* out) { |
double* out) { |
177 |
|
|
178 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
179 |
register double reg,reg1,reg2,reg3; |
register double reg,reg1,reg2,reg3; |
|
#pragma omp barrier |
|
|
|
|
180 |
if (ABS(beta)>0.) { |
if (ABS(beta)>0.) { |
181 |
if (beta != 1.) { |
if (beta != 1.) { |
182 |
#pragma omp for private(irow) schedule(static) |
#pragma omp parallel for private(irow) schedule(static) |
183 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
184 |
out[irow] *= beta; |
out[irow] *= beta; |
185 |
} |
} |
186 |
} else { |
} else { |
187 |
#pragma omp for private(irow) schedule(static) |
#pragma omp parallel for private(irow) schedule(static) |
188 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
189 |
out[irow] = 0; |
out[irow] = 0; |
190 |
} |
} |
191 |
/* do the operation: */ |
/* do the operation: */ |
192 |
if (ABS(alpha)>0) { |
if (ABS(alpha)>0) { |
193 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
194 |
#pragma omp for private(irow,iptr,reg) schedule(static) |
#pragma omp parallel for private(irow,iptr,reg) schedule(static) |
195 |
for (irow=0;irow< A->pattern->numOutput;++irow) { |
for (irow=0;irow< A->pattern->numOutput;++irow) { |
196 |
reg=0.; |
reg=0.; |
197 |
#pragma ivdep |
#pragma ivdep |
201 |
out[irow] += alpha * reg; |
out[irow] += alpha * reg; |
202 |
} |
} |
203 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
204 |
#pragma omp for private(ir,reg1,reg2,iptr,ic) schedule(static) |
#pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static) |
205 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
206 |
reg1=0.; |
reg1=0.; |
207 |
reg2=0.; |
reg2=0.; |
215 |
out[1+2*ir] += alpha * reg2; |
out[1+2*ir] += alpha * reg2; |
216 |
} |
} |
217 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
218 |
#pragma omp for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static) |
#pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static) |
219 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
220 |
reg1=0.; |
reg1=0.; |
221 |
reg2=0.; |
reg2=0.; |
232 |
out[2+3*ir] += alpha * reg3; |
out[2+3*ir] += alpha * reg3; |
233 |
} |
} |
234 |
} else { |
} else { |
235 |
#pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
#pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
236 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
237 |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
238 |
for (irb=0;irb< A->row_block_size;irb++) { |
for (irb=0;irb< A->row_block_size;irb++) { |
251 |
} |
} |
252 |
return; |
return; |
253 |
} |
} |
254 |
|
/* CSR format with offset 0*/ |
255 |
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(const double alpha, |
256 |
|
const Paso_SparseMatrix* A, |
257 |
|
const double* in, |
258 |
|
const double beta, |
259 |
|
double* out) |
260 |
|
{ |
261 |
|
/*#define PASO_DYNAMIC_SCHEDULING_MVM */ |
262 |
|
|
263 |
|
#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined _OPENMP |
264 |
|
#define USE_DYNAMIC_SCHEDULING |
265 |
|
#endif |
266 |
|
|
267 |
|
#ifdef USE_DYNAMIC_SCHEDULING |
268 |
|
char* chksz_chr=NULL; |
269 |
|
dim_t chunk_size=1; |
270 |
|
#endif |
271 |
|
|
272 |
|
dim_t nrow=A->numRows; |
273 |
|
dim_t np=1, len, rest, irow, local_n, p; |
274 |
|
|
275 |
|
#ifdef USE_DYNAMIC_SCHEDULING |
276 |
|
dim_t n_chunks; |
277 |
|
#endif |
278 |
|
|
279 |
|
#ifdef _OPENMP |
280 |
|
np=omp_get_max_threads(); |
281 |
|
#endif |
282 |
|
#ifdef USE_DYNAMIC_SCHEDULING |
283 |
|
chksz_chr=getenv("PASO_CHUNK_SIZE_MVM"); |
284 |
|
if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size); |
285 |
|
chunk_size=MIN(MAX(1,chunk_size),nrow/np); |
286 |
|
n_chunks=nrow/chunk_size; |
287 |
|
if (n_chunks*chunk_size<nrow) n_chunks+=1; |
288 |
|
#else |
289 |
|
len=nrow/np; |
290 |
|
rest=nrow-len*np; |
291 |
|
#endif |
292 |
|
|
293 |
|
#pragma omp parallel private(irow, p, local_n) |
294 |
|
{ |
295 |
|
#ifdef USE_DYNAMIC_SCHEDULING |
296 |
|
#pragma omp for schedule(dynamic,1) |
297 |
|
for (p=0; p<n_chunks;p++) { |
298 |
|
irow=chunk_size*p; |
299 |
|
local_n=MIN(chunk_size,nrow-chunk_size*p); |
300 |
|
#else |
301 |
|
#pragma omp for schedule(static) |
302 |
|
for (p=0; p<np;p++) { |
303 |
|
irow=len*p+MIN(p,rest); |
304 |
|
local_n=len+(p<rest ? 1 :0 ); |
305 |
|
#endif |
306 |
|
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha, |
307 |
|
local_n, |
308 |
|
A->row_block_size, |
309 |
|
A->col_block_size, |
310 |
|
&(A->pattern->ptr[irow]), |
311 |
|
A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]); |
312 |
|
#ifdef USE_DYNAMIC_SCHEDULING |
313 |
|
} |
314 |
|
#else |
315 |
|
} |
316 |
|
#endif |
317 |
|
} |
318 |
|
} |
319 |
|
/* CSR format with offset 0*/ |
320 |
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(const double alpha, |
321 |
|
const dim_t nRows, |
322 |
|
const dim_t row_block_size, |
323 |
|
const dim_t col_block_size, |
324 |
|
const index_t* ptr, |
325 |
|
const index_t* index, |
326 |
|
const double* val, |
327 |
|
const double* in, |
328 |
|
const double beta, |
329 |
|
double* out) |
330 |
|
{ |
331 |
|
register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr; |
332 |
|
register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22; |
333 |
|
dim_t block_size; |
334 |
|
if (ABS(beta)>0.) { |
335 |
|
if (beta != 1.) { |
336 |
|
for (irow=0;irow < nRows * row_block_size;irow++) |
337 |
|
out[irow] *= beta; |
338 |
|
} |
339 |
|
} else { |
340 |
|
for (irow=0;irow < nRows * row_block_size;irow++) |
341 |
|
out[irow] = 0; |
342 |
|
} |
343 |
|
if (ABS(alpha)>0) { |
344 |
|
if (col_block_size==1 && row_block_size ==1) { |
345 |
|
for (irow=0;irow< nRows;++irow) { |
346 |
|
reg=0.; |
347 |
|
#pragma ivdep |
348 |
|
for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) { |
349 |
|
reg += val[iptr] * in[index[iptr]]; |
350 |
|
} |
351 |
|
out[irow] += alpha * reg; |
352 |
|
} |
353 |
|
} else if (col_block_size==2 && row_block_size ==2) { |
354 |
|
for (ir=0;ir< nRows;ir++) { |
355 |
|
reg1=0.; |
356 |
|
reg2=0.; |
357 |
|
#pragma ivdep |
358 |
|
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
359 |
|
ic=2*index[iptr]; |
360 |
|
Aiptr=iptr*4; |
361 |
|
in1=in[ic]; |
362 |
|
in2=in[1+ic]; |
363 |
|
A00=val[Aiptr ]; |
364 |
|
A10=val[Aiptr+1]; |
365 |
|
A01=val[Aiptr+2]; |
366 |
|
A11=val[Aiptr+3]; |
367 |
|
reg1 += A00*in1 + A01*in2; |
368 |
|
reg2 += A10*in1 + A11*in2; |
369 |
|
} |
370 |
|
out[ 2*ir] += alpha * reg1; |
371 |
|
out[1+2*ir] += alpha * reg2; |
372 |
|
} |
373 |
|
} else if (col_block_size==3 && row_block_size ==3) { |
374 |
|
for (ir=0;ir< nRows;ir++) { |
375 |
|
reg1=0.; |
376 |
|
reg2=0.; |
377 |
|
reg3=0.; |
378 |
|
#pragma ivdep |
379 |
|
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
380 |
|
ic=3*index[iptr]; |
381 |
|
Aiptr=iptr*9; |
382 |
|
in1=in[ic]; |
383 |
|
in2=in[1+ic]; |
384 |
|
in3=in[2+ic]; |
385 |
|
A00=val[Aiptr ]; |
386 |
|
A10=val[Aiptr+1]; |
387 |
|
A20=val[Aiptr+2]; |
388 |
|
A01=val[Aiptr+3]; |
389 |
|
A11=val[Aiptr+4]; |
390 |
|
A21=val[Aiptr+5]; |
391 |
|
A02=val[Aiptr+6]; |
392 |
|
A12=val[Aiptr+7]; |
393 |
|
A22=val[Aiptr+8]; |
394 |
|
reg1 += A00*in1 + A01*in2 + A02*in3; |
395 |
|
reg2 += A10*in1 + A11*in2 + A12*in3; |
396 |
|
reg3 += A20*in1 + A21*in2 + A22*in3; |
397 |
|
} |
398 |
|
out[ 3*ir] += alpha * reg1; |
399 |
|
out[1+3*ir] += alpha * reg2; |
400 |
|
out[2+3*ir] += alpha * reg3; |
401 |
|
} |
402 |
|
} else { |
403 |
|
block_size=col_block_size*row_block_size; |
404 |
|
for (ir=0;ir< nRows;ir++) { |
405 |
|
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
406 |
|
for (irb=0;irb< row_block_size;irb++) { |
407 |
|
irow=irb+row_block_size*ir; |
408 |
|
reg=0.; |
409 |
|
#pragma ivdep |
410 |
|
for (icb=0;icb< col_block_size;icb++) { |
411 |
|
icol=icb+col_block_size*index[iptr]; |
412 |
|
reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol]; |
413 |
|
} |
414 |
|
out[irow] += alpha * reg; |
415 |
|
} |
416 |
|
} |
417 |
|
} |
418 |
|
} |
419 |
|
} |
420 |
|
return; |
421 |
|
} |