65 |
#pragma ivdep |
#pragma ivdep |
66 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
67 |
ir=2*(A->pattern->index[iptr]); |
ir=2*(A->pattern->index[iptr]); |
68 |
out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
69 |
out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
70 |
} |
} |
71 |
} |
} |
72 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
76 |
#pragma ivdep |
#pragma ivdep |
77 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
78 |
ir=3*(A->pattern->index[iptr]); |
ir=3*(A->pattern->index[iptr]); |
79 |
out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
80 |
out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
81 |
out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
82 |
} |
} |
83 |
} |
} |
84 |
} else { |
} else { |
141 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
142 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
143 |
ir=2*(A->pattern->index[iptr]-1); |
ir=2*(A->pattern->index[iptr]-1); |
144 |
out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
145 |
out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
146 |
} |
} |
147 |
} |
} |
148 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
152 |
#pragma ivdep |
#pragma ivdep |
153 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
154 |
ir=3*(A->pattern->index[iptr]-1); |
ir=3*(A->pattern->index[iptr]-1); |
155 |
out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
156 |
out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
157 |
out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
158 |
} |
} |
159 |
} |
} |
160 |
} else { |
} else { |