1 |
ksteube |
1315 |
|
2 |
|
|
/* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */ |
3 |
|
|
|
4 |
|
|
/******************************************************* |
5 |
|
|
* |
6 |
|
|
* Copyright 2003-2007 by ACceSS MNRF |
7 |
|
|
* Copyright 2007 by University of Queensland |
8 |
|
|
* |
9 |
|
|
* http://esscc.uq.edu.au |
10 |
|
|
* Primary Business: Queensland, Australia |
11 |
|
|
* Licensed under the Open Software License version 3.0 |
12 |
|
|
* http://www.opensource.org/licenses/osl-3.0.php |
13 |
|
|
* |
14 |
|
|
*******************************************************/ |
15 |
|
|
|
16 |
|
|
/**************************************************************/ |
17 |
|
|
|
18 |
|
|
/* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */ |
19 |
|
|
|
20 |
|
|
/**************************************************************/ |
21 |
|
|
|
22 |
|
|
/* Author: gross@access.edu.au */ |
23 |
|
|
|
24 |
|
|
/**************************************************************/ |
25 |
|
|
|
26 |
|
|
#include "SparseMatrix.h" |
27 |
|
|
|
28 |
|
|
/* CSC format with offset 0*/ |
29 |
|
|
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha, |
30 |
|
|
Paso_SparseMatrix* A, |
31 |
|
|
double* in, |
32 |
|
|
double beta, |
33 |
|
|
double* out) { |
34 |
|
|
|
35 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic; |
36 |
|
|
register double reg,reg1,reg2,reg3; |
37 |
|
|
|
38 |
|
|
if (ABS(beta)>0.) { |
39 |
|
|
if (beta != 1.) { |
40 |
gross |
1556 |
#pragma omp parallel for private(irow) schedule(static) |
41 |
ksteube |
1315 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
42 |
|
|
out[irow] *= beta; |
43 |
|
|
} |
44 |
|
|
} else { |
45 |
gross |
1556 |
#pragma omp parallel for private(irow) schedule(static) |
46 |
ksteube |
1315 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
47 |
|
|
out[irow] = 0; |
48 |
|
|
} |
49 |
|
|
if (Paso_Pattern_isEmpty(A->pattern)) return; |
50 |
|
|
/* do the operation: */ |
51 |
|
|
if (ABS(alpha)>0) { |
52 |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
53 |
|
|
/* TODO: parallelize (good luck!) */ |
54 |
|
|
for (icol=0;icol< A->pattern->numOutput;++icol) { |
55 |
|
|
#pragma ivdep |
56 |
|
|
for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) { |
57 |
|
|
out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol]; |
58 |
|
|
} |
59 |
|
|
} |
60 |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
61 |
|
|
/* TODO: parallelize */ |
62 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
63 |
|
|
#pragma ivdep |
64 |
|
|
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
65 |
gross |
1511 |
ir=2*(A->pattern->index[iptr]); |
66 |
|
|
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
67 |
|
|
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
68 |
ksteube |
1315 |
} |
69 |
|
|
} |
70 |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
71 |
|
|
/* TODO: parallelize */ |
72 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
73 |
|
|
#pragma ivdep |
74 |
|
|
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
75 |
|
|
ir=3*(A->pattern->index[iptr]); |
76 |
gross |
1511 |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
77 |
|
|
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
78 |
|
|
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
79 |
ksteube |
1315 |
} |
80 |
|
|
} |
81 |
|
|
} else { |
82 |
|
|
/* TODO: parallelize */ |
83 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
84 |
|
|
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
85 |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
86 |
|
|
irow=irb+A->row_block_size*(A->pattern->index[iptr]); |
87 |
|
|
#pragma ivdep |
88 |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
89 |
|
|
icol=icb+A->col_block_size*ic; |
90 |
|
|
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
91 |
|
|
} |
92 |
|
|
} |
93 |
|
|
} |
94 |
|
|
} |
95 |
|
|
} |
96 |
|
|
} |
97 |
|
|
return; |
98 |
|
|
} |
99 |
|
|
|
100 |
|
|
/* CSC format with offset 1*/ |
101 |
|
|
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha, |
102 |
|
|
Paso_SparseMatrix* A, |
103 |
|
|
double* in, |
104 |
|
|
double beta, |
105 |
|
|
double* out) { |
106 |
|
|
|
107 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic; |
108 |
|
|
register double reg,reg1,reg2,reg3; |
109 |
|
|
if (ABS(beta)>0.) { |
110 |
|
|
if (beta != 1.) { |
111 |
gross |
1556 |
#pragma omp parallel for private(irow) schedule(static) |
112 |
ksteube |
1315 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) { |
113 |
|
|
out[irow] *= beta; |
114 |
|
|
} |
115 |
|
|
} |
116 |
|
|
} else { |
117 |
gross |
1556 |
#pragma omp parallel for private(irow) schedule(static) |
118 |
ksteube |
1315 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
119 |
|
|
out[irow] = 0; |
120 |
|
|
} |
121 |
|
|
|
122 |
|
|
/* do the operation: */ |
123 |
|
|
if (ABS(alpha)>0) { |
124 |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
125 |
|
|
/* TODO: parallelize (good luck!) */ |
126 |
|
|
for (icol=0;icol< A->pattern->numOutput;++icol) { |
127 |
|
|
#pragma ivdep |
128 |
|
|
for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) { |
129 |
|
|
out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol]; |
130 |
|
|
} |
131 |
|
|
} |
132 |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
133 |
|
|
/* TODO: parallelize */ |
134 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
135 |
|
|
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
136 |
gross |
1511 |
ir=2*(A->pattern->index[iptr]-1); |
137 |
|
|
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
138 |
|
|
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
139 |
ksteube |
1315 |
} |
140 |
|
|
} |
141 |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
142 |
|
|
/* TODO: parallelize */ |
143 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
144 |
|
|
#pragma ivdep |
145 |
|
|
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
146 |
|
|
ir=3*(A->pattern->index[iptr]-1); |
147 |
gross |
1511 |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
148 |
|
|
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
149 |
|
|
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
150 |
ksteube |
1315 |
} |
151 |
|
|
} |
152 |
|
|
} else { |
153 |
|
|
/* TODO: parallelize */ |
154 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
155 |
|
|
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
156 |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
157 |
|
|
irow=irb+A->row_block_size*(A->pattern->index[iptr]-1); |
158 |
|
|
#pragma ivdep |
159 |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
160 |
|
|
icol=icb+A->col_block_size*ic; |
161 |
|
|
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
162 |
|
|
} |
163 |
|
|
} |
164 |
|
|
} |
165 |
|
|
} |
166 |
|
|
} |
167 |
|
|
} |
168 |
|
|
return; |
169 |
|
|
} |
170 |
|
|
/* CSR format with offset 1*/ |
171 |
|
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha, |
172 |
|
|
Paso_SparseMatrix* A, |
173 |
|
|
double* in, |
174 |
|
|
double beta, |
175 |
|
|
double* out) { |
176 |
|
|
|
177 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic; |
178 |
|
|
register double reg,reg1,reg2,reg3; |
179 |
|
|
if (ABS(beta)>0.) { |
180 |
|
|
if (beta != 1.) { |
181 |
gross |
1556 |
#pragma omp parallel for private(irow) schedule(static) |
182 |
ksteube |
1315 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
183 |
|
|
out[irow] *= beta; |
184 |
|
|
} |
185 |
|
|
} else { |
186 |
gross |
1556 |
#pragma omp parallel for private(irow) schedule(static) |
187 |
ksteube |
1315 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
188 |
|
|
out[irow] = 0; |
189 |
|
|
} |
190 |
|
|
/* do the operation: */ |
191 |
|
|
if (ABS(alpha)>0) { |
192 |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
193 |
gross |
1556 |
#pragma omp parallel for private(irow,iptr,reg) schedule(static) |
194 |
ksteube |
1315 |
for (irow=0;irow< A->pattern->numOutput;++irow) { |
195 |
|
|
reg=0.; |
196 |
|
|
#pragma ivdep |
197 |
|
|
for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) { |
198 |
|
|
reg += A->val[iptr] * in[A->pattern->index[iptr]-1]; |
199 |
|
|
} |
200 |
|
|
out[irow] += alpha * reg; |
201 |
|
|
} |
202 |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
203 |
gross |
1556 |
#pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static) |
204 |
ksteube |
1315 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
205 |
|
|
reg1=0.; |
206 |
|
|
reg2=0.; |
207 |
|
|
#pragma ivdep |
208 |
|
|
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
209 |
|
|
ic=2*(A->pattern->index[iptr]-1); |
210 |
|
|
reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic]; |
211 |
|
|
reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic]; |
212 |
|
|
} |
213 |
|
|
out[ 2*ir] += alpha * reg1; |
214 |
|
|
out[1+2*ir] += alpha * reg2; |
215 |
|
|
} |
216 |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
217 |
gross |
1556 |
#pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static) |
218 |
ksteube |
1315 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
219 |
|
|
reg1=0.; |
220 |
|
|
reg2=0.; |
221 |
|
|
reg3=0.; |
222 |
|
|
#pragma ivdep |
223 |
|
|
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
224 |
|
|
ic=3*(A->pattern->index[iptr]-1); |
225 |
|
|
reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic]; |
226 |
|
|
reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic]; |
227 |
|
|
reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic]; |
228 |
|
|
} |
229 |
|
|
out[ 3*ir] += alpha * reg1; |
230 |
|
|
out[1+3*ir] += alpha * reg2; |
231 |
|
|
out[2+3*ir] += alpha * reg3; |
232 |
|
|
} |
233 |
|
|
} else { |
234 |
gross |
1556 |
#pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
235 |
ksteube |
1315 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
236 |
|
|
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
237 |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
238 |
|
|
irow=irb+A->row_block_size*ir; |
239 |
|
|
reg=0.; |
240 |
|
|
#pragma ivdep |
241 |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
242 |
|
|
icol=icb+A->col_block_size*(A->pattern->index[iptr]-1); |
243 |
|
|
reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
244 |
|
|
} |
245 |
|
|
out[irow] += alpha * reg; |
246 |
|
|
} |
247 |
|
|
} |
248 |
|
|
} |
249 |
|
|
} |
250 |
|
|
} |
251 |
|
|
return; |
252 |
|
|
} |
253 |
gross |
1564 |
/* CSR format with offset 0*/ |
254 |
|
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha, |
255 |
|
|
Paso_SparseMatrix* A, |
256 |
|
|
double* in, |
257 |
|
|
double beta, |
258 |
|
|
double* out) |
259 |
|
|
{ |
260 |
gross |
1570 |
/*#define PASO_DYNAMIC_SCHEDULING_MVM */ |
261 |
gross |
1564 |
|
262 |
gross |
1570 |
#if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP |
263 |
|
|
#define USE_DYNAMIC_SCHEDULING |
264 |
|
|
#endif |
265 |
|
|
|
266 |
gross |
1564 |
char* chksz_chr=NULL; |
267 |
gross |
1570 |
dim_t chunk_size=1; |
268 |
gross |
1564 |
dim_t nrow=A->numRows; |
269 |
|
|
dim_t np, len, rest, irow, local_n, p, n_chunks; |
270 |
|
|
np=omp_get_max_threads(); |
271 |
gross |
1570 |
#ifdef USE_DYNAMIC_SCHEDULING |
272 |
gross |
1564 |
chksz_chr=getenv("PASO_CHUNK_SIZE_MVM"); |
273 |
|
|
if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size); |
274 |
gross |
1570 |
chunk_size=MIN(MAX(1,chunk_size),nrow/np); |
275 |
|
|
n_chunks=nrow/chunk_size; |
276 |
|
|
if (n_chunks*chunk_size<nrow) n_chunks+=1; |
277 |
|
|
#else |
278 |
|
|
len=nrow/np; |
279 |
|
|
rest=nrow-len*np; |
280 |
gross |
1564 |
#endif |
281 |
|
|
|
282 |
gross |
1571 |
#pragma omp parallel private(irow, p, local_n) |
283 |
gross |
1570 |
{ |
284 |
|
|
#ifdef USE_DYNAMIC_SCHEDULING |
285 |
gross |
1571 |
#pragma omp for schedule(dynamic,1) |
286 |
gross |
1570 |
for (p=0; p<n_chunks;p++) { |
287 |
|
|
irow=chunk_size*p; |
288 |
|
|
local_n=MIN(chunk_size,nrow-chunk_size*p); |
289 |
|
|
#else |
290 |
gross |
1571 |
#pragma omp for schedule(static) |
291 |
gross |
1570 |
for (p=0; p<np;p++) { |
292 |
|
|
irow=len*p+MIN(p,rest); |
293 |
|
|
local_n=len+(p<rest ? 1 :0 ); |
294 |
|
|
#endif |
295 |
|
|
Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha, |
296 |
|
|
local_n, |
297 |
|
|
A->row_block_size, |
298 |
|
|
A->col_block_size, |
299 |
|
|
&(A->pattern->ptr[irow]), |
300 |
|
|
A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]); |
301 |
|
|
#ifdef USE_DYNAMIC_SCHEDULING |
302 |
|
|
} |
303 |
|
|
#else |
304 |
|
|
} |
305 |
|
|
#endif |
306 |
gross |
1564 |
} |
307 |
|
|
} |
308 |
|
|
/* CSR format with offset 0*/ |
309 |
|
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha, |
310 |
|
|
dim_t nRows, |
311 |
|
|
dim_t row_block_size, |
312 |
|
|
dim_t col_block_size, |
313 |
|
|
index_t* ptr, |
314 |
|
|
index_t* index, |
315 |
|
|
double* val, |
316 |
|
|
double* in, |
317 |
|
|
double beta, |
318 |
|
|
double* out) |
319 |
|
|
{ |
320 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr; |
321 |
|
|
register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22; |
322 |
|
|
dim_t block_size; |
323 |
|
|
if (ABS(beta)>0.) { |
324 |
|
|
if (beta != 1.) { |
325 |
|
|
for (irow=0;irow < nRows * row_block_size;irow++) |
326 |
|
|
out[irow] *= beta; |
327 |
|
|
} |
328 |
|
|
} else { |
329 |
|
|
for (irow=0;irow < nRows * row_block_size;irow++) |
330 |
|
|
out[irow] = 0; |
331 |
|
|
} |
332 |
|
|
if (ABS(alpha)>0) { |
333 |
|
|
if (col_block_size==1 && row_block_size ==1) { |
334 |
|
|
for (irow=0;irow< nRows;++irow) { |
335 |
|
|
reg=0.; |
336 |
|
|
#pragma ivdep |
337 |
|
|
for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) { |
338 |
|
|
reg += val[iptr] * in[index[iptr]]; |
339 |
|
|
} |
340 |
|
|
out[irow] += alpha * reg; |
341 |
|
|
} |
342 |
|
|
} else if (col_block_size==2 && row_block_size ==2) { |
343 |
|
|
for (ir=0;ir< nRows;ir++) { |
344 |
|
|
reg1=0.; |
345 |
|
|
reg2=0.; |
346 |
|
|
#pragma ivdep |
347 |
|
|
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
348 |
|
|
ic=2*index[iptr]; |
349 |
|
|
Aiptr=iptr*4; |
350 |
|
|
in1=in[ic]; |
351 |
|
|
in2=in[1+ic]; |
352 |
|
|
A00=val[Aiptr ]; |
353 |
|
|
A10=val[Aiptr+1]; |
354 |
|
|
A01=val[Aiptr+2]; |
355 |
|
|
A11=val[Aiptr+3]; |
356 |
|
|
reg1 += A00*in1 + A01*in2; |
357 |
|
|
reg2 += A10*in1 + A11*in2; |
358 |
|
|
} |
359 |
|
|
out[ 2*ir] += alpha * reg1; |
360 |
|
|
out[1+2*ir] += alpha * reg2; |
361 |
|
|
} |
362 |
|
|
} else if (col_block_size==3 && row_block_size ==3) { |
363 |
|
|
for (ir=0;ir< nRows;ir++) { |
364 |
|
|
reg1=0.; |
365 |
|
|
reg2=0.; |
366 |
|
|
reg3=0.; |
367 |
|
|
#pragma ivdep |
368 |
|
|
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
369 |
|
|
ic=3*index[iptr]; |
370 |
|
|
Aiptr=iptr*9; |
371 |
|
|
in1=in[ic]; |
372 |
|
|
in2=in[1+ic]; |
373 |
|
|
in3=in[2+ic]; |
374 |
|
|
A00=val[Aiptr ]; |
375 |
|
|
A10=val[Aiptr+1]; |
376 |
|
|
A20=val[Aiptr+2]; |
377 |
|
|
A01=val[Aiptr+3]; |
378 |
|
|
A11=val[Aiptr+4]; |
379 |
|
|
A21=val[Aiptr+5]; |
380 |
|
|
A02=val[Aiptr+6]; |
381 |
|
|
A12=val[Aiptr+7]; |
382 |
|
|
A22=val[Aiptr+8]; |
383 |
|
|
reg1 += A00*in1 + A01*in2 + A02*in3; |
384 |
|
|
reg2 += A10*in1 + A11*in2 + A12*in3; |
385 |
|
|
reg3 += A20*in1 + A21*in2 + A22*in3; |
386 |
|
|
} |
387 |
|
|
out[ 3*ir] += alpha * reg1; |
388 |
|
|
out[1+3*ir] += alpha * reg2; |
389 |
|
|
out[2+3*ir] += alpha * reg3; |
390 |
|
|
} |
391 |
|
|
} else { |
392 |
|
|
block_size=col_block_size*row_block_size; |
393 |
|
|
for (ir=0;ir< nRows;ir++) { |
394 |
|
|
for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) { |
395 |
|
|
for (irb=0;irb< row_block_size;irb++) { |
396 |
|
|
irow=irb+row_block_size*ir; |
397 |
|
|
reg=0.; |
398 |
|
|
#pragma ivdep |
399 |
|
|
for (icb=0;icb< col_block_size;icb++) { |
400 |
|
|
icol=icb+col_block_size*index[iptr]; |
401 |
|
|
reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol]; |
402 |
|
|
} |
403 |
|
|
out[irow] += alpha * reg; |
404 |
|
|
} |
405 |
|
|
} |
406 |
|
|
} |
407 |
|
|
} |
408 |
|
|
} |
409 |
|
|
return; |
410 |
|
|
} |