1 |
ksteube |
1315 |
|
2 |
|
|
/* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */ |
3 |
|
|
|
4 |
|
|
/******************************************************* |
5 |
|
|
* |
6 |
|
|
* Copyright 2003-2007 by ACceSS MNRF |
7 |
|
|
* Copyright 2007 by University of Queensland |
8 |
|
|
* |
9 |
|
|
* http://esscc.uq.edu.au |
10 |
|
|
* Primary Business: Queensland, Australia |
11 |
|
|
* Licensed under the Open Software License version 3.0 |
12 |
|
|
* http://www.opensource.org/licenses/osl-3.0.php |
13 |
|
|
* |
14 |
|
|
*******************************************************/ |
15 |
|
|
|
16 |
|
|
/**************************************************************/ |
17 |
|
|
|
18 |
|
|
/* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */ |
19 |
|
|
|
20 |
|
|
/**************************************************************/ |
21 |
|
|
|
22 |
|
|
/* Author: gross@access.edu.au */ |
23 |
|
|
|
24 |
|
|
/**************************************************************/ |
25 |
|
|
|
26 |
|
|
#include "SparseMatrix.h" |
27 |
|
|
|
28 |
|
|
/* CSC format with offset 0*/ |
29 |
|
|
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha, |
30 |
|
|
Paso_SparseMatrix* A, |
31 |
|
|
double* in, |
32 |
|
|
double beta, |
33 |
|
|
double* out) { |
34 |
|
|
|
35 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic; |
36 |
|
|
register double reg,reg1,reg2,reg3; |
37 |
|
|
#pragma omp barrier |
38 |
|
|
|
39 |
|
|
if (ABS(beta)>0.) { |
40 |
|
|
if (beta != 1.) { |
41 |
|
|
#pragma omp for private(irow) schedule(static) |
42 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
43 |
|
|
out[irow] *= beta; |
44 |
|
|
} |
45 |
|
|
} else { |
46 |
|
|
#pragma omp for private(irow) schedule(static) |
47 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
48 |
|
|
out[irow] = 0; |
49 |
|
|
} |
50 |
|
|
if (Paso_Pattern_isEmpty(A->pattern)) return; |
51 |
|
|
/* do the operation: */ |
52 |
|
|
if (ABS(alpha)>0) { |
53 |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
54 |
|
|
/* TODO: parallelize (good luck!) */ |
55 |
|
|
#pragma omp single |
56 |
|
|
for (icol=0;icol< A->pattern->numOutput;++icol) { |
57 |
|
|
#pragma ivdep |
58 |
|
|
for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) { |
59 |
|
|
out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol]; |
60 |
|
|
} |
61 |
|
|
} |
62 |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
63 |
|
|
/* TODO: parallelize */ |
64 |
|
|
#pragma omp single |
65 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
66 |
|
|
#pragma ivdep |
67 |
|
|
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
68 |
gross |
1511 |
ir=2*(A->pattern->index[iptr]); |
69 |
|
|
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
70 |
|
|
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
71 |
ksteube |
1315 |
} |
72 |
|
|
} |
73 |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
74 |
|
|
/* TODO: parallelize */ |
75 |
|
|
#pragma omp single |
76 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
77 |
|
|
#pragma ivdep |
78 |
|
|
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
79 |
|
|
ir=3*(A->pattern->index[iptr]); |
80 |
gross |
1511 |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
81 |
|
|
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
82 |
|
|
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
83 |
ksteube |
1315 |
} |
84 |
|
|
} |
85 |
|
|
} else { |
86 |
|
|
/* TODO: parallelize */ |
87 |
|
|
#pragma omp single |
88 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
89 |
|
|
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
90 |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
91 |
|
|
irow=irb+A->row_block_size*(A->pattern->index[iptr]); |
92 |
|
|
#pragma ivdep |
93 |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
94 |
|
|
icol=icb+A->col_block_size*ic; |
95 |
|
|
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
96 |
|
|
} |
97 |
|
|
} |
98 |
|
|
} |
99 |
|
|
} |
100 |
|
|
} |
101 |
|
|
} |
102 |
|
|
return; |
103 |
|
|
} |
104 |
|
|
|
105 |
|
|
/* CSC format with offset 1*/ |
106 |
|
|
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha, |
107 |
|
|
Paso_SparseMatrix* A, |
108 |
|
|
double* in, |
109 |
|
|
double beta, |
110 |
|
|
double* out) { |
111 |
|
|
|
112 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic; |
113 |
|
|
register double reg,reg1,reg2,reg3; |
114 |
|
|
#pragma omp barrier |
115 |
|
|
|
116 |
|
|
if (ABS(beta)>0.) { |
117 |
|
|
if (beta != 1.) { |
118 |
|
|
#pragma omp for private(irow) schedule(static) |
119 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) { |
120 |
|
|
out[irow] *= beta; |
121 |
|
|
} |
122 |
|
|
} |
123 |
|
|
} else { |
124 |
|
|
#pragma omp for private(irow) schedule(static) |
125 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
126 |
|
|
out[irow] = 0; |
127 |
|
|
} |
128 |
|
|
|
129 |
|
|
/* do the operation: */ |
130 |
|
|
if (ABS(alpha)>0) { |
131 |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
132 |
|
|
/* TODO: parallelize (good luck!) */ |
133 |
|
|
#pragma omp single |
134 |
|
|
for (icol=0;icol< A->pattern->numOutput;++icol) { |
135 |
|
|
#pragma ivdep |
136 |
|
|
for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) { |
137 |
|
|
out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol]; |
138 |
|
|
} |
139 |
|
|
} |
140 |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
141 |
|
|
/* TODO: parallelize */ |
142 |
|
|
#pragma omp single |
143 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
144 |
|
|
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
145 |
gross |
1511 |
ir=2*(A->pattern->index[iptr]-1); |
146 |
|
|
out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
147 |
|
|
out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
148 |
ksteube |
1315 |
} |
149 |
|
|
} |
150 |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
151 |
|
|
/* TODO: parallelize */ |
152 |
|
|
#pragma omp single |
153 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
154 |
|
|
#pragma ivdep |
155 |
|
|
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
156 |
|
|
ir=3*(A->pattern->index[iptr]-1); |
157 |
gross |
1511 |
out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
158 |
|
|
out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
159 |
|
|
out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
160 |
ksteube |
1315 |
} |
161 |
|
|
} |
162 |
|
|
} else { |
163 |
|
|
/* TODO: parallelize */ |
164 |
|
|
#pragma omp single |
165 |
|
|
for (ic=0;ic< A->pattern->numOutput;ic++) { |
166 |
|
|
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
167 |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
168 |
|
|
irow=irb+A->row_block_size*(A->pattern->index[iptr]-1); |
169 |
|
|
#pragma ivdep |
170 |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
171 |
|
|
icol=icb+A->col_block_size*ic; |
172 |
|
|
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
173 |
|
|
} |
174 |
|
|
} |
175 |
|
|
} |
176 |
|
|
} |
177 |
|
|
} |
178 |
|
|
} |
179 |
|
|
return; |
180 |
|
|
} |
181 |
|
|
/* CSR format with offset 0*/ |
182 |
|
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha, |
183 |
|
|
Paso_SparseMatrix* A, |
184 |
|
|
double* in, |
185 |
|
|
double beta, |
186 |
|
|
double* out) |
187 |
|
|
{ |
188 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr; |
189 |
|
|
register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22; |
190 |
|
|
if (ABS(beta)>0.) { |
191 |
|
|
if (beta != 1.) { |
192 |
|
|
#pragma omp for private(irow) schedule(static) |
193 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
194 |
|
|
out[irow] *= beta; |
195 |
|
|
} |
196 |
|
|
} else { |
197 |
|
|
#pragma omp for private(irow) schedule(static) |
198 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
199 |
|
|
out[irow] = 0; |
200 |
|
|
} |
201 |
|
|
if (ABS(alpha)>0) { |
202 |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
203 |
|
|
#pragma omp for private(irow,iptr,reg) schedule(static) |
204 |
|
|
for (irow=0;irow< A->pattern->numOutput;++irow) { |
205 |
|
|
reg=0.; |
206 |
|
|
#pragma ivdep |
207 |
|
|
for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) { |
208 |
|
|
reg += A->val[iptr] * in[A->pattern->index[iptr]]; |
209 |
|
|
} |
210 |
|
|
out[irow] += alpha * reg; |
211 |
|
|
} |
212 |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
213 |
|
|
#pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static) |
214 |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
215 |
|
|
reg1=0.; |
216 |
|
|
reg2=0.; |
217 |
|
|
#pragma ivdep |
218 |
|
|
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
219 |
|
|
ic=2*(A->pattern->index[iptr]); |
220 |
|
|
Aiptr=iptr*4; |
221 |
|
|
in1=in[ic]; |
222 |
|
|
in2=in[1+ic]; |
223 |
|
|
A00=A->val[Aiptr ]; |
224 |
|
|
A10=A->val[Aiptr+1]; |
225 |
|
|
A01=A->val[Aiptr+2]; |
226 |
|
|
A11=A->val[Aiptr+3]; |
227 |
|
|
reg1 += A00*in1 + A01*in2; |
228 |
|
|
reg2 += A10*in1 + A11*in2; |
229 |
|
|
} |
230 |
|
|
out[ 2*ir] += alpha * reg1; |
231 |
|
|
out[1+2*ir] += alpha * reg2; |
232 |
|
|
} |
233 |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
234 |
|
|
#pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static) |
235 |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
236 |
|
|
reg1=0.; |
237 |
|
|
reg2=0.; |
238 |
|
|
reg3=0.; |
239 |
|
|
#pragma ivdep |
240 |
|
|
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
241 |
|
|
ic=3*(A->pattern->index[iptr]); |
242 |
|
|
Aiptr=iptr*9; |
243 |
|
|
in1=in[ic]; |
244 |
|
|
in2=in[1+ic]; |
245 |
|
|
in3=in[2+ic]; |
246 |
|
|
A00=A->val[Aiptr ]; |
247 |
|
|
A10=A->val[Aiptr+1]; |
248 |
|
|
A20=A->val[Aiptr+2]; |
249 |
|
|
A01=A->val[Aiptr+3]; |
250 |
|
|
A11=A->val[Aiptr+4]; |
251 |
|
|
A21=A->val[Aiptr+5]; |
252 |
|
|
A02=A->val[Aiptr+6]; |
253 |
|
|
A12=A->val[Aiptr+7]; |
254 |
|
|
A22=A->val[Aiptr+8]; |
255 |
|
|
reg1 += A00*in1 + A01*in2 + A02*in3; |
256 |
|
|
reg2 += A10*in1 + A11*in2 + A12*in3; |
257 |
|
|
reg3 += A20*in1 + A21*in2 + A22*in3; |
258 |
|
|
} |
259 |
|
|
out[ 3*ir] += alpha * reg1; |
260 |
|
|
out[1+3*ir] += alpha * reg2; |
261 |
|
|
out[2+3*ir] += alpha * reg3; |
262 |
|
|
} |
263 |
|
|
} else { |
264 |
|
|
#pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
265 |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
266 |
|
|
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
267 |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
268 |
|
|
irow=irb+A->row_block_size*ir; |
269 |
|
|
reg=0.; |
270 |
|
|
#pragma ivdep |
271 |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
272 |
|
|
icol=icb+A->col_block_size*(A->pattern->index[iptr]); |
273 |
|
|
reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
274 |
|
|
} |
275 |
|
|
out[irow] += alpha * reg; |
276 |
|
|
} |
277 |
|
|
} |
278 |
|
|
} |
279 |
|
|
} |
280 |
|
|
} |
281 |
|
|
return; |
282 |
|
|
} |
283 |
|
|
/* CSR format with offset 1*/ |
284 |
|
|
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha, |
285 |
|
|
Paso_SparseMatrix* A, |
286 |
|
|
double* in, |
287 |
|
|
double beta, |
288 |
|
|
double* out) { |
289 |
|
|
|
290 |
|
|
register index_t ir,icol,iptr,icb,irb,irow,ic; |
291 |
|
|
register double reg,reg1,reg2,reg3; |
292 |
|
|
#pragma omp barrier |
293 |
|
|
|
294 |
|
|
if (ABS(beta)>0.) { |
295 |
|
|
if (beta != 1.) { |
296 |
|
|
#pragma omp for private(irow) schedule(static) |
297 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
298 |
|
|
out[irow] *= beta; |
299 |
|
|
} |
300 |
|
|
} else { |
301 |
|
|
#pragma omp for private(irow) schedule(static) |
302 |
|
|
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
303 |
|
|
out[irow] = 0; |
304 |
|
|
} |
305 |
|
|
/* do the operation: */ |
306 |
|
|
if (ABS(alpha)>0) { |
307 |
|
|
if (A ->col_block_size==1 && A->row_block_size ==1) { |
308 |
|
|
#pragma omp for private(irow,iptr,reg) schedule(static) |
309 |
|
|
for (irow=0;irow< A->pattern->numOutput;++irow) { |
310 |
|
|
reg=0.; |
311 |
|
|
#pragma ivdep |
312 |
|
|
for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) { |
313 |
|
|
reg += A->val[iptr] * in[A->pattern->index[iptr]-1]; |
314 |
|
|
} |
315 |
|
|
out[irow] += alpha * reg; |
316 |
|
|
} |
317 |
|
|
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
318 |
|
|
#pragma omp for private(ir,reg1,reg2,iptr,ic) schedule(static) |
319 |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
320 |
|
|
reg1=0.; |
321 |
|
|
reg2=0.; |
322 |
|
|
#pragma ivdep |
323 |
|
|
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
324 |
|
|
ic=2*(A->pattern->index[iptr]-1); |
325 |
|
|
reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic]; |
326 |
|
|
reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic]; |
327 |
|
|
} |
328 |
|
|
out[ 2*ir] += alpha * reg1; |
329 |
|
|
out[1+2*ir] += alpha * reg2; |
330 |
|
|
} |
331 |
|
|
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
332 |
|
|
#pragma omp for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static) |
333 |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
334 |
|
|
reg1=0.; |
335 |
|
|
reg2=0.; |
336 |
|
|
reg3=0.; |
337 |
|
|
#pragma ivdep |
338 |
|
|
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
339 |
|
|
ic=3*(A->pattern->index[iptr]-1); |
340 |
|
|
reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic]; |
341 |
|
|
reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic]; |
342 |
|
|
reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic]; |
343 |
|
|
} |
344 |
|
|
out[ 3*ir] += alpha * reg1; |
345 |
|
|
out[1+3*ir] += alpha * reg2; |
346 |
|
|
out[2+3*ir] += alpha * reg3; |
347 |
|
|
} |
348 |
|
|
} else { |
349 |
|
|
#pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
350 |
|
|
for (ir=0;ir< A->pattern->numOutput;ir++) { |
351 |
|
|
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
352 |
|
|
for (irb=0;irb< A->row_block_size;irb++) { |
353 |
|
|
irow=irb+A->row_block_size*ir; |
354 |
|
|
reg=0.; |
355 |
|
|
#pragma ivdep |
356 |
|
|
for (icb=0;icb< A->col_block_size;icb++) { |
357 |
|
|
icol=icb+A->col_block_size*(A->pattern->index[iptr]-1); |
358 |
|
|
reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
359 |
|
|
} |
360 |
|
|
out[irow] += alpha * reg; |
361 |
|
|
} |
362 |
|
|
} |
363 |
|
|
} |
364 |
|
|
} |
365 |
|
|
} |
366 |
|
|
return; |
367 |
|
|
} |