1 |
|
2 |
/* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */ |
3 |
|
4 |
/******************************************************* |
5 |
* |
6 |
* Copyright 2003-2007 by ACceSS MNRF |
7 |
* Copyright 2007 by University of Queensland |
8 |
* |
9 |
* http://esscc.uq.edu.au |
10 |
* Primary Business: Queensland, Australia |
11 |
* Licensed under the Open Software License version 3.0 |
12 |
* http://www.opensource.org/licenses/osl-3.0.php |
13 |
* |
14 |
*******************************************************/ |
15 |
|
16 |
/**************************************************************/ |
17 |
|
18 |
/* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */ |
19 |
|
20 |
/**************************************************************/ |
21 |
|
22 |
/* Author: gross@access.edu.au */ |
23 |
|
24 |
/**************************************************************/ |
25 |
|
26 |
#include "SparseMatrix.h" |
27 |
|
28 |
/* CSC format with offset 0*/ |
29 |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha, |
30 |
Paso_SparseMatrix* A, |
31 |
double* in, |
32 |
double beta, |
33 |
double* out) { |
34 |
|
35 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
36 |
#pragma omp barrier |
37 |
|
38 |
if (ABS(beta)>0.) { |
39 |
if (beta != 1.) { |
40 |
#pragma omp for private(irow) schedule(static) |
41 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
42 |
out[irow] *= beta; |
43 |
} |
44 |
} else { |
45 |
#pragma omp for private(irow) schedule(static) |
46 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
47 |
out[irow] = 0; |
48 |
} |
49 |
if (Paso_Pattern_isEmpty(A->pattern)) return; |
50 |
/* do the operation: */ |
51 |
if (ABS(alpha)>0) { |
52 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
53 |
/* TODO: parallelize (good luck!) */ |
54 |
#pragma omp single |
55 |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
56 |
#pragma ivdep |
57 |
for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) { |
58 |
out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol]; |
59 |
} |
60 |
} |
61 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
62 |
/* TODO: parallelize */ |
63 |
#pragma omp single |
64 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
65 |
#pragma ivdep |
66 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
67 |
ir=2*(A->pattern->index[iptr]); |
68 |
out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
69 |
out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
70 |
} |
71 |
} |
72 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
73 |
/* TODO: parallelize */ |
74 |
#pragma omp single |
75 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
76 |
#pragma ivdep |
77 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
78 |
ir=3*(A->pattern->index[iptr]); |
79 |
out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
80 |
out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
81 |
out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
82 |
} |
83 |
} |
84 |
} else { |
85 |
/* TODO: parallelize */ |
86 |
#pragma omp single |
87 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
88 |
for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) { |
89 |
for (irb=0;irb< A->row_block_size;irb++) { |
90 |
irow=irb+A->row_block_size*(A->pattern->index[iptr]); |
91 |
#pragma ivdep |
92 |
for (icb=0;icb< A->col_block_size;icb++) { |
93 |
icol=icb+A->col_block_size*ic; |
94 |
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
95 |
} |
96 |
} |
97 |
} |
98 |
} |
99 |
} |
100 |
} |
101 |
return; |
102 |
} |
103 |
|
104 |
/* CSC format with offset 1*/ |
105 |
void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha, |
106 |
Paso_SparseMatrix* A, |
107 |
double* in, |
108 |
double beta, |
109 |
double* out) { |
110 |
|
111 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
112 |
#pragma omp barrier |
113 |
|
114 |
if (ABS(beta)>0.) { |
115 |
if (beta != 1.) { |
116 |
#pragma omp for private(irow) schedule(static) |
117 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) { |
118 |
out[irow] *= beta; |
119 |
} |
120 |
} |
121 |
} else { |
122 |
#pragma omp for private(irow) schedule(static) |
123 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
124 |
out[irow] = 0; |
125 |
} |
126 |
|
127 |
/* do the operation: */ |
128 |
if (ABS(alpha)>0) { |
129 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
130 |
/* TODO: parallelize (good luck!) */ |
131 |
#pragma omp single |
132 |
for (icol=0;icol< A->pattern->numOutput;++icol) { |
133 |
#pragma ivdep |
134 |
for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) { |
135 |
out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol]; |
136 |
} |
137 |
} |
138 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
139 |
/* TODO: parallelize */ |
140 |
#pragma omp single |
141 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
142 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
143 |
ir=2*(A->pattern->index[iptr]-1); |
144 |
out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] ); |
145 |
out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] ); |
146 |
} |
147 |
} |
148 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
149 |
/* TODO: parallelize */ |
150 |
#pragma omp single |
151 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
152 |
#pragma ivdep |
153 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
154 |
ir=3*(A->pattern->index[iptr]-1); |
155 |
out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] ); |
156 |
out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] ); |
157 |
out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] ); |
158 |
} |
159 |
} |
160 |
} else { |
161 |
/* TODO: parallelize */ |
162 |
#pragma omp single |
163 |
for (ic=0;ic< A->pattern->numOutput;ic++) { |
164 |
for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) { |
165 |
for (irb=0;irb< A->row_block_size;irb++) { |
166 |
irow=irb+A->row_block_size*(A->pattern->index[iptr]-1); |
167 |
#pragma ivdep |
168 |
for (icb=0;icb< A->col_block_size;icb++) { |
169 |
icol=icb+A->col_block_size*ic; |
170 |
out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
171 |
} |
172 |
} |
173 |
} |
174 |
} |
175 |
} |
176 |
} |
177 |
return; |
178 |
} |
179 |
/* CSR format with offset 0*/ |
180 |
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha, |
181 |
Paso_SparseMatrix* A, |
182 |
double* in, |
183 |
double beta, |
184 |
double* out) |
185 |
{ |
186 |
register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr; |
187 |
register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22; |
188 |
if (ABS(beta)>0.) { |
189 |
if (beta != 1.) { |
190 |
#pragma omp for private(irow) schedule(static) |
191 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
192 |
out[irow] *= beta; |
193 |
} |
194 |
} else { |
195 |
#pragma omp for private(irow) schedule(static) |
196 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
197 |
out[irow] = 0; |
198 |
} |
199 |
if (ABS(alpha)>0) { |
200 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
201 |
#pragma omp for private(irow,iptr,reg) schedule(static) |
202 |
for (irow=0;irow< A->pattern->numOutput;++irow) { |
203 |
reg=0.; |
204 |
#pragma ivdep |
205 |
for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) { |
206 |
reg += A->val[iptr] * in[A->pattern->index[iptr]]; |
207 |
} |
208 |
out[irow] += alpha * reg; |
209 |
} |
210 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
211 |
#pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static) |
212 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
213 |
reg1=0.; |
214 |
reg2=0.; |
215 |
#pragma ivdep |
216 |
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
217 |
ic=2*(A->pattern->index[iptr]); |
218 |
Aiptr=iptr*4; |
219 |
in1=in[ic]; |
220 |
in2=in[1+ic]; |
221 |
A00=A->val[Aiptr ]; |
222 |
A10=A->val[Aiptr+1]; |
223 |
A01=A->val[Aiptr+2]; |
224 |
A11=A->val[Aiptr+3]; |
225 |
reg1 += A00*in1 + A01*in2; |
226 |
reg2 += A10*in1 + A11*in2; |
227 |
} |
228 |
out[ 2*ir] += alpha * reg1; |
229 |
out[1+2*ir] += alpha * reg2; |
230 |
} |
231 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
232 |
#pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static) |
233 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
234 |
reg1=0.; |
235 |
reg2=0.; |
236 |
reg3=0.; |
237 |
#pragma ivdep |
238 |
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
239 |
ic=3*(A->pattern->index[iptr]); |
240 |
Aiptr=iptr*9; |
241 |
in1=in[ic]; |
242 |
in2=in[1+ic]; |
243 |
in3=in[2+ic]; |
244 |
A00=A->val[Aiptr ]; |
245 |
A10=A->val[Aiptr+1]; |
246 |
A20=A->val[Aiptr+2]; |
247 |
A01=A->val[Aiptr+3]; |
248 |
A11=A->val[Aiptr+4]; |
249 |
A21=A->val[Aiptr+5]; |
250 |
A02=A->val[Aiptr+6]; |
251 |
A12=A->val[Aiptr+7]; |
252 |
A22=A->val[Aiptr+8]; |
253 |
reg1 += A00*in1 + A01*in2 + A02*in3; |
254 |
reg2 += A10*in1 + A11*in2 + A12*in3; |
255 |
reg3 += A20*in1 + A21*in2 + A22*in3; |
256 |
} |
257 |
out[ 3*ir] += alpha * reg1; |
258 |
out[1+3*ir] += alpha * reg2; |
259 |
out[2+3*ir] += alpha * reg3; |
260 |
} |
261 |
} else { |
262 |
#pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
263 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
264 |
for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) { |
265 |
for (irb=0;irb< A->row_block_size;irb++) { |
266 |
irow=irb+A->row_block_size*ir; |
267 |
reg=0.; |
268 |
#pragma ivdep |
269 |
for (icb=0;icb< A->col_block_size;icb++) { |
270 |
icol=icb+A->col_block_size*(A->pattern->index[iptr]); |
271 |
reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
272 |
} |
273 |
out[irow] += alpha * reg; |
274 |
} |
275 |
} |
276 |
} |
277 |
} |
278 |
} |
279 |
return; |
280 |
} |
281 |
/* CSR format with offset 1*/ |
282 |
void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha, |
283 |
Paso_SparseMatrix* A, |
284 |
double* in, |
285 |
double beta, |
286 |
double* out) { |
287 |
|
288 |
register index_t ir,icol,iptr,icb,irb,irow,ic; |
289 |
register double reg,reg1,reg2,reg3; |
290 |
#pragma omp barrier |
291 |
|
292 |
if (ABS(beta)>0.) { |
293 |
if (beta != 1.) { |
294 |
#pragma omp for private(irow) schedule(static) |
295 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
296 |
out[irow] *= beta; |
297 |
} |
298 |
} else { |
299 |
#pragma omp for private(irow) schedule(static) |
300 |
for (irow=0;irow < A->numRows * A->row_block_size;irow++) |
301 |
out[irow] = 0; |
302 |
} |
303 |
/* do the operation: */ |
304 |
if (ABS(alpha)>0) { |
305 |
if (A ->col_block_size==1 && A->row_block_size ==1) { |
306 |
#pragma omp for private(irow,iptr,reg) schedule(static) |
307 |
for (irow=0;irow< A->pattern->numOutput;++irow) { |
308 |
reg=0.; |
309 |
#pragma ivdep |
310 |
for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) { |
311 |
reg += A->val[iptr] * in[A->pattern->index[iptr]-1]; |
312 |
} |
313 |
out[irow] += alpha * reg; |
314 |
} |
315 |
} else if (A ->col_block_size==2 && A->row_block_size ==2) { |
316 |
#pragma omp for private(ir,reg1,reg2,iptr,ic) schedule(static) |
317 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
318 |
reg1=0.; |
319 |
reg2=0.; |
320 |
#pragma ivdep |
321 |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
322 |
ic=2*(A->pattern->index[iptr]-1); |
323 |
reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic]; |
324 |
reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic]; |
325 |
} |
326 |
out[ 2*ir] += alpha * reg1; |
327 |
out[1+2*ir] += alpha * reg2; |
328 |
} |
329 |
} else if (A ->col_block_size==3 && A->row_block_size ==3) { |
330 |
#pragma omp for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static) |
331 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
332 |
reg1=0.; |
333 |
reg2=0.; |
334 |
reg3=0.; |
335 |
#pragma ivdep |
336 |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
337 |
ic=3*(A->pattern->index[iptr]-1); |
338 |
reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic]; |
339 |
reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic]; |
340 |
reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic]; |
341 |
} |
342 |
out[ 3*ir] += alpha * reg1; |
343 |
out[1+3*ir] += alpha * reg2; |
344 |
out[2+3*ir] += alpha * reg3; |
345 |
} |
346 |
} else { |
347 |
#pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static) |
348 |
for (ir=0;ir< A->pattern->numOutput;ir++) { |
349 |
for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) { |
350 |
for (irb=0;irb< A->row_block_size;irb++) { |
351 |
irow=irb+A->row_block_size*ir; |
352 |
reg=0.; |
353 |
#pragma ivdep |
354 |
for (icb=0;icb< A->col_block_size;icb++) { |
355 |
icol=icb+A->col_block_size*(A->pattern->index[iptr]-1); |
356 |
reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol]; |
357 |
} |
358 |
out[irow] += alpha * reg; |
359 |
} |
360 |
} |
361 |
} |
362 |
} |
363 |
} |
364 |
return; |
365 |
} |