/[escript]/trunk/paso/src/SystemMatrix_MatrixVector.c
ViewVC logotype

Contents of /trunk/paso/src/SystemMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1098 - (show annotations)
Mon Apr 16 23:15:23 2007 UTC (12 years, 7 months ago) by gross
File MIME type: text/plain
File size: 14129 byte(s)
add a few #pragma ivdep which should speed up MV but cannot confiirm this on Pentium




1 /* $Id$ */
2
3
4 /*
5 ********************************************************************************
6 * Copyright 2006 by ACcESS MNRF *
7 * *
8 * http://www.access.edu.au *
9 * Primary Business: Queensland, Australia *
10 * Licensed under the Open Software License version 3.0 *
11 * http://www.opensource.org/licenses/osl-3.0.php *
12 ********************************************************************************
13 */
14
15 /**************************************************************/
16
17 /* Paso: matrix vector product with sparse matrix */
18
19 /**************************************************************/
20
21 /* Copyrights by ACcESS Australia 2003,2004,2005 */
22 /* Author: gross@access.edu.au */
23
24 /**************************************************************/
25
26 #include "Paso.h"
27 #include "SystemMatrix.h"
28
29 /**************************************************************************/
30
31 /* raw scaled vector update operation: out = alpha * A * in + beta * out */
32
33 /* has to be called within a parallel region */
34 /* barrier synconization is performed to make sure that the input vector available */
35
36 void Paso_SystemMatrix_MatrixVector(double alpha,
37 Paso_SystemMatrix* A,
38 double* in,
39 double beta,
40 double* out) {
41
42 if (A->type & MATRIX_FORMAT_CSC) {
43 if (A->type & MATRIX_FORMAT_OFFSET1) {
44 Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(alpha,A,in,beta,out);
45 } else {
46 Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(alpha,A,in,beta,out);
47 }
48 } else {
49 if (A->type & MATRIX_FORMAT_OFFSET1) {
50 Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(alpha,A,in,beta,out);
51 } else {
52 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(alpha,A,in,beta,out);
53 }
54 }
55 return;
56 }
57
58 void Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(double alpha,
59 Paso_SystemMatrix* A,
60 double* in,
61 double beta,
62 double* out) {
63
64 register index_t ir,icol,iptr,icb,irb,irow,ic;
65 register double reg,reg1,reg2,reg3;
66 #pragma omp barrier
67
68 if (ABS(beta)>0.) {
69 #pragma omp for private(irow) schedule(static)
70 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
71 out[irow] *= beta;
72 } else {
73 #pragma omp for private(irow) schedule(static)
74 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
75 out[irow] = 0;
76 }
77
78 /* do the operation: */
79 if (ABS(alpha)>0) {
80 if (A ->col_block_size==1 && A->row_block_size ==1) {
81 /* TODO: parallelize (good luck!) */
82 #pragma omp single
83 for (icol=0;icol< A->pattern->n_ptr;++icol) {
84 #pragma ivdep
85 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
86 out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
87 }
88 }
89 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
90 /* TODO: parallelize */
91 #pragma omp single
92 for (ic=0;ic< A->pattern->n_ptr;ic++) {
93 #pragma ivdep
94 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
95 ic=2*(A->pattern->index[iptr]);
96 out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
97 out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
98 }
99 }
100 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
101 /* TODO: parallelize */
102 #pragma omp single
103 for (ic=0;ic< A->pattern->n_ptr;ic++) {
104 #pragma ivdep
105 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
106 ir=3*(A->pattern->index[iptr]);
107 out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
108 out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
109 out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
110 }
111 }
112 } else {
113 /* TODO: parallelize */
114 #pragma omp single
115 for (ic=0;ic< A->pattern->n_ptr;ic++) {
116 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
117 for (irb=0;irb< A->row_block_size;irb++) {
118 irow=irb+A->row_block_size*(A->pattern->index[iptr]);
119 #pragma ivdep
120 for (icb=0;icb< A->col_block_size;icb++) {
121 icol=icb+A->col_block_size*ic;
122 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
123 }
124 }
125 }
126 }
127 }
128 }
129 return;
130 }
131
132 void Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(double alpha,
133 Paso_SystemMatrix* A,
134 double* in,
135 double beta,
136 double* out) {
137
138 register index_t ir,icol,iptr,icb,irb,irow,ic;
139 register double reg,reg1,reg2,reg3;
140 #pragma omp barrier
141
142 if (ABS(beta)>0.) {
143 #pragma omp for private(irow) schedule(static)
144 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
145 out[irow] *= beta;
146 } else {
147 #pragma omp for private(irow) schedule(static)
148 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
149 out[irow] = 0;
150 }
151
152 /* do the operation: */
153 if (ABS(alpha)>0) {
154 if (A ->col_block_size==1 && A->row_block_size ==1) {
155 /* TODO: parallelize (good luck!) */
156 #pragma omp single
157 for (icol=0;icol< A->pattern->n_ptr;++icol) {
158 #pragma ivdep
159 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
160 out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
161 }
162 }
163 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
164 /* TODO: parallelize */
165 #pragma omp single
166 for (ic=0;ic< A->pattern->n_ptr;ic++) {
167 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
168 ic=2*(A->pattern->index[iptr]-1);
169 out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
170 out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
171 }
172 }
173 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
174 /* TODO: parallelize */
175 #pragma omp single
176 for (ic=0;ic< A->pattern->n_ptr;ic++) {
177 #pragma ivdep
178 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
179 ir=3*(A->pattern->index[iptr]-1);
180 out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
181 out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
182 out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
183 }
184 }
185 } else {
186 /* TODO: parallelize */
187 #pragma omp single
188 for (ic=0;ic< A->pattern->n_ptr;ic++) {
189 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
190 for (irb=0;irb< A->row_block_size;irb++) {
191 irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
192 #pragma ivdep
193 for (icb=0;icb< A->col_block_size;icb++) {
194 icol=icb+A->col_block_size*ic;
195 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
196 }
197 }
198 }
199 }
200 }
201 }
202 return;
203 }
204
205 void Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(double alpha,
206 Paso_SystemMatrix* A,
207 double* in,
208 double beta,
209 double* out) {
210
211 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
212 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
213 #pragma omp barrier
214 if (ABS(beta)>0.) {
215 #pragma omp for private(irow) schedule(static)
216 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
217 out[irow] *= beta;
218 } else {
219 #pragma omp for private(irow) schedule(static)
220 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
221 out[irow] = 0;
222 }
223 /* do the operation: */
224 if (ABS(alpha)>0) {
225 if (A ->col_block_size==1 && A->row_block_size ==1) {
226 #pragma omp for private(irow,iptr,reg) schedule(static)
227 for (irow=0;irow< A->pattern->n_ptr;++irow) {
228 reg=0.;
229 #pragma ivdep
230 for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) {
231 reg += A->val[iptr] * in[A->pattern->index[iptr]];
232 }
233 out[irow] += alpha * reg;
234 }
235 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
236 #pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static)
237 for (ir=0;ir< A->pattern->n_ptr;ir++) {
238 reg1=0.;
239 reg2=0.;
240 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
241 ic=2*(A->pattern->index[iptr]);
242 Aiptr=iptr*4;
243 in1=in[ic];
244 in2=in[1+ic];
245 A00=A->val[Aiptr ];
246 A10=A->val[Aiptr+1];
247 A01=A->val[Aiptr+2];
248 A11=A->val[Aiptr+3];
249 reg1 += A00*in1 + A01*in2;
250 reg2 += A10*in1 + A11*in2;
251 }
252 out[ 2*ir] += alpha * reg1;
253 out[1+2*ir] += alpha * reg2;
254 }
255 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
256 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static)
257 for (ir=0;ir< A->pattern->n_ptr;ir++) {
258 reg1=0.;
259 reg2=0.;
260 reg3=0.;
261 #pragma ivdep
262 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
263 ic=3*(A->pattern->index[iptr]);
264 Aiptr=iptr*9;
265 in1=in[ic];
266 in2=in[1+ic];
267 in3=in[2+ic];
268 A00=A->val[Aiptr ];
269 A10=A->val[Aiptr+1];
270 A20=A->val[Aiptr+2];
271 A01=A->val[Aiptr+3];
272 A11=A->val[Aiptr+4];
273 A21=A->val[Aiptr+5];
274 A02=A->val[Aiptr+6];
275 A12=A->val[Aiptr+7];
276 A22=A->val[Aiptr+8];
277 reg1 += A00*in1 + A01*in2 + A02*in3;
278 reg2 += A10*in1 + A11*in2 + A12*in3;
279 reg3 += A20*in1 + A21*in2 + A22*in3;
280 }
281 out[ 3*ir] += alpha * reg1;
282 out[1+3*ir] += alpha * reg2;
283 out[2+3*ir] += alpha * reg3;
284 }
285 } else {
286 #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
287 for (ir=0;ir< A->pattern->n_ptr;ir++) {
288 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
289 for (irb=0;irb< A->row_block_size;irb++) {
290 irow=irb+A->row_block_size*ir;
291 reg=0.;
292 #pragma ivdep
293 for (icb=0;icb< A->col_block_size;icb++) {
294 icol=icb+A->col_block_size*(A->pattern->index[iptr]);
295 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
296 }
297 out[irow] += alpha * reg;
298 }
299 }
300 }
301 }
302 }
303 return;
304 }
305
306 void Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(double alpha,
307 Paso_SystemMatrix* A,
308 double* in,
309 double beta,
310 double* out) {
311
312 register index_t ir,icol,iptr,icb,irb,irow,ic;
313 register double reg,reg1,reg2,reg3;
314 #pragma omp barrier
315
316 if (ABS(beta)>0.) {
317 #pragma omp for private(irow) schedule(static)
318 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
319 out[irow] *= beta;
320 } else {
321 #pragma omp for private(irow) schedule(static)
322 for (irow=0;irow < A->num_rows * A->row_block_size;irow++)
323 out[irow] = 0;
324 }
325 /* do the operation: */
326 if (ABS(alpha)>0) {
327 if (A ->col_block_size==1 && A->row_block_size ==1) {
328 #pragma omp for private(irow,iptr,reg) schedule(static)
329 for (irow=0;irow< A->pattern->n_ptr;++irow) {
330 reg=0.;
331 #pragma ivdep
332 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
333 reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
334 }
335 out[irow] += alpha * reg;
336 }
337 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
338 #pragma omp for private(ir,reg1,reg2,iptr,ic) schedule(static)
339 for (ir=0;ir< A->pattern->n_ptr;ir++) {
340 reg1=0.;
341 reg2=0.;
342 #pragma ivdep
343 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
344 ic=2*(A->pattern->index[iptr]-1);
345 reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
346 reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
347 }
348 out[ 2*ir] += alpha * reg1;
349 out[1+2*ir] += alpha * reg2;
350 }
351 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
352 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
353 for (ir=0;ir< A->pattern->n_ptr;ir++) {
354 reg1=0.;
355 reg2=0.;
356 reg3=0.;
357 #pragma ivdep
358 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
359 ic=3*(A->pattern->index[iptr]-1);
360 reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
361 reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
362 reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
363 }
364 out[ 3*ir] += alpha * reg1;
365 out[1+3*ir] += alpha * reg2;
366 out[2+3*ir] += alpha * reg3;
367 }
368 } else {
369 #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
370 for (ir=0;ir< A->pattern->n_ptr;ir++) {
371 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
372 for (irb=0;irb< A->row_block_size;irb++) {
373 irow=irb+A->row_block_size*ir;
374 reg=0.;
375 #pragma ivdep
376 for (icb=0;icb< A->col_block_size;icb++) {
377 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
378 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
379 }
380 out[irow] += alpha * reg;
381 }
382 }
383 }
384 }
385 }
386 return;
387 }
388 /*
389 * $Log$
390 * Revision 1.2 2005/09/15 03:44:39 jgs
391 * Merge of development branch dev-02 back to main trunk on 2005-09-15
392 *
393 * Revision 1.1.2.1 2005/09/05 06:29:47 gross
394 * These files have been extracted from finley to define a stand alone libray for iterative
395 * linear solvers on the ALTIX. main entry through Paso_solve. this version compiles but
396 * has not been tested yet.
397 *
398 *
399 */

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26