/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Contents of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1571 - (show annotations)
Sat May 24 22:28:33 2008 UTC (11 years, 6 months ago) by gross
File MIME type: text/plain
File size: 15129 byte(s)
fixes for OPenmp but PCG still does not work under openmp

1
2 /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3
4 /*******************************************************
5 *
6 * Copyright 2003-2007 by ACceSS MNRF
7 * Copyright 2007 by University of Queensland
8 *
9 * http://esscc.uq.edu.au
10 * Primary Business: Queensland, Australia
11 * Licensed under the Open Software License version 3.0
12 * http://www.opensource.org/licenses/osl-3.0.php
13 *
14 *******************************************************/
15
16 /**************************************************************/
17
18 /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19
20 /**************************************************************/
21
22 /* Author: gross@access.edu.au */
23
24 /**************************************************************/
25
26 #include "SparseMatrix.h"
27
28 /* CSC format with offset 0*/
29 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha,
30 Paso_SparseMatrix* A,
31 double* in,
32 double beta,
33 double* out) {
34
35 register index_t ir,icol,iptr,icb,irb,irow,ic;
36 register double reg,reg1,reg2,reg3;
37
38 if (ABS(beta)>0.) {
39 if (beta != 1.) {
40 #pragma omp parallel for private(irow) schedule(static)
41 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
42 out[irow] *= beta;
43 }
44 } else {
45 #pragma omp parallel for private(irow) schedule(static)
46 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
47 out[irow] = 0;
48 }
49 if (Paso_Pattern_isEmpty(A->pattern)) return;
50 /* do the operation: */
51 if (ABS(alpha)>0) {
52 if (A ->col_block_size==1 && A->row_block_size ==1) {
53 /* TODO: parallelize (good luck!) */
54 for (icol=0;icol< A->pattern->numOutput;++icol) {
55 #pragma ivdep
56 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
57 out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
58 }
59 }
60 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
61 /* TODO: parallelize */
62 for (ic=0;ic< A->pattern->numOutput;ic++) {
63 #pragma ivdep
64 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
65 ir=2*(A->pattern->index[iptr]);
66 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
67 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
68 }
69 }
70 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
71 /* TODO: parallelize */
72 for (ic=0;ic< A->pattern->numOutput;ic++) {
73 #pragma ivdep
74 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
75 ir=3*(A->pattern->index[iptr]);
76 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
77 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
78 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
79 }
80 }
81 } else {
82 /* TODO: parallelize */
83 for (ic=0;ic< A->pattern->numOutput;ic++) {
84 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
85 for (irb=0;irb< A->row_block_size;irb++) {
86 irow=irb+A->row_block_size*(A->pattern->index[iptr]);
87 #pragma ivdep
88 for (icb=0;icb< A->col_block_size;icb++) {
89 icol=icb+A->col_block_size*ic;
90 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
91 }
92 }
93 }
94 }
95 }
96 }
97 return;
98 }
99
100 /* CSC format with offset 1*/
101 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha,
102 Paso_SparseMatrix* A,
103 double* in,
104 double beta,
105 double* out) {
106
107 register index_t ir,icol,iptr,icb,irb,irow,ic;
108 register double reg,reg1,reg2,reg3;
109 if (ABS(beta)>0.) {
110 if (beta != 1.) {
111 #pragma omp parallel for private(irow) schedule(static)
112 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
113 out[irow] *= beta;
114 }
115 }
116 } else {
117 #pragma omp parallel for private(irow) schedule(static)
118 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
119 out[irow] = 0;
120 }
121
122 /* do the operation: */
123 if (ABS(alpha)>0) {
124 if (A ->col_block_size==1 && A->row_block_size ==1) {
125 /* TODO: parallelize (good luck!) */
126 for (icol=0;icol< A->pattern->numOutput;++icol) {
127 #pragma ivdep
128 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
129 out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
130 }
131 }
132 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
133 /* TODO: parallelize */
134 for (ic=0;ic< A->pattern->numOutput;ic++) {
135 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
136 ir=2*(A->pattern->index[iptr]-1);
137 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
138 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
139 }
140 }
141 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
142 /* TODO: parallelize */
143 for (ic=0;ic< A->pattern->numOutput;ic++) {
144 #pragma ivdep
145 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
146 ir=3*(A->pattern->index[iptr]-1);
147 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
148 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
149 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
150 }
151 }
152 } else {
153 /* TODO: parallelize */
154 for (ic=0;ic< A->pattern->numOutput;ic++) {
155 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
156 for (irb=0;irb< A->row_block_size;irb++) {
157 irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
158 #pragma ivdep
159 for (icb=0;icb< A->col_block_size;icb++) {
160 icol=icb+A->col_block_size*ic;
161 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
162 }
163 }
164 }
165 }
166 }
167 }
168 return;
169 }
170 /* CSR format with offset 1*/
171 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha,
172 Paso_SparseMatrix* A,
173 double* in,
174 double beta,
175 double* out) {
176
177 register index_t ir,icol,iptr,icb,irb,irow,ic;
178 register double reg,reg1,reg2,reg3;
179 if (ABS(beta)>0.) {
180 if (beta != 1.) {
181 #pragma omp parallel for private(irow) schedule(static)
182 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
183 out[irow] *= beta;
184 }
185 } else {
186 #pragma omp parallel for private(irow) schedule(static)
187 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
188 out[irow] = 0;
189 }
190 /* do the operation: */
191 if (ABS(alpha)>0) {
192 if (A ->col_block_size==1 && A->row_block_size ==1) {
193 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
194 for (irow=0;irow< A->pattern->numOutput;++irow) {
195 reg=0.;
196 #pragma ivdep
197 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
198 reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
199 }
200 out[irow] += alpha * reg;
201 }
202 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
203 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
204 for (ir=0;ir< A->pattern->numOutput;ir++) {
205 reg1=0.;
206 reg2=0.;
207 #pragma ivdep
208 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
209 ic=2*(A->pattern->index[iptr]-1);
210 reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
211 reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
212 }
213 out[ 2*ir] += alpha * reg1;
214 out[1+2*ir] += alpha * reg2;
215 }
216 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
217 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
218 for (ir=0;ir< A->pattern->numOutput;ir++) {
219 reg1=0.;
220 reg2=0.;
221 reg3=0.;
222 #pragma ivdep
223 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
224 ic=3*(A->pattern->index[iptr]-1);
225 reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
226 reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
227 reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
228 }
229 out[ 3*ir] += alpha * reg1;
230 out[1+3*ir] += alpha * reg2;
231 out[2+3*ir] += alpha * reg3;
232 }
233 } else {
234 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
235 for (ir=0;ir< A->pattern->numOutput;ir++) {
236 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
237 for (irb=0;irb< A->row_block_size;irb++) {
238 irow=irb+A->row_block_size*ir;
239 reg=0.;
240 #pragma ivdep
241 for (icb=0;icb< A->col_block_size;icb++) {
242 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
243 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
244 }
245 out[irow] += alpha * reg;
246 }
247 }
248 }
249 }
250 }
251 return;
252 }
253 /* CSR format with offset 0*/
254 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha,
255 Paso_SparseMatrix* A,
256 double* in,
257 double beta,
258 double* out)
259 {
260 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
261
262 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP
263 #define USE_DYNAMIC_SCHEDULING
264 #endif
265
266 char* chksz_chr=NULL;
267 dim_t chunk_size=1;
268 dim_t nrow=A->numRows;
269 dim_t np, len, rest, irow, local_n, p, n_chunks;
270 np=omp_get_max_threads();
271 #ifdef USE_DYNAMIC_SCHEDULING
272 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
273 if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
274 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
275 n_chunks=nrow/chunk_size;
276 if (n_chunks*chunk_size<nrow) n_chunks+=1;
277 #else
278 len=nrow/np;
279 rest=nrow-len*np;
280 #endif
281
282 #pragma omp parallel private(irow, p, local_n)
283 {
284 #ifdef USE_DYNAMIC_SCHEDULING
285 #pragma omp for schedule(dynamic,1)
286 for (p=0; p<n_chunks;p++) {
287 irow=chunk_size*p;
288 local_n=MIN(chunk_size,nrow-chunk_size*p);
289 #else
290 #pragma omp for schedule(static)
291 for (p=0; p<np;p++) {
292 irow=len*p+MIN(p,rest);
293 local_n=len+(p<rest ? 1 :0 );
294 #endif
295 Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
296 local_n,
297 A->row_block_size,
298 A->col_block_size,
299 &(A->pattern->ptr[irow]),
300 A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
301 #ifdef USE_DYNAMIC_SCHEDULING
302 }
303 #else
304 }
305 #endif
306 }
307 }
308 /* CSR format with offset 0*/
309 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha,
310 dim_t nRows,
311 dim_t row_block_size,
312 dim_t col_block_size,
313 index_t* ptr,
314 index_t* index,
315 double* val,
316 double* in,
317 double beta,
318 double* out)
319 {
320 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
321 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
322 dim_t block_size;
323 if (ABS(beta)>0.) {
324 if (beta != 1.) {
325 for (irow=0;irow < nRows * row_block_size;irow++)
326 out[irow] *= beta;
327 }
328 } else {
329 for (irow=0;irow < nRows * row_block_size;irow++)
330 out[irow] = 0;
331 }
332 if (ABS(alpha)>0) {
333 if (col_block_size==1 && row_block_size ==1) {
334 for (irow=0;irow< nRows;++irow) {
335 reg=0.;
336 #pragma ivdep
337 for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
338 reg += val[iptr] * in[index[iptr]];
339 }
340 out[irow] += alpha * reg;
341 }
342 } else if (col_block_size==2 && row_block_size ==2) {
343 for (ir=0;ir< nRows;ir++) {
344 reg1=0.;
345 reg2=0.;
346 #pragma ivdep
347 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
348 ic=2*index[iptr];
349 Aiptr=iptr*4;
350 in1=in[ic];
351 in2=in[1+ic];
352 A00=val[Aiptr ];
353 A10=val[Aiptr+1];
354 A01=val[Aiptr+2];
355 A11=val[Aiptr+3];
356 reg1 += A00*in1 + A01*in2;
357 reg2 += A10*in1 + A11*in2;
358 }
359 out[ 2*ir] += alpha * reg1;
360 out[1+2*ir] += alpha * reg2;
361 }
362 } else if (col_block_size==3 && row_block_size ==3) {
363 for (ir=0;ir< nRows;ir++) {
364 reg1=0.;
365 reg2=0.;
366 reg3=0.;
367 #pragma ivdep
368 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
369 ic=3*index[iptr];
370 Aiptr=iptr*9;
371 in1=in[ic];
372 in2=in[1+ic];
373 in3=in[2+ic];
374 A00=val[Aiptr ];
375 A10=val[Aiptr+1];
376 A20=val[Aiptr+2];
377 A01=val[Aiptr+3];
378 A11=val[Aiptr+4];
379 A21=val[Aiptr+5];
380 A02=val[Aiptr+6];
381 A12=val[Aiptr+7];
382 A22=val[Aiptr+8];
383 reg1 += A00*in1 + A01*in2 + A02*in3;
384 reg2 += A10*in1 + A11*in2 + A12*in3;
385 reg3 += A20*in1 + A21*in2 + A22*in3;
386 }
387 out[ 3*ir] += alpha * reg1;
388 out[1+3*ir] += alpha * reg2;
389 out[2+3*ir] += alpha * reg3;
390 }
391 } else {
392 block_size=col_block_size*row_block_size;
393 for (ir=0;ir< nRows;ir++) {
394 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
395 for (irb=0;irb< row_block_size;irb++) {
396 irow=irb+row_block_size*ir;
397 reg=0.;
398 #pragma ivdep
399 for (icb=0;icb< col_block_size;icb++) {
400 icol=icb+col_block_size*index[iptr];
401 reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
402 }
403 out[irow] += alpha * reg;
404 }
405 }
406 }
407 }
408 }
409 return;
410 }

  ViewVC Help
Powered by ViewVC 1.1.26