/[escript]/trunk-mpi-branch/paso/src/SystemMatrix_MatrixVector.c
ViewVC logotype

Contents of /trunk-mpi-branch/paso/src/SystemMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1165 - (show annotations)
Thu May 24 00:45:48 2007 UTC (13 years, 5 months ago) by ksteube
File MIME type: text/plain
File size: 23013 byte(s)
Modified matrix/vector product in CSR format: for each row we
now precompute minimum index. In the loop over the non-zeros
of the row we then start at the precomputed min. This avoids
one of the if blocks and speeds up the floating point arith
quite a bit. May be able to get rid of the other if block
and speed it up further.

1 /* $Id$ */
2
3
4 /*
5 ********************************************************************************
6 * Copyright 2006 by ACcESS MNRF *
7 * *
8 * http://www.access.edu.au *
9 * Primary Business: Queensland, Australia *
10 * Licensed under the Open Software License version 3.0 *
11 * http://www.opensource.org/licenses/osl-3.0.php *
12 ********************************************************************************
13 */
14
15 /**************************************************************/
16
17 /* Paso: matrix vector product with sparse matrix */
18
19 /**************************************************************/
20
21 /* Copyrights by ACcESS Australia 2003,2004,2005 */
22 /* Author: gross@access.edu.au */
23
24 /**************************************************************/
25
26 #include "Paso.h"
27 #include "SystemMatrix.h"
28
29 /**************************************************************************/
30
31 /* raw scaled vector update operation: out = alpha * A * in + beta * out */
32
33 /* has to be called within a parallel region */
34 /* barrier synconization is performed to make sure that the input vector available */
35
36 void Paso_SystemMatrix_MatrixVector(double alpha,
37 Paso_SystemMatrix* A,
38 double* in,
39 double beta,
40 double* out) {
41
42 double *buffer0=NULL, *buffer1=NULL;
43 dim_t N;
44 Paso_MPIInfo *mpi_info=A->mpi_info;
45
46 if (A->type & MATRIX_FORMAT_CSC) {
47 if ( mpi_info->size>1) {
48 Paso_setError(SYSTEM_ERROR,"Paso_SystemMatrix_MatrixVector: CSC is not supported by MPI.");
49 return;
50 } else {
51 if (A->type & MATRIX_FORMAT_OFFSET1) {
52 Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(alpha,A,in,beta,out);
53 } else {
54 Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(alpha,A,in,beta,out);
55 }
56 }
57 } else if (A->type & MATRIX_FORMAT_TRILINOS_CRS) {
58 Paso_setError(SYSTEM_ERROR,"Paso_SystemMatrix_MatrixVector: TRILINOS is not supported with MPI.");
59 return;
60 } else {
61 if (A->type & MATRIX_FORMAT_OFFSET1) {
62 if ( mpi_info->size>1) {
63 Paso_setError(SYSTEM_ERROR,"Paso_SystemMatrix_MatrixVector: CSR with index 1 is not supported by MPI.");
64 return;
65 } else {
66 Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(alpha,A,in,beta,out);
67 }
68 } else {
69 if ( mpi_info->size>1) {
70 N=A->maxNumCols * A->col_block_size;
71 buffer0=TMPMEMALLOC(N,double);
72 buffer1=TMPMEMALLOC(N,double);
73 if (Finley_checkPtr(buffer0) || Finley_checkPtr(buffer1) ) {
74 TMPMEMFREE(buffer0);
75 TMPMEMFREE(buffer1);
76 return;
77 }
78 } else{
79 buffer0=NULL;
80 buffer1=NULL;
81 }
82 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(alpha,A,in,beta,out,buffer0,buffer1);
83 if ( mpi_info->size>1) {
84 TMPMEMFREE(buffer0);
85 TMPMEMFREE(buffer1);
86 }
87 }
88 }
89 return;
90 }
91
92 void Paso_SystemMatrix_MatrixVector_CSC_OFFSET0(double alpha,
93 Paso_SystemMatrix* A,
94 double* in,
95 double beta,
96 double* out) {
97
98 register index_t ir,icol,iptr,icb,irb,irow,ic;
99 register double reg,reg1,reg2,reg3;
100 #pragma omp barrier
101
102 if (ABS(beta)>0.) {
103 #pragma omp for private(irow) schedule(static)
104 for (irow=0;irow < A->myNumRows * A->row_block_size;irow++)
105 out[irow] *= beta;
106 } else {
107 #pragma omp for private(irow) schedule(static)
108 for (irow=0;irow < A->myNumRows * A->row_block_size;irow++)
109 out[irow] = 0;
110 }
111
112 /* do the operation: */
113 if (ABS(alpha)>0) {
114 if (A ->col_block_size==1 && A->row_block_size ==1) {
115 /* TODO: parallelize (good luck!) */
116 #pragma omp single
117 for (icol=0;icol< A->pattern->myNumOutput;++icol) {
118 #pragma ivdep
119 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
120 out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
121 }
122 }
123 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
124 /* TODO: parallelize */
125 #pragma omp single
126 for (ic=0;ic< A->pattern->myNumOutput;ic++) {
127 #pragma ivdep
128 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
129 ic=2*(A->pattern->index[iptr]);
130 out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
131 out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
132 }
133 }
134 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
135 /* TODO: parallelize */
136 #pragma omp single
137 for (ic=0;ic< A->pattern->myNumOutput;ic++) {
138 #pragma ivdep
139 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
140 ir=3*(A->pattern->index[iptr]);
141 out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
142 out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
143 out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
144 }
145 }
146 } else {
147 /* TODO: parallelize */
148 #pragma omp single
149 for (ic=0;ic< A->pattern->myNumOutput;ic++) {
150 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
151 for (irb=0;irb< A->row_block_size;irb++) {
152 irow=irb+A->row_block_size*(A->pattern->index[iptr]);
153 #pragma ivdep
154 for (icb=0;icb< A->col_block_size;icb++) {
155 icol=icb+A->col_block_size*ic;
156 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
157 }
158 }
159 }
160 }
161 }
162 }
163 return;
164 }
165
166 void Paso_SystemMatrix_MatrixVector_CSC_OFFSET1(double alpha,
167 Paso_SystemMatrix* A,
168 double* in,
169 double beta,
170 double* out) {
171
172 register index_t ir,icol,iptr,icb,irb,irow,ic;
173 register double reg,reg1,reg2,reg3;
174 #pragma omp barrier
175
176 if (ABS(beta)>0.) {
177 #pragma omp for private(irow) schedule(static)
178 for (irow=0;irow < A->myNumRows * A->row_block_size;irow++)
179 out[irow] *= beta;
180 } else {
181 #pragma omp for private(irow) schedule(static)
182 for (irow=0;irow < A->myNumRows * A->row_block_size;irow++)
183 out[irow] = 0;
184 }
185
186 /* do the operation: */
187 if (ABS(alpha)>0) {
188 if (A ->col_block_size==1 && A->row_block_size ==1) {
189 /* TODO: parallelize (good luck!) */
190 #pragma omp single
191 for (icol=0;icol< A->pattern->myNumOutput;++icol) {
192 #pragma ivdep
193 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
194 out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
195 }
196 }
197 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
198 /* TODO: parallelize */
199 #pragma omp single
200 for (ic=0;ic< A->pattern->myNumOutput;ic++) {
201 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
202 ic=2*(A->pattern->index[iptr]-1);
203 out[ 2*ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
204 out[1+2*ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
205 }
206 }
207 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
208 /* TODO: parallelize */
209 #pragma omp single
210 for (ic=0;ic< A->pattern->myNumOutput;ic++) {
211 #pragma ivdep
212 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
213 ir=3*(A->pattern->index[iptr]-1);
214 out[ 3*ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
215 out[1+3*ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
216 out[2+3*ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
217 }
218 }
219 } else {
220 /* TODO: parallelize */
221 #pragma omp single
222 for (ic=0;ic< A->pattern->myNumOutput;ic++) {
223 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
224 for (irb=0;irb< A->row_block_size;irb++) {
225 irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
226 #pragma ivdep
227 for (icb=0;icb< A->col_block_size;icb++) {
228 icol=icb+A->col_block_size*ic;
229 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
230 }
231 }
232 }
233 }
234 }
235 }
236 return;
237 }
238
239 void Paso_SystemMatrix_MatrixVector_CSR_OFFSET1(double alpha,
240 Paso_SystemMatrix* A,
241 double* in,
242 double beta,
243 double* out) {
244
245 register index_t ir,icol,iptr,icb,irb,irow,ic;
246 register double reg,reg1,reg2,reg3;
247 #pragma omp barrier
248
249 if (ABS(beta)>0.) {
250 #pragma omp for private(irow) schedule(static)
251 for (irow=0;irow < A->myNumRows * A->row_block_size;irow++)
252 out[irow] *= beta;
253 } else {
254 #pragma omp for private(irow) schedule(static)
255 for (irow=0;irow < A->myNumRows * A->row_block_size;irow++)
256 out[irow] = 0;
257 }
258 /* do the operation: */
259 if (ABS(alpha)>0) {
260 if (A ->col_block_size==1 && A->row_block_size ==1) {
261 #pragma omp for private(irow,iptr,reg) schedule(static)
262 for (irow=0;irow< A->pattern->myNumOutput;++irow) {
263 reg=0.;
264 #pragma ivdep
265 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
266 reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
267 }
268 out[irow] += alpha * reg;
269 }
270 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
271 #pragma omp for private(ir,reg1,reg2,iptr,ic) schedule(static)
272 for (ir=0;ir< A->pattern->myNumOutput;ir++) {
273 reg1=0.;
274 reg2=0.;
275 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
276 ic=2*(A->pattern->index[iptr]-1);
277 reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
278 reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
279 }
280 out[ 2*ir] += alpha * reg1;
281 out[1+2*ir] += alpha * reg2;
282 }
283 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
284 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
285 for (ir=0;ir< A->pattern->myNumOutput;ir++) {
286 reg1=0.;
287 reg2=0.;
288 reg3=0.;
289 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
290 ic=3*(A->pattern->index[iptr]-1);
291 reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
292 reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
293 reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
294 }
295 out[ 3*ir] += alpha * reg1;
296 out[1+3*ir] += alpha * reg2;
297 out[2+3*ir] += alpha * reg3;
298 }
299 } else {
300 #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
301 for (ir=0;ir< A->pattern->myNumOutput;ir++) {
302 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
303 for (irb=0;irb< A->row_block_size;irb++) {
304 irow=irb+A->row_block_size*ir;
305 reg=0.;
306 for (icb=0;icb< A->col_block_size;icb++) {
307 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
308 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
309 }
310 out[irow] += alpha * reg;
311 }
312 }
313 }
314 }
315 }
316 return;
317 }
318
319 void Paso_SystemMatrix_MatrixVector_CSR_OFFSET0(double alpha,
320 Paso_SystemMatrix* A,
321 double* in,
322 double beta,
323 double* out,
324 double* buffer0,
325 double* buffer1) {
326
327 register index_t irow;
328 Paso_MPIInfo *mpi_info=A->mpi_info;
329 #pragma omp barrier
330 if (ABS(beta)>0.) {
331 #pragma omp for private(irow) schedule(static)
332 for (irow=0;irow < A->myNumRows * A->row_block_size;++irow)
333 out[irow] *= beta;
334 } else {
335 #pragma omp for private(irow) schedule(static)
336 for (irow=0;irow < A->myNumRows * A->row_block_size;++irow)
337 out[irow] = 0;
338 }
339 /* do the operation: */
340 if (ABS(alpha)>0) {
341 if ( mpi_info->size>1) {
342 #ifdef PASO_MPI
343 {
344 MPI_Status status;
345 MPI_Request snd_rqst, rcv_rqst;
346 double *snd_buf=in, *rcv_buf=buffer0, *swap;
347 dim_t len_snd_buf= A->myNumCols * A->col_block_size;
348 dim_t len_rcv_buf=A->maxNumCols * A->col_block_size;
349 dim_t h, fromRank, toRank;
350 dim_t myRank=mpi_info->rank;
351 dim_t numProc=mpi_info->size;
352 index_t rank_of_snd_buf=mpi_info->rank;
353
354 for (h=0; h<A->pattern->numHops; ++h) {
355
356 /* start asynchronos communication */
357 if (h<A->pattern->numHops-1) {
358 fromRank=PASO_MPI_mod(myRank-(A->pattern->hop[h]),numProc);
359 toRank=PASO_MPI_mod(myRank+(A->pattern->hop[h]),numProc);
360 mpi_info->msg_tag_counter++;
361
362 #pragma omp master
363 {
364 MPI_Issend(snd_buf,len_snd_buf,MPI_DOUBLE,toRank,
365 mpi_info->msg_tag_counter,mpi_info->comm,&rcv_rqst);
366 MPI_Irecv(rcv_buf,len_rcv_buf,MPI_DOUBLE,fromRank,
367 mpi_info->msg_tag_counter,mpi_info->comm,&snd_rqst);
368 }
369 }
370 /* MVM with send buffer as input */
371 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0_P(alpha,
372 A,
373 snd_buf,
374 A->pattern->input_distribution->first_component[rank_of_snd_buf],
375 A->pattern->input_distribution->first_component[rank_of_snd_buf+1],
376 out);
377 /* wait communication to be finished */
378 if (h<A->pattern->numHops-1) {
379 #pragma omp master
380 {
381 MPI_Wait(&rcv_rqst,&status);
382 MPI_Wait(&snd_rqst,&status);
383 }
384 }
385 /* swap recieve and send buffer */
386 if (h==0) {
387 snd_buf = rcv_buf;
388 len_snd_buf = len_rcv_buf;
389 rcv_buf = buffer1;
390 } else {
391 swap = snd_buf;
392 snd_buf = rcv_buf;
393 rcv_buf = swap;
394 }
395 rank_of_snd_buf=PASO_MPI_mod(rank_of_snd_buf-(A->pattern->hop[h]),numProc);
396
397 }
398 }
399 #endif
400 } else {
401 Paso_SystemMatrix_MatrixVector_CSR_OFFSET0_S(alpha,A,in,out);
402 }
403
404 }
405 }
406
407 void Paso_SystemMatrix_MatrixVector_CSR_OFFSET0_S(
408 double alpha,
409 Paso_SystemMatrix* A,
410 double* in,
411 double* out)
412 {
413 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
414 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
415 if (A ->col_block_size==1 && A->row_block_size ==1) {
416 #pragma omp for private(irow,iptr,reg) schedule(static)
417 for (irow=0;irow< A->pattern->myNumOutput;++irow) {
418 reg=0.;
419 for (iptr=(A->pattern->ptr[irow]);iptr<(A->pattern->ptr[irow+1]); ++iptr) {
420 reg += A->val[iptr] * in[A->pattern->index[iptr]];
421 }
422 out[irow] += alpha * reg;
423 }
424 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
425 #pragma omp for private(ir,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11) schedule(static)
426 for (ir=0;ir< A->pattern->myNumOutput;ir++) {
427 reg1=0.;
428 reg2=0.;
429 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
430 ic=2*(A->pattern->index[iptr]);
431 Aiptr=iptr*4;
432 in1=in[ic];
433 in2=in[1+ic];
434 A00=A->val[Aiptr ];
435 A10=A->val[Aiptr+1];
436 A01=A->val[Aiptr+2];
437 A11=A->val[Aiptr+3];
438 reg1 += A00*in1 + A01*in2;
439 reg2 += A10*in1 + A11*in2;
440 }
441 out[ 2*ir] += alpha * reg1;
442 out[1+2*ir] += alpha * reg2;
443 }
444 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
445 #pragma omp for private(ir,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22) schedule(static)
446 for (ir=0;ir< A->pattern->myNumOutput;ir++) {
447 reg1=0.;
448 reg2=0.;
449 reg3=0.;
450 #pragma ivdep
451 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
452 ic=3*(A->pattern->index[iptr]);
453 Aiptr=iptr*9;
454 in1=in[ic];
455 in2=in[1+ic];
456 in3=in[2+ic];
457 A00=A->val[Aiptr ];
458 A10=A->val[Aiptr+1];
459 A20=A->val[Aiptr+2];
460 A01=A->val[Aiptr+3];
461 A11=A->val[Aiptr+4];
462 A21=A->val[Aiptr+5];
463 A02=A->val[Aiptr+6];
464 A12=A->val[Aiptr+7];
465 A22=A->val[Aiptr+8];
466 reg1 += A00*in1 + A01*in2 + A02*in3;
467 reg2 += A10*in1 + A11*in2 + A12*in3;
468 reg3 += A20*in1 + A21*in2 + A22*in3;
469 }
470 out[ 3*ir] += alpha * reg1;
471 out[1+3*ir] += alpha * reg2;
472 out[2+3*ir] += alpha * reg3;
473 }
474 } else {
475 #pragma omp for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
476 for (ir=0;ir< A->pattern->myNumOutput;ir++) {
477 for (iptr=A->pattern->ptr[ir];iptr<A->pattern->ptr[ir+1]; iptr++) {
478 for (irb=0;irb< A->row_block_size;irb++) {
479 irow=irb+A->row_block_size*ir;
480 reg=0.;
481 #pragma ivdep
482 for (icb=0;icb< A->col_block_size;icb++) {
483 icol=icb+A->col_block_size*(A->pattern->index[iptr]);
484 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
485 }
486 out[irow] += alpha * reg;
487 }
488 }
489 }
490 }
491 return;
492 }
493 void Paso_SystemMatrix_MatrixVector_CSR_OFFSET0_P(
494 double alpha,
495 Paso_SystemMatrix* A,
496 double* in,
497 index_t min_index, index_t max_index,
498 double* out)
499 {
500 register index_t iptr,icb,irb,irow,ic,Aiptr,itmp,irtmp,ictmp;
501 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
502 if (A ->col_block_size==1 && A->row_block_size ==1) {
503 int iptr0;
504 #pragma omp for private(irow,iptr,reg,itmp,iptr0) schedule(static)
505 for (irow=0;irow< A->pattern->myNumOutput;++irow) {
506 reg=0.;
507 #pragma ivdep
508 /* index[] is an increasing sequence, can find min first */
509 for (iptr0=(A->pattern->ptr[irow]);iptr0<(A->pattern->ptr[irow+1]); ++iptr0) {
510 itmp=A->pattern->index[iptr0];
511 if ( min_index<= itmp) { break; }
512 }
513 #pragma ivdep
514 for (iptr=iptr0;iptr<(A->pattern->ptr[irow+1]); ++iptr) {
515 itmp=A->pattern->index[iptr];
516 /* index[] is an increasing sequence, can quit loop if at max */
517 if (itmp >= max_index) { break; }
518 reg += A->val[iptr] * in[itmp-min_index];
519 }
520 out[irow] += alpha * reg;
521 }
522 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
523 int iptr0;
524 #pragma omp for private(irow,reg1,reg2,iptr,ic,Aiptr,in1,in2,A00,A10,A01,A11,itmp,iptr0) schedule(static)
525 for (irow=0;irow< A->pattern->myNumOutput;++irow) {
526 reg1=0.;
527 reg2=0.;
528 #pragma ivdep
529 /* index[] is an increasing sequence, can find min first */
530 for (iptr0=(A->pattern->ptr[irow]);iptr0<(A->pattern->ptr[irow+1]); ++iptr0) {
531 itmp=A->pattern->index[iptr0];
532 if ( min_index<= itmp) { break; }
533 }
534 #pragma ivdep
535 for (iptr=iptr0;iptr<(A->pattern->ptr[irow+1]); ++iptr) {
536 itmp=A->pattern->index[iptr];
537 /* index[] is an increasing sequence, can quit loop if at max */
538 if (itmp >= max_index) { break; }
539 ic=2*(itmp-min_index);
540 Aiptr=iptr*4;
541 in1=in[ic];
542 in2=in[1+ic];
543 A00=A->val[Aiptr ];
544 A10=A->val[Aiptr+1];
545 A01=A->val[Aiptr+2];
546 A11=A->val[Aiptr+3];
547 reg1 += A00*in1 + A01*in2;
548 reg2 += A10*in1 + A11*in2;
549 }
550 out[ 2*irow] += alpha * reg1;
551 out[1+2*irow] += alpha * reg2;
552 }
553 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
554 int iptr0;
555 #pragma omp for private(irow,reg1,reg2,reg3,iptr,ic,Aiptr,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22,itmp,iptr0) schedule(static)
556 for (irow=0;irow< A->pattern->myNumOutput;irow++) {
557 reg1=0.;
558 reg2=0.;
559 reg3=0.;
560 #pragma ivdep
561 /* index[] is an increasing sequence, can find min first */
562 for (iptr0=(A->pattern->ptr[irow]);iptr0<(A->pattern->ptr[irow+1]); ++iptr0) {
563 itmp=A->pattern->index[iptr0];
564 if ( min_index<= itmp) { break; }
565 }
566 #pragma ivdep
567 for (iptr=iptr0;iptr<(A->pattern->ptr[irow+1]); ++iptr) {
568 itmp=A->pattern->index[iptr];
569 /* index[] is an increasing sequence, can quit loop if at max */
570 if (itmp >= max_index) { break; }
571 ic=3*(itmp-min_index);
572 Aiptr=iptr*9;
573 in1=in[ic];
574 in2=in[1+ic];
575 in3=in[2+ic];
576 A00=A->val[Aiptr ];
577 A10=A->val[Aiptr+1];
578 A20=A->val[Aiptr+2];
579 A01=A->val[Aiptr+3];
580 A11=A->val[Aiptr+4];
581 A21=A->val[Aiptr+5];
582 A02=A->val[Aiptr+6];
583 A12=A->val[Aiptr+7];
584 A22=A->val[Aiptr+8];
585 reg1 += A00*in1 + A01*in2 + A02*in3;
586 reg2 += A10*in1 + A11*in2 + A12*in3;
587 reg3 += A20*in1 + A21*in2 + A22*in3;
588 }
589 out[ 3*irow] += alpha * reg1;
590 out[1+3*irow] += alpha * reg2;
591 out[2+3*irow] += alpha * reg3;
592 }
593 } else {
594 int iptr0;
595 #pragma omp for private(irow,iptr,irb,icb,irtmp,ictmp,reg,itmp,iptr0) schedule(static)
596 for (irow=0;irow< A->pattern->myNumOutput;irow++) {
597 #pragma ivdep
598 /* index[] is an increasing sequence, can find min first */
599 for (iptr0=(A->pattern->ptr[irow]);iptr0<(A->pattern->ptr[irow+1]); ++iptr0) {
600 itmp=A->pattern->index[iptr0];
601 if ( min_index<= itmp) { break; }
602 }
603 #pragma ivdep
604 for (iptr=iptr0;iptr<(A->pattern->ptr[irow+1]); ++iptr) {
605 itmp=A->pattern->index[iptr];
606 /* index[] is an increasing sequence, can quit loop if at max */
607 if (itmp >= max_index) { break; }
608 for (irb=0;irb< A->row_block_size;irb++) {
609 irtmp=irb+A->row_block_size*irow;
610 reg=0.;
611 for (icb=0;icb< A->col_block_size;icb++) {
612 ictmp=icb+A->col_block_size*(itmp-min_index);
613 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[ictmp];
614 }
615 out[irtmp] += alpha * reg;
616 }
617 }
618 }
619 }
620 return;
621 }

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.26