/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1628 - (hide annotations)
Fri Jul 11 13:12:46 2008 UTC (11 years, 5 months ago) by phornby
File MIME type: text/plain
File size: 15054 byte(s)

Merge in /branches/windows_from_1456_trunk_1620_merged_in branch.

You will find a preserved pre-merge trunk in tags under tags/trunk_at_1625.
That will be useful for diffing & checking on my stupidity.

Here is a list of the conflicts and their resolution at this
point in time.


=================================================================================
(LLWS == looks like white space).

finley/src/Assemble_addToSystemMatrix.c - resolve to branch - unused var. may be wrong.....
finley/src/CPPAdapter/SystemMatrixAdapter.cpp - resolve to branch - LLWS
finley/src/CPPAdapter/MeshAdapter.cpp - resolve to branch - LLWS
paso/src/PCG.c - resolve to branch - unused var fixes.
paso/src/SolverFCT.c - resolve to branch - LLWS
paso/src/FGMRES.c - resolve to branch - LLWS
paso/src/Common.h - resolve to trunk version. It's omp.h's include... not sure it's needed,
but for the sake of saftey.....
paso/src/Functions.c - resolve to branch version, indentation/tab removal and return error
on bad unimplemented Paso_FunctionCall.
paso/src/SolverFCT_solve.c - resolve to branch version, unused vars
paso/src/SparseMatrix_MatrixVector.c - resolve to branch version, unused vars.
escript/src/Utils.cpp - resloved to branch, needs WinSock2.h
escript/src/DataExpanded.cpp - resolved to branch version - LLWS
escript/src/DataFactory.cpp - resolve to branch version
=================================================================================

This currently passes tests on linux (debian), but is not checked on windows or Altix yet.

This checkin is to make a trunk I can check out for windows to do tests on it.

Known outstanding problem is in the operator=() method of exceptions
causing warning messages on the intel compilers.

May the God of doughnuts have mercy on my soul.


1 ksteube 1315
2     /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3    
4     /*******************************************************
5     *
6     * Copyright 2003-2007 by ACceSS MNRF
7     * Copyright 2007 by University of Queensland
8     *
9     * http://esscc.uq.edu.au
10     * Primary Business: Queensland, Australia
11     * Licensed under the Open Software License version 3.0
12     * http://www.opensource.org/licenses/osl-3.0.php
13     *
14     *******************************************************/
15    
16     /**************************************************************/
17    
18     /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19    
20     /**************************************************************/
21    
22     /* Author: gross@access.edu.au */
23    
24     /**************************************************************/
25    
26     #include "SparseMatrix.h"
27    
28     /* CSC format with offset 0*/
29     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha,
30     Paso_SparseMatrix* A,
31     double* in,
32     double beta,
33     double* out) {
34    
35     register index_t ir,icol,iptr,icb,irb,irow,ic;
36    
37     if (ABS(beta)>0.) {
38     if (beta != 1.) {
39 gross 1556 #pragma omp parallel for private(irow) schedule(static)
40 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
41     out[irow] *= beta;
42     }
43     } else {
44 gross 1556 #pragma omp parallel for private(irow) schedule(static)
45 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
46     out[irow] = 0;
47     }
48     if (Paso_Pattern_isEmpty(A->pattern)) return;
49     /* do the operation: */
50     if (ABS(alpha)>0) {
51     if (A ->col_block_size==1 && A->row_block_size ==1) {
52     /* TODO: parallelize (good luck!) */
53     for (icol=0;icol< A->pattern->numOutput;++icol) {
54     #pragma ivdep
55     for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
56     out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
57     }
58     }
59     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
60     /* TODO: parallelize */
61     for (ic=0;ic< A->pattern->numOutput;ic++) {
62     #pragma ivdep
63     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
64 gross 1511 ir=2*(A->pattern->index[iptr]);
65     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
66     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
67 ksteube 1315 }
68     }
69     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
70     /* TODO: parallelize */
71     for (ic=0;ic< A->pattern->numOutput;ic++) {
72     #pragma ivdep
73     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
74     ir=3*(A->pattern->index[iptr]);
75 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
76     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
77     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
78 ksteube 1315 }
79     }
80     } else {
81     /* TODO: parallelize */
82     for (ic=0;ic< A->pattern->numOutput;ic++) {
83     for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
84     for (irb=0;irb< A->row_block_size;irb++) {
85     irow=irb+A->row_block_size*(A->pattern->index[iptr]);
86     #pragma ivdep
87     for (icb=0;icb< A->col_block_size;icb++) {
88     icol=icb+A->col_block_size*ic;
89     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
90     }
91     }
92     }
93     }
94     }
95     }
96     return;
97     }
98    
99     /* CSC format with offset 1*/
100     void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha,
101     Paso_SparseMatrix* A,
102     double* in,
103     double beta,
104     double* out) {
105    
106     register index_t ir,icol,iptr,icb,irb,irow,ic;
107 phornby 1628
108 ksteube 1315 if (ABS(beta)>0.) {
109     if (beta != 1.) {
110 gross 1556 #pragma omp parallel for private(irow) schedule(static)
111 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
112     out[irow] *= beta;
113     }
114     }
115     } else {
116 gross 1556 #pragma omp parallel for private(irow) schedule(static)
117 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
118     out[irow] = 0;
119     }
120    
121     /* do the operation: */
122     if (ABS(alpha)>0) {
123     if (A ->col_block_size==1 && A->row_block_size ==1) {
124     /* TODO: parallelize (good luck!) */
125     for (icol=0;icol< A->pattern->numOutput;++icol) {
126     #pragma ivdep
127     for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
128     out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
129     }
130     }
131     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
132     /* TODO: parallelize */
133     for (ic=0;ic< A->pattern->numOutput;ic++) {
134     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
135 gross 1511 ir=2*(A->pattern->index[iptr]-1);
136     out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
137     out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
138 ksteube 1315 }
139     }
140     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
141     /* TODO: parallelize */
142     for (ic=0;ic< A->pattern->numOutput;ic++) {
143     #pragma ivdep
144     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
145     ir=3*(A->pattern->index[iptr]-1);
146 gross 1511 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
147     out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
148     out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
149 ksteube 1315 }
150     }
151     } else {
152     /* TODO: parallelize */
153     for (ic=0;ic< A->pattern->numOutput;ic++) {
154     for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
155     for (irb=0;irb< A->row_block_size;irb++) {
156     irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
157     #pragma ivdep
158     for (icb=0;icb< A->col_block_size;icb++) {
159     icol=icb+A->col_block_size*ic;
160     out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
161     }
162     }
163     }
164     }
165     }
166     }
167     return;
168     }
169     /* CSR format with offset 1*/
170     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha,
171     Paso_SparseMatrix* A,
172     double* in,
173     double beta,
174     double* out) {
175    
176     register index_t ir,icol,iptr,icb,irb,irow,ic;
177     register double reg,reg1,reg2,reg3;
178     if (ABS(beta)>0.) {
179     if (beta != 1.) {
180 gross 1556 #pragma omp parallel for private(irow) schedule(static)
181 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
182     out[irow] *= beta;
183     }
184     } else {
185 gross 1556 #pragma omp parallel for private(irow) schedule(static)
186 ksteube 1315 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
187     out[irow] = 0;
188     }
189     /* do the operation: */
190     if (ABS(alpha)>0) {
191     if (A ->col_block_size==1 && A->row_block_size ==1) {
192 gross 1556 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
193 ksteube 1315 for (irow=0;irow< A->pattern->numOutput;++irow) {
194     reg=0.;
195     #pragma ivdep
196     for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
197     reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
198     }
199     out[irow] += alpha * reg;
200     }
201     } else if (A ->col_block_size==2 && A->row_block_size ==2) {
202 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
203 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
204     reg1=0.;
205     reg2=0.;
206     #pragma ivdep
207     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
208     ic=2*(A->pattern->index[iptr]-1);
209     reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
210     reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
211     }
212     out[ 2*ir] += alpha * reg1;
213     out[1+2*ir] += alpha * reg2;
214     }
215     } else if (A ->col_block_size==3 && A->row_block_size ==3) {
216 gross 1556 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
217 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
218     reg1=0.;
219     reg2=0.;
220     reg3=0.;
221     #pragma ivdep
222     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
223     ic=3*(A->pattern->index[iptr]-1);
224     reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
225     reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
226     reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
227     }
228     out[ 3*ir] += alpha * reg1;
229     out[1+3*ir] += alpha * reg2;
230     out[2+3*ir] += alpha * reg3;
231     }
232     } else {
233 gross 1556 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
234 ksteube 1315 for (ir=0;ir< A->pattern->numOutput;ir++) {
235     for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
236     for (irb=0;irb< A->row_block_size;irb++) {
237     irow=irb+A->row_block_size*ir;
238     reg=0.;
239     #pragma ivdep
240     for (icb=0;icb< A->col_block_size;icb++) {
241     icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
242     reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
243     }
244     out[irow] += alpha * reg;
245     }
246     }
247     }
248     }
249     }
250     return;
251     }
252 gross 1564 /* CSR format with offset 0*/
253     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha,
254     Paso_SparseMatrix* A,
255     double* in,
256     double beta,
257     double* out)
258     {
259 gross 1570 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
260 gross 1564
261 gross 1570 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP
262     #define USE_DYNAMIC_SCHEDULING
263     #endif
264    
265 gross 1564 char* chksz_chr=NULL;
266 gross 1570 dim_t chunk_size=1;
267 gross 1564 dim_t nrow=A->numRows;
268     dim_t np, len, rest, irow, local_n, p, n_chunks;
269     np=omp_get_max_threads();
270 gross 1570 #ifdef USE_DYNAMIC_SCHEDULING
271 gross 1564 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
272     if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
273 gross 1570 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
274     n_chunks=nrow/chunk_size;
275     if (n_chunks*chunk_size<nrow) n_chunks+=1;
276     #else
277     len=nrow/np;
278     rest=nrow-len*np;
279 gross 1564 #endif
280    
281 gross 1571 #pragma omp parallel private(irow, p, local_n)
282 gross 1570 {
283     #ifdef USE_DYNAMIC_SCHEDULING
284 gross 1571 #pragma omp for schedule(dynamic,1)
285 gross 1570 for (p=0; p<n_chunks;p++) {
286     irow=chunk_size*p;
287     local_n=MIN(chunk_size,nrow-chunk_size*p);
288     #else
289 gross 1571 #pragma omp for schedule(static)
290 gross 1570 for (p=0; p<np;p++) {
291     irow=len*p+MIN(p,rest);
292     local_n=len+(p<rest ? 1 :0 );
293     #endif
294     Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
295     local_n,
296     A->row_block_size,
297     A->col_block_size,
298     &(A->pattern->ptr[irow]),
299     A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
300     #ifdef USE_DYNAMIC_SCHEDULING
301     }
302     #else
303     }
304     #endif
305 gross 1564 }
306     }
307     /* CSR format with offset 0*/
308     void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha,
309     dim_t nRows,
310     dim_t row_block_size,
311     dim_t col_block_size,
312     index_t* ptr,
313     index_t* index,
314     double* val,
315     double* in,
316     double beta,
317     double* out)
318     {
319     register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
320     register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
321     dim_t block_size;
322     if (ABS(beta)>0.) {
323     if (beta != 1.) {
324     for (irow=0;irow < nRows * row_block_size;irow++)
325     out[irow] *= beta;
326     }
327     } else {
328     for (irow=0;irow < nRows * row_block_size;irow++)
329     out[irow] = 0;
330     }
331     if (ABS(alpha)>0) {
332     if (col_block_size==1 && row_block_size ==1) {
333     for (irow=0;irow< nRows;++irow) {
334     reg=0.;
335     #pragma ivdep
336     for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
337     reg += val[iptr] * in[index[iptr]];
338     }
339     out[irow] += alpha * reg;
340     }
341     } else if (col_block_size==2 && row_block_size ==2) {
342     for (ir=0;ir< nRows;ir++) {
343     reg1=0.;
344     reg2=0.;
345     #pragma ivdep
346     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
347     ic=2*index[iptr];
348     Aiptr=iptr*4;
349     in1=in[ic];
350     in2=in[1+ic];
351     A00=val[Aiptr ];
352     A10=val[Aiptr+1];
353     A01=val[Aiptr+2];
354     A11=val[Aiptr+3];
355     reg1 += A00*in1 + A01*in2;
356     reg2 += A10*in1 + A11*in2;
357     }
358     out[ 2*ir] += alpha * reg1;
359     out[1+2*ir] += alpha * reg2;
360     }
361     } else if (col_block_size==3 && row_block_size ==3) {
362     for (ir=0;ir< nRows;ir++) {
363     reg1=0.;
364     reg2=0.;
365     reg3=0.;
366     #pragma ivdep
367     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
368     ic=3*index[iptr];
369     Aiptr=iptr*9;
370     in1=in[ic];
371     in2=in[1+ic];
372     in3=in[2+ic];
373     A00=val[Aiptr ];
374     A10=val[Aiptr+1];
375     A20=val[Aiptr+2];
376     A01=val[Aiptr+3];
377     A11=val[Aiptr+4];
378     A21=val[Aiptr+5];
379     A02=val[Aiptr+6];
380     A12=val[Aiptr+7];
381     A22=val[Aiptr+8];
382     reg1 += A00*in1 + A01*in2 + A02*in3;
383     reg2 += A10*in1 + A11*in2 + A12*in3;
384     reg3 += A20*in1 + A21*in2 + A22*in3;
385     }
386     out[ 3*ir] += alpha * reg1;
387     out[1+3*ir] += alpha * reg2;
388     out[2+3*ir] += alpha * reg3;
389     }
390     } else {
391     block_size=col_block_size*row_block_size;
392     for (ir=0;ir< nRows;ir++) {
393     for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
394     for (irb=0;irb< row_block_size;irb++) {
395     irow=irb+row_block_size*ir;
396     reg=0.;
397     #pragma ivdep
398     for (icb=0;icb< col_block_size;icb++) {
399     icol=icb+col_block_size*index[iptr];
400     reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
401     }
402     out[irow] += alpha * reg;
403     }
404     }
405     }
406     }
407     }
408     return;
409     }

  ViewVC Help
Powered by ViewVC 1.1.26