/[escript]/trunk/paso/src/SparseMatrix_MatrixVector.c
ViewVC logotype

Contents of /trunk/paso/src/SparseMatrix_MatrixVector.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1628 - (show annotations)
Fri Jul 11 13:12:46 2008 UTC (11 years, 6 months ago) by phornby
File MIME type: text/plain
File size: 15054 byte(s)

Merge in /branches/windows_from_1456_trunk_1620_merged_in branch.

You will find a preserved pre-merge trunk in tags under tags/trunk_at_1625.
That will be useful for diffing & checking on my stupidity.

Here is a list of the conflicts and their resolution at this
point in time.


=================================================================================
(LLWS == looks like white space).

finley/src/Assemble_addToSystemMatrix.c - resolve to branch - unused var. may be wrong.....
finley/src/CPPAdapter/SystemMatrixAdapter.cpp - resolve to branch - LLWS
finley/src/CPPAdapter/MeshAdapter.cpp - resolve to branch - LLWS
paso/src/PCG.c - resolve to branch - unused var fixes.
paso/src/SolverFCT.c - resolve to branch - LLWS
paso/src/FGMRES.c - resolve to branch - LLWS
paso/src/Common.h - resolve to trunk version. It's omp.h's include... not sure it's needed,
but for the sake of saftey.....
paso/src/Functions.c - resolve to branch version, indentation/tab removal and return error
on bad unimplemented Paso_FunctionCall.
paso/src/SolverFCT_solve.c - resolve to branch version, unused vars
paso/src/SparseMatrix_MatrixVector.c - resolve to branch version, unused vars.
escript/src/Utils.cpp - resloved to branch, needs WinSock2.h
escript/src/DataExpanded.cpp - resolved to branch version - LLWS
escript/src/DataFactory.cpp - resolve to branch version
=================================================================================

This currently passes tests on linux (debian), but is not checked on windows or Altix yet.

This checkin is to make a trunk I can check out for windows to do tests on it.

Known outstanding problem is in the operator=() method of exceptions
causing warning messages on the intel compilers.

May the God of doughnuts have mercy on my soul.


1
2 /* $Id: SparseMatrix_MatrixVector.c 1306 2007-09-18 05:51:09Z ksteube $ */
3
4 /*******************************************************
5 *
6 * Copyright 2003-2007 by ACceSS MNRF
7 * Copyright 2007 by University of Queensland
8 *
9 * http://esscc.uq.edu.au
10 * Primary Business: Queensland, Australia
11 * Licensed under the Open Software License version 3.0
12 * http://www.opensource.org/licenses/osl-3.0.php
13 *
14 *******************************************************/
15
16 /**************************************************************/
17
18 /* Paso: raw scaled vector update operation: out = alpha * A * in + beta * out */
19
20 /**************************************************************/
21
22 /* Author: gross@access.edu.au */
23
24 /**************************************************************/
25
26 #include "SparseMatrix.h"
27
28 /* CSC format with offset 0*/
29 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET0(double alpha,
30 Paso_SparseMatrix* A,
31 double* in,
32 double beta,
33 double* out) {
34
35 register index_t ir,icol,iptr,icb,irb,irow,ic;
36
37 if (ABS(beta)>0.) {
38 if (beta != 1.) {
39 #pragma omp parallel for private(irow) schedule(static)
40 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
41 out[irow] *= beta;
42 }
43 } else {
44 #pragma omp parallel for private(irow) schedule(static)
45 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
46 out[irow] = 0;
47 }
48 if (Paso_Pattern_isEmpty(A->pattern)) return;
49 /* do the operation: */
50 if (ABS(alpha)>0) {
51 if (A ->col_block_size==1 && A->row_block_size ==1) {
52 /* TODO: parallelize (good luck!) */
53 for (icol=0;icol< A->pattern->numOutput;++icol) {
54 #pragma ivdep
55 for (iptr=A->pattern->ptr[icol];iptr<A->pattern->ptr[icol+1]; ++iptr) {
56 out[A->pattern->index[iptr]]+= alpha * A->val[iptr] * in[icol];
57 }
58 }
59 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
60 /* TODO: parallelize */
61 for (ic=0;ic< A->pattern->numOutput;ic++) {
62 #pragma ivdep
63 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
64 ir=2*(A->pattern->index[iptr]);
65 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
66 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
67 }
68 }
69 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
70 /* TODO: parallelize */
71 for (ic=0;ic< A->pattern->numOutput;ic++) {
72 #pragma ivdep
73 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
74 ir=3*(A->pattern->index[iptr]);
75 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
76 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
77 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
78 }
79 }
80 } else {
81 /* TODO: parallelize */
82 for (ic=0;ic< A->pattern->numOutput;ic++) {
83 for (iptr=A->pattern->ptr[ic];iptr<A->pattern->ptr[ic+1]; iptr++) {
84 for (irb=0;irb< A->row_block_size;irb++) {
85 irow=irb+A->row_block_size*(A->pattern->index[iptr]);
86 #pragma ivdep
87 for (icb=0;icb< A->col_block_size;icb++) {
88 icol=icb+A->col_block_size*ic;
89 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
90 }
91 }
92 }
93 }
94 }
95 }
96 return;
97 }
98
99 /* CSC format with offset 1*/
100 void Paso_SparseMatrix_MatrixVector_CSC_OFFSET1(double alpha,
101 Paso_SparseMatrix* A,
102 double* in,
103 double beta,
104 double* out) {
105
106 register index_t ir,icol,iptr,icb,irb,irow,ic;
107
108 if (ABS(beta)>0.) {
109 if (beta != 1.) {
110 #pragma omp parallel for private(irow) schedule(static)
111 for (irow=0;irow < A->numRows * A->row_block_size;irow++) {
112 out[irow] *= beta;
113 }
114 }
115 } else {
116 #pragma omp parallel for private(irow) schedule(static)
117 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
118 out[irow] = 0;
119 }
120
121 /* do the operation: */
122 if (ABS(alpha)>0) {
123 if (A ->col_block_size==1 && A->row_block_size ==1) {
124 /* TODO: parallelize (good luck!) */
125 for (icol=0;icol< A->pattern->numOutput;++icol) {
126 #pragma ivdep
127 for (iptr=A->pattern->ptr[icol]-1;iptr<A->pattern->ptr[icol+1]-1; ++iptr) {
128 out[A->pattern->index[iptr]-1]+= alpha * A->val[iptr] * in[icol];
129 }
130 }
131 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
132 /* TODO: parallelize */
133 for (ic=0;ic< A->pattern->numOutput;ic++) {
134 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
135 ir=2*(A->pattern->index[iptr]-1);
136 out[ ir] += alpha * ( A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic] );
137 out[1+ir] += alpha * ( A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic] );
138 }
139 }
140 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
141 /* TODO: parallelize */
142 for (ic=0;ic< A->pattern->numOutput;ic++) {
143 #pragma ivdep
144 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
145 ir=3*(A->pattern->index[iptr]-1);
146 out[ ir] += alpha * ( A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic] );
147 out[1+ir] += alpha * ( A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic] );
148 out[2+ir] += alpha * ( A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic] );
149 }
150 }
151 } else {
152 /* TODO: parallelize */
153 for (ic=0;ic< A->pattern->numOutput;ic++) {
154 for (iptr=A->pattern->ptr[ic]-1;iptr<A->pattern->ptr[ic+1]-1; iptr++) {
155 for (irb=0;irb< A->row_block_size;irb++) {
156 irow=irb+A->row_block_size*(A->pattern->index[iptr]-1);
157 #pragma ivdep
158 for (icb=0;icb< A->col_block_size;icb++) {
159 icol=icb+A->col_block_size*ic;
160 out[irow] += alpha * A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
161 }
162 }
163 }
164 }
165 }
166 }
167 return;
168 }
169 /* CSR format with offset 1*/
170 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET1(double alpha,
171 Paso_SparseMatrix* A,
172 double* in,
173 double beta,
174 double* out) {
175
176 register index_t ir,icol,iptr,icb,irb,irow,ic;
177 register double reg,reg1,reg2,reg3;
178 if (ABS(beta)>0.) {
179 if (beta != 1.) {
180 #pragma omp parallel for private(irow) schedule(static)
181 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
182 out[irow] *= beta;
183 }
184 } else {
185 #pragma omp parallel for private(irow) schedule(static)
186 for (irow=0;irow < A->numRows * A->row_block_size;irow++)
187 out[irow] = 0;
188 }
189 /* do the operation: */
190 if (ABS(alpha)>0) {
191 if (A ->col_block_size==1 && A->row_block_size ==1) {
192 #pragma omp parallel for private(irow,iptr,reg) schedule(static)
193 for (irow=0;irow< A->pattern->numOutput;++irow) {
194 reg=0.;
195 #pragma ivdep
196 for (iptr=(A->pattern->ptr[irow])-1;iptr<(A->pattern->ptr[irow+1])-1; ++iptr) {
197 reg += A->val[iptr] * in[A->pattern->index[iptr]-1];
198 }
199 out[irow] += alpha * reg;
200 }
201 } else if (A ->col_block_size==2 && A->row_block_size ==2) {
202 #pragma omp parallel for private(ir,reg1,reg2,iptr,ic) schedule(static)
203 for (ir=0;ir< A->pattern->numOutput;ir++) {
204 reg1=0.;
205 reg2=0.;
206 #pragma ivdep
207 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
208 ic=2*(A->pattern->index[iptr]-1);
209 reg1 += A->val[iptr*4 ]*in[ic] + A->val[iptr*4+2]*in[1+ic];
210 reg2 += A->val[iptr*4+1]*in[ic] + A->val[iptr*4+3]*in[1+ic];
211 }
212 out[ 2*ir] += alpha * reg1;
213 out[1+2*ir] += alpha * reg2;
214 }
215 } else if (A ->col_block_size==3 && A->row_block_size ==3) {
216 #pragma omp parallel for private(ir,reg1,reg2,reg3,iptr,ic) schedule(static)
217 for (ir=0;ir< A->pattern->numOutput;ir++) {
218 reg1=0.;
219 reg2=0.;
220 reg3=0.;
221 #pragma ivdep
222 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
223 ic=3*(A->pattern->index[iptr]-1);
224 reg1 += A->val[iptr*9 ]*in[ic] + A->val[iptr*9+3]*in[1+ic] + A->val[iptr*9+6]*in[2+ic];
225 reg2 += A->val[iptr*9+1]*in[ic] + A->val[iptr*9+4]*in[1+ic] + A->val[iptr*9+7]*in[2+ic];
226 reg3 += A->val[iptr*9+2]*in[ic] + A->val[iptr*9+5]*in[1+ic] + A->val[iptr*9+8]*in[2+ic];
227 }
228 out[ 3*ir] += alpha * reg1;
229 out[1+3*ir] += alpha * reg2;
230 out[2+3*ir] += alpha * reg3;
231 }
232 } else {
233 #pragma omp parallel for private(ir,iptr,irb,icb,irow,icol,reg) schedule(static)
234 for (ir=0;ir< A->pattern->numOutput;ir++) {
235 for (iptr=A->pattern->ptr[ir]-1;iptr<A->pattern->ptr[ir+1]-1; iptr++) {
236 for (irb=0;irb< A->row_block_size;irb++) {
237 irow=irb+A->row_block_size*ir;
238 reg=0.;
239 #pragma ivdep
240 for (icb=0;icb< A->col_block_size;icb++) {
241 icol=icb+A->col_block_size*(A->pattern->index[iptr]-1);
242 reg += A->val[iptr*A->block_size+irb+A->row_block_size*icb] * in[icol];
243 }
244 out[irow] += alpha * reg;
245 }
246 }
247 }
248 }
249 }
250 return;
251 }
252 /* CSR format with offset 0*/
253 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0(double alpha,
254 Paso_SparseMatrix* A,
255 double* in,
256 double beta,
257 double* out)
258 {
259 /*#define PASO_DYNAMIC_SCHEDULING_MVM */
260
261 #if defined PASO_DYNAMIC_SCHEDULING_MVM && defined __OPENMP
262 #define USE_DYNAMIC_SCHEDULING
263 #endif
264
265 char* chksz_chr=NULL;
266 dim_t chunk_size=1;
267 dim_t nrow=A->numRows;
268 dim_t np, len, rest, irow, local_n, p, n_chunks;
269 np=omp_get_max_threads();
270 #ifdef USE_DYNAMIC_SCHEDULING
271 chksz_chr=getenv("PASO_CHUNK_SIZE_MVM");
272 if (chksz_chr!=NULL) sscanf(chksz_chr, "%d",&chunk_size);
273 chunk_size=MIN(MAX(1,chunk_size),nrow/np);
274 n_chunks=nrow/chunk_size;
275 if (n_chunks*chunk_size<nrow) n_chunks+=1;
276 #else
277 len=nrow/np;
278 rest=nrow-len*np;
279 #endif
280
281 #pragma omp parallel private(irow, p, local_n)
282 {
283 #ifdef USE_DYNAMIC_SCHEDULING
284 #pragma omp for schedule(dynamic,1)
285 for (p=0; p<n_chunks;p++) {
286 irow=chunk_size*p;
287 local_n=MIN(chunk_size,nrow-chunk_size*p);
288 #else
289 #pragma omp for schedule(static)
290 for (p=0; p<np;p++) {
291 irow=len*p+MIN(p,rest);
292 local_n=len+(p<rest ? 1 :0 );
293 #endif
294 Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(alpha,
295 local_n,
296 A->row_block_size,
297 A->col_block_size,
298 &(A->pattern->ptr[irow]),
299 A->pattern->index, A->val, in, beta, &out[irow*A->row_block_size]);
300 #ifdef USE_DYNAMIC_SCHEDULING
301 }
302 #else
303 }
304 #endif
305 }
306 }
307 /* CSR format with offset 0*/
308 void Paso_SparseMatrix_MatrixVector_CSR_OFFSET0_stripe(double alpha,
309 dim_t nRows,
310 dim_t row_block_size,
311 dim_t col_block_size,
312 index_t* ptr,
313 index_t* index,
314 double* val,
315 double* in,
316 double beta,
317 double* out)
318 {
319 register index_t ir,icol,iptr,icb,irb,irow,ic,Aiptr;
320 register double reg,reg1,reg2,reg3,in1,in2,in3,A00,A10,A20,A01,A11,A21,A02,A12,A22;
321 dim_t block_size;
322 if (ABS(beta)>0.) {
323 if (beta != 1.) {
324 for (irow=0;irow < nRows * row_block_size;irow++)
325 out[irow] *= beta;
326 }
327 } else {
328 for (irow=0;irow < nRows * row_block_size;irow++)
329 out[irow] = 0;
330 }
331 if (ABS(alpha)>0) {
332 if (col_block_size==1 && row_block_size ==1) {
333 for (irow=0;irow< nRows;++irow) {
334 reg=0.;
335 #pragma ivdep
336 for (iptr=ptr[irow];iptr<ptr[irow+1]; ++iptr) {
337 reg += val[iptr] * in[index[iptr]];
338 }
339 out[irow] += alpha * reg;
340 }
341 } else if (col_block_size==2 && row_block_size ==2) {
342 for (ir=0;ir< nRows;ir++) {
343 reg1=0.;
344 reg2=0.;
345 #pragma ivdep
346 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
347 ic=2*index[iptr];
348 Aiptr=iptr*4;
349 in1=in[ic];
350 in2=in[1+ic];
351 A00=val[Aiptr ];
352 A10=val[Aiptr+1];
353 A01=val[Aiptr+2];
354 A11=val[Aiptr+3];
355 reg1 += A00*in1 + A01*in2;
356 reg2 += A10*in1 + A11*in2;
357 }
358 out[ 2*ir] += alpha * reg1;
359 out[1+2*ir] += alpha * reg2;
360 }
361 } else if (col_block_size==3 && row_block_size ==3) {
362 for (ir=0;ir< nRows;ir++) {
363 reg1=0.;
364 reg2=0.;
365 reg3=0.;
366 #pragma ivdep
367 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
368 ic=3*index[iptr];
369 Aiptr=iptr*9;
370 in1=in[ic];
371 in2=in[1+ic];
372 in3=in[2+ic];
373 A00=val[Aiptr ];
374 A10=val[Aiptr+1];
375 A20=val[Aiptr+2];
376 A01=val[Aiptr+3];
377 A11=val[Aiptr+4];
378 A21=val[Aiptr+5];
379 A02=val[Aiptr+6];
380 A12=val[Aiptr+7];
381 A22=val[Aiptr+8];
382 reg1 += A00*in1 + A01*in2 + A02*in3;
383 reg2 += A10*in1 + A11*in2 + A12*in3;
384 reg3 += A20*in1 + A21*in2 + A22*in3;
385 }
386 out[ 3*ir] += alpha * reg1;
387 out[1+3*ir] += alpha * reg2;
388 out[2+3*ir] += alpha * reg3;
389 }
390 } else {
391 block_size=col_block_size*row_block_size;
392 for (ir=0;ir< nRows;ir++) {
393 for (iptr=ptr[ir];iptr<ptr[ir+1]; iptr++) {
394 for (irb=0;irb< row_block_size;irb++) {
395 irow=irb+row_block_size*ir;
396 reg=0.;
397 #pragma ivdep
398 for (icb=0;icb< col_block_size;icb++) {
399 icol=icb+col_block_size*index[iptr];
400 reg += val[iptr*block_size+irb+row_block_size*icb] * in[icol];
401 }
402 out[irow] += alpha * reg;
403 }
404 }
405 }
406 }
407 }
408 return;
409 }

  ViewVC Help
Powered by ViewVC 1.1.26