/[escript]/trunk/paso/src/SparseMatrix_MatrixMatrix.c
ViewVC logotype

Contents of /trunk/paso/src/SparseMatrix_MatrixMatrix.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3315 - (show annotations)
Wed Oct 27 01:20:27 2010 UTC (8 years, 10 months ago) by gross
File MIME type: text/plain
File size: 35965 byte(s)
bug in sparse matrixmatrix fixed.
1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2010 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************
16
17 Paso: Sparse matrix product
18
19 **************************************************************
20
21 Author: artak@uq.edu.au, l.gross@uq.edu.au
22
23 **************************************************************/
24
25 #include "SparseMatrix.h"
26 #include "Paso.h"
27
28 Paso_SparseMatrix* Paso_SparseMatrix_MatrixMatrix(const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) {
29 Paso_SparseMatrixType C_type;
30
31 Paso_Pattern* outpattern=NULL;
32 Paso_SparseMatrix *out=NULL;
33 if ( ! ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (A->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & A->type ) ) ) {
34 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of A.");
35 return NULL;
36 }
37 if ( ! ( (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (B->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & B->type ) ) ) {
38 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of B.");
39 return NULL;
40 }
41 if (! (A->col_block_size == B->row_block_size) ) {
42 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Column block size of A and row block size of B must match.");
43 return NULL;
44 }
45 if (! (A->numCols == B->numRows) ) {
46 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: number of columns of A and number of rows of B must match.");
47 return NULL;
48 }
49
50
51 if ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) && (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) ) {
52 C_type=MATRIX_FORMAT_DIAGONAL_BLOCK;
53 } else {
54 C_type=MATRIX_FORMAT_DEFAULT;
55 }
56
57 outpattern=Paso_Pattern_multiply(MATRIX_FORMAT_DEFAULT,A->pattern,B->pattern);
58
59 if (Esys_noError()) {
60 out=Paso_SparseMatrix_alloc(C_type, outpattern, A->row_block_size, B->col_block_size, FALSE);
61 }
62 Paso_Pattern_free(outpattern);
63
64 if (Esys_noError()) {
65 if ( (A->row_block_size == 1) && (B->col_block_size ==1 ) && (A->col_block_size ==1) ) {
66 Paso_SparseMatrix_MatrixMatrix_DD(out, A,B);
67 } else {
68 if (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
69 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
70 Paso_SparseMatrix_MatrixMatrix_DD(out, A,B);
71 } else {
72 Paso_SparseMatrix_MatrixMatrix_DB(out, A,B);
73 }
74 } else {
75 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
76 Paso_SparseMatrix_MatrixMatrix_BD(out, A,B);
77 } else {
78 Paso_SparseMatrix_MatrixMatrix_BB(out, A,B);
79 }
80 }
81 }
82 return out;
83 } else {
84 Paso_SparseMatrix_free(out);
85 return NULL;
86 }
87 }
88 /* not good for block size 1 */
89 void Paso_SparseMatrix_MatrixMatrix_BB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
90 {
91 const dim_t n = C->numRows;
92 const dim_t row_block_size = C->row_block_size;
93 const dim_t col_block_size = C->col_block_size;
94 const dim_t A_col_block_size = A->col_block_size;
95 const dim_t C_block_size =C->block_size;
96 const dim_t B_block_size =B->block_size;
97 const dim_t A_block_size =A->block_size;
98 double *C_ij, *A_ik, *B_kj;
99 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
100 dim_t i, ib, irb, icb;
101 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
102
103 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_col_block_size == 2) ) {
104 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
105 {
106 #pragma omp for schedule(static)
107 for(i = 0; i < n; i++) {
108 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
109 j = C->pattern->index[ij_ptrC];
110
111 C_ij_00=0;
112 C_ij_10=0;
113 C_ij_01=0;
114 C_ij_11=0;
115
116 C_ij=&(C->val[ij_ptrC*4]);
117
118 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
119 k=A->pattern->index[ik_ptrA];
120 kj_ptrB=B->pattern->ptr[k];
121 /* find j in B[k,:] */
122 start_p=&(B->pattern->index[kj_ptrB]);
123 where_p=(index_t*)bsearch(&j, start_p,
124 B->pattern->ptr[k + 1]-kj_ptrB,
125 sizeof(index_t),
126 Paso_comparIndex);
127 if (! (where_p == NULL) ) { /* this should always be the case */
128 kj_ptrB += (index_t)(where_p-start_p);
129 A_ik=&(A->val[ik_ptrA*4]);
130 B_kj=&(B->val[kj_ptrB*4]);
131
132 C_ij_00 +=A_ik[0+2*0]*B_kj[0+2*0]+A_ik[0+2*1]*B_kj[1+2*0];
133 C_ij_10 +=A_ik[1+2*0]*B_kj[0+2*0]+A_ik[1+2*1]*B_kj[1+2*0];
134
135 C_ij_01 +=A_ik[0+2*0]*B_kj[0+2*1]+A_ik[0+2*1]*B_kj[1+2*1];
136 C_ij_11 +=A_ik[1+2*0]*B_kj[0+2*1]+A_ik[1+2*1]*B_kj[1+2*1];
137 }
138
139 }
140 C_ij[0+2*0]=C_ij_00;
141 C_ij[1+2*0]=C_ij_10;
142 C_ij[0+2*1]=C_ij_01;
143 C_ij[1+2*1]=C_ij_11;
144
145 }
146 }
147 } /* end of parallel region */
148
149 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_col_block_size == 3) ){
150 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
151 {
152 #pragma omp for schedule(static)
153 for(i = 0; i < n; i++) {
154 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
155 j = C->pattern->index[ij_ptrC];
156
157 C_ij_00=0;
158 C_ij_10=0;
159 C_ij_20=0;
160 C_ij_01=0;
161 C_ij_11=0;
162 C_ij_21=0;
163 C_ij_02=0;
164 C_ij_12=0;
165 C_ij_22=0;
166
167 C_ij=&(C->val[ij_ptrC*9]);
168
169 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
170 k=A->pattern->index[ik_ptrA];
171 kj_ptrB=B->pattern->ptr[k];
172 /* find j in B[k,:] */
173 start_p=&(B->pattern->index[kj_ptrB]);
174 where_p=(index_t*)bsearch(&j, start_p,
175 B->pattern->ptr[k + 1]-kj_ptrB,
176 sizeof(index_t),
177 Paso_comparIndex);
178 if (! (where_p == NULL) ) { /* this should always be the case */
179 kj_ptrB += (index_t)(where_p-start_p);
180 A_ik=&(A->val[ik_ptrA*9]);
181 B_kj=&(B->val[kj_ptrB*9]);
182
183 C_ij_00 +=A_ik[0+3*0]*B_kj[0+3*0]
184 +A_ik[0+3*1]*B_kj[1+3*0]
185 +A_ik[0+3*2]*B_kj[2+3*0];
186 C_ij_10 +=A_ik[1+3*0]*B_kj[0+3*0]
187 +A_ik[1+3*1]*B_kj[1+3*0]
188 +A_ik[1+3*2]*B_kj[2+3*0];
189 C_ij_20 +=A_ik[2+3*0]*B_kj[0+3*0]
190 +A_ik[2+3*1]*B_kj[1+3*0]
191 +A_ik[2+3*2]*B_kj[2+3*0];
192
193 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*1]
194 +A_ik[0+3*1]*B_kj[1+3*1]
195 +A_ik[0+3*2]*B_kj[2+3*1];
196 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*1]
197 +A_ik[1+3*1]*B_kj[1+3*1]
198 +A_ik[1+3*2]*B_kj[2+3*1];
199 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*1]
200 +A_ik[2+3*1]*B_kj[1+3*1]
201 +A_ik[2+3*2]*B_kj[2+3*1];
202
203 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*2]
204 +A_ik[0+3*1]*B_kj[1+3*2]
205 +A_ik[0+3*2]*B_kj[2+3*2];
206 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*2]
207 +A_ik[1+3*1]*B_kj[1+3*2]
208 +A_ik[1+3*2]*B_kj[2+3*2];
209 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*2]
210 +A_ik[2+3*1]*B_kj[1+3*2]
211 +A_ik[2+3*2]*B_kj[2+3*2];
212 }
213
214 }
215 C_ij[0+3*0]=C_ij_00;
216 C_ij[1+3*0]=C_ij_10;
217 C_ij[2+3*0]=C_ij_20;
218 C_ij[0+3*1]=C_ij_01;
219 C_ij[1+3*1]=C_ij_11;
220 C_ij[2+3*1]=C_ij_21;
221 C_ij[0+3*2]=C_ij_02;
222 C_ij[1+3*2]=C_ij_12;
223 C_ij[2+3*2]=C_ij_22;
224 }
225 }
226 } /* end of parallel region */
227 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_col_block_size == 4) ){
228 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
229 {
230 #pragma omp for schedule(static)
231 for(i = 0; i < n; i++) {
232 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
233 j = C->pattern->index[ij_ptrC];
234
235 C_ij_00=0;
236 C_ij_10=0;
237 C_ij_20=0;
238 C_ij_30=0;
239 C_ij_01=0;
240 C_ij_11=0;
241 C_ij_21=0;
242 C_ij_31=0;
243 C_ij_02=0;
244 C_ij_12=0;
245 C_ij_22=0;
246 C_ij_32=0;
247 C_ij_03=0;
248 C_ij_13=0;
249 C_ij_23=0;
250 C_ij_33=0;
251
252 C_ij=&(C->val[ij_ptrC*16]);
253
254 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
255 k=A->pattern->index[ik_ptrA];
256 kj_ptrB=B->pattern->ptr[k];
257 /* find j in B[k,:] */
258 start_p=&(B->pattern->index[kj_ptrB]);
259 where_p=(index_t*)bsearch(&j, start_p,
260 B->pattern->ptr[k + 1]-kj_ptrB,
261 sizeof(index_t),
262 Paso_comparIndex);
263 if (! (where_p == NULL) ) { /* this should always be the case */
264 kj_ptrB += (index_t)(where_p-start_p);
265 A_ik=&(A->val[ik_ptrA*16]);
266 B_kj=&(B->val[kj_ptrB*16]);
267
268 C_ij_00 +=A_ik[0+4*0]*B_kj[0+4*0]
269 +A_ik[0+4*1]*B_kj[1+4*0]
270 +A_ik[0+4*2]*B_kj[2+4*0]
271 +A_ik[0+4*3]*B_kj[3+4*0];
272 C_ij_10 +=A_ik[1+4*0]*B_kj[0+4*0]
273 +A_ik[1+4*1]*B_kj[1+4*0]
274 +A_ik[1+4*2]*B_kj[2+4*0]
275 +A_ik[1+4*3]*B_kj[3+4*0];
276 C_ij_20 +=A_ik[2+4*0]*B_kj[0+4*0]
277 +A_ik[2+4*1]*B_kj[1+4*0]
278 +A_ik[2+4*2]*B_kj[2+4*0]
279 +A_ik[2+4*3]*B_kj[3+4*0];
280 C_ij_30 +=A_ik[3+4*0]*B_kj[0+4*0]
281 +A_ik[3+4*1]*B_kj[1+4*0]
282 +A_ik[3+4*2]*B_kj[2+4*0]
283 +A_ik[3+4*3]*B_kj[3+4*0];
284
285 C_ij_01 +=A_ik[0+4*0]*B_kj[0+4*1]
286 +A_ik[0+4*1]*B_kj[1+4*1]
287 +A_ik[0+4*2]*B_kj[2+4*1]
288 +A_ik[0+4*3]*B_kj[3+4*1];
289 C_ij_11 +=A_ik[1+4*0]*B_kj[0+4*1]
290 +A_ik[1+4*1]*B_kj[1+4*1]
291 +A_ik[1+4*2]*B_kj[2+4*1]
292 +A_ik[1+4*3]*B_kj[3+4*1];
293 C_ij_21 +=A_ik[2+4*0]*B_kj[0+4*1]
294 +A_ik[2+4*1]*B_kj[1+4*1]
295 +A_ik[2+4*2]*B_kj[2+4*1]
296 +A_ik[2+4*3]*B_kj[3+4*1];
297 C_ij_31 +=A_ik[3+4*0]*B_kj[0+4*1]
298 +A_ik[3+4*1]*B_kj[1+4*1]
299 +A_ik[3+4*2]*B_kj[2+4*1]
300 +A_ik[3+4*3]*B_kj[3+4*1];
301
302 C_ij_02 +=A_ik[0+4*0]*B_kj[0+4*2]
303 +A_ik[0+4*1]*B_kj[1+4*2]
304 +A_ik[0+4*2]*B_kj[2+4*2]
305 +A_ik[0+4*3]*B_kj[3+4*2];
306 C_ij_12 +=A_ik[1+4*0]*B_kj[0+4*2]
307 +A_ik[1+4*1]*B_kj[1+4*2]
308 +A_ik[1+4*2]*B_kj[2+4*2]
309 +A_ik[1+4*3]*B_kj[3+4*2];
310 C_ij_22 +=A_ik[2+4*0]*B_kj[0+4*2]
311 +A_ik[2+4*1]*B_kj[1+4*2]
312 +A_ik[2+4*2]*B_kj[2+4*2]
313 +A_ik[2+4*3]*B_kj[3+4*2];
314 C_ij_32 +=A_ik[3+4*0]*B_kj[0+4*2]
315 +A_ik[3+4*1]*B_kj[1+4*2]
316 +A_ik[3+4*2]*B_kj[2+4*2]
317 +A_ik[3+4*3]*B_kj[3+4*2];
318
319 C_ij_03 +=A_ik[0+4*0]*B_kj[0+4*3]
320 +A_ik[0+4*1]*B_kj[1+4*3]
321 +A_ik[0+4*2]*B_kj[2+4*3]
322 +A_ik[0+4*3]*B_kj[3+4*3];
323 C_ij_13 +=A_ik[1+4*0]*B_kj[0+4*3]
324 +A_ik[1+4*1]*B_kj[1+4*3]
325 +A_ik[1+4*2]*B_kj[2+4*3]
326 +A_ik[1+4*3]*B_kj[3+4*3];
327 C_ij_23 +=A_ik[2+4*0]*B_kj[0+4*3]
328 +A_ik[2+4*1]*B_kj[1+4*3]
329 +A_ik[2+4*2]*B_kj[2+4*3]
330 +A_ik[2+4*3]*B_kj[3+4*3];
331 C_ij_33 +=A_ik[3+4*0]*B_kj[0+4*3]
332 +A_ik[3+4*1]*B_kj[1+4*3]
333 +A_ik[3+4*2]*B_kj[2+4*3]
334 +A_ik[3+4*3]*B_kj[3+4*3];
335 }
336 }
337 C_ij[0+4*0]=C_ij_00;
338 C_ij[1+4*0]=C_ij_10;
339 C_ij[2+4*0]=C_ij_20;
340 C_ij[3+4*0]=C_ij_30;
341 C_ij[0+4*1]=C_ij_01;
342 C_ij[1+4*1]=C_ij_11;
343 C_ij[2+4*1]=C_ij_21;
344 C_ij[3+4*1]=C_ij_31;
345 C_ij[0+4*2]=C_ij_02;
346 C_ij[1+4*2]=C_ij_12;
347 C_ij[2+4*2]=C_ij_22;
348 C_ij[3+4*2]=C_ij_32;
349 C_ij[0+4*3]=C_ij_03;
350 C_ij[1+4*3]=C_ij_13;
351 C_ij[2+4*3]=C_ij_23;
352 C_ij[3+4*3]=C_ij_33;
353 }
354 }
355 } /* end of parallel region */
356
357 } else {
358 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
359 {
360 #pragma omp for schedule(static)
361 for(i = 0; i < n; i++) {
362 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
363 j = C->pattern->index[ij_ptrC];
364 C_ij=&(C->val[ij_ptrC*C_block_size]);
365 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
366 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
367 k=A->pattern->index[ik_ptrA];
368 kj_ptrB=B->pattern->ptr[k];
369 /* find j in B[k,:] */
370 start_p=&(B->pattern->index[kj_ptrB]);
371 where_p=(index_t*)bsearch(&j, start_p,
372 B->pattern->ptr[k + 1]-kj_ptrB,
373 sizeof(index_t),
374 Paso_comparIndex);
375 if (! (where_p == NULL) ) { /* this should always be the case */
376 kj_ptrB += (index_t)(where_p-start_p);
377 A_ik=&(A->val[ik_ptrA*A_block_size]);
378 B_kj=&(B->val[kj_ptrB*B_block_size]);
379
380 for (irb=0; irb<row_block_size; ++irb) {
381 for (icb=0; icb<col_block_size; ++icb) {
382 rtmp=C_ij[irb+row_block_size*icb];
383 for (ib=0; ib<A_col_block_size; ++ib) {
384 rtmp+=A_ik[irb+row_block_size*ib]*B_kj[ib+A_col_block_size*icb];
385 }
386 C_ij[irb+row_block_size*icb]=rtmp;
387 }
388 }
389 }
390 }
391 }
392 }
393 } /* end of parallel region */
394
395 }
396 }
397
398 /* not good for block size 1 */
399 void Paso_SparseMatrix_MatrixMatrix_DB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
400 {
401 const dim_t n = C->numRows;
402 const dim_t row_block_size = C->row_block_size;
403 const dim_t col_block_size = C->col_block_size;
404 const dim_t A_col_block_size = A->col_block_size;
405 const dim_t C_block_size =C->block_size;
406 const dim_t B_block_size =B->block_size;
407 const dim_t A_block_size =A->block_size;
408 double *C_ij, *A_ik, *B_kj;
409 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
410 dim_t i, ib, irb, icb;
411 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
412
413 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_block_size == 2) ) {
414 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
415 {
416 #pragma omp for schedule(static)
417 for(i = 0; i < n; i++) {
418 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
419 j = C->pattern->index[ij_ptrC];
420
421 C_ij_00=0;
422 C_ij_10=0;
423 C_ij_01=0;
424 C_ij_11=0;
425
426 C_ij=&(C->val[ij_ptrC*4]);
427
428 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
429 k=A->pattern->index[ik_ptrA];
430 kj_ptrB=B->pattern->ptr[k];
431 /* find j in B[k,:] */
432 start_p=&(B->pattern->index[kj_ptrB]);
433 where_p=(index_t*)bsearch(&j, start_p,
434 B->pattern->ptr[k + 1]-kj_ptrB,
435 sizeof(index_t),
436 Paso_comparIndex);
437 if (! (where_p == NULL) ) { /* this should always be the case */
438 kj_ptrB += (index_t)(where_p-start_p);
439 A_ik=&(A->val[ik_ptrA*2]);
440 B_kj=&(B->val[kj_ptrB*4]);
441
442 C_ij_00 +=A_ik[0]*B_kj[0+2*0];
443 C_ij_10 +=A_ik[1]*B_kj[1+2*0];
444
445 C_ij_01 +=A_ik[0]*B_kj[0+2*1];
446 C_ij_11 +=A_ik[1]*B_kj[1+2*1];
447 }
448
449 }
450 C_ij[0+2*0]=C_ij_00;
451 C_ij[1+2*0]=C_ij_10;
452 C_ij[0+2*1]=C_ij_01;
453 C_ij[1+2*1]=C_ij_11;
454
455 }
456 }
457 } /* end of parallel region */
458
459 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_block_size == 3) ){
460 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
461 {
462 #pragma omp for schedule(static)
463 for(i = 0; i < n; i++) {
464 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
465 j = C->pattern->index[ij_ptrC];
466
467 C_ij_00=0;
468 C_ij_10=0;
469 C_ij_20=0;
470 C_ij_01=0;
471 C_ij_11=0;
472 C_ij_21=0;
473 C_ij_02=0;
474 C_ij_12=0;
475 C_ij_22=0;
476
477 C_ij=&(C->val[ij_ptrC*9]);
478
479 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
480 k=A->pattern->index[ik_ptrA];
481 kj_ptrB=B->pattern->ptr[k];
482 /* find j in B[k,:] */
483 start_p=&(B->pattern->index[kj_ptrB]);
484 where_p=(index_t*)bsearch(&j, start_p,
485 B->pattern->ptr[k + 1]-kj_ptrB,
486 sizeof(index_t),
487 Paso_comparIndex);
488 if (! (where_p == NULL) ) { /* this should always be the case */
489 kj_ptrB += (index_t)(where_p-start_p);
490 A_ik=&(A->val[ik_ptrA*3]);
491 B_kj=&(B->val[kj_ptrB*9]);
492
493 C_ij_00 +=A_ik[0]*B_kj[0+3*0];
494 C_ij_10 +=A_ik[1]*B_kj[1+3*0];
495 C_ij_20 +=A_ik[2]*B_kj[2+3*0];
496
497 C_ij_01 +=A_ik[0]*B_kj[0+3*1];
498 C_ij_11 +=A_ik[1]*B_kj[1+3*1];
499 C_ij_21 +=A_ik[2]*B_kj[2+3*1];
500
501 C_ij_02 +=A_ik[0]*B_kj[0+3*2];
502 C_ij_12 +=A_ik[1]*B_kj[1+3*2];
503 C_ij_22 +=A_ik[2]*B_kj[2+3*2];
504
505 }
506
507 }
508 C_ij[0+3*0]=C_ij_00;
509 C_ij[1+3*0]=C_ij_10;
510 C_ij[2+3*0]=C_ij_20;
511 C_ij[0+3*1]=C_ij_01;
512 C_ij[1+3*1]=C_ij_11;
513 C_ij[2+3*1]=C_ij_21;
514 C_ij[0+3*2]=C_ij_02;
515 C_ij[1+3*2]=C_ij_12;
516 C_ij[2+3*2]=C_ij_22;
517 }
518 }
519 } /* end of parallel region */
520 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_block_size == 4) ){
521 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
522 {
523 #pragma omp for schedule(static)
524 for(i = 0; i < n; i++) {
525 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
526 j = C->pattern->index[ij_ptrC];
527
528 C_ij_00=0;
529 C_ij_10=0;
530 C_ij_20=0;
531 C_ij_30=0;
532 C_ij_01=0;
533 C_ij_11=0;
534 C_ij_21=0;
535 C_ij_31=0;
536 C_ij_02=0;
537 C_ij_12=0;
538 C_ij_22=0;
539 C_ij_32=0;
540 C_ij_03=0;
541 C_ij_13=0;
542 C_ij_23=0;
543 C_ij_33=0;
544
545 C_ij=&(C->val[ij_ptrC*16]);
546
547 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
548 k=A->pattern->index[ik_ptrA];
549 kj_ptrB=B->pattern->ptr[k];
550 /* find j in B[k,:] */
551 start_p=&(B->pattern->index[kj_ptrB]);
552 where_p=(index_t*)bsearch(&j, start_p,
553 B->pattern->ptr[k + 1]-kj_ptrB,
554 sizeof(index_t),
555 Paso_comparIndex);
556 if (! (where_p == NULL) ) { /* this should always be the case */
557 kj_ptrB += (index_t)(where_p-start_p);
558 A_ik=&(A->val[ik_ptrA*4]);
559 B_kj=&(B->val[kj_ptrB*16]);
560
561 C_ij_00 +=A_ik[0]*B_kj[0+4*0];
562 C_ij_10 +=A_ik[1]*B_kj[1+4*0];
563 C_ij_20 +=A_ik[2]*B_kj[2+4*0];
564 C_ij_30 +=A_ik[3]*B_kj[3+4*0];
565
566 C_ij_01 +=A_ik[0]*B_kj[0+4*1];
567 C_ij_11 +=A_ik[1]*B_kj[1+4*1];
568 C_ij_21 +=A_ik[2]*B_kj[2+4*1];
569 C_ij_31 +=A_ik[3]*B_kj[3+4*1];
570
571 C_ij_02 +=A_ik[0]*B_kj[0+4*2];
572 C_ij_12 +=A_ik[1]*B_kj[1+4*2];
573 C_ij_22 +=A_ik[2]*B_kj[2+4*2];
574 C_ij_32 +=A_ik[3]*B_kj[3+4*2];
575
576 C_ij_03 +=A_ik[0]*B_kj[0+4*3];
577 C_ij_13 +=A_ik[1]*B_kj[1+4*3];
578 C_ij_23 +=A_ik[2]*B_kj[2+4*3];
579 C_ij_33 +=A_ik[3]*B_kj[3+4*3];
580 }
581 }
582 C_ij[0+4*0]=C_ij_00;
583 C_ij[1+4*0]=C_ij_10;
584 C_ij[2+4*0]=C_ij_20;
585 C_ij[3+4*0]=C_ij_30;
586 C_ij[0+4*1]=C_ij_01;
587 C_ij[1+4*1]=C_ij_11;
588 C_ij[2+4*1]=C_ij_21;
589 C_ij[3+4*1]=C_ij_31;
590 C_ij[0+4*2]=C_ij_02;
591 C_ij[1+4*2]=C_ij_12;
592 C_ij[2+4*2]=C_ij_22;
593 C_ij[3+4*2]=C_ij_32;
594 C_ij[0+4*3]=C_ij_03;
595 C_ij[1+4*3]=C_ij_13;
596 C_ij[2+4*3]=C_ij_23;
597 C_ij[3+4*3]=C_ij_33;
598 }
599 }
600 } /* end of parallel region */
601
602 } else {
603 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
604 {
605 #pragma omp for schedule(static)
606 for(i = 0; i < n; i++) {
607 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
608 j = C->pattern->index[ij_ptrC];
609 C_ij=&(C->val[ij_ptrC*C_block_size]);
610 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
611 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
612 k=A->pattern->index[ik_ptrA];
613 kj_ptrB=B->pattern->ptr[k];
614 /* find j in B[k,:] */
615 start_p=&(B->pattern->index[kj_ptrB]);
616 where_p=(index_t*)bsearch(&j, start_p,
617 B->pattern->ptr[k + 1]-kj_ptrB,
618 sizeof(index_t),
619 Paso_comparIndex);
620 if (! (where_p == NULL) ) { /* this should always be the case */
621 kj_ptrB += (index_t)(where_p-start_p);
622 A_ik=&(A->val[ik_ptrA*A_block_size]);
623 B_kj=&(B->val[kj_ptrB*B_block_size]);
624
625 for (irb=0; irb<A_block_size; ++irb) {
626 rtmp=A_ik[irb];
627 for (icb=0; icb<col_block_size; ++icb) {
628 C_ij[irb+row_block_size*icb]+=rtmp*B_kj[irb+A_col_block_size*icb];
629 }
630 }
631 }
632 }
633 }
634 }
635 } /* end of parallel region */
636
637 }
638 }
639
640 /* not good for block size 1 */
641 void Paso_SparseMatrix_MatrixMatrix_BD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
642 {
643 const dim_t n = C->numRows;
644 const dim_t row_block_size = C->row_block_size;
645 const dim_t col_block_size = C->col_block_size;
646 const dim_t C_block_size =C->block_size;
647 const dim_t B_block_size =B->block_size;
648 const dim_t A_block_size =A->block_size;
649 double *C_ij, *A_ik, *B_kj;
650 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
651 dim_t i, ib, irb, icb;
652 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
653
654 if ( (row_block_size == 2) && (col_block_size ==2 ) && (B_block_size == 2) ) {
655 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
656 {
657 #pragma omp for schedule(static)
658 for(i = 0; i < n; i++) {
659 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
660 j = C->pattern->index[ij_ptrC];
661
662 C_ij_00=0;
663 C_ij_10=0;
664 C_ij_01=0;
665 C_ij_11=0;
666
667 C_ij=&(C->val[ij_ptrC*4]);
668
669 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
670 k=A->pattern->index[ik_ptrA];
671 kj_ptrB=B->pattern->ptr[k];
672 /* find j in B[k,:] */
673 start_p=&(B->pattern->index[kj_ptrB]);
674 where_p=(index_t*)bsearch(&j, start_p,
675 B->pattern->ptr[k + 1]-kj_ptrB,
676 sizeof(index_t),
677 Paso_comparIndex);
678 if (! (where_p == NULL) ) { /* this should always be the case */
679 kj_ptrB += (index_t)(where_p-start_p);
680 A_ik=&(A->val[ik_ptrA*4]);
681 B_kj=&(B->val[kj_ptrB*2]);
682
683 C_ij_00 +=A_ik[0+2*0]*B_kj[0];
684 C_ij_10 +=A_ik[1+2*0]*B_kj[0];
685
686 C_ij_01 +=A_ik[0+2*1]*B_kj[1];
687 C_ij_11 +=A_ik[1+2*1]*B_kj[1];
688 }
689
690 }
691 C_ij[0+2*0]=C_ij_00;
692 C_ij[1+2*0]=C_ij_10;
693 C_ij[0+2*1]=C_ij_01;
694 C_ij[1+2*1]=C_ij_11;
695
696 }
697 }
698 } /* end of parallel region */
699
700 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (B_block_size == 3) ){
701 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
702 {
703 #pragma omp for schedule(static)
704 for(i = 0; i < n; i++) {
705 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
706 j = C->pattern->index[ij_ptrC];
707
708 C_ij_00=0;
709 C_ij_10=0;
710 C_ij_20=0;
711 C_ij_01=0;
712 C_ij_11=0;
713 C_ij_21=0;
714 C_ij_02=0;
715 C_ij_12=0;
716 C_ij_22=0;
717
718 C_ij=&(C->val[ij_ptrC*9]);
719
720 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
721 k=A->pattern->index[ik_ptrA];
722 kj_ptrB=B->pattern->ptr[k];
723 /* find j in B[k,:] */
724 start_p=&(B->pattern->index[kj_ptrB]);
725 where_p=(index_t*)bsearch(&j, start_p,
726 B->pattern->ptr[k + 1]-kj_ptrB,
727 sizeof(index_t),
728 Paso_comparIndex);
729 if (! (where_p == NULL) ) { /* this should always be the case */
730 kj_ptrB += (index_t)(where_p-start_p);
731 A_ik=&(A->val[ik_ptrA*9]);
732 B_kj=&(B->val[kj_ptrB*3]);
733
734 C_ij_00 +=A_ik[0+3*0]*B_kj[0];
735 C_ij_10 +=A_ik[1+3*0]*B_kj[0];
736 C_ij_20 +=A_ik[2+3*0]*B_kj[0];
737
738 C_ij_01 +=A_ik[0+3*1]*B_kj[1];
739 C_ij_11 +=A_ik[1+3*1]*B_kj[1];
740 C_ij_21 +=A_ik[2+3*1]*B_kj[1];
741
742 C_ij_02 +=A_ik[0+3*2]*B_kj[2];
743 C_ij_12 +=A_ik[1+3*2]*B_kj[2];
744 C_ij_22 +=A_ik[2+3*2]*B_kj[2];
745 }
746
747 }
748 C_ij[0+3*0]=C_ij_00;
749 C_ij[1+3*0]=C_ij_10;
750 C_ij[2+3*0]=C_ij_20;
751 C_ij[0+3*1]=C_ij_01;
752 C_ij[1+3*1]=C_ij_11;
753 C_ij[2+3*1]=C_ij_21;
754 C_ij[0+3*2]=C_ij_02;
755 C_ij[1+3*2]=C_ij_12;
756 C_ij[2+3*2]=C_ij_22;
757 }
758 }
759 } /* end of parallel region */
760 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (B_block_size == 4) ){
761 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
762 {
763 #pragma omp for schedule(static)
764 for(i = 0; i < n; i++) {
765 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
766 j = C->pattern->index[ij_ptrC];
767
768 C_ij_00=0;
769 C_ij_10=0;
770 C_ij_20=0;
771 C_ij_30=0;
772 C_ij_01=0;
773 C_ij_11=0;
774 C_ij_21=0;
775 C_ij_31=0;
776 C_ij_02=0;
777 C_ij_12=0;
778 C_ij_22=0;
779 C_ij_32=0;
780 C_ij_03=0;
781 C_ij_13=0;
782 C_ij_23=0;
783 C_ij_33=0;
784
785 C_ij=&(C->val[ij_ptrC*16]);
786
787 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
788 k=A->pattern->index[ik_ptrA];
789 kj_ptrB=B->pattern->ptr[k];
790 /* find j in B[k,:] */
791 start_p=&(B->pattern->index[kj_ptrB]);
792 where_p=(index_t*)bsearch(&j, start_p,
793 B->pattern->ptr[k + 1]-kj_ptrB,
794 sizeof(index_t),
795 Paso_comparIndex);
796 if (! (where_p == NULL) ) { /* this should always be the case */
797 kj_ptrB += (index_t)(where_p-start_p);
798 A_ik=&(A->val[ik_ptrA*16]);
799 B_kj=&(B->val[kj_ptrB*4]);
800
801 C_ij_00 +=A_ik[0+4*0]*B_kj[0];
802 C_ij_10 +=A_ik[1+4*0]*B_kj[0];
803 C_ij_20 +=A_ik[2+4*0]*B_kj[0];
804 C_ij_30 +=A_ik[3+4*0]*B_kj[0];
805
806 C_ij_01 +=A_ik[0+4*1]*B_kj[1];
807 C_ij_11 +=A_ik[1+4*1]*B_kj[1];
808 C_ij_21 +=A_ik[2+4*1]*B_kj[1];
809 C_ij_31 +=A_ik[3+4*1]*B_kj[1];
810
811 C_ij_02 +=A_ik[0+4*2]*B_kj[2];
812 C_ij_12 +=A_ik[1+4*2]*B_kj[2];
813 C_ij_22 +=A_ik[2+4*2]*B_kj[2];
814 C_ij_32 +=A_ik[3+4*2]*B_kj[2];
815
816 C_ij_03 +=A_ik[0+4*3]*B_kj[3];
817 C_ij_13 +=A_ik[1+4*3]*B_kj[3];
818 C_ij_23 +=A_ik[2+4*3]*B_kj[3];
819 C_ij_33 +=A_ik[3+4*3]*B_kj[3];
820 }
821 }
822 C_ij[0+4*0]=C_ij_00;
823 C_ij[1+4*0]=C_ij_10;
824 C_ij[2+4*0]=C_ij_20;
825 C_ij[3+4*0]=C_ij_30;
826 C_ij[0+4*1]=C_ij_01;
827 C_ij[1+4*1]=C_ij_11;
828 C_ij[2+4*1]=C_ij_21;
829 C_ij[3+4*1]=C_ij_31;
830 C_ij[0+4*2]=C_ij_02;
831 C_ij[1+4*2]=C_ij_12;
832 C_ij[2+4*2]=C_ij_22;
833 C_ij[3+4*2]=C_ij_32;
834 C_ij[0+4*3]=C_ij_03;
835 C_ij[1+4*3]=C_ij_13;
836 C_ij[2+4*3]=C_ij_23;
837 C_ij[3+4*3]=C_ij_33;
838 }
839 }
840 } /* end of parallel region */
841
842 } else {
843 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
844 {
845 #pragma omp for schedule(static)
846 for(i = 0; i < n; i++) {
847 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
848 j = C->pattern->index[ij_ptrC];
849 C_ij=&(C->val[ij_ptrC*C_block_size]);
850 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
851 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
852 k=A->pattern->index[ik_ptrA];
853 kj_ptrB=B->pattern->ptr[k];
854 /* find j in B[k,:] */
855 start_p=&(B->pattern->index[kj_ptrB]);
856 where_p=(index_t*)bsearch(&j, start_p,
857 B->pattern->ptr[k + 1]-kj_ptrB,
858 sizeof(index_t),
859 Paso_comparIndex);
860 if (! (where_p == NULL) ) { /* this should always be the case */
861 kj_ptrB += (index_t)(where_p-start_p);
862 A_ik=&(A->val[ik_ptrA*A_block_size]);
863 B_kj=&(B->val[kj_ptrB*B_block_size]);
864
865 for (icb=0; icb<B_block_size; ++icb) {
866 rtmp=B_kj[icb];
867 for (irb=0; irb<row_block_size; ++irb) {
868 C_ij[irb+row_block_size*icb]+=A_ik[irb+row_block_size*icb]*rtmp;
869 }
870 }
871 }
872 }
873 }
874 }
875 } /* end of parallel region */
876
877 }
878 }
879
880 /* not good for block size 1 */
881 void Paso_SparseMatrix_MatrixMatrix_DD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
882 {
883 const dim_t n = C->numRows;
884 const dim_t C_block_size =C->block_size;
885 const dim_t B_block_size =B->block_size;
886 const dim_t A_block_size =A->block_size;
887 double *C_ij, *A_ik, *B_kj;
888 register double C_ij_0, C_ij_1, C_ij_2, C_ij_3;
889 dim_t i, ib;
890 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
891
892 if ( (A_block_size == 1) && (B_block_size ==1) && (C_block_size == 1) ) {
893 #pragma omp parallel private(i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, C_ij_0)
894 {
895 #pragma omp for schedule(static)
896 for(i = 0; i < n; i++) {
897 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
898 j = C->pattern->index[ij_ptrC];
899 C_ij_0=0;
900 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
901 k=A->pattern->index[ik_ptrA];
902 kj_ptrB=B->pattern->ptr[k];
903 /* find j in B[k,:] */
904 start_p=&(B->pattern->index[kj_ptrB]);
905 where_p=(index_t*)bsearch(&j, start_p,
906 B->pattern->ptr[k + 1]-kj_ptrB,
907 sizeof(index_t),
908 Paso_comparIndex);
909 if (! (where_p == NULL) ) { /* this should always be the case */
910 kj_ptrB += (index_t)(where_p-start_p);
911 C_ij_0+=A->val[ik_ptrA]*B->val[kj_ptrB];
912 }
913
914 }
915 C->val[ij_ptrC]=C_ij_0;
916 }
917 }
918 } /* end of parallel region */
919 } else if ( (A_block_size == 2) && (B_block_size ==2) && (C_block_size == 2) ) {
920 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1)
921 {
922 #pragma omp for schedule(static)
923 for(i = 0; i < n; i++) {
924 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
925 j = C->pattern->index[ij_ptrC];
926 C_ij=&(C->val[ij_ptrC*2]);
927 C_ij_0=0;
928 C_ij_1=0;
929 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
930 k=A->pattern->index[ik_ptrA];
931 kj_ptrB=B->pattern->ptr[k];
932 /* find j in B[k,:] */
933 start_p=&(B->pattern->index[kj_ptrB]);
934 where_p=(index_t*)bsearch(&j, start_p,
935 B->pattern->ptr[k + 1]-kj_ptrB,
936 sizeof(index_t),
937 Paso_comparIndex);
938 if (! (where_p == NULL) ) { /* this should always be the case */
939 kj_ptrB += (index_t)(where_p-start_p);
940 A_ik=&(A->val[ik_ptrA*2]);
941 B_kj=&(B->val[kj_ptrB*2]);
942
943 C_ij_0+=A_ik[0]*B_kj[0];
944 C_ij_1+=A_ik[1]*B_kj[1];
945 }
946
947 }
948 C_ij[0]=C_ij_0;
949 C_ij[1]=C_ij_1;
950 }
951 }
952 } /* end of parallel region */
953 } else if ( (A_block_size == 3) && (B_block_size ==3) && (C_block_size == 3) ) {
954 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2)
955 {
956 #pragma omp for schedule(static)
957 for(i = 0; i < n; i++) {
958 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
959 j = C->pattern->index[ij_ptrC];
960 C_ij=&(C->val[ij_ptrC*C_block_size]);
961 C_ij_0=0;
962 C_ij_1=0;
963 C_ij_2=0;
964 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
965 k=A->pattern->index[ik_ptrA];
966 kj_ptrB=B->pattern->ptr[k];
967 /* find j in B[k,:] */
968 start_p=&(B->pattern->index[kj_ptrB]);
969 where_p=(index_t*)bsearch(&j, start_p,
970 B->pattern->ptr[k + 1]-kj_ptrB,
971 sizeof(index_t),
972 Paso_comparIndex);
973 if (! (where_p == NULL) ) { /* this should always be the case */
974 kj_ptrB += (index_t)(where_p-start_p);
975 A_ik=&(A->val[ik_ptrA*3]);
976 B_kj=&(B->val[kj_ptrB*3]);
977
978 C_ij_0+=A_ik[0]*B_kj[0];
979 C_ij_1+=A_ik[1]*B_kj[1];
980 C_ij_2+=A_ik[2]*B_kj[2];
981 }
982
983 }
984 C_ij[0]=C_ij_0;
985 C_ij[1]=C_ij_1;
986 C_ij[2]=C_ij_2;
987 }
988 }
989 } /* end of parallel region */
990 } else if ( (A_block_size == 4) && (B_block_size ==4 ) && (C_block_size == 4) ) {
991 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2, C_ij_3 )
992 {
993 #pragma omp for schedule(static)
994 for(i = 0; i < n; i++) {
995 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
996 j = C->pattern->index[ij_ptrC];
997 C_ij=&(C->val[ij_ptrC*C_block_size]);
998 C_ij_0=0;
999 C_ij_1=0;
1000 C_ij_2=0;
1001 C_ij_3=0;
1002 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
1003 k=A->pattern->index[ik_ptrA];
1004 kj_ptrB=B->pattern->ptr[k];
1005 /* find j in B[k,:] */
1006 start_p=&(B->pattern->index[kj_ptrB]);
1007 where_p=(index_t*)bsearch(&j, start_p,
1008 B->pattern->ptr[k + 1]-kj_ptrB,
1009 sizeof(index_t),
1010 Paso_comparIndex);
1011 if (! (where_p == NULL) ) { /* this should always be the case */
1012 kj_ptrB += (index_t)(where_p-start_p);
1013 A_ik=&(A->val[ik_ptrA*4]);
1014 B_kj=&(B->val[kj_ptrB*4]);
1015
1016 C_ij_0+=A_ik[0]*B_kj[0];
1017 C_ij_1+=A_ik[1]*B_kj[1];
1018 C_ij_2+=A_ik[2]*B_kj[2];
1019 C_ij_3+=A_ik[3]*B_kj[3];
1020 }
1021
1022 }
1023 C_ij[0]=C_ij_0;
1024 C_ij[1]=C_ij_1;
1025 C_ij[2]=C_ij_2;
1026 C_ij[3]=C_ij_3;
1027 }
1028 }
1029 } /* end of parallel region */
1030 } else {
1031 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, ib, A_ik,B_kj )
1032 {
1033 #pragma omp for schedule(static)
1034 for(i = 0; i < n; i++) {
1035 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1036 j = C->pattern->index[ij_ptrC];
1037 C_ij=&(C->val[ij_ptrC*C_block_size]);
1038 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
1039 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
1040 k=A->pattern->index[ik_ptrA];
1041 kj_ptrB=B->pattern->ptr[k];
1042 /* find j in B[k,:] */
1043 start_p=&(B->pattern->index[kj_ptrB]);
1044 where_p=(index_t*)bsearch(&j, start_p,
1045 B->pattern->ptr[k + 1]-kj_ptrB,
1046 sizeof(index_t),
1047 Paso_comparIndex);
1048 if (! (where_p == NULL) ) { /* this should always be the case */
1049 kj_ptrB += (index_t)(where_p-start_p);
1050 A_ik=&(A->val[ik_ptrA*A_block_size]);
1051 B_kj=&(B->val[kj_ptrB*B_block_size]);
1052 for (ib=0; ib<MIN(A_block_size, B_block_size); ++ib) C_ij[ib]+=A_ik[ib]*B_kj[ib];
1053 }
1054 }
1055 }
1056 }
1057 } /* end of parallel region */
1058 }
1059 }

  ViewVC Help
Powered by ViewVC 1.1.26