/[escript]/branches/doubleplusgood/paso/src/SparseMatrix_MatrixMatrix.cpp
ViewVC logotype

Contents of /branches/doubleplusgood/paso/src/SparseMatrix_MatrixMatrix.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4261 - (show annotations)
Wed Feb 27 06:09:33 2013 UTC (6 years, 1 month ago) by jfenwick
File size: 36150 byte(s)
Initial all c++ build.
But ... there are now reinterpret_cast<>'s
1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2003-2013 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development since 2012 by School of Earth Sciences
13 *
14 *****************************************************************************/
15
16
17 /************************************************************************************
18
19 Paso: Sparse matrix product
20
21 ************************************************************************************
22
23 Author: artak@uq.edu.au, l.gross@uq.edu.au
24
25 ************************************************************************************/
26
27 #include "SparseMatrix.h"
28 #include "Paso.h"
29
30 Paso_SparseMatrix* Paso_SparseMatrix_MatrixMatrix(const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) {
31 Paso_SparseMatrixType C_type;
32
33 Paso_Pattern* outpattern=NULL;
34 Paso_SparseMatrix *out=NULL;
35 if ( ! ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (A->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & A->type ) ) ) {
36 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of A.");
37 return NULL;
38 }
39 if ( ! ( (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (B->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & B->type ) ) ) {
40 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of B.");
41 return NULL;
42 }
43 if (! (A->col_block_size == B->row_block_size) ) {
44 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Column block size of A and row block size of B must match.");
45 return NULL;
46 }
47 if (! (A->numCols == B->numRows) ) {
48 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: number of columns of A and number of rows of B must match.");
49 return NULL;
50 }
51
52
53 if ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) && (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) ) {
54 C_type=MATRIX_FORMAT_DIAGONAL_BLOCK;
55 } else {
56 C_type=MATRIX_FORMAT_DEFAULT;
57 }
58
59 outpattern=Paso_Pattern_multiply(MATRIX_FORMAT_DEFAULT,A->pattern,B->pattern);
60
61 if (Esys_noError()) {
62 out=Paso_SparseMatrix_alloc(C_type, outpattern, A->row_block_size, B->col_block_size, FALSE);
63 }
64 Paso_Pattern_free(outpattern);
65
66 if (Esys_noError()) {
67 if ( (A->row_block_size == 1) && (B->col_block_size ==1 ) && (A->col_block_size ==1) ) {
68 Paso_SparseMatrix_MatrixMatrix_DD(out, A,B);
69 } else {
70 if (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
71 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
72 Paso_SparseMatrix_MatrixMatrix_DD(out, A,B);
73 } else {
74 Paso_SparseMatrix_MatrixMatrix_DB(out, A,B);
75 }
76 } else {
77 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
78 Paso_SparseMatrix_MatrixMatrix_BD(out, A,B);
79 } else {
80 Paso_SparseMatrix_MatrixMatrix_BB(out, A,B);
81 }
82 }
83 }
84 return out;
85 } else {
86 Paso_SparseMatrix_free(out);
87 return NULL;
88 }
89 }
90 /* not good for block size 1 */
91 void Paso_SparseMatrix_MatrixMatrix_BB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
92 {
93 const dim_t n = C->numRows;
94 const dim_t row_block_size = C->row_block_size;
95 const dim_t col_block_size = C->col_block_size;
96 const dim_t A_col_block_size = A->col_block_size;
97 const dim_t C_block_size =C->block_size;
98 const dim_t B_block_size =B->block_size;
99 const dim_t A_block_size =A->block_size;
100 double *C_ij, *A_ik, *B_kj;
101 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
102 dim_t i, ib, irb, icb;
103 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
104
105 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_col_block_size == 2) ) {
106 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
107 {
108 #pragma omp for schedule(static)
109 for(i = 0; i < n; i++) {
110 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
111 j = C->pattern->index[ij_ptrC];
112
113 C_ij_00=0;
114 C_ij_10=0;
115 C_ij_01=0;
116 C_ij_11=0;
117
118 C_ij=&(C->val[ij_ptrC*4]);
119
120 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
121 k=A->pattern->index[ik_ptrA];
122 kj_ptrB=B->pattern->ptr[k];
123 /* find j in B[k,:] */
124 start_p=&(B->pattern->index[kj_ptrB]);
125 where_p=(index_t*)bsearch(&j, start_p,
126 B->pattern->ptr[k + 1]-kj_ptrB,
127 sizeof(index_t),
128 Paso_comparIndex);
129 if (! (where_p == NULL) ) { /* this should always be the case */
130 kj_ptrB += (index_t)(where_p-start_p);
131 A_ik=&(A->val[ik_ptrA*4]);
132 B_kj=&(B->val[kj_ptrB*4]);
133
134 C_ij_00 +=A_ik[0+2*0]*B_kj[0+2*0]+A_ik[0+2*1]*B_kj[1+2*0];
135 C_ij_10 +=A_ik[1+2*0]*B_kj[0+2*0]+A_ik[1+2*1]*B_kj[1+2*0];
136
137 C_ij_01 +=A_ik[0+2*0]*B_kj[0+2*1]+A_ik[0+2*1]*B_kj[1+2*1];
138 C_ij_11 +=A_ik[1+2*0]*B_kj[0+2*1]+A_ik[1+2*1]*B_kj[1+2*1];
139 }
140
141 }
142 C_ij[0+2*0]=C_ij_00;
143 C_ij[1+2*0]=C_ij_10;
144 C_ij[0+2*1]=C_ij_01;
145 C_ij[1+2*1]=C_ij_11;
146
147 }
148 }
149 } /* end of parallel region */
150
151 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_col_block_size == 3) ){
152 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
153 {
154 #pragma omp for schedule(static)
155 for(i = 0; i < n; i++) {
156 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
157 j = C->pattern->index[ij_ptrC];
158
159 C_ij_00=0;
160 C_ij_10=0;
161 C_ij_20=0;
162 C_ij_01=0;
163 C_ij_11=0;
164 C_ij_21=0;
165 C_ij_02=0;
166 C_ij_12=0;
167 C_ij_22=0;
168
169 C_ij=&(C->val[ij_ptrC*9]);
170
171 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
172 k=A->pattern->index[ik_ptrA];
173 kj_ptrB=B->pattern->ptr[k];
174 /* find j in B[k,:] */
175 start_p=&(B->pattern->index[kj_ptrB]);
176 where_p=(index_t*)bsearch(&j, start_p,
177 B->pattern->ptr[k + 1]-kj_ptrB,
178 sizeof(index_t),
179 Paso_comparIndex);
180 if (! (where_p == NULL) ) { /* this should always be the case */
181 kj_ptrB += (index_t)(where_p-start_p);
182 A_ik=&(A->val[ik_ptrA*9]);
183 B_kj=&(B->val[kj_ptrB*9]);
184
185 C_ij_00 +=A_ik[0+3*0]*B_kj[0+3*0]
186 +A_ik[0+3*1]*B_kj[1+3*0]
187 +A_ik[0+3*2]*B_kj[2+3*0];
188 C_ij_10 +=A_ik[1+3*0]*B_kj[0+3*0]
189 +A_ik[1+3*1]*B_kj[1+3*0]
190 +A_ik[1+3*2]*B_kj[2+3*0];
191 C_ij_20 +=A_ik[2+3*0]*B_kj[0+3*0]
192 +A_ik[2+3*1]*B_kj[1+3*0]
193 +A_ik[2+3*2]*B_kj[2+3*0];
194
195 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*1]
196 +A_ik[0+3*1]*B_kj[1+3*1]
197 +A_ik[0+3*2]*B_kj[2+3*1];
198 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*1]
199 +A_ik[1+3*1]*B_kj[1+3*1]
200 +A_ik[1+3*2]*B_kj[2+3*1];
201 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*1]
202 +A_ik[2+3*1]*B_kj[1+3*1]
203 +A_ik[2+3*2]*B_kj[2+3*1];
204
205 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*2]
206 +A_ik[0+3*1]*B_kj[1+3*2]
207 +A_ik[0+3*2]*B_kj[2+3*2];
208 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*2]
209 +A_ik[1+3*1]*B_kj[1+3*2]
210 +A_ik[1+3*2]*B_kj[2+3*2];
211 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*2]
212 +A_ik[2+3*1]*B_kj[1+3*2]
213 +A_ik[2+3*2]*B_kj[2+3*2];
214 }
215
216 }
217 C_ij[0+3*0]=C_ij_00;
218 C_ij[1+3*0]=C_ij_10;
219 C_ij[2+3*0]=C_ij_20;
220 C_ij[0+3*1]=C_ij_01;
221 C_ij[1+3*1]=C_ij_11;
222 C_ij[2+3*1]=C_ij_21;
223 C_ij[0+3*2]=C_ij_02;
224 C_ij[1+3*2]=C_ij_12;
225 C_ij[2+3*2]=C_ij_22;
226 }
227 }
228 } /* end of parallel region */
229 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_col_block_size == 4) ){
230 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
231 {
232 #pragma omp for schedule(static)
233 for(i = 0; i < n; i++) {
234 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
235 j = C->pattern->index[ij_ptrC];
236
237 C_ij_00=0;
238 C_ij_10=0;
239 C_ij_20=0;
240 C_ij_30=0;
241 C_ij_01=0;
242 C_ij_11=0;
243 C_ij_21=0;
244 C_ij_31=0;
245 C_ij_02=0;
246 C_ij_12=0;
247 C_ij_22=0;
248 C_ij_32=0;
249 C_ij_03=0;
250 C_ij_13=0;
251 C_ij_23=0;
252 C_ij_33=0;
253
254 C_ij=&(C->val[ij_ptrC*16]);
255
256 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
257 k=A->pattern->index[ik_ptrA];
258 kj_ptrB=B->pattern->ptr[k];
259 /* find j in B[k,:] */
260 start_p=&(B->pattern->index[kj_ptrB]);
261 where_p=(index_t*)bsearch(&j, start_p,
262 B->pattern->ptr[k + 1]-kj_ptrB,
263 sizeof(index_t),
264 Paso_comparIndex);
265 if (! (where_p == NULL) ) { /* this should always be the case */
266 kj_ptrB += (index_t)(where_p-start_p);
267 A_ik=&(A->val[ik_ptrA*16]);
268 B_kj=&(B->val[kj_ptrB*16]);
269
270 C_ij_00 +=A_ik[0+4*0]*B_kj[0+4*0]
271 +A_ik[0+4*1]*B_kj[1+4*0]
272 +A_ik[0+4*2]*B_kj[2+4*0]
273 +A_ik[0+4*3]*B_kj[3+4*0];
274 C_ij_10 +=A_ik[1+4*0]*B_kj[0+4*0]
275 +A_ik[1+4*1]*B_kj[1+4*0]
276 +A_ik[1+4*2]*B_kj[2+4*0]
277 +A_ik[1+4*3]*B_kj[3+4*0];
278 C_ij_20 +=A_ik[2+4*0]*B_kj[0+4*0]
279 +A_ik[2+4*1]*B_kj[1+4*0]
280 +A_ik[2+4*2]*B_kj[2+4*0]
281 +A_ik[2+4*3]*B_kj[3+4*0];
282 C_ij_30 +=A_ik[3+4*0]*B_kj[0+4*0]
283 +A_ik[3+4*1]*B_kj[1+4*0]
284 +A_ik[3+4*2]*B_kj[2+4*0]
285 +A_ik[3+4*3]*B_kj[3+4*0];
286
287 C_ij_01 +=A_ik[0+4*0]*B_kj[0+4*1]
288 +A_ik[0+4*1]*B_kj[1+4*1]
289 +A_ik[0+4*2]*B_kj[2+4*1]
290 +A_ik[0+4*3]*B_kj[3+4*1];
291 C_ij_11 +=A_ik[1+4*0]*B_kj[0+4*1]
292 +A_ik[1+4*1]*B_kj[1+4*1]
293 +A_ik[1+4*2]*B_kj[2+4*1]
294 +A_ik[1+4*3]*B_kj[3+4*1];
295 C_ij_21 +=A_ik[2+4*0]*B_kj[0+4*1]
296 +A_ik[2+4*1]*B_kj[1+4*1]
297 +A_ik[2+4*2]*B_kj[2+4*1]
298 +A_ik[2+4*3]*B_kj[3+4*1];
299 C_ij_31 +=A_ik[3+4*0]*B_kj[0+4*1]
300 +A_ik[3+4*1]*B_kj[1+4*1]
301 +A_ik[3+4*2]*B_kj[2+4*1]
302 +A_ik[3+4*3]*B_kj[3+4*1];
303
304 C_ij_02 +=A_ik[0+4*0]*B_kj[0+4*2]
305 +A_ik[0+4*1]*B_kj[1+4*2]
306 +A_ik[0+4*2]*B_kj[2+4*2]
307 +A_ik[0+4*3]*B_kj[3+4*2];
308 C_ij_12 +=A_ik[1+4*0]*B_kj[0+4*2]
309 +A_ik[1+4*1]*B_kj[1+4*2]
310 +A_ik[1+4*2]*B_kj[2+4*2]
311 +A_ik[1+4*3]*B_kj[3+4*2];
312 C_ij_22 +=A_ik[2+4*0]*B_kj[0+4*2]
313 +A_ik[2+4*1]*B_kj[1+4*2]
314 +A_ik[2+4*2]*B_kj[2+4*2]
315 +A_ik[2+4*3]*B_kj[3+4*2];
316 C_ij_32 +=A_ik[3+4*0]*B_kj[0+4*2]
317 +A_ik[3+4*1]*B_kj[1+4*2]
318 +A_ik[3+4*2]*B_kj[2+4*2]
319 +A_ik[3+4*3]*B_kj[3+4*2];
320
321 C_ij_03 +=A_ik[0+4*0]*B_kj[0+4*3]
322 +A_ik[0+4*1]*B_kj[1+4*3]
323 +A_ik[0+4*2]*B_kj[2+4*3]
324 +A_ik[0+4*3]*B_kj[3+4*3];
325 C_ij_13 +=A_ik[1+4*0]*B_kj[0+4*3]
326 +A_ik[1+4*1]*B_kj[1+4*3]
327 +A_ik[1+4*2]*B_kj[2+4*3]
328 +A_ik[1+4*3]*B_kj[3+4*3];
329 C_ij_23 +=A_ik[2+4*0]*B_kj[0+4*3]
330 +A_ik[2+4*1]*B_kj[1+4*3]
331 +A_ik[2+4*2]*B_kj[2+4*3]
332 +A_ik[2+4*3]*B_kj[3+4*3];
333 C_ij_33 +=A_ik[3+4*0]*B_kj[0+4*3]
334 +A_ik[3+4*1]*B_kj[1+4*3]
335 +A_ik[3+4*2]*B_kj[2+4*3]
336 +A_ik[3+4*3]*B_kj[3+4*3];
337 }
338 }
339 C_ij[0+4*0]=C_ij_00;
340 C_ij[1+4*0]=C_ij_10;
341 C_ij[2+4*0]=C_ij_20;
342 C_ij[3+4*0]=C_ij_30;
343 C_ij[0+4*1]=C_ij_01;
344 C_ij[1+4*1]=C_ij_11;
345 C_ij[2+4*1]=C_ij_21;
346 C_ij[3+4*1]=C_ij_31;
347 C_ij[0+4*2]=C_ij_02;
348 C_ij[1+4*2]=C_ij_12;
349 C_ij[2+4*2]=C_ij_22;
350 C_ij[3+4*2]=C_ij_32;
351 C_ij[0+4*3]=C_ij_03;
352 C_ij[1+4*3]=C_ij_13;
353 C_ij[2+4*3]=C_ij_23;
354 C_ij[3+4*3]=C_ij_33;
355 }
356 }
357 } /* end of parallel region */
358
359 } else {
360 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
361 {
362 #pragma omp for schedule(static)
363 for(i = 0; i < n; i++) {
364 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
365 j = C->pattern->index[ij_ptrC];
366 C_ij=&(C->val[ij_ptrC*C_block_size]);
367 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
368 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
369 k=A->pattern->index[ik_ptrA];
370 kj_ptrB=B->pattern->ptr[k];
371 /* find j in B[k,:] */
372 start_p=&(B->pattern->index[kj_ptrB]);
373 where_p=(index_t*)bsearch(&j, start_p,
374 B->pattern->ptr[k + 1]-kj_ptrB,
375 sizeof(index_t),
376 Paso_comparIndex);
377 if (! (where_p == NULL) ) { /* this should always be the case */
378 kj_ptrB += (index_t)(where_p-start_p);
379 A_ik=&(A->val[ik_ptrA*A_block_size]);
380 B_kj=&(B->val[kj_ptrB*B_block_size]);
381
382 for (irb=0; irb<row_block_size; ++irb) {
383 for (icb=0; icb<col_block_size; ++icb) {
384 rtmp=C_ij[irb+row_block_size*icb];
385 for (ib=0; ib<A_col_block_size; ++ib) {
386 rtmp+=A_ik[irb+row_block_size*ib]*B_kj[ib+A_col_block_size*icb];
387 }
388 C_ij[irb+row_block_size*icb]=rtmp;
389 }
390 }
391 }
392 }
393 }
394 }
395 } /* end of parallel region */
396
397 }
398 }
399
400 /* not good for block size 1 */
401 void Paso_SparseMatrix_MatrixMatrix_DB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
402 {
403 const dim_t n = C->numRows;
404 const dim_t row_block_size = C->row_block_size;
405 const dim_t col_block_size = C->col_block_size;
406 const dim_t A_col_block_size = A->col_block_size;
407 const dim_t C_block_size =C->block_size;
408 const dim_t B_block_size =B->block_size;
409 const dim_t A_block_size =A->block_size;
410 double *C_ij, *A_ik, *B_kj;
411 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
412 dim_t i, ib, irb, icb;
413 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
414
415 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_block_size == 2) ) {
416 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
417 {
418 #pragma omp for schedule(static)
419 for(i = 0; i < n; i++) {
420 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
421 j = C->pattern->index[ij_ptrC];
422
423 C_ij_00=0;
424 C_ij_10=0;
425 C_ij_01=0;
426 C_ij_11=0;
427
428 C_ij=&(C->val[ij_ptrC*4]);
429
430 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
431 k=A->pattern->index[ik_ptrA];
432 kj_ptrB=B->pattern->ptr[k];
433 /* find j in B[k,:] */
434 start_p=&(B->pattern->index[kj_ptrB]);
435 where_p=(index_t*)bsearch(&j, start_p,
436 B->pattern->ptr[k + 1]-kj_ptrB,
437 sizeof(index_t),
438 Paso_comparIndex);
439 if (! (where_p == NULL) ) { /* this should always be the case */
440 kj_ptrB += (index_t)(where_p-start_p);
441 A_ik=&(A->val[ik_ptrA*2]);
442 B_kj=&(B->val[kj_ptrB*4]);
443
444 C_ij_00 +=A_ik[0]*B_kj[0+2*0];
445 C_ij_10 +=A_ik[1]*B_kj[1+2*0];
446
447 C_ij_01 +=A_ik[0]*B_kj[0+2*1];
448 C_ij_11 +=A_ik[1]*B_kj[1+2*1];
449 }
450
451 }
452 C_ij[0+2*0]=C_ij_00;
453 C_ij[1+2*0]=C_ij_10;
454 C_ij[0+2*1]=C_ij_01;
455 C_ij[1+2*1]=C_ij_11;
456
457 }
458 }
459 } /* end of parallel region */
460
461 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_block_size == 3) ){
462 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
463 {
464 #pragma omp for schedule(static)
465 for(i = 0; i < n; i++) {
466 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
467 j = C->pattern->index[ij_ptrC];
468
469 C_ij_00=0;
470 C_ij_10=0;
471 C_ij_20=0;
472 C_ij_01=0;
473 C_ij_11=0;
474 C_ij_21=0;
475 C_ij_02=0;
476 C_ij_12=0;
477 C_ij_22=0;
478
479 C_ij=&(C->val[ij_ptrC*9]);
480
481 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
482 k=A->pattern->index[ik_ptrA];
483 kj_ptrB=B->pattern->ptr[k];
484 /* find j in B[k,:] */
485 start_p=&(B->pattern->index[kj_ptrB]);
486 where_p=(index_t*)bsearch(&j, start_p,
487 B->pattern->ptr[k + 1]-kj_ptrB,
488 sizeof(index_t),
489 Paso_comparIndex);
490 if (! (where_p == NULL) ) { /* this should always be the case */
491 kj_ptrB += (index_t)(where_p-start_p);
492 A_ik=&(A->val[ik_ptrA*3]);
493 B_kj=&(B->val[kj_ptrB*9]);
494
495 C_ij_00 +=A_ik[0]*B_kj[0+3*0];
496 C_ij_10 +=A_ik[1]*B_kj[1+3*0];
497 C_ij_20 +=A_ik[2]*B_kj[2+3*0];
498
499 C_ij_01 +=A_ik[0]*B_kj[0+3*1];
500 C_ij_11 +=A_ik[1]*B_kj[1+3*1];
501 C_ij_21 +=A_ik[2]*B_kj[2+3*1];
502
503 C_ij_02 +=A_ik[0]*B_kj[0+3*2];
504 C_ij_12 +=A_ik[1]*B_kj[1+3*2];
505 C_ij_22 +=A_ik[2]*B_kj[2+3*2];
506
507 }
508
509 }
510 C_ij[0+3*0]=C_ij_00;
511 C_ij[1+3*0]=C_ij_10;
512 C_ij[2+3*0]=C_ij_20;
513 C_ij[0+3*1]=C_ij_01;
514 C_ij[1+3*1]=C_ij_11;
515 C_ij[2+3*1]=C_ij_21;
516 C_ij[0+3*2]=C_ij_02;
517 C_ij[1+3*2]=C_ij_12;
518 C_ij[2+3*2]=C_ij_22;
519 }
520 }
521 } /* end of parallel region */
522 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_block_size == 4) ){
523 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
524 {
525 #pragma omp for schedule(static)
526 for(i = 0; i < n; i++) {
527 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
528 j = C->pattern->index[ij_ptrC];
529
530 C_ij_00=0;
531 C_ij_10=0;
532 C_ij_20=0;
533 C_ij_30=0;
534 C_ij_01=0;
535 C_ij_11=0;
536 C_ij_21=0;
537 C_ij_31=0;
538 C_ij_02=0;
539 C_ij_12=0;
540 C_ij_22=0;
541 C_ij_32=0;
542 C_ij_03=0;
543 C_ij_13=0;
544 C_ij_23=0;
545 C_ij_33=0;
546
547 C_ij=&(C->val[ij_ptrC*16]);
548
549 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
550 k=A->pattern->index[ik_ptrA];
551 kj_ptrB=B->pattern->ptr[k];
552 /* find j in B[k,:] */
553 start_p=&(B->pattern->index[kj_ptrB]);
554 where_p=(index_t*)bsearch(&j, start_p,
555 B->pattern->ptr[k + 1]-kj_ptrB,
556 sizeof(index_t),
557 Paso_comparIndex);
558 if (! (where_p == NULL) ) { /* this should always be the case */
559 kj_ptrB += (index_t)(where_p-start_p);
560 A_ik=&(A->val[ik_ptrA*4]);
561 B_kj=&(B->val[kj_ptrB*16]);
562
563 C_ij_00 +=A_ik[0]*B_kj[0+4*0];
564 C_ij_10 +=A_ik[1]*B_kj[1+4*0];
565 C_ij_20 +=A_ik[2]*B_kj[2+4*0];
566 C_ij_30 +=A_ik[3]*B_kj[3+4*0];
567
568 C_ij_01 +=A_ik[0]*B_kj[0+4*1];
569 C_ij_11 +=A_ik[1]*B_kj[1+4*1];
570 C_ij_21 +=A_ik[2]*B_kj[2+4*1];
571 C_ij_31 +=A_ik[3]*B_kj[3+4*1];
572
573 C_ij_02 +=A_ik[0]*B_kj[0+4*2];
574 C_ij_12 +=A_ik[1]*B_kj[1+4*2];
575 C_ij_22 +=A_ik[2]*B_kj[2+4*2];
576 C_ij_32 +=A_ik[3]*B_kj[3+4*2];
577
578 C_ij_03 +=A_ik[0]*B_kj[0+4*3];
579 C_ij_13 +=A_ik[1]*B_kj[1+4*3];
580 C_ij_23 +=A_ik[2]*B_kj[2+4*3];
581 C_ij_33 +=A_ik[3]*B_kj[3+4*3];
582 }
583 }
584 C_ij[0+4*0]=C_ij_00;
585 C_ij[1+4*0]=C_ij_10;
586 C_ij[2+4*0]=C_ij_20;
587 C_ij[3+4*0]=C_ij_30;
588 C_ij[0+4*1]=C_ij_01;
589 C_ij[1+4*1]=C_ij_11;
590 C_ij[2+4*1]=C_ij_21;
591 C_ij[3+4*1]=C_ij_31;
592 C_ij[0+4*2]=C_ij_02;
593 C_ij[1+4*2]=C_ij_12;
594 C_ij[2+4*2]=C_ij_22;
595 C_ij[3+4*2]=C_ij_32;
596 C_ij[0+4*3]=C_ij_03;
597 C_ij[1+4*3]=C_ij_13;
598 C_ij[2+4*3]=C_ij_23;
599 C_ij[3+4*3]=C_ij_33;
600 }
601 }
602 } /* end of parallel region */
603
604 } else {
605 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
606 {
607 #pragma omp for schedule(static)
608 for(i = 0; i < n; i++) {
609 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
610 j = C->pattern->index[ij_ptrC];
611 C_ij=&(C->val[ij_ptrC*C_block_size]);
612 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
613 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
614 k=A->pattern->index[ik_ptrA];
615 kj_ptrB=B->pattern->ptr[k];
616 /* find j in B[k,:] */
617 start_p=&(B->pattern->index[kj_ptrB]);
618 where_p=(index_t*)bsearch(&j, start_p,
619 B->pattern->ptr[k + 1]-kj_ptrB,
620 sizeof(index_t),
621 Paso_comparIndex);
622 if (! (where_p == NULL) ) { /* this should always be the case */
623 kj_ptrB += (index_t)(where_p-start_p);
624 A_ik=&(A->val[ik_ptrA*A_block_size]);
625 B_kj=&(B->val[kj_ptrB*B_block_size]);
626
627 for (irb=0; irb<A_block_size; ++irb) {
628 rtmp=A_ik[irb];
629 for (icb=0; icb<col_block_size; ++icb) {
630 C_ij[irb+row_block_size*icb]+=rtmp*B_kj[irb+A_col_block_size*icb];
631 }
632 }
633 }
634 }
635 }
636 }
637 } /* end of parallel region */
638
639 }
640 }
641
642 /* not good for block size 1 */
643 void Paso_SparseMatrix_MatrixMatrix_BD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
644 {
645 const dim_t n = C->numRows;
646 const dim_t row_block_size = C->row_block_size;
647 const dim_t col_block_size = C->col_block_size;
648 const dim_t C_block_size =C->block_size;
649 const dim_t B_block_size =B->block_size;
650 const dim_t A_block_size =A->block_size;
651 double *C_ij, *A_ik, *B_kj;
652 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
653 dim_t i, ib, irb, icb;
654 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
655
656 if ( (row_block_size == 2) && (col_block_size ==2 ) && (B_block_size == 2) ) {
657 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
658 {
659 #pragma omp for schedule(static)
660 for(i = 0; i < n; i++) {
661 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
662 j = C->pattern->index[ij_ptrC];
663
664 C_ij_00=0;
665 C_ij_10=0;
666 C_ij_01=0;
667 C_ij_11=0;
668
669 C_ij=&(C->val[ij_ptrC*4]);
670
671 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
672 k=A->pattern->index[ik_ptrA];
673 kj_ptrB=B->pattern->ptr[k];
674 /* find j in B[k,:] */
675 start_p=&(B->pattern->index[kj_ptrB]);
676 where_p=(index_t*)bsearch(&j, start_p,
677 B->pattern->ptr[k + 1]-kj_ptrB,
678 sizeof(index_t),
679 Paso_comparIndex);
680 if (! (where_p == NULL) ) { /* this should always be the case */
681 kj_ptrB += (index_t)(where_p-start_p);
682 A_ik=&(A->val[ik_ptrA*4]);
683 B_kj=&(B->val[kj_ptrB*2]);
684
685 C_ij_00 +=A_ik[0+2*0]*B_kj[0];
686 C_ij_10 +=A_ik[1+2*0]*B_kj[0];
687
688 C_ij_01 +=A_ik[0+2*1]*B_kj[1];
689 C_ij_11 +=A_ik[1+2*1]*B_kj[1];
690 }
691
692 }
693 C_ij[0+2*0]=C_ij_00;
694 C_ij[1+2*0]=C_ij_10;
695 C_ij[0+2*1]=C_ij_01;
696 C_ij[1+2*1]=C_ij_11;
697
698 }
699 }
700 } /* end of parallel region */
701
702 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (B_block_size == 3) ){
703 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
704 {
705 #pragma omp for schedule(static)
706 for(i = 0; i < n; i++) {
707 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
708 j = C->pattern->index[ij_ptrC];
709
710 C_ij_00=0;
711 C_ij_10=0;
712 C_ij_20=0;
713 C_ij_01=0;
714 C_ij_11=0;
715 C_ij_21=0;
716 C_ij_02=0;
717 C_ij_12=0;
718 C_ij_22=0;
719
720 C_ij=&(C->val[ij_ptrC*9]);
721
722 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
723 k=A->pattern->index[ik_ptrA];
724 kj_ptrB=B->pattern->ptr[k];
725 /* find j in B[k,:] */
726 start_p=&(B->pattern->index[kj_ptrB]);
727 where_p=(index_t*)bsearch(&j, start_p,
728 B->pattern->ptr[k + 1]-kj_ptrB,
729 sizeof(index_t),
730 Paso_comparIndex);
731 if (! (where_p == NULL) ) { /* this should always be the case */
732 kj_ptrB += (index_t)(where_p-start_p);
733 A_ik=&(A->val[ik_ptrA*9]);
734 B_kj=&(B->val[kj_ptrB*3]);
735
736 C_ij_00 +=A_ik[0+3*0]*B_kj[0];
737 C_ij_10 +=A_ik[1+3*0]*B_kj[0];
738 C_ij_20 +=A_ik[2+3*0]*B_kj[0];
739
740 C_ij_01 +=A_ik[0+3*1]*B_kj[1];
741 C_ij_11 +=A_ik[1+3*1]*B_kj[1];
742 C_ij_21 +=A_ik[2+3*1]*B_kj[1];
743
744 C_ij_02 +=A_ik[0+3*2]*B_kj[2];
745 C_ij_12 +=A_ik[1+3*2]*B_kj[2];
746 C_ij_22 +=A_ik[2+3*2]*B_kj[2];
747 }
748
749 }
750 C_ij[0+3*0]=C_ij_00;
751 C_ij[1+3*0]=C_ij_10;
752 C_ij[2+3*0]=C_ij_20;
753 C_ij[0+3*1]=C_ij_01;
754 C_ij[1+3*1]=C_ij_11;
755 C_ij[2+3*1]=C_ij_21;
756 C_ij[0+3*2]=C_ij_02;
757 C_ij[1+3*2]=C_ij_12;
758 C_ij[2+3*2]=C_ij_22;
759 }
760 }
761 } /* end of parallel region */
762 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (B_block_size == 4) ){
763 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
764 {
765 #pragma omp for schedule(static)
766 for(i = 0; i < n; i++) {
767 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
768 j = C->pattern->index[ij_ptrC];
769
770 C_ij_00=0;
771 C_ij_10=0;
772 C_ij_20=0;
773 C_ij_30=0;
774 C_ij_01=0;
775 C_ij_11=0;
776 C_ij_21=0;
777 C_ij_31=0;
778 C_ij_02=0;
779 C_ij_12=0;
780 C_ij_22=0;
781 C_ij_32=0;
782 C_ij_03=0;
783 C_ij_13=0;
784 C_ij_23=0;
785 C_ij_33=0;
786
787 C_ij=&(C->val[ij_ptrC*16]);
788
789 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
790 k=A->pattern->index[ik_ptrA];
791 kj_ptrB=B->pattern->ptr[k];
792 /* find j in B[k,:] */
793 start_p=&(B->pattern->index[kj_ptrB]);
794 where_p=(index_t*)bsearch(&j, start_p,
795 B->pattern->ptr[k + 1]-kj_ptrB,
796 sizeof(index_t),
797 Paso_comparIndex);
798 if (! (where_p == NULL) ) { /* this should always be the case */
799 kj_ptrB += (index_t)(where_p-start_p);
800 A_ik=&(A->val[ik_ptrA*16]);
801 B_kj=&(B->val[kj_ptrB*4]);
802
803 C_ij_00 +=A_ik[0+4*0]*B_kj[0];
804 C_ij_10 +=A_ik[1+4*0]*B_kj[0];
805 C_ij_20 +=A_ik[2+4*0]*B_kj[0];
806 C_ij_30 +=A_ik[3+4*0]*B_kj[0];
807
808 C_ij_01 +=A_ik[0+4*1]*B_kj[1];
809 C_ij_11 +=A_ik[1+4*1]*B_kj[1];
810 C_ij_21 +=A_ik[2+4*1]*B_kj[1];
811 C_ij_31 +=A_ik[3+4*1]*B_kj[1];
812
813 C_ij_02 +=A_ik[0+4*2]*B_kj[2];
814 C_ij_12 +=A_ik[1+4*2]*B_kj[2];
815 C_ij_22 +=A_ik[2+4*2]*B_kj[2];
816 C_ij_32 +=A_ik[3+4*2]*B_kj[2];
817
818 C_ij_03 +=A_ik[0+4*3]*B_kj[3];
819 C_ij_13 +=A_ik[1+4*3]*B_kj[3];
820 C_ij_23 +=A_ik[2+4*3]*B_kj[3];
821 C_ij_33 +=A_ik[3+4*3]*B_kj[3];
822 }
823 }
824 C_ij[0+4*0]=C_ij_00;
825 C_ij[1+4*0]=C_ij_10;
826 C_ij[2+4*0]=C_ij_20;
827 C_ij[3+4*0]=C_ij_30;
828 C_ij[0+4*1]=C_ij_01;
829 C_ij[1+4*1]=C_ij_11;
830 C_ij[2+4*1]=C_ij_21;
831 C_ij[3+4*1]=C_ij_31;
832 C_ij[0+4*2]=C_ij_02;
833 C_ij[1+4*2]=C_ij_12;
834 C_ij[2+4*2]=C_ij_22;
835 C_ij[3+4*2]=C_ij_32;
836 C_ij[0+4*3]=C_ij_03;
837 C_ij[1+4*3]=C_ij_13;
838 C_ij[2+4*3]=C_ij_23;
839 C_ij[3+4*3]=C_ij_33;
840 }
841 }
842 } /* end of parallel region */
843
844 } else {
845 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
846 {
847 #pragma omp for schedule(static)
848 for(i = 0; i < n; i++) {
849 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
850 j = C->pattern->index[ij_ptrC];
851 C_ij=&(C->val[ij_ptrC*C_block_size]);
852 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
853 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
854 k=A->pattern->index[ik_ptrA];
855 kj_ptrB=B->pattern->ptr[k];
856 /* find j in B[k,:] */
857 start_p=&(B->pattern->index[kj_ptrB]);
858 where_p=(index_t*)bsearch(&j, start_p,
859 B->pattern->ptr[k + 1]-kj_ptrB,
860 sizeof(index_t),
861 Paso_comparIndex);
862 if (! (where_p == NULL) ) { /* this should always be the case */
863 kj_ptrB += (index_t)(where_p-start_p);
864 A_ik=&(A->val[ik_ptrA*A_block_size]);
865 B_kj=&(B->val[kj_ptrB*B_block_size]);
866
867 for (icb=0; icb<B_block_size; ++icb) {
868 rtmp=B_kj[icb];
869 for (irb=0; irb<row_block_size; ++irb) {
870 C_ij[irb+row_block_size*icb]+=A_ik[irb+row_block_size*icb]*rtmp;
871 }
872 }
873 }
874 }
875 }
876 }
877 } /* end of parallel region */
878
879 }
880 }
881
882 /* not good for block size 1 */
883 void Paso_SparseMatrix_MatrixMatrix_DD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
884 {
885 const dim_t n = C->numRows;
886 const dim_t C_block_size =C->block_size;
887 const dim_t B_block_size =B->block_size;
888 const dim_t A_block_size =A->block_size;
889 double *C_ij, *A_ik, *B_kj;
890 register double C_ij_0, C_ij_1, C_ij_2, C_ij_3;
891 dim_t i, ib;
892 index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
893
894 if ( (A_block_size == 1) && (B_block_size ==1) && (C_block_size == 1) ) {
895 #pragma omp parallel private(i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, C_ij_0)
896 {
897 #pragma omp for schedule(static)
898 for(i = 0; i < n; i++) {
899 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
900 j = C->pattern->index[ij_ptrC];
901 C_ij_0=0;
902 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
903 k=A->pattern->index[ik_ptrA];
904 kj_ptrB=B->pattern->ptr[k];
905 /* find j in B[k,:] */
906 start_p=&(B->pattern->index[kj_ptrB]);
907 where_p=(index_t*)bsearch(&j, start_p,
908 B->pattern->ptr[k + 1]-kj_ptrB,
909 sizeof(index_t),
910 Paso_comparIndex);
911 if (! (where_p == NULL) ) { /* this should always be the case */
912 kj_ptrB += (index_t)(where_p-start_p);
913 C_ij_0+=A->val[ik_ptrA]*B->val[kj_ptrB];
914 }
915
916 }
917 C->val[ij_ptrC]=C_ij_0;
918 }
919 }
920 } /* end of parallel region */
921 } else if ( (A_block_size == 2) && (B_block_size ==2) && (C_block_size == 2) ) {
922 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1)
923 {
924 #pragma omp for schedule(static)
925 for(i = 0; i < n; i++) {
926 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
927 j = C->pattern->index[ij_ptrC];
928 C_ij=&(C->val[ij_ptrC*2]);
929 C_ij_0=0;
930 C_ij_1=0;
931 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
932 k=A->pattern->index[ik_ptrA];
933 kj_ptrB=B->pattern->ptr[k];
934 /* find j in B[k,:] */
935 start_p=&(B->pattern->index[kj_ptrB]);
936 where_p=(index_t*)bsearch(&j, start_p,
937 B->pattern->ptr[k + 1]-kj_ptrB,
938 sizeof(index_t),
939 Paso_comparIndex);
940 if (! (where_p == NULL) ) { /* this should always be the case */
941 kj_ptrB += (index_t)(where_p-start_p);
942 A_ik=&(A->val[ik_ptrA*2]);
943 B_kj=&(B->val[kj_ptrB*2]);
944
945 C_ij_0+=A_ik[0]*B_kj[0];
946 C_ij_1+=A_ik[1]*B_kj[1];
947 }
948
949 }
950 C_ij[0]=C_ij_0;
951 C_ij[1]=C_ij_1;
952 }
953 }
954 } /* end of parallel region */
955 } else if ( (A_block_size == 3) && (B_block_size ==3) && (C_block_size == 3) ) {
956 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2)
957 {
958 #pragma omp for schedule(static)
959 for(i = 0; i < n; i++) {
960 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
961 j = C->pattern->index[ij_ptrC];
962 C_ij=&(C->val[ij_ptrC*C_block_size]);
963 C_ij_0=0;
964 C_ij_1=0;
965 C_ij_2=0;
966 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
967 k=A->pattern->index[ik_ptrA];
968 kj_ptrB=B->pattern->ptr[k];
969 /* find j in B[k,:] */
970 start_p=&(B->pattern->index[kj_ptrB]);
971 where_p=(index_t*)bsearch(&j, start_p,
972 B->pattern->ptr[k + 1]-kj_ptrB,
973 sizeof(index_t),
974 Paso_comparIndex);
975 if (! (where_p == NULL) ) { /* this should always be the case */
976 kj_ptrB += (index_t)(where_p-start_p);
977 A_ik=&(A->val[ik_ptrA*3]);
978 B_kj=&(B->val[kj_ptrB*3]);
979
980 C_ij_0+=A_ik[0]*B_kj[0];
981 C_ij_1+=A_ik[1]*B_kj[1];
982 C_ij_2+=A_ik[2]*B_kj[2];
983 }
984
985 }
986 C_ij[0]=C_ij_0;
987 C_ij[1]=C_ij_1;
988 C_ij[2]=C_ij_2;
989 }
990 }
991 } /* end of parallel region */
992 } else if ( (A_block_size == 4) && (B_block_size ==4 ) && (C_block_size == 4) ) {
993 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2, C_ij_3 )
994 {
995 #pragma omp for schedule(static)
996 for(i = 0; i < n; i++) {
997 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
998 j = C->pattern->index[ij_ptrC];
999 C_ij=&(C->val[ij_ptrC*C_block_size]);
1000 C_ij_0=0;
1001 C_ij_1=0;
1002 C_ij_2=0;
1003 C_ij_3=0;
1004 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
1005 k=A->pattern->index[ik_ptrA];
1006 kj_ptrB=B->pattern->ptr[k];
1007 /* find j in B[k,:] */
1008 start_p=&(B->pattern->index[kj_ptrB]);
1009 where_p=(index_t*)bsearch(&j, start_p,
1010 B->pattern->ptr[k + 1]-kj_ptrB,
1011 sizeof(index_t),
1012 Paso_comparIndex);
1013 if (! (where_p == NULL) ) { /* this should always be the case */
1014 kj_ptrB += (index_t)(where_p-start_p);
1015 A_ik=&(A->val[ik_ptrA*4]);
1016 B_kj=&(B->val[kj_ptrB*4]);
1017
1018 C_ij_0+=A_ik[0]*B_kj[0];
1019 C_ij_1+=A_ik[1]*B_kj[1];
1020 C_ij_2+=A_ik[2]*B_kj[2];
1021 C_ij_3+=A_ik[3]*B_kj[3];
1022 }
1023
1024 }
1025 C_ij[0]=C_ij_0;
1026 C_ij[1]=C_ij_1;
1027 C_ij[2]=C_ij_2;
1028 C_ij[3]=C_ij_3;
1029 }
1030 }
1031 } /* end of parallel region */
1032 } else {
1033 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, ib, A_ik,B_kj )
1034 {
1035 #pragma omp for schedule(static)
1036 for(i = 0; i < n; i++) {
1037 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1038 j = C->pattern->index[ij_ptrC];
1039 C_ij=&(C->val[ij_ptrC*C_block_size]);
1040 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
1041 for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
1042 k=A->pattern->index[ik_ptrA];
1043 kj_ptrB=B->pattern->ptr[k];
1044 /* find j in B[k,:] */
1045 start_p=&(B->pattern->index[kj_ptrB]);
1046 where_p=(index_t*)bsearch(&j, start_p,
1047 B->pattern->ptr[k + 1]-kj_ptrB,
1048 sizeof(index_t),
1049 Paso_comparIndex);
1050 if (! (where_p == NULL) ) { /* this should always be the case */
1051 kj_ptrB += (index_t)(where_p-start_p);
1052 A_ik=&(A->val[ik_ptrA*A_block_size]);
1053 B_kj=&(B->val[kj_ptrB*B_block_size]);
1054 for (ib=0; ib<MIN(A_block_size, B_block_size); ++ib) C_ij[ib]+=A_ik[ib]*B_kj[ib];
1055 }
1056 }
1057 }
1058 }
1059 } /* end of parallel region */
1060 }
1061 }

  ViewVC Help
Powered by ViewVC 1.1.26