1 |
|
2 |
/***************************************************************************** |
3 |
* |
4 |
* Copyright (c) 2003-2013 by University of Queensland |
5 |
* http://www.uq.edu.au |
6 |
* |
7 |
* Primary Business: Queensland, Australia |
8 |
* Licensed under the Open Software License version 3.0 |
9 |
* http://www.opensource.org/licenses/osl-3.0.php |
10 |
* |
11 |
* Development until 2012 by Earth Systems Science Computational Center (ESSCC) |
12 |
* Development since 2012 by School of Earth Sciences |
13 |
* |
14 |
*****************************************************************************/ |
15 |
|
16 |
|
17 |
/************************************************************************************ |
18 |
|
19 |
Paso: Sparse matrix product |
20 |
|
21 |
************************************************************************************ |
22 |
|
23 |
Author: artak@uq.edu.au, l.gross@uq.edu.au |
24 |
|
25 |
************************************************************************************/ |
26 |
|
27 |
#include "SparseMatrix.h" |
28 |
#include "Paso.h" |
29 |
|
30 |
Paso_SparseMatrix* Paso_SparseMatrix_MatrixMatrix(const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) { |
31 |
Paso_SparseMatrixType C_type; |
32 |
|
33 |
Paso_Pattern* outpattern=NULL; |
34 |
Paso_SparseMatrix *out=NULL; |
35 |
if ( ! ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (A->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & A->type ) ) ) { |
36 |
Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of A."); |
37 |
return NULL; |
38 |
} |
39 |
if ( ! ( (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (B->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & B->type ) ) ) { |
40 |
Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of B."); |
41 |
return NULL; |
42 |
} |
43 |
if (! (A->col_block_size == B->row_block_size) ) { |
44 |
Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Column block size of A and row block size of B must match."); |
45 |
return NULL; |
46 |
} |
47 |
if (! (A->numCols == B->numRows) ) { |
48 |
Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: number of columns of A and number of rows of B must match."); |
49 |
return NULL; |
50 |
} |
51 |
|
52 |
|
53 |
if ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) && (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) ) { |
54 |
C_type=MATRIX_FORMAT_DIAGONAL_BLOCK; |
55 |
} else { |
56 |
C_type=MATRIX_FORMAT_DEFAULT; |
57 |
} |
58 |
|
59 |
outpattern=Paso_Pattern_multiply(MATRIX_FORMAT_DEFAULT,A->pattern,B->pattern); |
60 |
|
61 |
if (Esys_noError()) { |
62 |
out=Paso_SparseMatrix_alloc(C_type, outpattern, A->row_block_size, B->col_block_size, FALSE); |
63 |
} |
64 |
Paso_Pattern_free(outpattern); |
65 |
|
66 |
if (Esys_noError()) { |
67 |
if ( (A->row_block_size == 1) && (B->col_block_size ==1 ) && (A->col_block_size ==1) ) { |
68 |
Paso_SparseMatrix_MatrixMatrix_DD(out, A,B); |
69 |
} else { |
70 |
if (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) { |
71 |
if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) { |
72 |
Paso_SparseMatrix_MatrixMatrix_DD(out, A,B); |
73 |
} else { |
74 |
Paso_SparseMatrix_MatrixMatrix_DB(out, A,B); |
75 |
} |
76 |
} else { |
77 |
if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) { |
78 |
Paso_SparseMatrix_MatrixMatrix_BD(out, A,B); |
79 |
} else { |
80 |
Paso_SparseMatrix_MatrixMatrix_BB(out, A,B); |
81 |
} |
82 |
} |
83 |
} |
84 |
return out; |
85 |
} else { |
86 |
Paso_SparseMatrix_free(out); |
87 |
return NULL; |
88 |
} |
89 |
} |
90 |
/* not good for block size 1 */ |
91 |
void Paso_SparseMatrix_MatrixMatrix_BB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) |
92 |
{ |
93 |
const dim_t n = C->numRows; |
94 |
const dim_t row_block_size = C->row_block_size; |
95 |
const dim_t col_block_size = C->col_block_size; |
96 |
const dim_t A_col_block_size = A->col_block_size; |
97 |
const dim_t C_block_size =C->block_size; |
98 |
const dim_t B_block_size =B->block_size; |
99 |
const dim_t A_block_size =A->block_size; |
100 |
double *C_ij, *A_ik, *B_kj; |
101 |
register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33; |
102 |
dim_t i, ib, irb, icb; |
103 |
index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p; |
104 |
|
105 |
if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_col_block_size == 2) ) { |
106 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11) |
107 |
{ |
108 |
#pragma omp for schedule(static) |
109 |
for(i = 0; i < n; i++) { |
110 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
111 |
j = C->pattern->index[ij_ptrC]; |
112 |
|
113 |
C_ij_00=0; |
114 |
C_ij_10=0; |
115 |
C_ij_01=0; |
116 |
C_ij_11=0; |
117 |
|
118 |
C_ij=&(C->val[ij_ptrC*4]); |
119 |
|
120 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
121 |
k=A->pattern->index[ik_ptrA]; |
122 |
kj_ptrB=B->pattern->ptr[k]; |
123 |
/* find j in B[k,:] */ |
124 |
start_p=&(B->pattern->index[kj_ptrB]); |
125 |
where_p=(index_t*)bsearch(&j, start_p, |
126 |
B->pattern->ptr[k + 1]-kj_ptrB, |
127 |
sizeof(index_t), |
128 |
Paso_comparIndex); |
129 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
130 |
kj_ptrB += (index_t)(where_p-start_p); |
131 |
A_ik=&(A->val[ik_ptrA*4]); |
132 |
B_kj=&(B->val[kj_ptrB*4]); |
133 |
|
134 |
C_ij_00 +=A_ik[0+2*0]*B_kj[0+2*0]+A_ik[0+2*1]*B_kj[1+2*0]; |
135 |
C_ij_10 +=A_ik[1+2*0]*B_kj[0+2*0]+A_ik[1+2*1]*B_kj[1+2*0]; |
136 |
|
137 |
C_ij_01 +=A_ik[0+2*0]*B_kj[0+2*1]+A_ik[0+2*1]*B_kj[1+2*1]; |
138 |
C_ij_11 +=A_ik[1+2*0]*B_kj[0+2*1]+A_ik[1+2*1]*B_kj[1+2*1]; |
139 |
} |
140 |
|
141 |
} |
142 |
C_ij[0+2*0]=C_ij_00; |
143 |
C_ij[1+2*0]=C_ij_10; |
144 |
C_ij[0+2*1]=C_ij_01; |
145 |
C_ij[1+2*1]=C_ij_11; |
146 |
|
147 |
} |
148 |
} |
149 |
} /* end of parallel region */ |
150 |
|
151 |
} else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_col_block_size == 3) ){ |
152 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22) |
153 |
{ |
154 |
#pragma omp for schedule(static) |
155 |
for(i = 0; i < n; i++) { |
156 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
157 |
j = C->pattern->index[ij_ptrC]; |
158 |
|
159 |
C_ij_00=0; |
160 |
C_ij_10=0; |
161 |
C_ij_20=0; |
162 |
C_ij_01=0; |
163 |
C_ij_11=0; |
164 |
C_ij_21=0; |
165 |
C_ij_02=0; |
166 |
C_ij_12=0; |
167 |
C_ij_22=0; |
168 |
|
169 |
C_ij=&(C->val[ij_ptrC*9]); |
170 |
|
171 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
172 |
k=A->pattern->index[ik_ptrA]; |
173 |
kj_ptrB=B->pattern->ptr[k]; |
174 |
/* find j in B[k,:] */ |
175 |
start_p=&(B->pattern->index[kj_ptrB]); |
176 |
where_p=(index_t*)bsearch(&j, start_p, |
177 |
B->pattern->ptr[k + 1]-kj_ptrB, |
178 |
sizeof(index_t), |
179 |
Paso_comparIndex); |
180 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
181 |
kj_ptrB += (index_t)(where_p-start_p); |
182 |
A_ik=&(A->val[ik_ptrA*9]); |
183 |
B_kj=&(B->val[kj_ptrB*9]); |
184 |
|
185 |
C_ij_00 +=A_ik[0+3*0]*B_kj[0+3*0] |
186 |
+A_ik[0+3*1]*B_kj[1+3*0] |
187 |
+A_ik[0+3*2]*B_kj[2+3*0]; |
188 |
C_ij_10 +=A_ik[1+3*0]*B_kj[0+3*0] |
189 |
+A_ik[1+3*1]*B_kj[1+3*0] |
190 |
+A_ik[1+3*2]*B_kj[2+3*0]; |
191 |
C_ij_20 +=A_ik[2+3*0]*B_kj[0+3*0] |
192 |
+A_ik[2+3*1]*B_kj[1+3*0] |
193 |
+A_ik[2+3*2]*B_kj[2+3*0]; |
194 |
|
195 |
C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*1] |
196 |
+A_ik[0+3*1]*B_kj[1+3*1] |
197 |
+A_ik[0+3*2]*B_kj[2+3*1]; |
198 |
C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*1] |
199 |
+A_ik[1+3*1]*B_kj[1+3*1] |
200 |
+A_ik[1+3*2]*B_kj[2+3*1]; |
201 |
C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*1] |
202 |
+A_ik[2+3*1]*B_kj[1+3*1] |
203 |
+A_ik[2+3*2]*B_kj[2+3*1]; |
204 |
|
205 |
C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*2] |
206 |
+A_ik[0+3*1]*B_kj[1+3*2] |
207 |
+A_ik[0+3*2]*B_kj[2+3*2]; |
208 |
C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*2] |
209 |
+A_ik[1+3*1]*B_kj[1+3*2] |
210 |
+A_ik[1+3*2]*B_kj[2+3*2]; |
211 |
C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*2] |
212 |
+A_ik[2+3*1]*B_kj[1+3*2] |
213 |
+A_ik[2+3*2]*B_kj[2+3*2]; |
214 |
} |
215 |
|
216 |
} |
217 |
C_ij[0+3*0]=C_ij_00; |
218 |
C_ij[1+3*0]=C_ij_10; |
219 |
C_ij[2+3*0]=C_ij_20; |
220 |
C_ij[0+3*1]=C_ij_01; |
221 |
C_ij[1+3*1]=C_ij_11; |
222 |
C_ij[2+3*1]=C_ij_21; |
223 |
C_ij[0+3*2]=C_ij_02; |
224 |
C_ij[1+3*2]=C_ij_12; |
225 |
C_ij[2+3*2]=C_ij_22; |
226 |
} |
227 |
} |
228 |
} /* end of parallel region */ |
229 |
} else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_col_block_size == 4) ){ |
230 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33) |
231 |
{ |
232 |
#pragma omp for schedule(static) |
233 |
for(i = 0; i < n; i++) { |
234 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
235 |
j = C->pattern->index[ij_ptrC]; |
236 |
|
237 |
C_ij_00=0; |
238 |
C_ij_10=0; |
239 |
C_ij_20=0; |
240 |
C_ij_30=0; |
241 |
C_ij_01=0; |
242 |
C_ij_11=0; |
243 |
C_ij_21=0; |
244 |
C_ij_31=0; |
245 |
C_ij_02=0; |
246 |
C_ij_12=0; |
247 |
C_ij_22=0; |
248 |
C_ij_32=0; |
249 |
C_ij_03=0; |
250 |
C_ij_13=0; |
251 |
C_ij_23=0; |
252 |
C_ij_33=0; |
253 |
|
254 |
C_ij=&(C->val[ij_ptrC*16]); |
255 |
|
256 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
257 |
k=A->pattern->index[ik_ptrA]; |
258 |
kj_ptrB=B->pattern->ptr[k]; |
259 |
/* find j in B[k,:] */ |
260 |
start_p=&(B->pattern->index[kj_ptrB]); |
261 |
where_p=(index_t*)bsearch(&j, start_p, |
262 |
B->pattern->ptr[k + 1]-kj_ptrB, |
263 |
sizeof(index_t), |
264 |
Paso_comparIndex); |
265 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
266 |
kj_ptrB += (index_t)(where_p-start_p); |
267 |
A_ik=&(A->val[ik_ptrA*16]); |
268 |
B_kj=&(B->val[kj_ptrB*16]); |
269 |
|
270 |
C_ij_00 +=A_ik[0+4*0]*B_kj[0+4*0] |
271 |
+A_ik[0+4*1]*B_kj[1+4*0] |
272 |
+A_ik[0+4*2]*B_kj[2+4*0] |
273 |
+A_ik[0+4*3]*B_kj[3+4*0]; |
274 |
C_ij_10 +=A_ik[1+4*0]*B_kj[0+4*0] |
275 |
+A_ik[1+4*1]*B_kj[1+4*0] |
276 |
+A_ik[1+4*2]*B_kj[2+4*0] |
277 |
+A_ik[1+4*3]*B_kj[3+4*0]; |
278 |
C_ij_20 +=A_ik[2+4*0]*B_kj[0+4*0] |
279 |
+A_ik[2+4*1]*B_kj[1+4*0] |
280 |
+A_ik[2+4*2]*B_kj[2+4*0] |
281 |
+A_ik[2+4*3]*B_kj[3+4*0]; |
282 |
C_ij_30 +=A_ik[3+4*0]*B_kj[0+4*0] |
283 |
+A_ik[3+4*1]*B_kj[1+4*0] |
284 |
+A_ik[3+4*2]*B_kj[2+4*0] |
285 |
+A_ik[3+4*3]*B_kj[3+4*0]; |
286 |
|
287 |
C_ij_01 +=A_ik[0+4*0]*B_kj[0+4*1] |
288 |
+A_ik[0+4*1]*B_kj[1+4*1] |
289 |
+A_ik[0+4*2]*B_kj[2+4*1] |
290 |
+A_ik[0+4*3]*B_kj[3+4*1]; |
291 |
C_ij_11 +=A_ik[1+4*0]*B_kj[0+4*1] |
292 |
+A_ik[1+4*1]*B_kj[1+4*1] |
293 |
+A_ik[1+4*2]*B_kj[2+4*1] |
294 |
+A_ik[1+4*3]*B_kj[3+4*1]; |
295 |
C_ij_21 +=A_ik[2+4*0]*B_kj[0+4*1] |
296 |
+A_ik[2+4*1]*B_kj[1+4*1] |
297 |
+A_ik[2+4*2]*B_kj[2+4*1] |
298 |
+A_ik[2+4*3]*B_kj[3+4*1]; |
299 |
C_ij_31 +=A_ik[3+4*0]*B_kj[0+4*1] |
300 |
+A_ik[3+4*1]*B_kj[1+4*1] |
301 |
+A_ik[3+4*2]*B_kj[2+4*1] |
302 |
+A_ik[3+4*3]*B_kj[3+4*1]; |
303 |
|
304 |
C_ij_02 +=A_ik[0+4*0]*B_kj[0+4*2] |
305 |
+A_ik[0+4*1]*B_kj[1+4*2] |
306 |
+A_ik[0+4*2]*B_kj[2+4*2] |
307 |
+A_ik[0+4*3]*B_kj[3+4*2]; |
308 |
C_ij_12 +=A_ik[1+4*0]*B_kj[0+4*2] |
309 |
+A_ik[1+4*1]*B_kj[1+4*2] |
310 |
+A_ik[1+4*2]*B_kj[2+4*2] |
311 |
+A_ik[1+4*3]*B_kj[3+4*2]; |
312 |
C_ij_22 +=A_ik[2+4*0]*B_kj[0+4*2] |
313 |
+A_ik[2+4*1]*B_kj[1+4*2] |
314 |
+A_ik[2+4*2]*B_kj[2+4*2] |
315 |
+A_ik[2+4*3]*B_kj[3+4*2]; |
316 |
C_ij_32 +=A_ik[3+4*0]*B_kj[0+4*2] |
317 |
+A_ik[3+4*1]*B_kj[1+4*2] |
318 |
+A_ik[3+4*2]*B_kj[2+4*2] |
319 |
+A_ik[3+4*3]*B_kj[3+4*2]; |
320 |
|
321 |
C_ij_03 +=A_ik[0+4*0]*B_kj[0+4*3] |
322 |
+A_ik[0+4*1]*B_kj[1+4*3] |
323 |
+A_ik[0+4*2]*B_kj[2+4*3] |
324 |
+A_ik[0+4*3]*B_kj[3+4*3]; |
325 |
C_ij_13 +=A_ik[1+4*0]*B_kj[0+4*3] |
326 |
+A_ik[1+4*1]*B_kj[1+4*3] |
327 |
+A_ik[1+4*2]*B_kj[2+4*3] |
328 |
+A_ik[1+4*3]*B_kj[3+4*3]; |
329 |
C_ij_23 +=A_ik[2+4*0]*B_kj[0+4*3] |
330 |
+A_ik[2+4*1]*B_kj[1+4*3] |
331 |
+A_ik[2+4*2]*B_kj[2+4*3] |
332 |
+A_ik[2+4*3]*B_kj[3+4*3]; |
333 |
C_ij_33 +=A_ik[3+4*0]*B_kj[0+4*3] |
334 |
+A_ik[3+4*1]*B_kj[1+4*3] |
335 |
+A_ik[3+4*2]*B_kj[2+4*3] |
336 |
+A_ik[3+4*3]*B_kj[3+4*3]; |
337 |
} |
338 |
} |
339 |
C_ij[0+4*0]=C_ij_00; |
340 |
C_ij[1+4*0]=C_ij_10; |
341 |
C_ij[2+4*0]=C_ij_20; |
342 |
C_ij[3+4*0]=C_ij_30; |
343 |
C_ij[0+4*1]=C_ij_01; |
344 |
C_ij[1+4*1]=C_ij_11; |
345 |
C_ij[2+4*1]=C_ij_21; |
346 |
C_ij[3+4*1]=C_ij_31; |
347 |
C_ij[0+4*2]=C_ij_02; |
348 |
C_ij[1+4*2]=C_ij_12; |
349 |
C_ij[2+4*2]=C_ij_22; |
350 |
C_ij[3+4*2]=C_ij_32; |
351 |
C_ij[0+4*3]=C_ij_03; |
352 |
C_ij[1+4*3]=C_ij_13; |
353 |
C_ij[2+4*3]=C_ij_23; |
354 |
C_ij[3+4*3]=C_ij_33; |
355 |
} |
356 |
} |
357 |
} /* end of parallel region */ |
358 |
|
359 |
} else { |
360 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj ) |
361 |
{ |
362 |
#pragma omp for schedule(static) |
363 |
for(i = 0; i < n; i++) { |
364 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
365 |
j = C->pattern->index[ij_ptrC]; |
366 |
C_ij=&(C->val[ij_ptrC*C_block_size]); |
367 |
for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0; |
368 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
369 |
k=A->pattern->index[ik_ptrA]; |
370 |
kj_ptrB=B->pattern->ptr[k]; |
371 |
/* find j in B[k,:] */ |
372 |
start_p=&(B->pattern->index[kj_ptrB]); |
373 |
where_p=(index_t*)bsearch(&j, start_p, |
374 |
B->pattern->ptr[k + 1]-kj_ptrB, |
375 |
sizeof(index_t), |
376 |
Paso_comparIndex); |
377 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
378 |
kj_ptrB += (index_t)(where_p-start_p); |
379 |
A_ik=&(A->val[ik_ptrA*A_block_size]); |
380 |
B_kj=&(B->val[kj_ptrB*B_block_size]); |
381 |
|
382 |
for (irb=0; irb<row_block_size; ++irb) { |
383 |
for (icb=0; icb<col_block_size; ++icb) { |
384 |
rtmp=C_ij[irb+row_block_size*icb]; |
385 |
for (ib=0; ib<A_col_block_size; ++ib) { |
386 |
rtmp+=A_ik[irb+row_block_size*ib]*B_kj[ib+A_col_block_size*icb]; |
387 |
} |
388 |
C_ij[irb+row_block_size*icb]=rtmp; |
389 |
} |
390 |
} |
391 |
} |
392 |
} |
393 |
} |
394 |
} |
395 |
} /* end of parallel region */ |
396 |
|
397 |
} |
398 |
} |
399 |
|
400 |
/* not good for block size 1 */ |
401 |
void Paso_SparseMatrix_MatrixMatrix_DB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) |
402 |
{ |
403 |
const dim_t n = C->numRows; |
404 |
const dim_t row_block_size = C->row_block_size; |
405 |
const dim_t col_block_size = C->col_block_size; |
406 |
const dim_t A_col_block_size = A->col_block_size; |
407 |
const dim_t C_block_size =C->block_size; |
408 |
const dim_t B_block_size =B->block_size; |
409 |
const dim_t A_block_size =A->block_size; |
410 |
double *C_ij, *A_ik, *B_kj; |
411 |
register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33; |
412 |
dim_t i, ib, irb, icb; |
413 |
index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p; |
414 |
|
415 |
if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_block_size == 2) ) { |
416 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11) |
417 |
{ |
418 |
#pragma omp for schedule(static) |
419 |
for(i = 0; i < n; i++) { |
420 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
421 |
j = C->pattern->index[ij_ptrC]; |
422 |
|
423 |
C_ij_00=0; |
424 |
C_ij_10=0; |
425 |
C_ij_01=0; |
426 |
C_ij_11=0; |
427 |
|
428 |
C_ij=&(C->val[ij_ptrC*4]); |
429 |
|
430 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
431 |
k=A->pattern->index[ik_ptrA]; |
432 |
kj_ptrB=B->pattern->ptr[k]; |
433 |
/* find j in B[k,:] */ |
434 |
start_p=&(B->pattern->index[kj_ptrB]); |
435 |
where_p=(index_t*)bsearch(&j, start_p, |
436 |
B->pattern->ptr[k + 1]-kj_ptrB, |
437 |
sizeof(index_t), |
438 |
Paso_comparIndex); |
439 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
440 |
kj_ptrB += (index_t)(where_p-start_p); |
441 |
A_ik=&(A->val[ik_ptrA*2]); |
442 |
B_kj=&(B->val[kj_ptrB*4]); |
443 |
|
444 |
C_ij_00 +=A_ik[0]*B_kj[0+2*0]; |
445 |
C_ij_10 +=A_ik[1]*B_kj[1+2*0]; |
446 |
|
447 |
C_ij_01 +=A_ik[0]*B_kj[0+2*1]; |
448 |
C_ij_11 +=A_ik[1]*B_kj[1+2*1]; |
449 |
} |
450 |
|
451 |
} |
452 |
C_ij[0+2*0]=C_ij_00; |
453 |
C_ij[1+2*0]=C_ij_10; |
454 |
C_ij[0+2*1]=C_ij_01; |
455 |
C_ij[1+2*1]=C_ij_11; |
456 |
|
457 |
} |
458 |
} |
459 |
} /* end of parallel region */ |
460 |
|
461 |
} else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_block_size == 3) ){ |
462 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22) |
463 |
{ |
464 |
#pragma omp for schedule(static) |
465 |
for(i = 0; i < n; i++) { |
466 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
467 |
j = C->pattern->index[ij_ptrC]; |
468 |
|
469 |
C_ij_00=0; |
470 |
C_ij_10=0; |
471 |
C_ij_20=0; |
472 |
C_ij_01=0; |
473 |
C_ij_11=0; |
474 |
C_ij_21=0; |
475 |
C_ij_02=0; |
476 |
C_ij_12=0; |
477 |
C_ij_22=0; |
478 |
|
479 |
C_ij=&(C->val[ij_ptrC*9]); |
480 |
|
481 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
482 |
k=A->pattern->index[ik_ptrA]; |
483 |
kj_ptrB=B->pattern->ptr[k]; |
484 |
/* find j in B[k,:] */ |
485 |
start_p=&(B->pattern->index[kj_ptrB]); |
486 |
where_p=(index_t*)bsearch(&j, start_p, |
487 |
B->pattern->ptr[k + 1]-kj_ptrB, |
488 |
sizeof(index_t), |
489 |
Paso_comparIndex); |
490 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
491 |
kj_ptrB += (index_t)(where_p-start_p); |
492 |
A_ik=&(A->val[ik_ptrA*3]); |
493 |
B_kj=&(B->val[kj_ptrB*9]); |
494 |
|
495 |
C_ij_00 +=A_ik[0]*B_kj[0+3*0]; |
496 |
C_ij_10 +=A_ik[1]*B_kj[1+3*0]; |
497 |
C_ij_20 +=A_ik[2]*B_kj[2+3*0]; |
498 |
|
499 |
C_ij_01 +=A_ik[0]*B_kj[0+3*1]; |
500 |
C_ij_11 +=A_ik[1]*B_kj[1+3*1]; |
501 |
C_ij_21 +=A_ik[2]*B_kj[2+3*1]; |
502 |
|
503 |
C_ij_02 +=A_ik[0]*B_kj[0+3*2]; |
504 |
C_ij_12 +=A_ik[1]*B_kj[1+3*2]; |
505 |
C_ij_22 +=A_ik[2]*B_kj[2+3*2]; |
506 |
|
507 |
} |
508 |
|
509 |
} |
510 |
C_ij[0+3*0]=C_ij_00; |
511 |
C_ij[1+3*0]=C_ij_10; |
512 |
C_ij[2+3*0]=C_ij_20; |
513 |
C_ij[0+3*1]=C_ij_01; |
514 |
C_ij[1+3*1]=C_ij_11; |
515 |
C_ij[2+3*1]=C_ij_21; |
516 |
C_ij[0+3*2]=C_ij_02; |
517 |
C_ij[1+3*2]=C_ij_12; |
518 |
C_ij[2+3*2]=C_ij_22; |
519 |
} |
520 |
} |
521 |
} /* end of parallel region */ |
522 |
} else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_block_size == 4) ){ |
523 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33) |
524 |
{ |
525 |
#pragma omp for schedule(static) |
526 |
for(i = 0; i < n; i++) { |
527 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
528 |
j = C->pattern->index[ij_ptrC]; |
529 |
|
530 |
C_ij_00=0; |
531 |
C_ij_10=0; |
532 |
C_ij_20=0; |
533 |
C_ij_30=0; |
534 |
C_ij_01=0; |
535 |
C_ij_11=0; |
536 |
C_ij_21=0; |
537 |
C_ij_31=0; |
538 |
C_ij_02=0; |
539 |
C_ij_12=0; |
540 |
C_ij_22=0; |
541 |
C_ij_32=0; |
542 |
C_ij_03=0; |
543 |
C_ij_13=0; |
544 |
C_ij_23=0; |
545 |
C_ij_33=0; |
546 |
|
547 |
C_ij=&(C->val[ij_ptrC*16]); |
548 |
|
549 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
550 |
k=A->pattern->index[ik_ptrA]; |
551 |
kj_ptrB=B->pattern->ptr[k]; |
552 |
/* find j in B[k,:] */ |
553 |
start_p=&(B->pattern->index[kj_ptrB]); |
554 |
where_p=(index_t*)bsearch(&j, start_p, |
555 |
B->pattern->ptr[k + 1]-kj_ptrB, |
556 |
sizeof(index_t), |
557 |
Paso_comparIndex); |
558 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
559 |
kj_ptrB += (index_t)(where_p-start_p); |
560 |
A_ik=&(A->val[ik_ptrA*4]); |
561 |
B_kj=&(B->val[kj_ptrB*16]); |
562 |
|
563 |
C_ij_00 +=A_ik[0]*B_kj[0+4*0]; |
564 |
C_ij_10 +=A_ik[1]*B_kj[1+4*0]; |
565 |
C_ij_20 +=A_ik[2]*B_kj[2+4*0]; |
566 |
C_ij_30 +=A_ik[3]*B_kj[3+4*0]; |
567 |
|
568 |
C_ij_01 +=A_ik[0]*B_kj[0+4*1]; |
569 |
C_ij_11 +=A_ik[1]*B_kj[1+4*1]; |
570 |
C_ij_21 +=A_ik[2]*B_kj[2+4*1]; |
571 |
C_ij_31 +=A_ik[3]*B_kj[3+4*1]; |
572 |
|
573 |
C_ij_02 +=A_ik[0]*B_kj[0+4*2]; |
574 |
C_ij_12 +=A_ik[1]*B_kj[1+4*2]; |
575 |
C_ij_22 +=A_ik[2]*B_kj[2+4*2]; |
576 |
C_ij_32 +=A_ik[3]*B_kj[3+4*2]; |
577 |
|
578 |
C_ij_03 +=A_ik[0]*B_kj[0+4*3]; |
579 |
C_ij_13 +=A_ik[1]*B_kj[1+4*3]; |
580 |
C_ij_23 +=A_ik[2]*B_kj[2+4*3]; |
581 |
C_ij_33 +=A_ik[3]*B_kj[3+4*3]; |
582 |
} |
583 |
} |
584 |
C_ij[0+4*0]=C_ij_00; |
585 |
C_ij[1+4*0]=C_ij_10; |
586 |
C_ij[2+4*0]=C_ij_20; |
587 |
C_ij[3+4*0]=C_ij_30; |
588 |
C_ij[0+4*1]=C_ij_01; |
589 |
C_ij[1+4*1]=C_ij_11; |
590 |
C_ij[2+4*1]=C_ij_21; |
591 |
C_ij[3+4*1]=C_ij_31; |
592 |
C_ij[0+4*2]=C_ij_02; |
593 |
C_ij[1+4*2]=C_ij_12; |
594 |
C_ij[2+4*2]=C_ij_22; |
595 |
C_ij[3+4*2]=C_ij_32; |
596 |
C_ij[0+4*3]=C_ij_03; |
597 |
C_ij[1+4*3]=C_ij_13; |
598 |
C_ij[2+4*3]=C_ij_23; |
599 |
C_ij[3+4*3]=C_ij_33; |
600 |
} |
601 |
} |
602 |
} /* end of parallel region */ |
603 |
|
604 |
} else { |
605 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj ) |
606 |
{ |
607 |
#pragma omp for schedule(static) |
608 |
for(i = 0; i < n; i++) { |
609 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
610 |
j = C->pattern->index[ij_ptrC]; |
611 |
C_ij=&(C->val[ij_ptrC*C_block_size]); |
612 |
for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0; |
613 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
614 |
k=A->pattern->index[ik_ptrA]; |
615 |
kj_ptrB=B->pattern->ptr[k]; |
616 |
/* find j in B[k,:] */ |
617 |
start_p=&(B->pattern->index[kj_ptrB]); |
618 |
where_p=(index_t*)bsearch(&j, start_p, |
619 |
B->pattern->ptr[k + 1]-kj_ptrB, |
620 |
sizeof(index_t), |
621 |
Paso_comparIndex); |
622 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
623 |
kj_ptrB += (index_t)(where_p-start_p); |
624 |
A_ik=&(A->val[ik_ptrA*A_block_size]); |
625 |
B_kj=&(B->val[kj_ptrB*B_block_size]); |
626 |
|
627 |
for (irb=0; irb<A_block_size; ++irb) { |
628 |
rtmp=A_ik[irb]; |
629 |
for (icb=0; icb<col_block_size; ++icb) { |
630 |
C_ij[irb+row_block_size*icb]+=rtmp*B_kj[irb+A_col_block_size*icb]; |
631 |
} |
632 |
} |
633 |
} |
634 |
} |
635 |
} |
636 |
} |
637 |
} /* end of parallel region */ |
638 |
|
639 |
} |
640 |
} |
641 |
|
642 |
/* not good for block size 1 */ |
643 |
void Paso_SparseMatrix_MatrixMatrix_BD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) |
644 |
{ |
645 |
const dim_t n = C->numRows; |
646 |
const dim_t row_block_size = C->row_block_size; |
647 |
const dim_t col_block_size = C->col_block_size; |
648 |
const dim_t C_block_size =C->block_size; |
649 |
const dim_t B_block_size =B->block_size; |
650 |
const dim_t A_block_size =A->block_size; |
651 |
double *C_ij, *A_ik, *B_kj; |
652 |
register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33; |
653 |
dim_t i, ib, irb, icb; |
654 |
index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p; |
655 |
|
656 |
if ( (row_block_size == 2) && (col_block_size ==2 ) && (B_block_size == 2) ) { |
657 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11) |
658 |
{ |
659 |
#pragma omp for schedule(static) |
660 |
for(i = 0; i < n; i++) { |
661 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
662 |
j = C->pattern->index[ij_ptrC]; |
663 |
|
664 |
C_ij_00=0; |
665 |
C_ij_10=0; |
666 |
C_ij_01=0; |
667 |
C_ij_11=0; |
668 |
|
669 |
C_ij=&(C->val[ij_ptrC*4]); |
670 |
|
671 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
672 |
k=A->pattern->index[ik_ptrA]; |
673 |
kj_ptrB=B->pattern->ptr[k]; |
674 |
/* find j in B[k,:] */ |
675 |
start_p=&(B->pattern->index[kj_ptrB]); |
676 |
where_p=(index_t*)bsearch(&j, start_p, |
677 |
B->pattern->ptr[k + 1]-kj_ptrB, |
678 |
sizeof(index_t), |
679 |
Paso_comparIndex); |
680 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
681 |
kj_ptrB += (index_t)(where_p-start_p); |
682 |
A_ik=&(A->val[ik_ptrA*4]); |
683 |
B_kj=&(B->val[kj_ptrB*2]); |
684 |
|
685 |
C_ij_00 +=A_ik[0+2*0]*B_kj[0]; |
686 |
C_ij_10 +=A_ik[1+2*0]*B_kj[0]; |
687 |
|
688 |
C_ij_01 +=A_ik[0+2*1]*B_kj[1]; |
689 |
C_ij_11 +=A_ik[1+2*1]*B_kj[1]; |
690 |
} |
691 |
|
692 |
} |
693 |
C_ij[0+2*0]=C_ij_00; |
694 |
C_ij[1+2*0]=C_ij_10; |
695 |
C_ij[0+2*1]=C_ij_01; |
696 |
C_ij[1+2*1]=C_ij_11; |
697 |
|
698 |
} |
699 |
} |
700 |
} /* end of parallel region */ |
701 |
|
702 |
} else if ( (row_block_size == 3) && (col_block_size ==3 ) && (B_block_size == 3) ){ |
703 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22) |
704 |
{ |
705 |
#pragma omp for schedule(static) |
706 |
for(i = 0; i < n; i++) { |
707 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
708 |
j = C->pattern->index[ij_ptrC]; |
709 |
|
710 |
C_ij_00=0; |
711 |
C_ij_10=0; |
712 |
C_ij_20=0; |
713 |
C_ij_01=0; |
714 |
C_ij_11=0; |
715 |
C_ij_21=0; |
716 |
C_ij_02=0; |
717 |
C_ij_12=0; |
718 |
C_ij_22=0; |
719 |
|
720 |
C_ij=&(C->val[ij_ptrC*9]); |
721 |
|
722 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
723 |
k=A->pattern->index[ik_ptrA]; |
724 |
kj_ptrB=B->pattern->ptr[k]; |
725 |
/* find j in B[k,:] */ |
726 |
start_p=&(B->pattern->index[kj_ptrB]); |
727 |
where_p=(index_t*)bsearch(&j, start_p, |
728 |
B->pattern->ptr[k + 1]-kj_ptrB, |
729 |
sizeof(index_t), |
730 |
Paso_comparIndex); |
731 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
732 |
kj_ptrB += (index_t)(where_p-start_p); |
733 |
A_ik=&(A->val[ik_ptrA*9]); |
734 |
B_kj=&(B->val[kj_ptrB*3]); |
735 |
|
736 |
C_ij_00 +=A_ik[0+3*0]*B_kj[0]; |
737 |
C_ij_10 +=A_ik[1+3*0]*B_kj[0]; |
738 |
C_ij_20 +=A_ik[2+3*0]*B_kj[0]; |
739 |
|
740 |
C_ij_01 +=A_ik[0+3*1]*B_kj[1]; |
741 |
C_ij_11 +=A_ik[1+3*1]*B_kj[1]; |
742 |
C_ij_21 +=A_ik[2+3*1]*B_kj[1]; |
743 |
|
744 |
C_ij_02 +=A_ik[0+3*2]*B_kj[2]; |
745 |
C_ij_12 +=A_ik[1+3*2]*B_kj[2]; |
746 |
C_ij_22 +=A_ik[2+3*2]*B_kj[2]; |
747 |
} |
748 |
|
749 |
} |
750 |
C_ij[0+3*0]=C_ij_00; |
751 |
C_ij[1+3*0]=C_ij_10; |
752 |
C_ij[2+3*0]=C_ij_20; |
753 |
C_ij[0+3*1]=C_ij_01; |
754 |
C_ij[1+3*1]=C_ij_11; |
755 |
C_ij[2+3*1]=C_ij_21; |
756 |
C_ij[0+3*2]=C_ij_02; |
757 |
C_ij[1+3*2]=C_ij_12; |
758 |
C_ij[2+3*2]=C_ij_22; |
759 |
} |
760 |
} |
761 |
} /* end of parallel region */ |
762 |
} else if ( (row_block_size == 4) && (col_block_size ==4 ) && (B_block_size == 4) ){ |
763 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33) |
764 |
{ |
765 |
#pragma omp for schedule(static) |
766 |
for(i = 0; i < n; i++) { |
767 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
768 |
j = C->pattern->index[ij_ptrC]; |
769 |
|
770 |
C_ij_00=0; |
771 |
C_ij_10=0; |
772 |
C_ij_20=0; |
773 |
C_ij_30=0; |
774 |
C_ij_01=0; |
775 |
C_ij_11=0; |
776 |
C_ij_21=0; |
777 |
C_ij_31=0; |
778 |
C_ij_02=0; |
779 |
C_ij_12=0; |
780 |
C_ij_22=0; |
781 |
C_ij_32=0; |
782 |
C_ij_03=0; |
783 |
C_ij_13=0; |
784 |
C_ij_23=0; |
785 |
C_ij_33=0; |
786 |
|
787 |
C_ij=&(C->val[ij_ptrC*16]); |
788 |
|
789 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
790 |
k=A->pattern->index[ik_ptrA]; |
791 |
kj_ptrB=B->pattern->ptr[k]; |
792 |
/* find j in B[k,:] */ |
793 |
start_p=&(B->pattern->index[kj_ptrB]); |
794 |
where_p=(index_t*)bsearch(&j, start_p, |
795 |
B->pattern->ptr[k + 1]-kj_ptrB, |
796 |
sizeof(index_t), |
797 |
Paso_comparIndex); |
798 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
799 |
kj_ptrB += (index_t)(where_p-start_p); |
800 |
A_ik=&(A->val[ik_ptrA*16]); |
801 |
B_kj=&(B->val[kj_ptrB*4]); |
802 |
|
803 |
C_ij_00 +=A_ik[0+4*0]*B_kj[0]; |
804 |
C_ij_10 +=A_ik[1+4*0]*B_kj[0]; |
805 |
C_ij_20 +=A_ik[2+4*0]*B_kj[0]; |
806 |
C_ij_30 +=A_ik[3+4*0]*B_kj[0]; |
807 |
|
808 |
C_ij_01 +=A_ik[0+4*1]*B_kj[1]; |
809 |
C_ij_11 +=A_ik[1+4*1]*B_kj[1]; |
810 |
C_ij_21 +=A_ik[2+4*1]*B_kj[1]; |
811 |
C_ij_31 +=A_ik[3+4*1]*B_kj[1]; |
812 |
|
813 |
C_ij_02 +=A_ik[0+4*2]*B_kj[2]; |
814 |
C_ij_12 +=A_ik[1+4*2]*B_kj[2]; |
815 |
C_ij_22 +=A_ik[2+4*2]*B_kj[2]; |
816 |
C_ij_32 +=A_ik[3+4*2]*B_kj[2]; |
817 |
|
818 |
C_ij_03 +=A_ik[0+4*3]*B_kj[3]; |
819 |
C_ij_13 +=A_ik[1+4*3]*B_kj[3]; |
820 |
C_ij_23 +=A_ik[2+4*3]*B_kj[3]; |
821 |
C_ij_33 +=A_ik[3+4*3]*B_kj[3]; |
822 |
} |
823 |
} |
824 |
C_ij[0+4*0]=C_ij_00; |
825 |
C_ij[1+4*0]=C_ij_10; |
826 |
C_ij[2+4*0]=C_ij_20; |
827 |
C_ij[3+4*0]=C_ij_30; |
828 |
C_ij[0+4*1]=C_ij_01; |
829 |
C_ij[1+4*1]=C_ij_11; |
830 |
C_ij[2+4*1]=C_ij_21; |
831 |
C_ij[3+4*1]=C_ij_31; |
832 |
C_ij[0+4*2]=C_ij_02; |
833 |
C_ij[1+4*2]=C_ij_12; |
834 |
C_ij[2+4*2]=C_ij_22; |
835 |
C_ij[3+4*2]=C_ij_32; |
836 |
C_ij[0+4*3]=C_ij_03; |
837 |
C_ij[1+4*3]=C_ij_13; |
838 |
C_ij[2+4*3]=C_ij_23; |
839 |
C_ij[3+4*3]=C_ij_33; |
840 |
} |
841 |
} |
842 |
} /* end of parallel region */ |
843 |
|
844 |
} else { |
845 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj ) |
846 |
{ |
847 |
#pragma omp for schedule(static) |
848 |
for(i = 0; i < n; i++) { |
849 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
850 |
j = C->pattern->index[ij_ptrC]; |
851 |
C_ij=&(C->val[ij_ptrC*C_block_size]); |
852 |
for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0; |
853 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
854 |
k=A->pattern->index[ik_ptrA]; |
855 |
kj_ptrB=B->pattern->ptr[k]; |
856 |
/* find j in B[k,:] */ |
857 |
start_p=&(B->pattern->index[kj_ptrB]); |
858 |
where_p=(index_t*)bsearch(&j, start_p, |
859 |
B->pattern->ptr[k + 1]-kj_ptrB, |
860 |
sizeof(index_t), |
861 |
Paso_comparIndex); |
862 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
863 |
kj_ptrB += (index_t)(where_p-start_p); |
864 |
A_ik=&(A->val[ik_ptrA*A_block_size]); |
865 |
B_kj=&(B->val[kj_ptrB*B_block_size]); |
866 |
|
867 |
for (icb=0; icb<B_block_size; ++icb) { |
868 |
rtmp=B_kj[icb]; |
869 |
for (irb=0; irb<row_block_size; ++irb) { |
870 |
C_ij[irb+row_block_size*icb]+=A_ik[irb+row_block_size*icb]*rtmp; |
871 |
} |
872 |
} |
873 |
} |
874 |
} |
875 |
} |
876 |
} |
877 |
} /* end of parallel region */ |
878 |
|
879 |
} |
880 |
} |
881 |
|
882 |
/* not good for block size 1 */ |
883 |
void Paso_SparseMatrix_MatrixMatrix_DD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) |
884 |
{ |
885 |
const dim_t n = C->numRows; |
886 |
const dim_t C_block_size =C->block_size; |
887 |
const dim_t B_block_size =B->block_size; |
888 |
const dim_t A_block_size =A->block_size; |
889 |
double *C_ij, *A_ik, *B_kj; |
890 |
register double C_ij_0, C_ij_1, C_ij_2, C_ij_3; |
891 |
dim_t i, ib; |
892 |
index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p; |
893 |
|
894 |
if ( (A_block_size == 1) && (B_block_size ==1) && (C_block_size == 1) ) { |
895 |
#pragma omp parallel private(i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, C_ij_0) |
896 |
{ |
897 |
#pragma omp for schedule(static) |
898 |
for(i = 0; i < n; i++) { |
899 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
900 |
j = C->pattern->index[ij_ptrC]; |
901 |
C_ij_0=0; |
902 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
903 |
k=A->pattern->index[ik_ptrA]; |
904 |
kj_ptrB=B->pattern->ptr[k]; |
905 |
/* find j in B[k,:] */ |
906 |
start_p=&(B->pattern->index[kj_ptrB]); |
907 |
where_p=(index_t*)bsearch(&j, start_p, |
908 |
B->pattern->ptr[k + 1]-kj_ptrB, |
909 |
sizeof(index_t), |
910 |
Paso_comparIndex); |
911 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
912 |
kj_ptrB += (index_t)(where_p-start_p); |
913 |
C_ij_0+=A->val[ik_ptrA]*B->val[kj_ptrB]; |
914 |
} |
915 |
|
916 |
} |
917 |
C->val[ij_ptrC]=C_ij_0; |
918 |
} |
919 |
} |
920 |
} /* end of parallel region */ |
921 |
} else if ( (A_block_size == 2) && (B_block_size ==2) && (C_block_size == 2) ) { |
922 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1) |
923 |
{ |
924 |
#pragma omp for schedule(static) |
925 |
for(i = 0; i < n; i++) { |
926 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
927 |
j = C->pattern->index[ij_ptrC]; |
928 |
C_ij=&(C->val[ij_ptrC*2]); |
929 |
C_ij_0=0; |
930 |
C_ij_1=0; |
931 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
932 |
k=A->pattern->index[ik_ptrA]; |
933 |
kj_ptrB=B->pattern->ptr[k]; |
934 |
/* find j in B[k,:] */ |
935 |
start_p=&(B->pattern->index[kj_ptrB]); |
936 |
where_p=(index_t*)bsearch(&j, start_p, |
937 |
B->pattern->ptr[k + 1]-kj_ptrB, |
938 |
sizeof(index_t), |
939 |
Paso_comparIndex); |
940 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
941 |
kj_ptrB += (index_t)(where_p-start_p); |
942 |
A_ik=&(A->val[ik_ptrA*2]); |
943 |
B_kj=&(B->val[kj_ptrB*2]); |
944 |
|
945 |
C_ij_0+=A_ik[0]*B_kj[0]; |
946 |
C_ij_1+=A_ik[1]*B_kj[1]; |
947 |
} |
948 |
|
949 |
} |
950 |
C_ij[0]=C_ij_0; |
951 |
C_ij[1]=C_ij_1; |
952 |
} |
953 |
} |
954 |
} /* end of parallel region */ |
955 |
} else if ( (A_block_size == 3) && (B_block_size ==3) && (C_block_size == 3) ) { |
956 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2) |
957 |
{ |
958 |
#pragma omp for schedule(static) |
959 |
for(i = 0; i < n; i++) { |
960 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
961 |
j = C->pattern->index[ij_ptrC]; |
962 |
C_ij=&(C->val[ij_ptrC*C_block_size]); |
963 |
C_ij_0=0; |
964 |
C_ij_1=0; |
965 |
C_ij_2=0; |
966 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
967 |
k=A->pattern->index[ik_ptrA]; |
968 |
kj_ptrB=B->pattern->ptr[k]; |
969 |
/* find j in B[k,:] */ |
970 |
start_p=&(B->pattern->index[kj_ptrB]); |
971 |
where_p=(index_t*)bsearch(&j, start_p, |
972 |
B->pattern->ptr[k + 1]-kj_ptrB, |
973 |
sizeof(index_t), |
974 |
Paso_comparIndex); |
975 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
976 |
kj_ptrB += (index_t)(where_p-start_p); |
977 |
A_ik=&(A->val[ik_ptrA*3]); |
978 |
B_kj=&(B->val[kj_ptrB*3]); |
979 |
|
980 |
C_ij_0+=A_ik[0]*B_kj[0]; |
981 |
C_ij_1+=A_ik[1]*B_kj[1]; |
982 |
C_ij_2+=A_ik[2]*B_kj[2]; |
983 |
} |
984 |
|
985 |
} |
986 |
C_ij[0]=C_ij_0; |
987 |
C_ij[1]=C_ij_1; |
988 |
C_ij[2]=C_ij_2; |
989 |
} |
990 |
} |
991 |
} /* end of parallel region */ |
992 |
} else if ( (A_block_size == 4) && (B_block_size ==4 ) && (C_block_size == 4) ) { |
993 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2, C_ij_3 ) |
994 |
{ |
995 |
#pragma omp for schedule(static) |
996 |
for(i = 0; i < n; i++) { |
997 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
998 |
j = C->pattern->index[ij_ptrC]; |
999 |
C_ij=&(C->val[ij_ptrC*C_block_size]); |
1000 |
C_ij_0=0; |
1001 |
C_ij_1=0; |
1002 |
C_ij_2=0; |
1003 |
C_ij_3=0; |
1004 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
1005 |
k=A->pattern->index[ik_ptrA]; |
1006 |
kj_ptrB=B->pattern->ptr[k]; |
1007 |
/* find j in B[k,:] */ |
1008 |
start_p=&(B->pattern->index[kj_ptrB]); |
1009 |
where_p=(index_t*)bsearch(&j, start_p, |
1010 |
B->pattern->ptr[k + 1]-kj_ptrB, |
1011 |
sizeof(index_t), |
1012 |
Paso_comparIndex); |
1013 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
1014 |
kj_ptrB += (index_t)(where_p-start_p); |
1015 |
A_ik=&(A->val[ik_ptrA*4]); |
1016 |
B_kj=&(B->val[kj_ptrB*4]); |
1017 |
|
1018 |
C_ij_0+=A_ik[0]*B_kj[0]; |
1019 |
C_ij_1+=A_ik[1]*B_kj[1]; |
1020 |
C_ij_2+=A_ik[2]*B_kj[2]; |
1021 |
C_ij_3+=A_ik[3]*B_kj[3]; |
1022 |
} |
1023 |
|
1024 |
} |
1025 |
C_ij[0]=C_ij_0; |
1026 |
C_ij[1]=C_ij_1; |
1027 |
C_ij[2]=C_ij_2; |
1028 |
C_ij[3]=C_ij_3; |
1029 |
} |
1030 |
} |
1031 |
} /* end of parallel region */ |
1032 |
} else { |
1033 |
#pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, ib, A_ik,B_kj ) |
1034 |
{ |
1035 |
#pragma omp for schedule(static) |
1036 |
for(i = 0; i < n; i++) { |
1037 |
for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) { |
1038 |
j = C->pattern->index[ij_ptrC]; |
1039 |
C_ij=&(C->val[ij_ptrC*C_block_size]); |
1040 |
for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0; |
1041 |
for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) { |
1042 |
k=A->pattern->index[ik_ptrA]; |
1043 |
kj_ptrB=B->pattern->ptr[k]; |
1044 |
/* find j in B[k,:] */ |
1045 |
start_p=&(B->pattern->index[kj_ptrB]); |
1046 |
where_p=(index_t*)bsearch(&j, start_p, |
1047 |
B->pattern->ptr[k + 1]-kj_ptrB, |
1048 |
sizeof(index_t), |
1049 |
Paso_comparIndex); |
1050 |
if (! (where_p == NULL) ) { /* this should always be the case */ |
1051 |
kj_ptrB += (index_t)(where_p-start_p); |
1052 |
A_ik=&(A->val[ik_ptrA*A_block_size]); |
1053 |
B_kj=&(B->val[kj_ptrB*B_block_size]); |
1054 |
for (ib=0; ib<MIN(A_block_size, B_block_size); ++ib) C_ij[ib]+=A_ik[ib]*B_kj[ib]; |
1055 |
} |
1056 |
} |
1057 |
} |
1058 |
} |
1059 |
} /* end of parallel region */ |
1060 |
} |
1061 |
} |