/[escript]/trunk/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp
ViewVC logotype

Contents of /trunk/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4803 - (show annotations)
Wed Mar 26 06:52:28 2014 UTC (5 years, 10 months ago) by caltinay
File size: 43544 byte(s)
Removed obsolete wrappers for malloc and friends.
Paso_Pattern -> paso::Pattern

1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2003-2014 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development 2012-2013 by School of Earth Sciences
13 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
14 *
15 *****************************************************************************/
16
17
18 /************************************************************************************
19
20 Paso: Sparse matrix product (for efficiency, use the transpose
21 of Matrix B when B^T is available)
22
23 ************************************************************************************
24
25 Author: l.gao@uq.edu.au
26
27 ************************************************************************************/
28
29 #include "SparseMatrix.h"
30 #include "Paso.h"
31
32 namespace paso {
33
34 // forward declarations
35 void SparseMatrix_MatrixMatrixTranspose_DD(SparseMatrix *C,
36 const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T);
37 void SparseMatrix_MatrixMatrixTranspose_DB(SparseMatrix *C,
38 const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T);
39 void SparseMatrix_MatrixMatrixTranspose_BD(SparseMatrix *C,
40 const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T);
41 void SparseMatrix_MatrixMatrixTranspose_BB(SparseMatrix *C,
42 const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T);
43
44 SparseMatrix* SparseMatrix_MatrixMatrixTranspose(const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T)
45 {
46 SparseMatrixType C_type;
47
48 Pattern* outpattern=NULL;
49 SparseMatrix *out=NULL;
50
51 if ( ! ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (A->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & A->type ) ) ) {
52 Esys_setError(TYPE_ERROR,"SparseMatrix_MatrixMatrix: Unsupported matrix format of A.");
53 return NULL;
54 }
55 if ( ! ( (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (B->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & B->type ) ) ) {
56 Esys_setError(TYPE_ERROR,"SparseMatrix_MatrixMatrix: Unsupported matrix format of B.");
57 return NULL;
58 }
59 if (! (A->col_block_size == B->row_block_size) ) {
60 Esys_setError(TYPE_ERROR,"SparseMatrix_MatrixMatrix: Column block size of A and row block size of B must match.");
61 return NULL;
62 }
63 if (! (A->numCols == B->numRows) ) {
64 Esys_setError(TYPE_ERROR,"SparseMatrix_MatrixMatrix: number of columns of A and number of rows of B must match.");
65 return NULL;
66 }
67
68
69 if ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) && (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) ) {
70 C_type=MATRIX_FORMAT_DIAGONAL_BLOCK;
71 } else {
72 C_type=MATRIX_FORMAT_DEFAULT;
73 }
74
75 outpattern=Pattern_multiply(MATRIX_FORMAT_DEFAULT,A->pattern,B->pattern);
76
77 if (Esys_noError()) {
78 out=SparseMatrix_alloc(C_type, outpattern, A->row_block_size, B->col_block_size, FALSE);
79 }
80 Pattern_free(outpattern);
81
82 if (Esys_noError()) {
83 if ( (A->row_block_size == 1) && (B->col_block_size ==1 ) && (A->col_block_size ==1) ) {
84 SparseMatrix_MatrixMatrixTranspose_DD(out, A,B,T);
85 } else {
86 if (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
87 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
88 SparseMatrix_MatrixMatrixTranspose_DD(out, A,B,T);
89 } else {
90 SparseMatrix_MatrixMatrixTranspose_DB(out, A,B,T);
91 }
92 } else {
93 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
94 SparseMatrix_MatrixMatrixTranspose_BD(out, A,B,T);
95 } else {
96 SparseMatrix_MatrixMatrixTranspose_BB(out, A,B,T);
97 }
98 }
99 }
100 return out;
101 } else {
102 SparseMatrix_free(out);
103 return NULL;
104 }
105 }
106
107 /* not good for block size 1 */
108 void SparseMatrix_MatrixMatrixTranspose_BB(SparseMatrix* C, const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T)
109 {
110 const dim_t n = C->numRows;
111 const dim_t row_block_size = C->row_block_size;
112 const dim_t col_block_size = C->col_block_size;
113 const dim_t A_col_block_size = A->col_block_size;
114 const dim_t C_block_size =C->block_size;
115 const dim_t B_block_size =B->block_size;
116 const dim_t A_block_size =A->block_size;
117 double *C_ij, *A_ik, *B_kj;
118 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
119 dim_t i, ib, irb, icb;
120 index_t ij_ptrC, j, ik_ptrA, kj_ptrB, kA, kB, ikb, kjb;
121
122 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_col_block_size == 2) ) {
123 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
124 {
125 #pragma omp for schedule(static)
126 for(i = 0; i < n; i++) {
127 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
128 j = C->pattern->index[ij_ptrC];
129
130 C_ij_00=0;
131 C_ij_10=0;
132 C_ij_01=0;
133 C_ij_11=0;
134
135 C_ij=&(C->val[ij_ptrC*4]);
136
137 ik_ptrA=A->pattern->ptr[i];
138 ikb = A->pattern->ptr[i+1];
139 kj_ptrB=T->pattern->ptr[j];
140 kjb = T->pattern->ptr[j+1];
141 kA=A->pattern->index[ik_ptrA];
142 kB=T->pattern->index[kj_ptrB];
143 while (ik_ptrA < ikb && kj_ptrB < kjb) {
144 if (kA < kB) {
145 ik_ptrA ++;
146 if (ik_ptrA >= ikb) break;
147 kA=A->pattern->index[ik_ptrA];
148 } else if (kA > kB) {
149 kj_ptrB ++;
150 if (kj_ptrB >= kjb) break;
151 kB=T->pattern->index[kj_ptrB];
152 } else {
153 A_ik=&(A->val[ik_ptrA*4]);
154 B_kj=&(T->val[kj_ptrB*4]);
155 C_ij_00 +=A_ik[0+2*0]*B_kj[0+2*0]+A_ik[0+2*1]*B_kj[1+2*0];
156 C_ij_10 +=A_ik[1+2*0]*B_kj[0+2*0]+A_ik[1+2*1]*B_kj[1+2*0];
157 C_ij_01 +=A_ik[0+2*0]*B_kj[0+2*1]+A_ik[0+2*1]*B_kj[1+2*1];
158 C_ij_11 +=A_ik[1+2*0]*B_kj[0+2*1]+A_ik[1+2*1]*B_kj[1+2*1];
159 ik_ptrA ++;
160 kj_ptrB ++;
161 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
162 kA=A->pattern->index[ik_ptrA];
163 kB=T->pattern->index[kj_ptrB];
164 }
165 }
166 C_ij[0+2*0]=C_ij_00;
167 C_ij[1+2*0]=C_ij_10;
168 C_ij[0+2*1]=C_ij_01;
169 C_ij[1+2*1]=C_ij_11;
170
171 }
172 }
173 } /* end of parallel region */
174
175 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_col_block_size == 3) ){
176 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
177 {
178 #pragma omp for schedule(static)
179 for(i = 0; i < n; i++) {
180 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
181 j = C->pattern->index[ij_ptrC];
182
183 C_ij_00=0;
184 C_ij_10=0;
185 C_ij_20=0;
186 C_ij_01=0;
187 C_ij_11=0;
188 C_ij_21=0;
189 C_ij_02=0;
190 C_ij_12=0;
191 C_ij_22=0;
192
193 C_ij=&(C->val[ij_ptrC*9]);
194
195 ik_ptrA=A->pattern->ptr[i];
196 ikb = A->pattern->ptr[i+1];
197 kj_ptrB=T->pattern->ptr[j];
198 kjb = T->pattern->ptr[j+1];
199 kA=A->pattern->index[ik_ptrA];
200 kB=T->pattern->index[kj_ptrB];
201 while (ik_ptrA < ikb && kj_ptrB < kjb) {
202 if (kA < kB) {
203 ik_ptrA ++;
204 if (ik_ptrA >= ikb) break;
205 kA=A->pattern->index[ik_ptrA];
206 } else if (kA > kB) {
207 kj_ptrB ++;
208 if (kj_ptrB >= kjb) break;
209 kB=T->pattern->index[kj_ptrB];
210 } else {
211 A_ik=&(A->val[ik_ptrA*9]);
212 B_kj=&(T->val[kj_ptrB*9]);
213
214 C_ij_00 +=A_ik[0+3*0]*B_kj[0+3*0]
215 +A_ik[0+3*1]*B_kj[1+3*0]
216 +A_ik[0+3*2]*B_kj[2+3*0];
217 C_ij_10 +=A_ik[1+3*0]*B_kj[0+3*0]
218 +A_ik[1+3*1]*B_kj[1+3*0]
219 +A_ik[1+3*2]*B_kj[2+3*0];
220 C_ij_20 +=A_ik[2+3*0]*B_kj[0+3*0]
221 +A_ik[2+3*1]*B_kj[1+3*0]
222 +A_ik[2+3*2]*B_kj[2+3*0];
223
224 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*1]
225 +A_ik[0+3*1]*B_kj[1+3*1]
226 +A_ik[0+3*2]*B_kj[2+3*1];
227 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*1]
228 +A_ik[1+3*1]*B_kj[1+3*1]
229 +A_ik[1+3*2]*B_kj[2+3*1];
230 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*1]
231 +A_ik[2+3*1]*B_kj[1+3*1]
232 +A_ik[2+3*2]*B_kj[2+3*1];
233
234 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*2]
235 +A_ik[0+3*1]*B_kj[1+3*2]
236 +A_ik[0+3*2]*B_kj[2+3*2];
237 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*2]
238 +A_ik[1+3*1]*B_kj[1+3*2]
239 +A_ik[1+3*2]*B_kj[2+3*2];
240 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*2]
241 +A_ik[2+3*1]*B_kj[1+3*2]
242 +A_ik[2+3*2]*B_kj[2+3*2];
243 ik_ptrA ++;
244 kj_ptrB ++;
245 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
246 kA=A->pattern->index[ik_ptrA];
247 kB=T->pattern->index[kj_ptrB];
248 }
249 }
250 C_ij[0+3*0]=C_ij_00;
251 C_ij[1+3*0]=C_ij_10;
252 C_ij[2+3*0]=C_ij_20;
253 C_ij[0+3*1]=C_ij_01;
254 C_ij[1+3*1]=C_ij_11;
255 C_ij[2+3*1]=C_ij_21;
256 C_ij[0+3*2]=C_ij_02;
257 C_ij[1+3*2]=C_ij_12;
258 C_ij[2+3*2]=C_ij_22;
259 }
260 }
261 } /* end of parallel region */
262 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_col_block_size == 4) ){
263 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
264 {
265 #pragma omp for schedule(static)
266 for(i = 0; i < n; i++) {
267 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
268 j = C->pattern->index[ij_ptrC];
269
270 C_ij_00=0;
271 C_ij_10=0;
272 C_ij_20=0;
273 C_ij_30=0;
274 C_ij_01=0;
275 C_ij_11=0;
276 C_ij_21=0;
277 C_ij_31=0;
278 C_ij_02=0;
279 C_ij_12=0;
280 C_ij_22=0;
281 C_ij_32=0;
282 C_ij_03=0;
283 C_ij_13=0;
284 C_ij_23=0;
285 C_ij_33=0;
286
287 C_ij=&(C->val[ij_ptrC*16]);
288
289 ik_ptrA=A->pattern->ptr[i];
290 ikb = A->pattern->ptr[i+1];
291 kj_ptrB=T->pattern->ptr[j];
292 kjb = T->pattern->ptr[j+1];
293 kA=A->pattern->index[ik_ptrA];
294 kB=T->pattern->index[kj_ptrB];
295 while (ik_ptrA < ikb && kj_ptrB < kjb) {
296 if (kA < kB) {
297 ik_ptrA ++;
298 if (ik_ptrA >= ikb) break;
299 kA=A->pattern->index[ik_ptrA];
300 } else if (kA > kB) {
301 kj_ptrB ++;
302 if (kj_ptrB >= kjb) break;
303 kB=T->pattern->index[kj_ptrB];
304 } else {
305 A_ik=&(A->val[ik_ptrA*16]);
306 B_kj=&(T->val[kj_ptrB*16]);
307
308 C_ij_00 +=A_ik[0+4*0]*B_kj[0+4*0]
309 +A_ik[0+4*1]*B_kj[1+4*0]
310 +A_ik[0+4*2]*B_kj[2+4*0]
311 +A_ik[0+4*3]*B_kj[3+4*0];
312 C_ij_10 +=A_ik[1+4*0]*B_kj[0+4*0]
313 +A_ik[1+4*1]*B_kj[1+4*0]
314 +A_ik[1+4*2]*B_kj[2+4*0]
315 +A_ik[1+4*3]*B_kj[3+4*0];
316 C_ij_20 +=A_ik[2+4*0]*B_kj[0+4*0]
317 +A_ik[2+4*1]*B_kj[1+4*0]
318 +A_ik[2+4*2]*B_kj[2+4*0]
319 +A_ik[2+4*3]*B_kj[3+4*0];
320 C_ij_30 +=A_ik[3+4*0]*B_kj[0+4*0]
321 +A_ik[3+4*1]*B_kj[1+4*0]
322 +A_ik[3+4*2]*B_kj[2+4*0]
323 +A_ik[3+4*3]*B_kj[3+4*0];
324
325 C_ij_01 +=A_ik[0+4*0]*B_kj[0+4*1]
326 +A_ik[0+4*1]*B_kj[1+4*1]
327 +A_ik[0+4*2]*B_kj[2+4*1]
328 +A_ik[0+4*3]*B_kj[3+4*1];
329 C_ij_11 +=A_ik[1+4*0]*B_kj[0+4*1]
330 +A_ik[1+4*1]*B_kj[1+4*1]
331 +A_ik[1+4*2]*B_kj[2+4*1]
332 +A_ik[1+4*3]*B_kj[3+4*1];
333 C_ij_21 +=A_ik[2+4*0]*B_kj[0+4*1]
334 +A_ik[2+4*1]*B_kj[1+4*1]
335 +A_ik[2+4*2]*B_kj[2+4*1]
336 +A_ik[2+4*3]*B_kj[3+4*1];
337 C_ij_31 +=A_ik[3+4*0]*B_kj[0+4*1]
338 +A_ik[3+4*1]*B_kj[1+4*1]
339 +A_ik[3+4*2]*B_kj[2+4*1]
340 +A_ik[3+4*3]*B_kj[3+4*1];
341
342 C_ij_02 +=A_ik[0+4*0]*B_kj[0+4*2]
343 +A_ik[0+4*1]*B_kj[1+4*2]
344 +A_ik[0+4*2]*B_kj[2+4*2]
345 +A_ik[0+4*3]*B_kj[3+4*2];
346 C_ij_12 +=A_ik[1+4*0]*B_kj[0+4*2]
347 +A_ik[1+4*1]*B_kj[1+4*2]
348 +A_ik[1+4*2]*B_kj[2+4*2]
349 +A_ik[1+4*3]*B_kj[3+4*2];
350 C_ij_22 +=A_ik[2+4*0]*B_kj[0+4*2]
351 +A_ik[2+4*1]*B_kj[1+4*2]
352 +A_ik[2+4*2]*B_kj[2+4*2]
353 +A_ik[2+4*3]*B_kj[3+4*2];
354 C_ij_32 +=A_ik[3+4*0]*B_kj[0+4*2]
355 +A_ik[3+4*1]*B_kj[1+4*2]
356 +A_ik[3+4*2]*B_kj[2+4*2]
357 +A_ik[3+4*3]*B_kj[3+4*2];
358
359 C_ij_03 +=A_ik[0+4*0]*B_kj[0+4*3]
360 +A_ik[0+4*1]*B_kj[1+4*3]
361 +A_ik[0+4*2]*B_kj[2+4*3]
362 +A_ik[0+4*3]*B_kj[3+4*3];
363 C_ij_13 +=A_ik[1+4*0]*B_kj[0+4*3]
364 +A_ik[1+4*1]*B_kj[1+4*3]
365 +A_ik[1+4*2]*B_kj[2+4*3]
366 +A_ik[1+4*3]*B_kj[3+4*3];
367 C_ij_23 +=A_ik[2+4*0]*B_kj[0+4*3]
368 +A_ik[2+4*1]*B_kj[1+4*3]
369 +A_ik[2+4*2]*B_kj[2+4*3]
370 +A_ik[2+4*3]*B_kj[3+4*3];
371 C_ij_33 +=A_ik[3+4*0]*B_kj[0+4*3]
372 +A_ik[3+4*1]*B_kj[1+4*3]
373 +A_ik[3+4*2]*B_kj[2+4*3]
374 +A_ik[3+4*3]*B_kj[3+4*3];
375 ik_ptrA ++;
376 kj_ptrB ++;
377 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
378 kA=A->pattern->index[ik_ptrA];
379 kB=T->pattern->index[kj_ptrB];
380 }
381 }
382 C_ij[0+4*0]=C_ij_00;
383 C_ij[1+4*0]=C_ij_10;
384 C_ij[2+4*0]=C_ij_20;
385 C_ij[3+4*0]=C_ij_30;
386 C_ij[0+4*1]=C_ij_01;
387 C_ij[1+4*1]=C_ij_11;
388 C_ij[2+4*1]=C_ij_21;
389 C_ij[3+4*1]=C_ij_31;
390 C_ij[0+4*2]=C_ij_02;
391 C_ij[1+4*2]=C_ij_12;
392 C_ij[2+4*2]=C_ij_22;
393 C_ij[3+4*2]=C_ij_32;
394 C_ij[0+4*3]=C_ij_03;
395 C_ij[1+4*3]=C_ij_13;
396 C_ij[2+4*3]=C_ij_23;
397 C_ij[3+4*3]=C_ij_33;
398 }
399 }
400 } /* end of parallel region */
401
402 } else {
403 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, rtmp, A_ik,B_kj )
404 {
405 #pragma omp for schedule(static)
406 for(i = 0; i < n; i++) {
407 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
408 j = C->pattern->index[ij_ptrC];
409 C_ij=&(C->val[ij_ptrC*C_block_size]);
410 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
411 ik_ptrA=A->pattern->ptr[i];
412 ikb = A->pattern->ptr[i+1];
413 kj_ptrB=T->pattern->ptr[j];
414 kjb = T->pattern->ptr[j+1];
415 kA=A->pattern->index[ik_ptrA];
416 kB=T->pattern->index[kj_ptrB];
417 while (ik_ptrA < ikb && kj_ptrB < kjb) {
418 if (kA < kB) {
419 ik_ptrA ++;
420 if (ik_ptrA >= ikb) break;
421 kA=A->pattern->index[ik_ptrA];
422 } else if (kA > kB) {
423 kj_ptrB ++;
424 if (kj_ptrB >= kjb) break;
425 kB=T->pattern->index[kj_ptrB];
426 } else {
427 A_ik=&(A->val[ik_ptrA*A_block_size]);
428 B_kj=&(T->val[kj_ptrB*B_block_size]);
429 for (irb=0; irb<row_block_size; ++irb) {
430 for (icb=0; icb<col_block_size; ++icb) {
431 rtmp=C_ij[irb+row_block_size*icb];
432 for (ib=0; ib<A_col_block_size; ++ib) {
433 rtmp+=A_ik[irb+row_block_size*ib]*B_kj[ib+A_col_block_size*icb];
434 }
435 C_ij[irb+row_block_size*icb]=rtmp;
436 }
437 }
438 ik_ptrA ++;
439 kj_ptrB ++;
440 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
441 kA=A->pattern->index[ik_ptrA];
442 kB=T->pattern->index[kj_ptrB];
443 }
444 }
445 }
446 }
447 } /* end of parallel region */
448
449 }
450 }
451
452 /* not good for block size 1 */
453 void SparseMatrix_MatrixMatrixTranspose_DB(SparseMatrix* C, const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T)
454 {
455 const dim_t n = C->numRows;
456 const dim_t row_block_size = C->row_block_size;
457 const dim_t col_block_size = C->col_block_size;
458 const dim_t A_col_block_size = A->col_block_size;
459 const dim_t C_block_size =C->block_size;
460 const dim_t B_block_size =B->block_size;
461 const dim_t A_block_size =A->block_size;
462 double *C_ij, *A_ik, *B_kj;
463 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
464 dim_t i, ib, irb, icb;
465 index_t ij_ptrC, j, ik_ptrA, kj_ptrB, kA, kB, ikb, kjb;
466
467 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_block_size == 2) ) {
468 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
469 {
470 #pragma omp for schedule(static)
471 for(i = 0; i < n; i++) {
472 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
473 j = C->pattern->index[ij_ptrC];
474
475 C_ij_00=0;
476 C_ij_10=0;
477 C_ij_01=0;
478 C_ij_11=0;
479
480 C_ij=&(C->val[ij_ptrC*4]);
481
482 ik_ptrA=A->pattern->ptr[i];
483 ikb = A->pattern->ptr[i+1];
484 kj_ptrB=T->pattern->ptr[j];
485 kjb = T->pattern->ptr[j+1];
486 kA=A->pattern->index[ik_ptrA];
487 kB=T->pattern->index[kj_ptrB];
488 while (ik_ptrA < ikb && kj_ptrB < kjb) {
489 if (kA < kB) {
490 ik_ptrA ++;
491 if (ik_ptrA >= ikb) break;
492 kA=A->pattern->index[ik_ptrA];
493 } else if (kA > kB) {
494 kj_ptrB ++;
495 if (kj_ptrB >= kjb) break;
496 kB=T->pattern->index[kj_ptrB];
497 } else {
498 A_ik=&(A->val[ik_ptrA*2]);
499 B_kj=&(T->val[kj_ptrB*4]);
500
501 C_ij_00 +=A_ik[0]*B_kj[0+2*0];
502 C_ij_10 +=A_ik[1]*B_kj[1+2*0];
503
504 C_ij_01 +=A_ik[0]*B_kj[0+2*1];
505 C_ij_11 +=A_ik[1]*B_kj[1+2*1];
506 ik_ptrA ++;
507 kj_ptrB ++;
508 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
509 kA=A->pattern->index[ik_ptrA];
510 kB=T->pattern->index[kj_ptrB];
511
512 }
513 }
514 C_ij[0+2*0]=C_ij_00;
515 C_ij[1+2*0]=C_ij_10;
516 C_ij[0+2*1]=C_ij_01;
517 C_ij[1+2*1]=C_ij_11;
518
519 }
520 }
521 } /* end of parallel region */
522
523 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_block_size == 3) ){
524 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
525 {
526 #pragma omp for schedule(static)
527 for(i = 0; i < n; i++) {
528 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
529 j = C->pattern->index[ij_ptrC];
530
531 C_ij_00=0;
532 C_ij_10=0;
533 C_ij_20=0;
534 C_ij_01=0;
535 C_ij_11=0;
536 C_ij_21=0;
537 C_ij_02=0;
538 C_ij_12=0;
539 C_ij_22=0;
540
541 C_ij=&(C->val[ij_ptrC*9]);
542
543 ik_ptrA=A->pattern->ptr[i];
544 ikb = A->pattern->ptr[i+1];
545 kj_ptrB=T->pattern->ptr[j];
546 kjb = T->pattern->ptr[j+1];
547 kA=A->pattern->index[ik_ptrA];
548 kB=T->pattern->index[kj_ptrB];
549 while (ik_ptrA < ikb && kj_ptrB < kjb) {
550 if (kA < kB) {
551 ik_ptrA ++;
552 if (ik_ptrA >= ikb) break;
553 kA=A->pattern->index[ik_ptrA];
554 } else if (kA > kB) {
555 kj_ptrB ++;
556 if (kj_ptrB >= kjb) break;
557 kB=T->pattern->index[kj_ptrB];
558 } else {
559 A_ik=&(A->val[ik_ptrA*3]);
560 B_kj=&(T->val[kj_ptrB*9]);
561
562 C_ij_00 +=A_ik[0]*B_kj[0+3*0];
563 C_ij_10 +=A_ik[1]*B_kj[1+3*0];
564 C_ij_20 +=A_ik[2]*B_kj[2+3*0];
565
566 C_ij_01 +=A_ik[0]*B_kj[0+3*1];
567 C_ij_11 +=A_ik[1]*B_kj[1+3*1];
568 C_ij_21 +=A_ik[2]*B_kj[2+3*1];
569
570 C_ij_02 +=A_ik[0]*B_kj[0+3*2];
571 C_ij_12 +=A_ik[1]*B_kj[1+3*2];
572 C_ij_22 +=A_ik[2]*B_kj[2+3*2];
573 ik_ptrA ++;
574 kj_ptrB ++;
575 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
576 kA=A->pattern->index[ik_ptrA];
577 kB=T->pattern->index[kj_ptrB];
578 }
579 }
580 C_ij[0+3*0]=C_ij_00;
581 C_ij[1+3*0]=C_ij_10;
582 C_ij[2+3*0]=C_ij_20;
583 C_ij[0+3*1]=C_ij_01;
584 C_ij[1+3*1]=C_ij_11;
585 C_ij[2+3*1]=C_ij_21;
586 C_ij[0+3*2]=C_ij_02;
587 C_ij[1+3*2]=C_ij_12;
588 C_ij[2+3*2]=C_ij_22;
589 }
590 }
591 } /* end of parallel region */
592 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_block_size == 4) ){
593 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
594 {
595 #pragma omp for schedule(static)
596 for(i = 0; i < n; i++) {
597 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
598 j = C->pattern->index[ij_ptrC];
599
600 C_ij_00=0;
601 C_ij_10=0;
602 C_ij_20=0;
603 C_ij_30=0;
604 C_ij_01=0;
605 C_ij_11=0;
606 C_ij_21=0;
607 C_ij_31=0;
608 C_ij_02=0;
609 C_ij_12=0;
610 C_ij_22=0;
611 C_ij_32=0;
612 C_ij_03=0;
613 C_ij_13=0;
614 C_ij_23=0;
615 C_ij_33=0;
616
617 C_ij=&(C->val[ij_ptrC*16]);
618
619 ik_ptrA=A->pattern->ptr[i];
620 ikb = A->pattern->ptr[i+1];
621 kj_ptrB=T->pattern->ptr[j];
622 kjb = T->pattern->ptr[j+1];
623 kA=A->pattern->index[ik_ptrA];
624 kB=T->pattern->index[kj_ptrB];
625 while (ik_ptrA < ikb && kj_ptrB < kjb) {
626 if (kA < kB) {
627 ik_ptrA ++;
628 if (ik_ptrA >= ikb) break;
629 kA=A->pattern->index[ik_ptrA];
630 } else if (kA > kB) {
631 kj_ptrB ++;
632 if (kj_ptrB >= kjb) break;
633 kB=T->pattern->index[kj_ptrB];
634 } else {
635 A_ik=&(A->val[ik_ptrA*4]);
636 B_kj=&(T->val[kj_ptrB*16]);
637
638 C_ij_00 +=A_ik[0]*B_kj[0+4*0];
639 C_ij_10 +=A_ik[1]*B_kj[1+4*0];
640 C_ij_20 +=A_ik[2]*B_kj[2+4*0];
641 C_ij_30 +=A_ik[3]*B_kj[3+4*0];
642
643 C_ij_01 +=A_ik[0]*B_kj[0+4*1];
644 C_ij_11 +=A_ik[1]*B_kj[1+4*1];
645 C_ij_21 +=A_ik[2]*B_kj[2+4*1];
646 C_ij_31 +=A_ik[3]*B_kj[3+4*1];
647
648 C_ij_02 +=A_ik[0]*B_kj[0+4*2];
649 C_ij_12 +=A_ik[1]*B_kj[1+4*2];
650 C_ij_22 +=A_ik[2]*B_kj[2+4*2];
651 C_ij_32 +=A_ik[3]*B_kj[3+4*2];
652
653 C_ij_03 +=A_ik[0]*B_kj[0+4*3];
654 C_ij_13 +=A_ik[1]*B_kj[1+4*3];
655 C_ij_23 +=A_ik[2]*B_kj[2+4*3];
656 C_ij_33 +=A_ik[3]*B_kj[3+4*3];
657
658 ik_ptrA ++;
659 kj_ptrB ++;
660 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
661 kA=A->pattern->index[ik_ptrA];
662 kB=T->pattern->index[kj_ptrB];
663 }
664 }
665 C_ij[0+4*0]=C_ij_00;
666 C_ij[1+4*0]=C_ij_10;
667 C_ij[2+4*0]=C_ij_20;
668 C_ij[3+4*0]=C_ij_30;
669 C_ij[0+4*1]=C_ij_01;
670 C_ij[1+4*1]=C_ij_11;
671 C_ij[2+4*1]=C_ij_21;
672 C_ij[3+4*1]=C_ij_31;
673 C_ij[0+4*2]=C_ij_02;
674 C_ij[1+4*2]=C_ij_12;
675 C_ij[2+4*2]=C_ij_22;
676 C_ij[3+4*2]=C_ij_32;
677 C_ij[0+4*3]=C_ij_03;
678 C_ij[1+4*3]=C_ij_13;
679 C_ij[2+4*3]=C_ij_23;
680 C_ij[3+4*3]=C_ij_33;
681 }
682 }
683 } /* end of parallel region */
684
685 } else {
686 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, rtmp, A_ik,B_kj )
687 {
688 #pragma omp for schedule(static)
689 for(i = 0; i < n; i++) {
690 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
691 j = C->pattern->index[ij_ptrC];
692 C_ij=&(C->val[ij_ptrC*C_block_size]);
693 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
694 ik_ptrA=A->pattern->ptr[i];
695 ikb = A->pattern->ptr[i+1];
696 kj_ptrB=T->pattern->ptr[j];
697 kjb = T->pattern->ptr[j+1];
698 kA=A->pattern->index[ik_ptrA];
699 kB=T->pattern->index[kj_ptrB];
700 while (ik_ptrA < ikb && kj_ptrB < kjb) {
701 if (kA < kB) {
702 ik_ptrA ++;
703 if (ik_ptrA >= ikb) break;
704 kA=A->pattern->index[ik_ptrA];
705 } else if (kA > kB) {
706 kj_ptrB ++;
707 if (kj_ptrB >= kjb) break;
708 kB=T->pattern->index[kj_ptrB];
709 } else {
710 A_ik=&(A->val[ik_ptrA*A_block_size]);
711 B_kj=&(T->val[kj_ptrB*B_block_size]);
712
713 for (irb=0; irb<A_block_size; ++irb) {
714 rtmp=A_ik[irb];
715 for (icb=0; icb<col_block_size; ++icb) {
716 C_ij[irb+row_block_size*icb]+=rtmp*B_kj[irb+A_col_block_size*icb];
717 }
718 }
719 ik_ptrA ++;
720 kj_ptrB ++;
721 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
722 kA=A->pattern->index[ik_ptrA];
723 kB=T->pattern->index[kj_ptrB];
724 }
725 }
726 }
727 }
728 } /* end of parallel region */
729
730 }
731 }
732
733 /* not good for block size 1 */
734 void SparseMatrix_MatrixMatrixTranspose_BD(SparseMatrix* C, const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T)
735 {
736 const dim_t n = C->numRows;
737 const dim_t row_block_size = C->row_block_size;
738 const dim_t col_block_size = C->col_block_size;
739 const dim_t C_block_size =C->block_size;
740 const dim_t B_block_size =B->block_size;
741 const dim_t A_block_size =A->block_size;
742 double *C_ij, *A_ik, *B_kj;
743 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
744 dim_t i, ib, irb, icb;
745 index_t ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB;
746
747 if ( (row_block_size == 2) && (col_block_size ==2 ) && (B_block_size == 2) ) {
748 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
749 {
750 #pragma omp for schedule(static)
751 for(i = 0; i < n; i++) {
752 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
753 j = C->pattern->index[ij_ptrC];
754
755 C_ij_00=0;
756 C_ij_10=0;
757 C_ij_01=0;
758 C_ij_11=0;
759
760 C_ij=&(C->val[ij_ptrC*4]);
761
762 ik_ptrA=A->pattern->ptr[i];
763 ikb = A->pattern->ptr[i+1];
764 kj_ptrB=T->pattern->ptr[j];
765 kjb = T->pattern->ptr[j+1];
766 kA=A->pattern->index[ik_ptrA];
767 kB=T->pattern->index[kj_ptrB];
768 while (ik_ptrA < ikb && kj_ptrB < kjb) {
769 if (kA < kB) {
770 ik_ptrA ++;
771 if (ik_ptrA >= ikb) break;
772 kA=A->pattern->index[ik_ptrA];
773 } else if (kA > kB) {
774 kj_ptrB ++;
775 if (kj_ptrB >= kjb) break;
776 kB=T->pattern->index[kj_ptrB];
777 } else {
778 A_ik=&(A->val[ik_ptrA*4]);
779 B_kj=&(T->val[kj_ptrB*2]);
780
781 C_ij_00 +=A_ik[0+2*0]*B_kj[0];
782 C_ij_10 +=A_ik[1+2*0]*B_kj[0];
783
784 C_ij_01 +=A_ik[0+2*1]*B_kj[1];
785 C_ij_11 +=A_ik[1+2*1]*B_kj[1];
786
787 ik_ptrA ++;
788 kj_ptrB ++;
789 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
790 kA=A->pattern->index[ik_ptrA];
791 kB=T->pattern->index[kj_ptrB];
792 }
793 }
794 C_ij[0+2*0]=C_ij_00;
795 C_ij[1+2*0]=C_ij_10;
796 C_ij[0+2*1]=C_ij_01;
797 C_ij[1+2*1]=C_ij_11;
798
799 }
800 }
801 } /* end of parallel region */
802
803 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (B_block_size == 3) ){
804 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
805 {
806 #pragma omp for schedule(static)
807 for(i = 0; i < n; i++) {
808 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
809 j = C->pattern->index[ij_ptrC];
810
811 C_ij_00=0;
812 C_ij_10=0;
813 C_ij_20=0;
814 C_ij_01=0;
815 C_ij_11=0;
816 C_ij_21=0;
817 C_ij_02=0;
818 C_ij_12=0;
819 C_ij_22=0;
820
821 C_ij=&(C->val[ij_ptrC*9]);
822
823 ik_ptrA=A->pattern->ptr[i];
824 ikb = A->pattern->ptr[i+1];
825 kj_ptrB=T->pattern->ptr[j];
826 kjb = T->pattern->ptr[j+1];
827 kA=A->pattern->index[ik_ptrA];
828 kB=T->pattern->index[kj_ptrB];
829 while (ik_ptrA < ikb && kj_ptrB < kjb) {
830 if (kA < kB) {
831 ik_ptrA ++;
832 if (ik_ptrA >= ikb) break;
833 kA=A->pattern->index[ik_ptrA];
834 } else if (kA > kB) {
835 kj_ptrB ++;
836 if (kj_ptrB >= kjb) break;
837 kB=T->pattern->index[kj_ptrB];
838 } else {
839 A_ik=&(A->val[ik_ptrA*9]);
840 B_kj=&(T->val[kj_ptrB*3]);
841
842 C_ij_00 +=A_ik[0+3*0]*B_kj[0];
843 C_ij_10 +=A_ik[1+3*0]*B_kj[0];
844 C_ij_20 +=A_ik[2+3*0]*B_kj[0];
845
846 C_ij_01 +=A_ik[0+3*1]*B_kj[1];
847 C_ij_11 +=A_ik[1+3*1]*B_kj[1];
848 C_ij_21 +=A_ik[2+3*1]*B_kj[1];
849
850 C_ij_02 +=A_ik[0+3*2]*B_kj[2];
851 C_ij_12 +=A_ik[1+3*2]*B_kj[2];
852 C_ij_22 +=A_ik[2+3*2]*B_kj[2];
853
854 ik_ptrA ++;
855 kj_ptrB ++;
856 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
857 kA=A->pattern->index[ik_ptrA];
858 kB=T->pattern->index[kj_ptrB];
859 }
860 }
861 C_ij[0+3*0]=C_ij_00;
862 C_ij[1+3*0]=C_ij_10;
863 C_ij[2+3*0]=C_ij_20;
864 C_ij[0+3*1]=C_ij_01;
865 C_ij[1+3*1]=C_ij_11;
866 C_ij[2+3*1]=C_ij_21;
867 C_ij[0+3*2]=C_ij_02;
868 C_ij[1+3*2]=C_ij_12;
869 C_ij[2+3*2]=C_ij_22;
870 }
871 }
872 } /* end of parallel region */
873 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (B_block_size == 4) ){
874 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
875 {
876 #pragma omp for schedule(static)
877 for(i = 0; i < n; i++) {
878 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
879 j = C->pattern->index[ij_ptrC];
880
881 C_ij_00=0;
882 C_ij_10=0;
883 C_ij_20=0;
884 C_ij_30=0;
885 C_ij_01=0;
886 C_ij_11=0;
887 C_ij_21=0;
888 C_ij_31=0;
889 C_ij_02=0;
890 C_ij_12=0;
891 C_ij_22=0;
892 C_ij_32=0;
893 C_ij_03=0;
894 C_ij_13=0;
895 C_ij_23=0;
896 C_ij_33=0;
897
898 C_ij=&(C->val[ij_ptrC*16]);
899
900 ik_ptrA=A->pattern->ptr[i];
901 ikb = A->pattern->ptr[i+1];
902 kj_ptrB=T->pattern->ptr[j];
903 kjb = T->pattern->ptr[j+1];
904 kA=A->pattern->index[ik_ptrA];
905 kB=T->pattern->index[kj_ptrB];
906 while (ik_ptrA < ikb && kj_ptrB < kjb) {
907 if (kA < kB) {
908 ik_ptrA ++;
909 if (ik_ptrA >= ikb) break;
910 kA=A->pattern->index[ik_ptrA];
911 } else if (kA > kB) {
912 kj_ptrB ++;
913 if (kj_ptrB >= kjb) break;
914 kB=T->pattern->index[kj_ptrB];
915 } else {
916 A_ik=&(A->val[ik_ptrA*16]);
917 B_kj=&(T->val[kj_ptrB*4]);
918
919 C_ij_00 +=A_ik[0+4*0]*B_kj[0];
920 C_ij_10 +=A_ik[1+4*0]*B_kj[0];
921 C_ij_20 +=A_ik[2+4*0]*B_kj[0];
922 C_ij_30 +=A_ik[3+4*0]*B_kj[0];
923
924 C_ij_01 +=A_ik[0+4*1]*B_kj[1];
925 C_ij_11 +=A_ik[1+4*1]*B_kj[1];
926 C_ij_21 +=A_ik[2+4*1]*B_kj[1];
927 C_ij_31 +=A_ik[3+4*1]*B_kj[1];
928
929 C_ij_02 +=A_ik[0+4*2]*B_kj[2];
930 C_ij_12 +=A_ik[1+4*2]*B_kj[2];
931 C_ij_22 +=A_ik[2+4*2]*B_kj[2];
932 C_ij_32 +=A_ik[3+4*2]*B_kj[2];
933
934 C_ij_03 +=A_ik[0+4*3]*B_kj[3];
935 C_ij_13 +=A_ik[1+4*3]*B_kj[3];
936 C_ij_23 +=A_ik[2+4*3]*B_kj[3];
937 C_ij_33 +=A_ik[3+4*3]*B_kj[3];
938
939 ik_ptrA ++;
940 kj_ptrB ++;
941 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
942 kA=A->pattern->index[ik_ptrA];
943 kB=T->pattern->index[kj_ptrB];
944 }
945 }
946 C_ij[0+4*0]=C_ij_00;
947 C_ij[1+4*0]=C_ij_10;
948 C_ij[2+4*0]=C_ij_20;
949 C_ij[3+4*0]=C_ij_30;
950 C_ij[0+4*1]=C_ij_01;
951 C_ij[1+4*1]=C_ij_11;
952 C_ij[2+4*1]=C_ij_21;
953 C_ij[3+4*1]=C_ij_31;
954 C_ij[0+4*2]=C_ij_02;
955 C_ij[1+4*2]=C_ij_12;
956 C_ij[2+4*2]=C_ij_22;
957 C_ij[3+4*2]=C_ij_32;
958 C_ij[0+4*3]=C_ij_03;
959 C_ij[1+4*3]=C_ij_13;
960 C_ij[2+4*3]=C_ij_23;
961 C_ij[3+4*3]=C_ij_33;
962 }
963 }
964 } /* end of parallel region */
965
966 } else {
967 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, rtmp, A_ik,B_kj )
968 {
969 #pragma omp for schedule(static)
970 for(i = 0; i < n; i++) {
971 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
972 j = C->pattern->index[ij_ptrC];
973 C_ij=&(C->val[ij_ptrC*C_block_size]);
974 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
975 ik_ptrA=A->pattern->ptr[i];
976 ikb = A->pattern->ptr[i+1];
977 kj_ptrB=T->pattern->ptr[j];
978 kjb = T->pattern->ptr[j+1];
979 kA=A->pattern->index[ik_ptrA];
980 kB=T->pattern->index[kj_ptrB];
981 while (ik_ptrA < ikb && kj_ptrB < kjb) {
982 if (kA < kB) {
983 ik_ptrA ++;
984 if (ik_ptrA >= ikb) break;
985 kA=A->pattern->index[ik_ptrA];
986 } else if (kA > kB) {
987 kj_ptrB ++;
988 if (kj_ptrB >= kjb) break;
989 kB=T->pattern->index[kj_ptrB];
990 } else {
991 A_ik=&(A->val[ik_ptrA*A_block_size]);
992 B_kj=&(T->val[kj_ptrB*B_block_size]);
993
994 for (icb=0; icb<B_block_size; ++icb) {
995 rtmp=B_kj[icb];
996 for (irb=0; irb<row_block_size; ++irb) {
997 C_ij[irb+row_block_size*icb]+=A_ik[irb+row_block_size*icb]*rtmp;
998 }
999 }
1000
1001 ik_ptrA ++;
1002 kj_ptrB ++;
1003 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1004 kA=A->pattern->index[ik_ptrA];
1005 kB=T->pattern->index[kj_ptrB];
1006 }
1007 }
1008 }
1009 }
1010 } /* end of parallel region */
1011
1012 }
1013 }
1014
1015 /* not good for block size 1 */
1016 void SparseMatrix_MatrixMatrixTranspose_DD(SparseMatrix* C, const SparseMatrix* A, const SparseMatrix* B, const SparseMatrix* T)
1017 {
1018 const dim_t n = C->numRows;
1019 const dim_t C_block_size =C->block_size;
1020 const dim_t B_block_size =B->block_size;
1021 const dim_t A_block_size =A->block_size;
1022 double *C_ij, *A_ik, *B_kj;
1023 register double C_ij_0, C_ij_1, C_ij_2, C_ij_3;
1024 dim_t i, ib;
1025 index_t ij_ptrC, j, ik_ptrA, kA, kB, kj_ptrB, ikb, kjb;
1026
1027 if ( (A_block_size == 1) && (B_block_size ==1) && (C_block_size == 1) ) {
1028 #pragma omp parallel private(i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, C_ij_0)
1029 {
1030 #pragma omp for schedule(static)
1031 for(i = 0; i < n; i++) {
1032 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1033 j = C->pattern->index[ij_ptrC];
1034 C_ij_0=0;
1035 ik_ptrA=A->pattern->ptr[i];
1036 ikb = A->pattern->ptr[i+1];
1037 kj_ptrB=T->pattern->ptr[j];
1038 kjb = T->pattern->ptr[j+1];
1039 kA=A->pattern->index[ik_ptrA];
1040 kB=T->pattern->index[kj_ptrB];
1041 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1042 if (kA < kB) {
1043 ik_ptrA ++;
1044 if (ik_ptrA >= ikb) break;
1045 kA=A->pattern->index[ik_ptrA];
1046 } else if (kA > kB) {
1047 kj_ptrB ++;
1048 if (kj_ptrB >= kjb) break;
1049 kB=T->pattern->index[kj_ptrB];
1050 } else {
1051 C_ij_0+=A->val[ik_ptrA]*T->val[kj_ptrB];
1052 ik_ptrA ++;
1053 kj_ptrB ++;
1054 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1055 kA=A->pattern->index[ik_ptrA];
1056 kB=T->pattern->index[kj_ptrB];
1057 }
1058 }
1059 C->val[ij_ptrC]=C_ij_0;
1060 }
1061 }
1062 } /* end of parallel region */
1063 } else if ( (A_block_size == 2) && (B_block_size ==2) && (C_block_size == 2) ) {
1064 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, A_ik,B_kj, C_ij_0, C_ij_1)
1065 {
1066 #pragma omp for schedule(static)
1067 for(i = 0; i < n; i++) {
1068 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1069 j = C->pattern->index[ij_ptrC];
1070 C_ij=&(C->val[ij_ptrC*2]);
1071 C_ij_0=0;
1072 C_ij_1=0;
1073 ik_ptrA=A->pattern->ptr[i];
1074 ikb = A->pattern->ptr[i+1];
1075 kj_ptrB=T->pattern->ptr[j];
1076 kjb = T->pattern->ptr[j+1];
1077 kA=A->pattern->index[ik_ptrA];
1078 kB=T->pattern->index[kj_ptrB];
1079 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1080 if (kA < kB) {
1081 ik_ptrA ++;
1082 if (ik_ptrA >= ikb) break;
1083 kA=A->pattern->index[ik_ptrA];
1084 } else if (kA > kB) {
1085 kj_ptrB ++;
1086 if (kj_ptrB >= kjb) break;
1087 kB=T->pattern->index[kj_ptrB];
1088 } else {
1089 A_ik=&(A->val[ik_ptrA*2]);
1090 B_kj=&(T->val[kj_ptrB*2]);
1091 C_ij_0+=A_ik[0]*B_kj[0];
1092 C_ij_1+=A_ik[1]*B_kj[1];
1093 ik_ptrA ++;
1094 kj_ptrB ++;
1095 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1096 kA=A->pattern->index[ik_ptrA];
1097 kB=T->pattern->index[kj_ptrB];
1098 }
1099 }
1100 C_ij[0]=C_ij_0;
1101 C_ij[1]=C_ij_1;
1102 }
1103 }
1104 } /* end of parallel region */
1105 } else if ( (A_block_size == 3) && (B_block_size ==3) && (C_block_size == 3) ) {
1106 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2)
1107 {
1108 #pragma omp for schedule(static)
1109 for(i = 0; i < n; i++) {
1110 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1111 j = C->pattern->index[ij_ptrC];
1112 C_ij=&(C->val[ij_ptrC*C_block_size]);
1113 C_ij_0=0;
1114 C_ij_1=0;
1115 C_ij_2=0;
1116 ik_ptrA=A->pattern->ptr[i];
1117 ikb = A->pattern->ptr[i+1];
1118 kj_ptrB=T->pattern->ptr[j];
1119 kjb = T->pattern->ptr[j+1];
1120 kA=A->pattern->index[ik_ptrA];
1121 kB=T->pattern->index[kj_ptrB];
1122 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1123 if (kA < kB) {
1124 ik_ptrA ++;
1125 if (ik_ptrA >= ikb) break;
1126 kA=A->pattern->index[ik_ptrA];
1127 } else if (kA > kB) {
1128 kj_ptrB ++;
1129 if (kj_ptrB >= kjb) break;
1130 kB=T->pattern->index[kj_ptrB];
1131 } else {
1132 A_ik=&(A->val[ik_ptrA*3]);
1133 B_kj=&(T->val[kj_ptrB*3]);
1134 C_ij_0+=A_ik[0]*B_kj[0];
1135 C_ij_1+=A_ik[1]*B_kj[1];
1136 C_ij_2+=A_ik[2]*B_kj[2];
1137 ik_ptrA ++;
1138 kj_ptrB ++;
1139 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1140 kA=A->pattern->index[ik_ptrA];
1141 kB=T->pattern->index[kj_ptrB];
1142 }
1143 }
1144 C_ij[0]=C_ij_0;
1145 C_ij[1]=C_ij_1;
1146 C_ij[2]=C_ij_2;
1147 }
1148 }
1149 } /* end of parallel region */
1150 } else if ( (A_block_size == 4) && (B_block_size ==4 ) && (C_block_size == 4) ) {
1151 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2, C_ij_3 )
1152 {
1153 #pragma omp for schedule(static)
1154 for(i = 0; i < n; i++) {
1155 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1156 j = C->pattern->index[ij_ptrC];
1157 C_ij=&(C->val[ij_ptrC*C_block_size]);
1158 C_ij_0=0;
1159 C_ij_1=0;
1160 C_ij_2=0;
1161 C_ij_3=0;
1162 ik_ptrA=A->pattern->ptr[i];
1163 ikb = A->pattern->ptr[i+1];
1164 kj_ptrB=T->pattern->ptr[j];
1165 kjb = T->pattern->ptr[j+1];
1166 kA=A->pattern->index[ik_ptrA];
1167 kB=T->pattern->index[kj_ptrB];
1168 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1169 if (kA < kB) {
1170 ik_ptrA ++;
1171 if (ik_ptrA >= ikb) break;
1172 kA=A->pattern->index[ik_ptrA];
1173 } else if (kA > kB) {
1174 kj_ptrB ++;
1175 if (kj_ptrB >= kjb) break;
1176 kB=T->pattern->index[kj_ptrB];
1177 } else {
1178 A_ik=&(A->val[ik_ptrA*4]);
1179 B_kj=&(T->val[kj_ptrB*4]);
1180 C_ij_0+=A_ik[0]*B_kj[0];
1181 C_ij_1+=A_ik[1]*B_kj[1];
1182 C_ij_2+=A_ik[2]*B_kj[2];
1183 C_ij_3+=A_ik[3]*B_kj[3];
1184 ik_ptrA ++;
1185 kj_ptrB ++;
1186 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1187 kA=A->pattern->index[ik_ptrA];
1188 kB=T->pattern->index[kj_ptrB];
1189 }
1190 }
1191 C_ij[0]=C_ij_0;
1192 C_ij[1]=C_ij_1;
1193 C_ij[2]=C_ij_2;
1194 C_ij[3]=C_ij_3;
1195 }
1196 }
1197 } /* end of parallel region */
1198 } else {
1199 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, ib, A_ik,B_kj )
1200 {
1201 #pragma omp for schedule(static)
1202 for(i = 0; i < n; i++) {
1203 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1204 j = C->pattern->index[ij_ptrC];
1205 C_ij=&(C->val[ij_ptrC*C_block_size]);
1206 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
1207 ik_ptrA=A->pattern->ptr[i];
1208 ikb = A->pattern->ptr[i+1];
1209 kj_ptrB=T->pattern->ptr[j];
1210 kjb = T->pattern->ptr[j+1];
1211 kA=A->pattern->index[ik_ptrA];
1212 kB=T->pattern->index[kj_ptrB];
1213 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1214 if (kA < kB) {
1215 ik_ptrA ++;
1216 if (ik_ptrA >= ikb) break;
1217 kA=A->pattern->index[ik_ptrA];
1218 } else if (kA > kB) {
1219 kj_ptrB ++;
1220 if (kj_ptrB >= kjb) break;
1221 kB=T->pattern->index[kj_ptrB];
1222 } else {
1223 A_ik=&(A->val[ik_ptrA*A_block_size]);
1224 B_kj=&(T->val[kj_ptrB*B_block_size]);
1225 for (ib=0; ib<MIN(A_block_size, B_block_size); ++ib) C_ij[ib]+=A_ik[ib]*B_kj[ib];
1226 ik_ptrA ++;
1227 kj_ptrB ++;
1228 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1229 kA=A->pattern->index[ik_ptrA];
1230 kB=T->pattern->index[kj_ptrB];
1231 }
1232 }
1233 }
1234 }
1235 } /* end of parallel region */
1236 }
1237 }
1238
1239 } // namespace paso
1240

Properties

Name Value
svn:mergeinfo /branches/amg_from_3530/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:3531-3826 /branches/lapack2681/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:2682-2741 /branches/pasowrap/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:3661-3674 /branches/py3_attempt2/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:3871-3891 /branches/restext/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:2610-2624 /branches/ripleygmg_from_3668/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:3669-3791 /branches/stage3.0/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:2569-2590 /branches/symbolic_from_3470/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:3471-3974 /branches/symbolic_from_3470/ripley/test/python/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:3517-3974 /release/3.0/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:2591-2601 /trunk/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:4257-4344 /trunk/ripley/test/python/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp:3480-3515

  ViewVC Help
Powered by ViewVC 1.1.26