/[escript]/branches/doubleplusgood/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp
ViewVC logotype

Contents of /branches/doubleplusgood/paso/src/SparseMatrix_MatrixMatrixTranspose.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4261 - (show annotations)
Wed Feb 27 06:09:33 2013 UTC (6 years, 1 month ago) by jfenwick
File size: 43063 byte(s)
Initial all c++ build.
But ... there are now reinterpret_cast<>'s
1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2003-2013 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development since 2012 by School of Earth Sciences
13 *
14 *****************************************************************************/
15
16
17 /************************************************************************************
18
19 Paso: Sparse matrix product (for efficiency, use the transpose
20 of Matrix B when B^T is available)
21
22 ************************************************************************************
23
24 Author: l.gao@uq.edu.au
25
26 ************************************************************************************/
27
28 #include "SparseMatrix.h"
29 #include "Paso.h"
30
31 Paso_SparseMatrix* Paso_SparseMatrix_MatrixMatrixTranspose(const Paso_SparseMatrix* A, const Paso_SparseMatrix* B, const Paso_SparseMatrix* T) {
32 Paso_SparseMatrixType C_type;
33
34 Paso_Pattern* outpattern=NULL;
35 Paso_SparseMatrix *out=NULL;
36
37 if ( ! ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (A->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & A->type ) ) ) {
38 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of A.");
39 return NULL;
40 }
41 if ( ! ( (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (B->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & B->type ) ) ) {
42 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of B.");
43 return NULL;
44 }
45 if (! (A->col_block_size == B->row_block_size) ) {
46 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Column block size of A and row block size of B must match.");
47 return NULL;
48 }
49 if (! (A->numCols == B->numRows) ) {
50 Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: number of columns of A and number of rows of B must match.");
51 return NULL;
52 }
53
54
55 if ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) && (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) ) {
56 C_type=MATRIX_FORMAT_DIAGONAL_BLOCK;
57 } else {
58 C_type=MATRIX_FORMAT_DEFAULT;
59 }
60
61 outpattern=Paso_Pattern_multiply(MATRIX_FORMAT_DEFAULT,A->pattern,B->pattern);
62
63 if (Esys_noError()) {
64 out=Paso_SparseMatrix_alloc(C_type, outpattern, A->row_block_size, B->col_block_size, FALSE);
65 }
66 Paso_Pattern_free(outpattern);
67
68 if (Esys_noError()) {
69 if ( (A->row_block_size == 1) && (B->col_block_size ==1 ) && (A->col_block_size ==1) ) {
70 Paso_SparseMatrix_MatrixMatrixTranspose_DD(out, A,B,T);
71 } else {
72 if (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
73 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
74 Paso_SparseMatrix_MatrixMatrixTranspose_DD(out, A,B,T);
75 } else {
76 Paso_SparseMatrix_MatrixMatrixTranspose_DB(out, A,B,T);
77 }
78 } else {
79 if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
80 Paso_SparseMatrix_MatrixMatrixTranspose_BD(out, A,B,T);
81 } else {
82 Paso_SparseMatrix_MatrixMatrixTranspose_BB(out, A,B,T);
83 }
84 }
85 }
86 return out;
87 } else {
88 Paso_SparseMatrix_free(out);
89 return NULL;
90 }
91 }
92 /* not good for block size 1 */
93 void Paso_SparseMatrix_MatrixMatrixTranspose_BB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B, const Paso_SparseMatrix* T)
94 {
95 const dim_t n = C->numRows;
96 const dim_t row_block_size = C->row_block_size;
97 const dim_t col_block_size = C->col_block_size;
98 const dim_t A_col_block_size = A->col_block_size;
99 const dim_t C_block_size =C->block_size;
100 const dim_t B_block_size =B->block_size;
101 const dim_t A_block_size =A->block_size;
102 double *C_ij, *A_ik, *B_kj;
103 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
104 dim_t i, ib, irb, icb;
105 index_t ij_ptrC, j, ik_ptrA, kj_ptrB, kA, kB, ikb, kjb;
106
107 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_col_block_size == 2) ) {
108 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
109 {
110 #pragma omp for schedule(static)
111 for(i = 0; i < n; i++) {
112 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
113 j = C->pattern->index[ij_ptrC];
114
115 C_ij_00=0;
116 C_ij_10=0;
117 C_ij_01=0;
118 C_ij_11=0;
119
120 C_ij=&(C->val[ij_ptrC*4]);
121
122 ik_ptrA=A->pattern->ptr[i];
123 ikb = A->pattern->ptr[i+1];
124 kj_ptrB=T->pattern->ptr[j];
125 kjb = T->pattern->ptr[j+1];
126 kA=A->pattern->index[ik_ptrA];
127 kB=T->pattern->index[kj_ptrB];
128 while (ik_ptrA < ikb && kj_ptrB < kjb) {
129 if (kA < kB) {
130 ik_ptrA ++;
131 if (ik_ptrA >= ikb) break;
132 kA=A->pattern->index[ik_ptrA];
133 } else if (kA > kB) {
134 kj_ptrB ++;
135 if (kj_ptrB >= kjb) break;
136 kB=T->pattern->index[kj_ptrB];
137 } else {
138 A_ik=&(A->val[ik_ptrA*4]);
139 B_kj=&(T->val[kj_ptrB*4]);
140 C_ij_00 +=A_ik[0+2*0]*B_kj[0+2*0]+A_ik[0+2*1]*B_kj[1+2*0];
141 C_ij_10 +=A_ik[1+2*0]*B_kj[0+2*0]+A_ik[1+2*1]*B_kj[1+2*0];
142 C_ij_01 +=A_ik[0+2*0]*B_kj[0+2*1]+A_ik[0+2*1]*B_kj[1+2*1];
143 C_ij_11 +=A_ik[1+2*0]*B_kj[0+2*1]+A_ik[1+2*1]*B_kj[1+2*1];
144 ik_ptrA ++;
145 kj_ptrB ++;
146 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
147 kA=A->pattern->index[ik_ptrA];
148 kB=T->pattern->index[kj_ptrB];
149 }
150 }
151 C_ij[0+2*0]=C_ij_00;
152 C_ij[1+2*0]=C_ij_10;
153 C_ij[0+2*1]=C_ij_01;
154 C_ij[1+2*1]=C_ij_11;
155
156 }
157 }
158 } /* end of parallel region */
159
160 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_col_block_size == 3) ){
161 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
162 {
163 #pragma omp for schedule(static)
164 for(i = 0; i < n; i++) {
165 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
166 j = C->pattern->index[ij_ptrC];
167
168 C_ij_00=0;
169 C_ij_10=0;
170 C_ij_20=0;
171 C_ij_01=0;
172 C_ij_11=0;
173 C_ij_21=0;
174 C_ij_02=0;
175 C_ij_12=0;
176 C_ij_22=0;
177
178 C_ij=&(C->val[ij_ptrC*9]);
179
180 ik_ptrA=A->pattern->ptr[i];
181 ikb = A->pattern->ptr[i+1];
182 kj_ptrB=T->pattern->ptr[j];
183 kjb = T->pattern->ptr[j+1];
184 kA=A->pattern->index[ik_ptrA];
185 kB=T->pattern->index[kj_ptrB];
186 while (ik_ptrA < ikb && kj_ptrB < kjb) {
187 if (kA < kB) {
188 ik_ptrA ++;
189 if (ik_ptrA >= ikb) break;
190 kA=A->pattern->index[ik_ptrA];
191 } else if (kA > kB) {
192 kj_ptrB ++;
193 if (kj_ptrB >= kjb) break;
194 kB=T->pattern->index[kj_ptrB];
195 } else {
196 A_ik=&(A->val[ik_ptrA*9]);
197 B_kj=&(T->val[kj_ptrB*9]);
198
199 C_ij_00 +=A_ik[0+3*0]*B_kj[0+3*0]
200 +A_ik[0+3*1]*B_kj[1+3*0]
201 +A_ik[0+3*2]*B_kj[2+3*0];
202 C_ij_10 +=A_ik[1+3*0]*B_kj[0+3*0]
203 +A_ik[1+3*1]*B_kj[1+3*0]
204 +A_ik[1+3*2]*B_kj[2+3*0];
205 C_ij_20 +=A_ik[2+3*0]*B_kj[0+3*0]
206 +A_ik[2+3*1]*B_kj[1+3*0]
207 +A_ik[2+3*2]*B_kj[2+3*0];
208
209 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*1]
210 +A_ik[0+3*1]*B_kj[1+3*1]
211 +A_ik[0+3*2]*B_kj[2+3*1];
212 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*1]
213 +A_ik[1+3*1]*B_kj[1+3*1]
214 +A_ik[1+3*2]*B_kj[2+3*1];
215 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*1]
216 +A_ik[2+3*1]*B_kj[1+3*1]
217 +A_ik[2+3*2]*B_kj[2+3*1];
218
219 C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*2]
220 +A_ik[0+3*1]*B_kj[1+3*2]
221 +A_ik[0+3*2]*B_kj[2+3*2];
222 C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*2]
223 +A_ik[1+3*1]*B_kj[1+3*2]
224 +A_ik[1+3*2]*B_kj[2+3*2];
225 C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*2]
226 +A_ik[2+3*1]*B_kj[1+3*2]
227 +A_ik[2+3*2]*B_kj[2+3*2];
228 ik_ptrA ++;
229 kj_ptrB ++;
230 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
231 kA=A->pattern->index[ik_ptrA];
232 kB=T->pattern->index[kj_ptrB];
233 }
234 }
235 C_ij[0+3*0]=C_ij_00;
236 C_ij[1+3*0]=C_ij_10;
237 C_ij[2+3*0]=C_ij_20;
238 C_ij[0+3*1]=C_ij_01;
239 C_ij[1+3*1]=C_ij_11;
240 C_ij[2+3*1]=C_ij_21;
241 C_ij[0+3*2]=C_ij_02;
242 C_ij[1+3*2]=C_ij_12;
243 C_ij[2+3*2]=C_ij_22;
244 }
245 }
246 } /* end of parallel region */
247 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_col_block_size == 4) ){
248 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
249 {
250 #pragma omp for schedule(static)
251 for(i = 0; i < n; i++) {
252 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
253 j = C->pattern->index[ij_ptrC];
254
255 C_ij_00=0;
256 C_ij_10=0;
257 C_ij_20=0;
258 C_ij_30=0;
259 C_ij_01=0;
260 C_ij_11=0;
261 C_ij_21=0;
262 C_ij_31=0;
263 C_ij_02=0;
264 C_ij_12=0;
265 C_ij_22=0;
266 C_ij_32=0;
267 C_ij_03=0;
268 C_ij_13=0;
269 C_ij_23=0;
270 C_ij_33=0;
271
272 C_ij=&(C->val[ij_ptrC*16]);
273
274 ik_ptrA=A->pattern->ptr[i];
275 ikb = A->pattern->ptr[i+1];
276 kj_ptrB=T->pattern->ptr[j];
277 kjb = T->pattern->ptr[j+1];
278 kA=A->pattern->index[ik_ptrA];
279 kB=T->pattern->index[kj_ptrB];
280 while (ik_ptrA < ikb && kj_ptrB < kjb) {
281 if (kA < kB) {
282 ik_ptrA ++;
283 if (ik_ptrA >= ikb) break;
284 kA=A->pattern->index[ik_ptrA];
285 } else if (kA > kB) {
286 kj_ptrB ++;
287 if (kj_ptrB >= kjb) break;
288 kB=T->pattern->index[kj_ptrB];
289 } else {
290 A_ik=&(A->val[ik_ptrA*16]);
291 B_kj=&(T->val[kj_ptrB*16]);
292
293 C_ij_00 +=A_ik[0+4*0]*B_kj[0+4*0]
294 +A_ik[0+4*1]*B_kj[1+4*0]
295 +A_ik[0+4*2]*B_kj[2+4*0]
296 +A_ik[0+4*3]*B_kj[3+4*0];
297 C_ij_10 +=A_ik[1+4*0]*B_kj[0+4*0]
298 +A_ik[1+4*1]*B_kj[1+4*0]
299 +A_ik[1+4*2]*B_kj[2+4*0]
300 +A_ik[1+4*3]*B_kj[3+4*0];
301 C_ij_20 +=A_ik[2+4*0]*B_kj[0+4*0]
302 +A_ik[2+4*1]*B_kj[1+4*0]
303 +A_ik[2+4*2]*B_kj[2+4*0]
304 +A_ik[2+4*3]*B_kj[3+4*0];
305 C_ij_30 +=A_ik[3+4*0]*B_kj[0+4*0]
306 +A_ik[3+4*1]*B_kj[1+4*0]
307 +A_ik[3+4*2]*B_kj[2+4*0]
308 +A_ik[3+4*3]*B_kj[3+4*0];
309
310 C_ij_01 +=A_ik[0+4*0]*B_kj[0+4*1]
311 +A_ik[0+4*1]*B_kj[1+4*1]
312 +A_ik[0+4*2]*B_kj[2+4*1]
313 +A_ik[0+4*3]*B_kj[3+4*1];
314 C_ij_11 +=A_ik[1+4*0]*B_kj[0+4*1]
315 +A_ik[1+4*1]*B_kj[1+4*1]
316 +A_ik[1+4*2]*B_kj[2+4*1]
317 +A_ik[1+4*3]*B_kj[3+4*1];
318 C_ij_21 +=A_ik[2+4*0]*B_kj[0+4*1]
319 +A_ik[2+4*1]*B_kj[1+4*1]
320 +A_ik[2+4*2]*B_kj[2+4*1]
321 +A_ik[2+4*3]*B_kj[3+4*1];
322 C_ij_31 +=A_ik[3+4*0]*B_kj[0+4*1]
323 +A_ik[3+4*1]*B_kj[1+4*1]
324 +A_ik[3+4*2]*B_kj[2+4*1]
325 +A_ik[3+4*3]*B_kj[3+4*1];
326
327 C_ij_02 +=A_ik[0+4*0]*B_kj[0+4*2]
328 +A_ik[0+4*1]*B_kj[1+4*2]
329 +A_ik[0+4*2]*B_kj[2+4*2]
330 +A_ik[0+4*3]*B_kj[3+4*2];
331 C_ij_12 +=A_ik[1+4*0]*B_kj[0+4*2]
332 +A_ik[1+4*1]*B_kj[1+4*2]
333 +A_ik[1+4*2]*B_kj[2+4*2]
334 +A_ik[1+4*3]*B_kj[3+4*2];
335 C_ij_22 +=A_ik[2+4*0]*B_kj[0+4*2]
336 +A_ik[2+4*1]*B_kj[1+4*2]
337 +A_ik[2+4*2]*B_kj[2+4*2]
338 +A_ik[2+4*3]*B_kj[3+4*2];
339 C_ij_32 +=A_ik[3+4*0]*B_kj[0+4*2]
340 +A_ik[3+4*1]*B_kj[1+4*2]
341 +A_ik[3+4*2]*B_kj[2+4*2]
342 +A_ik[3+4*3]*B_kj[3+4*2];
343
344 C_ij_03 +=A_ik[0+4*0]*B_kj[0+4*3]
345 +A_ik[0+4*1]*B_kj[1+4*3]
346 +A_ik[0+4*2]*B_kj[2+4*3]
347 +A_ik[0+4*3]*B_kj[3+4*3];
348 C_ij_13 +=A_ik[1+4*0]*B_kj[0+4*3]
349 +A_ik[1+4*1]*B_kj[1+4*3]
350 +A_ik[1+4*2]*B_kj[2+4*3]
351 +A_ik[1+4*3]*B_kj[3+4*3];
352 C_ij_23 +=A_ik[2+4*0]*B_kj[0+4*3]
353 +A_ik[2+4*1]*B_kj[1+4*3]
354 +A_ik[2+4*2]*B_kj[2+4*3]
355 +A_ik[2+4*3]*B_kj[3+4*3];
356 C_ij_33 +=A_ik[3+4*0]*B_kj[0+4*3]
357 +A_ik[3+4*1]*B_kj[1+4*3]
358 +A_ik[3+4*2]*B_kj[2+4*3]
359 +A_ik[3+4*3]*B_kj[3+4*3];
360 ik_ptrA ++;
361 kj_ptrB ++;
362 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
363 kA=A->pattern->index[ik_ptrA];
364 kB=T->pattern->index[kj_ptrB];
365 }
366 }
367 C_ij[0+4*0]=C_ij_00;
368 C_ij[1+4*0]=C_ij_10;
369 C_ij[2+4*0]=C_ij_20;
370 C_ij[3+4*0]=C_ij_30;
371 C_ij[0+4*1]=C_ij_01;
372 C_ij[1+4*1]=C_ij_11;
373 C_ij[2+4*1]=C_ij_21;
374 C_ij[3+4*1]=C_ij_31;
375 C_ij[0+4*2]=C_ij_02;
376 C_ij[1+4*2]=C_ij_12;
377 C_ij[2+4*2]=C_ij_22;
378 C_ij[3+4*2]=C_ij_32;
379 C_ij[0+4*3]=C_ij_03;
380 C_ij[1+4*3]=C_ij_13;
381 C_ij[2+4*3]=C_ij_23;
382 C_ij[3+4*3]=C_ij_33;
383 }
384 }
385 } /* end of parallel region */
386
387 } else {
388 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, rtmp, A_ik,B_kj )
389 {
390 #pragma omp for schedule(static)
391 for(i = 0; i < n; i++) {
392 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
393 j = C->pattern->index[ij_ptrC];
394 C_ij=&(C->val[ij_ptrC*C_block_size]);
395 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
396 ik_ptrA=A->pattern->ptr[i];
397 ikb = A->pattern->ptr[i+1];
398 kj_ptrB=T->pattern->ptr[j];
399 kjb = T->pattern->ptr[j+1];
400 kA=A->pattern->index[ik_ptrA];
401 kB=T->pattern->index[kj_ptrB];
402 while (ik_ptrA < ikb && kj_ptrB < kjb) {
403 if (kA < kB) {
404 ik_ptrA ++;
405 if (ik_ptrA >= ikb) break;
406 kA=A->pattern->index[ik_ptrA];
407 } else if (kA > kB) {
408 kj_ptrB ++;
409 if (kj_ptrB >= kjb) break;
410 kB=T->pattern->index[kj_ptrB];
411 } else {
412 A_ik=&(A->val[ik_ptrA*A_block_size]);
413 B_kj=&(T->val[kj_ptrB*B_block_size]);
414 for (irb=0; irb<row_block_size; ++irb) {
415 for (icb=0; icb<col_block_size; ++icb) {
416 rtmp=C_ij[irb+row_block_size*icb];
417 for (ib=0; ib<A_col_block_size; ++ib) {
418 rtmp+=A_ik[irb+row_block_size*ib]*B_kj[ib+A_col_block_size*icb];
419 }
420 C_ij[irb+row_block_size*icb]=rtmp;
421 }
422 }
423 ik_ptrA ++;
424 kj_ptrB ++;
425 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
426 kA=A->pattern->index[ik_ptrA];
427 kB=T->pattern->index[kj_ptrB];
428 }
429 }
430 }
431 }
432 } /* end of parallel region */
433
434 }
435 }
436
437 /* not good for block size 1 */
438 void Paso_SparseMatrix_MatrixMatrixTranspose_DB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B, const Paso_SparseMatrix* T)
439 {
440 const dim_t n = C->numRows;
441 const dim_t row_block_size = C->row_block_size;
442 const dim_t col_block_size = C->col_block_size;
443 const dim_t A_col_block_size = A->col_block_size;
444 const dim_t C_block_size =C->block_size;
445 const dim_t B_block_size =B->block_size;
446 const dim_t A_block_size =A->block_size;
447 double *C_ij, *A_ik, *B_kj;
448 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
449 dim_t i, ib, irb, icb;
450 index_t ij_ptrC, j, ik_ptrA, kj_ptrB, kA, kB, ikb, kjb;
451
452 if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_block_size == 2) ) {
453 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
454 {
455 #pragma omp for schedule(static)
456 for(i = 0; i < n; i++) {
457 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
458 j = C->pattern->index[ij_ptrC];
459
460 C_ij_00=0;
461 C_ij_10=0;
462 C_ij_01=0;
463 C_ij_11=0;
464
465 C_ij=&(C->val[ij_ptrC*4]);
466
467 ik_ptrA=A->pattern->ptr[i];
468 ikb = A->pattern->ptr[i+1];
469 kj_ptrB=T->pattern->ptr[j];
470 kjb = T->pattern->ptr[j+1];
471 kA=A->pattern->index[ik_ptrA];
472 kB=T->pattern->index[kj_ptrB];
473 while (ik_ptrA < ikb && kj_ptrB < kjb) {
474 if (kA < kB) {
475 ik_ptrA ++;
476 if (ik_ptrA >= ikb) break;
477 kA=A->pattern->index[ik_ptrA];
478 } else if (kA > kB) {
479 kj_ptrB ++;
480 if (kj_ptrB >= kjb) break;
481 kB=T->pattern->index[kj_ptrB];
482 } else {
483 A_ik=&(A->val[ik_ptrA*2]);
484 B_kj=&(T->val[kj_ptrB*4]);
485
486 C_ij_00 +=A_ik[0]*B_kj[0+2*0];
487 C_ij_10 +=A_ik[1]*B_kj[1+2*0];
488
489 C_ij_01 +=A_ik[0]*B_kj[0+2*1];
490 C_ij_11 +=A_ik[1]*B_kj[1+2*1];
491 ik_ptrA ++;
492 kj_ptrB ++;
493 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
494 kA=A->pattern->index[ik_ptrA];
495 kB=T->pattern->index[kj_ptrB];
496
497 }
498 }
499 C_ij[0+2*0]=C_ij_00;
500 C_ij[1+2*0]=C_ij_10;
501 C_ij[0+2*1]=C_ij_01;
502 C_ij[1+2*1]=C_ij_11;
503
504 }
505 }
506 } /* end of parallel region */
507
508 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_block_size == 3) ){
509 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
510 {
511 #pragma omp for schedule(static)
512 for(i = 0; i < n; i++) {
513 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
514 j = C->pattern->index[ij_ptrC];
515
516 C_ij_00=0;
517 C_ij_10=0;
518 C_ij_20=0;
519 C_ij_01=0;
520 C_ij_11=0;
521 C_ij_21=0;
522 C_ij_02=0;
523 C_ij_12=0;
524 C_ij_22=0;
525
526 C_ij=&(C->val[ij_ptrC*9]);
527
528 ik_ptrA=A->pattern->ptr[i];
529 ikb = A->pattern->ptr[i+1];
530 kj_ptrB=T->pattern->ptr[j];
531 kjb = T->pattern->ptr[j+1];
532 kA=A->pattern->index[ik_ptrA];
533 kB=T->pattern->index[kj_ptrB];
534 while (ik_ptrA < ikb && kj_ptrB < kjb) {
535 if (kA < kB) {
536 ik_ptrA ++;
537 if (ik_ptrA >= ikb) break;
538 kA=A->pattern->index[ik_ptrA];
539 } else if (kA > kB) {
540 kj_ptrB ++;
541 if (kj_ptrB >= kjb) break;
542 kB=T->pattern->index[kj_ptrB];
543 } else {
544 A_ik=&(A->val[ik_ptrA*3]);
545 B_kj=&(T->val[kj_ptrB*9]);
546
547 C_ij_00 +=A_ik[0]*B_kj[0+3*0];
548 C_ij_10 +=A_ik[1]*B_kj[1+3*0];
549 C_ij_20 +=A_ik[2]*B_kj[2+3*0];
550
551 C_ij_01 +=A_ik[0]*B_kj[0+3*1];
552 C_ij_11 +=A_ik[1]*B_kj[1+3*1];
553 C_ij_21 +=A_ik[2]*B_kj[2+3*1];
554
555 C_ij_02 +=A_ik[0]*B_kj[0+3*2];
556 C_ij_12 +=A_ik[1]*B_kj[1+3*2];
557 C_ij_22 +=A_ik[2]*B_kj[2+3*2];
558 ik_ptrA ++;
559 kj_ptrB ++;
560 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
561 kA=A->pattern->index[ik_ptrA];
562 kB=T->pattern->index[kj_ptrB];
563 }
564 }
565 C_ij[0+3*0]=C_ij_00;
566 C_ij[1+3*0]=C_ij_10;
567 C_ij[2+3*0]=C_ij_20;
568 C_ij[0+3*1]=C_ij_01;
569 C_ij[1+3*1]=C_ij_11;
570 C_ij[2+3*1]=C_ij_21;
571 C_ij[0+3*2]=C_ij_02;
572 C_ij[1+3*2]=C_ij_12;
573 C_ij[2+3*2]=C_ij_22;
574 }
575 }
576 } /* end of parallel region */
577 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_block_size == 4) ){
578 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
579 {
580 #pragma omp for schedule(static)
581 for(i = 0; i < n; i++) {
582 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
583 j = C->pattern->index[ij_ptrC];
584
585 C_ij_00=0;
586 C_ij_10=0;
587 C_ij_20=0;
588 C_ij_30=0;
589 C_ij_01=0;
590 C_ij_11=0;
591 C_ij_21=0;
592 C_ij_31=0;
593 C_ij_02=0;
594 C_ij_12=0;
595 C_ij_22=0;
596 C_ij_32=0;
597 C_ij_03=0;
598 C_ij_13=0;
599 C_ij_23=0;
600 C_ij_33=0;
601
602 C_ij=&(C->val[ij_ptrC*16]);
603
604 ik_ptrA=A->pattern->ptr[i];
605 ikb = A->pattern->ptr[i+1];
606 kj_ptrB=T->pattern->ptr[j];
607 kjb = T->pattern->ptr[j+1];
608 kA=A->pattern->index[ik_ptrA];
609 kB=T->pattern->index[kj_ptrB];
610 while (ik_ptrA < ikb && kj_ptrB < kjb) {
611 if (kA < kB) {
612 ik_ptrA ++;
613 if (ik_ptrA >= ikb) break;
614 kA=A->pattern->index[ik_ptrA];
615 } else if (kA > kB) {
616 kj_ptrB ++;
617 if (kj_ptrB >= kjb) break;
618 kB=T->pattern->index[kj_ptrB];
619 } else {
620 A_ik=&(A->val[ik_ptrA*4]);
621 B_kj=&(T->val[kj_ptrB*16]);
622
623 C_ij_00 +=A_ik[0]*B_kj[0+4*0];
624 C_ij_10 +=A_ik[1]*B_kj[1+4*0];
625 C_ij_20 +=A_ik[2]*B_kj[2+4*0];
626 C_ij_30 +=A_ik[3]*B_kj[3+4*0];
627
628 C_ij_01 +=A_ik[0]*B_kj[0+4*1];
629 C_ij_11 +=A_ik[1]*B_kj[1+4*1];
630 C_ij_21 +=A_ik[2]*B_kj[2+4*1];
631 C_ij_31 +=A_ik[3]*B_kj[3+4*1];
632
633 C_ij_02 +=A_ik[0]*B_kj[0+4*2];
634 C_ij_12 +=A_ik[1]*B_kj[1+4*2];
635 C_ij_22 +=A_ik[2]*B_kj[2+4*2];
636 C_ij_32 +=A_ik[3]*B_kj[3+4*2];
637
638 C_ij_03 +=A_ik[0]*B_kj[0+4*3];
639 C_ij_13 +=A_ik[1]*B_kj[1+4*3];
640 C_ij_23 +=A_ik[2]*B_kj[2+4*3];
641 C_ij_33 +=A_ik[3]*B_kj[3+4*3];
642
643 ik_ptrA ++;
644 kj_ptrB ++;
645 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
646 kA=A->pattern->index[ik_ptrA];
647 kB=T->pattern->index[kj_ptrB];
648 }
649 }
650 C_ij[0+4*0]=C_ij_00;
651 C_ij[1+4*0]=C_ij_10;
652 C_ij[2+4*0]=C_ij_20;
653 C_ij[3+4*0]=C_ij_30;
654 C_ij[0+4*1]=C_ij_01;
655 C_ij[1+4*1]=C_ij_11;
656 C_ij[2+4*1]=C_ij_21;
657 C_ij[3+4*1]=C_ij_31;
658 C_ij[0+4*2]=C_ij_02;
659 C_ij[1+4*2]=C_ij_12;
660 C_ij[2+4*2]=C_ij_22;
661 C_ij[3+4*2]=C_ij_32;
662 C_ij[0+4*3]=C_ij_03;
663 C_ij[1+4*3]=C_ij_13;
664 C_ij[2+4*3]=C_ij_23;
665 C_ij[3+4*3]=C_ij_33;
666 }
667 }
668 } /* end of parallel region */
669
670 } else {
671 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, rtmp, A_ik,B_kj )
672 {
673 #pragma omp for schedule(static)
674 for(i = 0; i < n; i++) {
675 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
676 j = C->pattern->index[ij_ptrC];
677 C_ij=&(C->val[ij_ptrC*C_block_size]);
678 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
679 ik_ptrA=A->pattern->ptr[i];
680 ikb = A->pattern->ptr[i+1];
681 kj_ptrB=T->pattern->ptr[j];
682 kjb = T->pattern->ptr[j+1];
683 kA=A->pattern->index[ik_ptrA];
684 kB=T->pattern->index[kj_ptrB];
685 while (ik_ptrA < ikb && kj_ptrB < kjb) {
686 if (kA < kB) {
687 ik_ptrA ++;
688 if (ik_ptrA >= ikb) break;
689 kA=A->pattern->index[ik_ptrA];
690 } else if (kA > kB) {
691 kj_ptrB ++;
692 if (kj_ptrB >= kjb) break;
693 kB=T->pattern->index[kj_ptrB];
694 } else {
695 A_ik=&(A->val[ik_ptrA*A_block_size]);
696 B_kj=&(T->val[kj_ptrB*B_block_size]);
697
698 for (irb=0; irb<A_block_size; ++irb) {
699 rtmp=A_ik[irb];
700 for (icb=0; icb<col_block_size; ++icb) {
701 C_ij[irb+row_block_size*icb]+=rtmp*B_kj[irb+A_col_block_size*icb];
702 }
703 }
704 ik_ptrA ++;
705 kj_ptrB ++;
706 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
707 kA=A->pattern->index[ik_ptrA];
708 kB=T->pattern->index[kj_ptrB];
709 }
710 }
711 }
712 }
713 } /* end of parallel region */
714
715 }
716 }
717
718 /* not good for block size 1 */
719 void Paso_SparseMatrix_MatrixMatrixTranspose_BD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B, const Paso_SparseMatrix* T)
720 {
721 const dim_t n = C->numRows;
722 const dim_t row_block_size = C->row_block_size;
723 const dim_t col_block_size = C->col_block_size;
724 const dim_t C_block_size =C->block_size;
725 const dim_t B_block_size =B->block_size;
726 const dim_t A_block_size =A->block_size;
727 double *C_ij, *A_ik, *B_kj;
728 register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
729 dim_t i, ib, irb, icb;
730 index_t ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB;
731
732 if ( (row_block_size == 2) && (col_block_size ==2 ) && (B_block_size == 2) ) {
733 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
734 {
735 #pragma omp for schedule(static)
736 for(i = 0; i < n; i++) {
737 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
738 j = C->pattern->index[ij_ptrC];
739
740 C_ij_00=0;
741 C_ij_10=0;
742 C_ij_01=0;
743 C_ij_11=0;
744
745 C_ij=&(C->val[ij_ptrC*4]);
746
747 ik_ptrA=A->pattern->ptr[i];
748 ikb = A->pattern->ptr[i+1];
749 kj_ptrB=T->pattern->ptr[j];
750 kjb = T->pattern->ptr[j+1];
751 kA=A->pattern->index[ik_ptrA];
752 kB=T->pattern->index[kj_ptrB];
753 while (ik_ptrA < ikb && kj_ptrB < kjb) {
754 if (kA < kB) {
755 ik_ptrA ++;
756 if (ik_ptrA >= ikb) break;
757 kA=A->pattern->index[ik_ptrA];
758 } else if (kA > kB) {
759 kj_ptrB ++;
760 if (kj_ptrB >= kjb) break;
761 kB=T->pattern->index[kj_ptrB];
762 } else {
763 A_ik=&(A->val[ik_ptrA*4]);
764 B_kj=&(T->val[kj_ptrB*2]);
765
766 C_ij_00 +=A_ik[0+2*0]*B_kj[0];
767 C_ij_10 +=A_ik[1+2*0]*B_kj[0];
768
769 C_ij_01 +=A_ik[0+2*1]*B_kj[1];
770 C_ij_11 +=A_ik[1+2*1]*B_kj[1];
771
772 ik_ptrA ++;
773 kj_ptrB ++;
774 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
775 kA=A->pattern->index[ik_ptrA];
776 kB=T->pattern->index[kj_ptrB];
777 }
778 }
779 C_ij[0+2*0]=C_ij_00;
780 C_ij[1+2*0]=C_ij_10;
781 C_ij[0+2*1]=C_ij_01;
782 C_ij[1+2*1]=C_ij_11;
783
784 }
785 }
786 } /* end of parallel region */
787
788 } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (B_block_size == 3) ){
789 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
790 {
791 #pragma omp for schedule(static)
792 for(i = 0; i < n; i++) {
793 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
794 j = C->pattern->index[ij_ptrC];
795
796 C_ij_00=0;
797 C_ij_10=0;
798 C_ij_20=0;
799 C_ij_01=0;
800 C_ij_11=0;
801 C_ij_21=0;
802 C_ij_02=0;
803 C_ij_12=0;
804 C_ij_22=0;
805
806 C_ij=&(C->val[ij_ptrC*9]);
807
808 ik_ptrA=A->pattern->ptr[i];
809 ikb = A->pattern->ptr[i+1];
810 kj_ptrB=T->pattern->ptr[j];
811 kjb = T->pattern->ptr[j+1];
812 kA=A->pattern->index[ik_ptrA];
813 kB=T->pattern->index[kj_ptrB];
814 while (ik_ptrA < ikb && kj_ptrB < kjb) {
815 if (kA < kB) {
816 ik_ptrA ++;
817 if (ik_ptrA >= ikb) break;
818 kA=A->pattern->index[ik_ptrA];
819 } else if (kA > kB) {
820 kj_ptrB ++;
821 if (kj_ptrB >= kjb) break;
822 kB=T->pattern->index[kj_ptrB];
823 } else {
824 A_ik=&(A->val[ik_ptrA*9]);
825 B_kj=&(T->val[kj_ptrB*3]);
826
827 C_ij_00 +=A_ik[0+3*0]*B_kj[0];
828 C_ij_10 +=A_ik[1+3*0]*B_kj[0];
829 C_ij_20 +=A_ik[2+3*0]*B_kj[0];
830
831 C_ij_01 +=A_ik[0+3*1]*B_kj[1];
832 C_ij_11 +=A_ik[1+3*1]*B_kj[1];
833 C_ij_21 +=A_ik[2+3*1]*B_kj[1];
834
835 C_ij_02 +=A_ik[0+3*2]*B_kj[2];
836 C_ij_12 +=A_ik[1+3*2]*B_kj[2];
837 C_ij_22 +=A_ik[2+3*2]*B_kj[2];
838
839 ik_ptrA ++;
840 kj_ptrB ++;
841 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
842 kA=A->pattern->index[ik_ptrA];
843 kB=T->pattern->index[kj_ptrB];
844 }
845 }
846 C_ij[0+3*0]=C_ij_00;
847 C_ij[1+3*0]=C_ij_10;
848 C_ij[2+3*0]=C_ij_20;
849 C_ij[0+3*1]=C_ij_01;
850 C_ij[1+3*1]=C_ij_11;
851 C_ij[2+3*1]=C_ij_21;
852 C_ij[0+3*2]=C_ij_02;
853 C_ij[1+3*2]=C_ij_12;
854 C_ij[2+3*2]=C_ij_22;
855 }
856 }
857 } /* end of parallel region */
858 } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (B_block_size == 4) ){
859 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
860 {
861 #pragma omp for schedule(static)
862 for(i = 0; i < n; i++) {
863 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
864 j = C->pattern->index[ij_ptrC];
865
866 C_ij_00=0;
867 C_ij_10=0;
868 C_ij_20=0;
869 C_ij_30=0;
870 C_ij_01=0;
871 C_ij_11=0;
872 C_ij_21=0;
873 C_ij_31=0;
874 C_ij_02=0;
875 C_ij_12=0;
876 C_ij_22=0;
877 C_ij_32=0;
878 C_ij_03=0;
879 C_ij_13=0;
880 C_ij_23=0;
881 C_ij_33=0;
882
883 C_ij=&(C->val[ij_ptrC*16]);
884
885 ik_ptrA=A->pattern->ptr[i];
886 ikb = A->pattern->ptr[i+1];
887 kj_ptrB=T->pattern->ptr[j];
888 kjb = T->pattern->ptr[j+1];
889 kA=A->pattern->index[ik_ptrA];
890 kB=T->pattern->index[kj_ptrB];
891 while (ik_ptrA < ikb && kj_ptrB < kjb) {
892 if (kA < kB) {
893 ik_ptrA ++;
894 if (ik_ptrA >= ikb) break;
895 kA=A->pattern->index[ik_ptrA];
896 } else if (kA > kB) {
897 kj_ptrB ++;
898 if (kj_ptrB >= kjb) break;
899 kB=T->pattern->index[kj_ptrB];
900 } else {
901 A_ik=&(A->val[ik_ptrA*16]);
902 B_kj=&(T->val[kj_ptrB*4]);
903
904 C_ij_00 +=A_ik[0+4*0]*B_kj[0];
905 C_ij_10 +=A_ik[1+4*0]*B_kj[0];
906 C_ij_20 +=A_ik[2+4*0]*B_kj[0];
907 C_ij_30 +=A_ik[3+4*0]*B_kj[0];
908
909 C_ij_01 +=A_ik[0+4*1]*B_kj[1];
910 C_ij_11 +=A_ik[1+4*1]*B_kj[1];
911 C_ij_21 +=A_ik[2+4*1]*B_kj[1];
912 C_ij_31 +=A_ik[3+4*1]*B_kj[1];
913
914 C_ij_02 +=A_ik[0+4*2]*B_kj[2];
915 C_ij_12 +=A_ik[1+4*2]*B_kj[2];
916 C_ij_22 +=A_ik[2+4*2]*B_kj[2];
917 C_ij_32 +=A_ik[3+4*2]*B_kj[2];
918
919 C_ij_03 +=A_ik[0+4*3]*B_kj[3];
920 C_ij_13 +=A_ik[1+4*3]*B_kj[3];
921 C_ij_23 +=A_ik[2+4*3]*B_kj[3];
922 C_ij_33 +=A_ik[3+4*3]*B_kj[3];
923
924 ik_ptrA ++;
925 kj_ptrB ++;
926 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
927 kA=A->pattern->index[ik_ptrA];
928 kB=T->pattern->index[kj_ptrB];
929 }
930 }
931 C_ij[0+4*0]=C_ij_00;
932 C_ij[1+4*0]=C_ij_10;
933 C_ij[2+4*0]=C_ij_20;
934 C_ij[3+4*0]=C_ij_30;
935 C_ij[0+4*1]=C_ij_01;
936 C_ij[1+4*1]=C_ij_11;
937 C_ij[2+4*1]=C_ij_21;
938 C_ij[3+4*1]=C_ij_31;
939 C_ij[0+4*2]=C_ij_02;
940 C_ij[1+4*2]=C_ij_12;
941 C_ij[2+4*2]=C_ij_22;
942 C_ij[3+4*2]=C_ij_32;
943 C_ij[0+4*3]=C_ij_03;
944 C_ij[1+4*3]=C_ij_13;
945 C_ij[2+4*3]=C_ij_23;
946 C_ij[3+4*3]=C_ij_33;
947 }
948 }
949 } /* end of parallel region */
950
951 } else {
952 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, irb, icb, ib, rtmp, A_ik,B_kj )
953 {
954 #pragma omp for schedule(static)
955 for(i = 0; i < n; i++) {
956 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
957 j = C->pattern->index[ij_ptrC];
958 C_ij=&(C->val[ij_ptrC*C_block_size]);
959 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
960 ik_ptrA=A->pattern->ptr[i];
961 ikb = A->pattern->ptr[i+1];
962 kj_ptrB=T->pattern->ptr[j];
963 kjb = T->pattern->ptr[j+1];
964 kA=A->pattern->index[ik_ptrA];
965 kB=T->pattern->index[kj_ptrB];
966 while (ik_ptrA < ikb && kj_ptrB < kjb) {
967 if (kA < kB) {
968 ik_ptrA ++;
969 if (ik_ptrA >= ikb) break;
970 kA=A->pattern->index[ik_ptrA];
971 } else if (kA > kB) {
972 kj_ptrB ++;
973 if (kj_ptrB >= kjb) break;
974 kB=T->pattern->index[kj_ptrB];
975 } else {
976 A_ik=&(A->val[ik_ptrA*A_block_size]);
977 B_kj=&(T->val[kj_ptrB*B_block_size]);
978
979 for (icb=0; icb<B_block_size; ++icb) {
980 rtmp=B_kj[icb];
981 for (irb=0; irb<row_block_size; ++irb) {
982 C_ij[irb+row_block_size*icb]+=A_ik[irb+row_block_size*icb]*rtmp;
983 }
984 }
985
986 ik_ptrA ++;
987 kj_ptrB ++;
988 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
989 kA=A->pattern->index[ik_ptrA];
990 kB=T->pattern->index[kj_ptrB];
991 }
992 }
993 }
994 }
995 } /* end of parallel region */
996
997 }
998 }
999
1000 /* not good for block size 1 */
1001 void Paso_SparseMatrix_MatrixMatrixTranspose_DD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B, const Paso_SparseMatrix* T)
1002 {
1003 const dim_t n = C->numRows;
1004 const dim_t C_block_size =C->block_size;
1005 const dim_t B_block_size =B->block_size;
1006 const dim_t A_block_size =A->block_size;
1007 double *C_ij, *A_ik, *B_kj;
1008 register double C_ij_0, C_ij_1, C_ij_2, C_ij_3;
1009 dim_t i, ib;
1010 index_t ij_ptrC, j, ik_ptrA, kA, kB, kj_ptrB, ikb, kjb;
1011
1012 if ( (A_block_size == 1) && (B_block_size ==1) && (C_block_size == 1) ) {
1013 #pragma omp parallel private(i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, C_ij_0)
1014 {
1015 #pragma omp for schedule(static)
1016 for(i = 0; i < n; i++) {
1017 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1018 j = C->pattern->index[ij_ptrC];
1019 C_ij_0=0;
1020 ik_ptrA=A->pattern->ptr[i];
1021 ikb = A->pattern->ptr[i+1];
1022 kj_ptrB=T->pattern->ptr[j];
1023 kjb = T->pattern->ptr[j+1];
1024 kA=A->pattern->index[ik_ptrA];
1025 kB=T->pattern->index[kj_ptrB];
1026 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1027 if (kA < kB) {
1028 ik_ptrA ++;
1029 if (ik_ptrA >= ikb) break;
1030 kA=A->pattern->index[ik_ptrA];
1031 } else if (kA > kB) {
1032 kj_ptrB ++;
1033 if (kj_ptrB >= kjb) break;
1034 kB=T->pattern->index[kj_ptrB];
1035 } else {
1036 C_ij_0+=A->val[ik_ptrA]*T->val[kj_ptrB];
1037 ik_ptrA ++;
1038 kj_ptrB ++;
1039 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1040 kA=A->pattern->index[ik_ptrA];
1041 kB=T->pattern->index[kj_ptrB];
1042 }
1043 }
1044 C->val[ij_ptrC]=C_ij_0;
1045 }
1046 }
1047 } /* end of parallel region */
1048 } else if ( (A_block_size == 2) && (B_block_size ==2) && (C_block_size == 2) ) {
1049 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, A_ik,B_kj, C_ij_0, C_ij_1)
1050 {
1051 #pragma omp for schedule(static)
1052 for(i = 0; i < n; i++) {
1053 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1054 j = C->pattern->index[ij_ptrC];
1055 C_ij=&(C->val[ij_ptrC*2]);
1056 C_ij_0=0;
1057 C_ij_1=0;
1058 ik_ptrA=A->pattern->ptr[i];
1059 ikb = A->pattern->ptr[i+1];
1060 kj_ptrB=T->pattern->ptr[j];
1061 kjb = T->pattern->ptr[j+1];
1062 kA=A->pattern->index[ik_ptrA];
1063 kB=T->pattern->index[kj_ptrB];
1064 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1065 if (kA < kB) {
1066 ik_ptrA ++;
1067 if (ik_ptrA >= ikb) break;
1068 kA=A->pattern->index[ik_ptrA];
1069 } else if (kA > kB) {
1070 kj_ptrB ++;
1071 if (kj_ptrB >= kjb) break;
1072 kB=T->pattern->index[kj_ptrB];
1073 } else {
1074 A_ik=&(A->val[ik_ptrA*2]);
1075 B_kj=&(T->val[kj_ptrB*2]);
1076 C_ij_0+=A_ik[0]*B_kj[0];
1077 C_ij_1+=A_ik[1]*B_kj[1];
1078 ik_ptrA ++;
1079 kj_ptrB ++;
1080 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1081 kA=A->pattern->index[ik_ptrA];
1082 kB=T->pattern->index[kj_ptrB];
1083 }
1084 }
1085 C_ij[0]=C_ij_0;
1086 C_ij[1]=C_ij_1;
1087 }
1088 }
1089 } /* end of parallel region */
1090 } else if ( (A_block_size == 3) && (B_block_size ==3) && (C_block_size == 3) ) {
1091 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2)
1092 {
1093 #pragma omp for schedule(static)
1094 for(i = 0; i < n; i++) {
1095 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1096 j = C->pattern->index[ij_ptrC];
1097 C_ij=&(C->val[ij_ptrC*C_block_size]);
1098 C_ij_0=0;
1099 C_ij_1=0;
1100 C_ij_2=0;
1101 ik_ptrA=A->pattern->ptr[i];
1102 ikb = A->pattern->ptr[i+1];
1103 kj_ptrB=T->pattern->ptr[j];
1104 kjb = T->pattern->ptr[j+1];
1105 kA=A->pattern->index[ik_ptrA];
1106 kB=T->pattern->index[kj_ptrB];
1107 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1108 if (kA < kB) {
1109 ik_ptrA ++;
1110 if (ik_ptrA >= ikb) break;
1111 kA=A->pattern->index[ik_ptrA];
1112 } else if (kA > kB) {
1113 kj_ptrB ++;
1114 if (kj_ptrB >= kjb) break;
1115 kB=T->pattern->index[kj_ptrB];
1116 } else {
1117 A_ik=&(A->val[ik_ptrA*3]);
1118 B_kj=&(T->val[kj_ptrB*3]);
1119 C_ij_0+=A_ik[0]*B_kj[0];
1120 C_ij_1+=A_ik[1]*B_kj[1];
1121 C_ij_2+=A_ik[2]*B_kj[2];
1122 ik_ptrA ++;
1123 kj_ptrB ++;
1124 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1125 kA=A->pattern->index[ik_ptrA];
1126 kB=T->pattern->index[kj_ptrB];
1127 }
1128 }
1129 C_ij[0]=C_ij_0;
1130 C_ij[1]=C_ij_1;
1131 C_ij[2]=C_ij_2;
1132 }
1133 }
1134 } /* end of parallel region */
1135 } else if ( (A_block_size == 4) && (B_block_size ==4 ) && (C_block_size == 4) ) {
1136 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2, C_ij_3 )
1137 {
1138 #pragma omp for schedule(static)
1139 for(i = 0; i < n; i++) {
1140 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1141 j = C->pattern->index[ij_ptrC];
1142 C_ij=&(C->val[ij_ptrC*C_block_size]);
1143 C_ij_0=0;
1144 C_ij_1=0;
1145 C_ij_2=0;
1146 C_ij_3=0;
1147 ik_ptrA=A->pattern->ptr[i];
1148 ikb = A->pattern->ptr[i+1];
1149 kj_ptrB=T->pattern->ptr[j];
1150 kjb = T->pattern->ptr[j+1];
1151 kA=A->pattern->index[ik_ptrA];
1152 kB=T->pattern->index[kj_ptrB];
1153 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1154 if (kA < kB) {
1155 ik_ptrA ++;
1156 if (ik_ptrA >= ikb) break;
1157 kA=A->pattern->index[ik_ptrA];
1158 } else if (kA > kB) {
1159 kj_ptrB ++;
1160 if (kj_ptrB >= kjb) break;
1161 kB=T->pattern->index[kj_ptrB];
1162 } else {
1163 A_ik=&(A->val[ik_ptrA*4]);
1164 B_kj=&(T->val[kj_ptrB*4]);
1165 C_ij_0+=A_ik[0]*B_kj[0];
1166 C_ij_1+=A_ik[1]*B_kj[1];
1167 C_ij_2+=A_ik[2]*B_kj[2];
1168 C_ij_3+=A_ik[3]*B_kj[3];
1169 ik_ptrA ++;
1170 kj_ptrB ++;
1171 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1172 kA=A->pattern->index[ik_ptrA];
1173 kB=T->pattern->index[kj_ptrB];
1174 }
1175 }
1176 C_ij[0]=C_ij_0;
1177 C_ij[1]=C_ij_1;
1178 C_ij[2]=C_ij_2;
1179 C_ij[3]=C_ij_3;
1180 }
1181 }
1182 } /* end of parallel region */
1183 } else {
1184 #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, kj_ptrB, ikb, kjb, kA, kB, ib, A_ik,B_kj )
1185 {
1186 #pragma omp for schedule(static)
1187 for(i = 0; i < n; i++) {
1188 for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1189 j = C->pattern->index[ij_ptrC];
1190 C_ij=&(C->val[ij_ptrC*C_block_size]);
1191 for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
1192 ik_ptrA=A->pattern->ptr[i];
1193 ikb = A->pattern->ptr[i+1];
1194 kj_ptrB=T->pattern->ptr[j];
1195 kjb = T->pattern->ptr[j+1];
1196 kA=A->pattern->index[ik_ptrA];
1197 kB=T->pattern->index[kj_ptrB];
1198 while (ik_ptrA < ikb && kj_ptrB < kjb) {
1199 if (kA < kB) {
1200 ik_ptrA ++;
1201 if (ik_ptrA >= ikb) break;
1202 kA=A->pattern->index[ik_ptrA];
1203 } else if (kA > kB) {
1204 kj_ptrB ++;
1205 if (kj_ptrB >= kjb) break;
1206 kB=T->pattern->index[kj_ptrB];
1207 } else {
1208 A_ik=&(A->val[ik_ptrA*A_block_size]);
1209 B_kj=&(T->val[kj_ptrB*B_block_size]);
1210 for (ib=0; ib<MIN(A_block_size, B_block_size); ++ib) C_ij[ib]+=A_ik[ib]*B_kj[ib];
1211 ik_ptrA ++;
1212 kj_ptrB ++;
1213 if (ik_ptrA >= ikb || kj_ptrB >= kjb) break;
1214 kA=A->pattern->index[ik_ptrA];
1215 kB=T->pattern->index[kj_ptrB];
1216 }
1217 }
1218 }
1219 }
1220 } /* end of parallel region */
1221 }
1222 }

  ViewVC Help
Powered by ViewVC 1.1.26