/[escript]/trunk/paso/src/SparseMatrix_MatrixMatrix.c
ViewVC logotype

Annotation of /trunk/paso/src/SparseMatrix_MatrixMatrix.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3303 - (hide annotations)
Mon Oct 25 04:33:31 2010 UTC (10 years, 6 months ago) by gross
File MIME type: text/plain
File size: 35965 byte(s)
more clean up work on the AMG
1 gross 3303
2     /*******************************************************
3     *
4     * Copyright (c) 2003-2010 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13    
14    
15     /**************************************************************
16    
17     Paso: Sparse matrix product
18    
19     **************************************************************
20    
21     Author: artak@uq.edu.au, l.gross@uq.edu.au
22    
23     **************************************************************/
24    
25     #include "SparseMatrix.h"
26     #include "Paso.h"
27    
28     Paso_SparseMatrix* Paso_SparseMatrix_MatrixMatrix(const Paso_SparseMatrix* A, const Paso_SparseMatrix* B) {
29     Paso_SparseMatrixType C_type;
30    
31     Paso_Pattern* outpattern=NULL;
32     Paso_SparseMatrix *out=NULL;
33     if ( ! ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (A->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & A->type ) ) ) {
34     Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of A.");
35     return NULL;
36     }
37     if ( ! ( (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) || (B->type & MATRIX_FORMAT_DEFAULT) || (MATRIX_FORMAT_BLK1 & B->type ) ) ) {
38     Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Unsupported matrix format of B.");
39     return NULL;
40     }
41     if (! (A->col_block_size == B->row_block_size) ) {
42     Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: Column block size of A and row block size of B must match.");
43     return NULL;
44     }
45     if (! (A->numCols == B->numRows) ) {
46     Esys_setError(TYPE_ERROR,"Paso_SparseMatrix_MatrixMatrix: number of columns of A and number of rows of B must match.");
47     return NULL;
48     }
49    
50    
51     if ( (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) && (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) ) {
52     C_type=MATRIX_FORMAT_DIAGONAL_BLOCK;
53     } else {
54     C_type=MATRIX_FORMAT_DEFAULT;
55     }
56    
57     outpattern=Paso_Pattern_multiply(PATTERN_FORMAT_DEFAULT,A->pattern,B->pattern);
58    
59     if (Esys_noError()) {
60     out=Paso_SparseMatrix_alloc(C_type, outpattern, A->row_block_size, B->col_block_size, FALSE);
61     }
62     Paso_Pattern_free(outpattern);
63    
64     if (Esys_noError()) {
65     if ( (A->row_block_size == 1) && (B->col_block_size ==1 ) && (A->col_block_size ==1) ) {
66     Paso_SparseMatrix_MatrixMatrix_DD(out, A,B);
67     } else {
68     if (A->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
69     if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
70     Paso_SparseMatrix_MatrixMatrix_DD(out, A,B);
71     } else {
72     Paso_SparseMatrix_MatrixMatrix_DB(out, A,B);
73     }
74     } else {
75     if (B->type & MATRIX_FORMAT_DIAGONAL_BLOCK) {
76     Paso_SparseMatrix_MatrixMatrix_BD(out, A,B);
77     } else {
78     Paso_SparseMatrix_MatrixMatrix_BB(out, A,B);
79     }
80     }
81     }
82     return out;
83     } else {
84     Paso_SparseMatrix_free(out);
85     return NULL;
86     }
87     }
88     /* not good for block size 1 */
89     void Paso_SparseMatrix_MatrixMatrix_BB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
90     {
91     const dim_t n = C->numRows;
92     const dim_t row_block_size = C->row_block_size;
93     const dim_t col_block_size = C->col_block_size;
94     const dim_t A_col_block_size = A->col_block_size;
95     const dim_t C_block_size =C->block_size;
96     const dim_t B_block_size =B->block_size;
97     const dim_t A_block_size =A->block_size;
98     double *C_ij, *A_ik, *B_kj;
99     register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
100     dim_t i, ib, irb, icb;
101     index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
102    
103     if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_col_block_size == 2) ) {
104     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
105     {
106     #pragma omp for schedule(static)
107     for(i = 0; i < n; i++) {
108     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
109     j = C->pattern->index[ij_ptrC];
110    
111     C_ij_00=0;
112     C_ij_10=0;
113     C_ij_01=0;
114     C_ij_11=0;
115    
116     C_ij=&(C->val[ij_ptrC*4]);
117    
118     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
119     k=A->pattern->index[ik_ptrA];
120     kj_ptrB=B->pattern->ptr[k];
121     /* find j in B[k,:] */
122     start_p=&(B->pattern->index[kj_ptrB]);
123     where_p=(index_t*)bsearch(&j, start_p,
124     B->pattern->ptr[k + 1]-kj_ptrB,
125     sizeof(index_t),
126     Paso_comparIndex);
127     if (! (where_p == NULL) ) { /* this should always be the case */
128     kj_ptrB += (index_t)(where_p-start_p);
129     A_ik=&(A->val[ik_ptrA*4]);
130     B_kj=&(B->val[kj_ptrB*4]);
131    
132     C_ij_00 +=A_ik[0+2*0]*B_kj[0+2*0]+A_ik[0+2*1]*B_kj[1+2*0];
133     C_ij_10 +=A_ik[1+2*0]*B_kj[0+2*0]+A_ik[1+2*1]*B_kj[1+2*0];
134    
135     C_ij_01 +=A_ik[0+2*0]*B_kj[0+2*1]+A_ik[0+2*1]*B_kj[1+2*1];
136     C_ij_11 +=A_ik[1+2*0]*B_kj[0+2*1]+A_ik[1+2*1]*B_kj[1+2*1];
137     }
138    
139     }
140     C_ij[0+2*0]=C_ij_00;
141     C_ij[1+2*0]=C_ij_10;
142     C_ij[0+2*1]=C_ij_01;
143     C_ij[1+2*1]=C_ij_11;
144    
145     }
146     }
147     } /* end of parallel region */
148    
149     } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_col_block_size == 3) ){
150     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
151     {
152     #pragma omp for schedule(static)
153     for(i = 0; i < n; i++) {
154     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
155     j = C->pattern->index[ij_ptrC];
156    
157     C_ij_00=0;
158     C_ij_10=0;
159     C_ij_20=0;
160     C_ij_01=0;
161     C_ij_11=0;
162     C_ij_21=0;
163     C_ij_02=0;
164     C_ij_12=0;
165     C_ij_22=0;
166    
167     C_ij=&(C->val[ij_ptrC*9]);
168    
169     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
170     k=A->pattern->index[ik_ptrA];
171     kj_ptrB=B->pattern->ptr[k];
172     /* find j in B[k,:] */
173     start_p=&(B->pattern->index[kj_ptrB]);
174     where_p=(index_t*)bsearch(&j, start_p,
175     B->pattern->ptr[k + 1]-kj_ptrB,
176     sizeof(index_t),
177     Paso_comparIndex);
178     if (! (where_p == NULL) ) { /* this should always be the case */
179     kj_ptrB += (index_t)(where_p-start_p);
180     A_ik=&(A->val[ik_ptrA*9]);
181     B_kj=&(B->val[kj_ptrB*9]);
182    
183     C_ij_00 +=A_ik[0+3*0]*B_kj[0+3*0]
184     +A_ik[0+3*1]*B_kj[1+3*0]
185     +A_ik[0+3*2]*B_kj[2+3*0];
186     C_ij_10 +=A_ik[1+3*0]*B_kj[0+3*0]
187     +A_ik[1+3*1]*B_kj[1+3*0]
188     +A_ik[1+3*2]*B_kj[2+3*0];
189     C_ij_20 +=A_ik[2+3*0]*B_kj[0+3*0]
190     +A_ik[2+3*1]*B_kj[1+3*0]
191     +A_ik[2+3*2]*B_kj[2+3*0];
192    
193     C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*1]
194     +A_ik[0+3*1]*B_kj[1+3*1]
195     +A_ik[0+3*2]*B_kj[2+3*1];
196     C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*1]
197     +A_ik[1+3*1]*B_kj[1+3*1]
198     +A_ik[1+3*2]*B_kj[2+3*1];
199     C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*1]
200     +A_ik[2+3*1]*B_kj[1+3*1]
201     +A_ik[2+3*2]*B_kj[2+3*1];
202    
203     C_ij_01 +=A_ik[0+3*0]*B_kj[0+3*2]
204     +A_ik[0+3*1]*B_kj[1+3*2]
205     +A_ik[0+3*2]*B_kj[2+3*2];
206     C_ij_11 +=A_ik[1+3*0]*B_kj[0+3*2]
207     +A_ik[1+3*1]*B_kj[1+3*2]
208     +A_ik[1+3*2]*B_kj[2+3*2];
209     C_ij_21 +=A_ik[2+3*0]*B_kj[0+3*2]
210     +A_ik[2+3*1]*B_kj[1+3*2]
211     +A_ik[2+3*2]*B_kj[2+3*2];
212     }
213    
214     }
215     C_ij[0+3*0]=C_ij_00;
216     C_ij[1+3*0]=C_ij_10;
217     C_ij[2+3*0]=C_ij_20;
218     C_ij[0+3*1]=C_ij_01;
219     C_ij[1+3*1]=C_ij_11;
220     C_ij[2+3*1]=C_ij_21;
221     C_ij[0+3*2]=C_ij_02;
222     C_ij[1+3*2]=C_ij_12;
223     C_ij[2+3*2]=C_ij_22;
224     }
225     }
226     } /* end of parallel region */
227     } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_col_block_size == 4) ){
228     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
229     {
230     #pragma omp for schedule(static)
231     for(i = 0; i < n; i++) {
232     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
233     j = C->pattern->index[ij_ptrC];
234    
235     C_ij_00=0;
236     C_ij_10=0;
237     C_ij_20=0;
238     C_ij_30=0;
239     C_ij_01=0;
240     C_ij_11=0;
241     C_ij_21=0;
242     C_ij_31=0;
243     C_ij_02=0;
244     C_ij_12=0;
245     C_ij_22=0;
246     C_ij_32=0;
247     C_ij_03=0;
248     C_ij_13=0;
249     C_ij_23=0;
250     C_ij_33=0;
251    
252     C_ij=&(C->val[ij_ptrC*16]);
253    
254     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
255     k=A->pattern->index[ik_ptrA];
256     kj_ptrB=B->pattern->ptr[k];
257     /* find j in B[k,:] */
258     start_p=&(B->pattern->index[kj_ptrB]);
259     where_p=(index_t*)bsearch(&j, start_p,
260     B->pattern->ptr[k + 1]-kj_ptrB,
261     sizeof(index_t),
262     Paso_comparIndex);
263     if (! (where_p == NULL) ) { /* this should always be the case */
264     kj_ptrB += (index_t)(where_p-start_p);
265     A_ik=&(A->val[ik_ptrA*16]);
266     B_kj=&(B->val[kj_ptrB*16]);
267    
268     C_ij_00 +=A_ik[0+4*0]*B_kj[0+4*0]
269     +A_ik[0+4*1]*B_kj[1+4*0]
270     +A_ik[0+4*2]*B_kj[2+4*0]
271     +A_ik[0+4*3]*B_kj[3+4*0];
272     C_ij_10 +=A_ik[1+4*0]*B_kj[0+4*0]
273     +A_ik[1+4*1]*B_kj[1+4*0]
274     +A_ik[1+4*2]*B_kj[2+4*0]
275     +A_ik[1+4*3]*B_kj[3+4*0];
276     C_ij_20 +=A_ik[2+4*0]*B_kj[0+4*0]
277     +A_ik[2+4*1]*B_kj[1+4*0]
278     +A_ik[2+4*2]*B_kj[2+4*0]
279     +A_ik[2+4*3]*B_kj[3+4*0];
280     C_ij_30 +=A_ik[3+4*0]*B_kj[0+4*0]
281     +A_ik[3+4*1]*B_kj[1+4*0]
282     +A_ik[3+4*2]*B_kj[2+4*0]
283     +A_ik[3+4*3]*B_kj[3+4*0];
284    
285     C_ij_01 +=A_ik[0+4*0]*B_kj[0+4*1]
286     +A_ik[0+4*1]*B_kj[1+4*1]
287     +A_ik[0+4*2]*B_kj[2+4*1]
288     +A_ik[0+4*3]*B_kj[3+4*1];
289     C_ij_11 +=A_ik[1+4*0]*B_kj[0+4*1]
290     +A_ik[1+4*1]*B_kj[1+4*1]
291     +A_ik[1+4*2]*B_kj[2+4*1]
292     +A_ik[1+4*3]*B_kj[3+4*1];
293     C_ij_21 +=A_ik[2+4*0]*B_kj[0+4*1]
294     +A_ik[2+4*1]*B_kj[1+4*1]
295     +A_ik[2+4*2]*B_kj[2+4*1]
296     +A_ik[2+4*3]*B_kj[3+4*1];
297     C_ij_31 +=A_ik[3+4*0]*B_kj[0+4*1]
298     +A_ik[3+4*1]*B_kj[1+4*1]
299     +A_ik[3+4*2]*B_kj[2+4*1]
300     +A_ik[3+4*3]*B_kj[3+4*1];
301    
302     C_ij_02 +=A_ik[0+4*0]*B_kj[0+4*2]
303     +A_ik[0+4*1]*B_kj[1+4*2]
304     +A_ik[0+4*2]*B_kj[2+4*2]
305     +A_ik[0+4*3]*B_kj[3+4*2];
306     C_ij_12 +=A_ik[1+4*0]*B_kj[0+4*2]
307     +A_ik[1+4*1]*B_kj[1+4*2]
308     +A_ik[1+4*2]*B_kj[2+4*2]
309     +A_ik[1+4*3]*B_kj[3+4*2];
310     C_ij_22 +=A_ik[2+4*0]*B_kj[0+4*2]
311     +A_ik[2+4*1]*B_kj[1+4*2]
312     +A_ik[2+4*2]*B_kj[2+4*2]
313     +A_ik[2+4*3]*B_kj[3+4*2];
314     C_ij_32 +=A_ik[3+4*0]*B_kj[0+4*2]
315     +A_ik[3+4*1]*B_kj[1+4*2]
316     +A_ik[3+4*2]*B_kj[2+4*2]
317     +A_ik[3+4*3]*B_kj[3+4*2];
318    
319     C_ij_03 +=A_ik[0+4*0]*B_kj[0+4*3]
320     +A_ik[0+4*1]*B_kj[1+4*3]
321     +A_ik[0+4*2]*B_kj[2+4*3]
322     +A_ik[0+4*3]*B_kj[3+4*3];
323     C_ij_13 +=A_ik[1+4*0]*B_kj[0+4*3]
324     +A_ik[1+4*1]*B_kj[1+4*3]
325     +A_ik[1+4*2]*B_kj[2+4*3]
326     +A_ik[1+4*3]*B_kj[3+4*3];
327     C_ij_23 +=A_ik[2+4*0]*B_kj[0+4*3]
328     +A_ik[2+4*1]*B_kj[1+4*3]
329     +A_ik[2+4*2]*B_kj[2+4*3]
330     +A_ik[2+4*3]*B_kj[3+4*3];
331     C_ij_33 +=A_ik[3+4*0]*B_kj[0+4*3]
332     +A_ik[3+4*1]*B_kj[1+4*3]
333     +A_ik[3+4*2]*B_kj[2+4*3]
334     +A_ik[3+4*3]*B_kj[3+4*3];
335     }
336     }
337     C_ij[0+4*0]=C_ij_00;
338     C_ij[1+4*0]=C_ij_10;
339     C_ij[2+4*0]=C_ij_20;
340     C_ij[3+4*0]=C_ij_30;
341     C_ij[0+4*1]=C_ij_01;
342     C_ij[1+4*1]=C_ij_11;
343     C_ij[2+4*1]=C_ij_21;
344     C_ij[3+4*1]=C_ij_31;
345     C_ij[0+4*2]=C_ij_02;
346     C_ij[1+4*2]=C_ij_12;
347     C_ij[2+4*2]=C_ij_22;
348     C_ij[3+4*2]=C_ij_32;
349     C_ij[0+4*3]=C_ij_03;
350     C_ij[1+4*3]=C_ij_13;
351     C_ij[2+4*3]=C_ij_23;
352     C_ij[3+4*3]=C_ij_33;
353     }
354     }
355     } /* end of parallel region */
356    
357     } else {
358     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
359     {
360     #pragma omp for schedule(static)
361     for(i = 0; i < n; i++) {
362     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
363     j = C->pattern->index[ij_ptrC];
364     C_ij=&(C->val[ij_ptrC*C_block_size]);
365     for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
366     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
367     k=A->pattern->index[ik_ptrA];
368     kj_ptrB=B->pattern->ptr[k];
369     /* find j in B[k,:] */
370     start_p=&(B->pattern->index[kj_ptrB]);
371     where_p=(index_t*)bsearch(&j, start_p,
372     B->pattern->ptr[k + 1]-kj_ptrB,
373     sizeof(index_t),
374     Paso_comparIndex);
375     if (! (where_p == NULL) ) { /* this should always be the case */
376     kj_ptrB += (index_t)(where_p-start_p);
377     A_ik=&(A->val[ik_ptrA*A_block_size]);
378     B_kj=&(B->val[kj_ptrB*B_block_size]);
379    
380     for (irb=0; irb<row_block_size; ++irb) {
381     for (icb=0; icb<col_block_size; ++icb) {
382     rtmp=C_ij[irb+row_block_size*icb];
383     for (ib=0; ib<A_col_block_size; ++ib) {
384     rtmp+=A_ik[irb+row_block_size*ib]*B_kj[ib+A_col_block_size*icb];
385     }
386     C_ij[irb+row_block_size*icb]=rtmp;
387     }
388     }
389     }
390     }
391     }
392     }
393     } /* end of parallel region */
394    
395     }
396     }
397    
398     /* not good for block size 1 */
399     void Paso_SparseMatrix_MatrixMatrix_DB(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
400     {
401     const dim_t n = C->numRows;
402     const dim_t row_block_size = C->row_block_size;
403     const dim_t col_block_size = C->col_block_size;
404     const dim_t A_col_block_size = A->col_block_size;
405     const dim_t C_block_size =C->block_size;
406     const dim_t B_block_size =B->block_size;
407     const dim_t A_block_size =A->block_size;
408     double *C_ij, *A_ik, *B_kj;
409     register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
410     dim_t i, ib, irb, icb;
411     index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
412    
413     if ( (row_block_size == 2) && (col_block_size ==2 ) && (A_block_size == 2) ) {
414     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
415     {
416     #pragma omp for schedule(static)
417     for(i = 0; i < n; i++) {
418     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
419     j = C->pattern->index[ij_ptrC];
420    
421     C_ij_00=0;
422     C_ij_10=0;
423     C_ij_01=0;
424     C_ij_11=0;
425    
426     C_ij=&(C->val[ij_ptrC*4]);
427    
428     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
429     k=A->pattern->index[ik_ptrA];
430     kj_ptrB=B->pattern->ptr[k];
431     /* find j in B[k,:] */
432     start_p=&(B->pattern->index[kj_ptrB]);
433     where_p=(index_t*)bsearch(&j, start_p,
434     B->pattern->ptr[k + 1]-kj_ptrB,
435     sizeof(index_t),
436     Paso_comparIndex);
437     if (! (where_p == NULL) ) { /* this should always be the case */
438     kj_ptrB += (index_t)(where_p-start_p);
439     A_ik=&(A->val[ik_ptrA*2]);
440     B_kj=&(B->val[kj_ptrB*4]);
441    
442     C_ij_00 +=A_ik[0]*B_kj[0+2*0];
443     C_ij_10 +=A_ik[1]*B_kj[1+2*0];
444    
445     C_ij_01 +=A_ik[0]*B_kj[0+2*1];
446     C_ij_11 +=A_ik[1]*B_kj[1+2*1];
447     }
448    
449     }
450     C_ij[0+2*0]=C_ij_00;
451     C_ij[1+2*0]=C_ij_10;
452     C_ij[0+2*1]=C_ij_01;
453     C_ij[1+2*1]=C_ij_11;
454    
455     }
456     }
457     } /* end of parallel region */
458    
459     } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (A_block_size == 3) ){
460     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
461     {
462     #pragma omp for schedule(static)
463     for(i = 0; i < n; i++) {
464     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
465     j = C->pattern->index[ij_ptrC];
466    
467     C_ij_00=0;
468     C_ij_10=0;
469     C_ij_20=0;
470     C_ij_01=0;
471     C_ij_11=0;
472     C_ij_21=0;
473     C_ij_02=0;
474     C_ij_12=0;
475     C_ij_22=0;
476    
477     C_ij=&(C->val[ij_ptrC*9]);
478    
479     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
480     k=A->pattern->index[ik_ptrA];
481     kj_ptrB=B->pattern->ptr[k];
482     /* find j in B[k,:] */
483     start_p=&(B->pattern->index[kj_ptrB]);
484     where_p=(index_t*)bsearch(&j, start_p,
485     B->pattern->ptr[k + 1]-kj_ptrB,
486     sizeof(index_t),
487     Paso_comparIndex);
488     if (! (where_p == NULL) ) { /* this should always be the case */
489     kj_ptrB += (index_t)(where_p-start_p);
490     A_ik=&(A->val[ik_ptrA*3]);
491     B_kj=&(B->val[kj_ptrB*9]);
492    
493     C_ij_00 +=A_ik[0]*B_kj[0+3*0];
494     C_ij_10 +=A_ik[1]*B_kj[1+3*0];
495     C_ij_20 +=A_ik[2]*B_kj[2+3*0];
496    
497     C_ij_01 +=A_ik[0]*B_kj[0+3*1];
498     C_ij_11 +=A_ik[1]*B_kj[1+3*1];
499     C_ij_21 +=A_ik[2]*B_kj[2+3*1];
500    
501     C_ij_02 +=A_ik[0]*B_kj[0+3*2];
502     C_ij_12 +=A_ik[1]*B_kj[1+3*2];
503     C_ij_22 +=A_ik[2]*B_kj[2+3*2];
504    
505     }
506    
507     }
508     C_ij[0+3*0]=C_ij_00;
509     C_ij[1+3*0]=C_ij_10;
510     C_ij[2+3*0]=C_ij_20;
511     C_ij[0+3*1]=C_ij_01;
512     C_ij[1+3*1]=C_ij_11;
513     C_ij[2+3*1]=C_ij_21;
514     C_ij[0+3*2]=C_ij_02;
515     C_ij[1+3*2]=C_ij_12;
516     C_ij[2+3*2]=C_ij_22;
517     }
518     }
519     } /* end of parallel region */
520     } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (A_block_size == 4) ){
521     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
522     {
523     #pragma omp for schedule(static)
524     for(i = 0; i < n; i++) {
525     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
526     j = C->pattern->index[ij_ptrC];
527    
528     C_ij_00=0;
529     C_ij_10=0;
530     C_ij_20=0;
531     C_ij_30=0;
532     C_ij_01=0;
533     C_ij_11=0;
534     C_ij_21=0;
535     C_ij_31=0;
536     C_ij_02=0;
537     C_ij_12=0;
538     C_ij_22=0;
539     C_ij_32=0;
540     C_ij_03=0;
541     C_ij_13=0;
542     C_ij_23=0;
543     C_ij_33=0;
544    
545     C_ij=&(C->val[ij_ptrC*16]);
546    
547     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
548     k=A->pattern->index[ik_ptrA];
549     kj_ptrB=B->pattern->ptr[k];
550     /* find j in B[k,:] */
551     start_p=&(B->pattern->index[kj_ptrB]);
552     where_p=(index_t*)bsearch(&j, start_p,
553     B->pattern->ptr[k + 1]-kj_ptrB,
554     sizeof(index_t),
555     Paso_comparIndex);
556     if (! (where_p == NULL) ) { /* this should always be the case */
557     kj_ptrB += (index_t)(where_p-start_p);
558     A_ik=&(A->val[ik_ptrA*4]);
559     B_kj=&(B->val[kj_ptrB*16]);
560    
561     C_ij_00 +=A_ik[0]*B_kj[0+4*0];
562     C_ij_10 +=A_ik[1]*B_kj[1+4*0];
563     C_ij_20 +=A_ik[2]*B_kj[2+4*0];
564     C_ij_30 +=A_ik[3]*B_kj[3+4*0];
565    
566     C_ij_01 +=A_ik[0]*B_kj[0+4*1];
567     C_ij_11 +=A_ik[1]*B_kj[1+4*1];
568     C_ij_21 +=A_ik[2]*B_kj[2+4*1];
569     C_ij_31 +=A_ik[3]*B_kj[3+4*1];
570    
571     C_ij_02 +=A_ik[0]*B_kj[0+4*2];
572     C_ij_12 +=A_ik[1]*B_kj[1+4*2];
573     C_ij_22 +=A_ik[2]*B_kj[2+4*2];
574     C_ij_32 +=A_ik[3]*B_kj[3+4*2];
575    
576     C_ij_03 +=A_ik[0]*B_kj[0+4*3];
577     C_ij_13 +=A_ik[1]*B_kj[1+4*3];
578     C_ij_23 +=A_ik[2]*B_kj[2+4*3];
579     C_ij_33 +=A_ik[3]*B_kj[3+4*3];
580     }
581     }
582     C_ij[0+4*0]=C_ij_00;
583     C_ij[1+4*0]=C_ij_10;
584     C_ij[2+4*0]=C_ij_20;
585     C_ij[3+4*0]=C_ij_30;
586     C_ij[0+4*1]=C_ij_01;
587     C_ij[1+4*1]=C_ij_11;
588     C_ij[2+4*1]=C_ij_21;
589     C_ij[3+4*1]=C_ij_31;
590     C_ij[0+4*2]=C_ij_02;
591     C_ij[1+4*2]=C_ij_12;
592     C_ij[2+4*2]=C_ij_22;
593     C_ij[3+4*2]=C_ij_32;
594     C_ij[0+4*3]=C_ij_03;
595     C_ij[1+4*3]=C_ij_13;
596     C_ij[2+4*3]=C_ij_23;
597     C_ij[3+4*3]=C_ij_33;
598     }
599     }
600     } /* end of parallel region */
601    
602     } else {
603     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
604     {
605     #pragma omp for schedule(static)
606     for(i = 0; i < n; i++) {
607     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
608     j = C->pattern->index[ij_ptrC];
609     C_ij=&(C->val[ij_ptrC*C_block_size]);
610     for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
611     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
612     k=A->pattern->index[ik_ptrA];
613     kj_ptrB=B->pattern->ptr[k];
614     /* find j in B[k,:] */
615     start_p=&(B->pattern->index[kj_ptrB]);
616     where_p=(index_t*)bsearch(&j, start_p,
617     B->pattern->ptr[k + 1]-kj_ptrB,
618     sizeof(index_t),
619     Paso_comparIndex);
620     if (! (where_p == NULL) ) { /* this should always be the case */
621     kj_ptrB += (index_t)(where_p-start_p);
622     A_ik=&(A->val[ik_ptrA*A_block_size]);
623     B_kj=&(B->val[kj_ptrB*B_block_size]);
624    
625     for (irb=0; irb<A_block_size; ++irb) {
626     rtmp=A_ik[irb];
627     for (icb=0; icb<col_block_size; ++icb) {
628     C_ij[irb+row_block_size*icb]+=rtmp*B_kj[irb+A_col_block_size*icb];
629     }
630     }
631     }
632     }
633     }
634     }
635     } /* end of parallel region */
636    
637     }
638     }
639    
640     /* not good for block size 1 */
641     void Paso_SparseMatrix_MatrixMatrix_BD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
642     {
643     const dim_t n = C->numRows;
644     const dim_t row_block_size = C->row_block_size;
645     const dim_t col_block_size = C->col_block_size;
646     const dim_t C_block_size =C->block_size;
647     const dim_t B_block_size =B->block_size;
648     const dim_t A_block_size =A->block_size;
649     double *C_ij, *A_ik, *B_kj;
650     register double rtmp, C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33;
651     dim_t i, ib, irb, icb;
652     index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
653    
654     if ( (row_block_size == 2) && (col_block_size ==2 ) && (B_block_size == 2) ) {
655     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_01, C_ij_11)
656     {
657     #pragma omp for schedule(static)
658     for(i = 0; i < n; i++) {
659     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
660     j = C->pattern->index[ij_ptrC];
661    
662     C_ij_00=0;
663     C_ij_10=0;
664     C_ij_01=0;
665     C_ij_11=0;
666    
667     C_ij=&(C->val[ij_ptrC*4]);
668    
669     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
670     k=A->pattern->index[ik_ptrA];
671     kj_ptrB=B->pattern->ptr[k];
672     /* find j in B[k,:] */
673     start_p=&(B->pattern->index[kj_ptrB]);
674     where_p=(index_t*)bsearch(&j, start_p,
675     B->pattern->ptr[k + 1]-kj_ptrB,
676     sizeof(index_t),
677     Paso_comparIndex);
678     if (! (where_p == NULL) ) { /* this should always be the case */
679     kj_ptrB += (index_t)(where_p-start_p);
680     A_ik=&(A->val[ik_ptrA*4]);
681     B_kj=&(B->val[kj_ptrB*2]);
682    
683     C_ij_00 +=A_ik[0+2*0]*B_kj[0];
684     C_ij_10 +=A_ik[1+2*0]*B_kj[0];
685    
686     C_ij_01 +=A_ik[0+2*1]*B_kj[1];
687     C_ij_11 +=A_ik[1+2*1]*B_kj[1];
688     }
689    
690     }
691     C_ij[0+2*0]=C_ij_00;
692     C_ij[1+2*0]=C_ij_10;
693     C_ij[0+2*1]=C_ij_01;
694     C_ij[1+2*1]=C_ij_11;
695    
696     }
697     }
698     } /* end of parallel region */
699    
700     } else if ( (row_block_size == 3) && (col_block_size ==3 ) && (B_block_size == 3) ){
701     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_01, C_ij_11, C_ij_21, C_ij_02, C_ij_12, C_ij_22)
702     {
703     #pragma omp for schedule(static)
704     for(i = 0; i < n; i++) {
705     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
706     j = C->pattern->index[ij_ptrC];
707    
708     C_ij_00=0;
709     C_ij_10=0;
710     C_ij_20=0;
711     C_ij_01=0;
712     C_ij_11=0;
713     C_ij_21=0;
714     C_ij_02=0;
715     C_ij_12=0;
716     C_ij_22=0;
717    
718     C_ij=&(C->val[ij_ptrC*9]);
719    
720     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
721     k=A->pattern->index[ik_ptrA];
722     kj_ptrB=B->pattern->ptr[k];
723     /* find j in B[k,:] */
724     start_p=&(B->pattern->index[kj_ptrB]);
725     where_p=(index_t*)bsearch(&j, start_p,
726     B->pattern->ptr[k + 1]-kj_ptrB,
727     sizeof(index_t),
728     Paso_comparIndex);
729     if (! (where_p == NULL) ) { /* this should always be the case */
730     kj_ptrB += (index_t)(where_p-start_p);
731     A_ik=&(A->val[ik_ptrA*9]);
732     B_kj=&(B->val[kj_ptrB*3]);
733    
734     C_ij_00 +=A_ik[0+3*0]*B_kj[0];
735     C_ij_10 +=A_ik[1+3*0]*B_kj[0];
736     C_ij_20 +=A_ik[2+3*0]*B_kj[0];
737    
738     C_ij_01 +=A_ik[0+3*1]*B_kj[1];
739     C_ij_11 +=A_ik[1+3*1]*B_kj[1];
740     C_ij_21 +=A_ik[2+3*1]*B_kj[1];
741    
742     C_ij_01 +=A_ik[0+3*2]*B_kj[2];
743     C_ij_11 +=A_ik[1+3*2]*B_kj[2];
744     C_ij_21 +=A_ik[2+3*2]*B_kj[2];
745     }
746    
747     }
748     C_ij[0+3*0]=C_ij_00;
749     C_ij[1+3*0]=C_ij_10;
750     C_ij[2+3*0]=C_ij_20;
751     C_ij[0+3*1]=C_ij_01;
752     C_ij[1+3*1]=C_ij_11;
753     C_ij[2+3*1]=C_ij_21;
754     C_ij[0+3*2]=C_ij_02;
755     C_ij[1+3*2]=C_ij_12;
756     C_ij[2+3*2]=C_ij_22;
757     }
758     }
759     } /* end of parallel region */
760     } else if ( (row_block_size == 4) && (col_block_size ==4 ) && (B_block_size == 4) ){
761     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, A_ik,B_kj,C_ij_00, C_ij_10, C_ij_20, C_ij_30, C_ij_01, C_ij_11, C_ij_21, C_ij_31, C_ij_02, C_ij_12, C_ij_22, C_ij_32, C_ij_03, C_ij_13, C_ij_23, C_ij_33)
762     {
763     #pragma omp for schedule(static)
764     for(i = 0; i < n; i++) {
765     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
766     j = C->pattern->index[ij_ptrC];
767    
768     C_ij_00=0;
769     C_ij_10=0;
770     C_ij_20=0;
771     C_ij_30=0;
772     C_ij_01=0;
773     C_ij_11=0;
774     C_ij_21=0;
775     C_ij_31=0;
776     C_ij_02=0;
777     C_ij_12=0;
778     C_ij_22=0;
779     C_ij_32=0;
780     C_ij_03=0;
781     C_ij_13=0;
782     C_ij_23=0;
783     C_ij_33=0;
784    
785     C_ij=&(C->val[ij_ptrC*16]);
786    
787     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
788     k=A->pattern->index[ik_ptrA];
789     kj_ptrB=B->pattern->ptr[k];
790     /* find j in B[k,:] */
791     start_p=&(B->pattern->index[kj_ptrB]);
792     where_p=(index_t*)bsearch(&j, start_p,
793     B->pattern->ptr[k + 1]-kj_ptrB,
794     sizeof(index_t),
795     Paso_comparIndex);
796     if (! (where_p == NULL) ) { /* this should always be the case */
797     kj_ptrB += (index_t)(where_p-start_p);
798     A_ik=&(A->val[ik_ptrA*16]);
799     B_kj=&(B->val[kj_ptrB*4]);
800    
801     C_ij_00 +=A_ik[0+4*0]*B_kj[0];
802     C_ij_10 +=A_ik[1+4*0]*B_kj[0];
803     C_ij_20 +=A_ik[2+4*0]*B_kj[0];
804     C_ij_30 +=A_ik[3+4*0]*B_kj[0];
805    
806     C_ij_01 +=A_ik[0+4*1]*B_kj[1];
807     C_ij_11 +=A_ik[1+4*1]*B_kj[1];
808     C_ij_21 +=A_ik[2+4*1]*B_kj[1];
809     C_ij_31 +=A_ik[3+4*1]*B_kj[1];
810    
811     C_ij_02 +=A_ik[0+4*2]*B_kj[2];
812     C_ij_12 +=A_ik[1+4*2]*B_kj[2];
813     C_ij_22 +=A_ik[2+4*2]*B_kj[2];
814     C_ij_32 +=A_ik[3+4*2]*B_kj[2];
815    
816     C_ij_03 +=A_ik[0+4*3]*B_kj[3];
817     C_ij_13 +=A_ik[1+4*3]*B_kj[3];
818     C_ij_23 +=A_ik[2+4*3]*B_kj[3];
819     C_ij_33 +=A_ik[3+4*3]*B_kj[3];
820     }
821     }
822     C_ij[0+4*0]=C_ij_00;
823     C_ij[1+4*0]=C_ij_10;
824     C_ij[2+4*0]=C_ij_20;
825     C_ij[3+4*0]=C_ij_30;
826     C_ij[0+4*1]=C_ij_01;
827     C_ij[1+4*1]=C_ij_11;
828     C_ij[2+4*1]=C_ij_21;
829     C_ij[3+4*1]=C_ij_31;
830     C_ij[0+4*2]=C_ij_02;
831     C_ij[1+4*2]=C_ij_12;
832     C_ij[2+4*2]=C_ij_22;
833     C_ij[3+4*2]=C_ij_32;
834     C_ij[0+4*3]=C_ij_03;
835     C_ij[1+4*3]=C_ij_13;
836     C_ij[2+4*3]=C_ij_23;
837     C_ij[3+4*3]=C_ij_33;
838     }
839     }
840     } /* end of parallel region */
841    
842     } else {
843     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, irb, icb, ib, rtmp, A_ik,B_kj )
844     {
845     #pragma omp for schedule(static)
846     for(i = 0; i < n; i++) {
847     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
848     j = C->pattern->index[ij_ptrC];
849     C_ij=&(C->val[ij_ptrC*C_block_size]);
850     for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
851     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
852     k=A->pattern->index[ik_ptrA];
853     kj_ptrB=B->pattern->ptr[k];
854     /* find j in B[k,:] */
855     start_p=&(B->pattern->index[kj_ptrB]);
856     where_p=(index_t*)bsearch(&j, start_p,
857     B->pattern->ptr[k + 1]-kj_ptrB,
858     sizeof(index_t),
859     Paso_comparIndex);
860     if (! (where_p == NULL) ) { /* this should always be the case */
861     kj_ptrB += (index_t)(where_p-start_p);
862     A_ik=&(A->val[ik_ptrA*A_block_size]);
863     B_kj=&(B->val[kj_ptrB*B_block_size]);
864    
865     for (icb=0; icb<B_block_size; ++icb) {
866     rtmp=B_kj[icb];
867     for (irb=0; irb<row_block_size; ++irb) {
868     C_ij[irb+row_block_size*icb]+=A_ik[irb+row_block_size*icb]*rtmp;
869     }
870     }
871     }
872     }
873     }
874     }
875     } /* end of parallel region */
876    
877     }
878     }
879    
880     /* not good for block size 1 */
881     void Paso_SparseMatrix_MatrixMatrix_DD(Paso_SparseMatrix* C, const Paso_SparseMatrix* A, const Paso_SparseMatrix* B)
882     {
883     const dim_t n = C->numRows;
884     const dim_t C_block_size =C->block_size;
885     const dim_t B_block_size =B->block_size;
886     const dim_t A_block_size =A->block_size;
887     double *C_ij, *A_ik, *B_kj;
888     register double C_ij_0, C_ij_1, C_ij_2, C_ij_3;
889     dim_t i, ib;
890     index_t ij_ptrC, j, ik_ptrA, k, kj_ptrB, *start_p, *where_p;
891    
892     if ( (A_block_size == 1) && (B_block_size ==1) && (C_block_size == 1) ) {
893     #pragma omp parallel private(i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, C_ij_0)
894     {
895     #pragma omp for schedule(static)
896     for(i = 0; i < n; i++) {
897     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
898     j = C->pattern->index[ij_ptrC];
899     C_ij_0=0;
900     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
901     k=A->pattern->index[ik_ptrA];
902     kj_ptrB=B->pattern->ptr[k];
903     /* find j in B[k,:] */
904     start_p=&(B->pattern->index[kj_ptrB]);
905     where_p=(index_t*)bsearch(&j, start_p,
906     B->pattern->ptr[k + 1]-kj_ptrB,
907     sizeof(index_t),
908     Paso_comparIndex);
909     if (! (where_p == NULL) ) { /* this should always be the case */
910     kj_ptrB += (index_t)(where_p-start_p);
911     C_ij_0+=A->val[ik_ptrA]*B->val[kj_ptrB];
912     }
913    
914     }
915     C->val[ij_ptrC]=C_ij_0;
916     }
917     }
918     } /* end of parallel region */
919     } else if ( (A_block_size == 2) && (B_block_size ==2) && (C_block_size == 2) ) {
920     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1)
921     {
922     #pragma omp for schedule(static)
923     for(i = 0; i < n; i++) {
924     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
925     j = C->pattern->index[ij_ptrC];
926     C_ij=&(C->val[ij_ptrC*2]);
927     C_ij_0=0;
928     C_ij_1=0;
929     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
930     k=A->pattern->index[ik_ptrA];
931     kj_ptrB=B->pattern->ptr[k];
932     /* find j in B[k,:] */
933     start_p=&(B->pattern->index[kj_ptrB]);
934     where_p=(index_t*)bsearch(&j, start_p,
935     B->pattern->ptr[k + 1]-kj_ptrB,
936     sizeof(index_t),
937     Paso_comparIndex);
938     if (! (where_p == NULL) ) { /* this should always be the case */
939     kj_ptrB += (index_t)(where_p-start_p);
940     A_ik=&(A->val[ik_ptrA*2]);
941     B_kj=&(B->val[kj_ptrB*2]);
942    
943     C_ij_0+=A_ik[0]*B_kj[0];
944     C_ij_1+=A_ik[1]*B_kj[1];
945     }
946    
947     }
948     C_ij[0]=C_ij_0;
949     C_ij[1]=C_ij_1;
950     }
951     }
952     } /* end of parallel region */
953     } else if ( (A_block_size == 3) && (B_block_size ==3) && (C_block_size == 3) ) {
954     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2)
955     {
956     #pragma omp for schedule(static)
957     for(i = 0; i < n; i++) {
958     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
959     j = C->pattern->index[ij_ptrC];
960     C_ij=&(C->val[ij_ptrC*C_block_size]);
961     C_ij_0=0;
962     C_ij_1=0;
963     C_ij_2=0;
964     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
965     k=A->pattern->index[ik_ptrA];
966     kj_ptrB=B->pattern->ptr[k];
967     /* find j in B[k,:] */
968     start_p=&(B->pattern->index[kj_ptrB]);
969     where_p=(index_t*)bsearch(&j, start_p,
970     B->pattern->ptr[k + 1]-kj_ptrB,
971     sizeof(index_t),
972     Paso_comparIndex);
973     if (! (where_p == NULL) ) { /* this should always be the case */
974     kj_ptrB += (index_t)(where_p-start_p);
975     A_ik=&(A->val[ik_ptrA*3]);
976     B_kj=&(B->val[kj_ptrB*3]);
977    
978     C_ij_0+=A_ik[0]*B_kj[0];
979     C_ij_1+=A_ik[1]*B_kj[1];
980     C_ij_2+=A_ik[2]*B_kj[2];
981     }
982    
983     }
984     C_ij[0]=C_ij_0;
985     C_ij[1]=C_ij_1;
986     C_ij[2]=C_ij_2;
987     }
988     }
989     } /* end of parallel region */
990     } else if ( (A_block_size == 4) && (B_block_size ==4 ) && (C_block_size == 4) ) {
991     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, A_ik,B_kj, C_ij_0, C_ij_1, C_ij_2, C_ij_3 )
992     {
993     #pragma omp for schedule(static)
994     for(i = 0; i < n; i++) {
995     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
996     j = C->pattern->index[ij_ptrC];
997     C_ij=&(C->val[ij_ptrC*C_block_size]);
998     C_ij_0=0;
999     C_ij_1=0;
1000     C_ij_2=0;
1001     C_ij_3=0;
1002     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
1003     k=A->pattern->index[ik_ptrA];
1004     kj_ptrB=B->pattern->ptr[k];
1005     /* find j in B[k,:] */
1006     start_p=&(B->pattern->index[kj_ptrB]);
1007     where_p=(index_t*)bsearch(&j, start_p,
1008     B->pattern->ptr[k + 1]-kj_ptrB,
1009     sizeof(index_t),
1010     Paso_comparIndex);
1011     if (! (where_p == NULL) ) { /* this should always be the case */
1012     kj_ptrB += (index_t)(where_p-start_p);
1013     A_ik=&(A->val[ik_ptrA*4]);
1014     B_kj=&(B->val[kj_ptrB*4]);
1015    
1016     C_ij_0+=A_ik[0]*B_kj[0];
1017     C_ij_1+=A_ik[1]*B_kj[1];
1018     C_ij_2+=A_ik[2]*B_kj[2];
1019     C_ij_3+=A_ik[3]*B_kj[3];
1020     }
1021    
1022     }
1023     C_ij[0]=C_ij_0;
1024     C_ij[1]=C_ij_1;
1025     C_ij[2]=C_ij_2;
1026     C_ij[3]=C_ij_3;
1027     }
1028     }
1029     } /* end of parallel region */
1030     } else {
1031     #pragma omp parallel private(C_ij, i, ij_ptrC, j, ik_ptrA, k, kj_ptrB, start_p, where_p, ib, A_ik,B_kj )
1032     {
1033     #pragma omp for schedule(static)
1034     for(i = 0; i < n; i++) {
1035     for(ij_ptrC = C->pattern->ptr[i]; ij_ptrC < C->pattern->ptr[i+1]; ++ij_ptrC) {
1036     j = C->pattern->index[ij_ptrC];
1037     C_ij=&(C->val[ij_ptrC*C_block_size]);
1038     for (ib=0; ib<C_block_size; ++ib) C_ij[ib]=0;
1039     for(ik_ptrA = A->pattern->ptr[i]; ik_ptrA < A->pattern->ptr[i+1]; ++ik_ptrA ) {
1040     k=A->pattern->index[ik_ptrA];
1041     kj_ptrB=B->pattern->ptr[k];
1042     /* find j in B[k,:] */
1043     start_p=&(B->pattern->index[kj_ptrB]);
1044     where_p=(index_t*)bsearch(&j, start_p,
1045     B->pattern->ptr[k + 1]-kj_ptrB,
1046     sizeof(index_t),
1047     Paso_comparIndex);
1048     if (! (where_p == NULL) ) { /* this should always be the case */
1049     kj_ptrB += (index_t)(where_p-start_p);
1050     A_ik=&(A->val[ik_ptrA*A_block_size]);
1051     B_kj=&(B->val[kj_ptrB*B_block_size]);
1052     for (ib=0; ib<MIN(A_block_size, B_block_size); ++ib) C_ij[ib]+=A_ik[ib]*B_kj[ib];
1053     }
1054     }
1055     }
1056     }
1057     } /* end of parallel region */
1058     }
1059     }

  ViewVC Help
Powered by ViewVC 1.1.26