/[escript]/trunk/cusplibrary/cusp/detail/host/spmv.h
ViewVC logotype

Contents of /trunk/cusplibrary/cusp/detail/host/spmv.h

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5244 - (show annotations)
Thu Nov 6 11:03:58 2014 UTC (5 years, 3 months ago) by caltinay
File MIME type: text/plain
File size: 17964 byte(s)
block size 2 optimizations.

1 #pragma once
2
3 #include <thrust/functional.h>
4 #include <cusp/detail/functional.h>
5
6 #ifndef DIA_CHUNKSIZE
7 #define DIA_CHUNKSIZE 1024
8 #endif
9
10 //MW: add some OpenMP pragmas
11 namespace cusp
12 {
13 namespace detail
14 {
15 namespace host
16 {
17
18 //////////////
19 // COO SpMV //
20 //////////////
21 template <typename Matrix,
22 typename Vector1,
23 typename Vector2,
24 typename UnaryFunction,
25 typename BinaryFunction1,
26 typename BinaryFunction2>
27 void spmv_coo(const Matrix& A,
28 const Vector1& x,
29 Vector2& y,
30 UnaryFunction initialize,
31 BinaryFunction1 combine,
32 BinaryFunction2 reduce)
33 {
34 typedef typename Matrix::index_type IndexType;
35 typedef typename Vector2::value_type ValueType;
36
37 for(size_t i = 0; i < A.num_rows; i++)
38 y[i] = initialize(y[i]);
39
40 for(size_t n = 0; n < A.num_entries; n++)
41 {
42 const IndexType& i = A.row_indices[n];
43 const IndexType& j = A.column_indices[n];
44 const ValueType& Aij = A.values[n];
45 const ValueType& xj = x[j];
46
47 y[i] = reduce(y[i], combine(Aij, xj));
48 }
49 }
50
51 template <typename Matrix,
52 typename Vector1,
53 typename Vector2>
54 void spmv_coo(const Matrix& A,
55 const Vector1& x,
56 Vector2& y)
57 {
58 typedef typename Vector2::value_type ValueType;
59
60 spmv_coo(A, x, y,
61 cusp::detail::zero_function<ValueType>(),
62 thrust::multiplies<ValueType>(),
63 thrust::plus<ValueType>());
64 }
65
66
67 //////////////
68 // CSR SpMV //
69 //////////////
70 template <typename Matrix,
71 typename Vector1,
72 typename Vector2,
73 typename UnaryFunction,
74 typename BinaryFunction1,
75 typename BinaryFunction2>
76 void spmv_csr(const Matrix& A,
77 const Vector1& x,
78 Vector2& y,
79 UnaryFunction initialize,
80 BinaryFunction1 combine,
81 BinaryFunction2 reduce)
82 {
83 typedef typename Matrix::index_type IndexType;
84 typedef typename Vector2::value_type ValueType;
85
86 #pragma omp parallel for
87 for(size_t i = 0; i < A.num_rows; i++)
88 {
89 const IndexType& row_start = A.row_offsets[i];
90 const IndexType& row_end = A.row_offsets[i+1];
91
92 ValueType accumulator = initialize(y[i]);
93
94 for (IndexType jj = row_start; jj < row_end; jj++)
95 {
96 const IndexType& j = A.column_indices[jj];
97 const ValueType& Aij = A.values[jj];
98 const ValueType& xj = x[j];
99
100 accumulator = reduce(accumulator, combine(Aij, xj));
101 }
102
103 y[i] = accumulator;
104 }
105 }
106
107
108 template <typename Matrix,
109 typename Vector1,
110 typename Vector2>
111 void spmv_csr(const Matrix& A,
112 const Vector1& x,
113 Vector2& y)
114 {
115 typedef typename Vector2::value_type ValueType;
116
117 spmv_csr(A, x, y,
118 cusp::detail::zero_function<ValueType>(),
119 thrust::multiplies<ValueType>(),
120 thrust::plus<ValueType>());
121 }
122
123
124 //////////////
125 // DIA SpMV //
126 //////////////
127 template <typename Matrix,
128 typename Vector1,
129 typename Vector2,
130 typename UnaryFunction,
131 typename BinaryFunction1,
132 typename BinaryFunction2>
133 void spmv_dia(const Matrix& A,
134 const Vector1& x,
135 Vector2& y,
136 UnaryFunction initialize,
137 BinaryFunction1 combine,
138 BinaryFunction2 reduce)
139 {
140 typedef typename Matrix::index_type IndexType;
141 //typedef typename Vector2::value_type ValueType;
142
143 const size_t num_diagonals = A.values.num_cols;
144
145 if (A.symmetric) {
146 #pragma omp parallel for
147 for (size_t ch = 0; ch < A.num_rows; ch += DIA_CHUNKSIZE) {
148 for (size_t row = ch; row < std::min(ch+DIA_CHUNKSIZE,A.num_rows); row++)
149 {
150 // initialize
151 y[row] = initialize(y[row]);
152
153 // process subdiagonals
154 for (size_t d = 0; d < num_diagonals; d++)
155 {
156 const size_t diag = num_diagonals-d-1;
157 // skip main diagonal which is handled below
158 if (A.diagonal_offsets[diag] == 0)
159 continue;
160 const IndexType col = row - A.diagonal_offsets[diag];
161 if (col >= 0 && col < A.num_rows)
162 {
163 y[row] = reduce(y[row], combine(A.values(col, diag), x[col]));
164 }
165 }
166 // process main and upper diagonals
167 for (size_t d = 0; d < num_diagonals; d++)
168 {
169 const IndexType col = row + A.diagonal_offsets[d];
170 if (col >= 0 && col < A.num_cols)
171 {
172 y[row] = reduce(y[row], combine(A.values(row, d), x[col]));
173 }
174 }
175 }
176 }
177 } else {
178 #pragma omp parallel for
179 for (size_t ch = 0; ch < A.num_rows; ch += DIA_CHUNKSIZE) {
180 // initialize chunk
181 for (size_t row = ch; row < std::min(ch+DIA_CHUNKSIZE,A.num_rows); row++)
182 {
183 y[row] = initialize(y[row]);
184 }
185 // for each diagonal
186 for (size_t d = 0; d < num_diagonals; d++)
187 {
188 for (IndexType row=ch; row<std::min(ch+DIA_CHUNKSIZE,A.num_rows); row++)
189 {
190 const IndexType col = row + A.diagonal_offsets[d];
191 if (col >= 0 && col < A.num_cols)
192 {
193 y[row] = reduce(y[row], combine(A.values(row, d), x[col]));
194 }
195 }
196 }
197 }
198 }
199 }
200
201 template <typename Matrix,
202 typename Vector1,
203 typename Vector2>
204 void spmv_dia(const Matrix& A,
205 const Vector1& x,
206 Vector2& y)
207 {
208 typedef typename Vector2::value_type ValueType;
209
210 spmv_dia(A, x, y,
211 cusp::detail::zero_function<ValueType>(),
212 thrust::multiplies<ValueType>(),
213 thrust::plus<ValueType>());
214 }
215
216
217 //////////////
218 // CDS SpMV //
219 //////////////
220 template <typename Matrix,
221 typename Vector1,
222 typename Vector2,
223 typename UnaryFunction,
224 typename BinaryFunction1,
225 typename BinaryFunction2>
226 void spmv_cds(const Matrix& A,
227 const Vector1& x,
228 Vector2& y,
229 UnaryFunction initialize,
230 BinaryFunction1 combine,
231 BinaryFunction2 reduce)
232 {
233 typedef typename Matrix::index_type IndexType;
234 typedef typename Vector2::value_type ValueType;
235
236 const IndexType num_diagonals = A.diagonal_offsets.size();
237 const IndexType block_size = (IndexType)A.block_size;
238 const IndexType num_rows = (IndexType)A.num_rows;
239 // make chunksize a multiple of block_size
240 const IndexType chunksize = block_size*(DIA_CHUNKSIZE/block_size);
241
242 // optimization for special case
243 if (block_size == 2) {
244 if (A.symmetric) {
245 // if there is a main diagonal block, it is the first in offsets
246 // and should be skipped in the first loop below since the main
247 // diagonal is processed in the second loop
248 const IndexType d0 = (A.diagonal_offsets[0] == 0 ? 1 : 0);
249 #pragma omp parallel for
250 for (IndexType ch = 0; ch < num_rows; ch+=chunksize)
251 {
252 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row++)
253 {
254 y[row] = initialize(y[row]);
255 }
256
257 // process subdiagonal blocks
258 for (IndexType d = 0; d < num_diagonals-d0; d++)
259 {
260 const IndexType diag = num_diagonals-d-1;
261 const IndexType k = -2*A.diagonal_offsets[diag];
262 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row+=2)
263 {
264 const IndexType col = row + k;
265 if (col >= 0 && col <= num_rows-2)
266 {
267 y[row] = reduce(y[row], combine(A.values(col, 2*diag), x[col]));
268 y[row] = reduce(y[row], combine(A.values(col+1,2*diag), x[col+1]));
269 y[row+1] = reduce(y[row+1],combine(A.values(col, 2*diag+1),x[col]));
270 y[row+1] = reduce(y[row+1],combine(A.values(col+1,2*diag+1),x[col+1]));
271 }
272 }
273 }
274 // process main and upper diagonal blocks
275 for (IndexType d = 0; d < num_diagonals; d++)
276 {
277 const IndexType k = 2*A.diagonal_offsets[d];
278 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row+=2)
279 {
280 const IndexType col = row + k;
281 if (col >= 0 && col <= num_rows-2)
282 {
283 y[row] = reduce(y[row], combine(A.values(row, 2*d), x[col]));
284 y[row+1] = reduce(y[row+1],combine(A.values(row+1,2*d), x[col]));
285 y[row] = reduce(y[row], combine(A.values(row, 2*d+1),x[col+1]));
286 y[row+1] = reduce(y[row+1],combine(A.values(row+1,2*d+1),x[col+1]));
287 }
288 }
289 }
290 }
291 } else { // !A.symmetric
292 #pragma omp parallel for
293 for (IndexType ch = 0; ch < num_rows; ch+=chunksize)
294 {
295 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row+=2)
296 {
297 ValueType sum1 = initialize(y[row]);
298 ValueType sum2 = initialize(y[row+1]);
299 // for each diagonal block
300 for (IndexType d = 0; d < num_diagonals; d++)
301 {
302 const IndexType col = row + A.diagonal_offsets[d]*2;
303 if (col >= 0 && col <= num_rows-2)
304 {
305 sum1 = reduce(sum1,combine(A.values(row, 2*d), x[col]));
306 sum2 = reduce(sum2,combine(A.values(row+1,2*d), x[col]));
307 sum1 = reduce(sum1,combine(A.values(row, 2*d+1),x[col+1]));
308 sum2 = reduce(sum2,combine(A.values(row+1,2*d+1),x[col+1]));
309 }
310 }
311 y[row] = sum1;
312 y[row+1] = sum2;
313 }
314 }
315 } // A.symmetric
316 } else { // block size
317 if (A.symmetric) {
318 // if there is a main diagonal block, it is the first in offsets
319 // and should be skipped in the first loop below since the main
320 // diagonal is processed in the second loop
321 const IndexType d0 = (A.diagonal_offsets[0] == 0 ? 1 : 0);
322 const ValueType* values = thrust::raw_pointer_cast(&A.values.values[0]);
323 const IndexType pitch = A.values.pitch;
324 #pragma omp parallel for
325 for (IndexType ch = 0; ch < num_rows; ch+=chunksize)
326 {
327 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row++)
328 {
329 y[row] = initialize(y[row]);
330 }
331
332 IndexType idx = pitch*block_size*(num_diagonals-1);
333 // process subdiagonal blocks
334 for (IndexType d = 0; d < num_diagonals-d0; d++)
335 {
336 const IndexType diag = num_diagonals-d-1;
337 const IndexType k = -block_size*A.diagonal_offsets[diag];
338 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row+=block_size)
339 {
340 const IndexType col = row + k;
341 if (col >= 0 && col <= num_rows-block_size)
342 {
343 // for each row in block
344 for (IndexType j = 0; j < block_size; j++)
345 {
346 // for each column in block
347 for (IndexType i = 0; i < block_size; i++)
348 {
349 const ValueType& Aij = values[idx+col+i+j*pitch];
350 const ValueType& xj = x[col + i];
351
352 y[row+j] = reduce(y[row+j], combine(Aij, xj));
353 }
354 }
355 }
356 }
357 idx -= block_size*pitch;
358 }
359 // process main and upper diagonal blocks
360 for (IndexType d = 0; d < num_diagonals; d++)
361 {
362 const IndexType k = A.diagonal_offsets[d]*block_size;
363 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row+=block_size)
364 {
365 const IndexType col = row + k;
366 if (col >= 0 && col <= num_rows-block_size)
367 {
368 // for each column in block
369 for (IndexType i = 0; i < block_size; i++)
370 {
371 // for each row in block
372 for (IndexType j = 0; j < block_size; j++)
373 {
374 const ValueType& Aij = values[row+j+(d*block_size+i)*pitch];
375 const ValueType& xj = x[col + i];
376
377 y[row+j] = reduce(y[row+j], combine(Aij, xj));
378 }
379 }
380 }
381 }
382 } // diagonals
383 }
384 } else { // !A.symmetric
385 #pragma omp parallel for
386 for (IndexType ch = 0; ch < num_rows; ch+=chunksize)
387 {
388 for (IndexType row = ch; row<std::min(ch+chunksize,num_rows); row++)
389 {
390 y[row] = initialize(y[row]);
391 }
392
393 // for each diagonal block
394 for (IndexType d = 0; d < num_diagonals; d++)
395 {
396 const IndexType k = A.diagonal_offsets[d]*block_size;
397
398 for (IndexType row=ch; row<std::min(ch+chunksize,num_rows); row+=block_size)
399 {
400 const IndexType col = row + k;
401 if (col >= 0 && col <= num_rows-block_size)
402 {
403 // for each column in block
404 for (IndexType i = 0; i < block_size; i++)
405 {
406 // for each row in block
407 for (IndexType j = 0; j < block_size; j++)
408 {
409 const ValueType& Aij = A.values(row+j, d*block_size+i);
410 const ValueType& xj = x[col + i];
411
412 y[row+j] = reduce(y[row+j], combine(Aij, xj));
413 }
414 }
415 }
416 }
417 } // diagonals
418 } // row chunks
419 } // A.symmetric
420 } // block size
421 }
422
423 template <typename Matrix,
424 typename Vector1,
425 typename Vector2>
426 void spmv_cds(const Matrix& A,
427 const Vector1& x,
428 Vector2& y)
429 {
430 typedef typename Vector2::value_type ValueType;
431
432 if (A.block_size == 1) {
433 spmv_dia(A, x, y,
434 cusp::detail::zero_function<ValueType>(),
435 thrust::multiplies<ValueType>(),
436 thrust::plus<ValueType>());
437 } else {
438 spmv_cds(A, x, y,
439 cusp::detail::zero_function<ValueType>(),
440 thrust::multiplies<ValueType>(),
441 thrust::plus<ValueType>());
442 }
443 }
444
445
446 //////////////
447 // ELL SpMV //
448 //////////////
449 template <typename Matrix,
450 typename Vector1,
451 typename Vector2,
452 typename UnaryFunction,
453 typename BinaryFunction1,
454 typename BinaryFunction2>
455 void spmv_ell(const Matrix& A,
456 const Vector1& x,
457 Vector2& y,
458 UnaryFunction initialize,
459 BinaryFunction1 combine,
460 BinaryFunction2 reduce)
461 {
462 typedef typename Matrix::index_type IndexType;
463 typedef typename Vector2::value_type ValueType;
464
465 const size_t& num_entries_per_row = A.column_indices.num_cols;
466
467 const IndexType invalid_index = Matrix::invalid_index;
468
469 for(size_t i = 0; i < A.num_rows; i++)
470 y[i] = initialize(y[i]);
471
472 for(size_t n = 0; n < num_entries_per_row; n++)
473 {
474 for(size_t i = 0; i < A.num_rows; i++)
475 {
476 const IndexType& j = A.column_indices(i, n);
477 const ValueType& Aij = A.values(i,n);
478
479 if (j != invalid_index)
480 {
481 const ValueType& xj = x[j];
482 y[i] = reduce(y[i], combine(Aij, xj));
483 }
484 }
485 }
486 }
487
488
489 template <typename Matrix,
490 typename Vector1,
491 typename Vector2>
492 void spmv_ell(const Matrix& A,
493 const Vector1& x,
494 Vector2& y)
495 {
496 typedef typename Vector2::value_type ValueType;
497
498 spmv_ell(A, x, y,
499 cusp::detail::zero_function<ValueType>(),
500 thrust::multiplies<ValueType>(),
501 thrust::plus<ValueType>());
502 }
503
504 } // end namespace host
505 } // end namespace detail
506 } // end namespace cusp
507
508

  ViewVC Help
Powered by ViewVC 1.1.26