1 |
|
2 |
/***************************************************************************** |
3 |
* |
4 |
* Copyright (c) 2014 by University of Queensland |
5 |
* http://www.uq.edu.au |
6 |
* |
7 |
* Primary Business: Queensland, Australia |
8 |
* Licensed under the Open Software License version 3.0 |
9 |
* http://www.opensource.org/licenses/osl-3.0.php |
10 |
* |
11 |
* Development until 2012 by Earth Systems Science Computational Center (ESSCC) |
12 |
* Development 2012-2013 by School of Earth Sciences |
13 |
* Development from 2014 by Centre for Geoscience Computing (GeoComp) |
14 |
* |
15 |
*****************************************************************************/ |
16 |
|
17 |
#pragma once |
18 |
|
19 |
#include <thrust/functional.h> |
20 |
#include <cusp/detail/functional.h> |
21 |
|
22 |
#ifndef DIA_CHUNKSIZE |
23 |
#define DIA_CHUNKSIZE 1024 |
24 |
#endif |
25 |
|
26 |
namespace cusp |
27 |
{ |
28 |
namespace detail |
29 |
{ |
30 |
namespace host |
31 |
{ |
32 |
|
33 |
///////////////////////// |
34 |
// DIA transposed SpMV // |
35 |
///////////////////////// |
36 |
template <typename Matrix, |
37 |
typename Vector1, |
38 |
typename Vector2, |
39 |
typename UnaryFunction, |
40 |
typename BinaryFunction1, |
41 |
typename BinaryFunction2> |
42 |
void transposed_spmv_dia(const Matrix& A, |
43 |
const Vector1& x, |
44 |
Vector2& y, |
45 |
UnaryFunction initialize, |
46 |
BinaryFunction1 combine, |
47 |
BinaryFunction2 reduce) |
48 |
{ |
49 |
typedef typename Matrix::index_type IndexType; |
50 |
//typedef typename Vector2::value_type ValueType; |
51 |
|
52 |
const size_t num_diagonals = A.values.num_cols; |
53 |
|
54 |
#pragma omp parallel for |
55 |
for (size_t ch = 0; ch < A.num_cols; ch += DIA_CHUNKSIZE) { |
56 |
// initialize chunk |
57 |
for (size_t row = ch; row < std::min(ch+DIA_CHUNKSIZE,A.num_cols); row++) |
58 |
{ |
59 |
y[row] = initialize(y[row]); |
60 |
} |
61 |
// for each diagonal |
62 |
for (size_t d = 0; d < num_diagonals; d++) |
63 |
{ |
64 |
for (IndexType row=ch; row<std::min(ch+DIA_CHUNKSIZE,A.num_cols); row++) |
65 |
{ |
66 |
const IndexType col = row - A.diagonal_offsets[d]; |
67 |
if (col >= 0 && col < A.num_rows) |
68 |
{ |
69 |
y[row] = reduce(y[row], combine(A.values(col, d), x[col])); |
70 |
} |
71 |
} |
72 |
} |
73 |
} |
74 |
} |
75 |
|
76 |
template <typename Matrix, |
77 |
typename Vector1, |
78 |
typename Vector2> |
79 |
void transposed_spmv_dia(const Matrix& A, |
80 |
const Vector1& x, |
81 |
Vector2& y) |
82 |
{ |
83 |
typedef typename Vector2::value_type ValueType; |
84 |
|
85 |
transposed_spmv_dia(A, x, y, |
86 |
cusp::detail::zero_function<ValueType>(), |
87 |
thrust::multiplies<ValueType>(), |
88 |
thrust::plus<ValueType>()); |
89 |
} |
90 |
|
91 |
template <typename Matrix, |
92 |
typename Vector1, |
93 |
typename Vector2, |
94 |
typename UnaryFunction, |
95 |
typename BinaryFunction1, |
96 |
typename BinaryFunction2> |
97 |
void transposed_spmv_cds(const Matrix& A, |
98 |
const Vector1& x, |
99 |
Vector2& y, |
100 |
UnaryFunction initialize, |
101 |
BinaryFunction1 combine, |
102 |
BinaryFunction2 reduce) |
103 |
{ |
104 |
typedef typename Matrix::index_type IndexType; |
105 |
typedef typename Vector2::value_type ValueType; |
106 |
|
107 |
const IndexType num_diagonals = A.diagonal_offsets.size(); |
108 |
const IndexType block_size = (IndexType)A.block_size; |
109 |
const IndexType num_cols = (IndexType)A.num_cols; |
110 |
// make chunksize a multiple of block_size |
111 |
const IndexType chunksize = block_size*(DIA_CHUNKSIZE/block_size); |
112 |
|
113 |
// optimisation for special case |
114 |
if (block_size == 2) { |
115 |
#pragma omp parallel for |
116 |
for (IndexType ch = 0; ch < num_cols; ch += chunksize) { |
117 |
for (IndexType row = ch; row < std::min(ch+chunksize,num_cols); row+=2) |
118 |
{ |
119 |
ValueType sum1 = initialize(y[row]); |
120 |
ValueType sum2 = initialize(y[row+1]); |
121 |
// for each diagonal block |
122 |
for (IndexType d = 0; d < num_diagonals; d++) |
123 |
{ |
124 |
const IndexType col = row - A.diagonal_offsets[d]*2; |
125 |
if (col >= 0 && col < A.num_rows) |
126 |
{ |
127 |
sum1 = reduce(sum1, combine(A.values(col, 2*d), x[col])); |
128 |
sum2 = reduce(sum2, combine(A.values(col, 2*d+1),x[col])); |
129 |
sum1 = reduce(sum1, combine(A.values(col+1, 2*d), x[col+1])); |
130 |
sum2 = reduce(sum2, combine(A.values(col+1, 2*d+1),x[col+1])); |
131 |
} |
132 |
} |
133 |
y[row] = sum1; |
134 |
y[row+1] = sum2; |
135 |
} |
136 |
} |
137 |
} else { // block_size!=2 |
138 |
#pragma omp parallel for |
139 |
for (IndexType ch = 0; ch < num_cols; ch += chunksize) { |
140 |
for (IndexType row = ch; row < std::min(ch+chunksize,num_cols); row++) |
141 |
{ |
142 |
y[row] = initialize(y[row]); |
143 |
|
144 |
// for each diagonal block |
145 |
for (IndexType d = 0; d < num_diagonals; d++) |
146 |
{ |
147 |
const IndexType k = A.diagonal_offsets[d]*block_size; |
148 |
const IndexType col = block_size*(row/block_size) - k; |
149 |
if (col >= 0 && col <= A.num_rows-block_size) |
150 |
{ |
151 |
// for each column in block |
152 |
for (IndexType i = 0; i < block_size; i++) |
153 |
{ |
154 |
const ValueType& Aij = A.values(col+i, d*block_size+row%block_size); |
155 |
const ValueType& xj = x[col + i]; |
156 |
y[row] = reduce(y[row], combine(Aij, xj)); |
157 |
} |
158 |
} |
159 |
} // diagonals |
160 |
} // rows |
161 |
} // chunks |
162 |
} // block_size |
163 |
} |
164 |
|
165 |
template <typename Matrix, |
166 |
typename Vector1, |
167 |
typename Vector2> |
168 |
void transposed_spmv_cds(const Matrix& A, |
169 |
const Vector1& x, |
170 |
Vector2& y) |
171 |
{ |
172 |
typedef typename Vector2::value_type ValueType; |
173 |
|
174 |
if (A.block_size == 1) { |
175 |
transposed_spmv_dia(A, x, y, |
176 |
cusp::detail::zero_function<ValueType>(), |
177 |
thrust::multiplies<ValueType>(), |
178 |
thrust::plus<ValueType>()); |
179 |
} else { |
180 |
transposed_spmv_cds(A, x, y, |
181 |
cusp::detail::zero_function<ValueType>(), |
182 |
thrust::multiplies<ValueType>(), |
183 |
thrust::plus<ValueType>()); |
184 |
} |
185 |
} |
186 |
|
187 |
} // end namespace host |
188 |
} // end namespace detail |
189 |
} // end namespace cusp |
190 |
|
191 |
|