/[escript]/trunk/ripley/src/RipleySystemMatrix.cpp
ViewVC logotype

Contents of /trunk/ripley/src/RipleySystemMatrix.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5148 - (show annotations)
Mon Sep 15 01:25:23 2014 UTC (4 years, 11 months ago) by caltinay
File size: 15855 byte(s)
Merging ripley diagonal storage + CUDA support into trunk.
Options file version has been incremented due to new options
'cuda' and 'nvccflags'.

1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2014 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development 2012-2013 by School of Earth Sciences
13 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
14 *
15 *****************************************************************************/
16
17 #include "RipleySystemMatrix.h"
18 #include "RipleyException.h"
19
20 #include <esysUtils/index.h>
21 #include <escript/Data.h>
22 #include <escript/SolverOptions.h>
23
24 #include <cusp/multiply.h>
25 #include <cusp/krylov/bicgstab.h>
26 #include <cusp/krylov/cg.h>
27 #include <cusp/krylov/cgls.h>
28 #include <cusp/krylov/gmres.h>
29 #include <cusp/krylov/lsqr.h>
30 #include <cusp/precond/diagonal.h>
31
32 namespace bp = boost::python;
33
34 namespace ripley {
35
36 double gettime()
37 {
38 #ifdef _OPENMP
39 return omp_get_wtime();
40 #else
41 struct timeval tv;
42 gettimeofday(&tv, NULL);
43 suseconds_t ret = tv.tv_usec + tv.tv_sec*1e6;
44 return 1e-6*(double)ret;
45 #endif
46 }
47
48 std::vector<int> SystemMatrix::cudaDevices;
49
50 void SystemMatrix::checkCUDA()
51 {
52 #ifdef USE_CUDA
53 cudaDevices.clear();
54 int deviceCount;
55 cudaError err = cudaGetDeviceCount(&deviceCount);
56 if (deviceCount == 0 || err == cudaErrorNoDevice) {
57 std::cout << "Note: There is no device supporting CUDA" << std::endl;
58 cudaDevices.push_back(-1);
59 return;
60 } else if (deviceCount == 1) {
61 std::cout << "There is 1 GPU device" << std::endl;
62 } else {
63 std::cout << "There are " << deviceCount << " GPU devices" << std::endl;
64 }
65
66 for (int dev = 0; dev < deviceCount; ++dev) {
67 cudaDeviceProp deviceProp;
68 CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));
69
70 std::cout << "\nDevice " << dev << ": \"" << deviceProp.name << "\" -- ";
71 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
72 std::cout << "No CUDA support." << std::endl;
73 } else {
74 cudaDevices.push_back(dev);
75 std::cout << "Rev. " << deviceProp.major << "." << deviceProp.minor
76 << ", Mem: "
77 << deviceProp.totalGlobalMem << " bytes." << std::endl;
78 }
79 }
80 // neither of the found devices supports CUDA:
81 if (cudaDevices.empty()) {
82 cudaDevices.push_back(-1);
83 }
84 #endif
85 }
86
87 SystemMatrix::SystemMatrix(esysUtils::JMPI mpiInfo, int blocksize,
88 const escript::FunctionSpace& fs, int nRows,
89 const IndexVector& diagonalOffsets) :
90 AbstractSystemMatrix(blocksize, fs, blocksize, fs),
91 m_mpiInfo(mpiInfo)
92 {
93 // count nonzero entries
94 int numEntries = 0;
95 for (size_t i = 0; i < diagonalOffsets.size(); i++) {
96 numEntries += blocksize*blocksize*(nRows-std::abs(diagonalOffsets[i]));
97 }
98 //std::cout << "Matrix has " <<numEntries<<" entries."<<std::endl;
99
100 mat.resize(nRows*blocksize, numEntries, diagonalOffsets.size(), blocksize);
101 mat.diagonal_offsets.assign(diagonalOffsets.begin(), diagonalOffsets.end());
102 matrixAltered = true;
103 }
104
105 void SystemMatrix::add(const IndexVector& rowIdx,
106 const std::vector<double>& array)
107 {
108 const int blockSize = getBlockSize();
109 const int emSize = rowIdx.size();
110 for (int i=0; i<emSize; i++) {
111 for (int j=0; j<emSize; j++) {
112 const int revi = emSize-1-i;
113 const int diag = j%2 + revi%2 + 3*(j/2+revi/2 + j/4+revi/4);
114 for (int k=0; k<blockSize; k++) {
115 for (int m=0; m<blockSize; m++) {
116 const int row = rowIdx[i]*blockSize + k;
117 const int d = diag*blockSize + m;
118 const int srcIdx = INDEX4(k, m, i, j, blockSize, blockSize, emSize);
119 mat.values(row, d) += array[srcIdx];
120 }
121 }
122 }
123 }
124 matrixAltered = true;
125 }
126
127 void SystemMatrix::ypAx(escript::Data& y, escript::Data& x) const
128 {
129 if (x.getDataPointSize() != getBlockSize()) {
130 throw RipleyException("matrix vector product: block size does not match the number of components in input.");
131 } else if (y.getDataPointSize() != getBlockSize()) {
132 throw RipleyException("matrix vector product: block size does not match the number of components in output.");
133 } else if (x.getFunctionSpace() != getColumnFunctionSpace()) {
134 throw RipleyException("matrix vector product: matrix function space and function space of input don't match.");
135 } else if (y.getFunctionSpace() != getRowFunctionSpace()) {
136 throw RipleyException("matrix vector product: matrix function space and function space of output don't match.");
137 }
138
139 // expand data object if necessary to be able to grab the whole data
140 const_cast<escript::Data*>(&x)->expand();
141 y.expand();
142 y.requireWrite();
143 const double* x_dp = x.getSampleDataRO(0);
144 double* y_dp = y.getSampleDataRW(0);
145 //double T0 = gettime();
146 HostVectorType xx(x_dp, x_dp+mat.num_rows);
147 HostVectorType yy(mat.num_rows, 0.);
148 cusp::multiply(mat, xx, yy);
149 thrust::copy(yy.begin(), yy.end(), y_dp);
150 //std::cout << "ypAx: " << gettime()-T0 << " seconds." << std::endl;
151 }
152
153 template<class LinearOperator,
154 class Vector,
155 class Preconditioner>
156 void SystemMatrix::runSolver(LinearOperator& A, Vector& x, Vector& b,
157 Preconditioner& M, escript::SolverBuddy& sb) const
158 {
159 //cusp::verbose_monitor<double> monitor(b, sb.getIterMax(), sb.getTolerance(), sb.getAbsoluteTolerance());
160 cusp::default_monitor<double> monitor(b, sb.getIterMax(), sb.getTolerance(), sb.getAbsoluteTolerance());
161 int solver = sb.getSolverMethod();
162 if (solver == escript::SO_DEFAULT) {
163 if (sb.isSymmetric()) {
164 solver = escript::SO_METHOD_PCG;
165 } else {
166 solver = escript::SO_METHOD_BICGSTAB;
167 }
168 }
169
170 double T0 = gettime();
171 switch (solver) {
172 case escript::SO_DEFAULT:
173 case escript::SO_METHOD_PCG:
174 cusp::krylov::cg(A, x, b, monitor, M);
175 break;
176 case escript::SO_METHOD_CGLS:
177 cusp::krylov::cgls(A, x, b, 0., monitor); //shift=0 for now
178 break;
179 case escript::SO_METHOD_LSQR:
180 cusp::krylov::lsqr(A, x, b, cusp::krylov::lsqr_parameters<double>(), monitor);
181 break;
182 case escript::SO_METHOD_BICGSTAB:
183 cusp::krylov::bicgstab(A, x, b, monitor, M);
184 break;
185 case escript::SO_METHOD_GMRES:
186 {
187 const int restart = (sb.getRestart()==0 ? 1000 : sb.getRestart());
188 if (restart < 1)
189 throw RipleyException("Invalid restart parameter for GMRES");
190 cusp::krylov::gmres(A, x, b, restart, monitor, M);
191 }
192 break;
193 case escript::SO_METHOD_PRES20:
194 {
195 const int restart = 20;
196 cusp::krylov::gmres(A, x, b, restart, monitor, M);
197 }
198 break;
199 default:
200 throw RipleyException("Unsupported solver.");
201 }
202 double solvertime = gettime()-T0;
203
204 if (monitor.converged()) {
205 if (sb.isVerbose()) {
206 std::cout << "Solver converged to " << monitor.relative_tolerance()
207 << " relative tolerance after " << monitor.iteration_count()
208 << " iterations and " << solvertime << " seconds."<< std::endl;
209 }
210 } else {
211 std::cout << "Solver reached iteration limit "
212 << monitor.iteration_limit() << " before converging"
213 << " to " << monitor.relative_tolerance() << " rel. tolerance."
214 << std::endl;
215 }
216 }
217
218 void SystemMatrix::setToSolution(escript::Data& out, escript::Data& in,
219 bp::object& options) const
220 {
221 if (m_mpiInfo->size > 1) {
222 throw RipleyException("solve: ripley's block diagonal matrix "
223 "is incompatible with MPI.");
224 }
225 if (out.getDataPointSize() != getBlockSize()) {
226 throw RipleyException("solve: block size does not match the number of components of solution.");
227 } else if (in.getDataPointSize() != getBlockSize()) {
228 throw RipleyException("solve: block size does not match the number of components of right hand side.");
229 } else if (out.getFunctionSpace() != getColumnFunctionSpace()) {
230 throw RipleyException("solve: matrix function space and function space of solution don't match.");
231 } else if (in.getFunctionSpace() != getRowFunctionSpace()) {
232 throw RipleyException("solve: matrix function space and function space of right hand side don't match.");
233 }
234
235 options.attr("resetDiagnostics")();
236 escript::SolverBuddy sb = bp::extract<escript::SolverBuddy>(options);
237 out.expand();
238 in.expand();
239
240 double* out_dp = out.getSampleDataRW(0);
241 const double* in_dp = in.getSampleDataRO(0);
242 double T0;
243
244 if (sb.getSolverTarget() == escript::SO_TARGET_GPU) {
245 #ifdef USE_CUDA
246 if (cudaDevices.empty()) {
247 checkCUDA();
248 }
249
250 if (cudaDevices[0] == -1) {
251 throw RipleyException("solve: GPU-based solver requested but no "
252 "CUDA compatible device available.");
253 }
254
255 //TODO: give users options...
256 if (sb.isVerbose()) {
257 std::cout << "Using CUDA device " << cudaDevices[0] << std::endl;
258 }
259 cudaSetDevice(cudaDevices[0]);
260
261 T0 = gettime();
262 DeviceVectorType b(in_dp, in_dp+mat.num_rows);
263 double host2dev = gettime()-T0;
264 if (sb.isVerbose())
265 std::cout << "Copy of b: " << host2dev << " seconds." << std::endl;
266 if (matrixAltered) {
267 T0 = gettime();
268 dmat = mat;
269 host2dev = gettime()-T0;
270 if (sb.isVerbose())
271 std::cout << "Copy of A: " << host2dev << " seconds." << std::endl;
272 matrixAltered = false;
273 }
274 DeviceVectorType x(mat.num_rows, 0.);
275 if (sb.isVerbose())
276 std::cout << "Solving on CUDA device..." << std::endl;
277
278 if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_NONE) {
279 cusp::identity_operator<double, cusp::device_memory> M(mat.num_rows, mat.num_rows);
280 runSolver(dmat, x, b, M, sb);
281 } else if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_JACOBI) {
282 if (sb.isVerbose())
283 std::cout << "Using Jacobi preconditioner" << std::endl;
284 // TODO: This should be cached as well but that's not supported
285 // at the moment.
286 cusp::precond::diagonal<double, cusp::device_memory> M(dmat);
287 runSolver(dmat, x, b, M, sb);
288 } else {
289 throw RipleyException("Unsupported preconditioner requested.");
290 }
291
292 T0 = gettime();
293 thrust::copy(x.begin(), x.end(), out_dp);
294 const double copyTime = gettime()-T0;
295 if (sb.isVerbose())
296 std::cout << "Copy of x: " << copyTime << " seconds." << std::endl;
297 #else
298 throw RipleyException("solve: GPU-based solver requested but escript "
299 "not compiled with CUDA.");
300 #endif
301 } else { // CPU
302 T0 = gettime();
303 HostVectorType b(in_dp, in_dp+mat.num_rows);
304 double copytime = gettime()-T0;
305 if (sb.isVerbose()) {
306 std::cout << "Copy of b: " << copytime << " seconds." << std::endl;
307 std::cout << "Solving on the CPU..." << std::endl;
308 }
309 HostVectorType x(mat.num_rows, 0.);
310 if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_NONE) {
311 cusp::identity_operator<double, cusp::host_memory> M(mat.num_rows, mat.num_rows);
312 runSolver(mat, x, b, M, sb);
313 } else if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_JACOBI) {
314 if (sb.isVerbose())
315 std::cout << "Using Jacobi preconditioner" << std::endl;
316 // TODO: This should be cached as well but that's not supported
317 // at the moment.
318 cusp::precond::diagonal<double, cusp::host_memory> M(mat);
319 runSolver(mat, x, b, M, sb);
320 } else {
321 throw RipleyException("Unsupported preconditioner requested.");
322 }
323
324 T0 = gettime();
325 thrust::copy(x.begin(), x.end(), out_dp);
326 const double copyTime = gettime()-T0;
327 if (sb.isVerbose()) {
328 std::cout << "Copy of x: " << copyTime << " seconds." << std::endl;
329 }
330 }
331 }
332
333 void SystemMatrix::nullifyRowsAndCols(escript::Data& row_q,
334 escript::Data& col_q,
335 double mdv)
336 {
337 //double T0 = gettime();
338 if (col_q.getDataPointSize() != getColumnBlockSize()) {
339 throw RipleyException("nullifyRowsAndCols: column block size does not match the number of components of column mask.");
340 } else if (row_q.getDataPointSize() != getRowBlockSize()) {
341 throw RipleyException("nullifyRowsAndCols: row block size does not match the number of components of row mask.");
342 } else if (col_q.getFunctionSpace() != getColumnFunctionSpace()) {
343 throw RipleyException("nullifyRowsAndCols: column function space and function space of column mask don't match.");
344 } else if (row_q.getFunctionSpace() != getRowFunctionSpace()) {
345 throw RipleyException("nullifyRowsAndCols: row function space and function space of row mask don't match.");
346 }
347
348 row_q.expand();
349 col_q.expand();
350 const double* rowMask = row_q.getSampleDataRO(0);
351 const double* colMask = col_q.getSampleDataRO(0);
352 const int blockSize = getBlockSize();
353 #pragma omp parallel for
354 for (int row=0; row < mat.num_rows; row++) {
355 for (int diag=0; diag < mat.diagonal_offsets.size(); diag++) {
356 const int col = blockSize*(row/blockSize)+mat.diagonal_offsets[diag]*blockSize;
357 if (col >= 0 && col <= mat.num_rows-blockSize) {
358 for (int i=0; i<blockSize; i++) {
359 if (rowMask[row] > 0. || colMask[col+i] > 0.) {
360 mat.values(row, diag*blockSize+i) =
361 (row==col+i ? mdv : 0);
362 }
363 }
364 }
365 }
366 }
367 //std::cout << "nullifyRowsAndCols: " << gettime()-T0 << " seconds." << std::endl;
368 matrixAltered = true;
369 }
370
371 void SystemMatrix::saveMM(const std::string& filename) const
372 {
373 const int blockSize = getBlockSize();
374
375 std::ofstream f(filename.c_str());
376 f << "%%MatrixMarket matrix coordinate real general" << std::endl;
377 f << mat.num_rows << " " << mat.num_cols << " " << mat.num_entries << std::endl;
378 f.setf(std::ios_base::scientific, std::ios_base::floatfield);
379 f.precision(15);
380 for (int row=0; row < mat.num_rows; row++) {
381 for (int diag=0; diag < mat.diagonal_offsets.size(); diag++) {
382 const int col = blockSize*(row/blockSize)+mat.diagonal_offsets[diag]*blockSize;
383 if (col >= 0 && col <= mat.num_rows-blockSize) {
384 for (int i=0; i<blockSize; i++) {
385 f << row+1 << " " << col+i+1 << " "
386 << mat.values(row, diag*blockSize+i) << std::endl;
387 }
388 }
389 }
390 }
391 }
392
393 void SystemMatrix::saveHB(const std::string& filename) const
394 {
395 throw RipleyException("Harwell-Boeing interface not available.");
396 }
397
398 void SystemMatrix::resetValues()
399 {
400 mat.values.values.assign(mat.values.values.size(), 0.);
401 matrixAltered = true;
402 }
403
404 } // end of namespace

  ViewVC Help
Powered by ViewVC 1.1.26