/[escript]/branches/diaplayground/ripley/src/RipleySystemMatrix.cpp
ViewVC logotype

Contents of /branches/diaplayground/ripley/src/RipleySystemMatrix.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5146 - (show annotations)
Thu Sep 11 06:29:28 2014 UTC (4 years, 7 months ago) by caltinay
File size: 15914 byte(s)
Check for CUDA devices only once when target=GPU.

1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2014 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development 2012-2013 by School of Earth Sciences
13 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
14 *
15 *****************************************************************************/
16
17 #include "RipleySystemMatrix.h"
18 #include "RipleyException.h"
19
20 #include <esysUtils/index.h>
21 #include <escript/Data.h>
22 #include <escript/SolverOptions.h>
23
24 #include <cusp/multiply.h>
25 #include <cusp/krylov/bicgstab.h>
26 #include <cusp/krylov/cg.h>
27 #include <cusp/krylov/cgls.h>
28 #include <cusp/krylov/gmres.h>
29 #include <cusp/krylov/lsqr.h>
30 #include <cusp/precond/diagonal.h>
31
32 namespace bp = boost::python;
33
34 namespace ripley {
35
36 double gettime()
37 {
38 #ifdef _OPENMP
39 return omp_get_wtime();
40 #else
41 struct timeval tv;
42 gettimeofday(&tv, NULL);
43 suseconds_t ret = tv.tv_usec + tv.tv_sec*1e6;
44 return 1e-6*(double)ret;
45 #endif
46 }
47
48 std::vector<int> SystemMatrix::cudaDevices;
49
50 void SystemMatrix::checkCUDA()
51 {
52 #ifdef USE_CUDA
53 cudaDevices.clear();
54 int deviceCount;
55 cudaError err = cudaGetDeviceCount(&deviceCount);
56 if (deviceCount == 0 || err == cudaErrorNoDevice) {
57 std::cout << "Note: There is no device supporting CUDA" << std::endl;
58 cudaDevices.push_back(-1);
59 return;
60 } else if (deviceCount == 1) {
61 std::cout << "There is 1 GPU device" << std::endl;
62 } else {
63 std::cout << "There are " << deviceCount << " GPU devices" << std::endl;
64 }
65
66 for (int dev = 0; dev < deviceCount; ++dev) {
67 cudaDeviceProp deviceProp;
68 CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));
69
70 std::cout << "\nDevice " << dev << ": \"" << deviceProp.name << "\" -- ";
71 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
72 std::cout << "No CUDA support." << std::endl;
73 } else {
74 cudaDevices.push_back(dev);
75 std::cout << "Rev. " << deviceProp.major << "." << deviceProp.minor
76 << ", Mem: "
77 << deviceProp.totalGlobalMem << " bytes." << std::endl;
78 }
79 }
80 // neither of the found devices supports CUDA:
81 if (cudaDevices.empty()) {
82 cudaDevices.push_back(-1);
83 }
84 #endif
85 }
86
87 SystemMatrix::SystemMatrix(esysUtils::JMPI mpiInfo, int blocksize,
88 const escript::FunctionSpace& fs, int nRows,
89 const IndexVector& diagonalOffsets) :
90 AbstractSystemMatrix(blocksize, fs, blocksize, fs),
91 m_mpiInfo(mpiInfo)
92 {
93 // count nonzero entries
94 int numEntries = 0;
95 for (size_t i = 0; i < diagonalOffsets.size(); i++) {
96 numEntries += blocksize*blocksize*(nRows-std::abs(diagonalOffsets[i]));
97 }
98 //std::cout << "Matrix has " <<numEntries<<" entries."<<std::endl;
99
100 mat.resize(nRows*blocksize, numEntries, diagonalOffsets.size(), blocksize);
101 mat.diagonal_offsets.assign(diagonalOffsets.begin(), diagonalOffsets.end());
102 matrixAltered = true;
103 }
104
105 void SystemMatrix::add(const IndexVector& rowIdx,
106 const std::vector<double>& array)
107 {
108 const int blockSize = getBlockSize();
109 const int emSize = rowIdx.size();
110 for (int i=0; i<emSize; i++) {
111 for (int j=0; j<emSize; j++) {
112 const int revi = emSize-1-i;
113 const int diag = j%2 + revi%2 + 3*(j/2+revi/2 + j/4+revi/4);
114 for (int k=0; k<blockSize; k++) {
115 for (int m=0; m<blockSize; m++) {
116 const int row = rowIdx[i]*blockSize + k;
117 const int d = diag*blockSize + m;
118 const int srcIdx = INDEX4(k, m, i, j, blockSize, blockSize, emSize);
119 mat.values(row, d) += array[srcIdx];
120 }
121 }
122 }
123 }
124 matrixAltered = true;
125 }
126
127 void SystemMatrix::ypAx(escript::Data& y, escript::Data& x) const
128 {
129 if (x.getDataPointSize() != getBlockSize()) {
130 throw RipleyException("matrix vector product: block size does not match the number of components in input.");
131 } else if (y.getDataPointSize() != getBlockSize()) {
132 throw RipleyException("matrix vector product: block size does not match the number of components in output.");
133 } else if (x.getFunctionSpace() != getColumnFunctionSpace()) {
134 throw RipleyException("matrix vector product: matrix function space and function space of input don't match.");
135 } else if (y.getFunctionSpace() != getRowFunctionSpace()) {
136 throw RipleyException("matrix vector product: matrix function space and function space of output don't match.");
137 }
138
139 // expand data object if necessary to be able to grab the whole data
140 const_cast<escript::Data*>(&x)->expand();
141 y.expand();
142 y.requireWrite();
143 const double* x_dp = x.getSampleDataRO(0);
144 double* y_dp = y.getSampleDataRW(0);
145 double T0 = gettime();
146 HostVectorType xx(x_dp, x_dp+mat.num_rows);
147 HostVectorType yy(mat.num_rows, 0.);
148 cusp::multiply(mat, xx, yy);
149 thrust::copy(yy.begin(), yy.end(), y_dp);
150 //std::cout << "ypAx: " << gettime()-T0 << " seconds." << std::endl;
151 }
152
153 template<class LinearOperator,
154 class Vector,
155 class Preconditioner>
156 void SystemMatrix::runSolver(LinearOperator& A, Vector& x, Vector& b,
157 Preconditioner& M, escript::SolverBuddy& sb) const
158 {
159 typedef typename LinearOperator::memory_space MemorySpace;
160 //cusp::verbose_monitor<double> monitor(b, sb.getIterMax(), sb.getTolerance(), sb.getAbsoluteTolerance());
161 cusp::default_monitor<double> monitor(b, sb.getIterMax(), sb.getTolerance(), sb.getAbsoluteTolerance());
162 int solver = sb.getSolverMethod();
163 if (solver == escript::SO_DEFAULT) {
164 if (sb.isSymmetric()) {
165 solver = escript::SO_METHOD_PCG;
166 } else {
167 solver = escript::SO_METHOD_BICGSTAB;
168 }
169 }
170
171 double T0 = gettime();
172 switch (solver) {
173 case escript::SO_DEFAULT:
174 case escript::SO_METHOD_PCG:
175 cusp::krylov::cg(A, x, b, monitor, M);
176 break;
177 case escript::SO_METHOD_CGLS:
178 cusp::krylov::cgls(A, x, b, 0., monitor); //shift=0 for now
179 break;
180 case escript::SO_METHOD_LSQR:
181 cusp::krylov::lsqr(A, x, b, cusp::krylov::lsqr_parameters<double>(), monitor);
182 break;
183 case escript::SO_METHOD_BICGSTAB:
184 cusp::krylov::bicgstab(A, x, b, monitor, M);
185 break;
186 case escript::SO_METHOD_GMRES:
187 {
188 const int restart = (sb.getRestart()==0 ? 1000 : sb.getRestart());
189 if (restart < 1)
190 throw RipleyException("Invalid restart parameter for GMRES");
191 cusp::krylov::gmres(A, x, b, restart, monitor, M);
192 }
193 break;
194 case escript::SO_METHOD_PRES20:
195 {
196 const int restart = 20;
197 cusp::krylov::gmres(A, x, b, restart, monitor, M);
198 }
199 break;
200 default:
201 throw RipleyException("Unsupported solver.");
202 }
203 double solvertime = gettime()-T0;
204
205 if (monitor.converged()) {
206 if (sb.isVerbose()) {
207 std::cout << "Solver converged to " << monitor.relative_tolerance()
208 << " relative tolerance after " << monitor.iteration_count()
209 << " iterations and " << solvertime << " seconds."<< std::endl;
210 }
211 } else {
212 std::cout << "Solver reached iteration limit "
213 << monitor.iteration_limit() << " before converging"
214 << " to " << monitor.relative_tolerance() << " rel. tolerance."
215 << std::endl;
216 }
217 }
218
219 void SystemMatrix::setToSolution(escript::Data& out, escript::Data& in,
220 bp::object& options) const
221 {
222 if (m_mpiInfo->size > 1) {
223 throw RipleyException("solve: ripley's block diagonal matrix "
224 "is incompatible with MPI.");
225 }
226 if (out.getDataPointSize() != getBlockSize()) {
227 throw RipleyException("solve: block size does not match the number of components of solution.");
228 } else if (in.getDataPointSize() != getBlockSize()) {
229 throw RipleyException("solve: block size does not match the number of components of right hand side.");
230 } else if (out.getFunctionSpace() != getColumnFunctionSpace()) {
231 throw RipleyException("solve: matrix function space and function space of solution don't match.");
232 } else if (in.getFunctionSpace() != getRowFunctionSpace()) {
233 throw RipleyException("solve: matrix function space and function space of right hand side don't match.");
234 }
235
236 options.attr("resetDiagnostics")();
237 escript::SolverBuddy sb = bp::extract<escript::SolverBuddy>(options);
238 out.expand();
239 in.expand();
240
241 double* out_dp = out.getSampleDataRW(0);
242 const double* in_dp = in.getSampleDataRO(0);
243 double T0;
244
245 if (sb.getSolverTarget() == escript::SO_TARGET_GPU) {
246 #ifdef USE_CUDA
247 if (cudaDevices.empty()) {
248 checkCUDA();
249 }
250
251 if (cudaDevices[0] == -1) {
252 throw RipleyException("solve: GPU-based solver requested but no "
253 "CUDA compatible device available.");
254 }
255
256 //TODO: give users options...
257 if (sb.isVerbose()) {
258 std::cout << "Using CUDA device " << cudaDevices[0] << std::endl;
259 }
260 cudaSetDevice(cudaDevices[0]);
261
262 T0 = gettime();
263 DeviceVectorType b(in_dp, in_dp+mat.num_rows);
264 double host2dev = gettime()-T0;
265 if (sb.isVerbose())
266 std::cout << "Copy of b: " << host2dev << " seconds." << std::endl;
267 if (matrixAltered) {
268 T0 = gettime();
269 dmat = mat;
270 host2dev = gettime()-T0;
271 if (sb.isVerbose())
272 std::cout << "Copy of A: " << host2dev << " seconds." << std::endl;
273 matrixAltered = false;
274 }
275 DeviceVectorType x(mat.num_rows, 0.);
276 if (sb.isVerbose())
277 std::cout << "Solving on CUDA device..." << std::endl;
278
279 if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_NONE) {
280 cusp::identity_operator<double, cusp::device_memory> M(mat.num_rows, mat.num_rows);
281 runSolver(dmat, x, b, M, sb);
282 } else if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_JACOBI) {
283 if (sb.isVerbose())
284 std::cout << "Using Jacobi preconditioner" << std::endl;
285 // TODO: This should be cached as well but that's not supported
286 // at the moment.
287 cusp::precond::diagonal<double, cusp::device_memory> M(dmat);
288 runSolver(dmat, x, b, M, sb);
289 } else {
290 throw RipleyException("Unsupported preconditioner requested.");
291 }
292
293 T0 = gettime();
294 thrust::copy(x.begin(), x.end(), out_dp);
295 const double copyTime = gettime()-T0;
296 if (sb.isVerbose())
297 std::cout << "Copy of x: " << copyTime << " seconds." << std::endl;
298 #else
299 throw RipleyException("solve: GPU-based solver requested but escript "
300 "not compiled with CUDA.");
301 #endif
302 } else { // CPU
303 T0 = gettime();
304 HostVectorType b(in_dp, in_dp+mat.num_rows);
305 double copytime = gettime()-T0;
306 if (sb.isVerbose()) {
307 std::cout << "Copy of b: " << copytime << " seconds." << std::endl;
308 std::cout << "Solving on the CPU..." << std::endl;
309 }
310 HostVectorType x(mat.num_rows, 0.);
311 if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_NONE) {
312 cusp::identity_operator<double, cusp::host_memory> M(mat.num_rows, mat.num_rows);
313 runSolver(mat, x, b, M, sb);
314 } else if (sb.getPreconditioner() == escript::SO_PRECONDITIONER_JACOBI) {
315 if (sb.isVerbose())
316 std::cout << "Using Jacobi preconditioner" << std::endl;
317 // TODO: This should be cached as well but that's not supported
318 // at the moment.
319 cusp::precond::diagonal<double, cusp::host_memory> M(mat);
320 runSolver(mat, x, b, M, sb);
321 } else {
322 throw RipleyException("Unsupported preconditioner requested.");
323 }
324
325 T0 = gettime();
326 thrust::copy(x.begin(), x.end(), out_dp);
327 const double copyTime = gettime()-T0;
328 if (sb.isVerbose()) {
329 std::cout << "Copy of x: " << copyTime << " seconds." << std::endl;
330 }
331 }
332 }
333
334 void SystemMatrix::nullifyRowsAndCols(escript::Data& row_q,
335 escript::Data& col_q,
336 double mdv)
337 {
338 double T0 = gettime();
339 if (col_q.getDataPointSize() != getColumnBlockSize()) {
340 throw RipleyException("nullifyRowsAndCols: column block size does not match the number of components of column mask.");
341 } else if (row_q.getDataPointSize() != getRowBlockSize()) {
342 throw RipleyException("nullifyRowsAndCols: row block size does not match the number of components of row mask.");
343 } else if (col_q.getFunctionSpace() != getColumnFunctionSpace()) {
344 throw RipleyException("nullifyRowsAndCols: column function space and function space of column mask don't match.");
345 } else if (row_q.getFunctionSpace() != getRowFunctionSpace()) {
346 throw RipleyException("nullifyRowsAndCols: row function space and function space of row mask don't match.");
347 }
348
349 row_q.expand();
350 col_q.expand();
351 const double* rowMask = row_q.getSampleDataRO(0);
352 const double* colMask = col_q.getSampleDataRO(0);
353 const int blockSize = getBlockSize();
354 #pragma omp parallel for
355 for (int row=0; row < mat.num_rows; row++) {
356 for (int diag=0; diag < mat.diagonal_offsets.size(); diag++) {
357 const int col = blockSize*(row/blockSize)+mat.diagonal_offsets[diag]*blockSize;
358 if (col >= 0 && col <= mat.num_rows-blockSize) {
359 for (int i=0; i<blockSize; i++) {
360 if (rowMask[row] > 0. || colMask[col+i] > 0.) {
361 mat.values(row, diag*blockSize+i) =
362 (row==col+i ? mdv : 0);
363 }
364 }
365 }
366 }
367 }
368 //std::cout << "nullifyRowsAndCols: " << gettime()-T0 << " seconds." << std::endl;
369 matrixAltered = true;
370 }
371
372 void SystemMatrix::saveMM(const std::string& filename) const
373 {
374 const int blockSize = getBlockSize();
375
376 std::ofstream f(filename.c_str());
377 f << "%%MatrixMarket matrix coordinate real general" << std::endl;
378 f << mat.num_rows << " " << mat.num_rows << " " << mat.num_entries << std::endl;
379 f.setf(std::ios_base::scientific, std::ios_base::floatfield);
380 f.precision(15);
381 for (int row=0; row < mat.num_rows; row++) {
382 for (int diag=0; diag < mat.diagonal_offsets.size(); diag++) {
383 const int col = blockSize*(row/blockSize)+mat.diagonal_offsets[diag]*blockSize;
384 if (col >= 0 && col <= mat.num_rows-blockSize) {
385 for (int i=0; i<blockSize; i++) {
386 f << row+1 << " " << col+i+1 << " "
387 << mat.values(row, diag*blockSize+i) << std::endl;
388 }
389 }
390 }
391 }
392 }
393
394 void SystemMatrix::saveHB(const std::string& filename) const
395 {
396 throw RipleyException("Harwell-Boeing interface not available.");
397 }
398
399 void SystemMatrix::resetValues()
400 {
401 mat.values.values.assign(mat.values.values.size(), 0.);
402 matrixAltered = true;
403 }
404
405 } // end of namespace

  ViewVC Help
Powered by ViewVC 1.1.26