/[escript]/trunk/ripley/src/Rectangle.cpp
ViewVC logotype

Contents of /trunk/ripley/src/Rectangle.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5148 - (show annotations)
Mon Sep 15 01:25:23 2014 UTC (4 years, 11 months ago) by caltinay
File size: 96457 byte(s)
Merging ripley diagonal storage + CUDA support into trunk.
Options file version has been incremented due to new options
'cuda' and 'nvccflags'.

1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2003-2014 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development 2012-2013 by School of Earth Sciences
13 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
14 *
15 *****************************************************************************/
16
17 #include <ripley/Rectangle.h>
18 #include <ripley/DefaultAssembler2D.h>
19 #include <ripley/LameAssembler2D.h>
20 #include <ripley/WaveAssembler2D.h>
21 #include <ripley/blocktools.h>
22 #include <ripley/domainhelpers.h>
23 #include <paso/SystemMatrix.h>
24 #include <esysUtils/esysFileWriter.h>
25 #include <esysUtils/EsysRandom.h>
26
27 #ifdef USE_NETCDF
28 #include <netcdfcpp.h>
29 #endif
30
31 #if USE_SILO
32 #include <silo.h>
33 #ifdef ESYS_MPI
34 #include <pmpio.h>
35 #endif
36 #endif
37
38 #include <boost/scoped_array.hpp>
39
40 #include <algorithm>
41 #include <iomanip>
42 #include <limits>
43
44 namespace bp = boost::python;
45 using namespace std;
46 using esysUtils::FileWriter;
47 using escript::AbstractSystemMatrix;
48
49 namespace ripley {
50
51 Rectangle::Rectangle(dim_t n0, dim_t n1, double x0, double y0, double x1,
52 double y1, int d0, int d1,
53 const vector<double>& points,
54 const vector<int>& tags,
55 const TagMap& tagnamestonums,
56 escript::SubWorld_ptr w) :
57 RipleyDomain(2, w)
58 {
59 if (static_cast<long>(n0 + 1) * static_cast<long>(n1 + 1)
60 > numeric_limits<dim_t>::max())
61 throw RipleyException("The number of elements has overflowed, this "
62 "limit may be raised in future releases.");
63
64 if (n0 <= 0 || n1 <= 0)
65 throw RipleyException("Number of elements in each spatial dimension "
66 "must be positive");
67
68 // ignore subdivision parameters for serial run
69 if (m_mpiInfo->size == 1) {
70 d0=1;
71 d1=1;
72 }
73
74 bool warn=false;
75 vector<int> factors;
76 int ranks = m_mpiInfo->size;
77 dim_t epr[2] = {n0,n1};
78 int d[2] = {d0,d1};
79 if (d0<=0 || d1<=0) {
80 for (int i = 0; i < 2; i++) {
81 if (d[i] < 1) {
82 d[i] = 1;
83 continue;
84 }
85 epr[i] = -1; // can no longer be max
86 //remove
87 if (ranks % d[i] != 0) {
88 throw RipleyException("Invalid number of spatial subdivisions");
89 }
90 ranks /= d[i];
91 }
92 factorise(factors, ranks);
93 if (factors.size() != 0) {
94 warn = true;
95 }
96 }
97
98 while (factors.size() > 0) {
99 int i = epr[0] > epr[1] ? 0 : 1;
100 int f = factors.back();
101 factors.pop_back();
102 d[i] *= f;
103 epr[i] /= f;
104 }
105 d0 = d[0]; d1 = d[1];
106
107 // ensure number of subdivisions is valid and nodes can be distributed
108 // among number of ranks
109 if (d0*d1 != m_mpiInfo->size)
110 throw RipleyException("Invalid number of spatial subdivisions");
111
112 if (warn) {
113 cout << "Warning: Automatic domain subdivision (d0=" << d0 << ", d1="
114 << d1 << "). This may not be optimal!" << endl;
115 }
116
117 double l0 = x1-x0;
118 double l1 = y1-y0;
119 m_dx[0] = l0/n0;
120 m_dx[1] = l1/n1;
121
122 if ((n0+1)%d0 > 0) {
123 n0=(dim_t)round((float)(n0+1)/d0+0.5)*d0-1;
124 l0=m_dx[0]*n0;
125 cout << "Warning: Adjusted number of elements and length. N0="
126 << n0 << ", l0=" << l0 << endl;
127 }
128 if ((n1+1)%d1 > 0) {
129 n1=(dim_t)round((float)(n1+1)/d1+0.5)*d1-1;
130 l1=m_dx[1]*n1;
131 cout << "Warning: Adjusted number of elements and length. N1="
132 << n1 << ", l1=" << l1 << endl;
133 }
134
135 if ((d0 > 1 && (n0+1)/d0<2) || (d1 > 1 && (n1+1)/d1<2))
136 throw RipleyException("Too few elements for the number of ranks");
137
138 m_gNE[0] = n0;
139 m_gNE[1] = n1;
140 m_origin[0] = x0;
141 m_origin[1] = y0;
142 m_length[0] = l0;
143 m_length[1] = l1;
144 m_NX[0] = d0;
145 m_NX[1] = d1;
146
147 // local number of elements (with and without overlap)
148 m_NE[0] = m_ownNE[0] = (d0>1 ? (n0+1)/d0 : n0);
149 if (m_mpiInfo->rank%d0>0 && m_mpiInfo->rank%d0<d0-1)
150 m_NE[0]++;
151 else if (d0>1 && m_mpiInfo->rank%d0==d0-1)
152 m_ownNE[0]--;
153
154 m_NE[1] = m_ownNE[1] = (d1>1 ? (n1+1)/d1 : n1);
155 if (m_mpiInfo->rank/d0>0 && m_mpiInfo->rank/d0<d1-1)
156 m_NE[1]++;
157 else if (d1>1 && m_mpiInfo->rank/d0==d1-1)
158 m_ownNE[1]--;
159
160 // local number of nodes
161 m_NN[0] = m_NE[0]+1;
162 m_NN[1] = m_NE[1]+1;
163
164 // bottom-left node is at (offset0,offset1) in global mesh
165 m_offset[0] = (n0+1)/d0*(m_mpiInfo->rank%d0);
166 if (m_offset[0] > 0)
167 m_offset[0]--;
168 m_offset[1] = (n1+1)/d1*(m_mpiInfo->rank/d0);
169 if (m_offset[1] > 0)
170 m_offset[1]--;
171
172 populateSampleIds();
173
174 for (TagMap::const_iterator i = tagnamestonums.begin();
175 i != tagnamestonums.end(); i++) {
176 setTagMap(i->first, i->second);
177 }
178 addPoints(points, tags);
179 }
180
181 Rectangle::~Rectangle()
182 {
183 }
184
185 string Rectangle::getDescription() const
186 {
187 return "ripley::Rectangle";
188 }
189
190 bool Rectangle::operator==(const AbstractDomain& other) const
191 {
192 const Rectangle* o=dynamic_cast<const Rectangle*>(&other);
193 if (o) {
194 return (RipleyDomain::operator==(other) &&
195 m_gNE[0]==o->m_gNE[0] && m_gNE[1]==o->m_gNE[1]
196 && m_origin[0]==o->m_origin[0] && m_origin[1]==o->m_origin[1]
197 && m_length[0]==o->m_length[0] && m_length[1]==o->m_length[1]
198 && m_NX[0]==o->m_NX[0] && m_NX[1]==o->m_NX[1]);
199 }
200
201 return false;
202 }
203
204 void Rectangle::readNcGrid(escript::Data& out, string filename, string varname,
205 const ReaderParameters& params) const
206 {
207 #ifdef USE_NETCDF
208 // check destination function space
209 dim_t myN0, myN1;
210 if (out.getFunctionSpace().getTypeCode() == Nodes) {
211 myN0 = m_NN[0];
212 myN1 = m_NN[1];
213 } else if (out.getFunctionSpace().getTypeCode() == Elements ||
214 out.getFunctionSpace().getTypeCode() == ReducedElements) {
215 myN0 = m_NE[0];
216 myN1 = m_NE[1];
217 } else
218 throw RipleyException("readNcGrid(): invalid function space for output data object");
219
220 if (params.first.size() != 2)
221 throw RipleyException("readNcGrid(): argument 'first' must have 2 entries");
222
223 if (params.numValues.size() != 2)
224 throw RipleyException("readNcGrid(): argument 'numValues' must have 2 entries");
225
226 if (params.multiplier.size() != 2)
227 throw RipleyException("readNcGrid(): argument 'multiplier' must have 2 entries");
228 for (size_t i=0; i<params.multiplier.size(); i++)
229 if (params.multiplier[i]<1)
230 throw RipleyException("readNcGrid(): all multipliers must be positive");
231 if (params.reverse.size() != 2)
232 throw RipleyException("readNcGrid(): argument 'reverse' must have 2 entries");
233
234 // check file existence and size
235 NcFile f(filename.c_str(), NcFile::ReadOnly);
236 if (!f.is_valid())
237 throw RipleyException("readNcGrid(): cannot open file");
238
239 NcVar* var = f.get_var(varname.c_str());
240 if (!var)
241 throw RipleyException("readNcGrid(): invalid variable");
242
243 // TODO: rank>0 data support
244 const int numComp = out.getDataPointSize();
245 if (numComp > 1)
246 throw RipleyException("readNcGrid(): only scalar data supported");
247
248 const int dims = var->num_dims();
249 boost::scoped_array<long> edges(var->edges());
250
251 // is this a slice of the data object (dims!=2)?
252 // note the expected ordering of edges (as in numpy: y,x)
253 if ( (dims==2 && (params.numValues[1] > edges[0] || params.numValues[0] > edges[1]))
254 || (dims==1 && params.numValues[1]>1) ) {
255 throw RipleyException("readNcGrid(): not enough data in file");
256 }
257
258 // check if this rank contributes anything
259 if (params.first[0] >= m_offset[0]+myN0 ||
260 params.first[0]+params.numValues[0]*params.multiplier[0] <= m_offset[0] ||
261 params.first[1] >= m_offset[1]+myN1 ||
262 params.first[1]+params.numValues[1]*params.multiplier[1] <= m_offset[1])
263 return;
264
265 // now determine how much this rank has to write
266
267 // first coordinates in data object to write to
268 const dim_t first0 = max(0, params.first[0]-m_offset[0]);
269 const dim_t first1 = max(0, params.first[1]-m_offset[1]);
270 // indices to first value in file (not accounting for reverse yet)
271 dim_t idx0 = max(0, m_offset[0]-params.first[0]);
272 dim_t idx1 = max(0, m_offset[1]-params.first[1]);
273 // number of values to read
274 const dim_t num0 = min(params.numValues[0]-idx0, myN0-first0);
275 const dim_t num1 = min(params.numValues[1]-idx1, myN1-first1);
276
277 // make sure we read the right block if going backwards through file
278 if (params.reverse[0])
279 idx0 = edges[dims-1]-num0-idx0;
280 if (dims>1 && params.reverse[1])
281 idx1 = edges[dims-2]-num1-idx1;
282
283 vector<double> values(num0*num1);
284 if (dims==2) {
285 var->set_cur(idx1, idx0);
286 var->get(&values[0], num1, num0);
287 } else {
288 var->set_cur(idx0);
289 var->get(&values[0], num0);
290 }
291
292 const int dpp = out.getNumDataPointsPerSample();
293 out.requireWrite();
294
295 // helpers for reversing
296 const dim_t x0 = (params.reverse[0] ? num0-1 : 0);
297 const int x_mult = (params.reverse[0] ? -1 : 1);
298 const dim_t y0 = (params.reverse[1] ? num1-1 : 0);
299 const int y_mult = (params.reverse[1] ? -1 : 1);
300
301 for (index_t y=0; y<num1; y++) {
302 #pragma omp parallel for
303 for (index_t x=0; x<num0; x++) {
304 const dim_t baseIndex = first0+x*params.multiplier[0]
305 +(first1+y*params.multiplier[1])*myN0;
306 const dim_t srcIndex = (y0+y_mult*y)*num0+(x0+x_mult*x);
307 if (!isnan(values[srcIndex])) {
308 for (index_t m1=0; m1<params.multiplier[1]; m1++) {
309 for (index_t m0=0; m0<params.multiplier[0]; m0++) {
310 const dim_t dataIndex = baseIndex+m0+m1*myN0;
311 double* dest = out.getSampleDataRW(dataIndex);
312 for (index_t q=0; q<dpp; q++) {
313 *dest++ = values[srcIndex];
314 }
315 }
316 }
317 }
318 }
319 }
320 #else
321 throw RipleyException("readNcGrid(): not compiled with netCDF support");
322 #endif
323 }
324
325 void Rectangle::readBinaryGrid(escript::Data& out, string filename,
326 const ReaderParameters& params) const
327 {
328 // the mapping is not universally correct but should work on our
329 // supported platforms
330 switch (params.dataType) {
331 case DATATYPE_INT32:
332 readBinaryGridImpl<int>(out, filename, params);
333 break;
334 case DATATYPE_FLOAT32:
335 readBinaryGridImpl<float>(out, filename, params);
336 break;
337 case DATATYPE_FLOAT64:
338 readBinaryGridImpl<double>(out, filename, params);
339 break;
340 default:
341 throw RipleyException("readBinaryGrid(): invalid or unsupported datatype");
342 }
343 }
344
345 #ifdef USE_BOOSTIO
346 void Rectangle::readBinaryGridFromZipped(escript::Data& out, string filename,
347 const ReaderParameters& params) const
348 {
349 // the mapping is not universally correct but should work on our
350 // supported platforms
351 switch (params.dataType) {
352 case DATATYPE_INT32:
353 readBinaryGridZippedImpl<int>(out, filename, params);
354 break;
355 case DATATYPE_FLOAT32:
356 readBinaryGridZippedImpl<float>(out, filename, params);
357 break;
358 case DATATYPE_FLOAT64:
359 readBinaryGridZippedImpl<double>(out, filename, params);
360 break;
361 default:
362 throw RipleyException("readBinaryGridFromZipped(): invalid or unsupported datatype");
363 }
364 }
365 #endif
366
367 template<typename ValueType>
368 void Rectangle::readBinaryGridImpl(escript::Data& out, const string& filename,
369 const ReaderParameters& params) const
370 {
371 // check destination function space
372 dim_t myN0, myN1;
373 if (out.getFunctionSpace().getTypeCode() == Nodes) {
374 myN0 = m_NN[0];
375 myN1 = m_NN[1];
376 } else if (out.getFunctionSpace().getTypeCode() == Elements ||
377 out.getFunctionSpace().getTypeCode() == ReducedElements) {
378 myN0 = m_NE[0];
379 myN1 = m_NE[1];
380 } else
381 throw RipleyException("readBinaryGrid(): invalid function space for output data object");
382
383 if (params.first.size() != 2)
384 throw RipleyException("readBinaryGrid(): argument 'first' must have 2 entries");
385
386 if (params.numValues.size() != 2)
387 throw RipleyException("readBinaryGrid(): argument 'numValues' must have 2 entries");
388
389 if (params.multiplier.size() != 2)
390 throw RipleyException("readBinaryGrid(): argument 'multiplier' must have 2 entries");
391 for (size_t i=0; i<params.multiplier.size(); i++)
392 if (params.multiplier[i]<1)
393 throw RipleyException("readBinaryGrid(): all multipliers must be positive");
394 if (params.reverse[0] != 0 || params.reverse[1] != 0)
395 throw RipleyException("readBinaryGrid(): reversing not supported yet");
396
397 // check file existence and size
398 ifstream f(filename.c_str(), ifstream::binary);
399 if (f.fail()) {
400 throw RipleyException("readBinaryGrid(): cannot open file");
401 }
402 f.seekg(0, ios::end);
403 const int numComp = out.getDataPointSize();
404 const dim_t filesize = f.tellg();
405 const dim_t reqsize = params.numValues[0]*params.numValues[1]*numComp*sizeof(ValueType);
406 if (filesize < reqsize) {
407 f.close();
408 throw RipleyException("readBinaryGrid(): not enough data in file");
409 }
410
411 // check if this rank contributes anything
412 if (params.first[0] >= m_offset[0]+myN0 ||
413 params.first[0]+(params.numValues[0]*params.multiplier[0]) <= m_offset[0] ||
414 params.first[1] >= m_offset[1]+myN1 ||
415 params.first[1]+(params.numValues[1]*params.multiplier[1]) <= m_offset[1]) {
416 f.close();
417 return;
418 }
419
420 // now determine how much this rank has to write
421
422 // first coordinates in data object to write to
423 const dim_t first0 = max(0, params.first[0]-m_offset[0]);
424 const dim_t first1 = max(0, params.first[1]-m_offset[1]);
425 // indices to first value in file
426 const dim_t idx0 = max(0, (m_offset[0]/params.multiplier[0])-params.first[0]);
427 const dim_t idx1 = max(0, (m_offset[1]/params.multiplier[1])-params.first[1]);
428 // if restX > 0 the first value in the respective dimension has been
429 // written restX times already in a previous rank so this rank only
430 // contributes (multiplier-rank) copies of that value
431 const dim_t rest0 = m_offset[0]%params.multiplier[0];
432 const dim_t rest1 = m_offset[1]%params.multiplier[1];
433 // number of values to read
434 const dim_t num0 = min(params.numValues[0]-idx0, myN0-first0);
435 const dim_t num1 = min(params.numValues[1]-idx1, myN1-first1);
436
437 out.requireWrite();
438 vector<ValueType> values(num0*numComp);
439 const int dpp = out.getNumDataPointsPerSample();
440
441 for (dim_t y=0; y<num1; y++) {
442 const dim_t fileofs = numComp*(idx0+(idx1+y)*params.numValues[0]);
443 f.seekg(fileofs*sizeof(ValueType));
444 f.read((char*)&values[0], num0*numComp*sizeof(ValueType));
445 const dim_t m1limit = (y==0 ? params.multiplier[1]-rest1 : params.multiplier[1]);
446 dim_t dataYbase = first1+y*params.multiplier[1];
447 if (y>0)
448 dataYbase -= rest1;
449 for (dim_t x=0; x<num0; x++) {
450 const dim_t m0limit = (x==0 ? params.multiplier[0]-rest0 : params.multiplier[0]);
451 dim_t dataXbase = first0+x*params.multiplier[0];
452 if (x>0)
453 dataXbase -= rest0;
454 // write a block of mult0 x mult1 identical values into Data object
455 for (dim_t m1=0; m1<m1limit; m1++) {
456 const dim_t dataY = dataYbase+m1;
457 if (dataY >= myN1)
458 break;
459 for (dim_t m0=0; m0<m0limit; m0++) {
460 const dim_t dataX = dataXbase+m0;
461 if (dataX >= myN0)
462 break;
463 const dim_t dataIndex = dataX+dataY*myN0;
464 double* dest = out.getSampleDataRW(dataIndex);
465 for (int c=0; c<numComp; c++) {
466 ValueType val = values[x*numComp+c];
467
468 if (params.byteOrder != BYTEORDER_NATIVE) {
469 char* cval = reinterpret_cast<char*>(&val);
470 // this will alter val!!
471 if (sizeof(ValueType) > 4) {
472 byte_swap64(cval);
473 } else {
474 byte_swap32(cval);
475 }
476 }
477 if (!isnan(val)) {
478 for (int q=0; q<dpp; q++) {
479 *dest++ = static_cast<double>(val);
480 }
481 }
482 }
483 }
484 }
485 }
486 }
487
488 f.close();
489 }
490
491 #ifdef USE_BOOSTIO
492 template<typename ValueType>
493 void Rectangle::readBinaryGridZippedImpl(escript::Data& out, const string& filename,
494 const ReaderParameters& params) const
495 {
496 // check destination function space
497 dim_t myN0, myN1;
498 if (out.getFunctionSpace().getTypeCode() == Nodes) {
499 myN0 = m_NN[0];
500 myN1 = m_NN[1];
501 } else if (out.getFunctionSpace().getTypeCode() == Elements ||
502 out.getFunctionSpace().getTypeCode() == ReducedElements) {
503 myN0 = m_NE[0];
504 myN1 = m_NE[1];
505 } else
506 throw RipleyException("readBinaryGrid(): invalid function space for output data object");
507
508 // check file existence and size
509 ifstream f(filename.c_str(), ifstream::binary);
510 if (f.fail()) {
511 throw RipleyException("readBinaryGridFromZipped(): cannot open file");
512 }
513 f.seekg(0, ios::end);
514 const int numComp = out.getDataPointSize();
515 dim_t filesize = f.tellg();
516 f.seekg(0, ios::beg);
517 vector<char> compressed(filesize);
518 f.read((char*)&compressed[0], filesize);
519 f.close();
520 vector<char> decompressed = unzip(compressed);
521 filesize = decompressed.size();
522 const dim_t reqsize = params.numValues[0]*params.numValues[1]*numComp*sizeof(ValueType);
523 if (filesize < reqsize) {
524 throw RipleyException("readBinaryGridFromZipped(): not enough data in file");
525 }
526
527 // check if this rank contributes anything
528 if (params.first[0] >= m_offset[0]+myN0 ||
529 params.first[0]+params.numValues[0] <= m_offset[0] ||
530 params.first[1] >= m_offset[1]+myN1 ||
531 params.first[1]+params.numValues[1] <= m_offset[1]) {
532 f.close();
533 return;
534 }
535
536 // now determine how much this rank has to write
537
538 // first coordinates in data object to write to
539 const dim_t first0 = max(0, params.first[0]-m_offset[0]);
540 const dim_t first1 = max(0, params.first[1]-m_offset[1]);
541 // indices to first value in file
542 const dim_t idx0 = max(0, m_offset[0]-params.first[0]);
543 const dim_t idx1 = max(0, m_offset[1]-params.first[1]);
544 // number of values to read
545 const dim_t num0 = min(params.numValues[0]-idx0, myN0-first0);
546 const dim_t num1 = min(params.numValues[1]-idx1, myN1-first1);
547
548 out.requireWrite();
549 vector<ValueType> values(num0*numComp);
550 const int dpp = out.getNumDataPointsPerSample();
551
552 for (dim_t y=0; y<num1; y++) {
553 const dim_t fileofs = numComp*(idx0+(idx1+y)*params.numValues[0]);
554 memcpy((char*)&values[0], (char*)&decompressed[fileofs*sizeof(ValueType)], num0*numComp*sizeof(ValueType));
555 for (dim_t x=0; x<num0; x++) {
556 const dim_t baseIndex = first0+x*params.multiplier[0]
557 +(first1+y*params.multiplier[1])*myN0;
558 for (int m1=0; m1<params.multiplier[1]; m1++) {
559 for (int m0=0; m0<params.multiplier[0]; m0++) {
560 const dim_t dataIndex = baseIndex+m0+m1*myN0;
561 double* dest = out.getSampleDataRW(dataIndex);
562 for (int c=0; c<numComp; c++) {
563 ValueType val = values[x*numComp+c];
564
565 if (params.byteOrder != BYTEORDER_NATIVE) {
566 char* cval = reinterpret_cast<char*>(&val);
567 // this will alter val!!
568 byte_swap32(cval);
569 }
570 if (!isnan(val)) {
571 for (int q=0; q<dpp; q++) {
572 *dest++ = static_cast<double>(val);
573 }
574 }
575 }
576 }
577 }
578 }
579 }
580
581 f.close();
582 }
583 #endif
584
585 void Rectangle::writeBinaryGrid(const escript::Data& in, string filename,
586 int byteOrder, int dataType) const
587 {
588 // the mapping is not universally correct but should work on our
589 // supported platforms
590 switch (dataType) {
591 case DATATYPE_INT32:
592 writeBinaryGridImpl<int>(in, filename, byteOrder);
593 break;
594 case DATATYPE_FLOAT32:
595 writeBinaryGridImpl<float>(in, filename, byteOrder);
596 break;
597 case DATATYPE_FLOAT64:
598 writeBinaryGridImpl<double>(in, filename, byteOrder);
599 break;
600 default:
601 throw RipleyException("writeBinaryGrid(): invalid or unsupported datatype");
602 }
603 }
604
605 template<typename ValueType>
606 void Rectangle::writeBinaryGridImpl(const escript::Data& in,
607 const string& filename, int byteOrder) const
608 {
609 // check function space and determine number of points
610 dim_t myN0, myN1;
611 dim_t totalN0, totalN1;
612 if (in.getFunctionSpace().getTypeCode() == Nodes) {
613 myN0 = m_NN[0];
614 myN1 = m_NN[1];
615 totalN0 = m_gNE[0]+1;
616 totalN1 = m_gNE[1]+1;
617 } else if (in.getFunctionSpace().getTypeCode() == Elements ||
618 in.getFunctionSpace().getTypeCode() == ReducedElements) {
619 myN0 = m_NE[0];
620 myN1 = m_NE[1];
621 totalN0 = m_gNE[0];
622 totalN1 = m_gNE[1];
623 } else
624 throw RipleyException("writeBinaryGrid(): invalid function space of data object");
625
626 const int numComp = in.getDataPointSize();
627 const int dpp = in.getNumDataPointsPerSample();
628
629 if (numComp > 1 || dpp > 1)
630 throw RipleyException("writeBinaryGrid(): only scalar, single-value data supported");
631
632 const dim_t fileSize = sizeof(ValueType)*numComp*dpp*totalN0*totalN1;
633
634 // from here on we know that each sample consists of one value
635 FileWriter fw;
636 fw.openFile(filename, fileSize);
637 MPIBarrier();
638
639 for (index_t y=0; y<myN1; y++) {
640 const dim_t fileofs = (m_offset[0]+(m_offset[1]+y)*totalN0)*sizeof(ValueType);
641 ostringstream oss;
642
643 for (index_t x=0; x<myN0; x++) {
644 const double* sample = in.getSampleDataRO(y*myN0+x);
645 ValueType fvalue = static_cast<ValueType>(*sample);
646 if (byteOrder == BYTEORDER_NATIVE) {
647 oss.write((char*)&fvalue, sizeof(fvalue));
648 } else {
649 char* value = reinterpret_cast<char*>(&fvalue);
650 if (sizeof(fvalue)>4) {
651 byte_swap64(value);
652 } else {
653 byte_swap32(value);
654 }
655 oss.write(value, sizeof(fvalue));
656 }
657 }
658 fw.writeAt(oss, fileofs);
659 }
660 fw.close();
661 }
662
663 void Rectangle::write(const std::string& filename) const
664 {
665 throw RipleyException("write: not supported");
666 }
667
668 void Rectangle::dump(const string& fileName) const
669 {
670 #if USE_SILO
671 string fn(fileName);
672 if (fileName.length() < 6 || fileName.compare(fileName.length()-5, 5, ".silo") != 0) {
673 fn+=".silo";
674 }
675
676 int driver=DB_HDF5;
677 DBfile* dbfile = NULL;
678 const char* blockDirFmt = "/block%04d";
679
680 #ifdef ESYS_MPI
681 PMPIO_baton_t* baton = NULL;
682 const int NUM_SILO_FILES = 1;
683 #endif
684
685 if (m_mpiInfo->size > 1) {
686 #ifdef ESYS_MPI
687 baton = PMPIO_Init(NUM_SILO_FILES, PMPIO_WRITE, m_mpiInfo->comm,
688 0x1337, PMPIO_DefaultCreate, PMPIO_DefaultOpen,
689 PMPIO_DefaultClose, (void*)&driver);
690 // try the fallback driver in case of error
691 if (!baton && driver != DB_PDB) {
692 driver = DB_PDB;
693 baton = PMPIO_Init(NUM_SILO_FILES, PMPIO_WRITE, m_mpiInfo->comm,
694 0x1338, PMPIO_DefaultCreate, PMPIO_DefaultOpen,
695 PMPIO_DefaultClose, (void*)&driver);
696 }
697 if (baton) {
698 char siloPath[64];
699 snprintf(siloPath, 64, blockDirFmt, PMPIO_RankInGroup(baton, m_mpiInfo->rank));
700 dbfile = (DBfile*) PMPIO_WaitForBaton(baton, fn.c_str(), siloPath);
701 }
702 #endif
703 } else {
704 dbfile = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL,
705 getDescription().c_str(), driver);
706 // try the fallback driver in case of error
707 if (!dbfile && driver != DB_PDB) {
708 driver = DB_PDB;
709 dbfile = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL,
710 getDescription().c_str(), driver);
711 }
712 char siloPath[64];
713 snprintf(siloPath, 64, blockDirFmt, 0);
714 DBMkDir(dbfile, siloPath);
715 DBSetDir(dbfile, siloPath);
716 }
717
718 if (!dbfile)
719 throw RipleyException("dump: Could not create Silo file");
720
721 /*
722 if (driver==DB_HDF5) {
723 // gzip level 1 already provides good compression with minimal
724 // performance penalty. Some tests showed that gzip levels >3 performed
725 // rather badly on escript data both in terms of time and space
726 DBSetCompression("ERRMODE=FALLBACK METHOD=GZIP LEVEL=1");
727 }
728 */
729
730 const dim_t NN0 = m_NN[0];
731 const dim_t NN1 = m_NN[1];
732 boost::scoped_ptr<double> x(new double[NN0]);
733 boost::scoped_ptr<double> y(new double[NN1]);
734 double* coords[2] = { x.get(), y.get() };
735 #pragma omp parallel
736 {
737 #pragma omp for nowait
738 for (dim_t i0 = 0; i0 < NN0; i0++) {
739 coords[0][i0]=getLocalCoordinate(i0, 0);
740 }
741 #pragma omp for nowait
742 for (dim_t i1 = 0; i1 < NN1; i1++) {
743 coords[1][i1]=getLocalCoordinate(i1, 1);
744 }
745 }
746 dim_t* dims = const_cast<dim_t*>(getNumNodesPerDim());
747
748 // write mesh
749 DBPutQuadmesh(dbfile, "mesh", NULL, coords, dims, 2, DB_DOUBLE,
750 DB_COLLINEAR, NULL);
751
752 // write node ids
753 DBPutQuadvar1(dbfile, "nodeId", "mesh", (void*)&m_nodeId[0], dims, 2,
754 NULL, 0, DB_INT, DB_NODECENT, NULL);
755
756 // write element ids
757 dims = const_cast<dim_t*>(getNumElementsPerDim());
758 DBPutQuadvar1(dbfile, "elementId", "mesh", (void*)&m_elementId[0],
759 dims, 2, NULL, 0, DB_INT, DB_ZONECENT, NULL);
760
761 // rank 0 writes multimesh and multivar
762 if (m_mpiInfo->rank == 0) {
763 vector<string> tempstrings;
764 vector<char*> names;
765 for (dim_t i=0; i<m_mpiInfo->size; i++) {
766 stringstream path;
767 path << "/block" << setw(4) << setfill('0') << right << i << "/mesh";
768 tempstrings.push_back(path.str());
769 names.push_back((char*)tempstrings.back().c_str());
770 }
771 vector<int> types(m_mpiInfo->size, DB_QUAD_RECT);
772 DBSetDir(dbfile, "/");
773 DBPutMultimesh(dbfile, "multimesh", m_mpiInfo->size, &names[0],
774 &types[0], NULL);
775 tempstrings.clear();
776 names.clear();
777 for (dim_t i=0; i<m_mpiInfo->size; i++) {
778 stringstream path;
779 path << "/block" << setw(4) << setfill('0') << right << i << "/nodeId";
780 tempstrings.push_back(path.str());
781 names.push_back((char*)tempstrings.back().c_str());
782 }
783 types.assign(m_mpiInfo->size, DB_QUADVAR);
784 DBPutMultivar(dbfile, "nodeId", m_mpiInfo->size, &names[0],
785 &types[0], NULL);
786 tempstrings.clear();
787 names.clear();
788 for (dim_t i=0; i<m_mpiInfo->size; i++) {
789 stringstream path;
790 path << "/block" << setw(4) << setfill('0') << right << i << "/elementId";
791 tempstrings.push_back(path.str());
792 names.push_back((char*)tempstrings.back().c_str());
793 }
794 DBPutMultivar(dbfile, "elementId", m_mpiInfo->size, &names[0],
795 &types[0], NULL);
796 }
797
798 if (m_mpiInfo->size > 1) {
799 #ifdef ESYS_MPI
800 PMPIO_HandOffBaton(baton, dbfile);
801 PMPIO_Finish(baton);
802 #endif
803 } else {
804 DBClose(dbfile);
805 }
806
807 #else // USE_SILO
808 throw RipleyException("dump: no Silo support");
809 #endif
810 }
811
812 const dim_t* Rectangle::borrowSampleReferenceIDs(int fsType) const
813 {
814 switch (fsType) {
815 case Nodes:
816 case ReducedNodes: // FIXME: reduced
817 return &m_nodeId[0];
818 case DegreesOfFreedom:
819 case ReducedDegreesOfFreedom: // FIXME: reduced
820 return &m_dofId[0];
821 case Elements:
822 case ReducedElements:
823 return &m_elementId[0];
824 case FaceElements:
825 case ReducedFaceElements:
826 return &m_faceId[0];
827 case Points:
828 return &m_diracPointNodeIDs[0];
829 default:
830 break;
831 }
832
833 stringstream msg;
834 msg << "borrowSampleReferenceIDs: invalid function space type " << fsType;
835 throw RipleyException(msg.str());
836 }
837
838 bool Rectangle::ownSample(int fsType, index_t id) const
839 {
840 if (getMPISize()==1)
841 return true;
842
843 switch (fsType) {
844 case Nodes:
845 case ReducedNodes: // FIXME: reduced
846 return (m_dofMap[id] < getNumDOF());
847 case DegreesOfFreedom:
848 case ReducedDegreesOfFreedom:
849 return true;
850 case Elements:
851 case ReducedElements:
852 // check ownership of element's bottom left node
853 return (m_dofMap[id%m_NE[0]+m_NN[0]*(id/m_NE[0])] < getNumDOF());
854 case FaceElements:
855 case ReducedFaceElements:
856 {
857 // determine which face the sample belongs to before
858 // checking ownership of corresponding element's first node
859 dim_t n=0;
860 for (size_t i=0; i<4; i++) {
861 n+=m_faceCount[i];
862 if (id<n) {
863 index_t k;
864 if (i==1)
865 k=m_NN[0]-2;
866 else if (i==3)
867 k=m_NN[0]*(m_NN[1]-2);
868 else
869 k=0;
870 // determine whether to move right or up
871 const index_t delta=(i/2==0 ? m_NN[0] : 1);
872 return (m_dofMap[k+(id-n+m_faceCount[i])*delta] < getNumDOF());
873 }
874 }
875 return false;
876 }
877 default:
878 break;
879 }
880
881 stringstream msg;
882 msg << "ownSample: invalid function space type " << fsType;
883 throw RipleyException(msg.str());
884 }
885
886 void Rectangle::setToNormal(escript::Data& out) const
887 {
888 const dim_t NE0=m_NE[0];
889 const dim_t NE1=m_NE[1];
890 if (out.getFunctionSpace().getTypeCode() == FaceElements) {
891 out.requireWrite();
892 #pragma omp parallel
893 {
894 if (m_faceOffset[0] > -1) {
895 #pragma omp for nowait
896 for (index_t k1 = 0; k1 < NE1; ++k1) {
897 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
898 // set vector at two quadrature points
899 *o++ = -1.;
900 *o++ = 0.;
901 *o++ = -1.;
902 *o = 0.;
903 }
904 }
905
906 if (m_faceOffset[1] > -1) {
907 #pragma omp for nowait
908 for (index_t k1 = 0; k1 < NE1; ++k1) {
909 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
910 // set vector at two quadrature points
911 *o++ = 1.;
912 *o++ = 0.;
913 *o++ = 1.;
914 *o = 0.;
915 }
916 }
917
918 if (m_faceOffset[2] > -1) {
919 #pragma omp for nowait
920 for (index_t k0 = 0; k0 < NE0; ++k0) {
921 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
922 // set vector at two quadrature points
923 *o++ = 0.;
924 *o++ = -1.;
925 *o++ = 0.;
926 *o = -1.;
927 }
928 }
929
930 if (m_faceOffset[3] > -1) {
931 #pragma omp for nowait
932 for (index_t k0 = 0; k0 < NE0; ++k0) {
933 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
934 // set vector at two quadrature points
935 *o++ = 0.;
936 *o++ = 1.;
937 *o++ = 0.;
938 *o = 1.;
939 }
940 }
941 } // end of parallel section
942 } else if (out.getFunctionSpace().getTypeCode() == ReducedFaceElements) {
943 out.requireWrite();
944 #pragma omp parallel
945 {
946 if (m_faceOffset[0] > -1) {
947 #pragma omp for nowait
948 for (index_t k1 = 0; k1 < NE1; ++k1) {
949 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
950 *o++ = -1.;
951 *o = 0.;
952 }
953 }
954
955 if (m_faceOffset[1] > -1) {
956 #pragma omp for nowait
957 for (index_t k1 = 0; k1 < NE1; ++k1) {
958 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
959 *o++ = 1.;
960 *o = 0.;
961 }
962 }
963
964 if (m_faceOffset[2] > -1) {
965 #pragma omp for nowait
966 for (index_t k0 = 0; k0 < NE0; ++k0) {
967 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
968 *o++ = 0.;
969 *o = -1.;
970 }
971 }
972
973 if (m_faceOffset[3] > -1) {
974 #pragma omp for nowait
975 for (index_t k0 = 0; k0 < NE0; ++k0) {
976 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
977 *o++ = 0.;
978 *o = 1.;
979 }
980 }
981 } // end of parallel section
982
983 } else {
984 stringstream msg;
985 msg << "setToNormal: invalid function space type "
986 << out.getFunctionSpace().getTypeCode();
987 throw RipleyException(msg.str());
988 }
989 }
990
991 void Rectangle::setToSize(escript::Data& out) const
992 {
993 if (out.getFunctionSpace().getTypeCode() == Elements
994 || out.getFunctionSpace().getTypeCode() == ReducedElements) {
995 out.requireWrite();
996 const dim_t numQuad = out.getNumDataPointsPerSample();
997 const dim_t numElements = getNumElements();
998 const double size=sqrt(m_dx[0]*m_dx[0]+m_dx[1]*m_dx[1]);
999 #pragma omp parallel for
1000 for (index_t k = 0; k < numElements; ++k) {
1001 double* o = out.getSampleDataRW(k);
1002 fill(o, o+numQuad, size);
1003 }
1004 } else if (out.getFunctionSpace().getTypeCode() == FaceElements
1005 || out.getFunctionSpace().getTypeCode() == ReducedFaceElements) {
1006 out.requireWrite();
1007 const dim_t numQuad=out.getNumDataPointsPerSample();
1008 const dim_t NE0 = m_NE[0];
1009 const dim_t NE1 = m_NE[1];
1010 #pragma omp parallel
1011 {
1012 if (m_faceOffset[0] > -1) {
1013 #pragma omp for nowait
1014 for (index_t k1 = 0; k1 < NE1; ++k1) {
1015 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
1016 fill(o, o+numQuad, m_dx[1]);
1017 }
1018 }
1019
1020 if (m_faceOffset[1] > -1) {
1021 #pragma omp for nowait
1022 for (index_t k1 = 0; k1 < NE1; ++k1) {
1023 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
1024 fill(o, o+numQuad, m_dx[1]);
1025 }
1026 }
1027
1028 if (m_faceOffset[2] > -1) {
1029 #pragma omp for nowait
1030 for (index_t k0 = 0; k0 < NE0; ++k0) {
1031 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
1032 fill(o, o+numQuad, m_dx[0]);
1033 }
1034 }
1035
1036 if (m_faceOffset[3] > -1) {
1037 #pragma omp for nowait
1038 for (index_t k0 = 0; k0 < NE0; ++k0) {
1039 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
1040 fill(o, o+numQuad, m_dx[0]);
1041 }
1042 }
1043 } // end of parallel section
1044
1045 } else {
1046 stringstream msg;
1047 msg << "setToSize: invalid function space type "
1048 << out.getFunctionSpace().getTypeCode();
1049 throw RipleyException(msg.str());
1050 }
1051 }
1052
1053 void Rectangle::Print_Mesh_Info(const bool full) const
1054 {
1055 RipleyDomain::Print_Mesh_Info(full);
1056 if (full) {
1057 cout << " Id Coordinates" << endl;
1058 cout.precision(15);
1059 cout.setf(ios::scientific, ios::floatfield);
1060 for (index_t i=0; i < getNumNodes(); i++) {
1061 cout << " " << setw(5) << m_nodeId[i]
1062 << " " << getLocalCoordinate(i%m_NN[0], 0)
1063 << " " << getLocalCoordinate(i/m_NN[0], 1) << endl;
1064 }
1065 }
1066 }
1067
1068
1069 //protected
1070 void Rectangle::assembleCoordinates(escript::Data& arg) const
1071 {
1072 escriptDataC x = arg.getDataC();
1073 int numDim = m_numDim;
1074 if (!isDataPointShapeEqual(&x, 1, &numDim))
1075 throw RipleyException("setToX: Invalid Data object shape");
1076 if (!numSamplesEqual(&x, 1, getNumNodes()))
1077 throw RipleyException("setToX: Illegal number of samples in Data object");
1078
1079 const dim_t NN0 = m_NN[0];
1080 const dim_t NN1 = m_NN[1];
1081 arg.requireWrite();
1082 #pragma omp parallel for
1083 for (dim_t i1 = 0; i1 < NN1; i1++) {
1084 for (dim_t i0 = 0; i0 < NN0; i0++) {
1085 double* point = arg.getSampleDataRW(i0+NN0*i1);
1086 point[0] = getLocalCoordinate(i0, 0);
1087 point[1] = getLocalCoordinate(i1, 1);
1088 }
1089 }
1090 }
1091
1092 //protected
1093 void Rectangle::assembleGradient(escript::Data& out, const escript::Data& in) const
1094 {
1095 const dim_t numComp = in.getDataPointSize();
1096 const dim_t NE0 = m_NE[0];
1097 const dim_t NE1 = m_NE[1];
1098 const double cx0 = .21132486540518711775/m_dx[0];
1099 const double cx1 = .78867513459481288225/m_dx[0];
1100 const double cx2 = 1./m_dx[0];
1101 const double cy0 = .21132486540518711775/m_dx[1];
1102 const double cy1 = .78867513459481288225/m_dx[1];
1103 const double cy2 = 1./m_dx[1];
1104
1105 if (out.getFunctionSpace().getTypeCode() == Elements) {
1106 out.requireWrite();
1107 #pragma omp parallel
1108 {
1109 vector<double> f_00(numComp);
1110 vector<double> f_01(numComp);
1111 vector<double> f_10(numComp);
1112 vector<double> f_11(numComp);
1113 #pragma omp for
1114 for (index_t k1=0; k1 < NE1; ++k1) {
1115 for (index_t k0=0; k0 < NE0; ++k0) {
1116 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
1117 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
1118 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
1119 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
1120 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
1121 for (index_t i=0; i < numComp; ++i) {
1122 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
1123 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
1124 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
1125 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
1126 o[INDEX3(i,0,2,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
1127 o[INDEX3(i,1,2,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
1128 o[INDEX3(i,0,3,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
1129 o[INDEX3(i,1,3,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
1130 } // end of component loop i
1131 } // end of k0 loop
1132 } // end of k1 loop
1133 } // end of parallel section
1134 } else if (out.getFunctionSpace().getTypeCode() == ReducedElements) {
1135 out.requireWrite();
1136 #pragma omp parallel
1137 {
1138 vector<double> f_00(numComp);
1139 vector<double> f_01(numComp);
1140 vector<double> f_10(numComp);
1141 vector<double> f_11(numComp);
1142 #pragma omp for
1143 for (index_t k1=0; k1 < NE1; ++k1) {
1144 for (index_t k0=0; k0 < NE0; ++k0) {
1145 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
1146 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
1147 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
1148 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
1149 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
1150 for (index_t i=0; i < numComp; ++i) {
1151 o[INDEX3(i,0,0,numComp,2)] = (f_10[i] + f_11[i] - f_00[i] - f_01[i])*cx2/2;
1152 o[INDEX3(i,1,0,numComp,2)] = (f_01[i] + f_11[i] - f_00[i] - f_10[i])*cy2/2;
1153 } // end of component loop i
1154 } // end of k0 loop
1155 } // end of k1 loop
1156 } // end of parallel section
1157 } else if (out.getFunctionSpace().getTypeCode() == FaceElements) {
1158 out.requireWrite();
1159 #pragma omp parallel
1160 {
1161 vector<double> f_00(numComp);
1162 vector<double> f_01(numComp);
1163 vector<double> f_10(numComp);
1164 vector<double> f_11(numComp);
1165 if (m_faceOffset[0] > -1) {
1166 #pragma omp for nowait
1167 for (index_t k1=0; k1 < NE1; ++k1) {
1168 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
1169 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
1170 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(1,k1, m_NN[0])), numComp*sizeof(double));
1171 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(1,k1+1, m_NN[0])), numComp*sizeof(double));
1172 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
1173 for (index_t i=0; i < numComp; ++i) {
1174 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
1175 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy2;
1176 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
1177 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy2;
1178 } // end of component loop i
1179 } // end of k1 loop
1180 } // end of face 0
1181 if (m_faceOffset[1] > -1) {
1182 #pragma omp for nowait
1183 for (index_t k1=0; k1 < NE1; ++k1) {
1184 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1, m_NN[0])), numComp*sizeof(double));
1185 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1+1, m_NN[0])), numComp*sizeof(double));
1186 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
1187 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
1188 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
1189 for (index_t i=0; i < numComp; ++i) {
1190 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
1191 o[INDEX3(i,1,0,numComp,2)] = (f_11[i]-f_10[i])*cy2;
1192 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
1193 o[INDEX3(i,1,1,numComp,2)] = (f_11[i]-f_10[i])*cy2;
1194 } // end of component loop i
1195 } // end of k1 loop
1196 } // end of face 1
1197 if (m_faceOffset[2] > -1) {
1198 #pragma omp for nowait
1199 for (index_t k0=0; k0 < NE0; ++k0) {
1200 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
1201 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,1, m_NN[0])), numComp*sizeof(double));
1202 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
1203 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,1, m_NN[0])), numComp*sizeof(double));
1204 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
1205 for (index_t i=0; i < numComp; ++i) {
1206 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx2;
1207 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
1208 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx2;
1209 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
1210 } // end of component loop i
1211 } // end of k0 loop
1212 } // end of face 2
1213 if (m_faceOffset[3] > -1) {
1214 #pragma omp for nowait
1215 for (index_t k0=0; k0 < NE0; ++k0) {
1216 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
1217 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1218 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
1219 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1220 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
1221 for (index_t i=0; i < numComp; ++i) {
1222 o[INDEX3(i,0,0,numComp,2)] = (f_11[i]-f_01[i])*cx2;
1223 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
1224 o[INDEX3(i,0,1,numComp,2)] = (f_11[i]-f_01[i])*cx2;
1225 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
1226 } // end of component loop i
1227 } // end of k0 loop
1228 } // end of face 3
1229 } // end of parallel section
1230
1231 } else if (out.getFunctionSpace().getTypeCode() == ReducedFaceElements) {
1232 out.requireWrite();
1233 #pragma omp parallel
1234 {
1235 vector<double> f_00(numComp);
1236 vector<double> f_01(numComp);
1237 vector<double> f_10(numComp);
1238 vector<double> f_11(numComp);
1239 if (m_faceOffset[0] > -1) {
1240 #pragma omp for nowait
1241 for (index_t k1=0; k1 < NE1; ++k1) {
1242 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
1243 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
1244 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(1,k1, m_NN[0])), numComp*sizeof(double));
1245 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(1,k1+1, m_NN[0])), numComp*sizeof(double));
1246 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
1247 for (index_t i=0; i < numComp; ++i) {
1248 o[INDEX3(i,0,0,numComp,2)] = (f_10[i] + f_11[i] - f_00[i] - f_01[i])*cx2/2;
1249 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy2;
1250 } // end of component loop i
1251 } // end of k1 loop
1252 } // end of face 0
1253 if (m_faceOffset[1] > -1) {
1254 #pragma omp for nowait
1255 for (index_t k1=0; k1 < NE1; ++k1) {
1256 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1, m_NN[0])), numComp*sizeof(double));
1257 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1+1, m_NN[0])), numComp*sizeof(double));
1258 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
1259 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
1260 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
1261 for (index_t i=0; i < numComp; ++i) {
1262 o[INDEX3(i,0,0,numComp,2)] = (f_10[i] + f_11[i] - f_00[i] - f_01[i])*cx2/2;
1263 o[INDEX3(i,1,0,numComp,2)] = (f_11[i]-f_10[i])*cy2;
1264 } // end of component loop i
1265 } // end of k1 loop
1266 } // end of face 1
1267 if (m_faceOffset[2] > -1) {
1268 #pragma omp for nowait
1269 for (index_t k0=0; k0 < NE0; ++k0) {
1270 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
1271 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,1, m_NN[0])), numComp*sizeof(double));
1272 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
1273 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,1, m_NN[0])), numComp*sizeof(double));
1274 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
1275 for (index_t i=0; i < numComp; ++i) {
1276 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx2;
1277 o[INDEX3(i,1,0,numComp,2)] = (f_01[i] + f_11[i] - f_00[i] - f_10[i])*cy2/2;
1278 } // end of component loop i
1279 } // end of k0 loop
1280 } // end of face 2
1281 if (m_faceOffset[3] > -1) {
1282 #pragma omp for nowait
1283 for (index_t k0=0; k0 < NE0; ++k0) {
1284 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
1285 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1286 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
1287 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1288 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
1289 for (index_t i=0; i < numComp; ++i) {
1290 o[INDEX3(i,0,0,numComp,2)] = (f_11[i]-f_01[i])*cx2;
1291 o[INDEX3(i,1,0,numComp,2)] = (f_01[i] + f_11[i] - f_00[i] - f_10[i])*cy2/2;
1292 } // end of component loop i
1293 } // end of k0 loop
1294 } // end of face 3
1295 } // end of parallel section
1296 }
1297 }
1298
1299 //protected
1300 void Rectangle::assembleIntegrate(vector<double>& integrals,
1301 const escript::Data& arg) const
1302 {
1303 const dim_t numComp = arg.getDataPointSize();
1304 const index_t left = (m_offset[0]==0 ? 0 : 1);
1305 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1306 const int fs=arg.getFunctionSpace().getTypeCode();
1307 if (fs == Elements && arg.actsExpanded()) {
1308 #pragma omp parallel
1309 {
1310 vector<double> int_local(numComp, 0);
1311 const double w = m_dx[0]*m_dx[1]/4.;
1312 #pragma omp for nowait
1313 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1314 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1315 const double* f = arg.getSampleDataRO(INDEX2(k0, k1, m_NE[0]));
1316 for (index_t i=0; i < numComp; ++i) {
1317 const double f0 = f[INDEX2(i,0,numComp)];
1318 const double f1 = f[INDEX2(i,1,numComp)];
1319 const double f2 = f[INDEX2(i,2,numComp)];
1320 const double f3 = f[INDEX2(i,3,numComp)];
1321 int_local[i]+=(f0+f1+f2+f3)*w;
1322 } // end of component loop i
1323 } // end of k0 loop
1324 } // end of k1 loop
1325 #pragma omp critical
1326 for (index_t i=0; i<numComp; i++)
1327 integrals[i]+=int_local[i];
1328 } // end of parallel section
1329
1330 } else if (fs==ReducedElements || (fs==Elements && !arg.actsExpanded())) {
1331 const double w = m_dx[0]*m_dx[1];
1332 #pragma omp parallel
1333 {
1334 vector<double> int_local(numComp, 0);
1335 #pragma omp for nowait
1336 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1337 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1338 const double* f = arg.getSampleDataRO(INDEX2(k0, k1, m_NE[0]));
1339 for (index_t i=0; i < numComp; ++i) {
1340 int_local[i]+=f[i]*w;
1341 }
1342 }
1343 }
1344 #pragma omp critical
1345 for (index_t i=0; i<numComp; i++)
1346 integrals[i]+=int_local[i];
1347 } // end of parallel section
1348
1349 } else if (fs == FaceElements && arg.actsExpanded()) {
1350 #pragma omp parallel
1351 {
1352 vector<double> int_local(numComp, 0);
1353 const double w0 = m_dx[0]/2.;
1354 const double w1 = m_dx[1]/2.;
1355 if (m_faceOffset[0] > -1) {
1356 #pragma omp for nowait
1357 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1358 const double* f = arg.getSampleDataRO(m_faceOffset[0]+k1);
1359 for (index_t i=0; i < numComp; ++i) {
1360 const double f0 = f[INDEX2(i,0,numComp)];
1361 const double f1 = f[INDEX2(i,1,numComp)];
1362 int_local[i]+=(f0+f1)*w1;
1363 } // end of component loop i
1364 } // end of k1 loop
1365 }
1366
1367 if (m_faceOffset[1] > -1) {
1368 #pragma omp for nowait
1369 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1370 const double* f = arg.getSampleDataRO(m_faceOffset[1]+k1);
1371 for (index_t i=0; i < numComp; ++i) {
1372 const double f0 = f[INDEX2(i,0,numComp)];
1373 const double f1 = f[INDEX2(i,1,numComp)];
1374 int_local[i]+=(f0+f1)*w1;
1375 } // end of component loop i
1376 } // end of k1 loop
1377 }
1378
1379 if (m_faceOffset[2] > -1) {
1380 #pragma omp for nowait
1381 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1382 const double* f = arg.getSampleDataRO(m_faceOffset[2]+k0);
1383 for (index_t i=0; i < numComp; ++i) {
1384 const double f0 = f[INDEX2(i,0,numComp)];
1385 const double f1 = f[INDEX2(i,1,numComp)];
1386 int_local[i]+=(f0+f1)*w0;
1387 } // end of component loop i
1388 } // end of k0 loop
1389 }
1390
1391 if (m_faceOffset[3] > -1) {
1392 #pragma omp for nowait
1393 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1394 const double* f = arg.getSampleDataRO(m_faceOffset[3]+k0);
1395 for (index_t i=0; i < numComp; ++i) {
1396 const double f0 = f[INDEX2(i,0,numComp)];
1397 const double f1 = f[INDEX2(i,1,numComp)];
1398 int_local[i]+=(f0+f1)*w0;
1399 } // end of component loop i
1400 } // end of k0 loop
1401 }
1402 #pragma omp critical
1403 for (index_t i=0; i<numComp; i++)
1404 integrals[i]+=int_local[i];
1405 } // end of parallel section
1406
1407 } else if (fs==ReducedFaceElements || (fs==FaceElements && !arg.actsExpanded())) {
1408 #pragma omp parallel
1409 {
1410 vector<double> int_local(numComp, 0);
1411 if (m_faceOffset[0] > -1) {
1412 #pragma omp for nowait
1413 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1414 const double* f = arg.getSampleDataRO(m_faceOffset[0]+k1);
1415 for (index_t i=0; i < numComp; ++i) {
1416 int_local[i]+=f[i]*m_dx[1];
1417 }
1418 }
1419 }
1420
1421 if (m_faceOffset[1] > -1) {
1422 #pragma omp for nowait
1423 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1424 const double* f = arg.getSampleDataRO(m_faceOffset[1]+k1);
1425 for (index_t i=0; i < numComp; ++i) {
1426 int_local[i]+=f[i]*m_dx[1];
1427 }
1428 }
1429 }
1430
1431 if (m_faceOffset[2] > -1) {
1432 #pragma omp for nowait
1433 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1434 const double* f = arg.getSampleDataRO(m_faceOffset[2]+k0);
1435 for (index_t i=0; i < numComp; ++i) {
1436 int_local[i]+=f[i]*m_dx[0];
1437 }
1438 }
1439 }
1440
1441 if (m_faceOffset[3] > -1) {
1442 #pragma omp for nowait
1443 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1444 const double* f = arg.getSampleDataRO(m_faceOffset[3]+k0);
1445 for (index_t i=0; i < numComp; ++i) {
1446 int_local[i]+=f[i]*m_dx[0];
1447 }
1448 }
1449 }
1450
1451 #pragma omp critical
1452 for (index_t i=0; i<numComp; i++)
1453 integrals[i]+=int_local[i];
1454 } // end of parallel section
1455 } // function space selector
1456 }
1457
1458 //protected
1459 IndexVector Rectangle::getDiagonalIndices() const
1460 {
1461 IndexVector ret(9);
1462 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1463 size_t idx = 0;
1464 for (int i1=-1; i1<2; i1++) {
1465 for (int i0=-1; i0<2; i0++) {
1466 ret[idx++] = i1*nDOF0 + i0;
1467 }
1468 }
1469
1470 return ret;
1471 }
1472
1473 //protected
1474 void Rectangle::nodesToDOF(escript::Data& out, const escript::Data& in) const
1475 {
1476 const dim_t numComp = in.getDataPointSize();
1477 out.requireWrite();
1478
1479 const index_t left = (m_offset[0]==0 ? 0 : 1);
1480 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1481 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1482 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1483 #pragma omp parallel for
1484 for (index_t i=0; i<nDOF1; i++) {
1485 for (index_t j=0; j<nDOF0; j++) {
1486 const index_t n=j+left+(i+bottom)*m_NN[0];
1487 const double* src=in.getSampleDataRO(n);
1488 copy(src, src+numComp, out.getSampleDataRW(j+i*nDOF0));
1489 }
1490 }
1491 }
1492
1493 //protected
1494 void Rectangle::dofToNodes(escript::Data& out, const escript::Data& in) const
1495 {
1496 const dim_t numComp = in.getDataPointSize();
1497 paso::Coupler_ptr coupler(new paso::Coupler(m_connector, numComp));
1498 // expand data object if necessary to be able to grab the whole data
1499 const_cast<escript::Data*>(&in)->expand();
1500 coupler->startCollect(in.getDataRO());
1501
1502 const dim_t numDOF = getNumDOF();
1503 const dim_t numNodes = getNumNodes();
1504 out.requireWrite();
1505 const double* buffer = coupler->finishCollect();
1506
1507 #pragma omp parallel for
1508 for (index_t i=0; i < numNodes; i++) {
1509 const double* src=(m_dofMap[i]<numDOF ?
1510 in.getSampleDataRO(m_dofMap[i])
1511 : &buffer[(m_dofMap[i]-numDOF)*numComp]);
1512 copy(src, src+numComp, out.getSampleDataRW(i));
1513 }
1514 }
1515
1516 //protected
1517 paso::SystemMatrixPattern_ptr Rectangle::getPasoMatrixPattern(
1518 bool reducedRowOrder,
1519 bool reducedColOrder) const
1520 {
1521 if (m_pattern.get())
1522 return m_pattern;
1523
1524 // first call - create pattern, then return
1525 const dim_t numDOF = getNumDOF();
1526 const dim_t numShared = m_connector->send->numSharedComponents;
1527 const dim_t numNeighbours = m_connector->send->numNeighbors;
1528 const index_t* offsetInShared = m_connector->send->offsetInShared;
1529 const index_t* sendShared = m_connector->send->shared;
1530
1531 // these are for the couple blocks
1532 vector<IndexVector> colIndices(numDOF);
1533 vector<IndexVector> rowIndices(numShared);
1534
1535 for (dim_t i=0; i<numNeighbours; i++) {
1536 const dim_t start = offsetInShared[i];
1537 const dim_t end = offsetInShared[i+1];
1538 for (dim_t j = start; j < end; j++) {
1539 if (j > start)
1540 doublyLink(colIndices, rowIndices, sendShared[j-1], j);
1541 doublyLink(colIndices, rowIndices, sendShared[j], j);
1542 if (j < end-1)
1543 doublyLink(colIndices, rowIndices, sendShared[j+1], j);
1544 }
1545 }
1546 #pragma omp parallel for
1547 for (dim_t i = 0; i < numShared; i++) {
1548 sort(rowIndices[i].begin(), rowIndices[i].end());
1549 }
1550
1551 // create main and couple blocks
1552 paso::Pattern_ptr mainPattern = createPasoPattern(getConnections(), numDOF);
1553 paso::Pattern_ptr colPattern = createPasoPattern(colIndices, numShared);
1554 paso::Pattern_ptr rowPattern = createPasoPattern(rowIndices, numDOF);
1555
1556 // allocate paso distribution
1557 paso::Distribution_ptr distribution(new paso::Distribution(m_mpiInfo,
1558 const_cast<index_t*>(&m_nodeDistribution[0]), 1, 0));
1559
1560 // finally create the system matrix pattern
1561 m_pattern.reset(new paso::SystemMatrixPattern(MATRIX_FORMAT_DEFAULT,
1562 distribution, distribution, mainPattern, colPattern, rowPattern,
1563 m_connector, m_connector));
1564 return m_pattern;
1565 }
1566
1567 //private
1568 void Rectangle::populateSampleIds()
1569 {
1570 // degrees of freedom are numbered from left to right, bottom to top in
1571 // each rank, continuing on the next rank (ranks also go left-right,
1572 // bottom-top).
1573 // This means rank 0 has id 0...n0-1, rank 1 has id n0...n1-1 etc. which
1574 // helps when writing out data rank after rank.
1575
1576 // build node distribution vector first.
1577 // rank i owns m_nodeDistribution[i+1]-nodeDistribution[i] nodes which is
1578 // constant for all ranks in this implementation
1579 m_nodeDistribution.assign(m_mpiInfo->size+1, 0);
1580 const dim_t numDOF=getNumDOF();
1581 for (dim_t k=1; k<m_mpiInfo->size; k++) {
1582 m_nodeDistribution[k]=k*numDOF;
1583 }
1584 m_nodeDistribution[m_mpiInfo->size]=getNumDataPointsGlobal();
1585 try {
1586 m_nodeId.resize(getNumNodes());
1587 m_dofId.resize(numDOF);
1588 m_elementId.resize(getNumElements());
1589 } catch (const length_error& le) {
1590 throw RipleyException("The system does not have sufficient memory for a domain of this size.");
1591 }
1592
1593 // populate face element counts
1594 //left
1595 if (m_offset[0]==0)
1596 m_faceCount[0]=m_NE[1];
1597 else
1598 m_faceCount[0]=0;
1599 //right
1600 if (m_mpiInfo->rank%m_NX[0]==m_NX[0]-1)
1601 m_faceCount[1]=m_NE[1];
1602 else
1603 m_faceCount[1]=0;
1604 //bottom
1605 if (m_offset[1]==0)
1606 m_faceCount[2]=m_NE[0];
1607 else
1608 m_faceCount[2]=0;
1609 //top
1610 if (m_mpiInfo->rank/m_NX[0]==m_NX[1]-1)
1611 m_faceCount[3]=m_NE[0];
1612 else
1613 m_faceCount[3]=0;
1614
1615 const dim_t NFE = getNumFaceElements();
1616 m_faceId.resize(NFE);
1617
1618 const index_t left = (m_offset[0]==0 ? 0 : 1);
1619 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1620 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1621 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1622 const dim_t NE0 = m_NE[0];
1623 const dim_t NE1 = m_NE[1];
1624
1625 #define globalNodeId(x,y) \
1626 ((m_offset[0]+x)/nDOF0)*nDOF0*nDOF1+(m_offset[0]+x)%nDOF0 \
1627 + ((m_offset[1]+y)/nDOF1)*nDOF0*nDOF1*m_NX[0]+((m_offset[1]+y)%nDOF1)*nDOF0
1628
1629 // set corner id's outside the parallel region
1630 m_nodeId[0] = globalNodeId(0, 0);
1631 m_nodeId[m_NN[0]-1] = globalNodeId(m_NN[0]-1, 0);
1632 m_nodeId[m_NN[0]*(m_NN[1]-1)] = globalNodeId(0, m_NN[1]-1);
1633 m_nodeId[m_NN[0]*m_NN[1]-1] = globalNodeId(m_NN[0]-1,m_NN[1]-1);
1634 #undef globalNodeId
1635
1636 #pragma omp parallel
1637 {
1638 // populate degrees of freedom and own nodes (identical id)
1639 #pragma omp for nowait
1640 for (dim_t i=0; i<nDOF1; i++) {
1641 for (dim_t j=0; j<nDOF0; j++) {
1642 const index_t nodeIdx=j+left+(i+bottom)*m_NN[0];
1643 const index_t dofIdx=j+i*nDOF0;
1644 m_dofId[dofIdx] = m_nodeId[nodeIdx]
1645 = m_nodeDistribution[m_mpiInfo->rank]+dofIdx;
1646 }
1647 }
1648
1649 // populate the rest of the nodes (shared with other ranks)
1650 if (m_faceCount[0]==0) { // left column
1651 #pragma omp for nowait
1652 for (dim_t i=0; i<nDOF1; i++) {
1653 const index_t nodeIdx=(i+bottom)*m_NN[0];
1654 const index_t dofId=(i+1)*nDOF0-1;
1655 m_nodeId[nodeIdx]
1656 = m_nodeDistribution[m_mpiInfo->rank-1]+dofId;
1657 }
1658 }
1659 if (m_faceCount[1]==0) { // right column
1660 #pragma omp for nowait
1661 for (dim_t i=0; i<nDOF1; i++) {
1662 const index_t nodeIdx=(i+bottom+1)*m_NN[0]-1;
1663 const index_t dofId=i*nDOF0;
1664 m_nodeId[nodeIdx]
1665 = m_nodeDistribution[m_mpiInfo->rank+1]+dofId;
1666 }
1667 }
1668 if (m_faceCount[2]==0) { // bottom row
1669 #pragma omp for nowait
1670 for (dim_t i=0; i<nDOF0; i++) {
1671 const index_t nodeIdx=i+left;
1672 const index_t dofId=nDOF0*(nDOF1-1)+i;
1673 m_nodeId[nodeIdx]
1674 = m_nodeDistribution[m_mpiInfo->rank-m_NX[0]]+dofId;
1675 }
1676 }
1677 if (m_faceCount[3]==0) { // top row
1678 #pragma omp for nowait
1679 for (dim_t i=0; i<nDOF0; i++) {
1680 const index_t nodeIdx=m_NN[0]*(m_NN[1]-1)+i+left;
1681 const index_t dofId=i;
1682 m_nodeId[nodeIdx]
1683 = m_nodeDistribution[m_mpiInfo->rank+m_NX[0]]+dofId;
1684 }
1685 }
1686
1687 // populate element id's
1688 #pragma omp for nowait
1689 for (dim_t i1=0; i1<NE1; i1++) {
1690 for (dim_t i0=0; i0<NE0; i0++) {
1691 m_elementId[i0+i1*NE0]=(m_offset[1]+i1)*m_gNE[0]+m_offset[0]+i0;
1692 }
1693 }
1694
1695 // face elements
1696 #pragma omp for
1697 for (dim_t k=0; k<NFE; k++)
1698 m_faceId[k]=k;
1699 } // end parallel section
1700
1701 m_nodeTags.assign(getNumNodes(), 0);
1702 updateTagsInUse(Nodes);
1703
1704 m_elementTags.assign(getNumElements(), 0);
1705 updateTagsInUse(Elements);
1706
1707 // generate face offset vector and set face tags
1708 const index_t LEFT=1, RIGHT=2, BOTTOM=10, TOP=20;
1709 const index_t faceTag[] = { LEFT, RIGHT, BOTTOM, TOP };
1710 m_faceOffset.assign(4, -1);
1711 m_faceTags.clear();
1712 index_t offset=0;
1713 for (size_t i=0; i<4; i++) {
1714 if (m_faceCount[i]>0) {
1715 m_faceOffset[i]=offset;
1716 offset+=m_faceCount[i];
1717 m_faceTags.insert(m_faceTags.end(), m_faceCount[i], faceTag[i]);
1718 }
1719 }
1720 setTagMap("left", LEFT);
1721 setTagMap("right", RIGHT);
1722 setTagMap("bottom", BOTTOM);
1723 setTagMap("top", TOP);
1724 updateTagsInUse(FaceElements);
1725
1726 populateDofMap();
1727 }
1728
1729 //private
1730 vector<IndexVector> Rectangle::getConnections() const
1731 {
1732 // returns a vector v of size numDOF where v[i] is a vector with indices
1733 // of DOFs connected to i (up to 9 in 2D)
1734 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1735 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1736 const dim_t M = nDOF0*nDOF1;
1737 vector<IndexVector> indices(M);
1738
1739 #pragma omp parallel for
1740 for (index_t i=0; i < M; i++) {
1741 const index_t x = i % nDOF0;
1742 const index_t y = i / nDOF0;
1743 // loop through potential neighbours and add to index if positions are
1744 // within bounds
1745 for (dim_t i1=y-1; i1<y+2; i1++) {
1746 for (dim_t i0=x-1; i0<x+2; i0++) {
1747 if (i0>=0 && i1>=0 && i0<nDOF0 && i1<nDOF1) {
1748 indices[i].push_back(i1*nDOF0 + i0);
1749 }
1750 }
1751 }
1752 }
1753 return indices;
1754 }
1755
1756 //private
1757 void Rectangle::populateDofMap()
1758 {
1759 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1760 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1761 const index_t left = (m_offset[0]==0 ? 0 : 1);
1762 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1763
1764 // populate node->DOF mapping with own degrees of freedom.
1765 // The rest is assigned in the loop further down
1766 m_dofMap.assign(getNumNodes(), 0);
1767 #pragma omp parallel for
1768 for (index_t i=bottom; i<bottom+nDOF1; i++) {
1769 for (index_t j=left; j<left+nDOF0; j++) {
1770 m_dofMap[i*m_NN[0]+j]=(i-bottom)*nDOF0+j-left;
1771 }
1772 }
1773
1774 // build list of shared components and neighbours by looping through
1775 // all potential neighbouring ranks and checking if positions are
1776 // within bounds
1777 const dim_t numDOF=nDOF0*nDOF1;
1778 RankVector neighbour;
1779 IndexVector offsetInShared(1,0);
1780 IndexVector sendShared, recvShared;
1781 const int x=m_mpiInfo->rank%m_NX[0];
1782 const int y=m_mpiInfo->rank/m_NX[0];
1783 // numShared will contain the number of shared DOFs after the following
1784 // blocks
1785 dim_t numShared=0;
1786 // sharing bottom edge
1787 if (y > 0) {
1788 neighbour.push_back((y-1)*m_NX[0] + x);
1789 const dim_t num = nDOF0;
1790 offsetInShared.push_back(offsetInShared.back()+num);
1791 for (dim_t i=0; i<num; i++, numShared++) {
1792 sendShared.push_back(i);
1793 recvShared.push_back(numDOF+numShared);
1794 m_dofMap[left+i]=numDOF+numShared;
1795 }
1796 }
1797 // sharing top edge
1798 if (y < m_NX[1] - 1) {
1799 neighbour.push_back((y+1)*m_NX[0] + x);
1800 const dim_t num = nDOF0;
1801 offsetInShared.push_back(offsetInShared.back()+num);
1802 for (dim_t i=0; i<num; i++, numShared++) {
1803 sendShared.push_back(numDOF-num+i);
1804 recvShared.push_back(numDOF+numShared);
1805 m_dofMap[m_NN[0]*(m_NN[1]-1)+left+i]=numDOF+numShared;
1806 }
1807 }
1808 // sharing left edge
1809 if (x > 0) {
1810 neighbour.push_back(y*m_NX[0] + x-1);
1811 const dim_t num = nDOF1;
1812 offsetInShared.push_back(offsetInShared.back()+num);
1813 for (dim_t i=0; i<num; i++, numShared++) {
1814 sendShared.push_back(i*nDOF0);
1815 recvShared.push_back(numDOF+numShared);
1816 m_dofMap[(bottom+i)*m_NN[0]]=numDOF+numShared;
1817 }
1818 }
1819 // sharing right edge
1820 if (x < m_NX[0] - 1) {
1821 neighbour.push_back(y*m_NX[0] + x+1);
1822 const dim_t num = nDOF1;
1823 offsetInShared.push_back(offsetInShared.back()+num);
1824 for (dim_t i=0; i<num; i++, numShared++) {
1825 sendShared.push_back((i+1)*nDOF0-1);
1826 recvShared.push_back(numDOF+numShared);
1827 m_dofMap[(bottom+1+i)*m_NN[0]-1]=numDOF+numShared;
1828 }
1829 }
1830 // sharing bottom-left node
1831 if (x > 0 && y > 0) {
1832 neighbour.push_back((y-1)*m_NX[0] + x-1);
1833 // sharing a node
1834 offsetInShared.push_back(offsetInShared.back()+1);
1835 sendShared.push_back(0);
1836 recvShared.push_back(numDOF+numShared);
1837 m_dofMap[0]=numDOF+numShared;
1838 ++numShared;
1839 }
1840 // sharing top-left node
1841 if (x > 0 && y < m_NX[1]-1) {
1842 neighbour.push_back((y+1)*m_NX[0] + x-1);
1843 offsetInShared.push_back(offsetInShared.back()+1);
1844 sendShared.push_back(numDOF-nDOF0);
1845 recvShared.push_back(numDOF+numShared);
1846 m_dofMap[m_NN[0]*(m_NN[1]-1)]=numDOF+numShared;
1847 ++numShared;
1848 }
1849 // sharing bottom-right node
1850 if (x < m_NX[0]-1 && y > 0) {
1851 neighbour.push_back((y-1)*m_NX[0] + x+1);
1852 offsetInShared.push_back(offsetInShared.back()+1);
1853 sendShared.push_back(nDOF0-1);
1854 recvShared.push_back(numDOF+numShared);
1855 m_dofMap[m_NN[0]-1]=numDOF+numShared;
1856 ++numShared;
1857 }
1858 // sharing top-right node
1859 if (x < m_NX[0]-1 && y < m_NX[1]-1) {
1860 neighbour.push_back((y+1)*m_NX[0] + x+1);
1861 offsetInShared.push_back(offsetInShared.back()+1);
1862 sendShared.push_back(numDOF-1);
1863 recvShared.push_back(numDOF+numShared);
1864 m_dofMap[m_NN[0]*m_NN[1]-1]=numDOF+numShared;
1865 ++numShared;
1866 }
1867
1868 // TODO: paso::SharedComponents should take vectors to avoid this
1869 Esys_MPI_rank* neighPtr = NULL;
1870 index_t* sendPtr = NULL;
1871 index_t* recvPtr = NULL;
1872 if (neighbour.size() > 0) {
1873 neighPtr = &neighbour[0];
1874 sendPtr = &sendShared[0];
1875 recvPtr = &recvShared[0];
1876 }
1877
1878 // create connector
1879 paso::SharedComponents_ptr snd_shcomp(new paso::SharedComponents(
1880 numDOF, neighbour.size(), neighPtr, sendPtr,
1881 &offsetInShared[0], 1, 0, m_mpiInfo));
1882 paso::SharedComponents_ptr rcv_shcomp(new paso::SharedComponents(
1883 numDOF, neighbour.size(), neighPtr, recvPtr,
1884 &offsetInShared[0], 1, 0, m_mpiInfo));
1885 m_connector.reset(new paso::Connector(snd_shcomp, rcv_shcomp));
1886
1887 // useful debug output
1888 /*
1889 cout << "--- rcv_shcomp ---" << endl;
1890 cout << "numDOF=" << numDOF << ", numNeighbors=" << neighbour.size() << endl;
1891 for (size_t i=0; i<neighbour.size(); i++) {
1892 cout << "neighbor[" << i << "]=" << neighbour[i]
1893 << " offsetInShared[" << i+1 << "]=" << offsetInShared[i+1] << endl;
1894 }
1895 for (size_t i=0; i<recvShared.size(); i++) {
1896 cout << "shared[" << i << "]=" << recvShared[i] << endl;
1897 }
1898 cout << "--- snd_shcomp ---" << endl;
1899 for (size_t i=0; i<sendShared.size(); i++) {
1900 cout << "shared[" << i << "]=" << sendShared[i] << endl;
1901 }
1902 cout << "--- dofMap ---" << endl;
1903 for (size_t i=0; i<m_dofMap.size(); i++) {
1904 cout << "m_dofMap[" << i << "]=" << m_dofMap[i] << endl;
1905 }
1906 */
1907 }
1908
1909 //private
1910 void Rectangle::addToMatrixAndRHS(AbstractSystemMatrix* S, escript::Data& F,
1911 const vector<double>& EM_S, const vector<double>& EM_F, bool addS,
1912 bool addF, index_t firstNode, dim_t nEq, dim_t nComp) const
1913 {
1914 IndexVector rowIndex(4);
1915 rowIndex[0] = m_dofMap[firstNode];
1916 rowIndex[1] = m_dofMap[firstNode+1];
1917 rowIndex[2] = m_dofMap[firstNode+m_NN[0]];
1918 rowIndex[3] = m_dofMap[firstNode+m_NN[0]+1];
1919 if (addF) {
1920 double *F_p=F.getSampleDataRW(0);
1921 for (index_t i=0; i<rowIndex.size(); i++) {
1922 if (rowIndex[i]<getNumDOF()) {
1923 for (index_t eq=0; eq<nEq; eq++) {
1924 F_p[INDEX2(eq, rowIndex[i], nEq)]+=EM_F[INDEX2(eq,i,nEq)];
1925 }
1926 }
1927 }
1928 }
1929 if (addS) {
1930 addToSystemMatrix(S, rowIndex, nEq, EM_S);
1931 }
1932 }
1933
1934 //protected
1935 void Rectangle::interpolateNodesOnElements(escript::Data& out,
1936 const escript::Data& in,
1937 bool reduced) const
1938 {
1939 const dim_t numComp = in.getDataPointSize();
1940 const dim_t NE0 = m_NE[0];
1941 const dim_t NE1 = m_NE[1];
1942 if (reduced) {
1943 out.requireWrite();
1944 const double c0 = 0.25;
1945 #pragma omp parallel
1946 {
1947 vector<double> f_00(numComp);
1948 vector<double> f_01(numComp);
1949 vector<double> f_10(numComp);
1950 vector<double> f_11(numComp);
1951 #pragma omp for
1952 for (index_t k1=0; k1 < NE1; ++k1) {
1953 for (index_t k0=0; k0 < NE0; ++k0) {
1954 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
1955 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
1956 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
1957 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
1958 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
1959 for (index_t i=0; i < numComp; ++i) {
1960 o[INDEX2(i,numComp,0)] = c0*(f_00[i] + f_01[i] + f_10[i] + f_11[i]);
1961 } /* end of component loop i */
1962 } /* end of k0 loop */
1963 } /* end of k1 loop */
1964 } /* end of parallel section */
1965 } else {
1966 out.requireWrite();
1967 const double c0 = 0.16666666666666666667;
1968 const double c1 = 0.044658198738520451079;
1969 const double c2 = 0.62200846792814621559;
1970 #pragma omp parallel
1971 {
1972 vector<double> f_00(numComp);
1973 vector<double> f_01(numComp);
1974 vector<double> f_10(numComp);
1975 vector<double> f_11(numComp);
1976 #pragma omp for
1977 for (index_t k1=0; k1 < NE1; ++k1) {
1978 for (index_t k0=0; k0 < NE0; ++k0) {
1979 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
1980 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
1981 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
1982 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
1983 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
1984 for (index_t i=0; i < numComp; ++i) {
1985 o[INDEX2(i,numComp,0)] = c0*(f_01[i] + f_10[i]) + c1*f_11[i] + c2*f_00[i];
1986 o[INDEX2(i,numComp,1)] = c0*(f_00[i] + f_11[i]) + c1*f_01[i] + c2*f_10[i];
1987 o[INDEX2(i,numComp,2)] = c0*(f_00[i] + f_11[i]) + c1*f_10[i] + c2*f_01[i];
1988 o[INDEX2(i,numComp,3)] = c0*(f_01[i] + f_10[i]) + c1*f_00[i] + c2*f_11[i];
1989 } /* end of component loop i */
1990 } /* end of k0 loop */
1991 } /* end of k1 loop */
1992 } /* end of parallel section */
1993 }
1994 }
1995
1996 //protected
1997 void Rectangle::interpolateNodesOnFaces(escript::Data& out,
1998 const escript::Data& in,
1999 bool reduced) const
2000 {
2001 const dim_t numComp = in.getDataPointSize();
2002 const dim_t NE0 = m_NE[0];
2003 const dim_t NE1 = m_NE[1];
2004 if (reduced) {
2005 out.requireWrite();
2006 #pragma omp parallel
2007 {
2008 vector<double> f_00(numComp);
2009 vector<double> f_01(numComp);
2010 vector<double> f_10(numComp);
2011 vector<double> f_11(numComp);
2012 if (m_faceOffset[0] > -1) {
2013 #pragma omp for nowait
2014 for (index_t k1=0; k1 < NE1; ++k1) {
2015 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
2016 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
2017 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
2018 for (index_t i=0; i < numComp; ++i) {
2019 o[INDEX2(i,numComp,0)] = (f_00[i] + f_01[i])/2;
2020 } /* end of component loop i */
2021 } /* end of k1 loop */
2022 } /* end of face 0 */
2023 if (m_faceOffset[1] > -1) {
2024 #pragma omp for nowait
2025 for (index_t k1=0; k1 < NE1; ++k1) {
2026 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
2027 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
2028 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
2029 for (index_t i=0; i < numComp; ++i) {
2030 o[INDEX2(i,numComp,0)] = (f_10[i] + f_11[i])/2;
2031 } /* end of component loop i */
2032 } /* end of k1 loop */
2033 } /* end of face 1 */
2034 if (m_faceOffset[2] > -1) {
2035 #pragma omp for nowait
2036 for (index_t k0=0; k0 < NE0; ++k0) {
2037 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
2038 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
2039 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
2040 for (index_t i=0; i < numComp; ++i) {
2041 o[INDEX2(i,numComp,0)] = (f_00[i] + f_10[i])/2;
2042 } /* end of component loop i */
2043 } /* end of k0 loop */
2044 } /* end of face 2 */
2045 if (m_faceOffset[3] > -1) {
2046 #pragma omp for nowait
2047 for (index_t k0=0; k0 < NE0; ++k0) {
2048 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
2049 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
2050 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
2051 for (index_t i=0; i < numComp; ++i) {
2052 o[INDEX2(i,numComp,0)] = (f_01[i] + f_11[i])/2;
2053 } /* end of component loop i */
2054 } /* end of k0 loop */
2055 } /* end of face 3 */
2056 } /* end of parallel section */
2057 } else {
2058 out.requireWrite();
2059 const double c0 = 0.21132486540518711775;
2060 const double c1 = 0.78867513459481288225;
2061 #pragma omp parallel
2062 {
2063 vector<double> f_00(numComp);
2064 vector<double> f_01(numComp);
2065 vector<double> f_10(numComp);
2066 vector<double> f_11(numComp);
2067 if (m_faceOffset[0] > -1) {
2068 #pragma omp for nowait
2069 for (index_t k1=0; k1 < NE1; ++k1) {
2070 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
2071 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
2072 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
2073 for (index_t i=0; i < numComp; ++i) {
2074 o[INDEX2(i,numComp,0)] = c0*f_01[i] + c1*f_00[i];
2075 o[INDEX2(i,numComp,1)] = c0*f_00[i] + c1*f_01[i];
2076 } /* end of component loop i */
2077 } /* end of k1 loop */
2078 } /* end of face 0 */
2079 if (m_faceOffset[1] > -1) {
2080 #pragma omp for nowait
2081 for (index_t k1=0; k1 < NE1; ++k1) {
2082 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
2083 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
2084 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
2085 for (index_t i=0; i < numComp; ++i) {
2086 o[INDEX2(i,numComp,0)] = c1*f_10[i] + c0*f_11[i];
2087 o[INDEX2(i,numComp,1)] = c1*f_11[i] + c0*f_10[i];
2088 } /* end of component loop i */
2089 } /* end of k1 loop */
2090 } /* end of face 1 */
2091 if (m_faceOffset[2] > -1) {
2092 #pragma omp for nowait
2093 for (index_t k0=0; k0 < NE0; ++k0) {
2094 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
2095 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
2096 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
2097 for (index_t i=0; i < numComp; ++i) {
2098 o[INDEX2(i,numComp,0)] = c0*f_10[i] + c1*f_00[i];
2099 o[INDEX2(i,numComp,1)] = c0*f_00[i] + c1*f_10[i];
2100 } /* end of component loop i */
2101 } /* end of k0 loop */
2102 } /* end of face 2 */
2103 if (m_faceOffset[3] > -1) {
2104 #pragma omp for nowait
2105 for (index_t k0=0; k0 < NE0; ++k0) {
2106 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
2107 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
2108 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
2109 for (index_t i=0; i < numComp; ++i) {
2110 o[INDEX2(i,numComp,0)] = c0*f_11[i] + c1*f_01[i];
2111 o[INDEX2(i,numComp,1)] = c0*f_01[i] + c1*f_11[i];
2112 } /* end of component loop i */
2113 } /* end of k0 loop */
2114 } /* end of face 3 */
2115 } /* end of parallel section */
2116 }
2117 }
2118
2119
2120
2121 namespace
2122 {
2123 // Calculates a Gaussian blur convolution matrix for 2D
2124 // See wiki article on the subject
2125 double* get2DGauss(unsigned radius, double sigma)
2126 {
2127 double* arr = new double[(radius*2+1)*(radius*2+1)];
2128 const double common = M_1_PI * 0.5 / (sigma*sigma);
2129 const int r = static_cast<int>(radius);
2130 double total = 0;
2131 for (int y = -r; y <= r; ++y) {
2132 for (int x = -r; x <= r; ++x) {
2133 arr[(x+r)+(y+r)*(r*2+1)]=common*exp(-(x*x+y*y)/(2*sigma*sigma));
2134 total+=arr[(x+r)+(y+r)*(r*2+1)];
2135 }
2136 }
2137 const double invtotal = 1/total;
2138 for (size_t p=0; p<(radius*2+1)*(radius*2+1); ++p) {
2139 arr[p] *= invtotal;
2140 }
2141 return arr;
2142 }
2143
2144 // applies conv to source to get a point.
2145 // (xp, yp) are the coords in the source matrix not the destination matrix
2146 double Convolve2D(double* conv, double* source, size_t xp, size_t yp,
2147 unsigned radius, size_t width)
2148 {
2149 const size_t bx = xp-radius, by=yp-radius;
2150 const size_t sbase = bx+by*width;
2151 double result = 0;
2152 for (int y=0; y<2*radius+1; ++y) {
2153 for (int x=0; x<2*radius+1; ++x) {
2154 result += conv[x+y*(2*radius+1)] * source[sbase + x+y*width];
2155 }
2156 }
2157 return result;
2158 }
2159 }
2160
2161 /* This is a wrapper for filtered (and non-filtered) randoms
2162 * For detailed doco see randomFillWorker
2163 */
2164 escript::Data Rectangle::randomFill(const escript::DataTypes::ShapeType& shape,
2165 const escript::FunctionSpace& what, long seed,
2166 const bp::tuple& filter) const
2167 {
2168 int numvals=escript::DataTypes::noValues(shape);
2169 if (len(filter) > 0 && numvals != 1)
2170 throw RipleyException("Ripley only supports filters for scalar data.");
2171
2172 escript::Data res = randomFillWorker(shape, seed, filter);
2173 if (res.getFunctionSpace() != what) {
2174 escript::Data r(res, what);
2175 return r;
2176 }
2177 return res;
2178 }
2179
2180
2181 /* This routine produces a Data object filled with smoothed random data.
2182 * The dimensions of the rectangle being filled are internal[0] x internal[1]
2183 * points. A parameter radius gives the size of the stencil used for the
2184 * smoothing. A point on the left hand edge for example, will still require
2185 * `radius` extra points to the left in order to complete the stencil.
2186 *
2187 * All local calculation is done on an array called `src`, with
2188 * dimensions = ext[0] * ext[1], where ext[i]= internal[i]+2*radius.
2189 *
2190 * Now for MPI there is overlap to deal with. We need to share both the
2191 * overlapping values themselves but also the external region.
2192 *
2193 * In a hypothetical 1-D case:
2194 *
2195 * 1234567 would be split into two ranks thus:
2196 * 123(4) (4)567 [4 being a shared element]
2197 *
2198 * If the radius is 2. There will be padding elements on the outside:
2199 * pp123(4) (4)567pp
2200 *
2201 * To ensure that 4 can be correctly computed on both ranks, values from the
2202 * other rank need to be known.
2203 *
2204 * pp123(4)56 23(4)567pp
2205 *
2206 * Now in our case, we set all the values 23456 on the left rank and send them
2207 * to the right hand rank.
2208 *
2209 * So the edges _may_ need to be shared at a distance `inset` from all
2210 * boundaries.
2211 *
2212 * inset=2*radius+1
2213 * This is to ensure that values at distance `radius` from the
2214 * shared/overlapped element that ripley has.
2215 */
2216 escript::Data Rectangle::randomFillWorker(
2217 const escript::DataTypes::ShapeType& shape, long seed,
2218 const bp::tuple& filter) const
2219 {
2220 unsigned int radius=0; // these are only used by gaussian
2221 double sigma=0.5;
2222
2223 unsigned int numvals=escript::DataTypes::noValues(shape);
2224
2225 if (len(filter) == 0) {
2226 // nothing special required here yet
2227 } else if (len(filter) == 3) {
2228 bp::extract<string> ex(filter[0]);
2229 if (!ex.check() || (ex()!="gaussian")) {
2230 throw RipleyException("Unsupported random filter");
2231 }
2232 bp::extract<unsigned int> ex1(filter[1]);
2233 if (!ex1.check()) {
2234 throw RipleyException("Radius of Gaussian filter must be a positive integer.");
2235 }
2236 radius = ex1();
2237 sigma = 0.5;
2238 bp::extract<double> ex2(filter[2]);
2239 if (!ex2.check() || (sigma=ex2()) <= 0) {
2240 throw RipleyException("Sigma must be a positive floating point number.");
2241 }
2242 } else {
2243 throw RipleyException("Unsupported random filter for Rectangle.");
2244 }
2245
2246 // number of points in the internal region
2247 // that is, the ones we need smoothed versions of
2248 const dim_t internal[2] = { m_NN[0], m_NN[1] };
2249 size_t ext[2];
2250 ext[0]=(size_t)internal[0]+2*radius; // includes points we need as input
2251 ext[1]=(size_t)internal[1]+2*radius; // for smoothing
2252
2253 // now we check to see if the radius is acceptable
2254 // That is, would not cross multiple ranks in MPI
2255
2256 if (2*radius >= internal[0]-4) {
2257 throw RipleyException("Radius of gaussian filter is too large for X dimension of a rank");
2258 }
2259 if (2*radius >= internal[1]-4) {
2260 throw RipleyException("Radius of gaussian filter is too large for Y dimension of a rank");
2261 }
2262
2263 double* src = new double[ext[0]*ext[1]*numvals];
2264 esysUtils::randomFillArray(seed, src, ext[0]*ext[1]*numvals);
2265
2266 #ifdef ESYS_MPI
2267 if ((internal[0] < 5) || (internal[1] < 5)) {
2268 // since the dimensions are equal for all ranks, this exception
2269 // will be thrown on all ranks
2270 throw RipleyException("Random Data in Ripley requires at least five elements per side per rank.");
2271 }
2272 dim_t X = m_mpiInfo->rank%m_NX[0];
2273 dim_t Y = m_mpiInfo->rank/m_NX[0];
2274 #endif
2275
2276 /*
2277 // if we wanted to test a repeating pattern
2278 size_t basex=0;
2279 size_t basey=0;
2280 #ifdef ESYS_MPI
2281 basex=X*m_gNE[0]/m_NX[0];
2282 basey=Y*m_gNE[1]/m_NX[1];
2283 #endif
2284
2285 esysUtils::patternFillArray2D(ext[0], ext[1], src, 4, basex, basey, numvals);
2286 */
2287
2288 #ifdef ESYS_MPI
2289 BlockGrid2 grid(m_NX[0]-1, m_NX[1]-1);
2290 // it's +2 not +1 because a whole element is shared (and hence there is
2291 // an overlap of two points both of which need to have "radius" points on
2292 // either side.
2293 size_t inset=2*radius+2;
2294
2295 // how wide is the x-dimension between the two insets
2296 size_t xmidlen=ext[0]-2*inset;
2297 size_t ymidlen=ext[1]-2*inset;
2298
2299 Block2 block(ext[0], ext[1], inset, xmidlen, ymidlen, numvals);
2300
2301 // a non-tight upper bound on how many we need
2302 MPI_Request reqs[40];
2303 MPI_Status stats[40];
2304 short rused=0;
2305
2306 messvec incoms;
2307 messvec outcoms;
2308
2309 grid.generateInNeighbours(X, Y, incoms);
2310 grid.generateOutNeighbours(X, Y, outcoms);
2311
2312 block.copyAllToBuffer(src);
2313
2314 int comserr = 0;
2315 for (size_t i=0; i < incoms.size(); ++i) {
2316 message& m = incoms[i];
2317 comserr |= MPI_Irecv(block.getInBuffer(m.destbuffid),
2318 block.getBuffSize(m.destbuffid), MPI_DOUBLE,
2319 m.sourceID, m.tag, m_mpiInfo->comm,
2320 reqs+(rused++));
2321 block.setUsed(m.destbuffid);
2322 }
2323
2324 for (size_t i=0; i < outcoms.size(); ++i) {
2325 message& m = outcoms[i];
2326 comserr |= MPI_Isend(block.getOutBuffer(m.srcbuffid),
2327 block.getBuffSize(m.srcbuffid), MPI_DOUBLE,
2328 m.destID, m.tag, m_mpiInfo->comm, reqs+(rused++));
2329 }
2330
2331 if (!comserr) {
2332 comserr = MPI_Waitall(rused, reqs, stats);
2333 }
2334
2335 if (comserr) {
2336 // Yes this is throwing an exception as a result of an MPI error
2337 // and no we don't inform the other ranks that we are doing this.
2338 // However, we have no reason to believe coms work at this point anyway
2339 throw RipleyException("Error in coms for randomFill");
2340 }
2341
2342 block.copyUsedFromBuffer(src);
2343 #endif
2344
2345 // the truth of either should imply the truth of the other but let's be safe
2346 if (radius==0 || numvals > 1) {
2347 escript::FunctionSpace fs(getPtr(), getContinuousFunctionCode());
2348 escript::Data resdat(0, shape, fs, true);
2349 // don't need to check for exwrite because we just made it
2350 escript::DataVector& dv = resdat.getExpandedVectorReference();
2351
2352 // now we need to copy values over
2353 for (size_t y=0; y < internal[1]; ++y) {
2354 for (size_t x=0; x < internal[0]; ++x) {
2355 for (unsigned int i=0; i < numvals; ++i) {
2356 dv[i+(x+y*(internal[0]))*numvals]=src[i+(x+y*ext[0])*numvals];
2357 }
2358 }
2359 }
2360 delete[] src;
2361 return resdat;
2362 } else { // filter enabled
2363 escript::FunctionSpace fs(getPtr(), getContinuousFunctionCode());
2364 escript::Data resdat(0, escript::DataTypes::scalarShape, fs, true);
2365 // don't need to check for exwrite because we just made it
2366 escript::DataVector& dv=resdat.getExpandedVectorReference();
2367 double* convolution=get2DGauss(radius, sigma);
2368 for (size_t y=0; y < internal[1]; ++y) {
2369 for (size_t x=0; x < internal[0]; ++x) {
2370 dv[x+y*(internal[0])] = Convolve2D(convolution, src, x+radius, y+radius, radius, ext[0]);
2371 }
2372 }
2373 delete[] convolution;
2374 delete[] src;
2375 return resdat;
2376 }
2377 }
2378
2379 dim_t Rectangle::findNode(const double *coords) const
2380 {
2381 const dim_t NOT_MINE = -1;
2382 //is the found element even owned by this rank
2383 // (inside owned or shared elements but will map to an owned element)
2384 for (int dim = 0; dim < m_numDim; dim++) {
2385 double min = m_origin[dim] + m_offset[dim]* m_dx[dim]
2386 - m_dx[dim]/2.; //allows for point outside mapping onto node
2387 double max = m_origin[dim] + (m_offset[dim] + m_NE[dim])*m_dx[dim]
2388 + m_dx[dim]/2.;
2389 if (min > coords[dim] || max < coords[dim]) {
2390 return NOT_MINE;
2391 }
2392 }
2393 // get distance from origin
2394 double x = coords[0] - m_origin[0];
2395 double y = coords[1] - m_origin[1];
2396
2397 //check if the point is even inside the domain
2398 if (x < 0 || y < 0 || x > m_length[0] || y > m_length[1])
2399 return NOT_MINE;
2400
2401 // distance in elements
2402 dim_t ex = (dim_t) floor(x / m_dx[0] + 0.01*m_dx[0]);
2403 dim_t ey = (dim_t) floor(y / m_dx[1] + 0.01*m_dx[1]);
2404 // set the min distance high enough to be outside the element plus a bit
2405 dim_t closest = NOT_MINE;
2406 double minDist = 1;
2407 for (int dim = 0; dim < m_numDim; dim++) {
2408 minDist += m_dx[dim]*m_dx[dim];
2409 }
2410 //find the closest node
2411 for (int dx = 0; dx < 1; dx++) {
2412 double xdist = (x - (ex + dx)*m_dx[0]);
2413 for (int dy = 0; dy < 1; dy++) {
2414 double ydist