/[escript]/trunk/ripley/src/Rectangle.cpp
ViewVC logotype

Contents of /trunk/ripley/src/Rectangle.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4615 - (show annotations)
Mon Jan 13 05:05:33 2014 UTC (5 years, 9 months ago) by caltinay
File size: 209474 byte(s)
Step 1 for #31: wrap parameters in an object.

1
2 /*****************************************************************************
3 *
4 * Copyright (c) 2003-2013 by University of Queensland
5 * http://www.uq.edu.au
6 *
7 * Primary Business: Queensland, Australia
8 * Licensed under the Open Software License version 3.0
9 * http://www.opensource.org/licenses/osl-3.0.php
10 *
11 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
12 * Development since 2012 by School of Earth Sciences
13 *
14 *****************************************************************************/
15
16 #include <ripley/Rectangle.h>
17 #include <paso/SystemMatrix.h>
18 #include <esysUtils/esysFileWriter.h>
19
20 #include <boost/scoped_array.hpp>
21 #include "esysUtils/EsysRandom.h"
22
23 #ifdef USE_NETCDF
24 #include <netcdfcpp.h>
25 #endif
26
27 #if USE_SILO
28 #include <silo.h>
29 #ifdef ESYS_MPI
30 #include <pmpio.h>
31 #endif
32 #endif
33
34 #include <iomanip>
35
36 using namespace std;
37 using esysUtils::FileWriter;
38
39 namespace ripley {
40
41 Rectangle::Rectangle(int n0, int n1, double x0, double y0, double x1,
42 double y1, int d0, int d1) :
43 RipleyDomain(2)
44 {
45 // ignore subdivision parameters for serial run
46 if (m_mpiInfo->size == 1) {
47 d0=1;
48 d1=1;
49 }
50
51 bool warn=false;
52 // if number of subdivisions is non-positive, try to subdivide by the same
53 // ratio as the number of elements
54 if (d0<=0 && d1<=0) {
55 warn=true;
56 d0=max(1, (int)(sqrt(m_mpiInfo->size*(n0+1)/(float)(n1+1))));
57 d1=m_mpiInfo->size/d0;
58 if (d0*d1 != m_mpiInfo->size) {
59 // ratios not the same so subdivide side with more elements only
60 if (n0>n1) {
61 d0=0;
62 d1=1;
63 } else {
64 d0=1;
65 d1=0;
66 }
67 }
68 }
69 if (d0<=0) {
70 // d1 is preset, determine d0 - throw further down if result is no good
71 d0=m_mpiInfo->size/d1;
72 } else if (d1<=0) {
73 // d0 is preset, determine d1 - throw further down if result is no good
74 d1=m_mpiInfo->size/d0;
75 }
76
77 // ensure number of subdivisions is valid and nodes can be distributed
78 // among number of ranks
79 if (d0*d1 != m_mpiInfo->size)
80 throw RipleyException("Invalid number of spatial subdivisions");
81
82 if (warn) {
83 cout << "Warning: Automatic domain subdivision (d0=" << d0 << ", d1="
84 << d1 << "). This may not be optimal!" << endl;
85 }
86
87 double l0 = x1-x0;
88 double l1 = y1-y0;
89 m_dx[0] = l0/n0;
90 m_dx[1] = l1/n1;
91
92 if ((n0+1)%d0 > 0) {
93 n0=(int)round((float)(n0+1)/d0+0.5)*d0-1;
94 l0=m_dx[0]*n0;
95 cout << "Warning: Adjusted number of elements and length. N0="
96 << n0 << ", l0=" << l0 << endl;
97 }
98 if ((n1+1)%d1 > 0) {
99 n1=(int)round((float)(n1+1)/d1+0.5)*d1-1;
100 l1=m_dx[1]*n1;
101 cout << "Warning: Adjusted number of elements and length. N1="
102 << n1 << ", l1=" << l1 << endl;
103 }
104
105 if ((d0 > 1 && (n0+1)/d0<2) || (d1 > 1 && (n1+1)/d1<2))
106 throw RipleyException("Too few elements for the number of ranks");
107
108 m_gNE[0] = n0;
109 m_gNE[1] = n1;
110 m_origin[0] = x0;
111 m_origin[1] = y0;
112 m_length[0] = l0;
113 m_length[1] = l1;
114 m_NX[0] = d0;
115 m_NX[1] = d1;
116
117 // local number of elements (with and without overlap)
118 m_NE[0] = m_ownNE[0] = (d0>1 ? (n0+1)/d0 : n0);
119 if (m_mpiInfo->rank%d0>0 && m_mpiInfo->rank%d0<d0-1)
120 m_NE[0]++;
121 else if (d0>1 && m_mpiInfo->rank%d0==d0-1)
122 m_ownNE[0]--;
123
124 m_NE[1] = m_ownNE[1] = (d1>1 ? (n1+1)/d1 : n1);
125 if (m_mpiInfo->rank/d0>0 && m_mpiInfo->rank/d0<d1-1)
126 m_NE[1]++;
127 else if (d1>1 && m_mpiInfo->rank/d0==d1-1)
128 m_ownNE[1]--;
129
130 // local number of nodes
131 m_NN[0] = m_NE[0]+1;
132 m_NN[1] = m_NE[1]+1;
133
134 // bottom-left node is at (offset0,offset1) in global mesh
135 m_offset[0] = (n0+1)/d0*(m_mpiInfo->rank%d0);
136 if (m_offset[0] > 0)
137 m_offset[0]--;
138 m_offset[1] = (n1+1)/d1*(m_mpiInfo->rank/d0);
139 if (m_offset[1] > 0)
140 m_offset[1]--;
141
142 populateSampleIds();
143 createPattern();
144 }
145
146 Rectangle::~Rectangle()
147 {
148 Paso_SystemMatrixPattern_free(m_pattern);
149 Paso_Connector_free(m_connector);
150 }
151
152 string Rectangle::getDescription() const
153 {
154 return "ripley::Rectangle";
155 }
156
157 bool Rectangle::operator==(const AbstractDomain& other) const
158 {
159 const Rectangle* o=dynamic_cast<const Rectangle*>(&other);
160 if (o) {
161 return (RipleyDomain::operator==(other) &&
162 m_gNE[0]==o->m_gNE[0] && m_gNE[1]==o->m_gNE[1]
163 && m_origin[0]==o->m_origin[0] && m_origin[1]==o->m_origin[1]
164 && m_length[0]==o->m_length[0] && m_length[1]==o->m_length[1]
165 && m_NX[0]==o->m_NX[0] && m_NX[1]==o->m_NX[1]);
166 }
167
168 return false;
169 }
170
171 void Rectangle::readNcGrid(escript::Data& out, string filename, string varname,
172 const GridParameters& params) const
173 {
174 #ifdef USE_NETCDF
175 // check destination function space
176 int myN0, myN1;
177 if (out.getFunctionSpace().getTypeCode() == Nodes) {
178 myN0 = m_NN[0];
179 myN1 = m_NN[1];
180 } else if (out.getFunctionSpace().getTypeCode() == Elements ||
181 out.getFunctionSpace().getTypeCode() == ReducedElements) {
182 myN0 = m_NE[0];
183 myN1 = m_NE[1];
184 } else
185 throw RipleyException("readNcGrid(): invalid function space for output data object");
186
187 if (params.first.size() != 2)
188 throw RipleyException("readNcGrid(): argument 'first' must have 2 entries");
189
190 if (params.numValues.size() != 2)
191 throw RipleyException("readNcGrid(): argument 'numValues' must have 2 entries");
192
193 if (params.multiplier.size() != 2)
194 throw RipleyException("readNcGrid(): argument 'multiplier' must have 2 entries");
195 for (size_t i=0; i<params.multiplier.size(); i++)
196 if (params.multiplier[i]<1)
197 throw RipleyException("readNcGrid(): all multipliers must be positive");
198
199 // check file existence and size
200 NcFile f(filename.c_str(), NcFile::ReadOnly);
201 if (!f.is_valid())
202 throw RipleyException("readNcGrid(): cannot open file");
203
204 NcVar* var = f.get_var(varname.c_str());
205 if (!var)
206 throw RipleyException("readNcGrid(): invalid variable");
207
208 // TODO: rank>0 data support
209 const int numComp = out.getDataPointSize();
210 if (numComp > 1)
211 throw RipleyException("readNcGrid(): only scalar data supported");
212
213 const int dims = var->num_dims();
214 boost::scoped_array<long> edges(var->edges());
215
216 // is this a slice of the data object (dims!=2)?
217 // note the expected ordering of edges (as in numpy: y,x)
218 if ( (dims==2 && (params.numValues[1] > edges[0] || params.numValues[0] > edges[1]))
219 || (dims==1 && params.numValues[1]>1) ) {
220 throw RipleyException("readNcGrid(): not enough data in file");
221 }
222
223 // check if this rank contributes anything
224 if (params.first[0] >= m_offset[0]+myN0 ||
225 params.first[0]+params.numValues[0]*params.multiplier[0] <= m_offset[0] ||
226 params.first[1] >= m_offset[1]+myN1 ||
227 params.first[1]+params.numValues[1]*params.multiplier[1] <= m_offset[1])
228 return;
229
230 // now determine how much this rank has to write
231
232 // first coordinates in data object to write to
233 const int first0 = max(0, params.first[0]-m_offset[0]);
234 const int first1 = max(0, params.first[1]-m_offset[1]);
235 // indices to first value in file
236 const int idx0 = max(0, m_offset[0]-params.first[0]);
237 const int idx1 = max(0, m_offset[1]-params.first[1]);
238 // number of values to read
239 const int num0 = min(params.numValues[0]-idx0, myN0-first0);
240 const int num1 = min(params.numValues[1]-idx1, myN1-first1);
241
242 vector<double> values(num0*num1);
243 if (dims==2) {
244 var->set_cur(idx1, idx0);
245 var->get(&values[0], num1, num0);
246 } else {
247 var->set_cur(idx0);
248 var->get(&values[0], num0);
249 }
250
251 const int dpp = out.getNumDataPointsPerSample();
252 out.requireWrite();
253
254 for (index_t y=0; y<num1; y++) {
255 #pragma omp parallel for
256 for (index_t x=0; x<num0; x++) {
257 const int baseIndex = first0+x*params.multiplier[0]
258 +(first1+y*params.multiplier[1])*myN0;
259 const int srcIndex = y*num0+x;
260 if (!isnan(values[srcIndex])) {
261 for (index_t m1=0; m1<params.multiplier[1]; m1++) {
262 for (index_t m0=0; m0<params.multiplier[0]; m0++) {
263 const int dataIndex = baseIndex+m0+m1*myN0;
264 double* dest = out.getSampleDataRW(dataIndex);
265 for (index_t q=0; q<dpp; q++) {
266 *dest++ = values[srcIndex];
267 }
268 }
269 }
270 }
271 }
272 }
273 #else
274 throw RipleyException("readNcGrid(): not compiled with netCDF support");
275 #endif
276 }
277
278 void Rectangle::readBinaryGrid(escript::Data& out, string filename,
279 const GridParameters& params) const
280 {
281 // the mapping is not universally correct but should work on our
282 // supported platforms
283 switch (params.dataType) {
284 case DATATYPE_INT32:
285 readBinaryGridImpl<int>(out, filename, params);
286 break;
287 case DATATYPE_FLOAT32:
288 readBinaryGridImpl<float>(out, filename, params);
289 break;
290 case DATATYPE_FLOAT64:
291 readBinaryGridImpl<double>(out, filename, params);
292 break;
293 default:
294 throw RipleyException("readBinaryGrid(): invalid or unsupported datatype");
295 }
296 }
297
298 template<typename ValueType>
299 void Rectangle::readBinaryGridImpl(escript::Data& out, const string& filename,
300 const GridParameters& params) const
301 {
302 // check destination function space
303 int myN0, myN1;
304 if (out.getFunctionSpace().getTypeCode() == Nodes) {
305 myN0 = m_NN[0];
306 myN1 = m_NN[1];
307 } else if (out.getFunctionSpace().getTypeCode() == Elements ||
308 out.getFunctionSpace().getTypeCode() == ReducedElements) {
309 myN0 = m_NE[0];
310 myN1 = m_NE[1];
311 } else
312 throw RipleyException("readBinaryGrid(): invalid function space for output data object");
313
314 // check file existence and size
315 ifstream f(filename.c_str(), ifstream::binary);
316 if (f.fail()) {
317 throw RipleyException("readBinaryGrid(): cannot open file");
318 }
319 f.seekg(0, ios::end);
320 const int numComp = out.getDataPointSize();
321 const int filesize = f.tellg();
322 const int reqsize = params.numValues[0]*params.numValues[1]*numComp*sizeof(ValueType);
323 if (filesize < reqsize) {
324 f.close();
325 throw RipleyException("readBinaryGrid(): not enough data in file");
326 }
327
328 // check if this rank contributes anything
329 if (params.first[0] >= m_offset[0]+myN0 ||
330 params.first[0]+params.numValues[0] <= m_offset[0] ||
331 params.first[1] >= m_offset[1]+myN1 ||
332 params.first[1]+params.numValues[1] <= m_offset[1]) {
333 f.close();
334 return;
335 }
336
337 // now determine how much this rank has to write
338
339 // first coordinates in data object to write to
340 const int first0 = max(0, params.first[0]-m_offset[0]);
341 const int first1 = max(0, params.first[1]-m_offset[1]);
342 // indices to first value in file
343 const int idx0 = max(0, m_offset[0]-params.first[0]);
344 const int idx1 = max(0, m_offset[1]-params.first[1]);
345 // number of values to read
346 const int num0 = min(params.numValues[0]-idx0, myN0-first0);
347 const int num1 = min(params.numValues[1]-idx1, myN1-first1);
348
349 out.requireWrite();
350 vector<ValueType> values(num0*numComp);
351 const int dpp = out.getNumDataPointsPerSample();
352
353 for (int y=0; y<num1; y++) {
354 const int fileofs = numComp*(idx0+(idx1+y)*params.numValues[0]);
355 f.seekg(fileofs*sizeof(ValueType));
356 f.read((char*)&values[0], num0*numComp*sizeof(ValueType));
357 for (int x=0; x<num0; x++) {
358 const int baseIndex = first0+x*params.multiplier[0]
359 +(first1+y*params.multiplier[1])*myN0;
360 for (int m1=0; m1<params.multiplier[1]; m1++) {
361 for (int m0=0; m0<params.multiplier[0]; m0++) {
362 const int dataIndex = baseIndex+m0+m1*myN0;
363 double* dest = out.getSampleDataRW(dataIndex);
364 for (int c=0; c<numComp; c++) {
365 ValueType val = values[x*numComp+c];
366
367 if (params.byteOrder != BYTEORDER_NATIVE) {
368 char* cval = reinterpret_cast<char*>(&val);
369 // this will alter val!!
370 byte_swap32(cval);
371 }
372 if (!std::isnan(val)) {
373 for (int q=0; q<dpp; q++) {
374 *dest++ = static_cast<double>(val);
375 }
376 }
377 }
378 }
379 }
380 }
381 }
382
383 f.close();
384 }
385
386 void Rectangle::writeBinaryGrid(const escript::Data& in, string filename,
387 int byteOrder, int dataType) const
388 {
389 // the mapping is not universally correct but should work on our
390 // supported platforms
391 switch (dataType) {
392 case DATATYPE_INT32:
393 writeBinaryGridImpl<int>(in, filename, byteOrder);
394 break;
395 case DATATYPE_FLOAT32:
396 writeBinaryGridImpl<float>(in, filename, byteOrder);
397 break;
398 case DATATYPE_FLOAT64:
399 writeBinaryGridImpl<double>(in, filename, byteOrder);
400 break;
401 default:
402 throw RipleyException("writeBinaryGrid(): invalid or unsupported datatype");
403 }
404 }
405
406 template<typename ValueType>
407 void Rectangle::writeBinaryGridImpl(const escript::Data& in,
408 const string& filename, int byteOrder) const
409 {
410 // check function space and determine number of points
411 int myN0, myN1;
412 int totalN0, totalN1;
413 if (in.getFunctionSpace().getTypeCode() == Nodes) {
414 myN0 = m_NN[0];
415 myN1 = m_NN[1];
416 totalN0 = m_gNE[0]+1;
417 totalN1 = m_gNE[1]+1;
418 } else if (in.getFunctionSpace().getTypeCode() == Elements ||
419 in.getFunctionSpace().getTypeCode() == ReducedElements) {
420 myN0 = m_NE[0];
421 myN1 = m_NE[1];
422 totalN0 = m_gNE[0];
423 totalN1 = m_gNE[1];
424 } else
425 throw RipleyException("writeBinaryGrid(): invalid function space of data object");
426
427 const int numComp = in.getDataPointSize();
428 const int dpp = in.getNumDataPointsPerSample();
429
430 if (numComp > 1 || dpp > 1)
431 throw RipleyException("writeBinaryGrid(): only scalar, single-value data supported");
432
433 escript::Data* _in = const_cast<escript::Data*>(&in);
434 const int fileSize = sizeof(ValueType)*numComp*dpp*totalN0*totalN1;
435
436 // from here on we know that each sample consists of one value
437 FileWriter fw;
438 fw.openFile(filename, fileSize);
439 MPIBarrier();
440
441 for (index_t y=0; y<myN1; y++) {
442 const int fileofs = (m_offset[0]+(m_offset[1]+y)*totalN0)*sizeof(ValueType);
443 ostringstream oss;
444
445 for (index_t x=0; x<myN0; x++) {
446 const double* sample = _in->getSampleDataRO(y*myN0+x);
447 ValueType fvalue = static_cast<ValueType>(*sample);
448 if (byteOrder == BYTEORDER_NATIVE) {
449 oss.write((char*)&fvalue, sizeof(fvalue));
450 } else {
451 char* value = reinterpret_cast<char*>(&fvalue);
452 oss.write(byte_swap32(value), sizeof(fvalue));
453 }
454 }
455 fw.writeAt(oss, fileofs);
456 }
457 fw.close();
458 }
459
460 void Rectangle::dump(const string& fileName) const
461 {
462 #if USE_SILO
463 string fn(fileName);
464 if (fileName.length() < 6 || fileName.compare(fileName.length()-5, 5, ".silo") != 0) {
465 fn+=".silo";
466 }
467
468 int driver=DB_HDF5;
469 DBfile* dbfile = NULL;
470 const char* blockDirFmt = "/block%04d";
471
472 #ifdef ESYS_MPI
473 PMPIO_baton_t* baton = NULL;
474 const int NUM_SILO_FILES = 1;
475 #endif
476
477 if (m_mpiInfo->size > 1) {
478 #ifdef ESYS_MPI
479 baton = PMPIO_Init(NUM_SILO_FILES, PMPIO_WRITE, m_mpiInfo->comm,
480 0x1337, PMPIO_DefaultCreate, PMPIO_DefaultOpen,
481 PMPIO_DefaultClose, (void*)&driver);
482 // try the fallback driver in case of error
483 if (!baton && driver != DB_PDB) {
484 driver = DB_PDB;
485 baton = PMPIO_Init(NUM_SILO_FILES, PMPIO_WRITE, m_mpiInfo->comm,
486 0x1338, PMPIO_DefaultCreate, PMPIO_DefaultOpen,
487 PMPIO_DefaultClose, (void*)&driver);
488 }
489 if (baton) {
490 char siloPath[64];
491 snprintf(siloPath, 64, blockDirFmt, PMPIO_RankInGroup(baton, m_mpiInfo->rank));
492 dbfile = (DBfile*) PMPIO_WaitForBaton(baton, fn.c_str(), siloPath);
493 }
494 #endif
495 } else {
496 dbfile = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL,
497 getDescription().c_str(), driver);
498 // try the fallback driver in case of error
499 if (!dbfile && driver != DB_PDB) {
500 driver = DB_PDB;
501 dbfile = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL,
502 getDescription().c_str(), driver);
503 }
504 char siloPath[64];
505 snprintf(siloPath, 64, blockDirFmt, 0);
506 DBMkDir(dbfile, siloPath);
507 DBSetDir(dbfile, siloPath);
508 }
509
510 if (!dbfile)
511 throw RipleyException("dump: Could not create Silo file");
512
513 /*
514 if (driver==DB_HDF5) {
515 // gzip level 1 already provides good compression with minimal
516 // performance penalty. Some tests showed that gzip levels >3 performed
517 // rather badly on escript data both in terms of time and space
518 DBSetCompression("ERRMODE=FALLBACK METHOD=GZIP LEVEL=1");
519 }
520 */
521
522 boost::scoped_ptr<double> x(new double[m_NN[0]]);
523 boost::scoped_ptr<double> y(new double[m_NN[1]]);
524 double* coords[2] = { x.get(), y.get() };
525 #pragma omp parallel
526 {
527 #pragma omp for nowait
528 for (dim_t i0 = 0; i0 < m_NN[0]; i0++) {
529 coords[0][i0]=getLocalCoordinate(i0, 0);
530 }
531 #pragma omp for nowait
532 for (dim_t i1 = 0; i1 < m_NN[1]; i1++) {
533 coords[1][i1]=getLocalCoordinate(i1, 1);
534 }
535 }
536 int* dims = const_cast<int*>(getNumNodesPerDim());
537
538 // write mesh
539 DBPutQuadmesh(dbfile, "mesh", NULL, coords, dims, 2, DB_DOUBLE,
540 DB_COLLINEAR, NULL);
541
542 // write node ids
543 DBPutQuadvar1(dbfile, "nodeId", "mesh", (void*)&m_nodeId[0], dims, 2,
544 NULL, 0, DB_INT, DB_NODECENT, NULL);
545
546 // write element ids
547 dims = const_cast<int*>(getNumElementsPerDim());
548 DBPutQuadvar1(dbfile, "elementId", "mesh", (void*)&m_elementId[0],
549 dims, 2, NULL, 0, DB_INT, DB_ZONECENT, NULL);
550
551 // rank 0 writes multimesh and multivar
552 if (m_mpiInfo->rank == 0) {
553 vector<string> tempstrings;
554 vector<char*> names;
555 for (dim_t i=0; i<m_mpiInfo->size; i++) {
556 stringstream path;
557 path << "/block" << setw(4) << setfill('0') << right << i << "/mesh";
558 tempstrings.push_back(path.str());
559 names.push_back((char*)tempstrings.back().c_str());
560 }
561 vector<int> types(m_mpiInfo->size, DB_QUAD_RECT);
562 DBSetDir(dbfile, "/");
563 DBPutMultimesh(dbfile, "multimesh", m_mpiInfo->size, &names[0],
564 &types[0], NULL);
565 tempstrings.clear();
566 names.clear();
567 for (dim_t i=0; i<m_mpiInfo->size; i++) {
568 stringstream path;
569 path << "/block" << setw(4) << setfill('0') << right << i << "/nodeId";
570 tempstrings.push_back(path.str());
571 names.push_back((char*)tempstrings.back().c_str());
572 }
573 types.assign(m_mpiInfo->size, DB_QUADVAR);
574 DBPutMultivar(dbfile, "nodeId", m_mpiInfo->size, &names[0],
575 &types[0], NULL);
576 tempstrings.clear();
577 names.clear();
578 for (dim_t i=0; i<m_mpiInfo->size; i++) {
579 stringstream path;
580 path << "/block" << setw(4) << setfill('0') << right << i << "/elementId";
581 tempstrings.push_back(path.str());
582 names.push_back((char*)tempstrings.back().c_str());
583 }
584 DBPutMultivar(dbfile, "elementId", m_mpiInfo->size, &names[0],
585 &types[0], NULL);
586 }
587
588 if (m_mpiInfo->size > 1) {
589 #ifdef ESYS_MPI
590 PMPIO_HandOffBaton(baton, dbfile);
591 PMPIO_Finish(baton);
592 #endif
593 } else {
594 DBClose(dbfile);
595 }
596
597 #else // USE_SILO
598 throw RipleyException("dump: no Silo support");
599 #endif
600 }
601
602 const int* Rectangle::borrowSampleReferenceIDs(int fsType) const
603 {
604 switch (fsType) {
605 case Nodes:
606 case ReducedNodes: // FIXME: reduced
607 return &m_nodeId[0];
608 case DegreesOfFreedom:
609 case ReducedDegreesOfFreedom: // FIXME: reduced
610 return &m_dofId[0];
611 case Elements:
612 case ReducedElements:
613 return &m_elementId[0];
614 case FaceElements:
615 case ReducedFaceElements:
616 return &m_faceId[0];
617 default:
618 break;
619 }
620
621 stringstream msg;
622 msg << "borrowSampleReferenceIDs: invalid function space type " << fsType;
623 throw RipleyException(msg.str());
624 }
625
626 bool Rectangle::ownSample(int fsType, index_t id) const
627 {
628 if (getMPISize()==1)
629 return true;
630
631 switch (fsType) {
632 case Nodes:
633 case ReducedNodes: // FIXME: reduced
634 return (m_dofMap[id] < getNumDOF());
635 case DegreesOfFreedom:
636 case ReducedDegreesOfFreedom:
637 return true;
638 case Elements:
639 case ReducedElements:
640 // check ownership of element's bottom left node
641 return (m_dofMap[id%m_NE[0]+m_NN[0]*(id/m_NE[0])] < getNumDOF());
642 case FaceElements:
643 case ReducedFaceElements:
644 {
645 // determine which face the sample belongs to before
646 // checking ownership of corresponding element's first node
647 dim_t n=0;
648 for (size_t i=0; i<4; i++) {
649 n+=m_faceCount[i];
650 if (id<n) {
651 index_t k;
652 if (i==1)
653 k=m_NN[0]-2;
654 else if (i==3)
655 k=m_NN[0]*(m_NN[1]-2);
656 else
657 k=0;
658 // determine whether to move right or up
659 const index_t delta=(i/2==0 ? m_NN[0] : 1);
660 return (m_dofMap[k+(id-n+m_faceCount[i])*delta] < getNumDOF());
661 }
662 }
663 return false;
664 }
665 default:
666 break;
667 }
668
669 stringstream msg;
670 msg << "ownSample: invalid function space type " << fsType;
671 throw RipleyException(msg.str());
672 }
673
674 void Rectangle::setToNormal(escript::Data& out) const
675 {
676 if (out.getFunctionSpace().getTypeCode() == FaceElements) {
677 out.requireWrite();
678 #pragma omp parallel
679 {
680 if (m_faceOffset[0] > -1) {
681 #pragma omp for nowait
682 for (index_t k1 = 0; k1 < m_NE[1]; ++k1) {
683 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
684 // set vector at two quadrature points
685 *o++ = -1.;
686 *o++ = 0.;
687 *o++ = -1.;
688 *o = 0.;
689 }
690 }
691
692 if (m_faceOffset[1] > -1) {
693 #pragma omp for nowait
694 for (index_t k1 = 0; k1 < m_NE[1]; ++k1) {
695 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
696 // set vector at two quadrature points
697 *o++ = 1.;
698 *o++ = 0.;
699 *o++ = 1.;
700 *o = 0.;
701 }
702 }
703
704 if (m_faceOffset[2] > -1) {
705 #pragma omp for nowait
706 for (index_t k0 = 0; k0 < m_NE[0]; ++k0) {
707 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
708 // set vector at two quadrature points
709 *o++ = 0.;
710 *o++ = -1.;
711 *o++ = 0.;
712 *o = -1.;
713 }
714 }
715
716 if (m_faceOffset[3] > -1) {
717 #pragma omp for nowait
718 for (index_t k0 = 0; k0 < m_NE[0]; ++k0) {
719 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
720 // set vector at two quadrature points
721 *o++ = 0.;
722 *o++ = 1.;
723 *o++ = 0.;
724 *o = 1.;
725 }
726 }
727 } // end of parallel section
728 } else if (out.getFunctionSpace().getTypeCode() == ReducedFaceElements) {
729 out.requireWrite();
730 #pragma omp parallel
731 {
732 if (m_faceOffset[0] > -1) {
733 #pragma omp for nowait
734 for (index_t k1 = 0; k1 < m_NE[1]; ++k1) {
735 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
736 *o++ = -1.;
737 *o = 0.;
738 }
739 }
740
741 if (m_faceOffset[1] > -1) {
742 #pragma omp for nowait
743 for (index_t k1 = 0; k1 < m_NE[1]; ++k1) {
744 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
745 *o++ = 1.;
746 *o = 0.;
747 }
748 }
749
750 if (m_faceOffset[2] > -1) {
751 #pragma omp for nowait
752 for (index_t k0 = 0; k0 < m_NE[0]; ++k0) {
753 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
754 *o++ = 0.;
755 *o = -1.;
756 }
757 }
758
759 if (m_faceOffset[3] > -1) {
760 #pragma omp for nowait
761 for (index_t k0 = 0; k0 < m_NE[0]; ++k0) {
762 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
763 *o++ = 0.;
764 *o = 1.;
765 }
766 }
767 } // end of parallel section
768
769 } else {
770 stringstream msg;
771 msg << "setToNormal: invalid function space type "
772 << out.getFunctionSpace().getTypeCode();
773 throw RipleyException(msg.str());
774 }
775 }
776
777 void Rectangle::setToSize(escript::Data& out) const
778 {
779 if (out.getFunctionSpace().getTypeCode() == Elements
780 || out.getFunctionSpace().getTypeCode() == ReducedElements) {
781 out.requireWrite();
782 const dim_t numQuad=out.getNumDataPointsPerSample();
783 const double size=sqrt(m_dx[0]*m_dx[0]+m_dx[1]*m_dx[1]);
784 #pragma omp parallel for
785 for (index_t k = 0; k < getNumElements(); ++k) {
786 double* o = out.getSampleDataRW(k);
787 fill(o, o+numQuad, size);
788 }
789 } else if (out.getFunctionSpace().getTypeCode() == FaceElements
790 || out.getFunctionSpace().getTypeCode() == ReducedFaceElements) {
791 out.requireWrite();
792 const dim_t numQuad=out.getNumDataPointsPerSample();
793 #pragma omp parallel
794 {
795 if (m_faceOffset[0] > -1) {
796 #pragma omp for nowait
797 for (index_t k1 = 0; k1 < m_NE[1]; ++k1) {
798 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
799 fill(o, o+numQuad, m_dx[1]);
800 }
801 }
802
803 if (m_faceOffset[1] > -1) {
804 #pragma omp for nowait
805 for (index_t k1 = 0; k1 < m_NE[1]; ++k1) {
806 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
807 fill(o, o+numQuad, m_dx[1]);
808 }
809 }
810
811 if (m_faceOffset[2] > -1) {
812 #pragma omp for nowait
813 for (index_t k0 = 0; k0 < m_NE[0]; ++k0) {
814 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
815 fill(o, o+numQuad, m_dx[0]);
816 }
817 }
818
819 if (m_faceOffset[3] > -1) {
820 #pragma omp for nowait
821 for (index_t k0 = 0; k0 < m_NE[0]; ++k0) {
822 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
823 fill(o, o+numQuad, m_dx[0]);
824 }
825 }
826 } // end of parallel section
827
828 } else {
829 stringstream msg;
830 msg << "setToSize: invalid function space type "
831 << out.getFunctionSpace().getTypeCode();
832 throw RipleyException(msg.str());
833 }
834 }
835
836 void Rectangle::Print_Mesh_Info(const bool full) const
837 {
838 RipleyDomain::Print_Mesh_Info(full);
839 if (full) {
840 cout << " Id Coordinates" << endl;
841 cout.precision(15);
842 cout.setf(ios::scientific, ios::floatfield);
843 for (index_t i=0; i < getNumNodes(); i++) {
844 cout << " " << setw(5) << m_nodeId[i]
845 << " " << getLocalCoordinate(i%m_NN[0], 0)
846 << " " << getLocalCoordinate(i/m_NN[0], 1) << endl;
847 }
848 }
849 }
850
851
852 //protected
853 void Rectangle::assembleCoordinates(escript::Data& arg) const
854 {
855 escriptDataC x = arg.getDataC();
856 int numDim = m_numDim;
857 if (!isDataPointShapeEqual(&x, 1, &numDim))
858 throw RipleyException("setToX: Invalid Data object shape");
859 if (!numSamplesEqual(&x, 1, getNumNodes()))
860 throw RipleyException("setToX: Illegal number of samples in Data object");
861
862 arg.requireWrite();
863 #pragma omp parallel for
864 for (dim_t i1 = 0; i1 < m_NN[1]; i1++) {
865 for (dim_t i0 = 0; i0 < m_NN[0]; i0++) {
866 double* point = arg.getSampleDataRW(i0+m_NN[0]*i1);
867 point[0] = getLocalCoordinate(i0, 0);
868 point[1] = getLocalCoordinate(i1, 1);
869 }
870 }
871 }
872
873 //protected
874 void Rectangle::assembleGradient(escript::Data& out, escript::Data& in) const
875 {
876 const dim_t numComp = in.getDataPointSize();
877 const double cx0 = .21132486540518711775/m_dx[0];
878 const double cx1 = .78867513459481288225/m_dx[0];
879 const double cx2 = 1./m_dx[0];
880 const double cy0 = .21132486540518711775/m_dx[1];
881 const double cy1 = .78867513459481288225/m_dx[1];
882 const double cy2 = 1./m_dx[1];
883
884 if (out.getFunctionSpace().getTypeCode() == Elements) {
885 out.requireWrite();
886 #pragma omp parallel
887 {
888 vector<double> f_00(numComp);
889 vector<double> f_01(numComp);
890 vector<double> f_10(numComp);
891 vector<double> f_11(numComp);
892 #pragma omp for
893 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
894 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
895 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
896 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
897 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
898 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
899 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
900 for (index_t i=0; i < numComp; ++i) {
901 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
902 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
903 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
904 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
905 o[INDEX3(i,0,2,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
906 o[INDEX3(i,1,2,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
907 o[INDEX3(i,0,3,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
908 o[INDEX3(i,1,3,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
909 } // end of component loop i
910 } // end of k0 loop
911 } // end of k1 loop
912 } // end of parallel section
913 } else if (out.getFunctionSpace().getTypeCode() == ReducedElements) {
914 out.requireWrite();
915 #pragma omp parallel
916 {
917 vector<double> f_00(numComp);
918 vector<double> f_01(numComp);
919 vector<double> f_10(numComp);
920 vector<double> f_11(numComp);
921 #pragma omp for
922 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
923 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
924 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
925 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
926 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
927 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
928 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
929 for (index_t i=0; i < numComp; ++i) {
930 o[INDEX3(i,0,0,numComp,2)] = (f_10[i] + f_11[i] - f_00[i] - f_01[i])*cx2/2;
931 o[INDEX3(i,1,0,numComp,2)] = (f_01[i] + f_11[i] - f_00[i] - f_10[i])*cy2/2;
932 } // end of component loop i
933 } // end of k0 loop
934 } // end of k1 loop
935 } // end of parallel section
936 } else if (out.getFunctionSpace().getTypeCode() == FaceElements) {
937 out.requireWrite();
938 #pragma omp parallel
939 {
940 vector<double> f_00(numComp);
941 vector<double> f_01(numComp);
942 vector<double> f_10(numComp);
943 vector<double> f_11(numComp);
944 if (m_faceOffset[0] > -1) {
945 #pragma omp for nowait
946 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
947 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
948 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
949 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(1,k1, m_NN[0])), numComp*sizeof(double));
950 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(1,k1+1, m_NN[0])), numComp*sizeof(double));
951 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
952 for (index_t i=0; i < numComp; ++i) {
953 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
954 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy2;
955 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
956 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy2;
957 } // end of component loop i
958 } // end of k1 loop
959 } // end of face 0
960 if (m_faceOffset[1] > -1) {
961 #pragma omp for nowait
962 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
963 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1, m_NN[0])), numComp*sizeof(double));
964 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1+1, m_NN[0])), numComp*sizeof(double));
965 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
966 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
967 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
968 for (index_t i=0; i < numComp; ++i) {
969 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx1 + (f_11[i]-f_01[i])*cx0;
970 o[INDEX3(i,1,0,numComp,2)] = (f_11[i]-f_10[i])*cy2;
971 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx0 + (f_11[i]-f_01[i])*cx1;
972 o[INDEX3(i,1,1,numComp,2)] = (f_11[i]-f_10[i])*cy2;
973 } // end of component loop i
974 } // end of k1 loop
975 } // end of face 1
976 if (m_faceOffset[2] > -1) {
977 #pragma omp for nowait
978 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
979 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
980 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,1, m_NN[0])), numComp*sizeof(double));
981 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
982 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,1, m_NN[0])), numComp*sizeof(double));
983 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
984 for (index_t i=0; i < numComp; ++i) {
985 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx2;
986 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
987 o[INDEX3(i,0,1,numComp,2)] = (f_10[i]-f_00[i])*cx2;
988 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
989 } // end of component loop i
990 } // end of k0 loop
991 } // end of face 2
992 if (m_faceOffset[3] > -1) {
993 #pragma omp for nowait
994 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
995 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
996 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
997 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
998 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
999 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
1000 for (index_t i=0; i < numComp; ++i) {
1001 o[INDEX3(i,0,0,numComp,2)] = (f_11[i]-f_01[i])*cx2;
1002 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy1 + (f_11[i]-f_10[i])*cy0;
1003 o[INDEX3(i,0,1,numComp,2)] = (f_11[i]-f_01[i])*cx2;
1004 o[INDEX3(i,1,1,numComp,2)] = (f_01[i]-f_00[i])*cy0 + (f_11[i]-f_10[i])*cy1;
1005 } // end of component loop i
1006 } // end of k0 loop
1007 } // end of face 3
1008 } // end of parallel section
1009
1010 } else if (out.getFunctionSpace().getTypeCode() == ReducedFaceElements) {
1011 out.requireWrite();
1012 #pragma omp parallel
1013 {
1014 vector<double> f_00(numComp);
1015 vector<double> f_01(numComp);
1016 vector<double> f_10(numComp);
1017 vector<double> f_11(numComp);
1018 if (m_faceOffset[0] > -1) {
1019 #pragma omp for nowait
1020 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1021 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
1022 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
1023 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(1,k1, m_NN[0])), numComp*sizeof(double));
1024 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(1,k1+1, m_NN[0])), numComp*sizeof(double));
1025 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
1026 for (index_t i=0; i < numComp; ++i) {
1027 o[INDEX3(i,0,0,numComp,2)] = (f_10[i] + f_11[i] - f_00[i] - f_01[i])*cx2/2;
1028 o[INDEX3(i,1,0,numComp,2)] = (f_01[i]-f_00[i])*cy2;
1029 } // end of component loop i
1030 } // end of k1 loop
1031 } // end of face 0
1032 if (m_faceOffset[1] > -1) {
1033 #pragma omp for nowait
1034 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1035 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1, m_NN[0])), numComp*sizeof(double));
1036 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(m_NN[0]-2,k1+1, m_NN[0])), numComp*sizeof(double));
1037 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
1038 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
1039 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
1040 for (index_t i=0; i < numComp; ++i) {
1041 o[INDEX3(i,0,0,numComp,2)] = (f_10[i] + f_11[i] - f_00[i] - f_01[i])*cx2/2;
1042 o[INDEX3(i,1,0,numComp,2)] = (f_11[i]-f_10[i])*cy2;
1043 } // end of component loop i
1044 } // end of k1 loop
1045 } // end of face 1
1046 if (m_faceOffset[2] > -1) {
1047 #pragma omp for nowait
1048 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1049 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
1050 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,1, m_NN[0])), numComp*sizeof(double));
1051 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
1052 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,1, m_NN[0])), numComp*sizeof(double));
1053 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
1054 for (index_t i=0; i < numComp; ++i) {
1055 o[INDEX3(i,0,0,numComp,2)] = (f_10[i]-f_00[i])*cx2;
1056 o[INDEX3(i,1,0,numComp,2)] = (f_01[i] + f_11[i] - f_00[i] - f_10[i])*cy2/2;
1057 } // end of component loop i
1058 } // end of k0 loop
1059 } // end of face 2
1060 if (m_faceOffset[3] > -1) {
1061 #pragma omp for nowait
1062 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1063 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
1064 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1065 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-2, m_NN[0])), numComp*sizeof(double));
1066 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1067 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
1068 for (index_t i=0; i < numComp; ++i) {
1069 o[INDEX3(i,0,0,numComp,2)] = (f_11[i]-f_01[i])*cx2;
1070 o[INDEX3(i,1,0,numComp,2)] = (f_01[i] + f_11[i] - f_00[i] - f_10[i])*cy2/2;
1071 } // end of component loop i
1072 } // end of k0 loop
1073 } // end of face 3
1074 } // end of parallel section
1075 }
1076 }
1077
1078 //protected
1079 void Rectangle::assembleIntegrate(vector<double>& integrals, escript::Data& arg) const
1080 {
1081 const dim_t numComp = arg.getDataPointSize();
1082 const index_t left = (m_offset[0]==0 ? 0 : 1);
1083 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1084 const int fs=arg.getFunctionSpace().getTypeCode();
1085 if (fs == Elements && arg.actsExpanded()) {
1086 #pragma omp parallel
1087 {
1088 vector<double> int_local(numComp, 0);
1089 const double w = m_dx[0]*m_dx[1]/4.;
1090 #pragma omp for nowait
1091 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1092 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1093 const double* f = arg.getSampleDataRO(INDEX2(k0, k1, m_NE[0]));
1094 for (index_t i=0; i < numComp; ++i) {
1095 const double f0 = f[INDEX2(i,0,numComp)];
1096 const double f1 = f[INDEX2(i,1,numComp)];
1097 const double f2 = f[INDEX2(i,2,numComp)];
1098 const double f3 = f[INDEX2(i,3,numComp)];
1099 int_local[i]+=(f0+f1+f2+f3)*w;
1100 } // end of component loop i
1101 } // end of k0 loop
1102 } // end of k1 loop
1103 #pragma omp critical
1104 for (index_t i=0; i<numComp; i++)
1105 integrals[i]+=int_local[i];
1106 } // end of parallel section
1107
1108 } else if (fs==ReducedElements || (fs==Elements && !arg.actsExpanded())) {
1109 const double w = m_dx[0]*m_dx[1];
1110 #pragma omp parallel
1111 {
1112 vector<double> int_local(numComp, 0);
1113 #pragma omp for nowait
1114 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1115 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1116 const double* f = arg.getSampleDataRO(INDEX2(k0, k1, m_NE[0]));
1117 for (index_t i=0; i < numComp; ++i) {
1118 int_local[i]+=f[i]*w;
1119 }
1120 }
1121 }
1122 #pragma omp critical
1123 for (index_t i=0; i<numComp; i++)
1124 integrals[i]+=int_local[i];
1125 } // end of parallel section
1126
1127 } else if (fs == FaceElements && arg.actsExpanded()) {
1128 #pragma omp parallel
1129 {
1130 vector<double> int_local(numComp, 0);
1131 const double w0 = m_dx[0]/2.;
1132 const double w1 = m_dx[1]/2.;
1133 if (m_faceOffset[0] > -1) {
1134 #pragma omp for nowait
1135 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1136 const double* f = arg.getSampleDataRO(m_faceOffset[0]+k1);
1137 for (index_t i=0; i < numComp; ++i) {
1138 const double f0 = f[INDEX2(i,0,numComp)];
1139 const double f1 = f[INDEX2(i,1,numComp)];
1140 int_local[i]+=(f0+f1)*w1;
1141 } // end of component loop i
1142 } // end of k1 loop
1143 }
1144
1145 if (m_faceOffset[1] > -1) {
1146 #pragma omp for nowait
1147 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1148 const double* f = arg.getSampleDataRO(m_faceOffset[1]+k1);
1149 for (index_t i=0; i < numComp; ++i) {
1150 const double f0 = f[INDEX2(i,0,numComp)];
1151 const double f1 = f[INDEX2(i,1,numComp)];
1152 int_local[i]+=(f0+f1)*w1;
1153 } // end of component loop i
1154 } // end of k1 loop
1155 }
1156
1157 if (m_faceOffset[2] > -1) {
1158 #pragma omp for nowait
1159 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1160 const double* f = arg.getSampleDataRO(m_faceOffset[2]+k0);
1161 for (index_t i=0; i < numComp; ++i) {
1162 const double f0 = f[INDEX2(i,0,numComp)];
1163 const double f1 = f[INDEX2(i,1,numComp)];
1164 int_local[i]+=(f0+f1)*w0;
1165 } // end of component loop i
1166 } // end of k0 loop
1167 }
1168
1169 if (m_faceOffset[3] > -1) {
1170 #pragma omp for nowait
1171 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1172 const double* f = arg.getSampleDataRO(m_faceOffset[3]+k0);
1173 for (index_t i=0; i < numComp; ++i) {
1174 const double f0 = f[INDEX2(i,0,numComp)];
1175 const double f1 = f[INDEX2(i,1,numComp)];
1176 int_local[i]+=(f0+f1)*w0;
1177 } // end of component loop i
1178 } // end of k0 loop
1179 }
1180 #pragma omp critical
1181 for (index_t i=0; i<numComp; i++)
1182 integrals[i]+=int_local[i];
1183 } // end of parallel section
1184
1185 } else if (fs==ReducedFaceElements || (fs==FaceElements && !arg.actsExpanded())) {
1186 #pragma omp parallel
1187 {
1188 vector<double> int_local(numComp, 0);
1189 if (m_faceOffset[0] > -1) {
1190 #pragma omp for nowait
1191 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1192 const double* f = arg.getSampleDataRO(m_faceOffset[0]+k1);
1193 for (index_t i=0; i < numComp; ++i) {
1194 int_local[i]+=f[i]*m_dx[1];
1195 }
1196 }
1197 }
1198
1199 if (m_faceOffset[1] > -1) {
1200 #pragma omp for nowait
1201 for (index_t k1 = bottom; k1 < bottom+m_ownNE[1]; ++k1) {
1202 const double* f = arg.getSampleDataRO(m_faceOffset[1]+k1);
1203 for (index_t i=0; i < numComp; ++i) {
1204 int_local[i]+=f[i]*m_dx[1];
1205 }
1206 }
1207 }
1208
1209 if (m_faceOffset[2] > -1) {
1210 #pragma omp for nowait
1211 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1212 const double* f = arg.getSampleDataRO(m_faceOffset[2]+k0);
1213 for (index_t i=0; i < numComp; ++i) {
1214 int_local[i]+=f[i]*m_dx[0];
1215 }
1216 }
1217 }
1218
1219 if (m_faceOffset[3] > -1) {
1220 #pragma omp for nowait
1221 for (index_t k0 = left; k0 < left+m_ownNE[0]; ++k0) {
1222 const double* f = arg.getSampleDataRO(m_faceOffset[3]+k0);
1223 for (index_t i=0; i < numComp; ++i) {
1224 int_local[i]+=f[i]*m_dx[0];
1225 }
1226 }
1227 }
1228
1229 #pragma omp critical
1230 for (index_t i=0; i<numComp; i++)
1231 integrals[i]+=int_local[i];
1232 } // end of parallel section
1233 } // function space selector
1234 }
1235
1236 //protected
1237 dim_t Rectangle::insertNeighbourNodes(IndexVector& index, index_t node) const
1238 {
1239 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1240 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1241 const int x=node%nDOF0;
1242 const int y=node/nDOF0;
1243 dim_t num=0;
1244 // loop through potential neighbours and add to index if positions are
1245 // within bounds
1246 for (int i1=-1; i1<2; i1++) {
1247 for (int i0=-1; i0<2; i0++) {
1248 // skip node itself
1249 if (i0==0 && i1==0)
1250 continue;
1251 // location of neighbour node
1252 const int nx=x+i0;
1253 const int ny=y+i1;
1254 if (nx>=0 && ny>=0 && nx<nDOF0 && ny<nDOF1) {
1255 index.push_back(ny*nDOF0+nx);
1256 num++;
1257 }
1258 }
1259 }
1260
1261 return num;
1262 }
1263
1264 //protected
1265 void Rectangle::nodesToDOF(escript::Data& out, escript::Data& in) const
1266 {
1267 const dim_t numComp = in.getDataPointSize();
1268 out.requireWrite();
1269
1270 const index_t left = (m_offset[0]==0 ? 0 : 1);
1271 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1272 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1273 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1274 #pragma omp parallel for
1275 for (index_t i=0; i<nDOF1; i++) {
1276 for (index_t j=0; j<nDOF0; j++) {
1277 const index_t n=j+left+(i+bottom)*m_NN[0];
1278 const double* src=in.getSampleDataRO(n);
1279 copy(src, src+numComp, out.getSampleDataRW(j+i*nDOF0));
1280 }
1281 }
1282 }
1283
1284 //protected
1285 void Rectangle::dofToNodes(escript::Data& out, escript::Data& in) const
1286 {
1287 const dim_t numComp = in.getDataPointSize();
1288 Paso_Coupler* coupler = Paso_Coupler_alloc(m_connector, numComp);
1289 in.requireWrite();
1290 Paso_Coupler_startCollect(coupler, in.getSampleDataRW(0));
1291
1292 const dim_t numDOF = getNumDOF();
1293 out.requireWrite();
1294 const double* buffer = Paso_Coupler_finishCollect(coupler);
1295
1296 #pragma omp parallel for
1297 for (index_t i=0; i<getNumNodes(); i++) {
1298 const double* src=(m_dofMap[i]<numDOF ?
1299 in.getSampleDataRO(m_dofMap[i])
1300 : &buffer[(m_dofMap[i]-numDOF)*numComp]);
1301 copy(src, src+numComp, out.getSampleDataRW(i));
1302 }
1303 Paso_Coupler_free(coupler);
1304 }
1305
1306 //private
1307 void Rectangle::populateSampleIds()
1308 {
1309 // degrees of freedom are numbered from left to right, bottom to top in
1310 // each rank, continuing on the next rank (ranks also go left-right,
1311 // bottom-top).
1312 // This means rank 0 has id 0...n0-1, rank 1 has id n0...n1-1 etc. which
1313 // helps when writing out data rank after rank.
1314
1315 // build node distribution vector first.
1316 // rank i owns m_nodeDistribution[i+1]-nodeDistribution[i] nodes which is
1317 // constant for all ranks in this implementation
1318 m_nodeDistribution.assign(m_mpiInfo->size+1, 0);
1319 const dim_t numDOF=getNumDOF();
1320 for (dim_t k=1; k<m_mpiInfo->size; k++) {
1321 m_nodeDistribution[k]=k*numDOF;
1322 }
1323 m_nodeDistribution[m_mpiInfo->size]=getNumDataPointsGlobal();
1324 m_nodeId.resize(getNumNodes());
1325 m_dofId.resize(numDOF);
1326 m_elementId.resize(getNumElements());
1327
1328 // populate face element counts
1329 //left
1330 if (m_offset[0]==0)
1331 m_faceCount[0]=m_NE[1];
1332 else
1333 m_faceCount[0]=0;
1334 //right
1335 if (m_mpiInfo->rank%m_NX[0]==m_NX[0]-1)
1336 m_faceCount[1]=m_NE[1];
1337 else
1338 m_faceCount[1]=0;
1339 //bottom
1340 if (m_offset[1]==0)
1341 m_faceCount[2]=m_NE[0];
1342 else
1343 m_faceCount[2]=0;
1344 //top
1345 if (m_mpiInfo->rank/m_NX[0]==m_NX[1]-1)
1346 m_faceCount[3]=m_NE[0];
1347 else
1348 m_faceCount[3]=0;
1349
1350 m_faceId.resize(getNumFaceElements());
1351
1352 const index_t left = (m_offset[0]==0 ? 0 : 1);
1353 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1354 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1355 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1356
1357 #define globalNodeId(x,y) \
1358 ((m_offset[0]+x)/nDOF0)*nDOF0*nDOF1+(m_offset[0]+x)%nDOF0 \
1359 + ((m_offset[1]+y)/nDOF1)*nDOF0*nDOF1*m_NX[0]+((m_offset[1]+y)%nDOF1)*nDOF0
1360
1361 // set corner id's outside the parallel region
1362 m_nodeId[0] = globalNodeId(0, 0);
1363 m_nodeId[m_NN[0]-1] = globalNodeId(m_NN[0]-1, 0);
1364 m_nodeId[m_NN[0]*(m_NN[1]-1)] = globalNodeId(0, m_NN[1]-1);
1365 m_nodeId[m_NN[0]*m_NN[1]-1] = globalNodeId(m_NN[0]-1,m_NN[1]-1);
1366 #undef globalNodeId
1367
1368 #pragma omp parallel
1369 {
1370 // populate degrees of freedom and own nodes (identical id)
1371 #pragma omp for nowait
1372 for (dim_t i=0; i<nDOF1; i++) {
1373 for (dim_t j=0; j<nDOF0; j++) {
1374 const index_t nodeIdx=j+left+(i+bottom)*m_NN[0];
1375 const index_t dofIdx=j+i*nDOF0;
1376 m_dofId[dofIdx] = m_nodeId[nodeIdx]
1377 = m_nodeDistribution[m_mpiInfo->rank]+dofIdx;
1378 }
1379 }
1380
1381 // populate the rest of the nodes (shared with other ranks)
1382 if (m_faceCount[0]==0) { // left column
1383 #pragma omp for nowait
1384 for (dim_t i=0; i<nDOF1; i++) {
1385 const index_t nodeIdx=(i+bottom)*m_NN[0];
1386 const index_t dofId=(i+1)*nDOF0-1;
1387 m_nodeId[nodeIdx]
1388 = m_nodeDistribution[m_mpiInfo->rank-1]+dofId;
1389 }
1390 }
1391 if (m_faceCount[1]==0) { // right column
1392 #pragma omp for nowait
1393 for (dim_t i=0; i<nDOF1; i++) {
1394 const index_t nodeIdx=(i+bottom+1)*m_NN[0]-1;
1395 const index_t dofId=i*nDOF0;
1396 m_nodeId[nodeIdx]
1397 = m_nodeDistribution[m_mpiInfo->rank+1]+dofId;
1398 }
1399 }
1400 if (m_faceCount[2]==0) { // bottom row
1401 #pragma omp for nowait
1402 for (dim_t i=0; i<nDOF0; i++) {
1403 const index_t nodeIdx=i+left;
1404 const index_t dofId=nDOF0*(nDOF1-1)+i;
1405 m_nodeId[nodeIdx]
1406 = m_nodeDistribution[m_mpiInfo->rank-m_NX[0]]+dofId;
1407 }
1408 }
1409 if (m_faceCount[3]==0) { // top row
1410 #pragma omp for nowait
1411 for (dim_t i=0; i<nDOF0; i++) {
1412 const index_t nodeIdx=m_NN[0]*(m_NN[1]-1)+i+left;
1413 const index_t dofId=i;
1414 m_nodeId[nodeIdx]
1415 = m_nodeDistribution[m_mpiInfo->rank+m_NX[0]]+dofId;
1416 }
1417 }
1418
1419 // populate element id's
1420 #pragma omp for nowait
1421 for (dim_t i1=0; i1<m_NE[1]; i1++) {
1422 for (dim_t i0=0; i0<m_NE[0]; i0++) {
1423 m_elementId[i0+i1*m_NE[0]]=(m_offset[1]+i1)*m_gNE[0]+m_offset[0]+i0;
1424 }
1425 }
1426
1427 // face elements
1428 #pragma omp for
1429 for (dim_t k=0; k<getNumFaceElements(); k++)
1430 m_faceId[k]=k;
1431 } // end parallel section
1432
1433 m_nodeTags.assign(getNumNodes(), 0);
1434 updateTagsInUse(Nodes);
1435
1436 m_elementTags.assign(getNumElements(), 0);
1437 updateTagsInUse(Elements);
1438
1439 // generate face offset vector and set face tags
1440 const index_t LEFT=1, RIGHT=2, BOTTOM=10, TOP=20;
1441 const index_t faceTag[] = { LEFT, RIGHT, BOTTOM, TOP };
1442 m_faceOffset.assign(4, -1);
1443 m_faceTags.clear();
1444 index_t offset=0;
1445 for (size_t i=0; i<4; i++) {
1446 if (m_faceCount[i]>0) {
1447 m_faceOffset[i]=offset;
1448 offset+=m_faceCount[i];
1449 m_faceTags.insert(m_faceTags.end(), m_faceCount[i], faceTag[i]);
1450 }
1451 }
1452 setTagMap("left", LEFT);
1453 setTagMap("right", RIGHT);
1454 setTagMap("bottom", BOTTOM);
1455 setTagMap("top", TOP);
1456 updateTagsInUse(FaceElements);
1457 }
1458
1459 //private
1460 void Rectangle::createPattern()
1461 {
1462 const dim_t nDOF0 = (m_gNE[0]+1)/m_NX[0];
1463 const dim_t nDOF1 = (m_gNE[1]+1)/m_NX[1];
1464 const index_t left = (m_offset[0]==0 ? 0 : 1);
1465 const index_t bottom = (m_offset[1]==0 ? 0 : 1);
1466
1467 // populate node->DOF mapping with own degrees of freedom.
1468 // The rest is assigned in the loop further down
1469 m_dofMap.assign(getNumNodes(), 0);
1470 #pragma omp parallel for
1471 for (index_t i=bottom; i<bottom+nDOF1; i++) {
1472 for (index_t j=left; j<left+nDOF0; j++) {
1473 m_dofMap[i*m_NN[0]+j]=(i-bottom)*nDOF0+j-left;
1474 }
1475 }
1476
1477 // build list of shared components and neighbours by looping through
1478 // all potential neighbouring ranks and checking if positions are
1479 // within bounds
1480 const dim_t numDOF=nDOF0*nDOF1;
1481 vector<IndexVector> colIndices(numDOF); // for the couple blocks
1482 RankVector neighbour;
1483 IndexVector offsetInShared(1,0);
1484 IndexVector sendShared, recvShared;
1485 int numShared=0;
1486 const int x=m_mpiInfo->rank%m_NX[0];
1487 const int y=m_mpiInfo->rank/m_NX[0];
1488 for (int i1=-1; i1<2; i1++) {
1489 for (int i0=-1; i0<2; i0++) {
1490 // skip this rank
1491 if (i0==0 && i1==0)
1492 continue;
1493 // location of neighbour rank
1494 const int nx=x+i0;
1495 const int ny=y+i1;
1496 if (nx>=0 && ny>=0 && nx<m_NX[0] && ny<m_NX[1]) {
1497 neighbour.push_back(ny*m_NX[0]+nx);
1498 if (i0==0) {
1499 // sharing top or bottom edge
1500 const int firstDOF=(i1==-1 ? 0 : numDOF-nDOF0);
1501 const int firstNode=(i1==-1 ? left : m_NN[0]*(m_NN[1]-1)+left);
1502 offsetInShared.push_back(offsetInShared.back()+nDOF0);
1503 for (dim_t i=0; i<nDOF0; i++, numShared++) {
1504 sendShared.push_back(firstDOF+i);
1505 recvShared.push_back(numDOF+numShared);
1506 if (i>0)
1507 colIndices[firstDOF+i-1].push_back(numShared);
1508 colIndices[firstDOF+i].push_back(numShared);
1509 if (i<nDOF0-1)
1510 colIndices[firstDOF+i+1].push_back(numShared);
1511 m_dofMap[firstNode+i]=numDOF+numShared;
1512 }
1513 } else if (i1==0) {
1514 // sharing left or right edge
1515 const int firstDOF=(i0==-1 ? 0 : nDOF0-1);
1516 const int firstNode=(i0==-1 ? bottom*m_NN[0] : (bottom+1)*m_NN[0]-1);
1517 offsetInShared.push_back(offsetInShared.back()+nDOF1);
1518 for (dim_t i=0; i<nDOF1; i++, numShared++) {
1519 sendShared.push_back(firstDOF+i*nDOF0);
1520 recvShared.push_back(numDOF+numShared);
1521 if (i>0)
1522 colIndices[firstDOF+(i-1)*nDOF0].push_back(numShared);
1523 colIndices[firstDOF+i*nDOF0].push_back(numShared);
1524 if (i<nDOF1-1)
1525 colIndices[firstDOF+(i+1)*nDOF0].push_back(numShared);
1526 m_dofMap[firstNode+i*m_NN[0]]=numDOF+numShared;
1527 }
1528 } else {
1529 // sharing a node
1530 const int dof=(i0+1)/2*(nDOF0-1)+(i1+1)/2*(numDOF-nDOF0);
1531 const int node=(i0+1)/2*(m_NN[0]-1)+(i1+1)/2*m_NN[0]*(m_NN[1]-1);
1532 offsetInShared.push_back(offsetInShared.back()+1);
1533 sendShared.push_back(dof);
1534 recvShared.push_back(numDOF+numShared);
1535 colIndices[dof].push_back(numShared);
1536 m_dofMap[node]=numDOF+numShared;
1537 ++numShared;
1538 }
1539 }
1540 }
1541 }
1542
1543 // create connector
1544 Paso_SharedComponents *snd_shcomp = Paso_SharedComponents_alloc(
1545 numDOF, neighbour.size(), &neighbour[0], &sendShared[0],
1546 &offsetInShared[0], 1, 0, m_mpiInfo);
1547 Paso_SharedComponents *rcv_shcomp = Paso_SharedComponents_alloc(
1548 numDOF, neighbour.size(), &neighbour[0], &recvShared[0],
1549 &offsetInShared[0], 1, 0, m_mpiInfo);
1550 m_connector = Paso_Connector_alloc(snd_shcomp, rcv_shcomp);
1551 Paso_SharedComponents_free(snd_shcomp);
1552 Paso_SharedComponents_free(rcv_shcomp);
1553
1554 // create main and couple blocks
1555 Paso_Pattern *mainPattern = createMainPattern();
1556 Paso_Pattern *colPattern, *rowPattern;
1557 createCouplePatterns(colIndices, numShared, &colPattern, &rowPattern);
1558
1559 // allocate paso distribution
1560 Paso_Distribution* distribution = Paso_Distribution_alloc(m_mpiInfo,
1561 const_cast<index_t*>(&m_nodeDistribution[0]), 1, 0);
1562
1563 // finally create the system matrix
1564 m_pattern = Paso_SystemMatrixPattern_alloc(MATRIX_FORMAT_DEFAULT,
1565 distribution, distribution, mainPattern, colPattern, rowPattern,
1566 m_connector, m_connector);
1567
1568 Paso_Distribution_free(distribution);
1569
1570 // useful debug output
1571 /*
1572 cout << "--- rcv_shcomp ---" << endl;
1573 cout << "numDOF=" << numDOF << ", numNeighbors=" << neighbour.size() << endl;
1574 for (size_t i=0; i<neighbour.size(); i++) {
1575 cout << "neighbor[" << i << "]=" << neighbour[i]
1576 << " offsetInShared[" << i+1 << "]=" << offsetInShared[i+1] << endl;
1577 }
1578 for (size_t i=0; i<recvShared.size(); i++) {
1579 cout << "shared[" << i << "]=" << recvShared[i] << endl;
1580 }
1581 cout << "--- snd_shcomp ---" << endl;
1582 for (size_t i=0; i<sendShared.size(); i++) {
1583 cout << "shared[" << i << "]=" << sendShared[i] << endl;
1584 }
1585 cout << "--- dofMap ---" << endl;
1586 for (size_t i=0; i<m_dofMap.size(); i++) {
1587 cout << "m_dofMap[" << i << "]=" << m_dofMap[i] << endl;
1588 }
1589 cout << "--- colIndices ---" << endl;
1590 for (size_t i=0; i<colIndices.size(); i++) {
1591 cout << "colIndices[" << i << "].size()=" << colIndices[i].size() << endl;
1592 }
1593 */
1594
1595 /*
1596 cout << "--- main_pattern ---" << endl;
1597 cout << "M=" << mainPattern->numOutput << ", N=" << mainPattern->numInput << endl;
1598 for (size_t i=0; i<mainPattern->numOutput+1; i++) {
1599 cout << "ptr[" << i << "]=" << mainPattern->ptr[i] << endl;
1600 }
1601 for (size_t i=0; i<mainPattern->ptr[mainPattern->numOutput]; i++) {
1602 cout << "index[" << i << "]=" << mainPattern->index[i] << endl;
1603 }
1604 */
1605
1606 /*
1607 cout << "--- colCouple_pattern ---" << endl;
1608 cout << "M=" << colPattern->numOutput << ", N=" << colPattern->numInput << endl;
1609 for (size_t i=0; i<colPattern->numOutput+1; i++) {
1610 cout << "ptr[" << i << "]=" << colPattern->ptr[i] << endl;
1611 }
1612 for (size_t i=0; i<colPattern->ptr[colPattern->numOutput]; i++) {
1613 cout << "index[" << i << "]=" << colPattern->index[i] << endl;
1614 }
1615 */
1616
1617 /*
1618 cout << "--- rowCouple_pattern ---" << endl;
1619 cout << "M=" << rowPattern->numOutput << ", N=" << rowPattern->numInput << endl;
1620 for (size_t i=0; i<rowPattern->numOutput+1; i++) {
1621 cout << "ptr[" << i << "]=" << rowPattern->ptr[i] << endl;
1622 }
1623 for (size_t i=0; i<rowPattern->ptr[rowPattern->numOutput]; i++) {
1624 cout << "index[" << i << "]=" << rowPattern->index[i] << endl;
1625 }
1626 */
1627
1628 Paso_Pattern_free(mainPattern);
1629 Paso_Pattern_free(colPattern);
1630 Paso_Pattern_free(rowPattern);
1631 }
1632
1633 //private
1634 void Rectangle::addToMatrixAndRHS(Paso_SystemMatrix* S, escript::Data& F,
1635 const vector<double>& EM_S, const vector<double>& EM_F, bool addS,
1636 bool addF, index_t firstNode, dim_t nEq, dim_t nComp) const
1637 {
1638 IndexVector rowIndex;
1639 rowIndex.push_back(m_dofMap[firstNode]);
1640 rowIndex.push_back(m_dofMap[firstNode+1]);
1641 rowIndex.push_back(m_dofMap[firstNode+m_NN[0]]);
1642 rowIndex.push_back(m_dofMap[firstNode+m_NN[0]+1]);
1643 if (addF) {
1644 double *F_p=F.getSampleDataRW(0);
1645 for (index_t i=0; i<rowIndex.size(); i++) {
1646 if (rowIndex[i]<getNumDOF()) {
1647 for (index_t eq=0; eq<nEq; eq++) {
1648 F_p[INDEX2(eq, rowIndex[i], nEq)]+=EM_F[INDEX2(eq,i,nEq)];
1649 }
1650 }
1651 }
1652 }
1653 if (addS) {
1654 addToSystemMatrix(S, rowIndex, nEq, rowIndex, nComp, EM_S);
1655 }
1656 }
1657
1658 //protected
1659 void Rectangle::interpolateNodesOnElements(escript::Data& out,
1660 escript::Data& in, bool reduced) const
1661 {
1662 const dim_t numComp = in.getDataPointSize();
1663 if (reduced) {
1664 out.requireWrite();
1665 const double c0 = 0.25;
1666 #pragma omp parallel
1667 {
1668 vector<double> f_00(numComp);
1669 vector<double> f_01(numComp);
1670 vector<double> f_10(numComp);
1671 vector<double> f_11(numComp);
1672 #pragma omp for
1673 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1674 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1675 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
1676 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
1677 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
1678 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
1679 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
1680 for (index_t i=0; i < numComp; ++i) {
1681 o[INDEX2(i,numComp,0)] = c0*(f_00[i] + f_01[i] + f_10[i] + f_11[i]);
1682 } /* end of component loop i */
1683 } /* end of k0 loop */
1684 } /* end of k1 loop */
1685 } /* end of parallel section */
1686 } else {
1687 out.requireWrite();
1688 const double c0 = 0.16666666666666666667;
1689 const double c1 = 0.044658198738520451079;
1690 const double c2 = 0.62200846792814621559;
1691 #pragma omp parallel
1692 {
1693 vector<double> f_00(numComp);
1694 vector<double> f_01(numComp);
1695 vector<double> f_10(numComp);
1696 vector<double> f_11(numComp);
1697 #pragma omp for
1698 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1699 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1700 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,k1, m_NN[0])), numComp*sizeof(double));
1701 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,k1+1, m_NN[0])), numComp*sizeof(double));
1702 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,k1, m_NN[0])), numComp*sizeof(double));
1703 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,k1+1, m_NN[0])), numComp*sizeof(double));
1704 double* o = out.getSampleDataRW(INDEX2(k0,k1,m_NE[0]));
1705 for (index_t i=0; i < numComp; ++i) {
1706 o[INDEX2(i,numComp,0)] = c0*(f_01[i] + f_10[i]) + c1*f_11[i] + c2*f_00[i];
1707 o[INDEX2(i,numComp,1)] = c0*(f_00[i] + f_11[i]) + c1*f_01[i] + c2*f_10[i];
1708 o[INDEX2(i,numComp,2)] = c0*(f_00[i] + f_11[i]) + c1*f_10[i] + c2*f_01[i];
1709 o[INDEX2(i,numComp,3)] = c0*(f_01[i] + f_10[i]) + c1*f_00[i] + c2*f_11[i];
1710 } /* end of component loop i */
1711 } /* end of k0 loop */
1712 } /* end of k1 loop */
1713 } /* end of parallel section */
1714 }
1715 }
1716
1717 //protected
1718 void Rectangle::interpolateNodesOnFaces(escript::Data& out, escript::Data& in,
1719 bool reduced) const
1720 {
1721 const dim_t numComp = in.getDataPointSize();
1722 if (reduced) {
1723 out.requireWrite();
1724 #pragma omp parallel
1725 {
1726 vector<double> f_00(numComp);
1727 vector<double> f_01(numComp);
1728 vector<double> f_10(numComp);
1729 vector<double> f_11(numComp);
1730 if (m_faceOffset[0] > -1) {
1731 #pragma omp for nowait
1732 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1733 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
1734 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
1735 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
1736 for (index_t i=0; i < numComp; ++i) {
1737 o[INDEX2(i,numComp,0)] = (f_00[i] + f_01[i])/2;
1738 } /* end of component loop i */
1739 } /* end of k1 loop */
1740 } /* end of face 0 */
1741 if (m_faceOffset[1] > -1) {
1742 #pragma omp for nowait
1743 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1744 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
1745 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
1746 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
1747 for (index_t i=0; i < numComp; ++i) {
1748 o[INDEX2(i,numComp,0)] = (f_10[i] + f_11[i])/2;
1749 } /* end of component loop i */
1750 } /* end of k1 loop */
1751 } /* end of face 1 */
1752 if (m_faceOffset[2] > -1) {
1753 #pragma omp for nowait
1754 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1755 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
1756 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
1757 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
1758 for (index_t i=0; i < numComp; ++i) {
1759 o[INDEX2(i,numComp,0)] = (f_00[i] + f_10[i])/2;
1760 } /* end of component loop i */
1761 } /* end of k0 loop */
1762 } /* end of face 2 */
1763 if (m_faceOffset[3] > -1) {
1764 #pragma omp for nowait
1765 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1766 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1767 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1768 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
1769 for (index_t i=0; i < numComp; ++i) {
1770 o[INDEX2(i,numComp,0)] = (f_01[i] + f_11[i])/2;
1771 } /* end of component loop i */
1772 } /* end of k0 loop */
1773 } /* end of face 3 */
1774 } /* end of parallel section */
1775 } else {
1776 out.requireWrite();
1777 const double c0 = 0.21132486540518711775;
1778 const double c1 = 0.78867513459481288225;
1779 #pragma omp parallel
1780 {
1781 vector<double> f_00(numComp);
1782 vector<double> f_01(numComp);
1783 vector<double> f_10(numComp);
1784 vector<double> f_11(numComp);
1785 if (m_faceOffset[0] > -1) {
1786 #pragma omp for nowait
1787 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1788 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(0,k1, m_NN[0])), numComp*sizeof(double));
1789 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(0,k1+1, m_NN[0])), numComp*sizeof(double));
1790 double* o = out.getSampleDataRW(m_faceOffset[0]+k1);
1791 for (index_t i=0; i < numComp; ++i) {
1792 o[INDEX2(i,numComp,0)] = c0*f_01[i] + c1*f_00[i];
1793 o[INDEX2(i,numComp,1)] = c0*f_00[i] + c1*f_01[i];
1794 } /* end of component loop i */
1795 } /* end of k1 loop */
1796 } /* end of face 0 */
1797 if (m_faceOffset[1] > -1) {
1798 #pragma omp for nowait
1799 for (index_t k1=0; k1 < m_NE[1]; ++k1) {
1800 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1, m_NN[0])), numComp*sizeof(double));
1801 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(m_NN[0]-1,k1+1, m_NN[0])), numComp*sizeof(double));
1802 double* o = out.getSampleDataRW(m_faceOffset[1]+k1);
1803 for (index_t i=0; i < numComp; ++i) {
1804 o[INDEX2(i,numComp,0)] = c1*f_10[i] + c0*f_11[i];
1805 o[INDEX2(i,numComp,1)] = c1*f_11[i] + c0*f_10[i];
1806 } /* end of component loop i */
1807 } /* end of k1 loop */
1808 } /* end of face 1 */
1809 if (m_faceOffset[2] > -1) {
1810 #pragma omp for nowait
1811 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1812 memcpy(&f_00[0], in.getSampleDataRO(INDEX2(k0,0, m_NN[0])), numComp*sizeof(double));
1813 memcpy(&f_10[0], in.getSampleDataRO(INDEX2(k0+1,0, m_NN[0])), numComp*sizeof(double));
1814 double* o = out.getSampleDataRW(m_faceOffset[2]+k0);
1815 for (index_t i=0; i < numComp; ++i) {
1816 o[INDEX2(i,numComp,0)] = c0*f_10[i] + c1*f_00[i];
1817 o[INDEX2(i,numComp,1)] = c0*f_00[i] + c1*f_10[i];
1818 } /* end of component loop i */
1819 } /* end of k0 loop */
1820 } /* end of face 2 */
1821 if (m_faceOffset[3] > -1) {
1822 #pragma omp for nowait
1823 for (index_t k0=0; k0 < m_NE[0]; ++k0) {
1824 memcpy(&f_01[0], in.getSampleDataRO(INDEX2(k0,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1825 memcpy(&f_11[0], in.getSampleDataRO(INDEX2(k0+1,m_NN[1]-1, m_NN[0])), numComp*sizeof(double));
1826 double* o = out.getSampleDataRW(m_faceOffset[3]+k0);
1827 for (index_t i=0; i < numComp; ++i) {
1828 o[INDEX2(i,numComp,0)] = c0*f_11[i] + c1*f_01[i];
1829 o[INDEX2(i,numComp,1)] = c0*f_01[i] + c1*f_11[i];
1830 } /* end of component loop i */
1831 } /* end of k0 loop */
1832 } /* end of face 3 */
1833 } /* end of parallel section */
1834 }
1835 }
1836
1837 //protected
1838 void Rectangle::assemblePDESingle(Paso_SystemMatrix* mat,
1839 escript::Data& rhs, const escript::Data& A, const escript::Data& B,
1840 const escript::Data& C, const escript::Data& D,
1841 const escript::Data& X, const escript::Data& Y) const
1842 {
1843 const double SQRT3 = 1.73205080756887719318;
1844 const double w1 = 1.0/24.0;
1845 const double w5 = -SQRT3/24 + 1.0/12;
1846 const double w2 = -SQRT3/24 - 1.0/12;
1847 const double w19 = -m_dx[0]/12;
1848 const double w11 = w19*(SQRT3 + 3)/12;
1849 const double w14 = w19*(-SQRT3 + 3)/12;
1850 const double w16 = w19*(5*SQRT3 + 9)/12;
1851 const double w17 = w19*(-5*SQRT3 + 9)/12;
1852 const double w27 = w19*(-SQRT3 - 3)/2;
1853 const double w28 = w19*(SQRT3 - 3)/2;
1854 const double w18 = -m_dx[1]/12;
1855 const double w12 = w18*(5*SQRT3 + 9)/12;
1856 const double w13 = w18*(-5*SQRT3 + 9)/12;
1857 const double w10 = w18*(SQRT3 + 3)/12;
1858 const double w15 = w18*(-SQRT3 + 3)/12;
1859 const double w25 = w18*(-SQRT3 - 3)/2;
1860 const double w26 = w18*(SQRT3 - 3)/2;
1861 const double w22 = m_dx[0]*m_dx[1]/144;
1862 const double w20 = w22*(SQRT3 + 2);
1863 const double w21 = w22*(-SQRT3 + 2);
1864 const double w23 = w22*(4*SQRT3 + 7);
1865 const double w24 = w22*(-4*SQRT3 + 7);
1866 const double w3 = m_dx[0]/(24*m_dx[1]);
1867 const double w7 = w3*(SQRT3 + 2);
1868 const double w8 = w3*(-SQRT3 + 2);
1869 const double w6 = -m_dx[1]/(24*m_dx[0]);
1870 const double w0 = w6*(SQRT3 + 2);
1871 const double w4 = w6*(-SQRT3 + 2);
1872
1873 rhs.requireWrite();
1874 #pragma omp parallel
1875 {
1876 for (index_t k1_0=0; k1_0<2; k1_0++) { // colouring
1877 #pragma omp for
1878 for (index_t k1=k1_0; k1<m_NE[1]; k1+=2) {
1879 for (index_t k0=0; k0<m_NE[0]; ++k0) {
1880 bool add_EM_S=false;
1881 bool add_EM_F=false;
1882 vector<double> EM_S(4*4, 0);
1883 vector<double> EM_F(4, 0);
1884 const index_t e = k0 + m_NE[0]*k1;
1885 ///////////////
1886 // process A //
1887 ///////////////
1888 if (!A.isEmpty()) {
1889 add_EM_S = true;
1890 const double* A_p = const_cast<escript::Data*>(&A)->getSampleDataRO(e);
1891 if (A.actsExpanded()) {
1892 const double A_00_0 = A_p[INDEX3(0,0,0,2,2)];
1893 const double A_01_0 = A_p[INDEX3(0,1,0,2,2)];
1894 const double A_10_0 = A_p[INDEX3(1,0,0,2,2)];
1895 const double A_11_0 = A_p[INDEX3(1,1,0,2,2)];
1896 const double A_00_1 = A_p[INDEX3(0,0,1,2,2)];
1897 const double A_01_1 = A_p[INDEX3(0,1,1,2,2)];
1898 const double A_10_1 = A_p[INDEX3(1,0,1,2,2)];
1899 const double A_11_1 = A_p[INDEX3(1,1,1,2,2)];
1900 const double A_00_2 = A_p[INDEX3(0,0,2,2,2)];
1901 const double A_01_2 = A_p[INDEX3(0,1,2,2,2)];
1902 const double A_10_2 = A_p[INDEX3(1,0,2,2,2)];
1903 const double A_11_2 = A_p[INDEX3(1,1,2,2,2)];
1904 const double A_00_3 = A_p[INDEX3(0,0,3,2,2)];
1905 const double A_01_3 = A_p[INDEX3(0,1,3,2,2)];
1906 const double A_10_3 = A_p[INDEX3(1,0,3,2,2)];
1907 const double A_11_3 = A_p[INDEX3(1,1,3,2,2)];
1908 const double tmp0 = w3*(A_11_0 + A_11_1 + A_11_2 + A_11_3);
1909 const double tmp1 = w1*(A_01_0 + A_01_3 - A_10_1 - A_10_2);
1910 const double tmp2 = w4*(A_00_2 + A_00_3);
1911 const double tmp3 = w0*(A_00_0 + A_00_1);
1912 const double tmp4 = w5*(A_01_2 - A_10_3);
1913 const double tmp5 = w2*(-A_01_1 + A_10_0);
1914 const double tmp6 = w5*(A_01_3 + A_10_0);
1915 const double tmp7 = w3*(-A_11_0 - A_11_1 - A_11_2 - A_11_3);
1916 const double tmp8 = w6*(A_00_0 + A_00_1 + A_00_2 + A_00_3);
1917 const double tmp9 = w1*(A_01_1 + A_01_2 + A_10_1 + A_10_2);
1918 const double tmp10 = w2*(-A_01_0 - A_10_3);
1919 const double tmp11 = w4*(A_00_0 + A_00_1);
1920 const double tmp12 = w0*(A_00_2 + A_00_3);
1921 const double tmp13 = w5*(A_01_1 - A_10_0);
1922 const double tmp14 = w2*(-A_01_2 + A_10_3);
1923 const double tmp15 = w7*(A_11_0 + A_11_2);
1924 const double tmp16 = w4*(-A_00_2 - A_00_3);
1925 const double tmp17 = w0*(-A_00_0 - A_00_1);
1926 const double tmp18 = w5*(A_01_3 + A_10_3);
1927 const double tmp19 = w8*(A_11_1 + A_11_3);
1928 const double tmp20 = w2*(-A_01_0 - A_10_0);
1929 const double tmp21 = w7*(A_11_1 + A_11_3);
1930 const double tmp22 = w4*(-A_00_0 - A_00_1);
1931 const double tmp23 = w0*(-A_00_2 - A_00_3);
1932 const double tmp24 = w5*(A_01_0 + A_10_0);
1933 const double tmp25 = w8*(A_11_0 + A_11_2);
1934 const double tmp26 = w2*(-A_01_3 - A_10_3);
1935 const double tmp27 = w5*(-A_01_1 - A_10_2);
1936 const double tmp28 = w1*(-A_01_0 - A_01_3 - A_10_0 - A_10_3);
1937 const double tmp29 = w2*(A_01_2 + A_10_1);
1938 const double tmp30 = w7*(-A_11_1 - A_11_3);
1939 const double tmp31 = w1*(-A_01_1 - A_01_2 + A_10_0 + A_10_3);
1940 const double tmp32 = w5*(-A_01_0 + A_10_2);
1941 const double tmp33 = w8*(-A_11_0 - A_11_2);
1942 const double tmp34 = w6*(-A_00_0 - A_00_1 - A_00_2 - A_00_3);
1943 const double tmp35 = w2*(A_01_3 - A_10_1);
1944 const double tmp36 = w5*(A_01_0 + A_10_3);
1945 const double tmp37 = w2*(-A_01_3 - A_10_0);
1946 const double tmp38 = w7*(-A_11_0 - A_11_2);
1947 const double tmp39 = w5*(-A_01_3 + A_10_1);
1948 const double tmp40 = w8*(-A_11_1 - A_11_3);
1949 const double tmp41 = w2*(A_01_0 - A_10_2);
1950 const double tmp42 = w5*(A_01_1 - A_10_3);
1951 const double tmp43 = w2*(-A_01_2 + A_10_0);
1952 const double tmp44 = w5*(A_01_2 - A_10_0);
1953 const double tmp45 = w2*(-A_01_1 + A_10_3);
1954 const double tmp46 = w5*(-A_01_0 + A_10_1);
1955 const double tmp47 = w2*(A_01_3 - A_10_2);
1956 const double tmp48 = w5*(-A_01_1 - A_10_1);
1957 const double tmp49 = w2*(A_01_2 + A_10_2);
1958 const double tmp50 = w5*(-A_01_3 + A_10_2);
1959 const double tmp51 = w2*(A_01_0 - A_10_1);
1960 const double tmp52 = w5*(-A_01_2 - A_10_1);
1961 const double tmp53 = w2*(A_01_1 + A_10_2);
1962 const double tmp54 = w5*(-A_01_2 - A_10_2);
1963 const double tmp55 = w2*(A_01_1 + A_10_1);
1964 EM_S[INDEX2(0,0,4)]+=tmp15 + tmp16 + tmp17 + tmp18 + tmp19 + tmp20 + tmp9;
1965 EM_S[INDEX2(0,1,4)]+=tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5;
1966 EM_S[INDEX2(0,2,4)]+=tmp31 + tmp34 + tmp38 + tmp39 + tmp40 + tmp41;
1967 EM_S[INDEX2(0,3,4)]+=tmp28 + tmp52 + tmp53 + tmp7 + tmp8;
1968 EM_S[INDEX2(1,0,4)]+=tmp0 + tmp2 + tmp3 + tmp31 + tmp50 + tmp51;
1969 EM_S[INDEX2(1,1,4)]+=tmp16 + tmp17 + tmp21 + tmp25 + tmp28 + tmp54 + tmp55;
1970 EM_S[INDEX2(1,2,4)]+=tmp10 + tmp6 + tmp7 + tmp8 + tmp9;
1971 EM_S[INDEX2(1,3,4)]+=tmp1 + tmp30 + tmp33 + tmp34 + tmp44 + tmp45;
1972 EM_S[INDEX2(2,0,4)]+=tmp1 + tmp34 + tmp38 + tmp40 + tmp42 + tmp43;
1973 EM_S[INDEX2(2,1,4)]+=tmp36 + tmp37 + tmp7 + tmp8 + tmp9;
1974 EM_S[INDEX2(2,2,4)]+=tmp15 + tmp19 + tmp22 + tmp23 + tmp28 + tmp48 + tmp49;
1975 EM_S[INDEX2(2,3,4)]+=tmp0 + tmp11 + tmp12 + tmp31 + tmp46 + tmp47;
1976 EM_S[INDEX2(3,0,4)]+=tmp27 + tmp28 + tmp29 + tmp7 + tmp8;
1977 EM_S[INDEX2(3,1,4)]+=tmp30 + tmp31 + tmp32 + tmp33 + tmp34 + tmp35;
1978 EM_S[INDEX2(3,2,4)]+=tmp0 + tmp1 + tmp11 + tmp12 + tmp13 + tmp14;
1979 EM_S[INDEX2(3,3,4)]+=tmp21 + tmp22 + tmp23 + tmp24 + tmp25 + tmp26 + tmp9;
1980 } else { // constant data
1981 const double A_00 = A_p[INDEX2(0,0,2)];
1982 const double A_01 = A_p[INDEX2(0,1,2)];
1983 const double A_10 = A_p[INDEX2(1,0,2)];
1984 const double A_11 = A_p[INDEX2(1,1,2)];
1985 const double tmp0 = 6*w1*(A_01 - A_10);
1986 const double tmp1 = 6*w1*(A_01 + A_10);
1987 const double tmp2 = 6*w1*(-A_01 - A_10);
1988 const double tmp3 = 6*w1*(-A_01 + A_10);
1989 EM_S[INDEX2(0,0,4)]+=-8*A_00*w6 + 8*A_11*w3 + tmp1;
1990 EM_S[INDEX2(0,1,4)]+=8*A_00*w6 + 4*A_11*w3 + tmp0;
1991 EM_S[INDEX2(0,2,4)]+=-4*A_00*w6 - 8*A_11*w3 + tmp3;
1992 EM_S[INDEX2(0,3,4)]+=4*A_00*w6 - 4*A_11*w3 + tmp2;
1993 EM_S[INDEX2(1,0,4)]+=8*A_00*w6 + 4*A_11*w3 + tmp3;
1994 EM_S[INDEX2(1,1,4)]+=-8*A_00*w6 + 8*A_11*w3 + tmp2;
1995 EM_S[INDEX2(1,2,4)]+=4*A_00*w6 - 4*A_11*w3 + tmp1;
1996 EM_S[INDEX2(1,3,4)]+=-4*A_00*w6 - 8*A_11*w3 + tmp0;
1997 EM_S[INDEX2(2,0,4)]+=-4*A_00*w6 - 8*A_11*w3 + tmp0;
1998 EM_S[INDEX2(2,1,4)]+=4*A_00*w6 - 4*A_11*w3 + tmp1;
1999 EM_S[INDEX2(2,2,4)]+=-8*A_00*w6 + 8*A_11*w3 + tmp2;
2000 EM_S[INDEX2(2,3,4)]+=8*A_00*w6 + 4*A_11*w3 + tmp3;
2001 EM_S[INDEX2(3,0,4)]+=4*A_00*w6 - 4*A_11*w3 + tmp2;
2002 EM_S[INDEX2(3,1,4)]+=-4*A_00*w6 - 8*A_11*w3 + tmp3;
2003 EM_S[INDEX2(3,2,4)]+=8*A_00*w6 + 4*A_11*w3 + tmp0;
2004 EM_S[INDEX2(3,3,4)]+=-8*A_00*w6 + 8*A_11*w3 + tmp1;
2005 }
2006 }
2007 ///////////////
2008 // process B //
2009 ///////////////
2010 if (!B.isEmpty()) {
2011 add_EM_S=true;
2012 const double* B_p=const_cast<escript::Data*>(&B)->getSampleDataRO(e);
2013 if (B.actsExpanded()) {
2014 const double B_0_0 = B_p[INDEX2(0,0,2)];
2015 const double B_1_0 = B_p[INDEX2(1,0,2)];
2016 const double B_0_1 = B_p[INDEX2(0,1,2)];
2017 const double B_1_1 = B_p[INDEX2(1,1,2)];
2018 const double B_0_2 = B_p[INDEX2(0,2,2)];
2019 const double B_1_2 = B_p[INDEX2(1,2,2)];
2020 const double B_0_3 = B_p[INDEX2(0,3,2)];
2021 const double B_1_3 = B_p[INDEX2(1,3,2)];
2022 const double tmp0 = w11*(B_1_0 + B_1_1);
2023 const double tmp1 = w14*(B_1_2 + B_1_3);
2024 const double tmp2 = w15*(-B_0_1 - B_0_3);
2025 const double tmp3 = w10*(-B_0_0 - B_0_2);
2026 const double tmp4 = w11*(B_1_2 + B_1_3);
2027 const double tmp5 = w14*(B_1_0 + B_1_1);
2028 const double tmp6 = w11*(-B_1_2 - B_1_3);
2029 const double tmp7 = w14*(-B_1_0 - B_1_1);
2030 const double tmp8 = w11*(-B_1_0 - B_1_1);
2031 const double tmp9 = w14*(-B_1_2 - B_1_3);
2032 const double tmp10 = w10*(-B_0_1 - B_0_3);
2033 const double tmp11 = w15*(-B_0_0 - B_0_2);
2034 const double tmp12 = w15*(B_0_0 + B_0_2);
2035 const double tmp13 = w10*(B_0_1 + B_0_3);
2036 const double tmp14 = w10*(B_0_0 + B_0_2);
2037 const double tmp15 = w15*(B_0_1 + B_0_3);
2038 EM_S[INDEX2(0,0,4)]+=B_0_0*w12 + B_0_1*w10 + B_0_2*w15 + B_0_3*w13 + B_1_0*w16 + B_1_1*w14 + B_1_2*w11 + B_1_3*w17;
2039 EM_S[INDEX2(0,1,4)]+=B_0_0*w10 + B_0_1*w12 + B_0_2*w13 + B_0_3*w15 + tmp0 + tmp1;
2040 EM_S[INDEX2(0,2,4)]+=B_1_0*w11 + B_1_1*w17 + B_1_2*w16 + B_1_3*w14 + tmp14 + tmp15;
2041 EM_S[INDEX2(0,3,4)]+=tmp12 + tmp13 + tmp4 + tmp5;
2042 EM_S[INDEX2(1,0,4)]+=-B_0_0*w12 - B_0_1*w10 - B_0_2*w15 - B_0_3*w13 + tmp0 + tmp1;
2043 EM_S[INDEX2(1,1,4)]+=-B_0_0*w10 - B_0_1*w12 - B_0_2*w13 - B_0_3*w15 + B_1_0*w14 + B_1_1*w16 + B_1_2*w17 + B_1_3*w11;
2044 EM_S[INDEX2(1,2,4)]+=tmp2 + tmp3 + tmp4 + tmp5;
2045 EM_S[INDEX2(1,3,4)]+=B_1_0*w17 + B_1_1*w11 + B_1_2*w14 + B_1_3*w16 + tmp10 + tmp11;
2046 EM_S[INDEX2(2,0,4)]+=-B_1_0*w16 - B_1_1*w14 - B_1_2*w11 - B_1_3*w17 + tmp14 + tmp15;
2047 EM_S[INDEX2(2,1,4)]+=tmp12 + tmp13 + tmp8 + tmp9;
2048 EM_S[INDEX2(2,2,4)]+=B_0_0*w15 + B_0_1*w13 + B_0_2*w12 + B_0_3*w10 - B_1_0*w11 - B_1_1*w17 - B_1_2*w16 - B_1_3*w14;
2049 EM_S[INDEX2(2,3,4)]+=B_0_0*w13 + B_0_1*w15 + B_0_2*w10 + B_0_3*w12 + tmp6 + tmp7;
2050 EM_S[INDEX2(3,0,4)]+=tmp2 + tmp3 + tmp8 + tmp9;
2051 EM_S[INDEX2(3,1,4)]+=-B_1_0*w14 - B_1_1*w16 - B_1_2*w17 - B_1_3*w11 + tmp10 + tmp11;
2052 EM_S[INDEX2(3,2,4)]+=-B_0_0*w15 - B_0_1*w13 - B_0_2*w12 - B_0_3*w10 + tmp6 + tmp7;
2053 EM_S[INDEX2(3,3,4)]+=-B_0_0*w13 - B_0_1*w15 - B_0_2*w10 - B_0_3*w12 - B_1_0*w17 - B_1_1*w11 - B_1_2*w14 - B_1_3*w16;
2054 } else { // constant data
2055 const double B_0 = B_p[0];
2056 const double B_1 = B_p[1];
2057 EM_S[INDEX2(0,0,4)]+= 2*B_0*w18 + 2*B_1*w19;
2058 EM_S[INDEX2(0,1,4)]+= 2*B_0*w18 + B_1*w19;
2059 EM_S[INDEX2(0,2,4)]+= B_0*w18 + 2*B_1*w19;
2060 EM_S[INDEX2(0,3,4)]+= B_0*w18 + B_1*w19;
2061 EM_S[INDEX2(1,0,4)]+=-2*B_0*w18 + B_1*w19;
2062 EM_S[INDEX2(1,1,4)]+=-2*B_0*w18 + 2*B_1*w19;
2063 EM_S[INDEX2(1,2,4)]+= -B_0*w18 + B_1*w19;
2064 EM_S[INDEX2(1,3,4)]+= -B_0*w18 + 2*B_1*w19;
2065 EM_S[INDEX2(2,0,4)]+= B_0*w18 - 2*B_1*w19;
2066 EM_S[INDEX2(2,1,4)]+= B_0*w18 - B_1*w19;
2067 EM_S[INDEX2(2,2,4)]+= 2*B_0*w18 - 2*B_1*w19;
2068 EM_S[INDEX2(2,3,4)]+= 2*B_0*w18 - B_1*w19;
2069 EM_S[INDEX2(3,0,4)]+= -B_0*w18 - B_1*w19;
2070 EM_S[INDEX2(3,1,4)]+= -B_0*w18 - 2*B_1*w19;
2071 EM_S[INDEX2(3,2,4)]+=-2*B_0*w18 - B_1*w19;
2072 EM_S[INDEX2(3,3,4)]+=-2*B_0*w18 - 2*B_1*w19;
2073 }
2074 }
2075 ///////////////
2076 // process C //
2077 ///////////////
2078 if (!C.isEmpty()) {
2079 add_EM_S=true;
2080 const double* C_p=const_cast<escript::Data*>(&C)->getSampleDataRO(e);
2081 if (C.actsExpanded()) {
2082 const double C_0_0 = C_p[INDEX2(0,0,2)];
2083 const double C_1_0 = C_p[INDEX2(1,0,2)];
2084 const double C_0_1 = C_p[INDEX2(0,1,2)];
2085 const double C_1_1 = C_p[INDEX2(1,1,2)];
2086 const double C_0_2 = C_p[INDEX2(0,2,2)];
2087 const double C_1_2 = C_p[INDEX2(1,2,2)];
2088 const double C_0_3 = C_p[INDEX2(0,3,2)];
2089 const double C_1_3 = C_p[INDEX2(1,3,2)];
2090 const double tmp0 = w11*(C_1_0 + C_1_1);
2091 const double tmp1 = w14*(C_1_2 + C_1_3);
2092 const double tmp2 = w15*(C_0_0 + C_0_2);
2093 const double tmp3 = w10*(C_0_1 + C_0_3);
2094 const double tmp4 = w11*(-C_1_0 - C_1_1);
2095 const double tmp5 = w14*(-C_1_2 - C_1_3);
2096 const double tmp6 = w11*(-C_1_2 - C_1_3);
2097 const double tmp7 = w14*(-C_1_0 - C_1_1);
2098 const double tmp8 = w11*(C_1_2 + C_1_3);
2099 const double tmp9 = w14*(C_1_0 + C_1_1);
2100 const double tmp10 = w10*(-C_0_1 - C_0_3);
2101 const double tmp11 = w15*(-C_0_0 - C_0_2);
2102 const double tmp12 = w15*(-C_0_1 - C_0_3);
2103 const double tmp13 = w10*(-C_0_0 - C_0_2);
2104 const double tmp14 = w10*(C_0_0 + C_0_2);
2105 const double tmp15 = w15*(C_0_1 + C_0_3);
2106 EM_S[INDEX2(0,0,4)]+=C_0_0*w12 + C_0_1*w10 + C_0_2*w15 + C_0_3*w13 + C_1_0*w16 + C_1_1*w14 + C_1_2*w11 + C_1_3*w17;
2107 EM_S[INDEX2(0,1,4)]+=-C_0_0*w12 - C_0_1*w10 - C_0_2*w15 - C_0_3*w13 + tmp0 + tmp1;
2108 EM_S[INDEX2(0,2,4)]+=-C_1_0*w16 - C_1_1*w14 - C_1_2*w11 - C_1_3*w17 + tmp14 + tmp15;
2109 EM_S[INDEX2(0,3,4)]+=tmp12 + tmp13 + tmp4 + tmp5;
2110 EM_S[INDEX2(1,0,4)]+=C_0_0*w10 + C_0_1*w12 + C_0_2*w13 + C_0_3*w15 + tmp0 + tmp1;
2111 EM_S[INDEX2(1,1,4)]+=-C_0_0*w10 - C_0_1*w12 - C_0_2*w13 - C_0_3*w15 + C_1_0*w14 + C_1_1*w16 + C_1_2*w17 + C_1_3*w11;
2112 EM_S[INDEX2(1,2,4)]+=tmp2 + tmp3 + tmp4 + tmp5;
2113 EM_S[INDEX2(1,3,4)]+=-C_1_0*w14 - C_1_1*w16 - C_1_2*w17 - C_1_3*w11 + tmp10 + tmp11;
2114 EM_S[INDEX2(2,0,4)]+=C_1_0*w11 + C_1_1*w17 + C_1_2*w16 + C_1_3*w14 + tmp14 + tmp15;
2115 EM_S[INDEX2(2,1,4)]+=tmp12 + tmp13 + tmp8 + tmp9;
2116 EM_S[INDEX2(2,2,4)]+=C_0_0*w15 + C_0_1*w13 + C_0_2*w12 + C_0_3*w10 - C_1_0*w11 - C_1_1*w17 - C_1_2*w16 - C_1_3*w14;
2117 EM_S[INDEX2(2,3,4)]+=-C_0_0*w15 - C_0_1*w13 - C_0_2*w12 - C_0_3*w10 + tmp6 + tmp7;
2118 EM_S[INDEX2(3,0,4)]+=tmp2 + tmp3 + tmp8 + tmp9;
2119 EM_S[INDEX2(3,1,4)]+=C_1_0*w17 + C_1_1*w11 + C_1_2*w14 + C_1_3*w16 + tmp10 + tmp11;
2120 EM_S[INDEX2(3,2,4)]+=C_0_0*w13 + C_0_1*w15 + C_0_2*w10 + C_0_3*w12 + tmp6 + tmp7;
2121 EM_S[INDEX2(3,3,4)]+=-C_0_0*w13 - C_0_1*w15 - C_0_2*w10 - C_0_3*w12 - C_1_0*w17 - C_1_1*w11 - C_1_2*w14 - C_1_3*w16;
2122 } else { // constant data
2123 const double C_0 = C_p[0];
2124 const double C_1 = C_p[1];
2125 EM_S[INDEX2(0,0,4)]+= 2*C_0*w18 + 2*C_1*w19;
2126 EM_S[INDEX2(0,1,4)]+=-2*C_0*w18 + C_1*w19;
2127 EM_S[INDEX2(0,2,4)]+= C_0*w18 - 2*C_1*w19;
2128 EM_S[INDEX2(0,3,4)]+= -C_0*w18 - C_1*w19;
2129 EM_S[INDEX2(1,0,4)]+= 2*C_0*w18 + C_1*w19;
2130 EM_S[INDEX2(1,1,4)]+=-2*C_0*w18 + 2*C_1*w19;
2131 EM_S[INDEX2(1,2,4)]+= C_0*w18 - C_1*w19;
2132 EM_S[INDEX2(1,3,4)]+= -C_0*w18 - 2*C_1*w19;
2133 EM_S[INDEX2(2,0,4)]+= C_0*w18 + 2*C_1*w19;
2134 EM_S[INDEX2(2,1,4)]+= -C_0*w18 + C_1*w19;
2135 EM_S[INDEX2(2,2,4)]+= 2*C_0*w18 - 2*C_1*w19;
2136 EM_S[INDEX2(2,3,4)]+=-2*C_0*w18 - C_1*w19;
2137 EM_S[INDEX2(3,0,4)]+= C_0*w18 + C_1*w19;
2138 EM_S[INDEX2(3,1,4)]+= -C_0*w18 + 2*C_1*w19;
2139 EM_S[INDEX2(3,2,4)]+= 2*C_0*w18 - C_1*w19;
2140 EM_S[INDEX2(3,3,4)]+=-2*C_0*w18 - 2*C_1*w19;
2141 }
2142 }
2143 ///////////////
2144 // process D //
2145 ///////////////
2146 if (!D.isEmpty()) {
2147 add_EM_S=true;
2148 const double* D_p=const_cast<escript::Data*>(&D)->getSampleDataRO(e);
2149 if (D.actsExpanded()) {
2150 const double D_0 = D_p[0];
2151 const double D_1 = D_p[1];
2152 const double D_2 = D_p[2];
2153 const double D_3 = D_p[3];
2154 const double tmp0 = w21*(