
Contents of /branches/split/escriptcore/src/SplitWorld.cpp



Revision 4746
Thu Mar 13 06:23:15 2014 UTC by jfenwick
File size: 10291 byte(s)
Code to gather information about exceptions on remote worlds
and rethrow to all ranks in all worlds.

Note: If this exception brings down the world, then some ranks may be
killed by the MPI system before the exception reaches _their_ top level.


/*****************************************************************************
*
* Copyright (c) 2014 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development 2012-2013 by School of Earth Sciences
* Development from 2014 by Centre for Geoscience Computing (GeoComp)
*
*****************************************************************************/

#include "esysUtils/Esys_MPI.h"
#include "SplitWorld.h"
#include "AbstractDomain.h"
#include "SplitWorldException.h"

#include <iostream>

using namespace boost::python;
using namespace escript;
SplitWorld::SplitWorld(unsigned int numgroups, MPI_Comm global)
    :globalcom(global), subcom(MPI_COMM_NULL), localworld((SubWorld*)0),
     swcount(numgroups>0?numgroups:1), jobcounter(1)
{
    int grank=0;
    int wsize=1;        // each world has this many processes
#ifdef ESYS_MPI
    int gsize=1;
    if ((MPI_Comm_size(global, &gsize)!=MPI_SUCCESS) || (MPI_Comm_rank(global, &grank)!=MPI_SUCCESS))
    {
        throw SplitWorldException("MPI appears to be inoperative.");
    }
    if (gsize%swcount!=0)
    {
        throw SplitWorldException("SplitWorld error: requested number of groups is not a factor of global communicator size.");
    }
    wsize=gsize/swcount;
    // Split the communicator we were handed rather than MPI_COMM_WORLD, so
    // this also works on a sub-communicator: each contiguous block of wsize
    // ranks becomes one subworld.
    int res=MPI_Comm_split(global, grank/wsize, grank%wsize, &subcom);
    if (res!=MPI_SUCCESS)
    {
        throw SplitWorldException("SplitWorld error: Unable to form communicator.");
    }
#endif
    localworld=SubWorld_ptr(new SubWorld(subcom));
    localid=grank/wsize;
}
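
// Worked example (illustrative, not part of the original source): with
// gsize=8 global ranks and numgroups=2, wsize=8/2=4 and the split gives
//
//   global rank grank:    0 1 2 3 4 5 6 7
//   colour grank/wsize:   0 0 0 0 1 1 1 1   (which subworld)
//   key    grank%wsize:   0 1 2 3 0 1 2 3   (rank within that subworld)
//
// Asking for 3 groups over 8 ranks would fail the gsize%swcount check.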

// We may need to look into this more closely.
// What if the domain lives longer than the world splitter?
SplitWorld::~SplitWorld()
{
#ifdef ESYS_MPI
    if (subcom!=MPI_COMM_NULL)
    {
        MPI_Comm_free(&subcom);
    }
#endif
}


// The boost wrapper will ensure that there is at least one entry in the tuple
object SplitWorld::buildDomains(tuple t, dict kwargs)
{
    int tsize=len(t);
    // get the callable that we will invoke in a moment
    object tocall=t[0];
    // make a new tuple without the first element
    tuple ntup=tuple(t.slice(1,tsize));
    // now add the subworld to the kwargs
    kwargs["escriptworld"]=localworld;

    // pass the whole package to the python call
    object dobj=tocall(*ntup, **kwargs);
    extract<Domain_ptr> ex1(dobj);
    if (!ex1.check())
    {
        throw SplitWorldException("The domain constructor did not return a domain.");
    }
    Domain_ptr dptr=ex1();

    // sanity check: did the domain respect the communicator info we passed it?
    if (dptr->getMPIComm()!=localworld->getComm())
    {
        throw SplitWorldException("The newly constructed domain is not using the correct communicator.");
    }
    localworld->setDomain(dptr);
    return object();    // return None
}
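
// Usage sketch (hypothetical spelling; the Python-side name is bound to
// raw_buildDomains in the module wrapper, not in this file). The first
// argument after the SplitWorld is the domain factory, the rest are its
// arguments, and 'escriptworld' is injected as a keyword automatically:
//
//   sw = SplitWorld(2)
//   buildDomains(sw, Rectangle, 10, 10)  # -> Rectangle(10, 10, escriptworld=<subworld>)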

// Executes all pending jobs on all subworlds
void SplitWorld::runJobs()
{
    distributeJobs();
    int mres=0;
    std::string err;
    do
    {
        // now we actually need to run the jobs
        // everybody will be executing their localworld's jobs
        int res=localworld->runJobs(err);
        // now we find out about the other worlds
        if (!esysUtils::checkResult(res, mres, globalcom))
        {
            throw SplitWorldException("MPI appears to have failed.");
        }
    } while (mres==1);
    if (mres==2)
    {
        throw SplitWorldException("At least one Job's work() function did not return True/False.");
    }
    else if (mres==3)
    {
        // ship the error message around so that every rank throws the same text
        char* resultstr=0;
        if (!esysUtils::shipString(err.c_str(), &resultstr, globalcom))
        {
            throw SplitWorldException("MPI appears to have failed.");
        }
        std::string s("At least one Job's work() function raised the following exception:\n");
        s+=resultstr;
        throw SplitWorldException(s);
    }
}
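
// Status codes used above (inferred from this file; esysUtils::checkResult
// is presumably a max-reduction of the local status over all ranks, as the
// retired implementation below does explicitly with
//
//   MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)
//
// ): mres==1 means some world still has running jobs, mres==2 means a
// Job's work() returned a non-boolean, and mres==3 means a Job raised an
// exception, whose text is then shipped to every rank via shipString.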

// Retired implementation, kept for reference.
/** a list of tuples/sequences: (Job class, number of instances)*/
// void SplitWorld::runJobs(boost::python::list l)
// {
//     // first count up how many jobs we have in total
//     unsigned int numjobs=0;
//     std::vector<object> classvec;
//     std::vector<unsigned int> countvec;
//     std::vector<unsigned int> partialcounts;
//     for (int i=0;i<len(l);++i)
//     {
//         extract<tuple> ex(l[i]);
//         if (!ex.check())
//         {
//             throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
//         }
//         tuple t=ex();
//         if (len(t)!=2)
//         {
//             throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
//         }
//         extract<unsigned int> ex2(t[1]);
//         unsigned int c=0;
//         if (!ex2.check() || ((c=ex2())==0))
//         {
//             throw SplitWorldException("Number of jobs must be a strictly positive integer.");
//         }
//         classvec.push_back(t[0]);
//         countvec.push_back(c);
//         numjobs+=c;
//         partialcounts.push_back(numjobs);
//     }
//     unsigned int classnum=0;
//     unsigned int lowend=1;
//     unsigned int highend=lowend+numjobs/swcount+(numjobs%swcount);
//     for (int i=1;i<=localid;++i)
//     {
//         lowend=highend;
//         highend=lowend+numjobs/swcount;
//         if (i<numjobs%swcount)
//         {
//             highend++;
//         }
//     }
//     // We could do something more clever about trying to fit Jobs to subworlds
//     // to ensure that instances sharing the same Job class would share the same
//     // world as much as possible but for now we'll do this:
//     for (unsigned int j=1;j<=numjobs;++j)    // job #0 is a sentinel
//     {
//         if (j>partialcounts[classnum])
//         {
//             classnum++;    // we don't need to loop this because each count >0
//         }
//         // now if this is one of the job numbers in our local range,
//         // create an instance of the appropriate class
//         if (j>=lowend and j<highend)
//         {
//             object o=classvec[classnum](localworld->getDomain(), object(j));
//             localworld->addJob(o);
//         }
//     }
//     int mres=0;
//     do
//     {
//         // now we actually need to run the jobs
//         // everybody will be executing their localworld's jobs
//         int res=localworld->runJobs();
//         // now we find out about the other worlds
//         mres=0;
//         if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
//         {
//             throw SplitWorldException("MPI appears to have failed.");
//         }
//     } while (mres==1);
//     if (mres==2)
//     {
//         throw SplitWorldException("At least one Job's work() function did not return True/False.");
//     }
//     else if (mres==3)
//     {
//         throw SplitWorldException("At least one Job's work() function raised an exception.");
//     }
// }

void SplitWorld::registerCrate(escript::crate_ptr c)
{
    protocrates.push_back(c);
}

/**
  Stores the constructor/factory used to make Jobs, along with its arguments.
*/
void SplitWorld::addJob(boost::python::object creator, boost::python::tuple tup, boost::python::dict kw)
{
    create.push_back(creator);
    tupargs.push_back(tup);
    kwargs.push_back(kw);
}
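
// Usage sketch (hypothetical spelling; the Python-side name is bound to
// raw_addJob at the bottom of this file, and runJobs is assumed to be
// exported as a method of the wrapped class). Each call queues one pending
// job; the 'domain' and 'jobid' keywords are filled in by distributeJobs():
//
//   addJob(sw, MyJob, 42, tolerance=1e-8)
//   sw.runJobs()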

void SplitWorld::clearPendingJobs()
{
    create.clear();
    tupargs.clear();
    kwargs.clear();
}

void SplitWorld::clearActiveJobs()
{
    localworld->clearJobs();
}

// All the job params are known on all the ranks.
void SplitWorld::distributeJobs()
{
    // block-distribute the pending jobs over the subworlds, spreading any
    // remainder one extra job at a time over the lowest-numbered worlds
    unsigned int numjobs=create.size()/swcount;
    unsigned int start=create.size()/swcount*localid;
    if (localid<create.size()%swcount)
    {
        numjobs++;
        start+=localid;
    }
    else
    {
        start+=create.size()%swcount;
    }
    int errstat=0;
    try
    {
        // No other subworld will be looking at this portion of the array
        // so jobs will only be created on one subworld
        for (unsigned int i=start;i<start+numjobs;++i)
        {
            // we need to add some things to the kw map
            kwargs[i]["domain"]=localworld->getDomain();
            kwargs[i]["jobid"]=object(jobcounter+i);
            object job=create[i](*(tupargs[i]), **(kwargs[i]));
            localworld->addJob(job);
        }
    }
    catch (boost::python::error_already_set&)
    {
        // clear the Python error indicator before making further Python calls
        PyErr_Clear();
        errstat=1;
    }
    jobcounter+=create.size();
    clearPendingJobs();

    // MPI check to ensure that job creation worked for everybody
    int mstat=0;
    if (!esysUtils::checkResult(errstat, mstat, globalcom))
    {
        throw SplitWorldException("MPI appears to have failed.");
    }
    if (mstat==1)   // the reduced status, so every rank throws together
    {
        clearActiveJobs();  // discard any jobs created before the failure
        throw SplitWorldException("distributeJobs: Job creation failed.");
    }
}
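
// Worked example (illustrative): 7 pending jobs on swcount=3 worlds give a
// base of 7/3=2 jobs each with remainder 1, so world 0 creates jobs [0,3),
// world 1 creates [3,5) and world 2 creates [5,7). Each job is therefore
// instantiated on exactly one subworld.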

namespace escript
{

boost::python::object raw_buildDomains(boost::python::tuple t, boost::python::dict kwargs)
{
    int l=len(t);
    if (l<2)
    {
        throw SplitWorldException("Insufficient parameters to buildDomains.");
    }
    extract<SplitWorld&> exw(t[0]);
    if (!exw.check())
    {
        throw SplitWorldException("First parameter to buildDomains must be a SplitWorld.");
    }
    SplitWorld& ws=exw();
    tuple ntup=tuple(t.slice(1,l));     // strip off the SplitWorld param
    return ws.buildDomains(ntup, kwargs);
}

boost::python::object raw_addJob(boost::python::tuple t, boost::python::dict kwargs)
{
    int l=len(t);
    if (l<2)
    {
        throw SplitWorldException("Insufficient parameters to addJob.");
    }
    extract<SplitWorld&> exw(t[0]);
    if (!exw.check())
    {
        throw SplitWorldException("First parameter to addJob must be a SplitWorld.");
    }
    SplitWorld& ws=exw();
    object creator=t[1];
    tuple ntup=tuple(t.slice(2,l));     // strip off the SplitWorld and creator params
    ws.addJob(creator, ntup, kwargs);
    return object();
}

}
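
// Registration sketch (assumption: the actual def() calls live in the
// module's wrapper source, not in this file). boost::python::raw_function
// is what lets these helpers receive arbitrary *args/**kwargs:
//
//   #include <boost/python/raw_function.hpp>
//   ...
//   def("buildDomains", raw_function(escript::raw_buildDomains, 2));
//   def("addJob", raw_function(escript::raw_addJob, 2));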
