
/branches/split/escriptcore/src/SplitWorld.cpp



Revision 4746 - Thu Mar 13 06:23:15 2014 UTC by jfenwick
File size: 10291 byte(s)
Code to gather information about exceptions on remote worlds and rethrow them
to all ranks in all worlds.

Note: If this exception brings down the world, then some ranks may be
killed by the MPI system before the exception reaches _their_ top level.
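
The gist of the mechanism this revision adds: every rank reports a local status for
its jobs, the worst status is agreed on collectively, and if any rank caught an
exception its message is shipped to every rank so that all worlds can rethrow the
same error. Below is a minimal sketch of that pattern using plain MPI; it is
illustrative only (the esysUtils::checkResult and shipString helpers used in the
code wrap comparable logic, but their exact behaviour is not shown here, and every
name in the sketch is made up for exposition).

#include <mpi.h>
#include <algorithm>
#include <climits>
#include <stdexcept>
#include <string>
#include <vector>

// Rethrow a remote failure on every rank of 'comm'.  localStatus is 0 for success,
// non-zero when this rank caught an exception whose text is in localMsg.
void rethrowEverywhere(int localStatus, const std::string& localMsg, MPI_Comm comm)
{
    int worst=0;
    if (MPI_Allreduce(&localStatus, &worst, 1, MPI_INT, MPI_MAX, comm)!=MPI_SUCCESS)
        throw std::runtime_error("MPI appears to have failed.");
    if (worst==0)
        return;                         // no rank raised anything
    int rank=0;
    MPI_Comm_rank(comm, &rank);
    // pick the lowest-numbered failing rank as the owner of the message to ship
    int candidate=(localStatus!=0) ? rank : INT_MAX;
    int root=0;
    MPI_Allreduce(&candidate, &root, 1, MPI_INT, MPI_MIN, comm);
    int len=(rank==root) ? static_cast<int>(localMsg.size()) : 0;
    MPI_Bcast(&len, 1, MPI_INT, root, comm);
    std::vector<char> buf(len+1, '\0');
    if (rank==root)
        std::copy(localMsg.begin(), localMsg.end(), buf.begin());
    MPI_Bcast(buf.data(), len, MPI_CHAR, root, comm);
    // every rank now throws the same message, so no world carries on silently
    throw std::runtime_error(std::string("A remote world reported: ")+buf.data());
}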


/*****************************************************************************
*
* Copyright (c) 2014 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development 2012-2013 by School of Earth Sciences
* Development from 2014 by Centre for Geoscience Computing (GeoComp)
*
*****************************************************************************/

#include "esysUtils/Esys_MPI.h"
#include "SplitWorld.h"
#include "AbstractDomain.h"
#include "SplitWorldException.h"

#include <iostream>

using namespace boost::python;
using namespace escript;

SplitWorld::SplitWorld(unsigned int numgroups, MPI_Comm global)
    :globalcom(global), subcom(MPI_COMM_NULL), localworld((SubWorld*)0), swcount(numgroups>0?numgroups:1), jobcounter(1)
{
    int grank=0;
    int wsize=1;        // each world has this many processes
#ifdef ESYS_MPI
    int gsize=1;
    if ((MPI_Comm_size(global, &gsize)!=MPI_SUCCESS) || (MPI_Comm_rank(global, &grank)!=MPI_SUCCESS))
    {
        throw SplitWorldException("MPI appears to be inoperative.");
    }
    if (gsize%swcount!=0)
    {
        throw SplitWorldException("SplitWorld error: requested number of groups is not a factor of global communicator size.");
    }
    wsize=gsize/swcount;        // each world has this many processes
    // split the supplied global communicator (not MPI_COMM_WORLD) so that a
    // caller-supplied communicator is honoured
    int res=MPI_Comm_split(global, grank/wsize, grank%wsize, &subcom);
    if (res!=MPI_SUCCESS)
    {
        throw SplitWorldException("SplitWorld error: Unable to form communicator.");
    }
#endif
    localworld=SubWorld_ptr(new SubWorld(subcom));
    localid=grank/wsize;
}
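
// Illustrative example (added for exposition, not part of the original source):
// with 8 global ranks and numgroups=2, wsize=4, so the colour grank/wsize puts
// ranks 0-3 into subworld 0 and ranks 4-7 into subworld 1, while the key
// grank%wsize numbers the ranks 0-3 inside each subworld; localid records which
// subworld this process belongs to.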

// We may need to look into this more closely.
// What if the domain lives longer than the world splitter?
SplitWorld::~SplitWorld()
{
#ifdef ESYS_MPI
    if (subcom!=MPI_COMM_NULL)
    {
        MPI_Comm_free(&subcom);
    }
#endif
}


// The boost wrapper will ensure that there is at least one entry in the tuple
object SplitWorld::buildDomains(tuple t, dict kwargs)
{
    int tsize=len(t);
    // get the callable that we will invoke in a sec
    object tocall=t[0];
    // make a new tuple without the first element
    tuple ntup=tuple(t.slice(1,tsize));
    // now add the subworld to the kwargs
    kwargs["escriptworld"]=localworld;

//  std::cerr << "About to call function with:\n";
//  //extract<std::string> ex(ntup.attr("__str__")());
//  std::cerr << extract<std::string>(ntup.attr("__str__")())() << std::endl;
// //   for (int i=0;i<tsize-1;++i)
// //   {
// //       std::cout << extract<const char*>(ntup[i])() << " ";
// //   }
//  std::cerr << std::endl;

    // pass the whole package to the python call
    object dobj=tocall(*ntup, **kwargs);
    extract<Domain_ptr> ex1(dobj);
    Domain_ptr dptr=ex1();

    // now do a sanity check to see if the domain has respected the communicator info we passed it.
    if (dptr->getMPIComm()!=localworld->getComm())
    {
        throw SplitWorldException("The newly constructed domain is not using the correct communicator.");
    }
    localworld->setDomain(dptr);
    return object();    // return None
}
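
// For exposition only (names are hypothetical): from Python this method is reached
// through the raw_buildDomains wrapper at the bottom of this file, roughly as
//   buildDomains(splitworld, SomeDomainFactory, arg1, arg2, keyword=value)
// where SomeDomainFactory must accept the injected escriptworld keyword argument
// and construct its domain on that subworld's communicator.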


// Executes all pending jobs on all subworlds
void SplitWorld::runJobs()
{
    distributeJobs();
    int mres=0;
    std::string err;
    do
    {
        // now we actually need to run the jobs
        // everybody will be executing their localworld's jobs
        int res=localworld->runJobs(err);
        // now we find out about the other worlds
        if (!esysUtils::checkResult(res, mres, globalcom))
        {
            throw SplitWorldException("MPI appears to have failed.");
        }
        std::cerr << "I got a res of " << mres << std::endl;

    } while (mres==1);
    if (mres==2)
    {
        throw SplitWorldException("At least one Job's work() function did not return True/False.");
    }
    else if (mres==3)
    {
        std::cerr << "My err string is [" << err << "]\n";

        char* resultstr=0;
        // now we ship around the error message
        if (!esysUtils::shipString(err.c_str(), &resultstr, globalcom))
        {
            throw SplitWorldException("MPI appears to have failed.");
        }
        //throw SplitWorldException("At least one Job's work() function raised an exception.");
        std::string s("At least one Job's work() function raised the following exception:\n");
        s+=resultstr;
//      std::cerr << "My combined [" << s.c_str() << std::endl;
//      char* testing=new char[s.size()+1];
//      strcpy(testing, s.c_str());
        std::cerr << "Pre-throw [[[" << s << "]]]\n";
        throw SplitWorldException(s);
    }
}
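
// Status codes used above, as inferred from the checks in runJobs (they are not
// documented in this file): 1 means at least one job wants another iteration, so
// the loop continues; 0 means all jobs have finished; 2 means some Job's work()
// returned something other than True/False; 3 means some Job raised an exception,
// whose text is then shipped to every rank so all worlds throw the same
// SplitWorldException.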


/** a list of tuples/sequences: (Job class, number of instances)*/
// void SplitWorld::runJobs(boost::python::list l)
// {
//     // first count up how many jobs we have in total
//     unsigned int numjobs=0;
//     std::vector<object> classvec;
//     std::vector<unsigned int> countvec;
//     std::vector<unsigned int> partialcounts;
//     for (int i=0;i<len(l);++i)
//     {
//         extract<tuple> ex(l[i]);
//         if (!ex.check())
//         {
//             throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
//         }
//         tuple t=ex();
//         if (len(t)!=2)
//         {
//             throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
//         }
//         extract<unsigned int> ex2(t[1]);
//         unsigned int c=0;
//         if (!ex2.check() || ((c=ex2())==0))
//         {
//             throw SplitWorldException("Number of jobs must be a strictly positive integer.");
//
//         }
//         classvec.push_back(t[0]);
//         countvec.push_back(c);
//         numjobs+=c;
//         partialcounts.push_back(numjobs);
//     }
//     unsigned int classnum=0;
//     unsigned int lowend=1;
//     unsigned int highend=lowend+numjobs/swcount+(numjobs%swcount);
// //  std::cout << localid << std::endl;
//     for (int i=1;i<=localid;++i)
//     {
//         lowend=highend;
//         highend=lowend+numjobs/swcount;
//         if (i<numjobs%swcount)
//         {
//             highend++;
//         }
//     }
// //  std::cout << "There are " << numjobs << " jobs with range [" << lowend << ", " << highend << ")\n";
//     // We could do something more clever about trying to fit Jobs to subworlds
//     // to ensure that instances sharing the same Job class would share the same
//     // world as much as possible but for now we'll do this:
//     for (unsigned int j=1;j<=numjobs;++j)    // job #0 is a sentinel
//     {
//         if (j>partialcounts[classnum])
//         {
//             classnum++;    // we don't need to loop this because each count >0
//         }
//         // now if this is one of the job numbers in our local range,
//         // create an instance of the appropriate class
//         if (j>=lowend and j<highend)
//         {
//             object o=classvec[classnum](localworld->getDomain(), object(j));
//             localworld->addJob(o);
//         }
//     }
//     int mres=0;
//     do
//     {
//         // now we actually need to run the jobs
//         // everybody will be executing their localworld's jobs
//         int res=localworld->runJobs();
//         // now we find out about the other worlds
//         mres=0;
//         if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
//         {
//             throw SplitWorldException("MPI appears to have failed.");
//         }
//     } while (mres==1);
//     if (mres==2)
//     {
//         throw SplitWorldException("At least one Job's work() function did not return True/False.");
//     }
//     else if (mres==3)
//     {
//         throw SplitWorldException("At least one Job's work() function raised an exception.");
//     }
// }


void SplitWorld::registerCrate(escript::crate_ptr c)
{
    protocrates.push_back(c);
}

/**
  stores the constructor/factory to make Jobs and the parameters.
*/
void SplitWorld::addJob(boost::python::object creator, boost::python::tuple tup, boost::python::dict kw)
{
    create.push_back(creator);
    tupargs.push_back(tup);
    kwargs.push_back(kw);
}

void SplitWorld::clearPendingJobs()
{
    create.clear();
    tupargs.clear();
    kwargs.clear();
}

void SplitWorld::clearActiveJobs()
{
    localworld->clearJobs();
}

// All the job params are known on all the ranks.
void SplitWorld::distributeJobs()
{
    unsigned int numjobs=create.size()/swcount;
    unsigned int start=create.size()/swcount*localid;
    if (localid<create.size()%swcount)
    {
        numjobs++;
        start+=localid;
    }
    else
    {
        start+=create.size()%swcount;
    }
    int errstat=0;
    try
    {
        std::cerr << "Numjobs=" << numjobs << " start=" << start << std::endl;
        // No other subworld will be looking at this portion of the array
        // so jobs will only be created on one subworld
        for (unsigned int i=start;i<start+numjobs;++i)
        {
            // we need to add some things to the kw map
            kwargs[i]["domain"]=localworld->getDomain();
            kwargs[i]["jobid"]=object(jobcounter+i);
            object job=create[i](*(tupargs[i]), **(kwargs[i]));
            localworld->addJob(job);
        }
    }
    catch (boost::python::error_already_set&)
    {
        errstat=1;
    }
    jobcounter+=create.size();
    clearPendingJobs();

    // MPI check to ensure that it worked for everybody
    int mstat=0;
    if (!esysUtils::checkResult(errstat, mstat, globalcom))
    {
        throw SplitWorldException("MPI appears to have failed.");
    }

    if (mstat==1)       // at least one rank failed to create its jobs, so everyone aborts
    {
        clearActiveJobs();
        throw SplitWorldException("distributeJobs: Job creation failed.");
    }
}
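
// Worked example (added for exposition, not part of the original source): with 7
// pending jobs and swcount=3, numjobs starts as 7/3=2 and the remainder of 1 goes
// to the lowest subworlds, so localid 0 creates jobs 0-2, localid 1 creates jobs
// 3-4 and localid 2 creates jobs 5-6; every pending job is created on exactly one
// subworld.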


namespace escript
{

boost::python::object raw_buildDomains(boost::python::tuple t, boost::python::dict kwargs)
{
    int l=len(t);
    if (l<2)
    {
        throw SplitWorldException("Insufficient parameters to buildDomains.");
    }
    extract<SplitWorld&> exw(t[0]);
    if (!exw.check())
    {
        throw SplitWorldException("First parameter to buildDomains must be a SplitWorld.");
    }
    SplitWorld& ws=exw();
    tuple ntup=tuple(t.slice(1,l));     // strip off the object param
    return ws.buildDomains(ntup, kwargs);
}

boost::python::object raw_addJob(boost::python::tuple t, boost::python::dict kwargs)
{
    int l=len(t);
    if (l<2)
    {
        throw SplitWorldException("Insufficient parameters to addJob.");
    }
    extract<SplitWorld&> exw(t[0]);
    if (!exw.check())
    {
        throw SplitWorldException("First parameter to addJob must be a SplitWorld.");
    }
    SplitWorld& ws=exw();
    object creator=t[1];
    tuple ntup=tuple(t.slice(2,l));     // strip off the object param
    ws.addJob(creator, ntup, kwargs);
    return object();
}


}
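
The two raw_* helpers above take the whole Python argument tuple and keyword dict so
that the SplitWorld, the factory callable and its arguments can all be passed in a
single call. Below is a minimal sketch of how such raw-argument functions are
typically exposed with boost::python::raw_function; the module and exported names
are illustrative, and the real escript bindings live in the Python wrapper sources
and may differ.

#include <boost/python.hpp>
#include <boost/python/raw_function.hpp>
#include "SplitWorld.h"

BOOST_PYTHON_MODULE(splitworld_sketch)      // hypothetical module name
{
    using namespace boost::python;
    // min_args=2 mirrors the l<2 checks above: the SplitWorld plus at least one
    // further argument must be supplied.
    def("buildDomains", raw_function(&escript::raw_buildDomains, 2));
    def("addJob", raw_function(&escript::raw_addJob, 2));
}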
