/[escript]/branches/split/escriptcore/src/SplitWorld.cpp


Revision 4745
Thu Mar 13 00:00:06 2014 UTC by jfenwick
File size: 9629 byte(s)
Suddenly segfaulting and I don't know why. Need to add the non-MPI checks so I can debug it. Also changed the params of the Job constructor so that our extra info is added via kwargs.
/*****************************************************************************
*
* Copyright (c) 2014 by University of Queensland
* http://www.uq.edu.au
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
* Development until 2012 by Earth Systems Science Computational Center (ESSCC)
* Development 2012-2013 by School of Earth Sciences
* Development from 2014 by Centre for Geoscience Computing (GeoComp)
*
*****************************************************************************/

#include "esysUtils/Esys_MPI.h"
#include "SplitWorld.h"
#include "AbstractDomain.h"
#include "SplitWorldException.h"

#include <iostream>

using namespace boost::python;
using namespace escript;

SplitWorld::SplitWorld(unsigned int numgroups, MPI_Comm global)
    :globalcom(global), subcom(MPI_COMM_NULL), localworld((SubWorld*)0), swcount(numgroups), jobcounter(1)
{
    int gsize;
    int grank;
    if ((MPI_Comm_size(global, &gsize)!=MPI_SUCCESS) || (MPI_Comm_rank(global, &grank)!=MPI_SUCCESS))
    {
        throw SplitWorldException("MPI appears to be inoperative.");
    }
    if (gsize%numgroups!=0)
    {
        throw SplitWorldException("SplitWorld error: requested number of groups is not a factor of global communicator size.");
    }
    int wsize=gsize/numgroups;    // each world has this many processes
    // split the supplied communicator, not MPI_COMM_WORLD, so that a caller
    // passing a custom global communicator is respected
    int res=MPI_Comm_split(global, grank/wsize, grank%wsize, &subcom);
    if (res!=MPI_SUCCESS)
    {
        throw SplitWorldException("SplitWorld error: Unable to form communicator.");
    }
    localworld=SubWorld_ptr(new SubWorld(subcom));
    localid=grank/wsize;
}
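
// Example of the split (illustrative): with gsize=8 ranks and numgroups=2,
// wsize=4; ranks 0-3 get colour grank/wsize==0 and ranks 4-7 get colour 1,
// so MPI_Comm_split produces two disjoint 4-process sub-communicators, each
// ranked internally by grank%wsize.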

// We may need to look into this more closely.
// What if the domain lives longer than the world splitter?
SplitWorld::~SplitWorld()
{
    if (subcom!=MPI_COMM_NULL)
    {
        MPI_Comm_free(&subcom);
    }
}
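
// Note on the question above (general MPI semantics, not specific to this
// class): MPI_Comm_free is collective and invalidates the handle, so a
// domain that outlives this SplitWorld while still using subcom would be
// left holding a dangling communicator.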

// The boost wrapper will ensure that there is at least one entry in the tuple
object SplitWorld::buildDomains(tuple t, dict kwargs)
{
    int tsize=len(t);
    // get the callable that we will invoke shortly
    object tocall=t[0];
    // make a new tuple without the first element
    tuple ntup=tuple(t.slice(1,tsize));
    // now add the subworld to the kwargs
    kwargs["escriptworld"]=localworld;

    // pass the whole package to the python call
    object dobj=tocall(*ntup, **kwargs);
    extract<Domain_ptr> ex1(dobj);
    Domain_ptr dptr=ex1();

    // now do a sanity check to see if the domain has respected the communicator info we passed it
    if (dptr->getMPIComm()!=localworld->getComm())
    {
        throw SplitWorldException("The newly constructed domain is not using the correct communicator.");
    }
    localworld->setDomain(dptr);
    return object();    // return None
}
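
// Expected Python-side usage (illustrative; 'Rectangle' stands in for any
// callable that accepts an 'escriptworld' keyword argument and returns a
// Domain built on that world's communicator):
//   sw = SplitWorld(2)
//   buildDomains(sw, Rectangle, 10, 10)
// raw_buildDomains (at the bottom of this file) strips the SplitWorld off
// the argument tuple and forwards the rest here.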
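// Status codes combined by the MPI_MAX reduction below (inferred from the
// checks in this function): 0 = all local jobs finished, 1 = at least one
// job wants another iteration, 2 = some work() returned a non-boolean,
// 3 = some work() raised an exception. Taking the maximum across worlds
// keeps everyone looping until all are done and lets any error dominate.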
// Executes all pending jobs on all subworlds
void SplitWorld::runJobs()
{
    distributeJobs();
    int mres=0;
    do
    {
        // now we actually need to run the jobs
        // everybody will be executing their localworld's jobs
        int res=localworld->runJobs();
        // now we find out about the other worlds
        mres=0;
        if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
        {
            throw SplitWorldException("MPI appears to have failed.");
        }
    } while (mres==1);
    if (mres==2)
    {
        throw SplitWorldException("At least one Job's work() function did not return True/False.");
    }
    else if (mres==3)
    {
        throw SplitWorldException("At least one Job's work() function raised an exception.");
    }
}


/** Earlier version, kept commented out for reference: took a list of
    tuples/sequences (Job class, number of instances). */
// void SplitWorld::runJobs(boost::python::list l)
// {
//     // first count up how many jobs we have in total
//     unsigned int numjobs=0;
//     std::vector<object> classvec;
//     std::vector<unsigned int> countvec;
//     std::vector<unsigned int> partialcounts;
//     for (int i=0;i<len(l);++i)
//     {
//         extract<tuple> ex(l[i]);
//         if (!ex.check())
//         {
//             throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
//         }
//         tuple t=ex();
//         if (len(t)!=2)
//         {
//             throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
//         }
//         extract<unsigned int> ex2(t[1]);
//         unsigned int c=0;
//         if (!ex2.check() || ((c=ex2())==0))
//         {
//             throw SplitWorldException("Number of jobs must be a strictly positive integer.");
//         }
//         classvec.push_back(t[0]);
//         countvec.push_back(c);
//         numjobs+=c;
//         partialcounts.push_back(numjobs);
//     }
//     unsigned int classnum=0;
//     unsigned int lowend=1;
//     unsigned int highend=lowend+numjobs/swcount+(numjobs%swcount);
//     for (int i=1;i<=localid;++i)
//     {
//         lowend=highend;
//         highend=lowend+numjobs/swcount;
//         if (i<numjobs%swcount)
//         {
//             highend++;
//         }
//     }
//     // We could do something more clever about trying to fit Jobs to subworlds
//     // to ensure that instances sharing the same Job class would share the same
//     // world as much as possible but for now we'll do this:
//     for (unsigned int j=1;j<=numjobs;++j)    // job #0 is a sentinel
//     {
//         if (j>partialcounts[classnum])
//         {
//             classnum++;    // we don't need to loop this because each count >0
//         }
//         // now if this is one of the job numbers in our local range,
//         // create an instance of the appropriate class
//         if (j>=lowend and j<highend)
//         {
//             object o=classvec[classnum](localworld->getDomain(), object(j));
//             localworld->addJob(o);
//         }
//     }
//     int mres=0;
//     do
//     {
//         // now we actually need to run the jobs
//         // everybody will be executing their localworld's jobs
//         int res=localworld->runJobs();
//         // now we find out about the other worlds
//         mres=0;
//         if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
//         {
//             throw SplitWorldException("MPI appears to have failed.");
//         }
//     } while (mres==1);
//     if (mres==2)
//     {
//         throw SplitWorldException("At least one Job's work() function did not return True/False.");
//     }
//     else if (mres==3)
//     {
//         throw SplitWorldException("At least one Job's work() function raised an exception.");
//     }
// }


void SplitWorld::registerCrate(escript::crate_ptr c)
{
    protocrates.push_back(c);
}

/**
   Stores the constructor/factory used to make Jobs, together with its parameters.
*/
void SplitWorld::addJob(boost::python::object creator, boost::python::tuple tup, boost::python::dict kw)
{
    create.push_back(creator);
    tupargs.push_back(tup);
    kwargs.push_back(kw);
}

void SplitWorld::clearPendingJobs()
{
    create.clear();
    tupargs.clear();
    kwargs.clear();
}

void SplitWorld::clearActiveJobs()
{
    localworld->clearJobs();
}

// All the job params are known on all the ranks.
void SplitWorld::distributeJobs()
{
    unsigned int numjobs=create.size()/swcount;
    unsigned int start=create.size()/swcount*localid;
    if (localid<create.size()%swcount)
    {
        numjobs++;
        start+=localid;
    }
    else
    {
        start+=create.size()%swcount;
    }
    short errstat=0;
    try
    {
        std::cerr << "Numjobs=" << numjobs << " start=" << start << std::endl;    // debug output
        // No other subworld will be looking at this portion of the array
        // so jobs will only be created on one subworld
        for (unsigned int i=start;i<start+numjobs;++i)
        {
            // we need to add some things to the kw map
            kwargs[i]["domain"]=localworld->getDomain();
            kwargs[i]["jobid"]=object(jobcounter+i);
            object job=create[i](*(tupargs[i]), **(kwargs[i]));
            localworld->addJob(job);
        }
    }
    catch (boost::python::error_already_set&)
    {
        errstat=1;
    }
    jobcounter+=create.size();
    clearPendingJobs();

    // MPI check to ensure that it worked for everybody
    short mstat=0;
    if (MPI_Allreduce(&errstat, &mstat, 1, MPI_SHORT, MPI_MAX, globalcom)!=MPI_SUCCESS)
    {
        throw SplitWorldException("MPI appears to have failed.");
    }
    errstat=mstat;

    if (errstat==1)
    {
        clearActiveJobs();    // clean up before raising
        throw SplitWorldException("distributeJobs: Job creation failed.");
    }
}
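
// Worked example of the partitioning above (illustrative): with 5 pending
// jobs and swcount=2, localid 0 gets numjobs=3, start=0 (jobs 0..2) and
// localid 1 gets numjobs=2, start=3 (jobs 3..4), so every pending job is
// instantiated on exactly one subworld.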

namespace escript
{

boost::python::object raw_buildDomains(boost::python::tuple t, boost::python::dict kwargs)
{
    int l=len(t);
    if (l<2)
    {
        throw SplitWorldException("Insufficient parameters to buildDomains.");
    }
    extract<SplitWorld&> exw(t[0]);
    if (!exw.check())
    {
        throw SplitWorldException("First parameter to buildDomains must be a SplitWorld.");
    }
    SplitWorld& ws=exw();
    tuple ntup=tuple(t.slice(1,l));    // strip off the object param
    return ws.buildDomains(ntup, kwargs);
}

boost::python::object raw_addJob(boost::python::tuple t, boost::python::dict kwargs)
{
    int l=len(t);
    if (l<2)
    {
        throw SplitWorldException("Insufficient parameters to addJob.");
    }
    extract<SplitWorld&> exw(t[0]);
    if (!exw.check())
    {
        throw SplitWorldException("First parameter to addJob must be a SplitWorld.");
    }
    SplitWorld& ws=exw();
    object creator=t[1];
    tuple ntup=tuple(t.slice(2,l));    // strip off the SplitWorld and the creator
    ws.addJob(creator, ntup, kwargs);
    return object();    // return None
}
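
// Expected Python-side usage (illustrative; 'MyJob' is a hypothetical Job
// factory): addJob(sw, MyJob, extra_arg, key=value). The SplitWorld and the
// creator are stripped from the tuple; the remaining positionals and all
// keyword arguments are stored until the next runJobs() call, which also
// injects 'domain' and 'jobid' keywords when the Job is created.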

} // namespace escript
