/[escript]/branches/split/escriptcore/src/SplitWorld.cpp
ViewVC logotype

Annotation of /branches/split/escriptcore/src/SplitWorld.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4747 - (hide annotations)
Thu Mar 13 22:52:45 2014 UTC (5 years, 5 months ago) by jfenwick
File size: 9836 byte(s)
switching begins
1 jfenwick 4730 /*****************************************************************************
2     *
3     * Copyright (c) 2014 by University of Queensland
4     * http://www.uq.edu.au
5     *
6     * Primary Business: Queensland, Australia
7     * Licensed under the Open Software License version 3.0
8     * http://www.opensource.org/licenses/osl-3.0.php
9     *
10     * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11     * Development 2012-2013 by School of Earth Sciences
12     * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13     *
14     *****************************************************************************/
15    
16     #include "esysUtils/Esys_MPI.h"
17 jfenwick 4734 #include "SplitWorld.h"
18 jfenwick 4730 #include "AbstractDomain.h"
19 jfenwick 4731 #include "SplitWorldException.h"
20 jfenwick 4734 #include "SplitWorldException.h"
21 jfenwick 4730
22 jfenwick 4731 #include <iostream>
23    
24 jfenwick 4730 using namespace boost::python;
25     using namespace escript;
26    
27 jfenwick 4734 SplitWorld::SplitWorld(unsigned int numgroups, MPI_Comm global)
28 jfenwick 4747 :localworld((SubWorld*)0), swcount(numgroups>0?numgroups:1), jobcounter(1)
29 jfenwick 4730 {
30 jfenwick 4747 globalcom=esysUtils::makeInfo(global);
31    
32 jfenwick 4746 int grank=0;
33     int wsize=1; // each world has this many processes
34     #ifdef ESYS_MPI
35 jfenwick 4747 int gsize=globalcom->size;
36     grank=globalcom->rank;
37 jfenwick 4746 if (gsize%swcount!=0)
38     {
39     throw SplitWorldException("SplitWorld error: requested number of groups is not a factor of global communicator size.");
40     }
41     wsize=gsize/swcount; // each world has this many processes
42 jfenwick 4747 MPI_Comm sub;
43     int res=MPI_Comm_split(MPI_COMM_WORLD, grank/wsize, grank%wsize, &sub);
44 jfenwick 4746 if (res!=MPI_SUCCESS)
45     {
46     throw SplitWorldException("SplitWorld error: Unable to form communicator.");
47     }
48 jfenwick 4747 subcom=esysUtils::makeInfo(sub,true);
49     #else
50     subcom=esysUtils::makeInfo(0);
51 jfenwick 4746 #endif
52 jfenwick 4730 localworld=SubWorld_ptr(new SubWorld(subcom));
53 jfenwick 4731 localid=grank/wsize;
54 jfenwick 4730 }
55    
56 jfenwick 4734 SplitWorld::~SplitWorld()
57 jfenwick 4730 {
58 jfenwick 4747 // communicator cleanup handled by the MPI_Info
59 jfenwick 4730 }
60    
61    
62     // The boost wrapper will ensure that there is at least one entry in the tuple
63 jfenwick 4734 object SplitWorld::buildDomains(tuple t, dict kwargs)
64 jfenwick 4730 {
65     int tsize=len(t);
66     // get the callable that we will invoke in a sec
67     object tocall=t[0];
68     // make a new tuple without the first element
69     tuple ntup=tuple(t.slice(1,tsize));
70     // now add the subworld to the kwargs
71     kwargs["escriptworld"]=localworld;
72    
73     // std::cerr << "About to call function with:\n";
74     // //extract<std::string> ex(ntup.attr("__str__")());
75     // std::cerr << extract<std::string>(ntup.attr("__str__")())() << std::endl;
76     // // for (int i=0;i<tsize-1;++i)
77     // // {
78     // // std::cout << extract<const char*>(ntup[i])() << " ";
79     // // }
80     // std::cerr << std::endl;
81    
82     // pass the whole package to the python call
83     object dobj=tocall(*ntup, **kwargs);
84     extract<Domain_ptr> ex1(dobj);
85     Domain_ptr dptr=ex1();
86    
87     // now do a sanity check to see if the domain has respected the communicator info we passed it.
88 jfenwick 4747 if (dptr->getMPIComm()!=localworld->getMPI()->comm)
89 jfenwick 4730 {
90 jfenwick 4734 throw SplitWorldException("The newly constructed domain is not using the correct communicator.");
91 jfenwick 4730 }
92     localworld->setDomain(dptr);
93     return object(); // return None
94     }
95    
96 jfenwick 4745
97     // Executes all pending jobs on all subworlds
98     void SplitWorld::runJobs()
99 jfenwick 4731 {
100 jfenwick 4745 distributeJobs();
101 jfenwick 4734 int mres=0;
102 jfenwick 4746 std::string err;
103 jfenwick 4734 do
104     {
105     // now we actually need to run the jobs
106     // everybody will be executing their localworld's jobs
107 jfenwick 4746 int res=localworld->runJobs(err);
108 jfenwick 4734 // now we find out about the other worlds
109 jfenwick 4747 if (!esysUtils::checkResult(res, mres, globalcom->comm))
110 jfenwick 4734 {
111     throw SplitWorldException("MPI appears to have failed.");
112     }
113     } while (mres==1);
114     if (mres==2)
115     {
116     throw SplitWorldException("At least one Job's work() function did not return True/False.");
117     }
118     else if (mres==3)
119     {
120 jfenwick 4746 char* resultstr=0;
121     // now we ship around the error message
122 jfenwick 4747 if (!esysUtils::shipString(err.c_str(), &resultstr, globalcom->comm))
123 jfenwick 4746 {
124     throw SplitWorldException("MPI appears to have failed.");
125     }
126     //throw SplitWorldException("At least one Job's work() function raised an exception.");
127     std::string s("At least one Job's work() function raised the following exception:\n");
128     s+=resultstr;
129     throw SplitWorldException(s);
130 jfenwick 4734 }
131 jfenwick 4731 }
132    
133 jfenwick 4736
134 jfenwick 4745 /** a list of tuples/sequences: (Job class, number of instances)*/
135     // void SplitWorld::runJobs(boost::python::list l)
136     // {
137     // // first count up how many jobs we have in total
138     // unsigned int numjobs=0;
139     // std::vector<object> classvec;
140     // std::vector<unsigned int> countvec;
141     // std::vector<unsigned int> partialcounts;
142     // for (int i=0;i<len(l);++i)
143     // {
144     // extract<tuple> ex(l[i]);
145     // if (!ex.check())
146     // {
147     // throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
148     // }
149     // tuple t=ex();
150     // if (len(t)!=2)
151     // {
152     // throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
153     // }
154     // extract<unsigned int> ex2(t[1]);
155     // unsigned int c=0;
156     // if (!ex2.check() || ((c=ex2())==0))
157     // {
158     // throw SplitWorldException("Number of jobs must be a strictly positive integer.");
159     //
160     // }
161     // classvec.push_back(t[0]);
162     // countvec.push_back(c);
163     // numjobs+=c;
164     // partialcounts.push_back(numjobs);
165     // }
166     // unsigned int classnum=0;
167     // unsigned int lowend=1;
168     // unsigned int highend=lowend+numjobs/swcount+(numjobs%swcount);
169     // // std::cout << localid << std::endl;
170     // for (int i=1;i<=localid;++i)
171     // {
172     // lowend=highend;
173     // highend=lowend+numjobs/swcount;
174     // if (i<numjobs%swcount)
175     // {
176     // highend++;
177     // }
178     // }
179     // // std::cout << "There are " << numjobs << " jobs with range [" << lowend << ", " << highend << ")\n";
180     // // We could do something more clever about trying to fit Jobs to subworlds
181     // // to ensure that instances sharing the same Job class would share the same
182     // // world as much as possible but for now we'll do this:
183     // for (unsigned int j=1;j<=numjobs;++j) // job #0 is a sentinel
184     // {
185     // if (j>partialcounts[classnum])
186     // {
187     // classnum++; // we don't need to loop this because each count >0
188     // }
189     // // now if this is one of the job numbers in our local range,
190     // // create an instance of the appropriate class
191     // if (j>=lowend and j<highend)
192     // {
193     // object o=classvec[classnum](localworld->getDomain(), object(j));
194     // localworld->addJob(o);
195     // }
196     // }
197     // int mres=0;
198     // do
199     // {
200     // // now we actually need to run the jobs
201     // // everybody will be executing their localworld's jobs
202     // int res=localworld->runJobs();
203     // // now we find out about the other worlds
204     // mres=0;
205     // if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
206     // {
207     // throw SplitWorldException("MPI appears to have failed.");
208     // }
209     // } while (mres==1);
210     // if (mres==2)
211     // {
212     // throw SplitWorldException("At least one Job's work() function did not return True/False.");
213     // }
214     // else if (mres==3)
215     // {
216     // throw SplitWorldException("At least one Job's work() function raised an exception.");
217     // }
218     // }
219    
220    
221 jfenwick 4736 void SplitWorld::registerCrate(escript::crate_ptr c)
222     {
223     protocrates.push_back(c);
224     }
225    
226 jfenwick 4745 /**
227     stores the constructor/factory to make Jobs and the parameters.
228     */
229     void SplitWorld::addJob(boost::python::object creator, boost::python::tuple tup, boost::python::dict kw)
230     {
231     create.push_back(creator);
232     tupargs.push_back(tup);
233     kwargs.push_back(kw);
234     }
235    
236     void SplitWorld::clearPendingJobs()
237     {
238     create.clear();
239     tupargs.clear();
240     kwargs.clear();
241     }
242    
243     void SplitWorld::clearActiveJobs()
244     {
245     localworld->clearJobs();
246     }
247    
248     // All the job params are known on all the ranks.
249     void SplitWorld::distributeJobs()
250     {
251     unsigned int numjobs=create.size()/swcount;
252     unsigned int start=create.size()/swcount*localid;
253     if (localid<create.size()%swcount)
254     {
255     numjobs++;
256     start+=localid;
257     }
258     else
259     {
260     start+=create.size()%swcount;
261     }
262 jfenwick 4746 int errstat=0;
263 jfenwick 4745 try
264     {
265     std::cerr << "Numjobs=" << numjobs << " start=" << start << std::endl;
266     // No other subworld will be looking at this portion of the array
267     // so jobs will only be created on one subworld
268     for (unsigned int i=start;i<start+numjobs;++i)
269     {
270     // we need to add some things to the kw map
271     kwargs[i]["domain"]=localworld->getDomain();
272     kwargs[i]["jobid"]=object(jobcounter+i);
273     object job=create[i](*(tupargs[i]), **(kwargs[i]));
274     localworld->addJob(job);
275     }
276     }
277     catch (boost::python::error_already_set e)
278     {
279     errstat=1;
280     }
281     jobcounter+=create.size();
282     clearPendingJobs();
283    
284     // MPI check to ensure that it worked for everybody
285 jfenwick 4746 int mstat=0;
286 jfenwick 4747 if (!esysUtils::checkResult(errstat, mstat, globalcom->comm))
287 jfenwick 4745 {
288     throw SplitWorldException("MPI appears to have failed.");
289     }
290    
291     if (errstat==1)
292     {
293     throw SplitWorldException("distributeJobs: Job creation failed.");
294     clearActiveJobs();
295     }
296     }
297    
298    
299 jfenwick 4730 namespace escript
300     {
301    
302     boost::python::object raw_buildDomains(boost::python::tuple t, boost::python::dict kwargs)
303     {
304     int l=len(t);
305     if (l<2)
306     {
307 jfenwick 4734 throw SplitWorldException("Insufficient parameters to buildDomains.");
308 jfenwick 4730 }
309 jfenwick 4734 extract<SplitWorld&> exw(t[0]);
310 jfenwick 4730 if (!exw.check())
311     {
312 jfenwick 4734 throw SplitWorldException("First parameter to buildDomains must be a SplitWorld.");
313 jfenwick 4730 }
314 jfenwick 4734 SplitWorld& ws=exw();
315 jfenwick 4730 tuple ntup=tuple(t.slice(1,l)); // strip off the object param
316     return ws.buildDomains(ntup, kwargs);
317     }
318    
319 jfenwick 4745 boost::python::object raw_addJob(boost::python::tuple t, boost::python::dict kwargs)
320     {
321     int l=len(t);
322     if (l<2)
323     {
324     throw SplitWorldException("Insufficient parameters to addJob.");
325     }
326     extract<SplitWorld&> exw(t[0]);
327     if (!exw.check())
328     {
329     throw SplitWorldException("First parameter to addJob must be a SplitWorld.");
330     }
331     SplitWorld& ws=exw();
332     object creator=t[1];
333     tuple ntup=tuple(t.slice(2,l)); // strip off the object param
334     ws.addJob(creator, ntup, kwargs);
335     return object();
336 jfenwick 4730 }
337 jfenwick 4745
338    
339     }

  ViewVC Help
Powered by ViewVC 1.1.26