/[escript]/branches/split/escriptcore/src/SplitWorld.cpp
ViewVC logotype

Contents of /branches/split/escriptcore/src/SplitWorld.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4747 - (show annotations)
Thu Mar 13 22:52:45 2014 UTC (5 years, 5 months ago) by jfenwick
File size: 9836 byte(s)
switching begins
1 /*****************************************************************************
2 *
3 * Copyright (c) 2014 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development 2012-2013 by School of Earth Sciences
12 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 *
14 *****************************************************************************/
15
16 #include "esysUtils/Esys_MPI.h"
17 #include "SplitWorld.h"
18 #include "AbstractDomain.h"
19 #include "SplitWorldException.h"
20 #include "SplitWorldException.h"
21
22 #include <iostream>
23
24 using namespace boost::python;
25 using namespace escript;
26
27 SplitWorld::SplitWorld(unsigned int numgroups, MPI_Comm global)
28 :localworld((SubWorld*)0), swcount(numgroups>0?numgroups:1), jobcounter(1)
29 {
30 globalcom=esysUtils::makeInfo(global);
31
32 int grank=0;
33 int wsize=1; // each world has this many processes
34 #ifdef ESYS_MPI
35 int gsize=globalcom->size;
36 grank=globalcom->rank;
37 if (gsize%swcount!=0)
38 {
39 throw SplitWorldException("SplitWorld error: requested number of groups is not a factor of global communicator size.");
40 }
41 wsize=gsize/swcount; // each world has this many processes
42 MPI_Comm sub;
43 int res=MPI_Comm_split(MPI_COMM_WORLD, grank/wsize, grank%wsize, &sub);
44 if (res!=MPI_SUCCESS)
45 {
46 throw SplitWorldException("SplitWorld error: Unable to form communicator.");
47 }
48 subcom=esysUtils::makeInfo(sub,true);
49 #else
50 subcom=esysUtils::makeInfo(0);
51 #endif
52 localworld=SubWorld_ptr(new SubWorld(subcom));
53 localid=grank/wsize;
54 }
55
56 SplitWorld::~SplitWorld()
57 {
58 // communicator cleanup handled by the MPI_Info
59 }
60
61
62 // The boost wrapper will ensure that there is at least one entry in the tuple
63 object SplitWorld::buildDomains(tuple t, dict kwargs)
64 {
65 int tsize=len(t);
66 // get the callable that we will invoke in a sec
67 object tocall=t[0];
68 // make a new tuple without the first element
69 tuple ntup=tuple(t.slice(1,tsize));
70 // now add the subworld to the kwargs
71 kwargs["escriptworld"]=localworld;
72
73 // std::cerr << "About to call function with:\n";
74 // //extract<std::string> ex(ntup.attr("__str__")());
75 // std::cerr << extract<std::string>(ntup.attr("__str__")())() << std::endl;
76 // // for (int i=0;i<tsize-1;++i)
77 // // {
78 // // std::cout << extract<const char*>(ntup[i])() << " ";
79 // // }
80 // std::cerr << std::endl;
81
82 // pass the whole package to the python call
83 object dobj=tocall(*ntup, **kwargs);
84 extract<Domain_ptr> ex1(dobj);
85 Domain_ptr dptr=ex1();
86
87 // now do a sanity check to see if the domain has respected the communicator info we passed it.
88 if (dptr->getMPIComm()!=localworld->getMPI()->comm)
89 {
90 throw SplitWorldException("The newly constructed domain is not using the correct communicator.");
91 }
92 localworld->setDomain(dptr);
93 return object(); // return None
94 }
95
96
97 // Executes all pending jobs on all subworlds
98 void SplitWorld::runJobs()
99 {
100 distributeJobs();
101 int mres=0;
102 std::string err;
103 do
104 {
105 // now we actually need to run the jobs
106 // everybody will be executing their localworld's jobs
107 int res=localworld->runJobs(err);
108 // now we find out about the other worlds
109 if (!esysUtils::checkResult(res, mres, globalcom->comm))
110 {
111 throw SplitWorldException("MPI appears to have failed.");
112 }
113 } while (mres==1);
114 if (mres==2)
115 {
116 throw SplitWorldException("At least one Job's work() function did not return True/False.");
117 }
118 else if (mres==3)
119 {
120 char* resultstr=0;
121 // now we ship around the error message
122 if (!esysUtils::shipString(err.c_str(), &resultstr, globalcom->comm))
123 {
124 throw SplitWorldException("MPI appears to have failed.");
125 }
126 //throw SplitWorldException("At least one Job's work() function raised an exception.");
127 std::string s("At least one Job's work() function raised the following exception:\n");
128 s+=resultstr;
129 throw SplitWorldException(s);
130 }
131 }
132
133
134 /** a list of tuples/sequences: (Job class, number of instances)*/
135 // void SplitWorld::runJobs(boost::python::list l)
136 // {
137 // // first count up how many jobs we have in total
138 // unsigned int numjobs=0;
139 // std::vector<object> classvec;
140 // std::vector<unsigned int> countvec;
141 // std::vector<unsigned int> partialcounts;
142 // for (int i=0;i<len(l);++i)
143 // {
144 // extract<tuple> ex(l[i]);
145 // if (!ex.check())
146 // {
147 // throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
148 // }
149 // tuple t=ex();
150 // if (len(t)!=2)
151 // {
152 // throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
153 // }
154 // extract<unsigned int> ex2(t[1]);
155 // unsigned int c=0;
156 // if (!ex2.check() || ((c=ex2())==0))
157 // {
158 // throw SplitWorldException("Number of jobs must be a strictly positive integer.");
159 //
160 // }
161 // classvec.push_back(t[0]);
162 // countvec.push_back(c);
163 // numjobs+=c;
164 // partialcounts.push_back(numjobs);
165 // }
166 // unsigned int classnum=0;
167 // unsigned int lowend=1;
168 // unsigned int highend=lowend+numjobs/swcount+(numjobs%swcount);
169 // // std::cout << localid << std::endl;
170 // for (int i=1;i<=localid;++i)
171 // {
172 // lowend=highend;
173 // highend=lowend+numjobs/swcount;
174 // if (i<numjobs%swcount)
175 // {
176 // highend++;
177 // }
178 // }
179 // // std::cout << "There are " << numjobs << " jobs with range [" << lowend << ", " << highend << ")\n";
180 // // We could do something more clever about trying to fit Jobs to subworlds
181 // // to ensure that instances sharing the same Job class would share the same
182 // // world as much as possible but for now we'll do this:
183 // for (unsigned int j=1;j<=numjobs;++j) // job #0 is a sentinel
184 // {
185 // if (j>partialcounts[classnum])
186 // {
187 // // classnum++; // we don't need to loop this because each count >0
188 // }
189 // // now if this is one of the job numbers in our local range,
190 // // create an instance of the appropriate class
191 // if (j>=lowend and j<highend)
192 // {
193 // object o=classvec[classnum](localworld->getDomain(), object(j));
194 // localworld->addJob(o);
195 // }
196 // }
197 // int mres=0;
198 // do
199 // {
200 // // now we actually need to run the jobs
201 // // everybody will be executing their localworld's jobs
202 // int res=localworld->runJobs();
203 // // now we find out about the other worlds
204 // mres=0;
205 // if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
206 // {
207 // throw SplitWorldException("MPI appears to have failed.");
208 // }
209 // } while (mres==1);
210 // if (mres==2)
211 // {
212 // throw SplitWorldException("At least one Job's work() function did not return True/False.");
213 // }
214 // else if (mres==3)
215 // {
216 // throw SplitWorldException("At least one Job's work() function raised an exception.");
217 // }
218 // }
219
220
221 void SplitWorld::registerCrate(escript::crate_ptr c)
222 {
223 protocrates.push_back(c);
224 }
225
226 /**
227 stores the constructor/factory to make Jobs and the parameters.
228 */
229 void SplitWorld::addJob(boost::python::object creator, boost::python::tuple tup, boost::python::dict kw)
230 {
231 create.push_back(creator);
232 tupargs.push_back(tup);
233 kwargs.push_back(kw);
234 }
235
236 void SplitWorld::clearPendingJobs()
237 {
238 create.clear();
239 tupargs.clear();
240 kwargs.clear();
241 }
242
243 void SplitWorld::clearActiveJobs()
244 {
245 localworld->clearJobs();
246 }
247
248 // All the job params are known on all the ranks.
249 void SplitWorld::distributeJobs()
250 {
251 unsigned int numjobs=create.size()/swcount;
252 unsigned int start=create.size()/swcount*localid;
253 if (localid<create.size()%swcount)
254 {
255 numjobs++;
256 start+=localid;
257 }
258 else
259 {
260 start+=create.size()%swcount;
261 }
262 int errstat=0;
263 try
264 {
265 std::cerr << "Numjobs=" << numjobs << " start=" << start << std::endl;
266 // No other subworld will be looking at this portion of the array
267 // so jobs will only be created on one subworld
268 for (unsigned int i=start;i<start+numjobs;++i)
269 {
270 // we need to add some things to the kw map
271 kwargs[i]["domain"]=localworld->getDomain();
272 kwargs[i]["jobid"]=object(jobcounter+i);
273 object job=create[i](*(tupargs[i]), **(kwargs[i]));
274 localworld->addJob(job);
275 }
276 }
277 catch (boost::python::error_already_set e)
278 {
279 errstat=1;
280 }
281 jobcounter+=create.size();
282 clearPendingJobs();
283
284 // MPI check to ensure that it worked for everybody
285 int mstat=0;
286 if (!esysUtils::checkResult(errstat, mstat, globalcom->comm))
287 {
288 throw SplitWorldException("MPI appears to have failed.");
289 }
290
291 if (errstat==1)
292 {
293 throw SplitWorldException("distributeJobs: Job creation failed.");
294 clearActiveJobs();
295 }
296 }
297
298
299 namespace escript
300 {
301
302 boost::python::object raw_buildDomains(boost::python::tuple t, boost::python::dict kwargs)
303 {
304 int l=len(t);
305 if (l<2)
306 {
307 throw SplitWorldException("Insufficient parameters to buildDomains.");
308 }
309 extract<SplitWorld&> exw(t[0]);
310 if (!exw.check())
311 {
312 throw SplitWorldException("First parameter to buildDomains must be a SplitWorld.");
313 }
314 SplitWorld& ws=exw();
315 tuple ntup=tuple(t.slice(1,l)); // strip off the object param
316 return ws.buildDomains(ntup, kwargs);
317 }
318
319 boost::python::object raw_addJob(boost::python::tuple t, boost::python::dict kwargs)
320 {
321 int l=len(t);
322 if (l<2)
323 {
324 throw SplitWorldException("Insufficient parameters to addJob.");
325 }
326 extract<SplitWorld&> exw(t[0]);
327 if (!exw.check())
328 {
329 throw SplitWorldException("First parameter to addJob must be a SplitWorld.");
330 }
331 SplitWorld& ws=exw();
332 object creator=t[1];
333 tuple ntup=tuple(t.slice(2,l)); // strip off the object param
334 ws.addJob(creator, ntup, kwargs);
335 return object();
336 }
337
338
339 }

  ViewVC Help
Powered by ViewVC 1.1.26