/[escript]/branches/split/escriptcore/src/SplitWorld.cpp
ViewVC logotype

Contents of /branches/split/escriptcore/src/SplitWorld.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4745 - (show annotations)
Thu Mar 13 00:00:06 2014 UTC (5 years, 5 months ago) by jfenwick
File size: 9629 byte(s)
Suddenly segfaulting and I don't know why. Need to add the non-mpi checks so I can debug it.   Also changed the params of the Job constructor so that our extra info is added via kwargs
1 /*****************************************************************************
2 *
3 * Copyright (c) 2014 by University of Queensland
4 * http://www.uq.edu.au
5 *
6 * Primary Business: Queensland, Australia
7 * Licensed under the Open Software License version 3.0
8 * http://www.opensource.org/licenses/osl-3.0.php
9 *
10 * Development until 2012 by Earth Systems Science Computational Center (ESSCC)
11 * Development 2012-2013 by School of Earth Sciences
12 * Development from 2014 by Centre for Geoscience Computing (GeoComp)
13 *
14 *****************************************************************************/
15
16 #include "esysUtils/Esys_MPI.h"
17 #include "SplitWorld.h"
18 #include "AbstractDomain.h"
19 #include "SplitWorldException.h"
20 #include "SplitWorldException.h"
21
22 #include <iostream>
23
24 using namespace boost::python;
25 using namespace escript;
26
27 SplitWorld::SplitWorld(unsigned int numgroups, MPI_Comm global)
28 :globalcom(global), subcom(MPI_COMM_NULL), localworld((SubWorld*)0), swcount(numgroups), jobcounter(1)
29 {
30 int gsize;
31 int grank;
32 if ((MPI_Comm_size(global, &gsize)!=MPI_SUCCESS) || (MPI_Comm_rank(global, &grank)!=MPI_SUCCESS))
33 {
34 throw SplitWorldException("MPI appears to be inoperative.");
35 }
36 if (gsize%numgroups!=0)
37 {
38 throw SplitWorldException("SplitWorld error: requested number of groups is not a factor of global communicator size.");
39 }
40 int wsize=gsize/numgroups; // each world has this many processes
41 int res=MPI_Comm_split(MPI_COMM_WORLD, grank/wsize, grank%wsize, &subcom);
42 if (res!=MPI_SUCCESS)
43 {
44 throw SplitWorldException("SplitWorld error: Unable to form communicator.");
45 }
46 localworld=SubWorld_ptr(new SubWorld(subcom));
47 localid=grank/wsize;
48 }
49
50 // We may need to look into this more closely.
51 // What if the domain lives longer than the world splitter?
52 SplitWorld::~SplitWorld()
53 {
54 if (subcom!=MPI_COMM_NULL)
55 {
56 MPI_Comm_free(&subcom);
57 }
58 }
59
60
61 // The boost wrapper will ensure that there is at least one entry in the tuple
62 object SplitWorld::buildDomains(tuple t, dict kwargs)
63 {
64 int tsize=len(t);
65 // get the callable that we will invoke in a sec
66 object tocall=t[0];
67 // make a new tuple without the first element
68 tuple ntup=tuple(t.slice(1,tsize));
69 // now add the subworld to the kwargs
70 kwargs["escriptworld"]=localworld;
71
72 // std::cerr << "About to call function with:\n";
73 // //extract<std::string> ex(ntup.attr("__str__")());
74 // std::cerr << extract<std::string>(ntup.attr("__str__")())() << std::endl;
75 // // for (int i=0;i<tsize-1;++i)
76 // // {
77 // // std::cout << extract<const char*>(ntup[i])() << " ";
78 // // }
79 // std::cerr << std::endl;
80
81 // pass the whole package to the python call
82 object dobj=tocall(*ntup, **kwargs);
83 extract<Domain_ptr> ex1(dobj);
84 Domain_ptr dptr=ex1();
85
86 // now do a sanity check to see if the domain has respected the communicator info we passed it.
87 if (dptr->getMPIComm()!=localworld->getComm())
88 {
89 throw SplitWorldException("The newly constructed domain is not using the correct communicator.");
90 }
91 localworld->setDomain(dptr);
92 return object(); // return None
93 }
94
95
96 // Executes all pending jobs on all subworlds
97 void SplitWorld::runJobs()
98 {
99 distributeJobs();
100 int mres=0;
101 do
102 {
103 // now we actually need to run the jobs
104 // everybody will be executing their localworld's jobs
105 int res=localworld->runJobs();
106 // now we find out about the other worlds
107 mres=0;
108 if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
109 {
110 throw SplitWorldException("MPI appears to have failed.");
111 }
112 } while (mres==1);
113 if (mres==2)
114 {
115 throw SplitWorldException("At least one Job's work() function did not return True/False.");
116 }
117 else if (mres==3)
118 {
119 throw SplitWorldException("At least one Job's work() function raised an exception.");
120 }
121 }
122
123
124 /** a list of tuples/sequences: (Job class, number of instances)*/
125 // void SplitWorld::runJobs(boost::python::list l)
126 // {
127 // // first count up how many jobs we have in total
128 // unsigned int numjobs=0;
129 // std::vector<object> classvec;
130 // std::vector<unsigned int> countvec;
131 // std::vector<unsigned int> partialcounts;
132 // for (int i=0;i<len(l);++i)
133 // {
134 // extract<tuple> ex(l[i]);
135 // if (!ex.check())
136 // {
137 // throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
138 // }
139 // tuple t=ex();
140 // if (len(t)!=2)
141 // {
142 // throw SplitWorldException("runJobs takes a list of tuples (jobclass, number).");
143 // }
144 // extract<unsigned int> ex2(t[1]);
145 // unsigned int c=0;
146 // if (!ex2.check() || ((c=ex2())==0))
147 // {
148 // throw SplitWorldException("Number of jobs must be a strictly positive integer.");
149 //
150 // }
151 // classvec.push_back(t[0]);
152 // countvec.push_back(c);
153 // numjobs+=c;
154 // partialcounts.push_back(numjobs);
155 // }
156 // unsigned int classnum=0;
157 // unsigned int lowend=1;
158 // unsigned int highend=lowend+numjobs/swcount+(numjobs%swcount);
159 // // std::cout << localid << std::endl;
160 // for (int i=1;i<=localid;++i)
161 // {
162 // lowend=highend;
163 // highend=lowend+numjobs/swcount;
164 // if (i<numjobs%swcount)
165 // {
166 // highend++;
167 // }
168 // }
169 // // std::cout << "There are " << numjobs << " jobs with range [" << lowend << ", " << highend << ")\n";
170 // // We could do something more clever about trying to fit Jobs to subworlds
171 // // to ensure that instances sharing the same Job class would share the same
172 // // world as much as possible but for now we'll do this:
173 // for (unsigned int j=1;j<=numjobs;++j) // job #0 is a sentinel
174 // {
175 // if (j>partialcounts[classnum])
176 // {
177 // classnum++; // we dont' need to loop this because each count >0
178 // }
179 // // now if this is one of the job numbers in our local range,
180 // // create an instance of the appropriate class
181 // if (j>=lowend and j<highend)
182 // {
183 // object o=classvec[classnum](localworld->getDomain(), object(j));
184 // localworld->addJob(o);
185 // }
186 // }
187 // int mres=0;
188 // do
189 // {
190 // // now we actually need to run the jobs
191 // // everybody will be executing their localworld's jobs
192 // int res=localworld->runJobs();
193 // // now we find out about the other worlds
194 // mres=0;
195 // if (MPI_Allreduce(&res, &mres, 1, MPI_INT, MPI_MAX, globalcom)!=MPI_SUCCESS)
196 // {
197 // throw SplitWorldException("MPI appears to have failed.");
198 // }
199 // } while (mres==1);
200 // if (mres==2)
201 // {
202 // throw SplitWorldException("At least one Job's work() function did not return True/False.");
203 // }
204 // else if (mres==3)
205 // {
206 // throw SplitWorldException("At least one Job's work() function raised an exception.");
207 // }
208 // }
209
210
211 void SplitWorld::registerCrate(escript::crate_ptr c)
212 {
213 protocrates.push_back(c);
214 }
215
216 /**
217 stores the constructor/factory to make Jobs and the parameters.
218 */
219 void SplitWorld::addJob(boost::python::object creator, boost::python::tuple tup, boost::python::dict kw)
220 {
221 create.push_back(creator);
222 tupargs.push_back(tup);
223 kwargs.push_back(kw);
224 }
225
226 void SplitWorld::clearPendingJobs()
227 {
228 create.clear();
229 tupargs.clear();
230 kwargs.clear();
231 }
232
233 void SplitWorld::clearActiveJobs()
234 {
235 localworld->clearJobs();
236 }
237
238 // All the job params are known on all the ranks.
239 void SplitWorld::distributeJobs()
240 {
241 unsigned int numjobs=create.size()/swcount;
242 unsigned int start=create.size()/swcount*localid;
243 if (localid<create.size()%swcount)
244 {
245 numjobs++;
246 start+=localid;
247 }
248 else
249 {
250 start+=create.size()%swcount;
251 }
252 short errstat=0;
253 try
254 {
255 std::cerr << "Numjobs=" << numjobs << " start=" << start << std::endl;
256 // No other subworld will be looking at this portion of the array
257 // so jobs will only be created on one subworld
258 for (unsigned int i=start;i<start+numjobs;++i)
259 {
260 // we need to add some things to the kw map
261 kwargs[i]["domain"]=localworld->getDomain();
262 kwargs[i]["jobid"]=object(jobcounter+i);
263 object job=create[i](*(tupargs[i]), **(kwargs[i]));
264 localworld->addJob(job);
265 }
266 }
267 catch (boost::python::error_already_set e)
268 {
269 errstat=1;
270 }
271 jobcounter+=create.size();
272 clearPendingJobs();
273
274 // MPI check to ensure that it worked for everybody
275 short mstat=0;
276 if (MPI_Allreduce(&errstat, &mstat, 1, MPI_SHORT, MPI_MAX, globalcom)!=MPI_SUCCESS)
277 {
278 throw SplitWorldException("MPI appears to have failed.");
279 }
280 errstat=mstat;
281
282 if (errstat==1)
283 {
284 throw SplitWorldException("distributeJobs: Job creation failed.");
285 clearActiveJobs();
286 }
287 }
288
289
290 namespace escript
291 {
292
293 boost::python::object raw_buildDomains(boost::python::tuple t, boost::python::dict kwargs)
294 {
295 int l=len(t);
296 if (l<2)
297 {
298 throw SplitWorldException("Insufficient parameters to buildDomains.");
299 }
300 extract<SplitWorld&> exw(t[0]);
301 if (!exw.check())
302 {
303 throw SplitWorldException("First parameter to buildDomains must be a SplitWorld.");
304 }
305 SplitWorld& ws=exw();
306 tuple ntup=tuple(t.slice(1,l)); // strip off the object param
307 return ws.buildDomains(ntup, kwargs);
308 }
309
310 boost::python::object raw_addJob(boost::python::tuple t, boost::python::dict kwargs)
311 {
312 int l=len(t);
313 if (l<2)
314 {
315 throw SplitWorldException("Insufficient parameters to addJob.");
316 }
317 extract<SplitWorld&> exw(t[0]);
318 if (!exw.check())
319 {
320 throw SplitWorldException("First parameter to addJob must be a SplitWorld.");
321 }
322 SplitWorld& ws=exw();
323 object creator=t[1];
324 tuple ntup=tuple(t.slice(2,l)); // strip off the object param
325 ws.addJob(creator, ntup, kwargs);
326 return object();
327 }
328
329
330 }

  ViewVC Help
Powered by ViewVC 1.1.26