/[escript]/trunk/dudley/src/Mesh_optimizeDOFDistribution.c


Revision 3231
Fri Oct 1 01:53:46 2010 UTC by jfenwick
Original Path: branches/domexper/dudley/src/Mesh_optimizeDOFDistribution.c
File MIME type: text/plain
File size: 9765 byte(s)
Fix MPI and OMP problems not detected in serial

/*******************************************************
*
* Copyright (c) 2003-2010 by University of Queensland
* Earth Systems Science Computational Center (ESSCC)
* http://www.uq.edu.au/esscc
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
*******************************************************/

/**************************************************************/

/* Dudley: Mesh: optimizes the distribution of DOFs across processors */
/* using ParMETIS. On return a new distribution is given and the globalDOFs are relabelled */
/* accordingly, but the mesh has not been redistributed yet */

/**************************************************************/

#include "Mesh.h"
#include "IndexList.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_PARMETIS
#include "parmetis.h"
#endif

/**************************************************************
Check whether there is any node which has no vertex. If such a
node exists, we do not use ParMETIS, since ParMETIS requires that
every node has at least 1 vertex (at line 129 of file "xyzpart.c"
in ParMETIS 3.1.1, the variable "nvtxs" would be 0 if any node
has no vertex).
**************************************************************/
#ifdef USE_PARMETIS
int Check_Inputs_For_Parmetis(dim_t mpiSize, dim_t rank, dim_t * distribution, MPI_Comm * comm)
{
    dim_t i, len;
    int ret_val = 1;

    if (rank == 0)
    {
        for (i = 0; i < mpiSize; i++)
        {
            len = distribution[i + 1] - distribution[i];
            if (len == 0)
            {
                ret_val = 0;
                break;
            }
        }
    }
    MPI_Bcast(&ret_val, 1, MPI_INT, 0, *comm);  /* MPI_INT matches the C type of ret_val */
    if (ret_val == 0)
        printf("INFO: Parmetis is not used since some nodes have no vertex!\n");
    return ret_val;
}
#endif

/**************************************************************/

void Dudley_Mesh_optimizeDOFDistribution(Dudley_Mesh * in, dim_t * distribution)
{
    dim_t dim, i, j, k, myNumVertices, p, mpiSize, len, globalNumVertices;
    dim_t *partition_count = NULL, *new_distribution = NULL, *loc_partition_count = NULL;
    bool_t *setNewDOFId = NULL;
    index_t myFirstVertex, myLastVertex, firstVertex, lastVertex, *newGlobalDOFID = NULL;
    size_t mpiSize_size;
    index_t *partition = NULL;
    Paso_Pattern *pattern = NULL;
    Esys_MPI_rank myRank, dest, source, current_rank, rank;
    Dudley_IndexList *index_list = NULL;
    float *xyz = NULL;
    int c;

#ifdef ESYS_MPI
    MPI_Status status;
#endif

    if (in == NULL)
        return;
    if (in->Nodes == NULL)
        return;

    myRank = in->MPIInfo->rank;
    mpiSize = in->MPIInfo->size;
    mpiSize_size = mpiSize * sizeof(dim_t);
    dim = in->Nodes->numDim;
    /* first step is to distribute the elements according to a global X of DOF */

    myFirstVertex = distribution[myRank];
    myLastVertex = distribution[myRank + 1];
    myNumVertices = myLastVertex - myFirstVertex;
    globalNumVertices = distribution[mpiSize];
    len = 0;
    for (p = 0; p < mpiSize; ++p)
        len = MAX(len, distribution[p + 1] - distribution[p]);
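    /* work arrays: partition and newGlobalDOFID are allocated with len entries, the largest
       vertex block held by any rank; newGlobalDOFID is later passed around the processor ring,
       so it must be able to hold any rank's block; setNewDOFId marks the local nodes that
       still need to be relabelled */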
    partition = TMPMEMALLOC(len, index_t);  /* len is used for the sending around of partition later on */
    xyz = TMPMEMALLOC(myNumVertices * dim, float);
    partition_count = TMPMEMALLOC(mpiSize + 1, dim_t);
    new_distribution = TMPMEMALLOC(mpiSize + 1, dim_t);
    newGlobalDOFID = TMPMEMALLOC(len, index_t);
    setNewDOFId = TMPMEMALLOC(in->Nodes->numNodes, bool_t);
    if (!(Dudley_checkPtr(partition) || Dudley_checkPtr(xyz) || Dudley_checkPtr(partition_count)
          || Dudley_checkPtr(new_distribution) || Dudley_checkPtr(newGlobalDOFID) || Dudley_checkPtr(setNewDOFId)))
    {
        dim_t *recvbuf = TMPMEMALLOC(mpiSize * mpiSize, dim_t);

        /* set the coordinates: */
        /* it is assumed that at least one node on this processor provides a coordinate */
#pragma omp parallel for private(i,j,k)
        for (i = 0; i < in->Nodes->numNodes; ++i)
        {
            k = in->Nodes->globalDegreesOfFreedom[i] - myFirstVertex;
            if ((k >= 0) && (k < myNumVertices))
            {
                for (j = 0; j < dim; ++j)
                    xyz[k * dim + j] = (float)(in->Nodes->Coordinates[INDEX2(j, i, dim)]);
            }
        }

        index_list = TMPMEMALLOC(myNumVertices, Dudley_IndexList);
        /* ksteube CSR of DOF IDs */
        /* create the adjacency structure xadj and adjncy */
        if (!Dudley_checkPtr(index_list))
        {
#pragma omp parallel private(i)
            {
#pragma omp for schedule(static)
                for (i = 0; i < myNumVertices; ++i)
                {
                    index_list[i].extension = NULL;
                    index_list[i].n = 0;
                }
                /* ksteube build CSR format */
                /* insert contributions from element matrices into the column index list index_list: */
                Dudley_IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list, myFirstVertex, myLastVertex,
                                                                          in->Elements,
                                                                          in->Nodes->globalDegreesOfFreedom,
                                                                          in->Nodes->globalDegreesOfFreedom);
                Dudley_IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list, myFirstVertex, myLastVertex,
                                                                          in->FaceElements,
                                                                          in->Nodes->globalDegreesOfFreedom,
                                                                          in->Nodes->globalDegreesOfFreedom);
                Dudley_IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list, myFirstVertex, myLastVertex,
                                                                          in->Points, in->Nodes->globalDegreesOfFreedom,
                                                                          in->Nodes->globalDegreesOfFreedom);
            }

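            /* at this point index_list[i] holds, for the owned DOF myFirstVertex+i, the set of global
               DOFs it couples to through shared elements; this adjacency information becomes the
               CSR arrays (xadj/adjncy) handed to ParMETIS via the Paso pattern below */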
            /* create the local matrix pattern */
            pattern = Dudley_IndexList_createPattern(0, myNumVertices, index_list, 0, globalNumVertices, 0);

            /* clean up index list */
            if (index_list != NULL)
            {
#pragma omp parallel for private(i)
                for (i = 0; i < myNumVertices; ++i)
                    Dudley_IndexList_free(index_list[i].extension);
            }

            if (Dudley_noError())
            {

#ifdef USE_PARMETIS

                if (mpiSize > 1 && Check_Inputs_For_Parmetis(mpiSize, myRank, distribution, &(in->MPIInfo->comm)) > 0)
                {
                    int i;
                    int wgtflag = 0;
                    int numflag = 0;    /* pattern->ptr is C style: starting from 0 instead of 1 */
                    int ncon = 1;
                    int edgecut;
                    int options[2];
                    float *tpwgts = TMPMEMALLOC(ncon * mpiSize, float);
                    float *ubvec = TMPMEMALLOC(ncon, float);
                    for (i = 0; i < ncon * mpiSize; i++)
                        tpwgts[i] = 1.0 / (float)mpiSize;
                    for (i = 0; i < ncon; i++)
                        ubvec[i] = 1.05;
                    options[0] = 3;
                    options[1] = 15;
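                    /* ParMETIS_V3_PartGeomKway arguments as used here:
                         distribution            vtxdist: global DOF (vertex) distribution over the ranks
                         pattern->ptr, ->index   xadj/adjncy: CSR adjacency of the locally owned vertices
                         NULL, NULL, wgtflag=0   no vertex or edge weights
                         numflag=0               C-style (0-based) numbering
                         dim, xyz                vertex coordinates for the geometric part of the method
                         ncon=1, tpwgts, ubvec   one balance constraint, uniform target weights, 5% imbalance tolerance
                         edgecut, partition      outputs: number of cut edges and the new owner rank of each local vertex */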
                    ParMETIS_V3_PartGeomKway(distribution, pattern->ptr, pattern->index, NULL, NULL,
                                             &wgtflag, &numflag, &dim, xyz, &ncon, &mpiSize,
                                             tpwgts, ubvec, options, &edgecut,
                                             partition,  /* new CPU ownership of elements */
                                             &(in->MPIInfo->comm));
                    /* printf("ParMETIS number of edges cut by partitioning per processor: %d\n", edgecut/MAX(in->MPIInfo->size,1)); */
                    TMPMEMFREE(ubvec);
                    TMPMEMFREE(tpwgts);
                }
                else
                {
                    for (i = 0; i < myNumVertices; ++i)
                        partition[i] = 0;   /* CPU 0 owns it */
                }
#else
                for (i = 0; i < myNumVertices; ++i)
                    partition[i] = myRank;  /* without ParMETIS each rank keeps its own vertices */
#endif

            }

            Paso_Pattern_free(pattern);

            /* create a new distribution and labelling of the DOF */
            memset(new_distribution, 0, mpiSize_size);
#pragma omp parallel private(loc_partition_count)
            {
                loc_partition_count = THREAD_MEMALLOC(mpiSize, dim_t);
                memset(loc_partition_count, 0, mpiSize_size);
#pragma omp for private(i)
                for (i = 0; i < myNumVertices; ++i)
                    loc_partition_count[partition[i]]++;
#pragma omp critical
                {
                    for (i = 0; i < mpiSize; ++i)
                        new_distribution[i] += loc_partition_count[i];
                }
                THREAD_MEMFREE(loc_partition_count);
            }
#ifdef ESYS_MPI
            /* recvbuf will be the concatenation of each CPU's contribution to new_distribution */
            MPI_Allgather(new_distribution, mpiSize, MPI_INT, recvbuf, mpiSize, MPI_INT, in->MPIInfo->comm);
#else
            for (i = 0; i < mpiSize; ++i)
                recvbuf[i] = new_distribution[i];
#endif
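            /* recvbuf[rank + mpiSize * p] is the number of vertices that rank p assigns to
               partition 'rank'; the loop below gives the vertices of partition 'rank'
               consecutive new global ids, ordered by the rank of the process that currently
               holds them, and accumulates the new distribution boundaries */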
            new_distribution[0] = 0;
            for (rank = 0; rank < mpiSize; rank++)
            {
                c = 0;
                for (i = 0; i < myRank; ++i)
                    c += recvbuf[rank + mpiSize * i];
                for (i = 0; i < myNumVertices; ++i)
                {
                    if (rank == partition[i])
                    {
                        newGlobalDOFID[i] = new_distribution[rank] + c;
                        c++;
                    }
                }
                for (i = myRank + 1; i < mpiSize; ++i)
                    c += recvbuf[rank + mpiSize * i];
                new_distribution[rank + 1] = new_distribution[rank] + c;
            }
            TMPMEMFREE(recvbuf);

            /* now the overlap needs to be created by sending the partition around */

            dest = Esys_MPIInfo_mod(mpiSize, myRank + 1);
            source = Esys_MPIInfo_mod(mpiSize, myRank - 1);
            current_rank = myRank;
#pragma omp parallel for private(i)
            for (i = 0; i < in->Nodes->numNodes; ++i)
                setNewDOFId[i] = TRUE;

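            /* newGlobalDOFID currently holds the new ids of the block owned by this rank; it is
               circulated around the processor ring so that, over the mpiSize passes below, every
               rank sees every block and can relabel the DOFs it references but does not own;
               setNewDOFId ensures each local node is relabelled only once */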
            for (p = 0; p < mpiSize; ++p)
            {
                firstVertex = distribution[current_rank];
                lastVertex = distribution[current_rank + 1];
#pragma omp parallel for private(i,j,k)
                for (i = 0; i < in->Nodes->numNodes; ++i)
                {
                    k = in->Nodes->globalDegreesOfFreedom[i];
                    if (setNewDOFId[i] && (firstVertex <= k) && (k < lastVertex))
                    {
                        in->Nodes->globalDegreesOfFreedom[i] = newGlobalDOFID[k - firstVertex];
                        setNewDOFId[i] = FALSE;
                    }
                }

                if (p < mpiSize - 1)
                {   /* the final send can be skipped */
#ifdef ESYS_MPI
                    MPI_Sendrecv_replace(newGlobalDOFID, len, MPI_INT,
                                         dest, in->MPIInfo->msg_tag_counter,
                                         source, in->MPIInfo->msg_tag_counter, in->MPIInfo->comm, &status);
#endif
                    in->MPIInfo->msg_tag_counter++;
                    current_rank = Esys_MPIInfo_mod(mpiSize, current_rank - 1);
                }
            }
            for (i = 0; i < mpiSize + 1; ++i)
                distribution[i] = new_distribution[i];
        }
        TMPMEMFREE(index_list);
    }
    TMPMEMFREE(newGlobalDOFID);
    TMPMEMFREE(setNewDOFId);
    TMPMEMFREE(new_distribution);
    TMPMEMFREE(partition_count);
    TMPMEMFREE(partition);
    TMPMEMFREE(xyz);
    return;
}
