/[escript]/branches/trilinos_from_5897/dudley/src/Mesh_optimizeDOFDistribution.cpp

Diff of /branches/trilinos_from_5897/dudley/src/Mesh_optimizeDOFDistribution.cpp


revision 6078 by caltinay, Wed Mar 2 04:13:26 2016 UTC → revision 6079 by caltinay, Mon Mar 21 12:22:38 2016 UTC
 *
 *****************************************************************************/

#include "Mesh.h"
#include "IndexList.h"

typedef float real_t;

namespace dudley {

#ifdef USE_PARMETIS
// Check whether there is any rank which has no vertex. In case
// such a rank exists, we don't use parmetis since parmetis requires
// that every rank has at least 1 vertex (at line 129 of file
// "xyzpart.c" in parmetis 3.1.1, variable "nvtxs" would be 0 if
// any rank has no vertex).
static bool allRanksHaveNodes(escript::JMPI mpiInfo,
                              const std::vector<index_t>& distribution)
{
    int ret = 1;

    if (mpiInfo->rank == 0) {
        for (int i = 0; i < mpiInfo->size; i++) {
            if (distribution[i + 1] == distribution[i]) {
                ret = 0;
                break;
            }
        }
        if (ret == 0)
            std::cerr << "INFO: ParMetis is not used since at least one rank "
                         "has no vertex." << std::endl;
    }
    MPI_Bcast(&ret, 1, MPI_INTEGER, 0, mpiInfo->comm);
    return ret == 1;
}
#endif
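
// Illustrative example (not part of the original source): with 3 ranks and a
// DOF distribution of {0, 10, 10, 25}, rank 1 owns no vertices because
// distribution[2] == distribution[1]. allRanksHaveNodes() then returns false,
// the ParMETIS-based repartitioning below is skipped, and the fallback branch
// assigns all locally owned DOFs to rank 0.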

/// optimizes the distribution of DOFs across processors using ParMETIS.
/// On return a new distribution is given and the globalDOF are relabeled
/// accordingly but the mesh has not been redistributed yet
void Mesh::optimizeDOFDistribution(std::vector<index_t>& distribution)
{
    int mpiSize = MPIInfo->size;
    const int myRank = MPIInfo->rank;
    const index_t myFirstVertex = distribution[myRank];
    const index_t myLastVertex = distribution[myRank + 1];
    const dim_t myNumVertices = myLastVertex - myFirstVertex;
    const dim_t numNodes = Nodes->getNumNodes();

    // first step is to distribute the elements according to a global X of DOF
    dim_t len = 0;
    for (int p = 0; p < mpiSize; ++p)
        len = std::max(len, distribution[p + 1] - distribution[p]);

    index_t* partition = new index_t[len];

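    // Note: 'len' is the maximum number of vertices owned by any rank, so both
    // 'partition' above and 'newGlobalDOFID' further down (allocated with the
    // same length) can hold any rank's block when the latter is rotated through
    // all ranks via MPI_Sendrecv_replace.
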
#ifdef USE_PARMETIS
    if (mpiSize > 1 && allRanksHaveNodes(MPIInfo, distribution)) {
        boost::scoped_array<IndexList> index_list(new IndexList[myNumVertices]);
        index_t dim = Nodes->numDim;

        // create the adjacency structure xadj and adjncy
#pragma omp parallel
        {
            // insert contributions from element matrices into columns index
            IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list.get(),
                    myFirstVertex, myLastVertex, Elements,
                    Nodes->globalDegreesOfFreedom);
            IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list.get(),
                    myFirstVertex, myLastVertex, FaceElements,
                    Nodes->globalDegreesOfFreedom);
            IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list.get(),
                    myFirstVertex, myLastVertex, Points,
                    Nodes->globalDegreesOfFreedom);
        }

        // create the local matrix pattern
        const dim_t globalNumVertices = distribution[mpiSize];
        paso::Pattern_ptr pattern = paso::Pattern::fromIndexListArray(0,
                myNumVertices, index_list.get(), 0, globalNumVertices, 0);

        // set the coordinates
        real_t* xyz = new real_t[myNumVertices * dim];
#pragma omp parallel for
        for (index_t i = 0; i < numNodes; ++i) {
            const index_t k = Nodes->globalDegreesOfFreedom[i] - myFirstVertex;
            if (k >= 0 && k < myNumVertices) {
                for (int j = 0; j < dim; ++j)
                    xyz[k * dim + j] = (real_t)(Nodes->Coordinates[INDEX2(j, i, dim)]);
            }
        }

        index_t wgtflag = 0;  // no vertex or edge weights
        index_t numflag = 0;  // C-style (0-based) numbering
        index_t ncon = 1;     // one balance constraint per vertex
        index_t edgecut;
        index_t impiSize = mpiSize;
        index_t options[3] = { 1, 0, 0 };
        // uniform target partition weights, 5% allowed load imbalance
        std::vector<real_t> tpwgts(ncon * mpiSize, 1.f / mpiSize);
        std::vector<real_t> ubvec(ncon, 1.05f);
        ParMETIS_V3_PartGeomKway(&distribution[0], pattern->ptr,
                pattern->index, NULL, NULL, &wgtflag, &numflag, &dim,
                xyz, &ncon, &impiSize, &tpwgts[0], &ubvec[0], options,
                &edgecut, partition, &MPIInfo->comm);
        delete[] xyz;
    } else {
        for (index_t i = 0; i < myNumVertices; ++i)
            partition[i] = 0; // CPU 0 owns all
    }
#else
#pragma omp parallel for
    for (index_t i = 0; i < myNumVertices; ++i)
        partition[i] = myRank;
#endif // USE_PARMETIS
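
    // At this point partition[i] holds the target rank for the i-th locally
    // owned DOF (i.e. global DOF myFirstVertex + i); when compiled without
    // ParMETIS every DOF simply stays on the current rank.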

    // create a new distribution and labeling of the DOF
    std::vector<index_t> new_distribution(mpiSize + 1);
#pragma omp parallel
    {
        std::vector<index_t> loc_partition_count(mpiSize);
#pragma omp for
        for (index_t i = 0; i < myNumVertices; ++i)
            loc_partition_count[partition[i]]++;
#pragma omp critical
        {
            for (int i = 0; i < mpiSize; ++i)
                new_distribution[i] += loc_partition_count[i];
        }
    }

    std::vector<index_t> recvbuf(mpiSize * mpiSize);
#ifdef ESYS_MPI
    // recvbuf will be the concatenation of each CPU's contribution to
    // new_distribution
    MPI_Allgather(&new_distribution[0], mpiSize, MPI_DIM_T, &recvbuf[0],
                  mpiSize, MPI_DIM_T, MPIInfo->comm);
#else
    for (int i = 0; i < mpiSize; ++i)
        recvbuf[i] = new_distribution[i];
#endif
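    // recvbuf[rank + mpiSize * p] now holds the number of DOFs that source
    // rank p assigns to target rank 'rank', i.e. each contiguous block of
    // mpiSize entries is one rank's contribution to the new distribution.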
    new_distribution[0] = 0;
    index_t* newGlobalDOFID = new index_t[len];
    for (int rank = 0; rank < mpiSize; rank++) {
        index_t c = 0;
        for (int i = 0; i < myRank; ++i)
            c += recvbuf[rank + mpiSize * i];
        for (index_t i = 0; i < myNumVertices; ++i) {
            if (rank == partition[i]) {
                newGlobalDOFID[i] = new_distribution[rank] + c;
                c++;
            }
        }
        for (int i = myRank + 1; i < mpiSize; ++i)
            c += recvbuf[rank + mpiSize * i];
        new_distribution[rank + 1] = new_distribution[rank] + c;
    }

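    // The loop above assigns consecutive new global ids per target rank: DOFs
    // moving to 'rank' are numbered from new_distribution[rank] onwards, offset
    // by the counts contributed by lower-numbered source ranks, so the new
    // numbering is contiguous on every rank.
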
    // now the overlap needs to be created by sending the partition around
#ifdef ESYS_MPI
    int dest = MPIInfo->mod_rank(myRank + 1);
    int source = MPIInfo->mod_rank(myRank - 1);
#endif
    int current_rank = myRank;
    bool* setNewDOFId = new bool[numNodes];
#pragma omp parallel for
    for (index_t i = 0; i < numNodes; ++i)
        setNewDOFId[i] = true;

    for (int p = 0; p < mpiSize; ++p) {
        const index_t firstVertex = distribution[current_rank];
        const index_t lastVertex = distribution[current_rank + 1];
#pragma omp parallel for
        for (index_t i = 0; i < numNodes; ++i) {
            const index_t k = Nodes->globalDegreesOfFreedom[i];
            if (setNewDOFId[i] && firstVertex <= k && k < lastVertex) {
                Nodes->globalDegreesOfFreedom[i] = newGlobalDOFID[k - firstVertex];
                setNewDOFId[i] = false;
            }
        }

        if (p < mpiSize - 1) { // the final send can be skipped
#ifdef ESYS_MPI
            MPI_Status status;
            MPI_Sendrecv_replace(newGlobalDOFID, len, MPI_DIM_T,
                                 dest, MPIInfo->counter(),
                                 source, MPIInfo->counter(),
                                 MPIInfo->comm, &status);
            MPIInfo->incCounter();
#endif
            current_rank = MPIInfo->mod_rank(current_rank - 1);
        }
    }
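
    // After mpiSize-1 rotations every rank has seen every owner's
    // newGlobalDOFID table, so DOFs that are merely referenced here (the
    // overlap) carry the new numbering as well. Finally the new distribution
    // is handed back to the caller.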
    for (int i = 0; i < mpiSize + 1; ++i)
        distribution[i] = new_distribution[i];

    delete[] newGlobalDOFID;
    delete[] setNewDOFId;
    delete[] partition;
}

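For reference, the renumbering scheme used in the loop over ranks above can be reproduced in isolation. The following sketch is illustrative only and not part of this revision: it is serial (no MPI), uses plain STL types, and the name relabelDofs is made up. It shows how per-rank counts turn into a contiguous new numbering per target rank, which is what the MPI version computes with recvbuf and new_distribution.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Serial sketch: given the target rank of every DOF, number the DOFs
    // consecutively per target rank and return the new ids plus the new
    // distribution (first new id owned by each rank).
    static void relabelDofs(const std::vector<int>& partition, int mpiSize,
                            std::vector<long>& newGlobalDOFID,
                            std::vector<long>& newDistribution)
    {
        newDistribution.assign(mpiSize + 1, 0);
        // count how many DOFs end up on each rank
        for (std::size_t i = 0; i < partition.size(); ++i)
            newDistribution[partition[i] + 1]++;
        // prefix sum gives the first new id per rank
        for (int r = 0; r < mpiSize; ++r)
            newDistribution[r + 1] += newDistribution[r];

        newGlobalDOFID.assign(partition.size(), 0);
        std::vector<long> next(newDistribution.begin(), newDistribution.end() - 1);
        for (std::size_t i = 0; i < partition.size(); ++i)
            newGlobalDOFID[i] = next[partition[i]]++;
    }

    int main()
    {
        // 6 DOFs, 2 ranks: DOFs 0, 2, 5 move to rank 0 and DOFs 1, 3, 4 to rank 1
        const std::vector<int> partition = { 0, 1, 0, 1, 1, 0 };
        std::vector<long> newId, newDist;
        relabelDofs(partition, 2, newId, newDist);

        // prints "0 3 1 4 5 2" and "0 3 6": ids are contiguous per target rank
        for (std::size_t i = 0; i < newId.size(); ++i)
            std::cout << newId[i] << ' ';
        std::cout << '\n';
        for (std::size_t i = 0; i < newDist.size(); ++i)
            std::cout << newDist[i] << ' ';
        std::cout << '\n';
        return 0;
    }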
