/[escript]/trunk/paso/src/SolverFCT_solve.c
ViewVC logotype

Contents of /trunk/paso/src/SolverFCT_solve.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2078 - (show annotations)
Thu Nov 20 16:10:10 2008 UTC (11 years, 2 months ago) by phornby
File MIME type: text/plain
File size: 15464 byte(s)
Two changes.

1. Move blocktimer from escript to esysUtils.
2. Make it possible to link to paso as a DLL or .so.

Should have no effect on 'nix's

In respect of 1., blocktimer had begun to spring up everywhere, so
for the moment I thought it best to move it to the only other library that
pops up all over the place.

In respect of 2., paso needed to be a DLL in order to use the windows intelc /fast
option, which does aggressive multi-file optimisations. Even in its current form, it either
vectorises or parallelises  hundreds more loops in the esys system than appear in the pragmas.

In achieving 2. I have not been too delicate in adding

PASO_DLL_API

declarations to the .h files in paso/src. Only toward the end of the process of
the conversion, when the number of linker errors dropped below 20, say, did I choosy about what
functions in a header I declared PASO_DLL_API. As a result, there are likely to be many routines
declared as external functions symbols that are in fact internal to the paso DLL. 
Why is this an issue?  It prevents the intelc compiler from getting aggressive on the paso module.
With pain there is sometimes gain. At least all the DLL rules in windows give good
(non-microsoft) compiler writers a chance to really shine.

So, if you should see a PASO_DLL_API on a function in a paso header file,
and think to yourself, "that function is only called in paso, why export it?", then feel free to
delete the PASO_DLL_API export declaration.

Here's hoping for no breakage.....
1
2 /*******************************************************
3 *
4 * Copyright (c) 2003-2008 by University of Queensland
5 * Earth Systems Science Computational Center (ESSCC)
6 * http://www.uq.edu.au/esscc
7 *
8 * Primary Business: Queensland, Australia
9 * Licensed under the Open Software License version 3.0
10 * http://www.opensource.org/licenses/osl-3.0.php
11 *
12 *******************************************************/
13
14
15 /**************************************************************/
16
17 /* Paso: Flux correction transport solver
18 *
19 * solves Mu_t=Ku+q
20 * u(0) >=0
21 *
22 * Warning: the program assums sum_{j} k_{ij}=0!!!
23 *
24 */
25 /**************************************************************/
26
27 /* Author: l.gross@uq.edu.au */
28
29 /**************************************************************/
30
31 #include "Paso.h"
32 #include "Solver.h"
33 #include "SolverFCT.h"
34 #include "PasoUtil.h"
35 #include "esysUtils/blocktimer.h"
36 #ifdef _OPENMP
37 #include <omp.h>
38 #endif
39 #ifdef PASO_MPI
40 #include <mpi.h>
41 #endif
42
43 /*
44 * inserts the source term into the problem
45 */
46 void Paso_FCT_setSource(const dim_t n,const double *source, double* sourceN, double* sourceP)
47 {
48 dim_t i;
49 register double rtmp;
50 /*
51 * seperate source into positive and negative part:
52 */
53 #pragma omp parallel for private(i,rtmp)
54 for (i = 0; i < n; ++i) {
55 rtmp=source[i];
56 if (rtmp <0) {
57 sourceN[i]=-rtmp;
58 sourceP[i]=0;
59 } else {
60 sourceN[i]= 0;
61 sourceP[i]= rtmp;
62 }
63 }
64 }
65
66 err_t Paso_FCT_setUpRightHandSide(Paso_FCTransportProblem* fctp, const double dt, const double *u_m, Paso_Coupler* u_m_coupler, double * z_m,
67 Paso_SystemMatrix* flux_matrix, Paso_Coupler* uTilde_coupler, const double *b,
68 Paso_Coupler* QN_coupler, Paso_Coupler* QP_coupler,
69 double *RN_m, Paso_Coupler* RN_m_coupler, double* RP_m, Paso_Coupler* RP_m_coupler, const double *sourceN,
70 Paso_Performance* pp)
71 {
72 const dim_t n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
73 /* distribute u */
74 Paso_Coupler_startCollect(u_m_coupler,u_m);
75 Paso_Coupler_finishCollect(u_m_coupler);
76 /*
77 * set the ant diffusion fluxes:
78 *
79 */
80 Paso_FCTransportProblem_setAntiDiffusionFlux(dt,fctp,flux_matrix,u_m_coupler);
81 /*
82 * apply pre flux-correction: f_{ij}:=0 if f_{ij}*(\tilde{u}[i]- \tilde{u}[j])<=0
83 *
84 */
85 Paso_FCTransportProblem_applyPreAntiDiffusionCorrection(flux_matrix,uTilde_coupler);
86 /*
87 * set flux limiters RN_m,RP_m
88 *
89 */
90 Paso_FCTransportProblem_setRs(flux_matrix,fctp->lumped_mass_matrix,QN_coupler,QP_coupler,RN_m,RP_m);
91 Paso_Coupler_startCollect(RN_m_coupler,RN_m);
92 Paso_Coupler_startCollect(RP_m_coupler,RP_m);
93 /*
94 * z_m[i]=b[i] - (m_i*u_m[i] - dt*theta*sum_{j<>i} l_{ij} (u_m[j]-u_m[i]) + dt q^-[i])
95 *
96 * note that iteration_matrix stores the negative values of the
97 * low order transport matrix l therefore a=dt*fctp->theta is used.
98 */
99 Paso_SolverFCT_setMuPaLuPbQ(z_m,fctp->lumped_mass_matrix, u_m_coupler,dt*fctp->theta,fctp->iteration_matrix,dt,sourceN);
100 /* z_m=b-z_m */
101 Paso_Update(n,-1.,z_m,1.,b);
102
103 Paso_Coupler_finishCollect(RN_m_coupler);
104 Paso_Coupler_finishCollect(RP_m_coupler);
105 /* add corrected fluxes into z_m */
106 Paso_FCTransportProblem_addCorrectedFluxes(z_m,flux_matrix,RN_m_coupler,RP_m_coupler);
107 return NO_ERROR;
108 }
109
110 void Paso_SolverFCT_solve(Paso_FCTransportProblem* fctp, double* u, double dt, double* source, Paso_Options* options) {
111 const dim_t FAILURES_MAX=5;
112 dim_t m, i_substeps, Failed=0, i;
113 double *z_m=NULL, *b_n=NULL, *sourceP=NULL, *sourceN=NULL, *uTilde_n=NULL, *QN_n=NULL, *QP_n=NULL, *RN_m=NULL, *RP_m=NULL, *du_m=NULL;
114 Paso_Coupler *QN_n_coupler=NULL, *QP_n_coupler=NULL, *RN_m_coupler=NULL, *RP_m_coupler=NULL, *uTilde_n_coupler=NULL, *u_m_coupler=NULL;
115 Paso_SystemMatrix *flux_matrix_m=NULL;
116 double dt_max, dt2,t, norm_u_m, omega, norm_du_m;
117 register double mass, rtmp;
118 const dim_t n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
119 dim_t blockSize=Paso_FCTransportProblem_getBlockSize(fctp);
120 const double atol=options->absolute_tolerance;
121 const double rtol=options->tolerance;
122 const dim_t max_m=options->iter_max;
123 Paso_Performance pp;
124 bool_t converged=FALSE, max_m_reached=FALSE;
125 if (dt<=0.) {
126 Paso_setError(TYPE_ERROR,"Paso_SolverFCT_solve: dt must be positive.");
127 }
128 dt_max=Paso_FCTransportProblem_getSafeTimeStepSize(fctp);
129 /*
130 * allocate memory
131 *
132 */
133 z_m=TMPMEMALLOC(n,double);
134 Paso_checkPtr(z_m);
135 du_m=TMPMEMALLOC(n,double);
136 Paso_checkPtr(du_m);
137 b_n=TMPMEMALLOC(n,double);
138 Paso_checkPtr(b_n);
139 sourceP=TMPMEMALLOC(n,double);
140 Paso_checkPtr(sourceP);
141 sourceN=TMPMEMALLOC(n,double);
142 Paso_checkPtr(sourceN);
143 uTilde_n=TMPMEMALLOC(n,double);
144 Paso_checkPtr(uTilde_n);
145 QN_n=TMPMEMALLOC(n,double);
146 Paso_checkPtr(QN_n);
147 QP_n=TMPMEMALLOC(n,double);
148 Paso_checkPtr(QP_n);
149 RN_m=TMPMEMALLOC(n,double);
150 Paso_checkPtr(RN_m);
151 RP_m=TMPMEMALLOC(n,double);
152 Paso_checkPtr(RP_m);
153 QN_n_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
154 QP_n_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
155 RN_m_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
156 RP_m_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
157 uTilde_n_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
158 u_m_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
159 flux_matrix_m=Paso_SystemMatrix_alloc(fctp->transport_matrix->type,
160 fctp->transport_matrix->pattern,
161 fctp->transport_matrix->row_block_size,
162 fctp->transport_matrix->col_block_size);
163
164 if (Paso_noError()) {
165 /*
166 * Preparation:
167 *
168 */
169 Paso_FCT_setSource(n, source, sourceN, sourceP);
170 /*
171 * let the show begin!!!!
172 *
173 */
174 t=0;
175 i_substeps=0;
176 Paso_Copy(n,u,fctp->u);
177 norm_u_m=Paso_lsup(n,u, fctp->mpi_info);
178 /* while(i_substeps<n_substeps && Paso_noError()) { */
179 if (fctp->dt_max < LARGE_POSITIVE_FLOAT) {
180 dt2=MIN(dt_max,dt);
181 } else {
182 dt2=dt;
183 }
184 while(t<dt && Paso_noError()) {
185 printf("substep step %d at t=%e (step size= %e)\n",i_substeps+1,t+dt2,dt2);
186 Paso_FCT_setUp(fctp,dt2,sourceN,sourceP,b_n,uTilde_n,uTilde_n_coupler,QN_n,QN_n_coupler,QP_n,QP_n_coupler,
187 options,&pp);
188 /* now the iteration starts */
189 m=0;
190 converged=FALSE;
191 max_m_reached=FALSE;
192 /* tolerance? */
193 while ( (!converged) && (! max_m_reached) && Paso_noError()) {
194 /* set z_m */
195 Paso_FCT_setUpRightHandSide(fctp,dt2,u,u_m_coupler,z_m,flux_matrix_m,uTilde_n_coupler,b_n,
196 QN_n_coupler,QP_n_coupler,RN_m,RN_m_coupler,RP_m,RP_m_coupler,sourceN,&pp);
197 /*
198 * now we solve the linear system to get the correction dt:
199 *
200 */
201 if (fctp->theta > 0) {
202 omega=1./(dt2*fctp->theta);
203 Paso_Solver_solvePreconditioner(fctp->iteration_matrix,du_m,z_m);
204 Paso_Update(n,1.,u,omega,du_m);
205 } else {
206 omega=1;
207 #pragma omp parallel for private(i,mass,rtmp)
208 for (i = 0; i < n; ++i) {
209 mass=fctp->lumped_mass_matrix[i];
210 if (ABS(mass)>0.) {
211 rtmp=z_m[i]/mass;
212 } else {
213 rtmp=0;
214 }
215 du_m[i]=rtmp;
216 u[i]+=rtmp;
217 }
218 }
219 norm_u_m=Paso_lsup(n,u, fctp->mpi_info);
220 norm_du_m=Paso_lsup(n,du_m, fctp->mpi_info)*omega;
221 printf("iteration step %d completed: norm increment= %e (tolerance = %e)\n",m+1, norm_du_m, rtol * norm_u_m + atol);
222
223 max_m_reached=(m>max_m);
224 converged=(norm_du_m <= rtol * norm_u_m + atol);
225 m++;
226 }
227 if (converged) {
228 Failed=0;
229 /* #pragma omp parallel for schedule(static) private(i) */
230 Paso_Copy(n,fctp->u,u);
231 i_substeps++;
232 t+=dt2;
233 if (fctp->dt_max < LARGE_POSITIVE_FLOAT) {
234 dt2=MIN3(dt_max,dt2*1.5,dt-t);
235 } else {
236 dt2=MIN(dt2*1.5,dt-t);
237 }
238 } else if (max_m_reached) {
239 /* if FAILURES_MAX failures in a row: give up */
240 if (Failed > FAILURES_MAX) {
241 Paso_setError(VALUE_ERROR,"Paso_SolverFCT_solve: no convergence after time step reduction.");
242 } else {
243 printf("no convergence in Paso_Solver_NewtonGMRES: Trying smaller time step size.");
244 dt2=dt*0.5;
245 Failed++;
246 }
247 }
248
249 }
250 /*
251 * clean-up:
252 *
253 */
254 MEMFREE(z_m);
255 MEMFREE(du_m);
256 TMPMEMFREE(b_n);
257 Paso_SystemMatrix_free(flux_matrix_m);
258 TMPMEMFREE(sourceP);
259 TMPMEMFREE(sourceN);
260 TMPMEMFREE(uTilde_n);
261 TMPMEMFREE(QN_n);
262 TMPMEMFREE(QP_n);
263 TMPMEMFREE(RN_m);
264 TMPMEMFREE(RP_m);
265 Paso_Coupler_free(QN_n_coupler);
266 Paso_Coupler_free(QP_n_coupler);
267 Paso_Coupler_free(RN_m_coupler);
268 Paso_Coupler_free(RP_m_coupler);
269 Paso_Coupler_free(uTilde_n_coupler);
270 Paso_Coupler_free(u_m_coupler);
271 options->absolute_tolerance=atol;
272 options->tolerance=rtol;
273 }
274 }
275 double Paso_FCTransportProblem_getSafeTimeStepSize(Paso_FCTransportProblem* fctp)
276 {
277 dim_t i, n;
278 double dt_max, dt_max_loc;
279 register double l_ii,m;
280 n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
281 if (! fctp->valid_matrices) {
282 fctp->dt_max=LARGE_POSITIVE_FLOAT;
283 /* extract the row sum of the advective part */
284 Paso_SystemMatrix_rowSum(fctp->mass_matrix,fctp->lumped_mass_matrix);
285
286 /* set low order transport operator */
287 Paso_FCTransportProblem_setLowOrderOperator(fctp);
288 /*
289 * calculate time step size:
290 */
291 dt_max=LARGE_POSITIVE_FLOAT;
292 if (fctp->theta < 1.) {
293 #pragma omp parallel private(dt_max_loc)
294 {
295 dt_max_loc=LARGE_POSITIVE_FLOAT;
296 #pragma omp for schedule(static) private(i,l_ii,m)
297 for (i=0;i<n;++i) {
298 l_ii=fctp->main_diagonal_low_order_transport_matrix[i];
299 m=fctp->lumped_mass_matrix[i];
300 if ( (l_ii<0 && m>0.) || (l_ii>0 && m<0) ) {
301 dt_max_loc=MIN(dt_max_loc,-m/l_ii);
302 }
303 }
304 #pragma omp critical
305 {
306 dt_max=MIN(dt_max,dt_max_loc);
307 }
308 }
309 #ifdef PASO_MPI
310 dt_max_loc = dt_max;
311 MPI_Allreduce(&dt_max_loc, &dt_max, 1, MPI_DOUBLE, MPI_MIN, fctp->mpi_info->comm);
312 #endif
313 if (dt_max<LARGE_POSITIVE_FLOAT) dt_max*=1./(1.-fctp->theta);
314 }
315 if (dt_max <= 0.) {
316 Paso_setError(TYPE_ERROR,"Paso_SolverFCT_solve: dt must be positive.");
317 } else {
318 if (dt_max<LARGE_POSITIVE_FLOAT)
319 printf("maximum time step size is %e (theta = %e).\n",dt_max,fctp->theta);
320 }
321 fctp->dt_max=dt_max;
322 fctp->valid_matrices=Paso_noError();
323 }
324 return fctp->dt_max;
325 }
326 void Paso_FCT_setUp(Paso_FCTransportProblem* fctp, const double dt, const double *sourceN, const double *sourceP, double* b, double* uTilde,
327 Paso_Coupler* uTilde_coupler, double *QN, Paso_Coupler* QN_coupler, double *QP, Paso_Coupler* QP_coupler,
328 Paso_Options* options, Paso_Performance* pp)
329 {
330 dim_t i;
331 const dim_t n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
332 double omega, factor;
333 register double m, u_tilde_i, rtmp4;
334 /* distribute u */
335 Paso_Coupler_startCollect(fctp->u_coupler,fctp->u);
336 Paso_Coupler_finishCollect(fctp->u_coupler);
337 /*
338 * b^n[i]=m u^n[i] + dt*(1-theta) sum_{j <> i} l_{ij}*(u^n[j]-u^n[i]) + dt*sourceP[i]
339 *
340 * note that iteration_matrix stores the negative values of the
341 * low order transport matrix l therefore a=-dt*(1-fctp->theta) is used.
342 *
343 */
344 Paso_SolverFCT_setMuPaLuPbQ(b,fctp->lumped_mass_matrix,fctp->u_coupler,
345 -dt*(1-fctp->theta),fctp->iteration_matrix,dt,sourceP);
346 /*
347 * uTilde[i]=b[i]/m[i]
348 *
349 * fctp->iteration_matrix[i,i]=m[i]/(dt theta) + \frac{1}{\theta} \frac{q^-[i]}-l[i,i]
350 *
351 */
352 if (fctp->theta > 0) {
353 Paso_solve_free(fctp->iteration_matrix);
354 omega=1./(dt*fctp->theta);
355 factor=dt*omega;
356 #pragma omp parallel for private(i,m,u_tilde_i,rtmp4)
357 for (i = 0; i < n; ++i) {
358 m=fctp->lumped_mass_matrix[i];
359 if (ABS(m)>0.) {
360 u_tilde_i=b[i]/m;
361 } else {
362 u_tilde_i=fctp->u[i];
363 }
364 rtmp4=m*omega-fctp->main_diagonal_low_order_transport_matrix[i];
365 if (ABS(u_tilde_i)>0) rtmp4+=sourceN[i]*factor/u_tilde_i;
366 fctp->iteration_matrix->mainBlock->val[fctp->main_iptr[i]]=rtmp4;
367 uTilde[i]=u_tilde_i;
368 }
369 Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER_INIT);
370 Paso_Solver_setPreconditioner(fctp->iteration_matrix,options);
371 Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER_INIT);
372 } else {
373 #pragma omp parallel for private(i,m,u_tilde_i)
374 for (i = 0; i < n; ++i) {
375 m=fctp->lumped_mass_matrix[i];
376 if (ABS(m)>0.) {
377 u_tilde_i=b[i]/m;
378 } else {
379 u_tilde_i=fctp->u[i];
380 }
381 uTilde[i]=u_tilde_i;
382 }
383 /* no update of iteration_matrix required! */
384 } /* end (fctp->theta > 0) */
385
386 /* distribute uTilde: */
387 Paso_Coupler_startCollect(uTilde_coupler,uTilde);
388 Paso_Coupler_finishCollect(uTilde_coupler);
389 /*
390 * calculate QP[i] max_{j} (\tilde{u}[j]- \tilde{u}[i] )
391 * QN[i] min_{j} (\tilde{u}[j]- \tilde{u}[i] )
392 *
393 */
394 Paso_SolverFCT_setQs(uTilde_coupler,QN,QP,fctp->iteration_matrix);
395 Paso_Coupler_startCollect(QN_coupler,QN);
396 Paso_Coupler_startCollect(QP_coupler,QP);
397 Paso_Coupler_finishCollect(QN_coupler);
398 Paso_Coupler_finishCollect(QP_coupler);
399 }

  ViewVC Help
Powered by ViewVC 1.1.26