/[escript]/trunk/paso/src/SolverFCT_solve.c
ViewVC logotype

Annotation of /trunk/paso/src/SolverFCT_solve.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2078 - (hide annotations)
Thu Nov 20 16:10:10 2008 UTC (11 years, 3 months ago) by phornby
File MIME type: text/plain
File size: 15464 byte(s)
Two changes.

1. Move blocktimer from escript to esysUtils.
2. Make it possible to link to paso as a DLL or .so.

Should have no effect on 'nix's

In respect of 1., blocktimer had begun to spring up everywhere, so
for the moment I thought it best to move it to the only other library that
pops up all over the place.

In respect of 2., paso needed to be a DLL in order to use the windows intelc /fast
option, which does aggressive multi-file optimisations. Even in its current form, it either
vectorises or parallelises  hundreds more loops in the esys system than appear in the pragmas.

In achieving 2. I have not been too delicate in adding

PASO_DLL_API

declarations to the .h files in paso/src. Only toward the end of the process of
the conversion, when the number of linker errors dropped below 20, say, did I choosy about what
functions in a header I declared PASO_DLL_API. As a result, there are likely to be many routines
declared as external functions symbols that are in fact internal to the paso DLL. 
Why is this an issue?  It prevents the intelc compiler from getting aggressive on the paso module.
With pain there is sometimes gain. At least all the DLL rules in windows give good
(non-microsoft) compiler writers a chance to really shine.

So, if you should see a PASO_DLL_API on a function in a paso header file,
and think to yourself, "that function is only called in paso, why export it?", then feel free to
delete the PASO_DLL_API export declaration.

Here's hoping for no breakage.....
1 gross 1363
2     /*******************************************************
3 ksteube 1811 *
4     * Copyright (c) 2003-2008 by University of Queensland
5     * Earth Systems Science Computational Center (ESSCC)
6     * http://www.uq.edu.au/esscc
7     *
8     * Primary Business: Queensland, Australia
9     * Licensed under the Open Software License version 3.0
10     * http://www.opensource.org/licenses/osl-3.0.php
11     *
12     *******************************************************/
13 gross 1363
14 ksteube 1811
15 gross 1363 /**************************************************************/
16    
17     /* Paso: Flux correction transport solver
18     *
19 gross 1407 * solves Mu_t=Ku+q
20 gross 1570 * u(0) >=0
21 gross 1363 *
22 gross 1407 * Warning: the program assums sum_{j} k_{ij}=0!!!
23 gross 1363 *
24     */
25     /**************************************************************/
26    
27 gross 1804 /* Author: l.gross@uq.edu.au */
28 gross 1363
29     /**************************************************************/
30    
31     #include "Paso.h"
32     #include "Solver.h"
33     #include "SolverFCT.h"
34 gross 1804 #include "PasoUtil.h"
35 phornby 2078 #include "esysUtils/blocktimer.h"
36 gross 1370 #ifdef _OPENMP
37     #include <omp.h>
38     #endif
39     #ifdef PASO_MPI
40     #include <mpi.h>
41     #endif
42 gross 1363
43 ksteube 1812 /*
44     * inserts the source term into the problem
45     */
46     void Paso_FCT_setSource(const dim_t n,const double *source, double* sourceN, double* sourceP)
47     {
48     dim_t i;
49     register double rtmp;
50     /*
51     * seperate source into positive and negative part:
52     */
53     #pragma omp parallel for private(i,rtmp)
54     for (i = 0; i < n; ++i) {
55     rtmp=source[i];
56     if (rtmp <0) {
57     sourceN[i]=-rtmp;
58     sourceP[i]=0;
59     } else {
60     sourceN[i]= 0;
61     sourceP[i]= rtmp;
62     }
63     }
64     }
65    
66     err_t Paso_FCT_setUpRightHandSide(Paso_FCTransportProblem* fctp, const double dt, const double *u_m, Paso_Coupler* u_m_coupler, double * z_m,
67     Paso_SystemMatrix* flux_matrix, Paso_Coupler* uTilde_coupler, const double *b,
68     Paso_Coupler* QN_coupler, Paso_Coupler* QP_coupler,
69     double *RN_m, Paso_Coupler* RN_m_coupler, double* RP_m, Paso_Coupler* RP_m_coupler, const double *sourceN,
70     Paso_Performance* pp)
71     {
72     const dim_t n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
73     /* distribute u */
74     Paso_Coupler_startCollect(u_m_coupler,u_m);
75     Paso_Coupler_finishCollect(u_m_coupler);
76     /*
77     * set the ant diffusion fluxes:
78     *
79     */
80     Paso_FCTransportProblem_setAntiDiffusionFlux(dt,fctp,flux_matrix,u_m_coupler);
81     /*
82     * apply pre flux-correction: f_{ij}:=0 if f_{ij}*(\tilde{u}[i]- \tilde{u}[j])<=0
83     *
84     */
85     Paso_FCTransportProblem_applyPreAntiDiffusionCorrection(flux_matrix,uTilde_coupler);
86     /*
87     * set flux limiters RN_m,RP_m
88     *
89     */
90     Paso_FCTransportProblem_setRs(flux_matrix,fctp->lumped_mass_matrix,QN_coupler,QP_coupler,RN_m,RP_m);
91     Paso_Coupler_startCollect(RN_m_coupler,RN_m);
92     Paso_Coupler_startCollect(RP_m_coupler,RP_m);
93     /*
94     * z_m[i]=b[i] - (m_i*u_m[i] - dt*theta*sum_{j<>i} l_{ij} (u_m[j]-u_m[i]) + dt q^-[i])
95     *
96     * note that iteration_matrix stores the negative values of the
97     * low order transport matrix l therefore a=dt*fctp->theta is used.
98     */
99     Paso_SolverFCT_setMuPaLuPbQ(z_m,fctp->lumped_mass_matrix, u_m_coupler,dt*fctp->theta,fctp->iteration_matrix,dt,sourceN);
100     /* z_m=b-z_m */
101     Paso_Update(n,-1.,z_m,1.,b);
102    
103     Paso_Coupler_finishCollect(RN_m_coupler);
104     Paso_Coupler_finishCollect(RP_m_coupler);
105     /* add corrected fluxes into z_m */
106     Paso_FCTransportProblem_addCorrectedFluxes(z_m,flux_matrix,RN_m_coupler,RP_m_coupler);
107     return NO_ERROR;
108     }
109    
110 gross 1804 void Paso_SolverFCT_solve(Paso_FCTransportProblem* fctp, double* u, double dt, double* source, Paso_Options* options) {
111     const dim_t FAILURES_MAX=5;
112 jfenwick 1981 dim_t m, i_substeps, Failed=0, i;
113 gross 1804 double *z_m=NULL, *b_n=NULL, *sourceP=NULL, *sourceN=NULL, *uTilde_n=NULL, *QN_n=NULL, *QP_n=NULL, *RN_m=NULL, *RP_m=NULL, *du_m=NULL;
114     Paso_Coupler *QN_n_coupler=NULL, *QP_n_coupler=NULL, *RN_m_coupler=NULL, *RP_m_coupler=NULL, *uTilde_n_coupler=NULL, *u_m_coupler=NULL;
115     Paso_SystemMatrix *flux_matrix_m=NULL;
116 phornby 1917 double dt_max, dt2,t, norm_u_m, omega, norm_du_m;
117 gross 1804 register double mass, rtmp;
118     const dim_t n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
119     dim_t blockSize=Paso_FCTransportProblem_getBlockSize(fctp);
120     const double atol=options->absolute_tolerance;
121     const double rtol=options->tolerance;
122     const dim_t max_m=options->iter_max;
123     Paso_Performance pp;
124     bool_t converged=FALSE, max_m_reached=FALSE;
125     if (dt<=0.) {
126     Paso_setError(TYPE_ERROR,"Paso_SolverFCT_solve: dt must be positive.");
127     }
128     dt_max=Paso_FCTransportProblem_getSafeTimeStepSize(fctp);
129     /*
130     * allocate memory
131     *
132     */
133     z_m=TMPMEMALLOC(n,double);
134     Paso_checkPtr(z_m);
135     du_m=TMPMEMALLOC(n,double);
136     Paso_checkPtr(du_m);
137     b_n=TMPMEMALLOC(n,double);
138     Paso_checkPtr(b_n);
139     sourceP=TMPMEMALLOC(n,double);
140     Paso_checkPtr(sourceP);
141     sourceN=TMPMEMALLOC(n,double);
142     Paso_checkPtr(sourceN);
143     uTilde_n=TMPMEMALLOC(n,double);
144     Paso_checkPtr(uTilde_n);
145     QN_n=TMPMEMALLOC(n,double);
146     Paso_checkPtr(QN_n);
147     QP_n=TMPMEMALLOC(n,double);
148     Paso_checkPtr(QP_n);
149     RN_m=TMPMEMALLOC(n,double);
150     Paso_checkPtr(RN_m);
151     RP_m=TMPMEMALLOC(n,double);
152     Paso_checkPtr(RP_m);
153     QN_n_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
154     QP_n_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
155     RN_m_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
156     RP_m_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
157     uTilde_n_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
158     u_m_coupler=Paso_Coupler_alloc(Paso_FCTransportProblem_borrowConnector(fctp),blockSize);
159     flux_matrix_m=Paso_SystemMatrix_alloc(fctp->transport_matrix->type,
160     fctp->transport_matrix->pattern,
161     fctp->transport_matrix->row_block_size,
162     fctp->transport_matrix->col_block_size);
163    
164     if (Paso_noError()) {
165     /*
166     * Preparation:
167     *
168     */
169     Paso_FCT_setSource(n, source, sourceN, sourceP);
170     /*
171     * let the show begin!!!!
172     *
173     */
174     t=0;
175     i_substeps=0;
176     Paso_Copy(n,u,fctp->u);
177     norm_u_m=Paso_lsup(n,u, fctp->mpi_info);
178     /* while(i_substeps<n_substeps && Paso_noError()) { */
179     if (fctp->dt_max < LARGE_POSITIVE_FLOAT) {
180     dt2=MIN(dt_max,dt);
181     } else {
182     dt2=dt;
183     }
184     while(t<dt && Paso_noError()) {
185 ksteube 1812 printf("substep step %d at t=%e (step size= %e)\n",i_substeps+1,t+dt2,dt2);
186 gross 1804 Paso_FCT_setUp(fctp,dt2,sourceN,sourceP,b_n,uTilde_n,uTilde_n_coupler,QN_n,QN_n_coupler,QP_n,QP_n_coupler,
187     options,&pp);
188     /* now the iteration starts */
189     m=0;
190     converged=FALSE;
191     max_m_reached=FALSE;
192     /* tolerance? */
193     while ( (!converged) && (! max_m_reached) && Paso_noError()) {
194     /* set z_m */
195     Paso_FCT_setUpRightHandSide(fctp,dt2,u,u_m_coupler,z_m,flux_matrix_m,uTilde_n_coupler,b_n,
196     QN_n_coupler,QP_n_coupler,RN_m,RN_m_coupler,RP_m,RP_m_coupler,sourceN,&pp);
197     /*
198     * now we solve the linear system to get the correction dt:
199     *
200     */
201     if (fctp->theta > 0) {
202     omega=1./(dt2*fctp->theta);
203     Paso_Solver_solvePreconditioner(fctp->iteration_matrix,du_m,z_m);
204     Paso_Update(n,1.,u,omega,du_m);
205     } else {
206     omega=1;
207     #pragma omp parallel for private(i,mass,rtmp)
208     for (i = 0; i < n; ++i) {
209     mass=fctp->lumped_mass_matrix[i];
210     if (ABS(mass)>0.) {
211     rtmp=z_m[i]/mass;
212     } else {
213     rtmp=0;
214     }
215     du_m[i]=rtmp;
216     u[i]+=rtmp;
217     }
218     }
219     norm_u_m=Paso_lsup(n,u, fctp->mpi_info);
220     norm_du_m=Paso_lsup(n,du_m, fctp->mpi_info)*omega;
221 ksteube 1812 printf("iteration step %d completed: norm increment= %e (tolerance = %e)\n",m+1, norm_du_m, rtol * norm_u_m + atol);
222 gross 1804
223     max_m_reached=(m>max_m);
224     converged=(norm_du_m <= rtol * norm_u_m + atol);
225     m++;
226     }
227     if (converged) {
228     Failed=0;
229 ksteube 1812 /* #pragma omp parallel for schedule(static) private(i) */
230 gross 1804 Paso_Copy(n,fctp->u,u);
231     i_substeps++;
232     t+=dt2;
233     if (fctp->dt_max < LARGE_POSITIVE_FLOAT) {
234     dt2=MIN3(dt_max,dt2*1.5,dt-t);
235     } else {
236     dt2=MIN(dt2*1.5,dt-t);
237     }
238     } else if (max_m_reached) {
239     /* if FAILURES_MAX failures in a row: give up */
240     if (Failed > FAILURES_MAX) {
241     Paso_setError(VALUE_ERROR,"Paso_SolverFCT_solve: no convergence after time step reduction.");
242     } else {
243     printf("no convergence in Paso_Solver_NewtonGMRES: Trying smaller time step size.");
244     dt2=dt*0.5;
245     Failed++;
246     }
247     }
248    
249     }
250     /*
251     * clean-up:
252     *
253     */
254     MEMFREE(z_m);
255     MEMFREE(du_m);
256     TMPMEMFREE(b_n);
257     Paso_SystemMatrix_free(flux_matrix_m);
258     TMPMEMFREE(sourceP);
259     TMPMEMFREE(sourceN);
260     TMPMEMFREE(uTilde_n);
261     TMPMEMFREE(QN_n);
262     TMPMEMFREE(QP_n);
263     TMPMEMFREE(RN_m);
264     TMPMEMFREE(RP_m);
265     Paso_Coupler_free(QN_n_coupler);
266     Paso_Coupler_free(QP_n_coupler);
267     Paso_Coupler_free(RN_m_coupler);
268     Paso_Coupler_free(RP_m_coupler);
269     Paso_Coupler_free(uTilde_n_coupler);
270     Paso_Coupler_free(u_m_coupler);
271     options->absolute_tolerance=atol;
272     options->tolerance=rtol;
273     }
274     }
275 gross 1407 double Paso_FCTransportProblem_getSafeTimeStepSize(Paso_FCTransportProblem* fctp)
276     {
277 gross 1804 dim_t i, n;
278 gross 1407 double dt_max, dt_max_loc;
279 gross 1804 register double l_ii,m;
280     n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
281 gross 1407 if (! fctp->valid_matrices) {
282     fctp->dt_max=LARGE_POSITIVE_FLOAT;
283 gross 1363 /* extract the row sum of the advective part */
284 gross 1407 Paso_SystemMatrix_rowSum(fctp->mass_matrix,fctp->lumped_mass_matrix);
285 gross 1363
286 gross 1407 /* set low order transport operator */
287     Paso_FCTransportProblem_setLowOrderOperator(fctp);
288     /*
289 gross 1370 * calculate time step size:
290 gross 1407 */
291     dt_max=LARGE_POSITIVE_FLOAT;
292 gross 1370 if (fctp->theta < 1.) {
293 gross 1407 #pragma omp parallel private(dt_max_loc)
294 gross 1370 {
295 gross 1407 dt_max_loc=LARGE_POSITIVE_FLOAT;
296 gross 1804 #pragma omp for schedule(static) private(i,l_ii,m)
297     for (i=0;i<n;++i) {
298 phornby 1917 l_ii=fctp->main_diagonal_low_order_transport_matrix[i];
299     m=fctp->lumped_mass_matrix[i];
300     if ( (l_ii<0 && m>0.) || (l_ii>0 && m<0) ) {
301     dt_max_loc=MIN(dt_max_loc,-m/l_ii);
302     }
303     }
304 gross 1372 #pragma omp critical
305 phornby 1917 {
306     dt_max=MIN(dt_max,dt_max_loc);
307     }
308 gross 1370 }
309     #ifdef PASO_MPI
310 gross 1407 dt_max_loc = dt_max;
311 phornby 1917 MPI_Allreduce(&dt_max_loc, &dt_max, 1, MPI_DOUBLE, MPI_MIN, fctp->mpi_info->comm);
312     #endif
313     if (dt_max<LARGE_POSITIVE_FLOAT) dt_max*=1./(1.-fctp->theta);
314 gross 1407 }
315     if (dt_max <= 0.) {
316 gross 1400 Paso_setError(TYPE_ERROR,"Paso_SolverFCT_solve: dt must be positive.");
317 gross 1407 } else {
318 phornby 1917 if (dt_max<LARGE_POSITIVE_FLOAT)
319     printf("maximum time step size is %e (theta = %e).\n",dt_max,fctp->theta);
320 gross 1370 }
321 gross 1407 fctp->dt_max=dt_max;
322 gross 1370 fctp->valid_matrices=Paso_noError();
323 gross 1363 }
324 gross 1407 return fctp->dt_max;
325     }
326 ksteube 1812 void Paso_FCT_setUp(Paso_FCTransportProblem* fctp, const double dt, const double *sourceN, const double *sourceP, double* b, double* uTilde,
327 gross 1804 Paso_Coupler* uTilde_coupler, double *QN, Paso_Coupler* QN_coupler, double *QP, Paso_Coupler* QP_coupler,
328     Paso_Options* options, Paso_Performance* pp)
329     {
330     dim_t i;
331     const dim_t n=Paso_SystemMatrix_getTotalNumRows(fctp->transport_matrix);
332     double omega, factor;
333     register double m, u_tilde_i, rtmp4;
334     /* distribute u */
335     Paso_Coupler_startCollect(fctp->u_coupler,fctp->u);
336     Paso_Coupler_finishCollect(fctp->u_coupler);
337     /*
338     * b^n[i]=m u^n[i] + dt*(1-theta) sum_{j <> i} l_{ij}*(u^n[j]-u^n[i]) + dt*sourceP[i]
339     *
340     * note that iteration_matrix stores the negative values of the
341     * low order transport matrix l therefore a=-dt*(1-fctp->theta) is used.
342     *
343     */
344     Paso_SolverFCT_setMuPaLuPbQ(b,fctp->lumped_mass_matrix,fctp->u_coupler,
345     -dt*(1-fctp->theta),fctp->iteration_matrix,dt,sourceP);
346     /*
347     * uTilde[i]=b[i]/m[i]
348     *
349     * fctp->iteration_matrix[i,i]=m[i]/(dt theta) + \frac{1}{\theta} \frac{q^-[i]}-l[i,i]
350     *
351     */
352     if (fctp->theta > 0) {
353     Paso_solve_free(fctp->iteration_matrix);
354     omega=1./(dt*fctp->theta);
355     factor=dt*omega;
356     #pragma omp parallel for private(i,m,u_tilde_i,rtmp4)
357     for (i = 0; i < n; ++i) {
358     m=fctp->lumped_mass_matrix[i];
359     if (ABS(m)>0.) {
360     u_tilde_i=b[i]/m;
361     } else {
362     u_tilde_i=fctp->u[i];
363     }
364     rtmp4=m*omega-fctp->main_diagonal_low_order_transport_matrix[i];
365     if (ABS(u_tilde_i)>0) rtmp4+=sourceN[i]*factor/u_tilde_i;
366     fctp->iteration_matrix->mainBlock->val[fctp->main_iptr[i]]=rtmp4;
367     uTilde[i]=u_tilde_i;
368     }
369     Performance_startMonitor(pp,PERFORMANCE_PRECONDITIONER_INIT);
370     Paso_Solver_setPreconditioner(fctp->iteration_matrix,options);
371     Performance_stopMonitor(pp,PERFORMANCE_PRECONDITIONER_INIT);
372     } else {
373     #pragma omp parallel for private(i,m,u_tilde_i)
374     for (i = 0; i < n; ++i) {
375     m=fctp->lumped_mass_matrix[i];
376     if (ABS(m)>0.) {
377     u_tilde_i=b[i]/m;
378     } else {
379     u_tilde_i=fctp->u[i];
380     }
381     uTilde[i]=u_tilde_i;
382 gross 1407 }
383 gross 1804 /* no update of iteration_matrix required! */
384     } /* end (fctp->theta > 0) */
385 gross 1562
386 gross 1804 /* distribute uTilde: */
387     Paso_Coupler_startCollect(uTilde_coupler,uTilde);
388     Paso_Coupler_finishCollect(uTilde_coupler);
389     /*
390     * calculate QP[i] max_{j} (\tilde{u}[j]- \tilde{u}[i] )
391     * QN[i] min_{j} (\tilde{u}[j]- \tilde{u}[i] )
392     *
393     */
394     Paso_SolverFCT_setQs(uTilde_coupler,QN,QP,fctp->iteration_matrix);
395     Paso_Coupler_startCollect(QN_coupler,QN);
396     Paso_Coupler_startCollect(QP_coupler,QP);
397     Paso_Coupler_finishCollect(QN_coupler);
398     Paso_Coupler_finishCollect(QP_coupler);
399     }

  ViewVC Help
Powered by ViewVC 1.1.26