%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright (c) 2003-2012 by University of Queensland
% http://www.uq.edu.au
%
% Primary Business: Queensland, Australia
% Licensed under the Open Software License version 3.0
% http://www.opensource.org/licenses/osl-3.0.php
%
% Development until 2012 by Earth Systems Science Computational Center (ESSCC)
% Development since 2012 by School of Earth Sciences
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Minimization Algorithms}\label{chapter:ref:Minimization}
We need to find the level set function $m$ minimizing the cost function $J$ as
defined in equation~\ref{REF:EQU:INTRO 1}. The physical parameters $p^f$ and the
data defects are linked through the state variables $u^f$, which are given as the
solution of a partial differential equation (PDE) with
coefficients depending on $p^f$. This PDE (or, in the case of several forward models, this set of PDEs)
defines a constraint in the minimization problem. In the context of our applications
it can be assumed that the PDE is of 'maximum rank', i.e. for a given value of the level set function $m$
there is a unique value for the state variables $u^f$ as the solution of the forward model. So from a mathematical point of view
the state variables $u^f$ can be eliminated from the problem and the minimization problem becomes in fact
a minimization problem for the level set function $m$ alone, where the physical parameters, which are of most interest
in applications, can easily be derived from the solution. However, one needs to keep in mind that
each evaluation of the cost function requires the solution of a PDE (an additional PDE solution is required for the
gradient).

In some application cases the optimization problem to be solved defines a quadratic programming problem which would allow
the use of a special class of solvers. However, in the general scenarios we are interested in here we cannot assume this
simplification and need to be able to solve for a general cost function.
The method used here is the limited-memory Broyden-Fletcher-Goldfarb-Shanno (\emph{L-BFGS}) method, see~\cite{Nocedal1980}.
The method is a quasi-Newton method.
To implement the method one needs to provide the evaluation of the cost function $J$, its gradient $\nabla J$,
and a dual product $<.,.>$\footnote{A dual product $<.,.>$ defines a function which assigns to
a pair $(p,g)$ of a level set increment $p$ and a gradient $g$ a real number $<p,g>$. It is linear
in the first and linear in the second argument.}
such that for $\alpha \to 0$
\begin{equation}\label{EQU:MIN:1}
J(m+\alpha p) = J(m) + \alpha \cdot < p , \nabla J(m)> + o(\alpha)
\end{equation}
where $p$ is an increment to the level set
function\footnote{$o$ denotes the little-o notation, see \url{http://en.wikipedia.org/wiki/Big_O_notation}}.
Notice that if $m$ is the unknown solution one has $\nabla J(m)=0$.
Moreover, an approximation of the inverse of the Hessian operator $\nabla \nabla J(m)$ needs to be provided
for a given value of $m$. In the implementation we do not need to provide the entire operator; rather,
for a given gradient difference $g$ an approximation $h=Hg$ of $(\nabla \nabla J(m))^{-1}g$ needs to be
calculated. This is an approximate solution of the equation $\nabla \nabla J(m)\; h = g$, which in variational
form is given as
\begin{equation}\label{EQU:MIN:2}
<p, \nabla \nabla J(m)\; h > = <p, g>
\end{equation}
for all level set increments $p$. In the cases relevant here, a possible approach is to
make the approximation $\nabla \nabla J(m) \approx \nabla \nabla J^{reg}(m)$, which can easily be constructed
and for which the resulting variational equation~\ref{EQU:MIN:2} can easily be solved, see Chapter~\ref{Chp:ref:regularization}.
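
As a simple illustration of this notation (purely for demonstration, not part of the inversion formulation itself),
consider the quadratic cost function $J(m)=\frac{1}{2}<m, Am> - <m, b>$ for a symmetric, positive definite
operator $A$, and assume for this illustration that the dual product is a symmetric inner product. Then
\begin{equation*}
\nabla J(m) = Am - b, \qquad \nabla \nabla J(m) = A, \qquad h = H g = A^{-1} g,
\end{equation*}
expansion~\ref{EQU:MIN:1} holds with remainder $\frac{\alpha^2}{2} <p, Ap> = o(\alpha)$, and the variational
equation~\ref{EQU:MIN:2} reduces to $<p, A h> = <p, g>$ for all level set increments $p$.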

The L-BFGS is an iterative method which calculates a sequence of level set approximations $m_{k}$ which converges
towards the unknown level set function minimizing the cost function.
This iterative process is terminated if certain stopping criteria are fulfilled. A criterion
commonly used is
\begin{equation}\label{EQU:MIN:3a}
\| m_k - m_{k-1} \| \le \|m_k\| \cdot {x\_tol}
\end{equation}
in order to terminate the iteration after $m_k$ has been calculated. In this condition $\|.\|$ denotes a
norm and $x\_tol$ defines a relative tolerance. A typical value for $x\_tol$ is $10^{-4}$. Alternatively
one can check for changes in the cost function:
\begin{equation}\label{EQU:MIN:3b}
\mid J(m_k) - J(m_{k-1}) \mid \le \mid J(m_k) - J(m_0) \mid \cdot {J\_tol}
\end{equation}
where ${J\_tol}$ is a given tolerance and $m_0$ is the initial guess of the solution.
As this criterion depends on the initial guess it is not recommended.

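The two stopping tests can be written compactly. The following sketch is for illustration only
(it is not the implementation used by the solver); the helper \texttt{norm} and the combination of the
two tests are assumptions:
\begin{verbatim}
def stop_iteration(m_k, m_prev, J_k, J_prev, J_0, norm, x_tol=1e-4, J_tol=None):
    # criterion (3a): relative change of the level set function
    ok_m = norm(m_k - m_prev) <= norm(m_k) * x_tol
    # criterion (3b): relative change of the cost function (skipped if J_tol is None)
    ok_J = True if J_tol is None else abs(J_k - J_prev) <= abs(J_k - J_0) * J_tol
    return ok_m and ok_J
\end{verbatim}
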
\section{Solver Classes}

\begin{classdesc}{MinimizerLBFGS}{\optional{J=\None}}
Constructs an instance of the limited-memory Broyden-Fletcher-Goldfarb-Shanno (\emph{L-BFGS}) solver.
\member{J} sets the cost function to be minimized, see Section~\ref{chapter:ref:Minimization: costfunction class}.
If not present the cost function needs to be set using the \member{setCostFunction} method.
\member{imax} sets the maximum number of iteration steps.
\end{classdesc}

\begin{methoddesc}[MinimizerLBFGS]{setTolerance}{\optional{x_tol=1e-5} \optional{, J_tol=\None}}
sets the tolerances for the stopping criteria.
\member{x_tol} sets the tolerance for the unknown level set function, see criterion~\ref{EQU:MIN:3a}.
If \member{x_tol}=\None the tolerance on the level set function is not checked.
\member{J_tol} sets the tolerance for the cost function, see criterion~\ref{EQU:MIN:3b}.
If \member{J_tol}=\None the tolerance on the cost function is not checked (recommended).
If \member{x_tol} and \member{J_tol} are both set, criterion~\ref{EQU:MIN:3a} and
criterion~\ref{EQU:MIN:3b} are used to terminate the iteration.
\end{methoddesc}

\begin{methoddesc}[MinimizerLBFGS]{setMaxIterations}{imax}
sets the maximum number of iterations before the minimizer terminates. If
the maximum number of iterations is reached, a \exception{MinimizerMaxIterReached} exception is thrown.
\end{methoddesc}

\begin{methoddesc}[MinimizerLBFGS]{getResult}{}
returns the solution of the optimization problem.
This method can only be called if the optimization process has been completed.
If the iteration failed or an exception was thrown, the last available approximation of the solution is returned.
\end{methoddesc}

\begin{methoddesc}[MinimizerLBFGS]{getOptions}{}
returns the solver options as a dictionary.
\end{methoddesc}

\begin{methoddesc}[MinimizerLBFGS]{setOptions}{\optional{truncation=30} \optional{, restart=60}}
sets solver options. \member{truncation} defines the number of gradients to keep in memory.
\member{restart} defines the number of iteration steps after which the iteration is restarted.
\end{methoddesc}

\begin{methoddesc}[MinimizerLBFGS]{run}{m0}
runs the solution algorithm and returns an approximation of the solution.
\member{m0} sets the initial guess.
\end{methoddesc}

\begin{methoddesc}[MinimizerLBFGS]{logSummary}{}
prints log information.
\end{methoddesc}
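
A typical use of the solver is sketched below, where \texttt{f} stands for a hypothetical cost function
instance (see Section~\ref{chapter:ref:Minimization: costfunction class}) and \texttt{m0} for an initial
guess; the import path is an assumption and may differ in your installation:
\begin{verbatim}
# illustrative sketch only; the module path is an assumption
from esys.downunder.minimizers import MinimizerLBFGS

solver = MinimizerLBFGS(J=f)            # f is a CostFunction instance
solver.setTolerance(x_tol=1e-4)         # terminate on relative change of m, see (3a)
solver.setMaxIterations(100)            # MinimizerMaxIterReached is raised beyond this
solver.setOptions(truncation=30, restart=60)
solver.run(m0)                          # m0 is the initial level set function
m = solver.getResult()                  # best available approximation
solver.logSummary()                     # print solver statistics
\end{verbatim}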

\section{\member{CostFunction} Class Template}\label{chapter:ref:Minimization: costfunction class}

\begin{classdesc}{CostFunction}{}
template of a cost function $J$
\end{classdesc}

\begin{methoddesc}[CostFunction]{resetCounters}{}
resets all statistical counters.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getDualProduct}{m, g}
returns the dual product $<m,g>$ of \member{m} and \member{g}.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getValue}{m, *args}
returns the value $J(m)$ using the pre-calculated values \member{args} for $m$.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getGradient}{m, *args}
returns the gradient $\nabla J$ of $J$ at $m$ using the pre-calculated values \member{args} for $m$.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getArguments}{m}
returns pre-calculated values that are shared in the calculation of $J(m)$
and $\nabla J(m)$.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getNorm}{m}
returns the norm $\|m\|$ of \member{m}.
\end{methoddesc}
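
To illustrate the interface, the following is a minimal, self-contained sketch that mirrors the methods of
the template using plain NumPy arrays in place of escript data objects; the quadratic cost
$J(m)=\frac{1}{2}m^TAm - b^Tm$ is chosen purely for demonstration:
\begin{verbatim}
import numpy as np

class QuadraticCostFunction(object):
    """Toy cost function J(m) = 1/2 m^T A m - b^T m over NumPy arrays."""
    def __init__(self, A, b):
        self.A, self.b = A, b
    def resetCounters(self):
        pass  # no statistics kept in this toy example
    def getDualProduct(self, m, g):
        return np.dot(m, g)
    def getNorm(self, m):
        return np.linalg.norm(m)
    def getArguments(self, m):
        # pre-calculate A*m, shared by getValue and getGradient
        return (np.dot(self.A, m),)
    def getValue(self, m, Am):
        return 0.5 * np.dot(m, Am) - np.dot(self.b, m)
    def getGradient(self, m, Am):
        return Am - self.b
\end{verbatim}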


%
%\section{The Nonlinear Conjugate Gradient method}\label{sec:NLCG}
%The steps for NLCG are
%\begin{enumerate}
% \item Given an initial guess $\mat{x}_0$, compute the steepest descent
% direction using the gradient: $\mat{p}_0=-\nabla f(\mat{x}_0)$
% \item Perform an inexact \emph{line search} (see Section~\ref{sec:LineSearch}) to obtain the step
% length $\alpha_i$ which loosely minimizes $f(\mat{x}_i+\alpha_i \mat{d}_i)$
% \item Update the position: $\mat{x}_{i+1}=\mat{x}_i+\alpha_i \mat{d}_i$
% \item Update the conjugate direction: $\mat{d}_{i+1}=-\nabla f(\mat{x}_{i+1})+\beta_{i+1} \mat{d}_i$\label{CGupdate}
%\end{enumerate}

%\flushleft There are many choices for the CG update parameter $\beta_{i+1}$ in step
%\ref{CGupdate} above, see \cite{Hager2006} for a discussion. An efficient
%choice, proposed by Polak and Ribi\`{e}re, is:
%\begin{equation*}
% \beta_{i+1}=\text{max}\left\{\frac{\nabla f^T(\mat{x}_{i+1})(\nabla f(\mat{x}_{i+1})-\nabla f(\mat{x}_i))}{\| \nabla f(\mat{x}_i)\|}, 0\right\}
%\end{equation*}

\section{The L-BFGS Algorithm}

%\section{Limited-Memory BFGS Method}\label{sec:LBFGS}
The L-BFGS algorithm, see~\cite{Nocedal1980}, is as follows:
\begin{algorithm}
\mbox{L-BFGS}
\begin{program}
\BEGIN
\text{Input: initial guess $m_0$ and integer $truncation$};
\text{Allocate $truncation$ vector pairs \{$s_i$, $g_i$\}};
k \leftarrow 0;
H_0 \leftarrow I;
\WHILE \NOT\text{converged} \DO
p_k \leftarrow - |twoLoop|(\{s, g\});
\alpha_k \leftarrow |lineSearch|(m_k, p_k); \rcomment{See Section~\ref{sec:LineSearch}}
m_{k+1} \leftarrow m_k+\alpha_k p_k;
\IF k> truncation
\THEN
|Discard the vector pair | \{s_{k-truncation}, g_{k-truncation}\} | from storage|;
\FI
s_k \leftarrow m_{k+1}-m_k;
g_k \leftarrow \nabla J_{k+1}-\nabla J_k;
k \leftarrow k+1;
\OD
\WHERE
\FUNCT |twoLoop|(\{s, g\}) \BODY
\EXP q \leftarrow \nabla J_k;
\FOR i=k-1, k-2, \ldots, k-truncation \DO
\rho_i \leftarrow \frac{1}{<s_i, g_i>};
\alpha_i \leftarrow \rho_i <s_i, q>;
q \leftarrow q-\alpha_i\cdot g_i;
\OD
r \leftarrow H q;
\FOR i = k-truncation, k-truncation+1, \ldots, k-1 \DO
\beta \leftarrow \rho_i \cdot <r,g_i>;
r \leftarrow r + s_i \cdot (\alpha_i - \beta)
\OD
r \ENDEXP \ENDFUNCT
\END
\end{program}
\end{algorithm}
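
For readers who prefer code over pseudocode, a compact NumPy sketch of the two-loop recursion follows.
It illustrates the textbook algorithm with the identity as the inverse Hessian approximation $H$; it is not
the solver's actual implementation:
\begin{verbatim}
import numpy as np

def two_loop(grad_Jk, s_list, g_list):
    """Return H*grad_Jk from stored pairs s_i = m_{i+1}-m_i, g_i = grad J_{i+1}-grad J_i."""
    q = grad_Jk.copy()
    rho = [1.0 / np.dot(s, g) for s, g in zip(s_list, g_list)]
    alpha = [0.0] * len(s_list)
    for i in reversed(range(len(s_list))):          # i = k-1, ..., k-truncation
        alpha[i] = rho[i] * np.dot(s_list[i], q)
        q -= alpha[i] * g_list[i]
    r = q                                           # r <- H q with H = I here
    for i in range(len(s_list)):                    # i = k-truncation, ..., k-1
        beta = rho[i] * np.dot(r, g_list[i])
        r += s_list[i] * (alpha[i] - beta)
    return r                                        # search direction is -r
\end{verbatim}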

\subsection{Line Search}\label{sec:LineSearch}
The line search minimizes the function $\phi(\alpha)$ for a given level set function $m$ and
search direction $p$, see \cite{Nocedal2006} and \cite{MoreThuente1992}, with
\begin{equation}\label{EQU:MIN:22}
\phi(\alpha) = J(m+ \alpha \cdot p).
\end{equation}
Notice that
\begin{equation}\label{EQU:MIN:23}
\phi'(\alpha) = < p , \nabla J(m+\alpha \cdot p)>.
\end{equation}

\begin{algorithm}
\mbox{Line Search that satisfies the strong Wolfe conditions}
\begin{program}
\BEGIN
\text{Input: $\alpha_{max}>0, 0<c_1<c_2<1$};
i \leftarrow 1;
\WHILE 1 \DO
\IF \phi(\alpha_i) > \phi(0)+c_1 \alpha_i \phi'(0) \OR
(\phi(\alpha_i) \ge \phi(\alpha_{i-1}) \AND i>1)
\THEN
\alpha_* \leftarrow |zoom|(\alpha_{i-1}, \alpha_i);
|break|;
\FI
\IF \mid\phi'(\alpha_i)\mid \le -c_2\phi'(0)
\THEN
\alpha_* \leftarrow \alpha_i;
|break|;
\FI
\IF \phi'(\alpha_i) \ge 0
\THEN
\alpha_* \leftarrow |zoom|(\alpha_i, \alpha_{i-1});
|break|;
\FI
\alpha_{i+1} \leftarrow 2\alpha_i;
i \leftarrow i+1;
\OD
\WHERE
\FUNCT |zoom|(\alpha_{lo}, \alpha_{hi}) \BODY
\WHILE 1 \DO
\EXP \alpha_j \leftarrow \alpha_{lo}+\frac{\alpha_{hi}-\alpha_{lo}}{2};
\IF \phi(\alpha_j) > \phi(0)+c_1 \alpha_j\phi'(0) \OR %
\phi(\alpha_j) \ge \phi(\alpha_{lo})
\THEN
\alpha_{hi} \leftarrow \alpha_j;
\ELSE
\IF \mid\phi'(\alpha_j)\mid \le -c_2\phi'(0)
\THEN
\alpha_* \leftarrow \alpha_j;
|break|;
\FI
\IF \phi'(\alpha_j)(\alpha_{hi}-\alpha_{lo}) \ge 0
\THEN
\alpha_{hi} \leftarrow \alpha_{lo};
\FI
\alpha_{lo} \leftarrow \alpha_j;
\FI
\OD
\ENDEXP \ENDFUNCT
\END
\end{program}
\end{algorithm}
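
The same line search can be expressed as a short Python sketch for illustration; the parameter defaults
(such as $c_1$, $c_2$ and $\alpha_{max}$) and the bisection-based \texttt{zoom} follow the pseudocode above
and are assumptions, not the solver's actual settings:
\begin{verbatim}
def line_search(phi, dphi, alpha_max=50.0, c1=1e-4, c2=0.9, max_iter=25):
    """Return a step length satisfying the strong Wolfe conditions for
    phi(alpha) = J(m + alpha*p) and dphi(alpha) = <p, grad J(m + alpha*p)>."""
    phi0, dphi0 = phi(0.0), dphi(0.0)

    def zoom(lo, hi):
        for _ in range(max_iter):
            a = lo + 0.5 * (hi - lo)                 # bisect the bracket
            if phi(a) > phi0 + c1 * a * dphi0 or phi(a) >= phi(lo):
                hi = a
            else:
                if abs(dphi(a)) <= -c2 * dphi0:      # strong Wolfe curvature condition
                    return a
                if dphi(a) * (hi - lo) >= 0:
                    hi = lo
                lo = a
        return lo

    a_prev, a = 0.0, 1.0
    for i in range(max_iter):
        if phi(a) > phi0 + c1 * a * dphi0 or (i > 0 and phi(a) >= phi(a_prev)):
            return zoom(a_prev, a)
        if abs(dphi(a)) <= -c2 * dphi0:
            return a
        if dphi(a) >= 0:
            return zoom(a, a_prev)
        a_prev, a = a, min(2.0 * a, alpha_max)
    return a
\end{verbatim}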