# Contents of /branches/doubleplusgood/doc/inversion/Minimization.tex

Revision 4345 - (show annotations)
Fri Mar 29 07:09:41 2013 UTC (6 years, 9 months ago) by jfenwick
File MIME type: application/x-tex
File size: 13805 byte(s)
Spelling fixes

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright (c) 2003-2013 by University of Queensland
%
% Primary Business: Queensland, Australia
% Licensed under the Open Software License version 3.0
%
% Development until 2012 by Earth Systems Science Computational Center (ESSCC)
% Development since 2012 by School of Earth Sciences
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Minimization Algorithms}\label{chapter:ref:Minimization}
We need to find the level set function $m$ minimizing the cost function $J$ as
defined in Equation~\ref{REF:EQU:INTRO 1}. The physical parameters $p^f$ and
the data defects are linked through state variables $u^f$ which is given as a
solution of a partial differential equation (PDE) with coefficients depending
on $p^f$. This PDE (or -- in case of several forward models -- this set of PDEs)
defines a constraint in the minimization problem.
In the context of our applications it can be assumed that the PDE is of
``maximum rank'', i.e.\ for a given value of the level set function $m$ there is a
unique value for the state variables $u^f$ as the solution of the forward model.
So from a mathematical point of view the state variable $u^f$ can be eliminated
from the problem and the minimization problem becomes in fact a minimization
problem for the level set function $m$ alone where the physical parameters
which are of most interest in applications can easily be derived from the
solution. However one needs to keep in mind that each evaluation of the cost
function requires the solution of a PDE (an additional PDE solution is required
for the gradient).

In some application cases the optimization problem to be solved defines a
quadratic programming problem which would allow to use a special case of
solvers.
However, for the general scenarios we are interested in here we cannot assume
this simplification and need to be able to solve for a general cost function.
The method used here is the limited-memory Broyden-Fletcher-Goldfarb-Shanno
(\emph{L-BFGS}) method, see~\cite{Nocedal1980}.
This method is a quasi-Newton method.
To implement the method one needs to provide the evaluation of the cost
function $J$ and its gradient $\nabla J$ and a dual product $<.,.>$\footnote{A
dual product $<.,.>$ defines a function which assigns a pair $(p,g)$ of a level
set increment $p$ and a gradient $g$ a real number $<p, g>$. It is linear
in the first and linear in the second argument.}
such that for $\alpha \to 0$
\begin{equation}\label{EQU:MIN:1}
J(m+\alpha p) = J(m) + \alpha \cdot < p , \nabla J(m)> + o(\alpha)
\end{equation}
where $p$ is an increment to the level set function\footnote{$o$ denotes the
little $o$ notation, see \url{http://en.wikipedia.org/wiki/Big_O_notation}}.
Notice that if $m$ is the unknown solution one has $\nabla J(m)=0$.
Moreover, an approximation of the inverse of the Hessian operator
$\nabla \nabla J(m)$ needs to be provided for a given value of $m$.
In the implementation we don't need to provide the entire operator but for a
given gradient difference $g$ an approximation $h=Hg$ of
$(\nabla \nabla J(m))^{-1}g$ needs to be calculated.
This is an approximative solution of the equation $\nabla \nabla J(m)\, h=g$
which in variational form is given as
\begin{equation}\label{EQU:MIN:2}
< p, \nabla \nabla J(m)\, h > = < p, g >
\end{equation}
for all level set increments $p$.
In the cases relevant here, it is a possible approach to make the approximation
$\nabla \nabla J(m) \approx \nabla \nabla J^{reg}(m)$ which can easily be
constructed and the resulting variational Equation~\ref{EQU:MIN:2} can easily
be solved, see Chapter~\ref{Chp:ref:regularization}.

L-BFGS is an iterative method which calculates a sequence of level set
approximations $m_{k}$ which converges towards the unknown level set function
minimizing the cost function.
This iterative process is terminated if certain stopping criteria are fulfilled.
A criterion commonly used is
\begin{equation}\label{EQU:MIN:3a}
\| m_k - m_{k-1} \| \le \|m_k\| \cdot {m\_tol}
\end{equation}
in order to terminate the iteration after $m_k$ has been calculated.
In this condition $\|.\|$ denotes a norm and $m\_tol$ defines a relative
tolerance. A typical value for $m\_tol$ is $10^{-4}$. Alternatively one can
check for changes in the cost function:
\begin{equation}\label{EQU:MIN:3b}
\mid J(m_k) - J(m_{k-1}) \mid \le \mid J(m_k) - J(m_0) \mid \cdot {J\_tol}
\end{equation}
where ${J\_tol}$ is a given tolerance and $m_0$ is the initial guess of the solution.
As this criterion depends on the initial guess it is not recommended.

\section{Solver Classes}

\begin{classdesc}{MinimizerLBFGS}{%
\optional{J=\None}%
\optional{, m_tol=1e-4}%
\optional{, J_tol=\None}%
\optional{, imax=300}}
constructs an instance of the limited-memory Broyden-Fletcher-Goldfarb-Shanno \emph{L-BFGS} solver.
\member{J} sets the cost function to be minimized, see Section~\ref{chapter:ref:Minimization: costfunction class}.
If not present the cost function needs to be set using the \member{setCostFunction} method.
\member{m_tol} and \member{J_tol} specify the tolerances for the stopping
criteria (see description of \member{setTolerance} below).
\member{imax} sets the maximum number of iteration steps (see
\member{setMaxIterations} below).
\end{classdesc}

\begin{methoddesc}[MinimizerLBFGS]{setTolerance}{\optional{m_tol=1e-4}\optional{, J_tol=\None}}
sets the tolerances for the stopping criteria.
\member{m_tol} sets the tolerance for the unknown level set Function~\ref{EQU:MIN:3a}.
106 If \member{m_tol}=\None the tolerance on level set function is not checked. 107 \member{J_tol} sets the tolerance for the cost function, see Equation~\ref{EQU:MIN:3b}. 108 If \member{J_tol}=\None tolerance on the cost function is not checked (recommended). 109 If \member{m_tol} and \member{J_tol} are both set criterion~\ref{EQU:MIN:3a} 110 and criterion~\ref{EQU:MIN:3b} are both applied to terminate the iteration. 111 \end{methoddesc} 112 113 \begin{methoddesc}[MinimizerLBFGS]{setMaxIterations}{imax} 114 sets the maximum number of iterations before the minimizer terminates. If 115 the maximum number of iteration is reached a \exception{MinimizerMaxIterReached} 116 exception is thrown. 117 \end{methoddesc} 118 119 \begin{methoddesc}[MinimizerLBFGS]{getResult}{} 120 returns the solution of the optimization problem. 121 This method only returns something useful if the optimization process has 122 completed. 123 If the iteration failed the last available approximation of the solution is 124 returned. 125 \end{methoddesc} 126 127 \begin{methoddesc}[MinimizerLBFGS]{getOptions}{} 128 returns the solver options as a dictionary which contains all available option 129 names and their current value. 130 \end{methoddesc} 131 132 \begin{methoddesc}[MinimizerLBFGS]{setOptions}{% 133 \optional{truncation=30}% 134 \optional{, restart=60}% 135 \optional{, initialHessian=1}} 136 sets solver options. \member{truncation} defines number of gradients to keep in memory. 137 \member{restart} defines the number of iteration steps after which the iteration is restarted. 138 \member{initialHessian} can be used to provide an estimate for the scale of 139 the inverse Hessian. If the cost function provides such an estimate this 140 option is ignored. 141 \end{methoddesc} 142 143 \begin{methoddesc}[MinimizerLBFGS]{run}{m0} 144 runs the solution algorithm and returns an approximation of the solution. 145 \member{m0} is the initial guess to use. 
This method may throw an exception if the maximum number of iterations is
reached or a solver breakdown occurs.
\end{methoddesc}

\begin{methoddesc}[MinimizerLBFGS]{logSummary}{}
writes some statistical information to the logger.
\end{methoddesc}

\section{\member{CostFunction} Class Template}\label{chapter:ref:Minimization: costfunction class}

\begin{classdesc}{CostFunction}{}
template of cost function $J$
\end{classdesc}

\begin{methoddesc}[CostFunction]{getDualProduct}{m, g}
returns the dual product $<m, g>$ of \member{m} and \member{g}.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getValue}{m, *args}
returns the value $J(m)$ using pre-calculated values \member{args} for $m$.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getGradient}{m, *args}
returns the gradient $\nabla J$ of $J$ at $m$ using pre-calculated values \member{args} for $m$.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getArguments}{m}
returns pre-calculated values that are shared in the calculation of $J(m)$ and
$\nabla J(m)$.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getNorm}{m}
returns the norm $\|m\|$ of \member{m}.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{updateHessian}{}
notifies the class that the Hessian operator needs to be updated in case the
class provides an approximation for the inverse of the Hessian.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getInverseHessianApproximation}{m, g, *args}
returns an approximative evaluation $p$ of the inverse of the Hessian operator
of the cost function for a given gradient $g$ at location $m$.
189 \end{methoddesc} 190 191 192 % 193 %\section{The Nonlinear Conjugate Gradient method}\label{sec:NLCG} 194 %The steps for NLCG are 195 %\begin{enumerate} 196 % \item Given an initial guess $\mat{x}_0$, compute the steepest descent 197 % direction using the gradient: $\mat{p}_0=-\nabla f(\mat{x}_0)$ 198 % \item Perform an inexact \emph{line search} (see Section~\ref{sec:LineSearch}) to obtain the step 199 % length $\alpha_i$ which loosely minimizes $f(\mat{x}_i+\alpha_i \mat{d}_i)$ 200 % \item Update the position: $\mat{x}_{i+1}=\mat{x}_i+\alpha_i \mat{d}_i$ 201 % \item Update the conjugate direction: $\mat{d}_{i+1}=-\nabla f(\mat{x}_{i+1})+\beta_{i+1} \mat{d}_i$\label{CGupdate} 202 %\end{enumerate} 203 204 %\flushleft There are many choices for the CG update parameter $\beta_{i+1}$ in step 205 %\ref{CGupdate} above, see \cite{Hager2006} for a discussion. An efficient 206 %choice, proposed by Polak and Ribi\`{e}re, is: 207 %\begin{equation*} 208 % \beta_{i+1}=\text{max}\left\{\frac{\nabla f^T(\mat{x}_{i+1})(\nabla f(\mat{x}_{i+1})-\nabla f(\mat{x}_i))}{\| \nabla f(\mat{x}_i)\|}, 0\right\} 209 %\end{equation*} 210 211 \section{The L-BFGS Algorithm} 212 213 %\section{Limited-Memory BFGS Method}\label{sec:LBFGS} 214 The L-BFGS algorithm looks as follows (see also~\cite{Nocedal1980})\index{L-BFGS}: 215 \begin{algorithm} 216 \mbox{L-BFGS} 217 \begin{program} 218 \BEGIN 219 \text{Input: initial guess $x_0$ and integer $m$}; 220 \text{Allocate $m$ vector pairs \{$s_i$, $g_i$\}}; 221 k \leftarrow 0; 222 H_0 \leftarrow I; 223 \WHILE \NOT\text{converged} \DO 224 p_k \leftarrow |twoLoop|(\{s, g\}); 225 \alpha_k \leftarrow |lineSearch|(m_k, p_k); \rcomment{See Section~\ref{sec:LineSearch}} 226 m_{k+1} \leftarrow m_k+\alpha_k p_k; 227 \IF k> truncation 228 \THEN 229 |Discard the vector pair | \{s_{k-truncation}, g_{k-truncation}\} | from storage|; 230 \FI 231 s_k \leftarrow m_{k+1}-m_k; 232 g_k \leftarrow \nabla J_{k+1}-\nabla J_k; 233 k \leftarrow k+1; 234 \OD 235 \WHERE 236 
\FUNCT |twoLoop|(\{s, g\}) \BODY
\EXP q \leftarrow \nabla J_k;
\FOR i=k-1, k-2, \ldots, k-truncation \DO
\rho_i \leftarrow \frac{1}{< s_i, g_i >};
\alpha_i \leftarrow \rho_i < s_i, q >;
q \leftarrow q-\alpha_i\cdot g_i;
\OD
r \leftarrow H q;
\FOR i = k-truncation, k-truncation+1, \ldots, k-1 \DO
\beta \leftarrow \rho_i \cdot < g_i, r >;
r \leftarrow r + s_i \cdot (\alpha_i - \beta)
\OD
r \ENDEXP \ENDFUNCT
\END
\end{program}
\end{algorithm}

\subsection{Line Search}\label{sec:LineSearch}
The line search procedure minimizes the function $\phi(\alpha)$ for a given
level set function $m$ and search direction $p$, see \cite{Nocedal2006},
\cite{MoreThuente1992} with
\begin{equation}\label{EQU:MIN:22}
\phi(\alpha) = J(m+ \alpha \cdot p)
\end{equation}
Notice that
\begin{equation}\label{EQU:MIN:23}
\phi'(\alpha) = < p , \nabla J(m+\alpha \cdot p)> \; .
\end{equation}

% NOTE(review): the input line and loop head of this algorithm were corrupted
% in the repository view; they have been reconstructed following the standard
% strong-Wolfe line search (Nocedal & Wright, Algorithm 3.5) -- please confirm
% against the original source.
\begin{algorithm}
\mbox{Line Search that satisfies strong Wolfe conditions}
\begin{program}
\BEGIN
\text{Input: $\alpha_{max}>0$, $0<c_1<c_2<1$};
\alpha_0 \leftarrow 0;
\alpha_1 \leftarrow 1;
i \leftarrow 1;
\WHILE \alpha_i < \alpha_{max} \DO
\IF \phi(\alpha_i) > \phi(0)+c_1 \alpha_i \phi'(0) \OR
(\phi(\alpha_i) \ge \phi(\alpha_{i-1}) \AND i>1)
\THEN
\alpha_* \leftarrow |zoom|(\alpha_{i-1}, \alpha_i);
|break|;
\FI
\IF \|\phi'(\alpha_i)\| \le -c_2\phi'(0)
\THEN
\alpha_* \leftarrow \alpha_i;
|break|;
\FI
\IF \phi'(\alpha_i) \ge 0
\THEN
\alpha_* \leftarrow |zoom|(\alpha_i, \alpha_{i-1});
|break|;
\FI
\alpha_{i+1} = 2\alpha_i;
i \leftarrow i+1;
\OD
\WHERE
\FUNCT |zoom|(\alpha_{lo}, \alpha_{hi}) \BODY
\WHILE 1 \DO
\EXP \alpha_j \leftarrow \alpha_{lo}+\frac{\alpha_{hi}-\alpha_{lo}}{2};
\IF \phi(\alpha_j) > \phi(0)+c_1 \alpha_j\phi'(0) \OR %
\phi(\alpha_j) \ge \phi(\alpha_{lo})
\THEN
\alpha_{hi} \leftarrow \alpha_j;
\ELSE
\IF \|\phi'(\alpha_j)\| \le -c_2\phi'(0)
\THEN
\alpha_* \leftarrow \alpha_j;
|break|;
\FI
\IF \phi'(\alpha_j)(\alpha_{hi}-\alpha_{lo}) \ge 0
\THEN
\alpha_{hi} \leftarrow \alpha_{lo};
\FI
\alpha_{lo}=\alpha_j;
\FI
\OD
\ENDEXP \ENDFUNCT
\END
\end{program}
\end{algorithm}