 Diff of /trunk/doc/inversion/Minimization.tex

revision 4142 by gross, Tue Jan 15 09:06:06 2013 UTC revision 4143 by gross, Thu Jan 17 08:48:47 2013 UTC
# Line 13  Line 13
13  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
14
15  \chapter{Minimization Algorithms}\label{chapter:ref:Minimization}  \chapter{Minimization Algorithms}\label{chapter:ref:Minimization}
16  THIS NEEDS REVISION  We need to find the level set function $m$ minimizing the cost function $J$ as
17    defined in equation~\ref{REF:EQU:INTRO 1}. The physical parameters $p^f$ and the
18    data defects are linked through state variables $u^f$ which are
19    given as the solution of a partial differential equation (PDE) with
20    coefficients depending on $p^f$. This PDE (or---in case of several forward models---this set of PDEs)
21    defines a constraint in the minimization problem. In the context of our applications
22    it can be assumed that the PDE is of `maximum rank', i.e.\ for a given value of the level set function $m$
23    there is a unique value for the state variables $u^f$ as the solution of the forward model. So from a mathematical point of view
24    the state variable $u^f$ can be eliminated from the problem and the minimization problem becomes in fact
25    a minimization problem for the level set function $m$ alone where the physical parameters which are of most interest
26    in applications can easily be derived from the solution. However, one needs to keep in mind that
27    each evaluation of the cost function requires the solution of a PDE (an additional PDE solution is required for the gradient).
29
30    In some application cases the optimization problem to be solved defines a quadratic programming problem which would allow
31    to use a special class of solvers. However, in the general scenarios we are interested in here, we cannot assume this
32    simplification and need to be able to solve for a general cost function.
33    The method used here is the limited-memory Broyden-Fletcher-Goldfarb-Shanno (\emph{L-BFGS}) method, see~\cite{Nocedal1980}.
34    The method is a quasi-Newton method.
35    To implement the method one needs to provide the evaluation of the cost function $J$ and its gradient $\nabla J$
36    and the dual product $<.,.>$\footnote{A dual product $<.,.>$ defines a function which assigns
37    to a pair $(p,g)$ of a level set increment $p$ and a gradient $g$ a real number $<p,g>$. It is linear
38    in the first and linear in the second argument.}
39    such that for $\alpha \to 0$
40    \begin{equation}\label{EQU:MIN:1}
41    J(m+\alpha p) = J(m) + \alpha \cdot < p , \nabla J(m)> + o(\alpha)
42    \end{equation}
43    where $p$ is an increment to the level set
44    function\footnote{$o$ denotes the little-$o$ notation, see \url{http://en.wikipedia.org/wiki/Big_O_notation}}.
45    Notice that if $m$ is the unknown solution one has $\nabla J(m)=0$.
46    Moreover, an approximation of the inverse of the Hessian operator $\nabla \nabla J(m)$ needs to be provided
47    for a given value of  $m$. In the implementation we don't need to provide the entire operator but
48    for a given gradient difference $g$ an approximation $h=Hg$ of $(\nabla \nabla J(m))^{-1}g$ needs to be
49    calculated. This is an approximate solution of the equation $\nabla \nabla J(m)\, h = g$ which in variational
50    form is given as
51    \begin{equation}\label{EQU:MIN:2}
52    <p, \nabla \nabla J(m)\; h > = <p, g>
53    \end{equation}
54    for all level set increments $p$. In the cases relevant here, it is a possible approach to
55    make the approximation $\nabla \nabla J(m) \approx \nabla \nabla J^{reg}(m)$ which can easily be constructed
56    and the resulting variational equation~\ref{EQU:MIN:2} can easily be solved, see Chapter~\ref{Chp:ref:regularization}.
57
58    The L-BFGS is an iterative method which calculates a sequence of level set approximations $m_{k}$ which converges
59    towards the unknown level set function minimizing the cost function.
60    This iterative process is terminated if certain stopping criteria are fulfilled. A criterion
61    commonly used is
62    \begin{equation}\label{EQU:MIN:3a}
63    \| m_k - m_{k-1} \| \le \|m_k\| \cdot {m\_tol}
64    \end{equation}
65    in order to terminate the iteration after $m_k$ has been calculated. In this condition $\|.\|$ denotes a
66    norm and ${m\_tol}$ defines a relative tolerance. A typical value for ${m\_tol}$ is $10^{-4}$. Alternatively
67    one can check for changes in the cost function:
68    \begin{equation}\label{EQU:MIN:3b}
69    \mid J(m_k) - J(m_{k-1}) \mid \le \mid J(m_k) - J(m_0) \mid  \cdot {J\_tol}
70    \end{equation}
71    where ${J\_tol}$ is a given tolerance and $m_0$ is the initial guess of the solution.
72    As this criterion depends on the initial guess it is not recommended.
73
74    \section{Solver Classes}
75
76    \begin{classdesc}{MinimizerLBFGS}{\optional{J=\None}}
77    Constructs an instance of the limited-memory Broyden-Fletcher-Goldfarb-Shanno \emph{L-BFGS} solver.
78    \member{J} set the cost function to be minimized, see Section~\ref{chapter:ref:Minimization: costfunction class}.
79    If not present the cost function needs to be set using the \member{setCostFunction} method.
80    \member{imax} sets the maximum number of iteration steps.
81    \end{classdesc}
82
83    \begin{methoddesc}[MinimizerLBFGS]{setTolerance}{\optional{x_tol=1e-5} \optional{, J_tol=\None}}
84    sets the tolerances for the stopping criteria.
85    \member{x_tol} sets the tolerance for the unknown level set function~\ref{EQU:MIN:3a}. If \member{x_tol}=\None
86    tolerance on  level set function is not checked.
87    \member{J_tol} sets the tolerance for the cost function, see equation~\ref{EQU:MIN:3b}.
88    If \member{J_tol} = \None
89    tolerance on the cost function is not checked (recommended). If \member{x_tol} and \member{J_tol} are both set,
90    criterion~\ref{EQU:MIN:3a} and criterion~\ref{EQU:MIN:3b} must both be fulfilled to terminate the iteration.
91    \end{methoddesc}
92
93
94  Let $f(\mat{x}), \mat{x} \in \mathbb{R}^n$ be a smooth nonlinear function.  \begin{methoddesc}[MinimizerLBFGS]{setMaxIterations}{imax}
95  There are a number of algorithms that try to find a minimum of $f(\mat{x})$  sets the maximum number of iterations before the minimizer terminates. If
96  which have different requirements on $f$ in order to be successful.  the maximum number of iterations is reached, a \exception{MinimizerMaxIterReached} exception is thrown.
97  We assume that $n$ is large and computing the Hessian matrix  \end{methoddesc}
$\nabla^2 f(\mat{x})$ is not feasible.
The algorithms that are most widely used in the literature for this type of
optimization problem is the
%the \emph{Nonlinear Conjugate Gradient} (\emph{NLCG})
the limited-memory Broyden-Fletcher-Goldfarb-Shanno (\emph{L-BFGS}),
which is a \emph{quasi-Newton} method.
98
99    \begin{methoddesc}[MinimizerLBFGS]{getResult}{}
100    returns the solution of the optimization problem.
101    This method can only be called if the optimization process has been completed.
102    If the iteration failed the last available approximation of the solution is returned.
103    If an exception is thrown, the last available solution approximation is returned.
104    \end{methoddesc}
105
106    \begin{methoddesc}[MinimizerLBFGS]{getOptions}{}
107    returns the solver options as a dictionary.
108    \end{methoddesc}
109
110    \begin{methoddesc}[MinimizerLBFGS]{setOptions}{\optional{truncation=30} \optional{, restart=60}}
111    set solver options. \member{truncation} defines number of gradients to keep in memory.
112    \member{restart} defines the number of iteration steps after which the iteration is restarted.
113    \end{methoddesc}
114
115    \begin{methoddesc}[MinimizerLBFGS]{run}{m0}
116    runs the solution algorithm and returns an approximation of the solution.
117    \member{m0} set the initial guess.
118    \end{methoddesc}
119
120    \begin{methoddesc}[MinimizerLBFGS]{logSummary}{}
122    \end{methoddesc}
123
124    \section{\member{CostFunction} Class Template}\label{chapter:ref:Minimization: costfunction class}
125
126    \begin{classdesc}{CostFunction}{}
127    template of cost function $J$
128    \end{classdesc}
129
130    \begin{methoddesc}[CostFunction]{resetCounters}{}
131        resets all statistical counters.
132    \end{methoddesc}
133    %
134    \begin{methoddesc}[CostFunction]{getDualProduct}{m, g}
135        returns the dual product $<m,g>$ of \member{m} and \member{g}.
136    \end{methoddesc}
137    %
138    \begin{methoddesc}[CostFunction]{getValue}{m, *args}
139        returns the value $J(m)$ using precalculated values \member{args} for $m$.
140    \end{methoddesc}
141    %
143        returns the gradient $\nabla J$ of $J$ at $m$ using precalculated values \member{args}  for $m$.
144    \end{methoddesc}
145    %
146    \begin{methoddesc}[CostFunction]{getArguments}{m}
147        returns precalculated values that are shared in the calculation of $J(m)$
148        and $\nabla J(m)$.
149    \end{methoddesc}
150    \begin{methoddesc}[CostFunction]{getNorm}{m}
151    returns the norm $\|m\|$ of \member{m}.
152    \end{methoddesc}
153
154
155  %  %
# Line 48  which is a \emph{quasi-Newton} method. Line 171  which is a \emph{quasi-Newton} method.
171  %    \beta_{i+1}=\text{max}\left\{\frac{\nabla f^T(\mat{x}_{i+1})(\nabla f(\mat{x}_{i+1})-\nabla f(\mat{x}_i))}{\| \nabla f(\mat{x}_i)\|}, 0\right\}  %    \beta_{i+1}=\text{max}\left\{\frac{\nabla f^T(\mat{x}_{i+1})(\nabla f(\mat{x}_{i+1})-\nabla f(\mat{x}_i))}{\| \nabla f(\mat{x}_i)\|}, 0\right\}
172  %\end{equation*}  %\end{equation*}
173
174    \section{The L-BFGS Algorithm}
175
176
177  %\section{Limited-Memory BFGS Method}\label{sec:LBFGS}  %\section{Limited-Memory BFGS Method}\label{sec:LBFGS}
178  The L-BFGS algorithm is as follows:  The L-BFGS algorithm, see~\cite{Nocedal1980}, is as follows:

179  \begin{algorithm}  \begin{algorithm}
180      \mbox{L-BFGS}      \mbox{L-BFGS}
181      \begin{program}      \begin{program}
182          \BEGIN          \BEGIN
183          \text{Input: initial guess $x_0$ and integer $m$};          \text{Input: initial guess $x_0$ and integer $m$};
184          \text{Allocate $m$ vector pairs \{$s_i$, $y_i$\}};          \text{Allocate $m$ vector pairs \{$s_i$, $g_i$\}};
185          k \leftarrow 0;          k \leftarrow 0;
186          H_0 \leftarrow I;          H_0 \leftarrow I;
187          \WHILE \NOT\text{converged} \DO          \WHILE \NOT\text{converged} \DO
188          p_k \leftarrow |twoLoop|(H_k, \{s, y\});          p_k \leftarrow |twoLoop|(\{s, g\});
189          \alpha_k \leftarrow |lineSearch|(f, x_k, p_k); \rcomment{See Section~\ref{sec:LineSearch}}          \alpha_k \leftarrow |lineSearch|(m_k, p_k); \rcomment{See Section~\ref{sec:LineSearch}}
190          x_{k+1} \leftarrow x_k+\alpha_k p_k;          m_{k+1} \leftarrow m_k+\alpha_k p_k;
191          \IF k>m          \IF k> truncation
192          \THEN          \THEN
193              |Discard the vector pair | \{s_{k-m}, y_{k-m}\} | from storage|;              |Discard the vector pair | \{s_{k-truncation}, g_{k-truncation}\} | from storage|;
194          \FI          \FI
195          s_k \leftarrow x_{k+1}-x_k;          s_k \leftarrow m_{k+1}-m_k;
196          y_k \leftarrow \nabla f_{k+1}-\nabla f_k;          g_k \leftarrow \nabla J_{k+1}-\nabla J_k;
\gamma_k \leftarrow \frac{[s_k,y_k]}{[y_k, y_k]_0};
H_k \leftarrow \gamma_k I
197          k \leftarrow k+1;          k \leftarrow k+1;
198          \OD          \OD
199          \WHERE          \WHERE
200          \FUNCT |twoLoop|(H_k, \{s, y\}) \BODY          \FUNCT |twoLoop|(\{s, g\}) \BODY
201              \EXP q \leftarrow \nabla f_k;              \EXP q \leftarrow \nabla J_k;
202              \FOR i=k-1, k-2, \ldots, k-m \DO              \FOR i=k-1, k-2, \ldots, k-truncation \DO
203              \rho_i \leftarrow \frac{1}{y^T_i s_i};              \rho_i \leftarrow \frac{1}{<s_i, g_i>};
204              \alpha_i \leftarrow \rho_i s^T_i q;              \alpha_i \leftarrow \rho_i <s_i, q>;
205              q \leftarrow q-\alpha_i y_i;              q \leftarrow q-\alpha_i\cdot  g_i;
206              \OD              \OD
207              r \leftarrow H_k q;              r \leftarrow H q;
208              \FOR i = k-m, k-m+1, \ldots, k-1 \DO              \FOR i = k-truncation, k-truncation+1, \ldots, k-1 \DO
209              \beta \leftarrow \rho_i y^T_i r;              \beta \leftarrow \rho_i \cdot <r,g_i>;
210              r \leftarrow r + s_i (\alpha_i - \beta)              r \leftarrow r + s_i \cdot (\alpha_i - \beta)
211              \OD              \OD
212              r \ENDEXP \ENDFUNCT              r \ENDEXP \ENDFUNCT
213          \END          \END
214      \end{program}      \end{program}
215  \end{algorithm}  \end{algorithm}
216
217  \section{Line Search}\label{sec:LineSearch}  \subsection{Line Search}\label{sec:LineSearch}
218  See \cite{Nocedal2006}, \cite{MoreThuente1992}  Line search minimizes the function $\phi(\alpha)$ for a given level set function $m$ and
219    search direction $p$,  see \cite{Nocedal2006}, \cite{MoreThuente1992} with
220    \begin{equation}\label{EQU:MIN:22}
221    \phi(\alpha) = J(m+ \alpha \cdot  p)
222    \end{equation}
223    Notice that
224    \begin{equation}\label{EQU:MIN:23}
225    \phi'(\alpha) = < p , \nabla J(m+\alpha \cdot p)>
226    \end{equation}
227
228
229  \begin{algorithm}  \begin{algorithm}
230      \mbox{Line Search that satisfies strong Wolfe conditions}      \mbox{Line Search that satisfies strong Wolfe conditions}
# Line 149  See \cite{Nocedal2006}, \cite{MoreThuent Line 278  See \cite{Nocedal2006}, \cite{MoreThuent
278      \end{program}      \end{program}
279  \end{algorithm}  \end{algorithm}
280
\section{\member{CostFunction} Class Template}\label{chapter:ref:Minimization: costfunction class}

\begin{classdesc}{CostFunction}{}
template of cost function
\end{classdesc}
281
\begin{methoddesc}[CostFunction]{resetCounters}{}
resets all statistical counters.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getDualProduct}{x, r}
returns the inner product of \member{x} and \member{r}.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getValue}{x, *args}
returns the value $f(x)$ using precalculated values for $x$.
\end{methoddesc}
%
returns the gradient of $f$ at $x$ using precalculated values for $x$.
\end{methoddesc}
%
\begin{methoddesc}[CostFunction]{getArguments}{x}
returns precalculated values that are shared in the calculation of $f(x)$