- """Hessian update strategies for quasi-Newton optimization methods."""
- import numpy as np
- from numpy.linalg import norm
- from scipy.linalg import get_blas_funcs
- from warnings import warn
- __all__ = ['HessianUpdateStrategy', 'BFGS', 'SR1']


class HessianUpdateStrategy:
    """Interface for implementing Hessian update strategies.

    Many optimization methods make use of Hessian (or inverse Hessian)
    approximations, such as the quasi-Newton methods BFGS, SR1 and
    L-BFGS. Some of these approximations, however, do not actually
    need to store the entire matrix, or can compute the internal
    matrix product with a given vector very efficiently. This class
    serves as an abstract interface between the optimization algorithm
    and the quasi-Newton update strategies, giving freedom of
    implementation to store and update the internal matrix as
    efficiently as possible. Different choices of initialization and
    update procedure will result in different quasi-Newton strategies.

    Four methods should be implemented in derived classes: ``initialize``,
    ``update``, ``dot`` and ``get_matrix``.

    Notes
    -----
    Any instance of a class that implements this interface can be
    accepted by the method ``minimize`` and used by the compatible
    solvers to approximate the Hessian (or inverse Hessian) used by
    the optimization algorithms.
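
    Examples
    --------
    A minimal usage sketch: a concrete strategy such as ``BFGS`` can be
    passed to ``minimize`` through its ``hess`` argument (supported, for
    instance, by the 'trust-constr' method):

    >>> import numpy as np
    >>> from scipy.optimize import minimize, rosen, rosen_der, BFGS
    >>> res = minimize(rosen, np.array([2.0, 2.0]), method='trust-constr',
    ...                jac=rosen_der, hess=BFGS())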
- """

    def initialize(self, n, approx_type):
        """Initialize internal matrix.

        Allocate internal memory for storing and updating
        the Hessian or its inverse.

        Parameters
        ----------
        n : int
            Problem dimension.
        approx_type : {'hess', 'inv_hess'}
            Selects either the Hessian or the inverse Hessian.
            When set to 'hess' the Hessian will be stored and updated.
            When set to 'inv_hess' its inverse will be used instead.
        """
        raise NotImplementedError("The method ``initialize(n, approx_type)``"
                                  " is not implemented.")

    def update(self, delta_x, delta_grad):
        """Update internal matrix.

        Update Hessian matrix or its inverse (depending on how 'approx_type'
        is defined) using information about the last evaluated points.

        Parameters
        ----------
        delta_x : ndarray
            The difference between two points at which the gradient
            has been evaluated: ``delta_x = x2 - x1``.
        delta_grad : ndarray
            The difference between the gradients:
            ``delta_grad = grad(x2) - grad(x1)``.
        """
        raise NotImplementedError("The method ``update(delta_x, delta_grad)``"
                                  " is not implemented.")

    def dot(self, p):
        """Compute the product of the internal matrix with the given vector.

        Parameters
        ----------
        p : array_like
            1-D array representing a vector.

        Returns
        -------
        Hp : array
            1-D array representing the result of multiplying the
            approximation matrix by vector ``p``.
        """
        raise NotImplementedError("The method ``dot(p)``"
                                  " is not implemented.")

    def get_matrix(self):
        """Return current internal matrix.

        Returns
        -------
        H : ndarray, shape (n, n)
            Dense matrix containing either the Hessian
            or its inverse (depending on how 'approx_type'
            is defined).
        """
        raise NotImplementedError("The method ``get_matrix()``"
                                  " is not implemented.")


class FullHessianUpdateStrategy(HessianUpdateStrategy):
    """Hessian update strategy with full dimensional internal representation.
    """
    _syr = get_blas_funcs('syr', dtype='d')    # Symmetric rank-1 update
    _syr2 = get_blas_funcs('syr2', dtype='d')  # Symmetric rank-2 update
    _symv = get_blas_funcs('symv', dtype='d')  # Symmetric matrix-vector product

    def __init__(self, init_scale='auto'):
        self.init_scale = init_scale
        # Until `initialize` is called we can't really use the class,
        # so it makes sense to set everything to None.
        self.first_iteration = None
        self.approx_type = None
        self.B = None
        self.H = None

    def initialize(self, n, approx_type):
        """Initialize internal matrix.

        Allocate internal memory for storing and updating
        the Hessian or its inverse.

        Parameters
        ----------
        n : int
            Problem dimension.
        approx_type : {'hess', 'inv_hess'}
            Selects either the Hessian or the inverse Hessian.
            When set to 'hess' the Hessian will be stored and updated.
            When set to 'inv_hess' its inverse will be used instead.
        """
        self.first_iteration = True
        self.n = n
        self.approx_type = approx_type
        if approx_type not in ('hess', 'inv_hess'):
            raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
        # Create matrix
        if self.approx_type == 'hess':
            self.B = np.eye(n, dtype=float)
        else:
            self.H = np.eye(n, dtype=float)

    def _auto_scale(self, delta_x, delta_grad):
        # Heuristic to scale the matrix at the first iteration.
        # Described in Nocedal and Wright "Numerical Optimization",
        # p.143, formula (6.20).
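        # With s = delta_x and y = delta_grad, the scale returned below
        # is (y^T y) / |y^T s| when approximating the Hessian and
        # |y^T s| / (y^T y) when approximating its inverse.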
        s_norm2 = np.dot(delta_x, delta_x)
        y_norm2 = np.dot(delta_grad, delta_grad)
        ys = np.abs(np.dot(delta_grad, delta_x))
        if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
            return 1
        if self.approx_type == 'hess':
            return y_norm2 / ys
        else:
            return ys / y_norm2

    def _update_implementation(self, delta_x, delta_grad):
        raise NotImplementedError("The method ``_update_implementation``"
                                  " is not implemented.")

    def update(self, delta_x, delta_grad):
        """Update internal matrix.

        Update Hessian matrix or its inverse (depending on how 'approx_type'
        is defined) using information about the last evaluated points.

        Parameters
        ----------
        delta_x : ndarray
            The difference between two points at which the gradient
            has been evaluated: ``delta_x = x2 - x1``.
        delta_grad : ndarray
            The difference between the gradients:
            ``delta_grad = grad(x2) - grad(x1)``.
        """
        if np.all(delta_x == 0.0):
            return
        if np.all(delta_grad == 0.0):
            warn('delta_grad == 0.0. Check if the approximated '
                 'function is linear. If the function is linear '
                 'better results can be obtained by defining the '
                 'Hessian as zero instead of using quasi-Newton '
                 'approximations.', UserWarning)
            return
        if self.first_iteration:
            # Get user-specified scale
            if self.init_scale == "auto":
                scale = self._auto_scale(delta_x, delta_grad)
            else:
                scale = float(self.init_scale)
            # Scale initial matrix with ``scale * np.eye(n)``
            if self.approx_type == 'hess':
                self.B *= scale
            else:
                self.H *= scale
            self.first_iteration = False
        self._update_implementation(delta_x, delta_grad)

    def dot(self, p):
        """Compute the product of the internal matrix with the given vector.

        Parameters
        ----------
        p : array_like
            1-D array representing a vector.

        Returns
        -------
        Hp : array
            1-D array representing the result of multiplying the
            approximation matrix by vector ``p``.
        """
        if self.approx_type == 'hess':
            return self._symv(1, self.B, p)
        else:
            return self._symv(1, self.H, p)

    def get_matrix(self):
        """Return the current internal matrix.

        Returns
        -------
        M : ndarray, shape (n, n)
            Dense matrix containing either the Hessian or its inverse
            (depending on how `approx_type` was defined).
        """
        if self.approx_type == 'hess':
            M = np.copy(self.B)
        else:
            M = np.copy(self.H)
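        # The BLAS symmetric routines keep only the upper triangle up to
        # date; mirror it into the strictly lower triangle so that a full
        # dense matrix is returned.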
        li = np.tril_indices_from(M, k=-1)
        M[li] = M.T[li]
        return M


class BFGS(FullHessianUpdateStrategy):
    """Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.

    Parameters
    ----------
    exception_strategy : {'skip_update', 'damp_update'}, optional
        Define how to proceed when the curvature condition is violated.
        Set it to 'skip_update' to just skip the update. Or, alternatively,
        set it to 'damp_update' to interpolate between the actual BFGS
        result and the unmodified matrix. Both exception strategies
        are explained in [1]_, p.536-537.
    min_curvature : float
        This number, scaled by a normalization factor, defines the
        minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
        unaffected by the exception strategy. By default it is 1e-8
        when ``exception_strategy = 'skip_update'`` and 0.2 when
        ``exception_strategy = 'damp_update'``.
    init_scale : {float, 'auto'}
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.140.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical Optimization",
           Second Edition (2006).
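
    Examples
    --------
    A minimal sketch of driving the update by hand (normally ``minimize``
    calls ``initialize`` and ``update`` internally):

    >>> import numpy as np
    >>> from scipy.optimize import BFGS
    >>> hess = BFGS()
    >>> hess.initialize(n=2, approx_type='hess')
    >>> # Differences of points and of gradients from two iterates:
    >>> hess.update(np.array([0.1, 0.2]), np.array([0.3, 0.5]))
    >>> hess.get_matrix().shape
    (2, 2)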
- """

    def __init__(self, exception_strategy='skip_update', min_curvature=None,
                 init_scale='auto'):
        if exception_strategy == 'skip_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 1e-8
        elif exception_strategy == 'damp_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 0.2
        else:
            raise ValueError("`exception_strategy` must be 'skip_update' "
                             "or 'damp_update'.")
        super().__init__(init_scale)
        self.exception_strategy = exception_strategy

    def _update_inverse_hessian(self, ys, Hy, yHy, s):
        """Update the inverse Hessian matrix.

        BFGS update using the formula:

            ``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)^2 * (s*s.T)
                     - 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``

        where ``s = delta_x`` and ``y = delta_grad``. This formula is
        equivalent to (6.17) in [1]_, written in a more efficient way
        for implementation.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical
               Optimization", Second Edition (2006).
        """
        self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
        self.H = self._syr((ys + yHy) / ys**2, s, a=self.H)

    def _update_hessian(self, ys, Bs, sBs, y):
        """Update the Hessian matrix.

        BFGS update using the formula:

            ``B <- B - (B*s)*(B*s).T/(s.T*(B*s)) + (y*y.T)/(s.T*y)``

        where ``s`` is short for ``delta_x`` and ``y`` is short
        for ``delta_grad``. Formula (6.19) in [1]_.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical
               Optimization", Second Edition (2006).
        """
        self.B = self._syr(1.0 / ys, y, a=self.B)
        self.B = self._syr(-1.0 / sBs, Bs, a=self.B)

    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
        # Do some common operations
        wz = np.dot(w, z)
        Mw = self.dot(w)
        wMw = Mw.dot(w)
        # Guarantee that wMw > 0 by reinitializing the matrix.
        # While this is always true in exact arithmetic,
        # an indefinite matrix may appear due to roundoff errors.
        if wMw <= 0.0:
            scale = self._auto_scale(delta_x, delta_grad)
            # Reinitialize matrix
            if self.approx_type == 'hess':
                self.B = scale * np.eye(self.n, dtype=float)
            else:
                self.H = scale * np.eye(self.n, dtype=float)
            # Do common operations for the new matrix
            Mw = self.dot(w)
            wMw = Mw.dot(w)
        # Check if curvature condition is violated
        if wz <= self.min_curvature * wMw:
            # If the option 'skip_update' is set
            # we just skip the update when the condition
            # is violated.
            if self.exception_strategy == 'skip_update':
                return
            # If the option 'damp_update' is set we
            # interpolate between the actual BFGS
            # result and the unmodified matrix.
            elif self.exception_strategy == 'damp_update':
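                # The interpolation factor below is chosen so that the
                # damped curvature satisfies w^T z = min_curvature * wMw
                # exactly, following the damped BFGS updating described
                # in [1]_, p.536-537.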
                update_factor = (1 - self.min_curvature) / (1 - wz / wMw)
                z = update_factor * z + (1 - update_factor) * Mw
                wz = np.dot(w, z)
        # Update matrix
        if self.approx_type == 'hess':
            self._update_hessian(wz, Mw, wMw, z)
        else:
            self._update_inverse_hessian(wz, Mw, wMw, z)


class SR1(FullHessianUpdateStrategy):
    """Symmetric-rank-1 (SR1) Hessian update strategy.

    Parameters
    ----------
    min_denominator : float
        This number, scaled by a normalization factor,
        defines the minimum denominator magnitude allowed
        in the update. When the condition is violated we skip
        the update. By default uses ``1e-8``.
    init_scale : {float, 'auto'}, optional
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.144-146.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical Optimization",
           Second Edition (2006).
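
    Examples
    --------
    A minimal usage sketch, passing SR1 to ``minimize`` (supported, for
    instance, by the 'trust-constr' method):

    >>> import numpy as np
    >>> from scipy.optimize import minimize, rosen, rosen_der, SR1
    >>> res = minimize(rosen, np.array([2.0, 2.0]), method='trust-constr',
    ...                jac=rosen_der, hess=SR1())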
- """

    def __init__(self, min_denominator=1e-8, init_scale='auto'):
        self.min_denominator = min_denominator
        super().__init__(init_scale)

    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
        # Do some common operations
        Mw = self.dot(w)
        z_minus_Mw = z - Mw
        denominator = np.dot(w, z_minus_Mw)
        # If the denominator is too small
        # we just skip the update.
        if np.abs(denominator) <= (self.min_denominator *
                                   norm(w) * norm(z_minus_Mw)):
            return
        # Update matrix
        if self.approx_type == 'hess':
            self.B = self._syr(1 / denominator, z_minus_Mw, a=self.B)
        else:
            self.H = self._syr(1 / denominator, z_minus_Mw, a=self.H)