  1. """Hessian update strategies for quasi-Newton optimization methods."""
  2. import numpy as np
  3. from numpy.linalg import norm
  4. from scipy.linalg import get_blas_funcs
  5. from warnings import warn
  6. __all__ = ['HessianUpdateStrategy', 'BFGS', 'SR1']


class HessianUpdateStrategy:
    """Interface for implementing Hessian update strategies.

    Many optimization methods make use of Hessian (or inverse Hessian)
    approximations, such as the quasi-Newton methods BFGS, SR1 and L-BFGS.
    Some of these approximations, however, do not actually need to store
    the entire matrix or can compute the internal matrix product with a
    given vector in a very efficient manner. This class serves as an
    abstract interface between the optimization algorithm and the
    quasi-Newton update strategies, giving freedom of implementation
    to store and update the internal matrix as efficiently as possible.
    Different choices of initialization and update procedure will result
    in different quasi-Newton strategies.

    Four methods should be implemented in derived classes: ``initialize``,
    ``update``, ``dot`` and ``get_matrix``.

    Notes
    -----
    Any instance of a class that implements this interface
    can be accepted by the method ``minimize`` and used by
    the compatible solvers to approximate the Hessian (or
    inverse Hessian) used by the optimization algorithms.
    """

    def initialize(self, n, approx_type):
        """Initialize internal matrix.

        Allocate internal memory for storing and updating
        the Hessian or its inverse.

        Parameters
        ----------
        n : int
            Problem dimension.
        approx_type : {'hess', 'inv_hess'}
            Selects either the Hessian or the inverse Hessian.
            When set to 'hess' the Hessian will be stored and updated.
            When set to 'inv_hess' its inverse will be used instead.
        """
        raise NotImplementedError("The method ``initialize(n, approx_type)``"
                                  " is not implemented.")

    def update(self, delta_x, delta_grad):
        """Update internal matrix.

        Update the Hessian matrix or its inverse (depending on how
        'approx_type' is defined) using information about the last
        evaluated points.

        Parameters
        ----------
        delta_x : ndarray
            The difference between the two points at which the gradient
            has been evaluated: ``delta_x = x2 - x1``.
        delta_grad : ndarray
            The difference between the gradients:
            ``delta_grad = grad(x2) - grad(x1)``.
        """
        raise NotImplementedError("The method ``update(delta_x, delta_grad)``"
                                  " is not implemented.")

    def dot(self, p):
        """Compute the product of the internal matrix with the given vector.

        Parameters
        ----------
        p : array_like
            1-D array representing a vector.

        Returns
        -------
        Hp : array
            1-D array representing the result of multiplying the
            approximation matrix by vector p.
        """
        raise NotImplementedError("The method ``dot(p)``"
                                  " is not implemented.")

    def get_matrix(self):
        """Return the current internal matrix.

        Returns
        -------
        H : ndarray, shape (n, n)
            Dense matrix containing either the Hessian
            or its inverse (depending on how 'approx_type'
            is defined).
        """
        raise NotImplementedError("The method ``get_matrix()``"
                                  " is not implemented.")


class FullHessianUpdateStrategy(HessianUpdateStrategy):
    """Hessian update strategy with full dimensional internal representation.
    """
    _syr = get_blas_funcs('syr', dtype='d')  # Symmetric rank 1 update
    _syr2 = get_blas_funcs('syr2', dtype='d')  # Symmetric rank 2 update
    # Symmetric matrix-vector product
    _symv = get_blas_funcs('symv', dtype='d')

    def __init__(self, init_scale='auto'):
        self.init_scale = init_scale
        # Until initialize is called we can't really use the class,
        # so it makes sense to set everything to None.
        self.first_iteration = None
        self.approx_type = None
        self.B = None
        self.H = None

    def initialize(self, n, approx_type):
        """Initialize internal matrix.

        Allocate internal memory for storing and updating
        the Hessian or its inverse.

        Parameters
        ----------
        n : int
            Problem dimension.
        approx_type : {'hess', 'inv_hess'}
            Selects either the Hessian or the inverse Hessian.
            When set to 'hess' the Hessian will be stored and updated.
            When set to 'inv_hess' its inverse will be used instead.
        """
        self.first_iteration = True
        self.n = n
        self.approx_type = approx_type
        if approx_type not in ('hess', 'inv_hess'):
            raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
        # Create matrix
        if self.approx_type == 'hess':
            self.B = np.eye(n, dtype=float)
        else:
            self.H = np.eye(n, dtype=float)

    def _auto_scale(self, delta_x, delta_grad):
        # Heuristic to scale matrix at first iteration.
        # Described in Nocedal and Wright "Numerical Optimization"
        # p.143 formula (6.20).
        s_norm2 = np.dot(delta_x, delta_x)
        y_norm2 = np.dot(delta_grad, delta_grad)
        ys = np.abs(np.dot(delta_grad, delta_x))
        if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
            return 1
        if self.approx_type == 'hess':
            return y_norm2 / ys
        else:
            return ys / y_norm2
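
    # Note on the heuristic above (comment added for clarity, not in the
    # original source): with s = delta_x and y = delta_grad, formula (6.20)
    # initializes the inverse Hessian as H_0 = (y.T @ s) / (y.T @ y) * I,
    # so the 'hess' branch returns the reciprocal scale y.T @ y / (y.T @ s).
    # This gives the initial matrix roughly the right magnitude along the
    # direction of the last step before the first quasi-Newton update.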

    def _update_implementation(self, delta_x, delta_grad):
        raise NotImplementedError("The method ``_update_implementation``"
                                  " is not implemented.")

    def update(self, delta_x, delta_grad):
        """Update internal matrix.

        Update the Hessian matrix or its inverse (depending on how
        'approx_type' is defined) using information about the last
        evaluated points.

        Parameters
        ----------
        delta_x : ndarray
            The difference between the two points at which the gradient
            has been evaluated: ``delta_x = x2 - x1``.
        delta_grad : ndarray
            The difference between the gradients:
            ``delta_grad = grad(x2) - grad(x1)``.
        """
        if np.all(delta_x == 0.0):
            return
        if np.all(delta_grad == 0.0):
            warn('delta_grad == 0.0. Check if the approximated '
                 'function is linear. If the function is linear, '
                 'better results can be obtained by defining the '
                 'Hessian as zero instead of using quasi-Newton '
                 'approximations.', UserWarning)
            return
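        # Note (comment added for clarity): the initial scaling is applied
        # lazily here, on the first successful update, rather than in
        # ``initialize``, because the 'auto' heuristic needs an actual
        # (delta_x, delta_grad) pair to estimate the scale from.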
        if self.first_iteration:
            # Get the user-specified scale
            if self.init_scale == "auto":
                scale = self._auto_scale(delta_x, delta_grad)
            else:
                scale = float(self.init_scale)
            # Scale initial matrix with ``scale * np.eye(n)``
            if self.approx_type == 'hess':
                self.B *= scale
            else:
                self.H *= scale
            self.first_iteration = False
        self._update_implementation(delta_x, delta_grad)

    def dot(self, p):
        """Compute the product of the internal matrix with the given vector.

        Parameters
        ----------
        p : array_like
            1-D array representing a vector.

        Returns
        -------
        Hp : array
            1-D array representing the result of multiplying the
            approximation matrix by vector p.
        """
        if self.approx_type == 'hess':
            return self._symv(1, self.B, p)
        else:
            return self._symv(1, self.H, p)

    def get_matrix(self):
        """Return the current internal matrix.

        Returns
        -------
        M : ndarray, shape (n, n)
            Dense matrix containing either the Hessian or its inverse
            (depending on how `approx_type` was defined).
        """
        if self.approx_type == 'hess':
            M = np.copy(self.B)
        else:
            M = np.copy(self.H)
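        # The BLAS routines ``syr``/``syr2``/``symv`` used in this class
        # reference only one triangle of the symmetric matrix, so mirror
        # the upper triangle into the lower one before returning a dense
        # matrix (comment added for clarity, not in the original source).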
        li = np.tril_indices_from(M, k=-1)
        M[li] = M.T[li]
        return M


class BFGS(FullHessianUpdateStrategy):
    """Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.

    Parameters
    ----------
    exception_strategy : {'skip_update', 'damp_update'}, optional
        Define how to proceed when the curvature condition is violated.
        Set it to 'skip_update' to just skip the update. Or, alternatively,
        set it to 'damp_update' to interpolate between the actual BFGS
        result and the unmodified matrix. Both exception strategies
        are explained in [1]_, p.536-537.
    min_curvature : float
        This number, scaled by a normalization factor, defines the
        minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
        unaffected by the exception strategy. By default it is equal to
        1e-8 when ``exception_strategy = 'skip_update'`` and equal
        to 0.2 when ``exception_strategy = 'damp_update'``.
    init_scale : {float, 'auto'}
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.140.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
           Second Edition (2006).
    """

    def __init__(self, exception_strategy='skip_update', min_curvature=None,
                 init_scale='auto'):
        if exception_strategy == 'skip_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 1e-8
        elif exception_strategy == 'damp_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 0.2
        else:
            raise ValueError("`exception_strategy` must be 'skip_update' "
                             "or 'damp_update'.")
        super().__init__(init_scale)
        self.exception_strategy = exception_strategy

    def _update_inverse_hessian(self, ys, Hy, yHy, s):
        """Update the inverse Hessian matrix.

        BFGS update using the formula:

            ``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)^2 * (s*s.T)
                     - 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``

        where ``s = delta_x`` and ``y = delta_grad``. This formula is
        equivalent to (6.17) in [1]_ written in a more efficient way
        for implementation.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
               Second Edition (2006).
        """
        self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
        self.H = self._syr((ys + yHy) / ys**2, s, a=self.H)
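
    # Note (comment added for clarity, not in the original source): the two
    # BLAS calls above are the expansion of the standard inverse-BFGS update
    #     H <- (I - rho*s*y.T) * H * (I - rho*y*s.T) + rho*s*s.T,
    # with rho = 1/(s.T*y), grouped as one symmetric rank-2 (``syr2``) and
    # one symmetric rank-1 (``syr``) update.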

    def _update_hessian(self, ys, Bs, sBs, y):
        """Update the Hessian matrix.

        BFGS update using the formula:

            ``B <- B - (B*s)*(B*s).T / (s.T*(B*s)) + y*y.T / (s.T*y)``

        where ``s`` is short for ``delta_x`` and ``y`` is short
        for ``delta_grad``. Formula (6.19) in [1]_.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
               Second Edition (2006).
        """
        self.B = self._syr(1.0 / ys, y, a=self.B)
        self.B = self._syr(-1.0 / sBs, Bs, a=self.B)

    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
        # Do some common operations
        wz = np.dot(w, z)
        Mw = self.dot(w)
        wMw = Mw.dot(w)
        # Guarantee that wMw > 0 by reinitializing the matrix.
        # While this is always true in exact arithmetic,
        # an indefinite matrix may appear due to roundoff errors.
        if wMw <= 0.0:
            scale = self._auto_scale(delta_x, delta_grad)
            # Reinitialize matrix
            if self.approx_type == 'hess':
                self.B = scale * np.eye(self.n, dtype=float)
            else:
                self.H = scale * np.eye(self.n, dtype=float)
            # Do common operations for new matrix
            Mw = self.dot(w)
            wMw = Mw.dot(w)
        # Check if the curvature condition is violated
        if wz <= self.min_curvature * wMw:
            # If the option 'skip_update' is set
            # we just skip the update when the condition
            # is violated.
            if self.exception_strategy == 'skip_update':
                return
            # If the option 'damp_update' is set we
            # interpolate between the actual BFGS
            # result and the unmodified matrix.
            elif self.exception_strategy == 'damp_update':
                update_factor = (1 - self.min_curvature) / (1 - wz / wMw)
                z = update_factor * z + (1 - update_factor) * Mw
                wz = np.dot(w, z)
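                # With this choice of factor, the interpolated z satisfies
                # dot(w, z) == min_curvature * wMw, i.e. the damped update
                # sits exactly on the curvature threshold (comment added
                # for clarity, not in the original source).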
        # Update matrix
        if self.approx_type == 'hess':
            self._update_hessian(wz, Mw, wMw, z)
        else:
            self._update_inverse_hessian(wz, Mw, wMw, z)
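

# A minimal usage sketch (illustrative, not part of the original module):
# as the interface docstring notes, instances of these strategies can be
# passed to ``scipy.optimize.minimize`` by solvers that support customized
# Hessian approximations, e.g. the 'trust-constr' method:
#
#     import numpy as np
#     from scipy.optimize import minimize, rosen, rosen_der
#
#     x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
#     res = minimize(rosen, x0, method='trust-constr',
#                    jac=rosen_der, hess=BFGS())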


class SR1(FullHessianUpdateStrategy):
    """Symmetric-rank-1 Hessian update strategy.

    Parameters
    ----------
    min_denominator : float
        This number, scaled by a normalization factor,
        defines the minimum denominator magnitude allowed
        in the update. When the condition is violated we skip
        the update. By default uses ``1e-8``.
    init_scale : {float, 'auto'}, optional
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.144-146.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
           Second Edition (2006).
    """

    def __init__(self, min_denominator=1e-8, init_scale='auto'):
        self.min_denominator = min_denominator
        super().__init__(init_scale)

    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
        # Do some common operations
        Mw = self.dot(w)
        z_minus_Mw = z - Mw
        denominator = np.dot(w, z_minus_Mw)
        # If the denominator is too small
        # we just skip the update.
        if np.abs(denominator) <= self.min_denominator*norm(w)*norm(z_minus_Mw):
            return
        # Update matrix
        if self.approx_type == 'hess':
            self.B = self._syr(1/denominator, z_minus_Mw, a=self.B)
        else:
            self.H = self._syr(1/denominator, z_minus_Mw, a=self.H)
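

# Direct use of the interface (illustrative sketch, not part of the original
# module), here on a hypothetical quadratic f(x) = 0.5 * x.T @ A @ x whose
# gradient is A @ x, so a single secant pair already carries curvature
# information along the step:
#
#     A = np.array([[2.0, 0.3], [0.3, 1.5]])
#     x1, x2 = np.array([1.0, 1.0]), np.array([0.4, 0.7])
#     approx = SR1()
#     approx.initialize(n=2, approx_type='hess')
#     approx.update(delta_x=x2 - x1, delta_grad=A @ x2 - A @ x1)
#     print(approx.get_matrix())  # dense 2x2 approximation of A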