_trustregion_exact.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. """Nearly exact trust-region optimization subproblem."""
  2. import numpy as np
  3. from scipy.linalg import (norm, get_lapack_funcs, solve_triangular,
  4. cho_solve)
  5. from ._trustregion import (_minimize_trust_region, BaseQuadraticSubproblem)
  6. __all__ = ['_minimize_trustregion_exact',
  7. 'estimate_smallest_singular_value',
  8. 'singular_leading_submatrix',
  9. 'IterativeSubproblem']
  10. def _minimize_trustregion_exact(fun, x0, args=(), jac=None, hess=None,
  11. **trust_region_options):
  12. """
  13. Minimization of scalar function of one or more variables using
  14. a nearly exact trust-region algorithm.
  15. Options
  16. -------
  17. initial_tr_radius : float
  18. Initial trust-region radius.
  19. max_tr_radius : float
  20. Maximum value of the trust-region radius. No steps that are longer
  21. than this value will be proposed.
  22. eta : float
  23. Trust region related acceptance stringency for proposed steps.
  24. gtol : float
  25. Gradient norm must be less than ``gtol`` before successful
  26. termination.
  27. """
  28. if jac is None:
  29. raise ValueError('Jacobian is required for trust region '
  30. 'exact minimization.')
  31. if not callable(hess):
  32. raise ValueError('Hessian matrix is required for trust region '
  33. 'exact minimization.')
  34. return _minimize_trust_region(fun, x0, args=args, jac=jac, hess=hess,
  35. subproblem=IterativeSubproblem,
  36. **trust_region_options)
  37. def estimate_smallest_singular_value(U):
  38. """Given upper triangular matrix ``U`` estimate the smallest singular
  39. value and the correspondent right singular vector in O(n**2) operations.
  40. Parameters
  41. ----------
  42. U : ndarray
  43. Square upper triangular matrix.
  44. Returns
  45. -------
  46. s_min : float
  47. Estimated smallest singular value of the provided matrix.
  48. z_min : ndarray
  49. Estimatied right singular vector.
  50. Notes
  51. -----
  52. The procedure is based on [1]_ and is done in two steps. First, it finds
  53. a vector ``e`` with components selected from {+1, -1} such that the
  54. solution ``w`` from the system ``U.T w = e`` is as large as possible.
  55. Next it estimate ``U v = w``. The smallest singular value is close
  56. to ``norm(w)/norm(v)`` and the right singular vector is close
  57. to ``v/norm(v)``.
  58. The estimation will be better more ill-conditioned is the matrix.
  59. References
  60. ----------
  61. .. [1] Cline, A. K., Moler, C. B., Stewart, G. W., Wilkinson, J. H.
  62. An estimate for the condition number of a matrix. 1979.
  63. SIAM Journal on Numerical Analysis, 16(2), 368-375.
  64. """
  65. U = np.atleast_2d(U)
  66. m, n = U.shape
  67. if m != n:
  68. raise ValueError("A square triangular matrix should be provided.")
  69. # A vector `e` with components selected from {+1, -1}
  70. # is selected so that the solution `w` to the system
  71. # `U.T w = e` is as large as possible. Implementation
  72. # based on algorithm 3.5.1, p. 142, from reference [2]
  73. # adapted for lower triangular matrix.
  74. p = np.zeros(n)
  75. w = np.empty(n)
  76. # Implemented according to: Golub, G. H., Van Loan, C. F. (2013).
  77. # "Matrix computations". Forth Edition. JHU press. pp. 140-142.
  78. for k in range(n):
  79. wp = (1-p[k]) / U.T[k, k]
  80. wm = (-1-p[k]) / U.T[k, k]
  81. pp = p[k+1:] + U.T[k+1:, k]*wp
  82. pm = p[k+1:] + U.T[k+1:, k]*wm
  83. if abs(wp) + norm(pp, 1) >= abs(wm) + norm(pm, 1):
  84. w[k] = wp
  85. p[k+1:] = pp
  86. else:
  87. w[k] = wm
  88. p[k+1:] = pm
  89. # The system `U v = w` is solved using backward substitution.
  90. v = solve_triangular(U, w)
  91. v_norm = norm(v)
  92. w_norm = norm(w)
  93. # Smallest singular value
  94. s_min = w_norm / v_norm
  95. # Associated vector
  96. z_min = v / v_norm
  97. return s_min, z_min
  98. def gershgorin_bounds(H):
  99. """
  100. Given a square matrix ``H`` compute upper
  101. and lower bounds for its eigenvalues (Gregoshgorin Bounds).
  102. Defined ref. [1].
  103. References
  104. ----------
  105. .. [1] Conn, A. R., Gould, N. I., & Toint, P. L.
  106. Trust region methods. 2000. Siam. pp. 19.
  107. """
  108. H_diag = np.diag(H)
  109. H_diag_abs = np.abs(H_diag)
  110. H_row_sums = np.sum(np.abs(H), axis=1)
  111. lb = np.min(H_diag + H_diag_abs - H_row_sums)
  112. ub = np.max(H_diag - H_diag_abs + H_row_sums)
  113. return lb, ub
  114. def singular_leading_submatrix(A, U, k):
  115. """
  116. Compute term that makes the leading ``k`` by ``k``
  117. submatrix from ``A`` singular.
  118. Parameters
  119. ----------
  120. A : ndarray
  121. Symmetric matrix that is not positive definite.
  122. U : ndarray
  123. Upper triangular matrix resulting of an incomplete
  124. Cholesky decomposition of matrix ``A``.
  125. k : int
  126. Positive integer such that the leading k by k submatrix from
  127. `A` is the first non-positive definite leading submatrix.
  128. Returns
  129. -------
  130. delta : float
  131. Amount that should be added to the element (k, k) of the
  132. leading k by k submatrix of ``A`` to make it singular.
  133. v : ndarray
  134. A vector such that ``v.T B v = 0``. Where B is the matrix A after
  135. ``delta`` is added to its element (k, k).
  136. """
  137. # Compute delta
  138. delta = np.sum(U[:k-1, k-1]**2) - A[k-1, k-1]
  139. n = len(A)
  140. # Inicialize v
  141. v = np.zeros(n)
  142. v[k-1] = 1
  143. # Compute the remaining values of v by solving a triangular system.
  144. if k != 1:
  145. v[:k-1] = solve_triangular(U[:k-1, :k-1], -U[:k-1, k-1])
  146. return delta, v
  147. class IterativeSubproblem(BaseQuadraticSubproblem):
  148. """Quadratic subproblem solved by nearly exact iterative method.
  149. Notes
  150. -----
  151. This subproblem solver was based on [1]_, [2]_ and [3]_,
  152. which implement similar algorithms. The algorithm is basically
  153. that of [1]_ but ideas from [2]_ and [3]_ were also used.
  154. References
  155. ----------
  156. .. [1] A.R. Conn, N.I. Gould, and P.L. Toint, "Trust region methods",
  157. Siam, pp. 169-200, 2000.
  158. .. [2] J. Nocedal and S. Wright, "Numerical optimization",
  159. Springer Science & Business Media. pp. 83-91, 2006.
  160. .. [3] J.J. More and D.C. Sorensen, "Computing a trust region step",
  161. SIAM Journal on Scientific and Statistical Computing, vol. 4(3),
  162. pp. 553-572, 1983.
  163. """
  164. # UPDATE_COEFF appears in reference [1]_
  165. # in formula 7.3.14 (p. 190) named as "theta".
  166. # As recommended there it value is fixed in 0.01.
  167. UPDATE_COEFF = 0.01
  168. EPS = np.finfo(float).eps
  169. def __init__(self, x, fun, jac, hess, hessp=None,
  170. k_easy=0.1, k_hard=0.2):
  171. super().__init__(x, fun, jac, hess)
  172. # When the trust-region shrinks in two consecutive
  173. # calculations (``tr_radius < previous_tr_radius``)
  174. # the lower bound ``lambda_lb`` may be reused,
  175. # facilitating the convergence. To indicate no
  176. # previous value is known at first ``previous_tr_radius``
  177. # is set to -1 and ``lambda_lb`` to None.
  178. self.previous_tr_radius = -1
  179. self.lambda_lb = None
  180. self.niter = 0
  181. # ``k_easy`` and ``k_hard`` are parameters used
  182. # to determine the stop criteria to the iterative
  183. # subproblem solver. Take a look at pp. 194-197
  184. # from reference _[1] for a more detailed description.
  185. self.k_easy = k_easy
  186. self.k_hard = k_hard
  187. # Get Lapack function for cholesky decomposition.
  188. # The implemented SciPy wrapper does not return
  189. # the incomplete factorization needed by the method.
  190. self.cholesky, = get_lapack_funcs(('potrf',), (self.hess,))
  191. # Get info about Hessian
  192. self.dimension = len(self.hess)
  193. self.hess_gershgorin_lb,\
  194. self.hess_gershgorin_ub = gershgorin_bounds(self.hess)
  195. self.hess_inf = norm(self.hess, np.Inf)
  196. self.hess_fro = norm(self.hess, 'fro')
  197. # A constant such that for vectors smaler than that
  198. # backward substituition is not reliable. It was stabilished
  199. # based on Golub, G. H., Van Loan, C. F. (2013).
  200. # "Matrix computations". Forth Edition. JHU press., p.165.
  201. self.CLOSE_TO_ZERO = self.dimension * self.EPS * self.hess_inf
  202. def _initial_values(self, tr_radius):
  203. """Given a trust radius, return a good initial guess for
  204. the damping factor, the lower bound and the upper bound.
  205. The values were chosen accordingly to the guidelines on
  206. section 7.3.8 (p. 192) from [1]_.
  207. """
  208. # Upper bound for the damping factor
  209. lambda_ub = max(0, self.jac_mag/tr_radius + min(-self.hess_gershgorin_lb,
  210. self.hess_fro,
  211. self.hess_inf))
  212. # Lower bound for the damping factor
  213. lambda_lb = max(0, -min(self.hess.diagonal()),
  214. self.jac_mag/tr_radius - min(self.hess_gershgorin_ub,
  215. self.hess_fro,
  216. self.hess_inf))
  217. # Improve bounds with previous info
  218. if tr_radius < self.previous_tr_radius:
  219. lambda_lb = max(self.lambda_lb, lambda_lb)
  220. # Initial guess for the damping factor
  221. if lambda_lb == 0:
  222. lambda_initial = 0
  223. else:
  224. lambda_initial = max(np.sqrt(lambda_lb * lambda_ub),
  225. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  226. return lambda_initial, lambda_lb, lambda_ub
  227. def solve(self, tr_radius):
  228. """Solve quadratic subproblem"""
  229. lambda_current, lambda_lb, lambda_ub = self._initial_values(tr_radius)
  230. n = self.dimension
  231. hits_boundary = True
  232. already_factorized = False
  233. self.niter = 0
  234. while True:
  235. # Compute Cholesky factorization
  236. if already_factorized:
  237. already_factorized = False
  238. else:
  239. H = self.hess+lambda_current*np.eye(n)
  240. U, info = self.cholesky(H, lower=False,
  241. overwrite_a=False,
  242. clean=True)
  243. self.niter += 1
  244. # Check if factorization succeeded
  245. if info == 0 and self.jac_mag > self.CLOSE_TO_ZERO:
  246. # Successful factorization
  247. # Solve `U.T U p = s`
  248. p = cho_solve((U, False), -self.jac)
  249. p_norm = norm(p)
  250. # Check for interior convergence
  251. if p_norm <= tr_radius and lambda_current == 0:
  252. hits_boundary = False
  253. break
  254. # Solve `U.T w = p`
  255. w = solve_triangular(U, p, trans='T')
  256. w_norm = norm(w)
  257. # Compute Newton step accordingly to
  258. # formula (4.44) p.87 from ref [2]_.
  259. delta_lambda = (p_norm/w_norm)**2 * (p_norm-tr_radius)/tr_radius
  260. lambda_new = lambda_current + delta_lambda
  261. if p_norm < tr_radius: # Inside boundary
  262. s_min, z_min = estimate_smallest_singular_value(U)
  263. ta, tb = self.get_boundaries_intersections(p, z_min,
  264. tr_radius)
  265. # Choose `step_len` with the smallest magnitude.
  266. # The reason for this choice is explained at
  267. # ref [3]_, p. 6 (Immediately before the formula
  268. # for `tau`).
  269. step_len = min([ta, tb], key=abs)
  270. # Compute the quadratic term (p.T*H*p)
  271. quadratic_term = np.dot(p, np.dot(H, p))
  272. # Check stop criteria
  273. relative_error = (step_len**2 * s_min**2) / (quadratic_term + lambda_current*tr_radius**2)
  274. if relative_error <= self.k_hard:
  275. p += step_len * z_min
  276. break
  277. # Update uncertanty bounds
  278. lambda_ub = lambda_current
  279. lambda_lb = max(lambda_lb, lambda_current - s_min**2)
  280. # Compute Cholesky factorization
  281. H = self.hess + lambda_new*np.eye(n)
  282. c, info = self.cholesky(H, lower=False,
  283. overwrite_a=False,
  284. clean=True)
  285. # Check if the factorization have succeeded
  286. #
  287. if info == 0: # Successful factorization
  288. # Update damping factor
  289. lambda_current = lambda_new
  290. already_factorized = True
  291. else: # Unsuccessful factorization
  292. # Update uncertanty bounds
  293. lambda_lb = max(lambda_lb, lambda_new)
  294. # Update damping factor
  295. lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
  296. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  297. else: # Outside boundary
  298. # Check stop criteria
  299. relative_error = abs(p_norm - tr_radius) / tr_radius
  300. if relative_error <= self.k_easy:
  301. break
  302. # Update uncertanty bounds
  303. lambda_lb = lambda_current
  304. # Update damping factor
  305. lambda_current = lambda_new
  306. elif info == 0 and self.jac_mag <= self.CLOSE_TO_ZERO:
  307. # jac_mag very close to zero
  308. # Check for interior convergence
  309. if lambda_current == 0:
  310. p = np.zeros(n)
  311. hits_boundary = False
  312. break
  313. s_min, z_min = estimate_smallest_singular_value(U)
  314. step_len = tr_radius
  315. # Check stop criteria
  316. if step_len**2 * s_min**2 <= self.k_hard * lambda_current * tr_radius**2:
  317. p = step_len * z_min
  318. break
  319. # Update uncertanty bounds
  320. lambda_ub = lambda_current
  321. lambda_lb = max(lambda_lb, lambda_current - s_min**2)
  322. # Update damping factor
  323. lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
  324. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  325. else: # Unsuccessful factorization
  326. # Compute auxiliary terms
  327. delta, v = singular_leading_submatrix(H, U, info)
  328. v_norm = norm(v)
  329. # Update uncertanty interval
  330. lambda_lb = max(lambda_lb, lambda_current + delta/v_norm**2)
  331. # Update damping factor
  332. lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
  333. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  334. self.lambda_lb = lambda_lb
  335. self.lambda_current = lambda_current
  336. self.previous_tr_radius = tr_radius
  337. return p, hits_boundary