  1. """
  2. Functions
  3. ---------
  4. .. autosummary::
  5. :toctree: generated/
  6. line_search_armijo
  7. line_search_wolfe1
  8. line_search_wolfe2
  9. scalar_search_wolfe1
  10. scalar_search_wolfe2
  11. """
from warnings import warn

import numpy as np
from scipy.optimize import _minpack2 as minpack2

__all__ = ['LineSearchWarning', 'line_search_wolfe1', 'line_search_wolfe2',
           'scalar_search_wolfe1', 'scalar_search_wolfe2',
           'line_search_armijo']


class LineSearchWarning(RuntimeWarning):
    pass

#------------------------------------------------------------------------------
# Minpack's Wolfe line and scalar searches
#------------------------------------------------------------------------------


def line_search_wolfe1(f, fprime, xk, pk, gfk=None,
                       old_fval=None, old_old_fval=None,
                       args=(), c1=1e-4, c2=0.9, amax=50, amin=1e-8,
                       xtol=1e-14):
    """
    As `scalar_search_wolfe1` but do a line search along direction `pk`.

    Parameters
    ----------
    f : callable
        Function `f(x)`
    fprime : callable
        Gradient of `f`
    xk : array_like
        Current point
    pk : array_like
        Search direction
    gfk : array_like, optional
        Gradient of `f` at point `xk`
    old_fval : float, optional
        Value of `f` at point `xk`
    old_old_fval : float, optional
        Value of `f` at point preceding `xk`

    The rest of the parameters are the same as for `scalar_search_wolfe1`.

    Returns
    -------
    stp, f_count, g_count, fval, old_fval
        As in `scalar_search_wolfe1`
    gval : array
        Gradient of `f` at the final point

    """
    if gfk is None:
        gfk = fprime(xk, *args)

    gval = [gfk]
    gc = [0]
    fc = [0]

    def phi(s):
        fc[0] += 1
        return f(xk + s*pk, *args)

    def derphi(s):
        gval[0] = fprime(xk + s*pk, *args)
        gc[0] += 1
        return np.dot(gval[0], pk)

    derphi0 = np.dot(gfk, pk)

    stp, fval, old_fval = scalar_search_wolfe1(
            phi, derphi, old_fval, old_old_fval, derphi0,
            c1=c1, c2=c2, amax=amax, amin=amin, xtol=xtol)

    return stp, fc[0], gc[0], fval, old_fval, gval[0]
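
# A minimal usage sketch (illustrative, with a hand-picked quadratic): along
# the steepest-descent direction of f(x) = x.x, the 1-D restriction
# phi(s) = f(xk + s*pk) has its exact minimizer at s = 0.5, and the returned
# step should be (close to) that value.
#
#   >>> f = lambda x: np.dot(x, x)
#   >>> fprime = lambda x: 2*x
#   >>> xk = np.array([1.0, 1.0])
#   >>> pk = -fprime(xk)
#   >>> stp, fc, gc, fval, old_fval, gval = line_search_wolfe1(f, fprime,
#   ...                                                        xk, pk)
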

def scalar_search_wolfe1(phi, derphi, phi0=None, old_phi0=None, derphi0=None,
                         c1=1e-4, c2=0.9,
                         amax=50, amin=1e-8, xtol=1e-14):
    """
    Scalar function search for alpha that satisfies strong Wolfe conditions.

    The direction is assumed to be a descent direction for alpha > 0,
    i.e. ``derphi0 < 0``.

    Parameters
    ----------
    phi : callable phi(alpha)
        Function at point `alpha`
    derphi : callable phi'(alpha)
        Objective function derivative. Returns a scalar.
    phi0 : float, optional
        Value of phi at 0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at 0
    c1 : float, optional
        Parameter for Armijo condition rule.
    c2 : float, optional
        Parameter for curvature condition rule.
    amax, amin : float, optional
        Maximum and minimum step size
    xtol : float, optional
        Relative tolerance for an acceptable step.

    Returns
    -------
    alpha : float
        Step size, or None if no suitable step was found
    phi : float
        Value of `phi` at the new point `alpha`
    phi0 : float
        Value of `phi` at `alpha=0`

    Notes
    -----
    Uses routine DCSRCH from MINPACK.

    """
    if phi0 is None:
        phi0 = phi(0.)
    if derphi0 is None:
        derphi0 = derphi(0.)

    if old_phi0 is not None and derphi0 != 0:
        alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
        if alpha1 < 0:
            alpha1 = 1.0
    else:
        alpha1 = 1.0

    phi1 = phi0
    derphi1 = derphi0
    isave = np.zeros((2,), np.intc)
    dsave = np.zeros((13,), float)
    task = b'START'

    maxiter = 100
    for i in range(maxiter):
        stp, phi1, derphi1, task = minpack2.dcsrch(alpha1, phi1, derphi1,
                                                   c1, c2, xtol, task,
                                                   amin, amax, isave, dsave)
        if task[:2] == b'FG':
            alpha1 = stp
            phi1 = phi(stp)
            derphi1 = derphi(stp)
        else:
            break
    else:
        # maxiter reached, the line search did not converge
        stp = None

    if task[:5] == b'ERROR' or task[:4] == b'WARN':
        stp = None  # failed

    return stp, phi1, phi0
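
# A minimal sketch of calling scalar_search_wolfe1 directly on a 1-D
# function (illustrative; the quadratic is hand-picked for this example):
#
#   >>> phi = lambda a: (a - 1.0)**2
#   >>> derphi = lambda a: 2.0*(a - 1.0)
#   >>> alpha, phi_a, phi0 = scalar_search_wolfe1(phi, derphi)
#
# Here derphi(0) = -2 < 0, so alpha > 0 is a descent direction, and the
# returned alpha should be (close to) the exact minimizer 1.0.
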
line_search = line_search_wolfe1


#------------------------------------------------------------------------------
# Pure-Python Wolfe line and scalar searches
#------------------------------------------------------------------------------

def line_search_wolfe2(f, myfprime, xk, pk, gfk=None, old_fval=None,
                       old_old_fval=None, args=(), c1=1e-4, c2=0.9, amax=None,
                       extra_condition=None, maxiter=10):
    """Find alpha that satisfies strong Wolfe conditions.

    Parameters
    ----------
    f : callable f(x,*args)
        Objective function.
    myfprime : callable f'(x,*args)
        Objective function gradient.
    xk : ndarray
        Starting point.
    pk : ndarray
        Search direction.
    gfk : ndarray, optional
        Gradient value for x=xk (xk being the current parameter
        estimate). Will be recomputed if omitted.
    old_fval : float, optional
        Function value for x=xk. Will be recomputed if omitted.
    old_old_fval : float, optional
        Function value for the point preceding x=xk.
    args : tuple, optional
        Additional arguments passed to objective function.
    c1 : float, optional
        Parameter for Armijo condition rule.
    c2 : float, optional
        Parameter for curvature condition rule.
    amax : float, optional
        Maximum step size.
    extra_condition : callable, optional
        A callable of the form ``extra_condition(alpha, x, f, g)``
        returning a boolean. Arguments are the proposed step ``alpha``
        and the corresponding ``x``, ``f`` and ``g`` values. The line
        search accepts the value of ``alpha`` only if this callable
        returns ``True``. If the callable returns ``False`` for the step
        length, the algorithm will continue with new iterates. The
        callable is only called for iterates satisfying the strong Wolfe
        conditions.
    maxiter : int, optional
        Maximum number of iterations to perform.

    Returns
    -------
    alpha : float or None
        Alpha for which ``x_new = x0 + alpha * pk``,
        or None if the line search algorithm did not converge.
    fc : int
        Number of function evaluations made.
    gc : int
        Number of gradient evaluations made.
    new_fval : float or None
        New function value ``f(x_new)=f(x0+alpha*pk)``,
        or None if the line search algorithm did not converge.
    old_fval : float
        Old function value ``f(x0)``.
    new_slope : float or None
        The local slope along the search direction at the
        new value ``<myfprime(x_new), pk>``,
        or None if the line search algorithm did not converge.

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions. See Wright and Nocedal, 'Numerical Optimization',
    1999, pp. 59-61.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.optimize import line_search

    An objective function and its gradient are defined.

    >>> def obj_func(x):
    ...     return (x[0])**2 + (x[1])**2
    >>> def obj_grad(x):
    ...     return [2*x[0], 2*x[1]]

    We can find alpha that satisfies strong Wolfe conditions.

    >>> start_point = np.array([1.8, 1.7])
    >>> search_gradient = np.array([-1.0, -1.0])
    >>> line_search(obj_func, obj_grad, start_point, search_gradient)
    (1.0, 2, 1, 1.1300000000000001, 6.13, [1.6, 1.4])

    """
    fc = [0]
    gc = [0]
    gval = [None]
    gval_alpha = [None]

    def phi(alpha):
        fc[0] += 1
        return f(xk + alpha * pk, *args)

    fprime = myfprime

    def derphi(alpha):
        gc[0] += 1
        gval[0] = fprime(xk + alpha * pk, *args)  # store for later use
        gval_alpha[0] = alpha
        return np.dot(gval[0], pk)

    if gfk is None:
        gfk = fprime(xk, *args)
    derphi0 = np.dot(gfk, pk)

    if extra_condition is not None:
        # Add the current gradient as argument, to avoid needless
        # re-evaluation
        def extra_condition2(alpha, phi):
            if gval_alpha[0] != alpha:
                derphi(alpha)
            x = xk + alpha * pk
            return extra_condition(alpha, x, phi, gval[0])
    else:
        extra_condition2 = None

    alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
            phi, derphi, old_fval, old_old_fval, derphi0, c1, c2, amax,
            extra_condition2, maxiter=maxiter)

    if derphi_star is None:
        warn('The line search algorithm did not converge', LineSearchWarning)
    else:
        # derphi_star is a number (the slope along pk), but the gradient
        # used in computing it, gval[0], is the gradient at the new point;
        # return that instead, so the outer loop does not need to compute
        # it again.
        derphi_star = gval[0]

    return alpha_star, fc[0], gc[0], phi_star, old_fval, derphi_star
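
# A sketch of the ``extra_condition`` hook (illustrative; the objective and
# the radius R are assumptions of this example): reject steps whose new
# point leaves a ball of radius R around the starting point. The hook only
# sees iterates that already satisfy the strong Wolfe conditions.
#
#   >>> f = lambda x: np.dot(x, x)
#   >>> fprime = lambda x: 2*x
#   >>> xk = np.array([1.8, 1.7])
#   >>> pk = np.array([-1.0, -1.0])
#   >>> R = 10.0
#   >>> def within_radius(alpha, x, f_val, g_val):
#   ...     return np.linalg.norm(x - xk) <= R
#   >>> res = line_search_wolfe2(f, fprime, xk, pk,
#   ...                          extra_condition=within_radius)
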

def scalar_search_wolfe2(phi, derphi, phi0=None,
                         old_phi0=None, derphi0=None,
                         c1=1e-4, c2=0.9, amax=None,
                         extra_condition=None, maxiter=10):
    """Find alpha that satisfies strong Wolfe conditions.

    The direction is assumed to be a descent direction for alpha > 0,
    i.e. ``derphi0 < 0``.

    Parameters
    ----------
    phi : callable phi(alpha)
        Objective scalar function.
    derphi : callable phi'(alpha)
        Objective function derivative. Returns a scalar.
    phi0 : float, optional
        Value of phi at 0.
    old_phi0 : float, optional
        Value of phi at previous point.
    derphi0 : float, optional
        Value of derphi at 0.
    c1 : float, optional
        Parameter for Armijo condition rule.
    c2 : float, optional
        Parameter for curvature condition rule.
    amax : float, optional
        Maximum step size.
    extra_condition : callable, optional
        A callable of the form ``extra_condition(alpha, phi_value)``
        returning a boolean. The line search accepts the value
        of ``alpha`` only if this callable returns ``True``.
        If the callable returns ``False`` for the step length,
        the algorithm will continue with new iterates.
        The callable is only called for iterates satisfying
        the strong Wolfe conditions.
    maxiter : int, optional
        Maximum number of iterations to perform.

    Returns
    -------
    alpha_star : float or None
        Best alpha, or None if the line search algorithm did not converge.
    phi_star : float
        phi at alpha_star.
    phi0 : float
        phi at 0.
    derphi_star : float or None
        derphi at alpha_star, or None if the line search algorithm
        did not converge.

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions. See Wright and Nocedal, 'Numerical Optimization',
    1999, pp. 59-61.

    """
    if phi0 is None:
        phi0 = phi(0.)

    if derphi0 is None:
        derphi0 = derphi(0.)

    alpha0 = 0
    if old_phi0 is not None and derphi0 != 0:
        alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
    else:
        alpha1 = 1.0

    if alpha1 < 0:
        alpha1 = 1.0

    if amax is not None:
        alpha1 = min(alpha1, amax)

    phi_a1 = phi(alpha1)
    #derphi_a1 = derphi(alpha1) evaluated below

    phi_a0 = phi0
    derphi_a0 = derphi0

    if extra_condition is None:
        extra_condition = lambda alpha, phi: True

    for i in range(maxiter):
        if alpha1 == 0 or (amax is not None and alpha0 == amax):
            # alpha1 == 0: This shouldn't happen. Perhaps the increment has
            # slipped below machine precision?
            alpha_star = None
            phi_star = phi0
            phi0 = old_phi0
            derphi_star = None

            if alpha1 == 0:
                msg = 'Rounding errors prevent the line search from converging'
            else:
                msg = "The line search algorithm could not find a solution " + \
                      "less than or equal to amax: %s" % amax

            warn(msg, LineSearchWarning)
            break

        not_first_iteration = i > 0
        if (phi_a1 > phi0 + c1 * alpha1 * derphi0) or \
           ((phi_a1 >= phi_a0) and not_first_iteration):
            alpha_star, phi_star, derphi_star = \
                        _zoom(alpha0, alpha1, phi_a0,
                              phi_a1, derphi_a0, phi, derphi,
                              phi0, derphi0, c1, c2, extra_condition)
            break

        derphi_a1 = derphi(alpha1)
        if (abs(derphi_a1) <= -c2*derphi0):
            if extra_condition(alpha1, phi_a1):
                alpha_star = alpha1
                phi_star = phi_a1
                derphi_star = derphi_a1
                break

        if (derphi_a1 >= 0):
            alpha_star, phi_star, derphi_star = \
                        _zoom(alpha1, alpha0, phi_a1,
                              phi_a0, derphi_a1, phi, derphi,
                              phi0, derphi0, c1, c2, extra_condition)
            break

        alpha2 = 2 * alpha1  # increase by factor of two on each iteration
        if amax is not None:
            alpha2 = min(alpha2, amax)
        alpha0 = alpha1
        alpha1 = alpha2
        phi_a0 = phi_a1
        phi_a1 = phi(alpha1)
        derphi_a0 = derphi_a1

    else:
        # stopping test maxiter reached
        alpha_star = alpha1
        phi_star = phi_a1
        derphi_star = None
        warn('The line search algorithm did not converge', LineSearchWarning)

    return alpha_star, phi_star, phi0, derphi_star
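
# A minimal sketch of the pure-Python scalar search (illustrative; the
# quadratic is hand-picked for this example): derphi(0) = -2 < 0, and the
# strong Wolfe conditions hold at the exact minimizer alpha = 1.
#
#   >>> phi = lambda a: (a - 1.0)**2
#   >>> derphi = lambda a: 2.0*(a - 1.0)
#   >>> alpha, phi_a, phi0, derphi_a = scalar_search_wolfe2(phi, derphi)
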

def _cubicmin(a, fa, fpa, b, fb, c, fc):
    """
    Finds the minimizer for a cubic polynomial that goes through the
    points (a, fa), (b, fb), and (c, fc) with derivative at a of fpa.

    If no minimizer can be found, return None.

    """
    # f(x) = A *(x-a)^3 + B*(x-a)^2 + C*(x-a) + D
    with np.errstate(divide='raise', over='raise', invalid='raise'):
        try:
            C = fpa
            db = b - a
            dc = c - a
            denom = (db * dc) ** 2 * (db - dc)
            d1 = np.empty((2, 2))
            d1[0, 0] = dc ** 2
            d1[0, 1] = -db ** 2
            d1[1, 0] = -dc ** 3
            d1[1, 1] = db ** 3
            [A, B] = np.dot(d1, np.asarray([fb - fa - C * db,
                                            fc - fa - C * dc]).flatten())
            A /= denom
            B /= denom
            radical = B * B - 3 * A * C
            xmin = a + (-B + np.sqrt(radical)) / (3 * A)
        except ArithmeticError:
            return None
    if not np.isfinite(xmin):
        return None
    return xmin

def _quadmin(a, fa, fpa, b, fb):
    """
    Finds the minimizer for a quadratic polynomial that goes through
    the points (a, fa), (b, fb) with derivative at a of fpa.

    """
    # f(x) = B*(x-a)^2 + C*(x-a) + D
    with np.errstate(divide='raise', over='raise', invalid='raise'):
        try:
            D = fa
            C = fpa
            db = b - a * 1.0
            B = (fb - D - C * db) / (db * db)
            xmin = a - C / (2.0 * B)
        except ArithmeticError:
            return None
    if not np.isfinite(xmin):
        return None
    return xmin
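
# Illustrative checks of the interpolation helpers on polynomials with known
# minimizers (inputs chosen by hand for this sketch). Both fits are exact,
# so the helpers recover the true minimizer:
#
#   >>> _quadmin(0.0, 1.0, -2.0, 2.0, 1.0)   # fits (x - 1)**2
#   1.0
#   >>> _cubicmin(0.0, 0.0, -3.0, 2.0, 2.0, 3.0, 18.0)  # fits x**3 - 3*x
#   1.0
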

def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2, extra_condition):
    """Zoom stage of approximate line search satisfying strong Wolfe
    conditions.

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Notes
    -----
    Implements Algorithm 3.6 (zoom) in Wright and Nocedal,
    'Numerical Optimization', 1999, p. 61.

    """
    maxiter = 10
    i = 0
    delta1 = 0.2  # cubic interpolant check
    delta2 = 0.1  # quadratic interpolant check
    phi_rec = phi0
    a_rec = 0
    while True:
        # Interpolate to find a trial step length between a_lo and a_hi.
        # An interpolant must be chosen here: use cubic interpolation first;
        # if the result is within delta * dalpha of an end point, or outside
        # the interval bounded by a_lo and a_hi, fall back to quadratic
        # interpolation; if that result is still too close, use bisection.
        dalpha = a_hi - a_lo
        if dalpha < 0:
            a, b = a_hi, a_lo
        else:
            a, b = a_lo, a_hi

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval), then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval), then use bisection
        if (i > 0):
            cchk = delta1 * dalpha
            a_j = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
                            a_rec, phi_rec)
        if (i == 0) or (a_j is None) or (a_j > b - cchk) or (a_j < a + cchk):
            qchk = delta2 * dalpha
            a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
            if (a_j is None) or (a_j > b - qchk) or (a_j < a + qchk):
                a_j = a_lo + 0.5*dalpha

        # Check new value of a_j
        phi_aj = phi(a_j)
        if (phi_aj > phi0 + c1*a_j*derphi0) or (phi_aj >= phi_lo):
            phi_rec = phi_hi
            a_rec = a_hi
            a_hi = a_j
            phi_hi = phi_aj
        else:
            derphi_aj = derphi(a_j)
            if abs(derphi_aj) <= -c2*derphi0 and extra_condition(a_j, phi_aj):
                a_star = a_j
                val_star = phi_aj
                valprime_star = derphi_aj
                break
            if derphi_aj*(a_hi - a_lo) >= 0:
                phi_rec = phi_hi
                a_rec = a_hi
                a_hi = a_lo
                phi_hi = phi_lo
            else:
                phi_rec = phi_lo
                a_rec = a_lo
            a_lo = a_j
            phi_lo = phi_aj
            derphi_lo = derphi_aj
        i += 1
        if (i > maxiter):
            # Failed to find a conforming step size
            a_star = None
            val_star = None
            valprime_star = None
            break
    return a_star, val_star, valprime_star

#------------------------------------------------------------------------------
# Armijo line and scalar searches
#------------------------------------------------------------------------------


def line_search_armijo(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
    """Minimize over alpha, the function ``f(xk + alpha pk)``.

    Parameters
    ----------
    f : callable
        Function to be minimized.
    xk : array_like
        Current point.
    pk : array_like
        Search direction.
    gfk : array_like
        Gradient of `f` at point `xk`.
    old_fval : float
        Value of `f` at point `xk`.
    args : tuple, optional
        Optional arguments.
    c1 : float, optional
        Value to control stopping criterion.
    alpha0 : scalar, optional
        Value of `alpha` at start of the optimization.

    Returns
    -------
    alpha : float or None
        Step length, or None if no suitable step was found.
    f_count : int
        Number of function evaluations made.
    f_val_at_alpha : float
        Value of `f` at the returned step.

    Notes
    -----
    Uses the interpolation algorithm (Armijo backtracking) as suggested by
    Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57.

    """
    xk = np.atleast_1d(xk)
    fc = [0]

    def phi(alpha1):
        fc[0] += 1
        return f(xk + alpha1*pk, *args)

    if old_fval is None:
        phi0 = phi(0.)
    else:
        phi0 = old_fval  # compute f(xk) -- done in past loop

    derphi0 = np.dot(gfk, pk)
    alpha, phi1 = scalar_search_armijo(phi, phi0, derphi0, c1=c1,
                                       alpha0=alpha0)
    return alpha, fc[0], phi1
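
# A minimal usage sketch (illustrative, with a hand-picked quadratic): the
# full step alpha0 = 1 fails the Armijo condition here, and the quadratic
# interpolation step lands on the exact minimizer alpha = 0.5 of
# phi(a) = f(xk + a*pk).
#
#   >>> f = lambda x: np.dot(x, x)
#   >>> xk = np.array([1.0, 1.0])
#   >>> gfk = 2*xk
#   >>> pk = -gfk
#   >>> alpha, fc, fval = line_search_armijo(f, xk, pk, gfk, f(xk))
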

def line_search_BFGS(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
    """
    Compatibility wrapper for `line_search_armijo`
    """
    r = line_search_armijo(f, xk, pk, gfk, old_fval, args=args, c1=c1,
                           alpha0=alpha0)
    return r[0], r[1], 0, r[2]

def scalar_search_armijo(phi, phi0, derphi0, c1=1e-4, alpha0=1, amin=0):
    """Minimize over alpha, the function ``phi(alpha)``.

    Uses the interpolation algorithm (Armijo backtracking) as suggested by
    Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57.

    The direction is assumed to be a descent direction for alpha > 0,
    i.e. ``derphi0 < 0``.

    Returns
    -------
    alpha : float or None
        Step length, or None if no suitable step was found.
    phi1 : float
        Value of `phi` at the returned step (or at the last trial step).

    """
    phi_a0 = phi(alpha0)
    if phi_a0 <= phi0 + c1*alpha0*derphi0:
        return alpha0, phi_a0

    # Otherwise, compute the minimizer of a quadratic interpolant:
    alpha1 = -(derphi0) * alpha0**2 / 2.0 / (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)

    if (phi_a1 <= phi0 + c1*alpha1*derphi0):
        return alpha1, phi_a1

    # Otherwise, loop with cubic interpolation until we find an alpha which
    # satisfies the first Wolfe condition (since we are backtracking, we will
    # assume that the value of alpha is not too small and satisfies the second
    # condition).
    while alpha1 > amin:       # we are assuming alpha>0 is a descent direction
        factor = alpha0**2 * alpha1**2 * (alpha1-alpha0)
        a = alpha0**2 * (phi_a1 - phi0 - derphi0*alpha1) - \
            alpha1**2 * (phi_a0 - phi0 - derphi0*alpha0)
        a = a / factor
        b = -alpha0**3 * (phi_a1 - phi0 - derphi0*alpha1) + \
            alpha1**3 * (phi_a0 - phi0 - derphi0*alpha0)
        b = b / factor

        alpha2 = (-b + np.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0*a)
        phi_a2 = phi(alpha2)

        if (phi_a2 <= phi0 + c1*alpha2*derphi0):
            return alpha2, phi_a2

        if (alpha1 - alpha2) > alpha1 / 2.0 or (1 - alpha2/alpha1) < 0.96:
            alpha2 = alpha1 / 2.0
        alpha0 = alpha1
        alpha1 = alpha2
        phi_a0 = phi_a1
        phi_a1 = phi_a2

    # Failed to find a suitable step length
    return None, phi_a1
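
# A minimal sketch of the scalar Armijo search (illustrative; the parabola
# is hand-picked for this example): the default step alpha0 = 1 fails the
# Armijo condition, and the quadratic interpolation step lands on the exact
# minimizer.
#
#   >>> phi = lambda a: (a - 0.5)**2
#   >>> alpha, phi_a = scalar_search_armijo(phi, phi(0), -1.0)
#   >>> alpha, phi_a
#   (0.5, 0.0)
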

#------------------------------------------------------------------------------
# Non-monotone line search for DF-SANE
#------------------------------------------------------------------------------


def _nonmonotone_line_search_cruz(f, x_k, d, prev_fs, eta,
                                  gamma=1e-4, tau_min=0.1, tau_max=0.5):
    """
    Nonmonotone backtracking line search as described in [1]_.

    Parameters
    ----------
    f : callable
        Function returning a tuple ``(f, F)`` where ``f`` is the value
        of a merit function and ``F`` the residual.
    x_k : ndarray
        Initial position.
    d : ndarray
        Search direction.
    prev_fs : sequence of float
        List of previous merit function values. Should have ``len(prev_fs) <= M``
        where ``M`` is the nonmonotonicity window parameter.
    eta : float
        Allowed merit function increase, see [1]_
    gamma, tau_min, tau_max : float, optional
        Search parameters, see [1]_

    Returns
    -------
    alpha : float
        Step length
    xp : ndarray
        Next position
    fp : float
        Merit function value at next position
    Fp : ndarray
        Residual at next position

    References
    ----------
    .. [1] "Spectral residual method without gradient information for solving
           large-scale nonlinear systems of equations." W. La Cruz,
           J.M. Martinez, M. Raydan. Math. Comp. **75**, 1429 (2006).

    """
    f_k = prev_fs[-1]
    f_bar = max(prev_fs)

    alpha_p = 1
    alpha_m = 1
    alpha = 1

    while True:
        xp = x_k + alpha_p * d
        fp, Fp = f(xp)

        if fp <= f_bar + eta - gamma * alpha_p**2 * f_k:
            alpha = alpha_p
            break

        alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)

        xp = x_k - alpha_m * d
        fp, Fp = f(xp)

        if fp <= f_bar + eta - gamma * alpha_m**2 * f_k:
            alpha = -alpha_m
            break

        alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)

        alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
        alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)

    return alpha, xp, fp, Fp
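
# An illustrative sketch for this DF-SANE helper (private API; the merit
# function f(x) = ||F(x)||**2 and the system x**2 = 1 are assumptions of
# this example):
#
#   >>> def merit(x):
#   ...     F = x**2 - 1          # residual of the system x**2 = 1
#   ...     return np.dot(F, F), F
#   >>> x_k = np.array([2.0])
#   >>> d = np.array([-1.0])      # a descent-like direction
#   >>> prev_fs = [merit(x_k)[0]]
#   >>> alpha, xp, fp, Fp = _nonmonotone_line_search_cruz(
#   ...     merit, x_k, d, prev_fs, eta=0.1)
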

def _nonmonotone_line_search_cheng(f, x_k, d, f_k, C, Q, eta,
                                   gamma=1e-4, tau_min=0.1, tau_max=0.5,
                                   nu=0.85):
    """
    Nonmonotone line search from [1]_.

    Parameters
    ----------
    f : callable
        Function returning a tuple ``(f, F)`` where ``f`` is the value
        of a merit function and ``F`` the residual.
    x_k : ndarray
        Initial position.
    d : ndarray
        Search direction.
    f_k : float
        Initial merit function value.
    C, Q : float
        Control parameters. On the first iteration, give values
        Q=1.0, C=f_k
    eta : float
        Allowed merit function increase, see [1]_
    nu, gamma, tau_min, tau_max : float, optional
        Search parameters, see [1]_

    Returns
    -------
    alpha : float
        Step length
    xp : ndarray
        Next position
    fp : float
        Merit function value at next position
    Fp : ndarray
        Residual at next position
    C : float
        New value for the control parameter C
    Q : float
        New value for the control parameter Q

    References
    ----------
    .. [1] W. Cheng & D.-H. Li, ''A derivative-free nonmonotone line
           search and its application to the spectral residual
           method'', IMA J. Numer. Anal. 29, 814 (2009).

    """
    alpha_p = 1
    alpha_m = 1
    alpha = 1

    while True:
        xp = x_k + alpha_p * d
        fp, Fp = f(xp)

        if fp <= C + eta - gamma * alpha_p**2 * f_k:
            alpha = alpha_p
            break

        alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)

        xp = x_k - alpha_m * d
        fp, Fp = f(xp)

        if fp <= C + eta - gamma * alpha_m**2 * f_k:
            alpha = -alpha_m
            break

        alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)

        alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
        alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)

    # Update C and Q
    Q_next = nu * Q + 1
    C = (nu * Q * (C + eta) + fp) / Q_next
    Q = Q_next

    return alpha, xp, fp, Fp, C, Q
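
# An illustrative sketch of threading the C/Q control parameters through
# successive iterations (private API; the merit function is an assumption of
# this example, and Q=1.0, C=f_k on the first call as documented above):
#
#   >>> def merit(x):
#   ...     F = x**2 - 1
#   ...     return np.dot(F, F), F
#   >>> x_k = np.array([2.0])
#   >>> f_k = merit(x_k)[0]
#   >>> C, Q = f_k, 1.0
#   >>> alpha, xp, fp, Fp, C, Q = _nonmonotone_line_search_cheng(
#   ...     merit, x_k, np.array([-1.0]), f_k, C, Q, eta=0.1)
#   >>> # on the next iteration, pass the updated C and Q back in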