_ranges.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. """
  2. Helper functions to generate range-like data for DatetimeArray
  3. (and possibly TimedeltaArray/PeriodArray)
  4. """
  5. from __future__ import annotations
  6. import numpy as np
  7. from pandas._libs.lib import i8max
  8. from pandas._libs.tslibs import (
  9. BaseOffset,
  10. OutOfBoundsDatetime,
  11. Timedelta,
  12. Timestamp,
  13. iNaT,
  14. )
  15. from pandas._typing import npt
  16. def generate_regular_range(
  17. start: Timestamp | Timedelta | None,
  18. end: Timestamp | Timedelta | None,
  19. periods: int | None,
  20. freq: BaseOffset,
  21. unit: str = "ns",
  22. ) -> npt.NDArray[np.intp]:
  23. """
  24. Generate a range of dates or timestamps with the spans between dates
  25. described by the given `freq` DateOffset.
  26. Parameters
  27. ----------
  28. start : Timedelta, Timestamp or None
  29. First point of produced date range.
  30. end : Timedelta, Timestamp or None
  31. Last point of produced date range.
  32. periods : int or None
  33. Number of periods in produced date range.
  34. freq : Tick
  35. Describes space between dates in produced date range.
  36. unit : str, default "ns"
  37. The resolution the output is meant to represent.
  38. Returns
  39. -------
  40. ndarray[np.int64]
  41. Representing the given resolution.
  42. """
  43. istart = start._value if start is not None else None
  44. iend = end._value if end is not None else None
  45. freq.nanos # raises if non-fixed frequency
  46. td = Timedelta(freq)
  47. try:
  48. td = td.as_unit( # pyright: ignore[reportGeneralTypeIssues]
  49. unit, round_ok=False
  50. )
  51. except ValueError as err:
  52. raise ValueError(
  53. f"freq={freq} is incompatible with unit={unit}. "
  54. "Use a lower freq or a higher unit instead."
  55. ) from err
  56. stride = int(td._value)
  57. if periods is None and istart is not None and iend is not None:
  58. b = istart
  59. # cannot just use e = Timestamp(end) + 1 because arange breaks when
  60. # stride is too large, see GH10887
  61. e = b + (iend - b) // stride * stride + stride // 2 + 1
  62. elif istart is not None and periods is not None:
  63. b = istart
  64. e = _generate_range_overflow_safe(b, periods, stride, side="start")
  65. elif iend is not None and periods is not None:
  66. e = iend + stride
  67. b = _generate_range_overflow_safe(e, periods, stride, side="end")
  68. else:
  69. raise ValueError(
  70. "at least 'start' or 'end' should be specified if a 'period' is given."
  71. )
  72. with np.errstate(over="raise"):
  73. # If the range is sufficiently large, np.arange may overflow
  74. # and incorrectly return an empty array if not caught.
  75. try:
  76. values = np.arange(b, e, stride, dtype=np.int64)
  77. except FloatingPointError:
  78. xdr = [b]
  79. while xdr[-1] != e:
  80. xdr.append(xdr[-1] + stride)
  81. values = np.array(xdr[:-1], dtype=np.int64)
  82. return values
  83. def _generate_range_overflow_safe(
  84. endpoint: int, periods: int, stride: int, side: str = "start"
  85. ) -> int:
  86. """
  87. Calculate the second endpoint for passing to np.arange, checking
  88. to avoid an integer overflow. Catch OverflowError and re-raise
  89. as OutOfBoundsDatetime.
  90. Parameters
  91. ----------
  92. endpoint : int
  93. nanosecond timestamp of the known endpoint of the desired range
  94. periods : int
  95. number of periods in the desired range
  96. stride : int
  97. nanoseconds between periods in the desired range
  98. side : {'start', 'end'}
  99. which end of the range `endpoint` refers to
  100. Returns
  101. -------
  102. other_end : int
  103. Raises
  104. ------
  105. OutOfBoundsDatetime
  106. """
  107. # GH#14187 raise instead of incorrectly wrapping around
  108. assert side in ["start", "end"]
  109. i64max = np.uint64(i8max)
  110. msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
  111. with np.errstate(over="raise"):
  112. # if periods * strides cannot be multiplied within the *uint64* bounds,
  113. # we cannot salvage the operation by recursing, so raise
  114. try:
  115. addend = np.uint64(periods) * np.uint64(np.abs(stride))
  116. except FloatingPointError as err:
  117. raise OutOfBoundsDatetime(msg) from err
  118. if np.abs(addend) <= i64max:
  119. # relatively easy case without casting concerns
  120. return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
  121. elif (endpoint > 0 and side == "start" and stride > 0) or (
  122. endpoint < 0 < stride and side == "end"
  123. ):
  124. # no chance of not-overflowing
  125. raise OutOfBoundsDatetime(msg)
  126. elif side == "end" and endpoint - stride <= i64max < endpoint:
  127. # in _generate_regular_range we added `stride` thereby overflowing
  128. # the bounds. Adjust to fix this.
  129. return _generate_range_overflow_safe(
  130. endpoint - stride, periods - 1, stride, side
  131. )
  132. # split into smaller pieces
  133. mid_periods = periods // 2
  134. remaining = periods - mid_periods
  135. assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
  136. midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side)
  137. return _generate_range_overflow_safe(midpoint, remaining, stride, side)
  138. def _generate_range_overflow_safe_signed(
  139. endpoint: int, periods: int, stride: int, side: str
  140. ) -> int:
  141. """
  142. A special case for _generate_range_overflow_safe where `periods * stride`
  143. can be calculated without overflowing int64 bounds.
  144. """
  145. assert side in ["start", "end"]
  146. if side == "end":
  147. stride *= -1
  148. with np.errstate(over="raise"):
  149. addend = np.int64(periods) * np.int64(stride)
  150. try:
  151. # easy case with no overflows
  152. result = np.int64(endpoint) + addend
  153. if result == iNaT:
  154. # Putting this into a DatetimeArray/TimedeltaArray
  155. # would incorrectly be interpreted as NaT
  156. raise OverflowError
  157. # error: Incompatible return value type (got "signedinteger[_64Bit]",
  158. # expected "int")
  159. return result # type: ignore[return-value]
  160. except (FloatingPointError, OverflowError):
  161. # with endpoint negative and addend positive we risk
  162. # FloatingPointError; with reversed signed we risk OverflowError
  163. pass
  164. # if stride and endpoint had opposite signs, then endpoint + addend
  165. # should never overflow. so they must have the same signs
  166. assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
  167. if stride > 0:
  168. # watch out for very special case in which we just slightly
  169. # exceed implementation bounds, but when passing the result to
  170. # np.arange will get a result slightly within the bounds
  171. # error: Incompatible types in assignment (expression has type
  172. # "unsignedinteger[_64Bit]", variable has type "signedinteger[_64Bit]")
  173. result = np.uint64(endpoint) + np.uint64(addend) # type: ignore[assignment]
  174. i64max = np.uint64(i8max)
  175. assert result > i64max
  176. if result <= i64max + np.uint64(stride):
  177. # error: Incompatible return value type (got "unsignedinteger", expected
  178. # "int")
  179. return result # type: ignore[return-value]
  180. raise OutOfBoundsDatetime(
  181. f"Cannot generate range with {side}={endpoint} and periods={periods}"
  182. )