test_partial_slicing.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. """ test partial slicing on Series/Frame """
  2. from datetime import datetime
  3. import numpy as np
  4. import pytest
  5. from pandas import (
  6. DataFrame,
  7. DatetimeIndex,
  8. Index,
  9. Series,
  10. Timedelta,
  11. Timestamp,
  12. date_range,
  13. )
  14. import pandas._testing as tm
  15. class TestSlicing:
  16. def test_string_index_series_name_converted(self):
  17. # GH#1644
  18. df = DataFrame(np.random.randn(10, 4), index=date_range("1/1/2000", periods=10))
  19. result = df.loc["1/3/2000"]
  20. assert result.name == df.index[2]
  21. result = df.T["1/3/2000"]
  22. assert result.name == df.index[2]
  23. def test_stringified_slice_with_tz(self):
  24. # GH#2658
  25. start = "2013-01-07"
  26. idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern")
  27. df = DataFrame(np.arange(10), index=idx)
  28. df["2013-01-14 23:44:34.437768-05:00":] # no exception here
  29. def test_return_type_doesnt_depend_on_monotonicity(self):
  30. # GH#24892 we get Series back regardless of whether our DTI is monotonic
  31. dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3)
  32. ser = Series(range(3), index=dti)
  33. # non-monotonic index
  34. ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]])
  35. # key with resolution strictly lower than "min"
  36. key = "2015-5-14 00"
  37. # monotonic increasing index
  38. result = ser.loc[key]
  39. expected = ser.iloc[1:]
  40. tm.assert_series_equal(result, expected)
  41. # monotonic decreasing index
  42. result = ser.iloc[::-1].loc[key]
  43. expected = ser.iloc[::-1][:-1]
  44. tm.assert_series_equal(result, expected)
  45. # non-monotonic index
  46. result2 = ser2.loc[key]
  47. expected2 = ser2.iloc[::2]
  48. tm.assert_series_equal(result2, expected2)
  49. def test_return_type_doesnt_depend_on_monotonicity_higher_reso(self):
  50. # GH#24892 we get Series back regardless of whether our DTI is monotonic
  51. dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3)
  52. ser = Series(range(3), index=dti)
  53. # non-monotonic index
  54. ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]])
  55. # key with resolution strictly *higher) than "min"
  56. key = "2015-5-14 00:00:00"
  57. # monotonic increasing index
  58. result = ser.loc[key]
  59. assert result == 1
  60. # monotonic decreasing index
  61. result = ser.iloc[::-1].loc[key]
  62. assert result == 1
  63. # non-monotonic index
  64. result2 = ser2.loc[key]
  65. assert result2 == 0
  66. def test_monotone_DTI_indexing_bug(self):
  67. # GH 19362
  68. # Testing accessing the first element in a monotonic descending
  69. # partial string indexing.
  70. df = DataFrame(list(range(5)))
  71. date_list = [
  72. "2018-01-02",
  73. "2017-02-10",
  74. "2016-03-10",
  75. "2015-03-15",
  76. "2014-03-16",
  77. ]
  78. date_index = DatetimeIndex(date_list)
  79. df["date"] = date_index
  80. expected = DataFrame({0: list(range(5)), "date": date_index})
  81. tm.assert_frame_equal(df, expected)
  82. # We get a slice because df.index's resolution is hourly and we
  83. # are slicing with a daily-resolution string. If both were daily,
  84. # we would get a single item back
  85. dti = date_range("20170101 01:00:00", periods=3)
  86. df = DataFrame({"A": [1, 2, 3]}, index=dti[::-1])
  87. expected = DataFrame({"A": 1}, index=dti[-1:][::-1])
  88. result = df.loc["2017-01-03"]
  89. tm.assert_frame_equal(result, expected)
  90. result2 = df.iloc[::-1].loc["2017-01-03"]
  91. expected2 = expected.iloc[::-1]
  92. tm.assert_frame_equal(result2, expected2)
  93. def test_slice_year(self):
  94. dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500)
  95. s = Series(np.arange(len(dti)), index=dti)
  96. result = s["2005"]
  97. expected = s[s.index.year == 2005]
  98. tm.assert_series_equal(result, expected)
  99. df = DataFrame(np.random.rand(len(dti), 5), index=dti)
  100. result = df.loc["2005"]
  101. expected = df[df.index.year == 2005]
  102. tm.assert_frame_equal(result, expected)
  103. @pytest.mark.parametrize(
  104. "partial_dtime",
  105. [
  106. "2019",
  107. "2019Q4",
  108. "Dec 2019",
  109. "2019-12-31",
  110. "2019-12-31 23",
  111. "2019-12-31 23:59",
  112. ],
  113. )
  114. def test_slice_end_of_period_resolution(self, partial_dtime):
  115. # GH#31064
  116. dti = date_range("2019-12-31 23:59:55.999999999", periods=10, freq="s")
  117. ser = Series(range(10), index=dti)
  118. result = ser[partial_dtime]
  119. expected = ser.iloc[:5]
  120. tm.assert_series_equal(result, expected)
  121. def test_slice_quarter(self):
  122. dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500)
  123. s = Series(np.arange(len(dti)), index=dti)
  124. assert len(s["2001Q1"]) == 90
  125. df = DataFrame(np.random.rand(len(dti), 5), index=dti)
  126. assert len(df.loc["1Q01"]) == 90
  127. def test_slice_month(self):
  128. dti = date_range(freq="D", start=datetime(2005, 1, 1), periods=500)
  129. s = Series(np.arange(len(dti)), index=dti)
  130. assert len(s["2005-11"]) == 30
  131. df = DataFrame(np.random.rand(len(dti), 5), index=dti)
  132. assert len(df.loc["2005-11"]) == 30
  133. tm.assert_series_equal(s["2005-11"], s["11-2005"])
  134. def test_partial_slice(self):
  135. rng = date_range(freq="D", start=datetime(2005, 1, 1), periods=500)
  136. s = Series(np.arange(len(rng)), index=rng)
  137. result = s["2005-05":"2006-02"]
  138. expected = s["20050501":"20060228"]
  139. tm.assert_series_equal(result, expected)
  140. result = s["2005-05":]
  141. expected = s["20050501":]
  142. tm.assert_series_equal(result, expected)
  143. result = s[:"2006-02"]
  144. expected = s[:"20060228"]
  145. tm.assert_series_equal(result, expected)
  146. result = s["2005-1-1"]
  147. assert result == s.iloc[0]
  148. with pytest.raises(KeyError, match=r"^'2004-12-31'$"):
  149. s["2004-12-31"]
  150. def test_partial_slice_daily(self):
  151. rng = date_range(freq="H", start=datetime(2005, 1, 31), periods=500)
  152. s = Series(np.arange(len(rng)), index=rng)
  153. result = s["2005-1-31"]
  154. tm.assert_series_equal(result, s.iloc[:24])
  155. with pytest.raises(KeyError, match=r"^'2004-12-31 00'$"):
  156. s["2004-12-31 00"]
  157. def test_partial_slice_hourly(self):
  158. rng = date_range(freq="T", start=datetime(2005, 1, 1, 20, 0, 0), periods=500)
  159. s = Series(np.arange(len(rng)), index=rng)
  160. result = s["2005-1-1"]
  161. tm.assert_series_equal(result, s.iloc[: 60 * 4])
  162. result = s["2005-1-1 20"]
  163. tm.assert_series_equal(result, s.iloc[:60])
  164. assert s["2005-1-1 20:00"] == s.iloc[0]
  165. with pytest.raises(KeyError, match=r"^'2004-12-31 00:15'$"):
  166. s["2004-12-31 00:15"]
  167. def test_partial_slice_minutely(self):
  168. rng = date_range(freq="S", start=datetime(2005, 1, 1, 23, 59, 0), periods=500)
  169. s = Series(np.arange(len(rng)), index=rng)
  170. result = s["2005-1-1 23:59"]
  171. tm.assert_series_equal(result, s.iloc[:60])
  172. result = s["2005-1-1"]
  173. tm.assert_series_equal(result, s.iloc[:60])
  174. assert s[Timestamp("2005-1-1 23:59:00")] == s.iloc[0]
  175. with pytest.raises(KeyError, match=r"^'2004-12-31 00:00:00'$"):
  176. s["2004-12-31 00:00:00"]
  177. def test_partial_slice_second_precision(self):
  178. rng = date_range(
  179. start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990),
  180. periods=20,
  181. freq="US",
  182. )
  183. s = Series(np.arange(20), rng)
  184. tm.assert_series_equal(s["2005-1-1 00:00"], s.iloc[:10])
  185. tm.assert_series_equal(s["2005-1-1 00:00:59"], s.iloc[:10])
  186. tm.assert_series_equal(s["2005-1-1 00:01"], s.iloc[10:])
  187. tm.assert_series_equal(s["2005-1-1 00:01:00"], s.iloc[10:])
  188. assert s[Timestamp("2005-1-1 00:00:59.999990")] == s.iloc[0]
  189. with pytest.raises(KeyError, match="2005-1-1 00:00:00"):
  190. s["2005-1-1 00:00:00"]
  191. def test_partial_slicing_dataframe(self):
  192. # GH14856
  193. # Test various combinations of string slicing resolution vs.
  194. # index resolution
  195. # - If string resolution is less precise than index resolution,
  196. # string is considered a slice
  197. # - If string resolution is equal to or more precise than index
  198. # resolution, string is considered an exact match
  199. formats = [
  200. "%Y",
  201. "%Y-%m",
  202. "%Y-%m-%d",
  203. "%Y-%m-%d %H",
  204. "%Y-%m-%d %H:%M",
  205. "%Y-%m-%d %H:%M:%S",
  206. ]
  207. resolutions = ["year", "month", "day", "hour", "minute", "second"]
  208. for rnum, resolution in enumerate(resolutions[2:], 2):
  209. # we check only 'day', 'hour', 'minute' and 'second'
  210. unit = Timedelta("1 " + resolution)
  211. middate = datetime(2012, 1, 1, 0, 0, 0)
  212. index = DatetimeIndex([middate - unit, middate, middate + unit])
  213. values = [1, 2, 3]
  214. df = DataFrame({"a": values}, index, dtype=np.int64)
  215. assert df.index.resolution == resolution
  216. # Timestamp with the same resolution as index
  217. # Should be exact match for Series (return scalar)
  218. # and raise KeyError for Frame
  219. for timestamp, expected in zip(index, values):
  220. ts_string = timestamp.strftime(formats[rnum])
  221. # make ts_string as precise as index
  222. result = df["a"][ts_string]
  223. assert isinstance(result, np.int64)
  224. assert result == expected
  225. msg = rf"^'{ts_string}'$"
  226. with pytest.raises(KeyError, match=msg):
  227. df[ts_string]
  228. # Timestamp with resolution less precise than index
  229. for fmt in formats[:rnum]:
  230. for element, theslice in [[0, slice(None, 1)], [1, slice(1, None)]]:
  231. ts_string = index[element].strftime(fmt)
  232. # Series should return slice
  233. result = df["a"][ts_string]
  234. expected = df["a"][theslice]
  235. tm.assert_series_equal(result, expected)
  236. # pre-2.0 df[ts_string] was overloaded to interpret this
  237. # as slicing along index
  238. with pytest.raises(KeyError, match=ts_string):
  239. df[ts_string]
  240. # Timestamp with resolution more precise than index
  241. # Compatible with existing key
  242. # Should return scalar for Series
  243. # and raise KeyError for Frame
  244. for fmt in formats[rnum + 1 :]:
  245. ts_string = index[1].strftime(fmt)
  246. result = df["a"][ts_string]
  247. assert isinstance(result, np.int64)
  248. assert result == 2
  249. msg = rf"^'{ts_string}'$"
  250. with pytest.raises(KeyError, match=msg):
  251. df[ts_string]
  252. # Not compatible with existing key
  253. # Should raise KeyError
  254. for fmt, res in list(zip(formats, resolutions))[rnum + 1 :]:
  255. ts = index[1] + Timedelta("1 " + res)
  256. ts_string = ts.strftime(fmt)
  257. msg = rf"^'{ts_string}'$"
  258. with pytest.raises(KeyError, match=msg):
  259. df["a"][ts_string]
  260. with pytest.raises(KeyError, match=msg):
  261. df[ts_string]
  262. def test_partial_slicing_with_multiindex(self):
  263. # GH 4758
  264. # partial string indexing with a multi-index buggy
  265. df = DataFrame(
  266. {
  267. "ACCOUNT": ["ACCT1", "ACCT1", "ACCT1", "ACCT2"],
  268. "TICKER": ["ABC", "MNP", "XYZ", "XYZ"],
  269. "val": [1, 2, 3, 4],
  270. },
  271. index=date_range("2013-06-19 09:30:00", periods=4, freq="5T"),
  272. )
  273. df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True)
  274. expected = DataFrame(
  275. [[1]], index=Index(["ABC"], name="TICKER"), columns=["val"]
  276. )
  277. result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1")]
  278. tm.assert_frame_equal(result, expected)
  279. expected = df_multi.loc[
  280. (Timestamp("2013-06-19 09:30:00", tz=None), "ACCT1", "ABC")
  281. ]
  282. result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")]
  283. tm.assert_series_equal(result, expected)
  284. # partial string indexing on first level, scalar indexing on the other two
  285. result = df_multi.loc[("2013-06-19", "ACCT1", "ABC")]
  286. expected = df_multi.iloc[:1].droplevel([1, 2])
  287. tm.assert_frame_equal(result, expected)
  288. def test_partial_slicing_with_multiindex_series(self):
  289. # GH 4294
  290. # partial slice on a series mi
  291. ser = DataFrame(
  292. np.random.rand(1000, 1000), index=date_range("2000-1-1", periods=1000)
  293. ).stack()
  294. s2 = ser[:-1].copy()
  295. expected = s2["2000-1-4"]
  296. result = s2[Timestamp("2000-1-4")]
  297. tm.assert_series_equal(result, expected)
  298. result = ser[Timestamp("2000-1-4")]
  299. expected = ser["2000-1-4"]
  300. tm.assert_series_equal(result, expected)
  301. df2 = DataFrame(ser)
  302. expected = df2.xs("2000-1-4")
  303. result = df2.loc[Timestamp("2000-1-4")]
  304. tm.assert_frame_equal(result, expected)
  305. def test_partial_slice_requires_monotonicity(self):
  306. # Disallowed since 2.0 (GH 37819)
  307. ser = Series(np.arange(10), date_range("2014-01-01", periods=10))
  308. nonmonotonic = ser[[3, 5, 4]]
  309. timestamp = Timestamp("2014-01-10")
  310. with pytest.raises(
  311. KeyError, match="Value based partial slicing on non-monotonic"
  312. ):
  313. nonmonotonic["2014-01-10":]
  314. with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"):
  315. nonmonotonic[timestamp:]
  316. with pytest.raises(
  317. KeyError, match="Value based partial slicing on non-monotonic"
  318. ):
  319. nonmonotonic.loc["2014-01-10":]
  320. with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"):
  321. nonmonotonic.loc[timestamp:]
  322. def test_loc_datetime_length_one(self):
  323. # GH16071
  324. df = DataFrame(
  325. columns=["1"],
  326. index=date_range("2016-10-01T00:00:00", "2016-10-01T23:59:59"),
  327. )
  328. result = df.loc[datetime(2016, 10, 1) :]
  329. tm.assert_frame_equal(result, df)
  330. result = df.loc["2016-10-01T00:00:00":]
  331. tm.assert_frame_equal(result, df)
  332. @pytest.mark.parametrize(
  333. "start",
  334. [
  335. "2018-12-02 21:50:00+00:00",
  336. Timestamp("2018-12-02 21:50:00+00:00"),
  337. Timestamp("2018-12-02 21:50:00+00:00").to_pydatetime(),
  338. ],
  339. )
  340. @pytest.mark.parametrize(
  341. "end",
  342. [
  343. "2018-12-02 21:52:00+00:00",
  344. Timestamp("2018-12-02 21:52:00+00:00"),
  345. Timestamp("2018-12-02 21:52:00+00:00").to_pydatetime(),
  346. ],
  347. )
  348. def test_getitem_with_datestring_with_UTC_offset(self, start, end):
  349. # GH 24076
  350. idx = date_range(
  351. start="2018-12-02 14:50:00-07:00",
  352. end="2018-12-02 14:50:00-07:00",
  353. freq="1min",
  354. )
  355. df = DataFrame(1, index=idx, columns=["A"])
  356. result = df[start:end]
  357. expected = df.iloc[0:3, :]
  358. tm.assert_frame_equal(result, expected)
  359. # GH 16785
  360. start = str(start)
  361. end = str(end)
  362. with pytest.raises(ValueError, match="Both dates must"):
  363. df[start : end[:-4] + "1:00"]
  364. with pytest.raises(ValueError, match="The index must be timezone"):
  365. df = df.tz_localize(None)
  366. df[start:end]
  367. def test_slice_reduce_to_series(self):
  368. # GH 27516
  369. df = DataFrame({"A": range(24)}, index=date_range("2000", periods=24, freq="M"))
  370. expected = Series(
  371. range(12), index=date_range("2000", periods=12, freq="M"), name="A"
  372. )
  373. result = df.loc["2000", "A"]
  374. tm.assert_series_equal(result, expected)