test_getitem.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas import (
  5. Categorical,
  6. CategoricalDtype,
  7. CategoricalIndex,
  8. DataFrame,
  9. DateOffset,
  10. DatetimeIndex,
  11. Index,
  12. MultiIndex,
  13. Series,
  14. Timestamp,
  15. concat,
  16. date_range,
  17. get_dummies,
  18. period_range,
  19. )
  20. import pandas._testing as tm
  21. from pandas.core.arrays import SparseArray
  22. class TestGetitem:
  23. def test_getitem_unused_level_raises(self):
  24. # GH#20410
  25. mi = MultiIndex(
  26. levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]],
  27. codes=[[1, 0], [1, 0]],
  28. )
  29. df = DataFrame(-1, index=range(3), columns=mi)
  30. with pytest.raises(KeyError, match="notevenone"):
  31. df["notevenone"]
  32. def test_getitem_periodindex(self):
  33. rng = period_range("1/1/2000", periods=5)
  34. df = DataFrame(np.random.randn(10, 5), columns=rng)
  35. ts = df[rng[0]]
  36. tm.assert_series_equal(ts, df.iloc[:, 0])
  37. # GH#1211; smoketest unrelated to the rest of this test
  38. repr(df)
  39. ts = df["1/1/2000"]
  40. tm.assert_series_equal(ts, df.iloc[:, 0])
  41. def test_getitem_list_of_labels_categoricalindex_cols(self):
  42. # GH#16115
  43. cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])
  44. expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats)
  45. dummies = get_dummies(cats)
  46. result = dummies[list(dummies.columns)]
  47. tm.assert_frame_equal(result, expected)
  48. def test_getitem_sparse_column_return_type_and_dtype(self):
  49. # https://github.com/pandas-dev/pandas/issues/23559
  50. data = SparseArray([0, 1])
  51. df = DataFrame({"A": data})
  52. expected = Series(data, name="A")
  53. result = df["A"]
  54. tm.assert_series_equal(result, expected)
  55. # Also check iloc and loc while we're here
  56. result = df.iloc[:, 0]
  57. tm.assert_series_equal(result, expected)
  58. result = df.loc[:, "A"]
  59. tm.assert_series_equal(result, expected)
  60. def test_getitem_string_columns(self):
  61. # GH#46185
  62. df = DataFrame([[1, 2]], columns=Index(["A", "B"], dtype="string"))
  63. result = df.A
  64. expected = df["A"]
  65. tm.assert_series_equal(result, expected)
  66. class TestGetitemListLike:
  67. def test_getitem_list_missing_key(self):
  68. # GH#13822, incorrect error string with non-unique columns when missing
  69. # column is accessed
  70. df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]})
  71. df.columns = ["x", "x", "z"]
  72. # Check that we get the correct value in the KeyError
  73. with pytest.raises(KeyError, match=r"\['y'\] not in index"):
  74. df[["x", "y", "z"]]
  75. def test_getitem_list_duplicates(self):
  76. # GH#1943
  77. df = DataFrame(np.random.randn(4, 4), columns=list("AABC"))
  78. df.columns.name = "foo"
  79. result = df[["B", "C"]]
  80. assert result.columns.name == "foo"
  81. expected = df.iloc[:, 2:]
  82. tm.assert_frame_equal(result, expected)
  83. def test_getitem_dupe_cols(self):
  84. df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
  85. msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
  86. with pytest.raises(KeyError, match=re.escape(msg)):
  87. df[["baf"]]
  88. @pytest.mark.parametrize(
  89. "idx_type",
  90. [
  91. list,
  92. iter,
  93. Index,
  94. set,
  95. lambda keys: dict(zip(keys, range(len(keys)))),
  96. lambda keys: dict(zip(keys, range(len(keys)))).keys(),
  97. ],
  98. ids=["list", "iter", "Index", "set", "dict", "dict_keys"],
  99. )
  100. @pytest.mark.parametrize("levels", [1, 2])
  101. def test_getitem_listlike(self, idx_type, levels, float_frame):
  102. # GH#21294
  103. if levels == 1:
  104. frame, missing = float_frame, "food"
  105. else:
  106. # MultiIndex columns
  107. frame = DataFrame(
  108. np.random.randn(8, 3),
  109. columns=Index(
  110. [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")],
  111. name=("sth", "sth2"),
  112. ),
  113. )
  114. missing = ("good", "food")
  115. keys = [frame.columns[1], frame.columns[0]]
  116. idx = idx_type(keys)
  117. idx_check = list(idx_type(keys))
  118. if isinstance(idx, (set, dict)):
  119. with pytest.raises(TypeError, match="as an indexer is not supported"):
  120. frame[idx]
  121. return
  122. else:
  123. result = frame[idx]
  124. expected = frame.loc[:, idx_check]
  125. expected.columns.names = frame.columns.names
  126. tm.assert_frame_equal(result, expected)
  127. idx = idx_type(keys + [missing])
  128. with pytest.raises(KeyError, match="not in index"):
  129. frame[idx]
  130. def test_getitem_iloc_generator(self):
  131. # GH#39614
  132. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
  133. indexer = (x for x in [1, 2])
  134. result = df.iloc[indexer]
  135. expected = DataFrame({"a": [2, 3], "b": [5, 6]}, index=[1, 2])
  136. tm.assert_frame_equal(result, expected)
  137. def test_getitem_iloc_two_dimensional_generator(self):
  138. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
  139. indexer = (x for x in [1, 2])
  140. result = df.iloc[indexer, 1]
  141. expected = Series([5, 6], name="b", index=[1, 2])
  142. tm.assert_series_equal(result, expected)
  143. def test_getitem_iloc_dateoffset_days(self):
  144. # GH 46671
  145. df = DataFrame(
  146. list(range(10)),
  147. index=date_range("01-01-2022", periods=10, freq=DateOffset(days=1)),
  148. )
  149. result = df.loc["2022-01-01":"2022-01-03"]
  150. expected = DataFrame(
  151. [0, 1, 2],
  152. index=DatetimeIndex(
  153. ["2022-01-01", "2022-01-02", "2022-01-03"],
  154. dtype="datetime64[ns]",
  155. freq=DateOffset(days=1),
  156. ),
  157. )
  158. tm.assert_frame_equal(result, expected)
  159. df = DataFrame(
  160. list(range(10)),
  161. index=date_range(
  162. "01-01-2022", periods=10, freq=DateOffset(days=1, hours=2)
  163. ),
  164. )
  165. result = df.loc["2022-01-01":"2022-01-03"]
  166. expected = DataFrame(
  167. [0, 1, 2],
  168. index=DatetimeIndex(
  169. ["2022-01-01 00:00:00", "2022-01-02 02:00:00", "2022-01-03 04:00:00"],
  170. dtype="datetime64[ns]",
  171. freq=DateOffset(days=1, hours=2),
  172. ),
  173. )
  174. tm.assert_frame_equal(result, expected)
  175. df = DataFrame(
  176. list(range(10)),
  177. index=date_range("01-01-2022", periods=10, freq=DateOffset(minutes=3)),
  178. )
  179. result = df.loc["2022-01-01":"2022-01-03"]
  180. tm.assert_frame_equal(result, df)
  181. class TestGetitemCallable:
  182. def test_getitem_callable(self, float_frame):
  183. # GH#12533
  184. result = float_frame[lambda x: "A"]
  185. expected = float_frame.loc[:, "A"]
  186. tm.assert_series_equal(result, expected)
  187. result = float_frame[lambda x: ["A", "B"]]
  188. expected = float_frame.loc[:, ["A", "B"]]
  189. tm.assert_frame_equal(result, float_frame.loc[:, ["A", "B"]])
  190. df = float_frame[:3]
  191. result = df[lambda x: [True, False, True]]
  192. expected = float_frame.iloc[[0, 2], :]
  193. tm.assert_frame_equal(result, expected)
  194. def test_loc_multiindex_columns_one_level(self):
  195. # GH#29749
  196. df = DataFrame([[1, 2]], columns=[["a", "b"]])
  197. expected = DataFrame([1], columns=[["a"]])
  198. result = df["a"]
  199. tm.assert_frame_equal(result, expected)
  200. result = df.loc[:, "a"]
  201. tm.assert_frame_equal(result, expected)
  202. class TestGetitemBooleanMask:
  203. def test_getitem_bool_mask_categorical_index(self):
  204. df3 = DataFrame(
  205. {
  206. "A": np.arange(6, dtype="int64"),
  207. },
  208. index=CategoricalIndex(
  209. [1, 1, 2, 1, 3, 2],
  210. dtype=CategoricalDtype([3, 2, 1], ordered=True),
  211. name="B",
  212. ),
  213. )
  214. df4 = DataFrame(
  215. {
  216. "A": np.arange(6, dtype="int64"),
  217. },
  218. index=CategoricalIndex(
  219. [1, 1, 2, 1, 3, 2],
  220. dtype=CategoricalDtype([3, 2, 1], ordered=False),
  221. name="B",
  222. ),
  223. )
  224. result = df3[df3.index == "a"]
  225. expected = df3.iloc[[]]
  226. tm.assert_frame_equal(result, expected)
  227. result = df4[df4.index == "a"]
  228. expected = df4.iloc[[]]
  229. tm.assert_frame_equal(result, expected)
  230. result = df3[df3.index == 1]
  231. expected = df3.iloc[[0, 1, 3]]
  232. tm.assert_frame_equal(result, expected)
  233. result = df4[df4.index == 1]
  234. expected = df4.iloc[[0, 1, 3]]
  235. tm.assert_frame_equal(result, expected)
  236. # since we have an ordered categorical
  237. # CategoricalIndex([1, 1, 2, 1, 3, 2],
  238. # categories=[3, 2, 1],
  239. # ordered=True,
  240. # name='B')
  241. result = df3[df3.index < 2]
  242. expected = df3.iloc[[4]]
  243. tm.assert_frame_equal(result, expected)
  244. result = df3[df3.index > 1]
  245. expected = df3.iloc[[]]
  246. tm.assert_frame_equal(result, expected)
  247. # unordered
  248. # cannot be compared
  249. # CategoricalIndex([1, 1, 2, 1, 3, 2],
  250. # categories=[3, 2, 1],
  251. # ordered=False,
  252. # name='B')
  253. msg = "Unordered Categoricals can only compare equality or not"
  254. with pytest.raises(TypeError, match=msg):
  255. df4[df4.index < 2]
  256. with pytest.raises(TypeError, match=msg):
  257. df4[df4.index > 1]
  258. @pytest.mark.parametrize(
  259. "data1,data2,expected_data",
  260. (
  261. (
  262. [[1, 2], [3, 4]],
  263. [[0.5, 6], [7, 8]],
  264. [[np.nan, 3.0], [np.nan, 4.0], [np.nan, 7.0], [6.0, 8.0]],
  265. ),
  266. (
  267. [[1, 2], [3, 4]],
  268. [[5, 6], [7, 8]],
  269. [[np.nan, 3.0], [np.nan, 4.0], [5, 7], [6, 8]],
  270. ),
  271. ),
  272. )
  273. def test_getitem_bool_mask_duplicate_columns_mixed_dtypes(
  274. self,
  275. data1,
  276. data2,
  277. expected_data,
  278. ):
  279. # GH#31954
  280. df1 = DataFrame(np.array(data1))
  281. df2 = DataFrame(np.array(data2))
  282. df = concat([df1, df2], axis=1)
  283. result = df[df > 2]
  284. exdict = {i: np.array(col) for i, col in enumerate(expected_data)}
  285. expected = DataFrame(exdict).rename(columns={2: 0, 3: 1})
  286. tm.assert_frame_equal(result, expected)
  287. @pytest.fixture
  288. def df_dup_cols(self):
  289. dups = ["A", "A", "C", "D"]
  290. df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
  291. return df
  292. def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_cols):
  293. # `df.A > 6` is a DataFrame with a different shape from df
  294. # boolean with the duplicate raises
  295. df = df_dup_cols
  296. msg = "cannot reindex on an axis with duplicate labels"
  297. with pytest.raises(ValueError, match=msg):
  298. df[df.A > 6]
  299. def test_getitem_boolean_series_with_duplicate_columns(self, df_dup_cols):
  300. # boolean indexing
  301. # GH#4879
  302. df = DataFrame(
  303. np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
  304. )
  305. expected = df[df.C > 6]
  306. expected.columns = df_dup_cols.columns
  307. df = df_dup_cols
  308. result = df[df.C > 6]
  309. tm.assert_frame_equal(result, expected)
  310. result.dtypes
  311. str(result)
  312. def test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols):
  313. # where
  314. df = DataFrame(
  315. np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
  316. )
  317. # `df > 6` is a DataFrame with the same shape+alignment as df
  318. expected = df[df > 6]
  319. expected.columns = df_dup_cols.columns
  320. df = df_dup_cols
  321. result = df[df > 6]
  322. tm.assert_frame_equal(result, expected)
  323. result.dtypes
  324. str(result)
  325. def test_getitem_empty_frame_with_boolean(self):
  326. # Test for issue GH#11859
  327. df = DataFrame()
  328. df2 = df[df > 0]
  329. tm.assert_frame_equal(df, df2)
  330. def test_getitem_returns_view_when_column_is_unique_in_df(
  331. self, using_copy_on_write
  332. ):
  333. # GH#45316
  334. df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
  335. df_orig = df.copy()
  336. view = df["b"]
  337. view.loc[:] = 100
  338. if using_copy_on_write:
  339. expected = df_orig
  340. else:
  341. expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"])
  342. tm.assert_frame_equal(df, expected)
  343. def test_getitem_frozenset_unique_in_column(self):
  344. # GH#41062
  345. df = DataFrame([[1, 2, 3, 4]], columns=[frozenset(["KEY"]), "B", "C", "C"])
  346. result = df[frozenset(["KEY"])]
  347. expected = Series([1], name=frozenset(["KEY"]))
  348. tm.assert_series_equal(result, expected)
  349. class TestGetitemSlice:
  350. def test_getitem_slice_float64(self, frame_or_series):
  351. values = np.arange(10.0, 50.0, 2)
  352. index = Index(values)
  353. start, end = values[[5, 15]]
  354. data = np.random.randn(20, 3)
  355. if frame_or_series is not DataFrame:
  356. data = data[:, 0]
  357. obj = frame_or_series(data, index=index)
  358. result = obj[start:end]
  359. expected = obj.iloc[5:16]
  360. tm.assert_equal(result, expected)
  361. result = obj.loc[start:end]
  362. tm.assert_equal(result, expected)
  363. def test_getitem_datetime_slice(self):
  364. # GH#43223
  365. df = DataFrame(
  366. {"a": 0},
  367. index=DatetimeIndex(
  368. [
  369. "11.01.2011 22:00",
  370. "11.01.2011 23:00",
  371. "12.01.2011 00:00",
  372. "2011-01-13 00:00",
  373. ]
  374. ),
  375. )
  376. with pytest.raises(
  377. KeyError, match="Value based partial slicing on non-monotonic"
  378. ):
  379. df["2011-01-01":"2011-11-01"]
  380. class TestGetitemDeprecatedIndexers:
  381. @pytest.mark.parametrize("key", [{"a", "b"}, {"a": "a"}])
  382. def test_getitem_dict_and_set_deprecated(self, key):
  383. # GH#42825 enforced in 2.0
  384. df = DataFrame(
  385. [[1, 2], [3, 4]], columns=MultiIndex.from_tuples([("a", 1), ("b", 2)])
  386. )
  387. with pytest.raises(TypeError, match="as an indexer is not supported"):
  388. df[key]