test_drop.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas.errors import PerformanceWarning
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. DatetimeIndex,
  9. Index,
  10. MultiIndex,
  11. Series,
  12. Timestamp,
  13. )
  14. import pandas._testing as tm
  15. @pytest.mark.parametrize(
  16. "msg,labels,level",
  17. [
  18. (r"labels \[4\] not found in level", 4, "a"),
  19. (r"labels \[7\] not found in level", 7, "b"),
  20. ],
  21. )
  22. def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
  23. # GH 8594
  24. mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
  25. s = Series([10, 20, 30], index=mi)
  26. df = DataFrame([10, 20, 30], index=mi)
  27. with pytest.raises(KeyError, match=msg):
  28. s.drop(labels, level=level)
  29. with pytest.raises(KeyError, match=msg):
  30. df.drop(labels, level=level)
  31. @pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")])
  32. def test_drop_errors_ignore(labels, level):
  33. # GH 8594
  34. mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
  35. s = Series([10, 20, 30], index=mi)
  36. df = DataFrame([10, 20, 30], index=mi)
  37. expected_s = s.drop(labels, level=level, errors="ignore")
  38. tm.assert_series_equal(s, expected_s)
  39. expected_df = df.drop(labels, level=level, errors="ignore")
  40. tm.assert_frame_equal(df, expected_df)
  41. def test_drop_with_non_unique_datetime_index_and_invalid_keys():
  42. # GH 30399
  43. # define dataframe with unique datetime index
  44. df = DataFrame(
  45. np.random.randn(5, 3),
  46. columns=["a", "b", "c"],
  47. index=pd.date_range("2012", freq="H", periods=5),
  48. )
  49. # create dataframe with non-unique datetime index
  50. df = df.iloc[[0, 2, 2, 3]].copy()
  51. with pytest.raises(KeyError, match="not found in axis"):
  52. df.drop(["a", "b"]) # Dropping with labels not exist in the index
  53. class TestDataFrameDrop:
  54. def test_drop_names(self):
  55. df = DataFrame(
  56. [[1, 2, 3], [3, 4, 5], [5, 6, 7]],
  57. index=["a", "b", "c"],
  58. columns=["d", "e", "f"],
  59. )
  60. df.index.name, df.columns.name = "first", "second"
  61. df_dropped_b = df.drop("b")
  62. df_dropped_e = df.drop("e", axis=1)
  63. df_inplace_b, df_inplace_e = df.copy(), df.copy()
  64. return_value = df_inplace_b.drop("b", inplace=True)
  65. assert return_value is None
  66. return_value = df_inplace_e.drop("e", axis=1, inplace=True)
  67. assert return_value is None
  68. for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
  69. assert obj.index.name == "first"
  70. assert obj.columns.name == "second"
  71. assert list(df.columns) == ["d", "e", "f"]
  72. msg = r"\['g'\] not found in axis"
  73. with pytest.raises(KeyError, match=msg):
  74. df.drop(["g"])
  75. with pytest.raises(KeyError, match=msg):
  76. df.drop(["g"], axis=1)
  77. # errors = 'ignore'
  78. dropped = df.drop(["g"], errors="ignore")
  79. expected = Index(["a", "b", "c"], name="first")
  80. tm.assert_index_equal(dropped.index, expected)
  81. dropped = df.drop(["b", "g"], errors="ignore")
  82. expected = Index(["a", "c"], name="first")
  83. tm.assert_index_equal(dropped.index, expected)
  84. dropped = df.drop(["g"], axis=1, errors="ignore")
  85. expected = Index(["d", "e", "f"], name="second")
  86. tm.assert_index_equal(dropped.columns, expected)
  87. dropped = df.drop(["d", "g"], axis=1, errors="ignore")
  88. expected = Index(["e", "f"], name="second")
  89. tm.assert_index_equal(dropped.columns, expected)
  90. # GH 16398
  91. dropped = df.drop([], errors="ignore")
  92. expected = Index(["a", "b", "c"], name="first")
  93. tm.assert_index_equal(dropped.index, expected)
  94. def test_drop(self):
  95. simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
  96. tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]])
  97. tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]])
  98. tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
  99. tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])
  100. with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
  101. simple.drop(5)
  102. with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
  103. simple.drop("C", axis=1)
  104. with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
  105. simple.drop([1, 5])
  106. with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
  107. simple.drop(["A", "C"], axis=1)
  108. # GH 42881
  109. with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"):
  110. simple.drop(["C", "D", "F"], axis=1)
  111. # errors = 'ignore'
  112. tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple)
  113. tm.assert_frame_equal(
  114. simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]
  115. )
  116. tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple)
  117. tm.assert_frame_equal(
  118. simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]
  119. )
  120. # non-unique - wheee!
  121. nu_df = DataFrame(
  122. list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"]
  123. )
  124. tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]])
  125. tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"])
  126. tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398
  127. nu_df = nu_df.set_index(Index(["X", "Y", "X"]))
  128. nu_df.columns = list("abc")
  129. tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
  130. tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])
  131. # inplace cache issue
  132. # GH#5628
  133. df = DataFrame(np.random.randn(10, 3), columns=list("abc"))
  134. expected = df[~(df.b > 0)]
  135. return_value = df.drop(labels=df[df.b > 0].index, inplace=True)
  136. assert return_value is None
  137. tm.assert_frame_equal(df, expected)
  138. def test_drop_multiindex_not_lexsorted(self):
  139. # GH#11640
  140. # define the lexsorted version
  141. lexsorted_mi = MultiIndex.from_tuples(
  142. [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
  143. )
  144. lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
  145. assert lexsorted_df.columns._is_lexsorted()
  146. # define the non-lexsorted version
  147. not_lexsorted_df = DataFrame(
  148. columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
  149. )
  150. not_lexsorted_df = not_lexsorted_df.pivot_table(
  151. index="a", columns=["b", "c"], values="d"
  152. )
  153. not_lexsorted_df = not_lexsorted_df.reset_index()
  154. assert not not_lexsorted_df.columns._is_lexsorted()
  155. # compare the results
  156. tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
  157. expected = lexsorted_df.drop("a", axis=1)
  158. with tm.assert_produces_warning(PerformanceWarning):
  159. result = not_lexsorted_df.drop("a", axis=1)
  160. tm.assert_frame_equal(result, expected)
  161. def test_drop_api_equivalence(self):
  162. # equivalence of the labels/axis and index/columns API's (GH#12392)
  163. df = DataFrame(
  164. [[1, 2, 3], [3, 4, 5], [5, 6, 7]],
  165. index=["a", "b", "c"],
  166. columns=["d", "e", "f"],
  167. )
  168. res1 = df.drop("a")
  169. res2 = df.drop(index="a")
  170. tm.assert_frame_equal(res1, res2)
  171. res1 = df.drop("d", axis=1)
  172. res2 = df.drop(columns="d")
  173. tm.assert_frame_equal(res1, res2)
  174. res1 = df.drop(labels="e", axis=1)
  175. res2 = df.drop(columns="e")
  176. tm.assert_frame_equal(res1, res2)
  177. res1 = df.drop(["a"], axis=0)
  178. res2 = df.drop(index=["a"])
  179. tm.assert_frame_equal(res1, res2)
  180. res1 = df.drop(["a"], axis=0).drop(["d"], axis=1)
  181. res2 = df.drop(index=["a"], columns=["d"])
  182. tm.assert_frame_equal(res1, res2)
  183. msg = "Cannot specify both 'labels' and 'index'/'columns'"
  184. with pytest.raises(ValueError, match=msg):
  185. df.drop(labels="a", index="b")
  186. with pytest.raises(ValueError, match=msg):
  187. df.drop(labels="a", columns="b")
  188. msg = "Need to specify at least one of 'labels', 'index' or 'columns'"
  189. with pytest.raises(ValueError, match=msg):
  190. df.drop(axis=1)
  191. data = [[1, 2, 3], [1, 2, 3]]
  192. @pytest.mark.parametrize(
  193. "actual",
  194. [
  195. DataFrame(data=data, index=["a", "a"]),
  196. DataFrame(data=data, index=["a", "b"]),
  197. DataFrame(data=data, index=["a", "b"]).set_index([0, 1]),
  198. DataFrame(data=data, index=["a", "a"]).set_index([0, 1]),
  199. ],
  200. )
  201. def test_raise_on_drop_duplicate_index(self, actual):
  202. # GH#19186
  203. level = 0 if isinstance(actual.index, MultiIndex) else None
  204. msg = re.escape("\"['c'] not found in axis\"")
  205. with pytest.raises(KeyError, match=msg):
  206. actual.drop("c", level=level, axis=0)
  207. with pytest.raises(KeyError, match=msg):
  208. actual.T.drop("c", level=level, axis=1)
  209. expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore")
  210. tm.assert_frame_equal(expected_no_err, actual)
  211. expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore")
  212. tm.assert_frame_equal(expected_no_err.T, actual)
  213. @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]])
  214. @pytest.mark.parametrize("drop_labels", [[], [1], [2]])
  215. def test_drop_empty_list(self, index, drop_labels):
  216. # GH#21494
  217. expected_index = [i for i in index if i not in drop_labels]
  218. frame = DataFrame(index=index).drop(drop_labels)
  219. tm.assert_frame_equal(frame, DataFrame(index=expected_index))
  220. @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]])
  221. @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]])
  222. def test_drop_non_empty_list(self, index, drop_labels):
  223. # GH# 21494
  224. with pytest.raises(KeyError, match="not found in axis"):
  225. DataFrame(index=index).drop(drop_labels)
  226. @pytest.mark.parametrize(
  227. "empty_listlike",
  228. [
  229. [],
  230. {},
  231. np.array([]),
  232. Series([], dtype="datetime64[ns]"),
  233. Index([]),
  234. DatetimeIndex([]),
  235. ],
  236. )
  237. def test_drop_empty_listlike_non_unique_datetime_index(self, empty_listlike):
  238. # GH#27994
  239. data = {"column_a": [5, 10], "column_b": ["one", "two"]}
  240. index = [Timestamp("2021-01-01"), Timestamp("2021-01-01")]
  241. df = DataFrame(data, index=index)
  242. # Passing empty list-like should return the same DataFrame.
  243. expected = df.copy()
  244. result = df.drop(empty_listlike)
  245. tm.assert_frame_equal(result, expected)
  246. def test_mixed_depth_drop(self):
  247. arrays = [
  248. ["a", "top", "top", "routine1", "routine1", "routine2"],
  249. ["", "OD", "OD", "result1", "result2", "result1"],
  250. ["", "wx", "wy", "", "", ""],
  251. ]
  252. tuples = sorted(zip(*arrays))
  253. index = MultiIndex.from_tuples(tuples)
  254. df = DataFrame(np.random.randn(4, 6), columns=index)
  255. result = df.drop("a", axis=1)
  256. expected = df.drop([("a", "", "")], axis=1)
  257. tm.assert_frame_equal(expected, result)
  258. result = df.drop(["top"], axis=1)
  259. expected = df.drop([("top", "OD", "wx")], axis=1)
  260. expected = expected.drop([("top", "OD", "wy")], axis=1)
  261. tm.assert_frame_equal(expected, result)
  262. result = df.drop(("top", "OD", "wx"), axis=1)
  263. expected = df.drop([("top", "OD", "wx")], axis=1)
  264. tm.assert_frame_equal(expected, result)
  265. expected = df.drop([("top", "OD", "wy")], axis=1)
  266. expected = df.drop("top", axis=1)
  267. result = df.drop("result1", level=1, axis=1)
  268. expected = df.drop(
  269. [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
  270. )
  271. tm.assert_frame_equal(expected, result)
  272. def test_drop_multiindex_other_level_nan(self):
  273. # GH#12754
  274. df = (
  275. DataFrame(
  276. {
  277. "A": ["one", "one", "two", "two"],
  278. "B": [np.nan, 0.0, 1.0, 2.0],
  279. "C": ["a", "b", "c", "c"],
  280. "D": [1, 2, 3, 4],
  281. }
  282. )
  283. .set_index(["A", "B", "C"])
  284. .sort_index()
  285. )
  286. result = df.drop("c", level="C")
  287. expected = DataFrame(
  288. [2, 1],
  289. columns=["D"],
  290. index=MultiIndex.from_tuples(
  291. [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
  292. ),
  293. )
  294. tm.assert_frame_equal(result, expected)
  295. def test_drop_nonunique(self):
  296. df = DataFrame(
  297. [
  298. ["x-a", "x", "a", 1.5],
  299. ["x-a", "x", "a", 1.2],
  300. ["z-c", "z", "c", 3.1],
  301. ["x-a", "x", "a", 4.1],
  302. ["x-b", "x", "b", 5.1],
  303. ["x-b", "x", "b", 4.1],
  304. ["x-b", "x", "b", 2.2],
  305. ["y-a", "y", "a", 1.2],
  306. ["z-b", "z", "b", 2.1],
  307. ],
  308. columns=["var1", "var2", "var3", "var4"],
  309. )
  310. grp_size = df.groupby("var1").size()
  311. drop_idx = grp_size.loc[grp_size == 1]
  312. idf = df.set_index(["var1", "var2", "var3"])
  313. # it works! GH#2101
  314. result = idf.drop(drop_idx.index, level=0).reset_index()
  315. expected = df[-df.var1.isin(drop_idx.index)]
  316. result.index = expected.index
  317. tm.assert_frame_equal(result, expected)
  318. def test_drop_level(self, multiindex_dataframe_random_data):
  319. frame = multiindex_dataframe_random_data
  320. result = frame.drop(["bar", "qux"], level="first")
  321. expected = frame.iloc[[0, 1, 2, 5, 6]]
  322. tm.assert_frame_equal(result, expected)
  323. result = frame.drop(["two"], level="second")
  324. expected = frame.iloc[[0, 2, 3, 6, 7, 9]]
  325. tm.assert_frame_equal(result, expected)
  326. result = frame.T.drop(["bar", "qux"], axis=1, level="first")
  327. expected = frame.iloc[[0, 1, 2, 5, 6]].T
  328. tm.assert_frame_equal(result, expected)
  329. result = frame.T.drop(["two"], axis=1, level="second")
  330. expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T
  331. tm.assert_frame_equal(result, expected)
  332. def test_drop_level_nonunique_datetime(self):
  333. # GH#12701
  334. idx = Index([2, 3, 4, 4, 5], name="id")
  335. idxdt = pd.to_datetime(
  336. [
  337. "2016-03-23 14:00",
  338. "2016-03-23 15:00",
  339. "2016-03-23 16:00",
  340. "2016-03-23 16:00",
  341. "2016-03-23 17:00",
  342. ]
  343. )
  344. df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
  345. df["tstamp"] = idxdt
  346. df = df.set_index("tstamp", append=True)
  347. ts = Timestamp("201603231600")
  348. assert df.index.is_unique is False
  349. result = df.drop(ts, level="tstamp")
  350. expected = df.loc[idx != 4]
  351. tm.assert_frame_equal(result, expected)
  352. def test_drop_tz_aware_timestamp_across_dst(self, frame_or_series):
  353. # GH#21761
  354. start = Timestamp("2017-10-29", tz="Europe/Berlin")
  355. end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
  356. index = pd.date_range(start, end, freq="15min")
  357. data = frame_or_series(data=[1] * len(index), index=index)
  358. result = data.drop(start)
  359. expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
  360. expected_idx = pd.date_range(expected_start, end, freq="15min")
  361. expected = frame_or_series(data=[1] * len(expected_idx), index=expected_idx)
  362. tm.assert_equal(result, expected)
  363. def test_drop_preserve_names(self):
  364. index = MultiIndex.from_arrays(
  365. [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
  366. )
  367. df = DataFrame(np.random.randn(6, 3), index=index)
  368. result = df.drop([(0, 2)])
  369. assert result.index.names == ("one", "two")
  370. @pytest.mark.parametrize(
  371. "operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"]
  372. )
  373. @pytest.mark.parametrize("inplace", [False, True])
  374. def test_inplace_drop_and_operation(self, operation, inplace):
  375. # GH#30484
  376. df = DataFrame({"x": range(5)})
  377. expected = df.copy()
  378. df["y"] = range(5)
  379. y = df["y"]
  380. with tm.assert_produces_warning(None):
  381. if inplace:
  382. df.drop("y", axis=1, inplace=inplace)
  383. else:
  384. df = df.drop("y", axis=1, inplace=inplace)
  385. # Perform operation and check result
  386. getattr(y, operation)(1)
  387. tm.assert_frame_equal(df, expected)
  388. def test_drop_with_non_unique_multiindex(self):
  389. # GH#36293
  390. mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]])
  391. df = DataFrame([1, 2, 3], index=mi)
  392. result = df.drop(index="x")
  393. expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]]))
  394. tm.assert_frame_equal(result, expected)
  395. @pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]])
  396. def test_drop_tuple_with_non_unique_multiindex(self, indexer):
  397. # GH#42771
  398. idx = MultiIndex.from_product([["a", "b"], ["a", "a"]])
  399. df = DataFrame({"x": range(len(idx))}, index=idx)
  400. result = df.drop(index=[("a", "a")])
  401. expected = DataFrame(
  402. {"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")])
  403. )
  404. tm.assert_frame_equal(result, expected)
  405. def test_drop_with_duplicate_columns(self):
  406. df = DataFrame(
  407. [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
  408. )
  409. result = df.drop(["a"], axis=1)
  410. expected = DataFrame([[1], [1], [1]], columns=["bar"])
  411. tm.assert_frame_equal(result, expected)
  412. result = df.drop("a", axis=1)
  413. tm.assert_frame_equal(result, expected)
  414. def test_drop_with_duplicate_columns2(self):
  415. # drop buggy GH#6240
  416. df = DataFrame(
  417. {
  418. "A": np.random.randn(5),
  419. "B": np.random.randn(5),
  420. "C": np.random.randn(5),
  421. "D": ["a", "b", "c", "d", "e"],
  422. }
  423. )
  424. expected = df.take([0, 1, 1], axis=1)
  425. df2 = df.take([2, 0, 1, 2, 1], axis=1)
  426. result = df2.drop("C", axis=1)
  427. tm.assert_frame_equal(result, expected)
  428. def test_drop_inplace_no_leftover_column_reference(self):
  429. # GH 13934
  430. df = DataFrame({"a": [1, 2, 3]})
  431. a = df.a
  432. df.drop(["a"], axis=1, inplace=True)
  433. tm.assert_index_equal(df.columns, Index([], dtype="object"))
  434. a -= a.mean()
  435. tm.assert_index_equal(df.columns, Index([], dtype="object"))
  436. def test_drop_level_missing_label_multiindex(self):
  437. # GH 18561
  438. df = DataFrame(index=MultiIndex.from_product([range(3), range(3)]))
  439. with pytest.raises(KeyError, match="labels \\[5\\] not found in level"):
  440. df.drop(5, level=0)
  441. @pytest.mark.parametrize("idx, level", [(["a", "b"], 0), (["a"], None)])
  442. def test_drop_index_ea_dtype(self, any_numeric_ea_dtype, idx, level):
  443. # GH#45860
  444. df = DataFrame(
  445. {"a": [1, 2, 2, pd.NA], "b": 100}, dtype=any_numeric_ea_dtype
  446. ).set_index(idx)
  447. result = df.drop(Index([2, pd.NA]), level=level)
  448. expected = DataFrame(
  449. {"a": [1], "b": 100}, dtype=any_numeric_ea_dtype
  450. ).set_index(idx)
  451. tm.assert_frame_equal(result, expected)