test_drop_duplicates.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. from datetime import datetime
  2. import re
  3. import numpy as np
  4. import pytest
  5. from pandas import (
  6. DataFrame,
  7. NaT,
  8. concat,
  9. )
  10. import pandas._testing as tm
  11. @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
  12. def test_drop_duplicates_with_misspelled_column_name(subset):
  13. # GH 19730
  14. df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
  15. msg = re.escape("Index(['a'], dtype='object')")
  16. with pytest.raises(KeyError, match=msg):
  17. df.drop_duplicates(subset)
  18. def test_drop_duplicates():
  19. df = DataFrame(
  20. {
  21. "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
  22. "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
  23. "C": [1, 1, 2, 2, 2, 2, 1, 2],
  24. "D": range(8),
  25. }
  26. )
  27. # single column
  28. result = df.drop_duplicates("AAA")
  29. expected = df[:2]
  30. tm.assert_frame_equal(result, expected)
  31. result = df.drop_duplicates("AAA", keep="last")
  32. expected = df.loc[[6, 7]]
  33. tm.assert_frame_equal(result, expected)
  34. result = df.drop_duplicates("AAA", keep=False)
  35. expected = df.loc[[]]
  36. tm.assert_frame_equal(result, expected)
  37. assert len(result) == 0
  38. # multi column
  39. expected = df.loc[[0, 1, 2, 3]]
  40. result = df.drop_duplicates(np.array(["AAA", "B"]))
  41. tm.assert_frame_equal(result, expected)
  42. result = df.drop_duplicates(["AAA", "B"])
  43. tm.assert_frame_equal(result, expected)
  44. result = df.drop_duplicates(("AAA", "B"), keep="last")
  45. expected = df.loc[[0, 5, 6, 7]]
  46. tm.assert_frame_equal(result, expected)
  47. result = df.drop_duplicates(("AAA", "B"), keep=False)
  48. expected = df.loc[[0]]
  49. tm.assert_frame_equal(result, expected)
  50. # consider everything
  51. df2 = df.loc[:, ["AAA", "B", "C"]]
  52. result = df2.drop_duplicates()
  53. # in this case only
  54. expected = df2.drop_duplicates(["AAA", "B"])
  55. tm.assert_frame_equal(result, expected)
  56. result = df2.drop_duplicates(keep="last")
  57. expected = df2.drop_duplicates(["AAA", "B"], keep="last")
  58. tm.assert_frame_equal(result, expected)
  59. result = df2.drop_duplicates(keep=False)
  60. expected = df2.drop_duplicates(["AAA", "B"], keep=False)
  61. tm.assert_frame_equal(result, expected)
  62. # integers
  63. result = df.drop_duplicates("C")
  64. expected = df.iloc[[0, 2]]
  65. tm.assert_frame_equal(result, expected)
  66. result = df.drop_duplicates("C", keep="last")
  67. expected = df.iloc[[-2, -1]]
  68. tm.assert_frame_equal(result, expected)
  69. df["E"] = df["C"].astype("int8")
  70. result = df.drop_duplicates("E")
  71. expected = df.iloc[[0, 2]]
  72. tm.assert_frame_equal(result, expected)
  73. result = df.drop_duplicates("E", keep="last")
  74. expected = df.iloc[[-2, -1]]
  75. tm.assert_frame_equal(result, expected)
  76. # GH 11376
  77. df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
  78. expected = df.loc[df.index != 3]
  79. tm.assert_frame_equal(df.drop_duplicates(), expected)
  80. df = DataFrame([[1, 0], [0, 2]])
  81. tm.assert_frame_equal(df.drop_duplicates(), df)
  82. df = DataFrame([[-2, 0], [0, -4]])
  83. tm.assert_frame_equal(df.drop_duplicates(), df)
  84. x = np.iinfo(np.int64).max / 3 * 2
  85. df = DataFrame([[-x, x], [0, x + 4]])
  86. tm.assert_frame_equal(df.drop_duplicates(), df)
  87. df = DataFrame([[-x, x], [x, x + 4]])
  88. tm.assert_frame_equal(df.drop_duplicates(), df)
  89. # GH 11864
  90. df = DataFrame([i] * 9 for i in range(16))
  91. df = concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True)
  92. for keep in ["first", "last", False]:
  93. assert df.duplicated(keep=keep).sum() == 0
  94. def test_drop_duplicates_with_duplicate_column_names():
  95. # GH17836
  96. df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])
  97. result0 = df.drop_duplicates()
  98. tm.assert_frame_equal(result0, df)
  99. result1 = df.drop_duplicates("a")
  100. expected1 = df[:2]
  101. tm.assert_frame_equal(result1, expected1)
  102. def test_drop_duplicates_for_take_all():
  103. df = DataFrame(
  104. {
  105. "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
  106. "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
  107. "C": [1, 1, 2, 2, 2, 2, 1, 2],
  108. "D": range(8),
  109. }
  110. )
  111. # single column
  112. result = df.drop_duplicates("AAA")
  113. expected = df.iloc[[0, 1, 2, 6]]
  114. tm.assert_frame_equal(result, expected)
  115. result = df.drop_duplicates("AAA", keep="last")
  116. expected = df.iloc[[2, 5, 6, 7]]
  117. tm.assert_frame_equal(result, expected)
  118. result = df.drop_duplicates("AAA", keep=False)
  119. expected = df.iloc[[2, 6]]
  120. tm.assert_frame_equal(result, expected)
  121. # multiple columns
  122. result = df.drop_duplicates(["AAA", "B"])
  123. expected = df.iloc[[0, 1, 2, 3, 4, 6]]
  124. tm.assert_frame_equal(result, expected)
  125. result = df.drop_duplicates(["AAA", "B"], keep="last")
  126. expected = df.iloc[[0, 1, 2, 5, 6, 7]]
  127. tm.assert_frame_equal(result, expected)
  128. result = df.drop_duplicates(["AAA", "B"], keep=False)
  129. expected = df.iloc[[0, 1, 2, 6]]
  130. tm.assert_frame_equal(result, expected)
  131. def test_drop_duplicates_tuple():
  132. df = DataFrame(
  133. {
  134. ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
  135. "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
  136. "C": [1, 1, 2, 2, 2, 2, 1, 2],
  137. "D": range(8),
  138. }
  139. )
  140. # single column
  141. result = df.drop_duplicates(("AA", "AB"))
  142. expected = df[:2]
  143. tm.assert_frame_equal(result, expected)
  144. result = df.drop_duplicates(("AA", "AB"), keep="last")
  145. expected = df.loc[[6, 7]]
  146. tm.assert_frame_equal(result, expected)
  147. result = df.drop_duplicates(("AA", "AB"), keep=False)
  148. expected = df.loc[[]] # empty df
  149. assert len(result) == 0
  150. tm.assert_frame_equal(result, expected)
  151. # multi column
  152. expected = df.loc[[0, 1, 2, 3]]
  153. result = df.drop_duplicates((("AA", "AB"), "B"))
  154. tm.assert_frame_equal(result, expected)
  155. @pytest.mark.parametrize(
  156. "df",
  157. [
  158. DataFrame(),
  159. DataFrame(columns=[]),
  160. DataFrame(columns=["A", "B", "C"]),
  161. DataFrame(index=[]),
  162. DataFrame(index=["A", "B", "C"]),
  163. ],
  164. )
  165. def test_drop_duplicates_empty(df):
  166. # GH 20516
  167. result = df.drop_duplicates()
  168. tm.assert_frame_equal(result, df)
  169. result = df.copy()
  170. result.drop_duplicates(inplace=True)
  171. tm.assert_frame_equal(result, df)
  172. def test_drop_duplicates_NA():
  173. # none
  174. df = DataFrame(
  175. {
  176. "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
  177. "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
  178. "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
  179. "D": range(8),
  180. }
  181. )
  182. # single column
  183. result = df.drop_duplicates("A")
  184. expected = df.loc[[0, 2, 3]]
  185. tm.assert_frame_equal(result, expected)
  186. result = df.drop_duplicates("A", keep="last")
  187. expected = df.loc[[1, 6, 7]]
  188. tm.assert_frame_equal(result, expected)
  189. result = df.drop_duplicates("A", keep=False)
  190. expected = df.loc[[]] # empty df
  191. tm.assert_frame_equal(result, expected)
  192. assert len(result) == 0
  193. # multi column
  194. result = df.drop_duplicates(["A", "B"])
  195. expected = df.loc[[0, 2, 3, 6]]
  196. tm.assert_frame_equal(result, expected)
  197. result = df.drop_duplicates(["A", "B"], keep="last")
  198. expected = df.loc[[1, 5, 6, 7]]
  199. tm.assert_frame_equal(result, expected)
  200. result = df.drop_duplicates(["A", "B"], keep=False)
  201. expected = df.loc[[6]]
  202. tm.assert_frame_equal(result, expected)
  203. # nan
  204. df = DataFrame(
  205. {
  206. "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
  207. "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
  208. "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
  209. "D": range(8),
  210. }
  211. )
  212. # single column
  213. result = df.drop_duplicates("C")
  214. expected = df[:2]
  215. tm.assert_frame_equal(result, expected)
  216. result = df.drop_duplicates("C", keep="last")
  217. expected = df.loc[[3, 7]]
  218. tm.assert_frame_equal(result, expected)
  219. result = df.drop_duplicates("C", keep=False)
  220. expected = df.loc[[]] # empty df
  221. tm.assert_frame_equal(result, expected)
  222. assert len(result) == 0
  223. # multi column
  224. result = df.drop_duplicates(["C", "B"])
  225. expected = df.loc[[0, 1, 2, 4]]
  226. tm.assert_frame_equal(result, expected)
  227. result = df.drop_duplicates(["C", "B"], keep="last")
  228. expected = df.loc[[1, 3, 6, 7]]
  229. tm.assert_frame_equal(result, expected)
  230. result = df.drop_duplicates(["C", "B"], keep=False)
  231. expected = df.loc[[1]]
  232. tm.assert_frame_equal(result, expected)
  233. def test_drop_duplicates_NA_for_take_all():
  234. # none
  235. df = DataFrame(
  236. {
  237. "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
  238. "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
  239. }
  240. )
  241. # single column
  242. result = df.drop_duplicates("A")
  243. expected = df.iloc[[0, 2, 3, 5, 7]]
  244. tm.assert_frame_equal(result, expected)
  245. result = df.drop_duplicates("A", keep="last")
  246. expected = df.iloc[[1, 4, 5, 6, 7]]
  247. tm.assert_frame_equal(result, expected)
  248. result = df.drop_duplicates("A", keep=False)
  249. expected = df.iloc[[5, 7]]
  250. tm.assert_frame_equal(result, expected)
  251. # nan
  252. # single column
  253. result = df.drop_duplicates("C")
  254. expected = df.iloc[[0, 1, 5, 6]]
  255. tm.assert_frame_equal(result, expected)
  256. result = df.drop_duplicates("C", keep="last")
  257. expected = df.iloc[[3, 5, 6, 7]]
  258. tm.assert_frame_equal(result, expected)
  259. result = df.drop_duplicates("C", keep=False)
  260. expected = df.iloc[[5, 6]]
  261. tm.assert_frame_equal(result, expected)
  262. def test_drop_duplicates_inplace():
  263. orig = DataFrame(
  264. {
  265. "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
  266. "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
  267. "C": [1, 1, 2, 2, 2, 2, 1, 2],
  268. "D": range(8),
  269. }
  270. )
  271. # single column
  272. df = orig.copy()
  273. return_value = df.drop_duplicates("A", inplace=True)
  274. expected = orig[:2]
  275. result = df
  276. tm.assert_frame_equal(result, expected)
  277. assert return_value is None
  278. df = orig.copy()
  279. return_value = df.drop_duplicates("A", keep="last", inplace=True)
  280. expected = orig.loc[[6, 7]]
  281. result = df
  282. tm.assert_frame_equal(result, expected)
  283. assert return_value is None
  284. df = orig.copy()
  285. return_value = df.drop_duplicates("A", keep=False, inplace=True)
  286. expected = orig.loc[[]]
  287. result = df
  288. tm.assert_frame_equal(result, expected)
  289. assert len(df) == 0
  290. assert return_value is None
  291. # multi column
  292. df = orig.copy()
  293. return_value = df.drop_duplicates(["A", "B"], inplace=True)
  294. expected = orig.loc[[0, 1, 2, 3]]
  295. result = df
  296. tm.assert_frame_equal(result, expected)
  297. assert return_value is None
  298. df = orig.copy()
  299. return_value = df.drop_duplicates(["A", "B"], keep="last", inplace=True)
  300. expected = orig.loc[[0, 5, 6, 7]]
  301. result = df
  302. tm.assert_frame_equal(result, expected)
  303. assert return_value is None
  304. df = orig.copy()
  305. return_value = df.drop_duplicates(["A", "B"], keep=False, inplace=True)
  306. expected = orig.loc[[0]]
  307. result = df
  308. tm.assert_frame_equal(result, expected)
  309. assert return_value is None
  310. # consider everything
  311. orig2 = orig.loc[:, ["A", "B", "C"]].copy()
  312. df2 = orig2.copy()
  313. return_value = df2.drop_duplicates(inplace=True)
  314. # in this case only
  315. expected = orig2.drop_duplicates(["A", "B"])
  316. result = df2
  317. tm.assert_frame_equal(result, expected)
  318. assert return_value is None
  319. df2 = orig2.copy()
  320. return_value = df2.drop_duplicates(keep="last", inplace=True)
  321. expected = orig2.drop_duplicates(["A", "B"], keep="last")
  322. result = df2
  323. tm.assert_frame_equal(result, expected)
  324. assert return_value is None
  325. df2 = orig2.copy()
  326. return_value = df2.drop_duplicates(keep=False, inplace=True)
  327. expected = orig2.drop_duplicates(["A", "B"], keep=False)
  328. result = df2
  329. tm.assert_frame_equal(result, expected)
  330. assert return_value is None
  331. @pytest.mark.parametrize("inplace", [True, False])
  332. @pytest.mark.parametrize(
  333. "origin_dict, output_dict, ignore_index, output_index",
  334. [
  335. ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]),
  336. ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]),
  337. ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]),
  338. ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]),
  339. ],
  340. )
  341. def test_drop_duplicates_ignore_index(
  342. inplace, origin_dict, output_dict, ignore_index, output_index
  343. ):
  344. # GH 30114
  345. df = DataFrame(origin_dict)
  346. expected = DataFrame(output_dict, index=output_index)
  347. if inplace:
  348. result_df = df.copy()
  349. result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace)
  350. else:
  351. result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace)
  352. tm.assert_frame_equal(result_df, expected)
  353. tm.assert_frame_equal(df, DataFrame(origin_dict))
  354. def test_drop_duplicates_null_in_object_column(nulls_fixture):
  355. # https://github.com/pandas-dev/pandas/issues/32992
  356. df = DataFrame([[1, nulls_fixture], [2, "a"]], dtype=object)
  357. result = df.drop_duplicates()
  358. tm.assert_frame_equal(result, df)
  359. def test_drop_duplicates_series_vs_dataframe(keep):
  360. # GH#14192
  361. df = DataFrame(
  362. {
  363. "a": [1, 1, 1, "one", "one"],
  364. "b": [2, 2, np.nan, np.nan, np.nan],
  365. "c": [3, 3, np.nan, np.nan, "three"],
  366. "d": [1, 2, 3, 4, 4],
  367. "e": [
  368. datetime(2015, 1, 1),
  369. datetime(2015, 1, 1),
  370. datetime(2015, 2, 1),
  371. NaT,
  372. NaT,
  373. ],
  374. }
  375. )
  376. for column in df.columns:
  377. dropped_frame = df[[column]].drop_duplicates(keep=keep)
  378. dropped_series = df[column].drop_duplicates(keep=keep)
  379. tm.assert_frame_equal(dropped_frame, dropped_series.to_frame())
  380. @pytest.mark.parametrize("arg", [[1], 1, "True", [], 0])
  381. def test_drop_duplicates_non_boolean_ignore_index(arg):
  382. # GH#38274
  383. df = DataFrame({"a": [1, 2, 1, 3]})
  384. msg = '^For argument "ignore_index" expected type bool, received type .*.$'
  385. with pytest.raises(ValueError, match=msg):
  386. df.drop_duplicates(ignore_index=arg)