test_drop_duplicates.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. Categorical,
  5. Series,
  6. )
  7. import pandas._testing as tm
  8. @pytest.mark.parametrize(
  9. "keep, expected",
  10. [
  11. ("first", Series([False, False, False, False, True, True, False])),
  12. ("last", Series([False, True, True, False, False, False, False])),
  13. (False, Series([False, True, True, False, True, True, False])),
  14. ],
  15. )
  16. def test_drop_duplicates(any_numpy_dtype, keep, expected):
  17. tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))
  18. if tc.dtype == "bool":
  19. pytest.skip("tested separately in test_drop_duplicates_bool")
  20. tm.assert_series_equal(tc.duplicated(keep=keep), expected)
  21. tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
  22. sc = tc.copy()
  23. return_value = sc.drop_duplicates(keep=keep, inplace=True)
  24. assert return_value is None
  25. tm.assert_series_equal(sc, tc[~expected])
  26. @pytest.mark.parametrize(
  27. "keep, expected",
  28. [
  29. ("first", Series([False, False, True, True])),
  30. ("last", Series([True, True, False, False])),
  31. (False, Series([True, True, True, True])),
  32. ],
  33. )
  34. def test_drop_duplicates_bool(keep, expected):
  35. tc = Series([True, False, True, False])
  36. tm.assert_series_equal(tc.duplicated(keep=keep), expected)
  37. tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
  38. sc = tc.copy()
  39. return_value = sc.drop_duplicates(keep=keep, inplace=True)
  40. tm.assert_series_equal(sc, tc[~expected])
  41. assert return_value is None
  42. @pytest.mark.parametrize("values", [[], list(range(5))])
  43. def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
  44. tc = Series(values, dtype=np.dtype(any_numpy_dtype))
  45. expected = Series([False] * len(tc), dtype="bool")
  46. if tc.dtype == "bool":
  47. # 0 -> False and 1-> True
  48. # any other value would be duplicated
  49. tc = tc[:2]
  50. expected = expected[:2]
  51. tm.assert_series_equal(tc.duplicated(keep=keep), expected)
  52. result_dropped = tc.drop_duplicates(keep=keep)
  53. tm.assert_series_equal(result_dropped, tc)
  54. # validate shallow copy
  55. assert result_dropped is not tc
  56. class TestSeriesDropDuplicates:
  57. @pytest.fixture(
  58. params=["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"]
  59. )
  60. def dtype(self, request):
  61. return request.param
  62. @pytest.fixture
  63. def cat_series_unused_category(self, dtype, ordered):
  64. # Test case 1
  65. cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
  66. input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
  67. cat = Categorical(input1, categories=cat_array, ordered=ordered)
  68. tc1 = Series(cat)
  69. return tc1
  70. def test_drop_duplicates_categorical_non_bool(self, cat_series_unused_category):
  71. tc1 = cat_series_unused_category
  72. expected = Series([False, False, False, True])
  73. result = tc1.duplicated()
  74. tm.assert_series_equal(result, expected)
  75. result = tc1.drop_duplicates()
  76. tm.assert_series_equal(result, tc1[~expected])
  77. sc = tc1.copy()
  78. return_value = sc.drop_duplicates(inplace=True)
  79. assert return_value is None
  80. tm.assert_series_equal(sc, tc1[~expected])
  81. def test_drop_duplicates_categorical_non_bool_keeplast(
  82. self, cat_series_unused_category
  83. ):
  84. tc1 = cat_series_unused_category
  85. expected = Series([False, False, True, False])
  86. result = tc1.duplicated(keep="last")
  87. tm.assert_series_equal(result, expected)
  88. result = tc1.drop_duplicates(keep="last")
  89. tm.assert_series_equal(result, tc1[~expected])
  90. sc = tc1.copy()
  91. return_value = sc.drop_duplicates(keep="last", inplace=True)
  92. assert return_value is None
  93. tm.assert_series_equal(sc, tc1[~expected])
  94. def test_drop_duplicates_categorical_non_bool_keepfalse(
  95. self, cat_series_unused_category
  96. ):
  97. tc1 = cat_series_unused_category
  98. expected = Series([False, False, True, True])
  99. result = tc1.duplicated(keep=False)
  100. tm.assert_series_equal(result, expected)
  101. result = tc1.drop_duplicates(keep=False)
  102. tm.assert_series_equal(result, tc1[~expected])
  103. sc = tc1.copy()
  104. return_value = sc.drop_duplicates(keep=False, inplace=True)
  105. assert return_value is None
  106. tm.assert_series_equal(sc, tc1[~expected])
  107. @pytest.fixture
  108. def cat_series(self, dtype, ordered):
  109. # no unused categories, unlike cat_series_unused_category
  110. cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
  111. input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
  112. cat = Categorical(input2, categories=cat_array, ordered=ordered)
  113. tc2 = Series(cat)
  114. return tc2
  115. def test_drop_duplicates_categorical_non_bool2(self, cat_series):
  116. tc2 = cat_series
  117. expected = Series([False, False, False, False, True, True, False])
  118. result = tc2.duplicated()
  119. tm.assert_series_equal(result, expected)
  120. result = tc2.drop_duplicates()
  121. tm.assert_series_equal(result, tc2[~expected])
  122. sc = tc2.copy()
  123. return_value = sc.drop_duplicates(inplace=True)
  124. assert return_value is None
  125. tm.assert_series_equal(sc, tc2[~expected])
  126. def test_drop_duplicates_categorical_non_bool2_keeplast(self, cat_series):
  127. tc2 = cat_series
  128. expected = Series([False, True, True, False, False, False, False])
  129. result = tc2.duplicated(keep="last")
  130. tm.assert_series_equal(result, expected)
  131. result = tc2.drop_duplicates(keep="last")
  132. tm.assert_series_equal(result, tc2[~expected])
  133. sc = tc2.copy()
  134. return_value = sc.drop_duplicates(keep="last", inplace=True)
  135. assert return_value is None
  136. tm.assert_series_equal(sc, tc2[~expected])
  137. def test_drop_duplicates_categorical_non_bool2_keepfalse(self, cat_series):
  138. tc2 = cat_series
  139. expected = Series([False, True, True, False, True, True, False])
  140. result = tc2.duplicated(keep=False)
  141. tm.assert_series_equal(result, expected)
  142. result = tc2.drop_duplicates(keep=False)
  143. tm.assert_series_equal(result, tc2[~expected])
  144. sc = tc2.copy()
  145. return_value = sc.drop_duplicates(keep=False, inplace=True)
  146. assert return_value is None
  147. tm.assert_series_equal(sc, tc2[~expected])
  148. def test_drop_duplicates_categorical_bool(self, ordered):
  149. tc = Series(
  150. Categorical(
  151. [True, False, True, False], categories=[True, False], ordered=ordered
  152. )
  153. )
  154. expected = Series([False, False, True, True])
  155. tm.assert_series_equal(tc.duplicated(), expected)
  156. tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
  157. sc = tc.copy()
  158. return_value = sc.drop_duplicates(inplace=True)
  159. assert return_value is None
  160. tm.assert_series_equal(sc, tc[~expected])
  161. expected = Series([True, True, False, False])
  162. tm.assert_series_equal(tc.duplicated(keep="last"), expected)
  163. tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected])
  164. sc = tc.copy()
  165. return_value = sc.drop_duplicates(keep="last", inplace=True)
  166. assert return_value is None
  167. tm.assert_series_equal(sc, tc[~expected])
  168. expected = Series([True, True, True, True])
  169. tm.assert_series_equal(tc.duplicated(keep=False), expected)
  170. tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
  171. sc = tc.copy()
  172. return_value = sc.drop_duplicates(keep=False, inplace=True)
  173. assert return_value is None
  174. tm.assert_series_equal(sc, tc[~expected])
  175. def test_drop_duplicates_categorical_bool_na(self, nulls_fixture):
  176. # GH#44351
  177. ser = Series(
  178. Categorical(
  179. [True, False, True, False, nulls_fixture],
  180. categories=[True, False],
  181. ordered=True,
  182. )
  183. )
  184. result = ser.drop_duplicates()
  185. expected = Series(
  186. Categorical([True, False, np.nan], categories=[True, False], ordered=True),
  187. index=[0, 1, 4],
  188. )
  189. tm.assert_series_equal(result, expected)
  190. def test_drop_duplicates_ignore_index(self):
  191. # GH#48304
  192. ser = Series([1, 2, 2, 3])
  193. result = ser.drop_duplicates(ignore_index=True)
  194. expected = Series([1, 2, 3])
  195. tm.assert_series_equal(result, expected)