# test_duplicated.py (3.1 KB)

import re

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Series,
    date_range,
)
import pandas._testing as tm

  10. @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
  11. def test_duplicated_with_misspelled_column_name(subset):
  12. # GH 19730
  13. df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
  14. msg = re.escape("Index(['a'], dtype='object')")
  15. with pytest.raises(KeyError, match=msg):
  16. df.duplicated(subset)
  17. @pytest.mark.slow
  18. def test_duplicated_do_not_fail_on_wide_dataframes():
  19. # gh-21524
  20. # Given the wide dataframe with a lot of columns
  21. # with different (important!) values
  22. data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
  23. df = DataFrame(data).T
  24. result = df.duplicated()
  25. # Then duplicates produce the bool Series as a result and don't fail during
  26. # calculation. Actual values doesn't matter here, though usually it's all
  27. # False in this case
  28. assert isinstance(result, Series)
  29. assert result.dtype == np.bool_
  30. @pytest.mark.parametrize(
  31. "keep, expected",
  32. [
  33. ("first", Series([False, False, True, False, True])),
  34. ("last", Series([True, True, False, False, False])),
  35. (False, Series([True, True, True, False, True])),
  36. ],
  37. )
  38. def test_duplicated_keep(keep, expected):
  39. df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]})
  40. result = df.duplicated(keep=keep)
  41. tm.assert_series_equal(result, expected)
  42. @pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
  43. @pytest.mark.parametrize(
  44. "keep, expected",
  45. [
  46. ("first", Series([False, False, True, False, True])),
  47. ("last", Series([True, True, False, False, False])),
  48. (False, Series([True, True, True, False, True])),
  49. ],
  50. )
  51. def test_duplicated_nan_none(keep, expected):
  52. df = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object)
  53. result = df.duplicated(keep=keep)
  54. tm.assert_series_equal(result, expected)
  55. @pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
  56. def test_duplicated_subset(subset, keep):
  57. df = DataFrame(
  58. {
  59. "A": [0, 1, 1, 2, 0],
  60. "B": ["a", "b", "b", "c", "a"],
  61. "C": [np.nan, 3, 3, None, np.nan],
  62. }
  63. )
  64. if subset is None:
  65. subset = list(df.columns)
  66. elif isinstance(subset, str):
  67. # need to have a DataFrame, not a Series
  68. # -> select columns with singleton list, not string
  69. subset = [subset]
  70. expected = df[subset].duplicated(keep=keep)
  71. result = df.duplicated(keep=keep, subset=subset)
  72. tm.assert_series_equal(result, expected)
  73. def test_duplicated_on_empty_frame():
  74. # GH 25184
  75. df = DataFrame(columns=["a", "b"])
  76. dupes = df.duplicated("a")
  77. result = df[dupes]
  78. expected = df.copy()
  79. tm.assert_frame_equal(result, expected)
  80. def test_frame_datetime64_duplicated():
  81. dates = date_range("2010-07-01", end="2010-08-05")
  82. tst = DataFrame({"symbol": "AAA", "date": dates})
  83. result = tst.duplicated(["date", "symbol"])
  84. assert (-result).all()
  85. tst = DataFrame({"date": dates})
  86. result = tst.date.duplicated()
  87. assert (-result).all()