123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473 |
- from datetime import datetime
- import re
- import numpy as np
- import pytest
- from pandas import (
- DataFrame,
- NaT,
- concat,
- )
- import pandas._testing as tm
- @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
- def test_drop_duplicates_with_misspelled_column_name(subset):
- # GH 19730
- df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
- msg = re.escape("Index(['a'], dtype='object')")
- with pytest.raises(KeyError, match=msg):
- df.drop_duplicates(subset)
- def test_drop_duplicates():
- df = DataFrame(
- {
- "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
- "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
- "C": [1, 1, 2, 2, 2, 2, 1, 2],
- "D": range(8),
- }
- )
- # single column
- result = df.drop_duplicates("AAA")
- expected = df[:2]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("AAA", keep="last")
- expected = df.loc[[6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("AAA", keep=False)
- expected = df.loc[[]]
- tm.assert_frame_equal(result, expected)
- assert len(result) == 0
- # multi column
- expected = df.loc[[0, 1, 2, 3]]
- result = df.drop_duplicates(np.array(["AAA", "B"]))
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(["AAA", "B"])
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(("AAA", "B"), keep="last")
- expected = df.loc[[0, 5, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(("AAA", "B"), keep=False)
- expected = df.loc[[0]]
- tm.assert_frame_equal(result, expected)
- # consider everything
- df2 = df.loc[:, ["AAA", "B", "C"]]
- result = df2.drop_duplicates()
- # in this case only
- expected = df2.drop_duplicates(["AAA", "B"])
- tm.assert_frame_equal(result, expected)
- result = df2.drop_duplicates(keep="last")
- expected = df2.drop_duplicates(["AAA", "B"], keep="last")
- tm.assert_frame_equal(result, expected)
- result = df2.drop_duplicates(keep=False)
- expected = df2.drop_duplicates(["AAA", "B"], keep=False)
- tm.assert_frame_equal(result, expected)
- # integers
- result = df.drop_duplicates("C")
- expected = df.iloc[[0, 2]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("C", keep="last")
- expected = df.iloc[[-2, -1]]
- tm.assert_frame_equal(result, expected)
- df["E"] = df["C"].astype("int8")
- result = df.drop_duplicates("E")
- expected = df.iloc[[0, 2]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("E", keep="last")
- expected = df.iloc[[-2, -1]]
- tm.assert_frame_equal(result, expected)
- # GH 11376
- df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
- expected = df.loc[df.index != 3]
- tm.assert_frame_equal(df.drop_duplicates(), expected)
- df = DataFrame([[1, 0], [0, 2]])
- tm.assert_frame_equal(df.drop_duplicates(), df)
- df = DataFrame([[-2, 0], [0, -4]])
- tm.assert_frame_equal(df.drop_duplicates(), df)
- x = np.iinfo(np.int64).max / 3 * 2
- df = DataFrame([[-x, x], [0, x + 4]])
- tm.assert_frame_equal(df.drop_duplicates(), df)
- df = DataFrame([[-x, x], [x, x + 4]])
- tm.assert_frame_equal(df.drop_duplicates(), df)
- # GH 11864
- df = DataFrame([i] * 9 for i in range(16))
- df = concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True)
- for keep in ["first", "last", False]:
- assert df.duplicated(keep=keep).sum() == 0
- def test_drop_duplicates_with_duplicate_column_names():
- # GH17836
- df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])
- result0 = df.drop_duplicates()
- tm.assert_frame_equal(result0, df)
- result1 = df.drop_duplicates("a")
- expected1 = df[:2]
- tm.assert_frame_equal(result1, expected1)
- def test_drop_duplicates_for_take_all():
- df = DataFrame(
- {
- "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
- "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
- "C": [1, 1, 2, 2, 2, 2, 1, 2],
- "D": range(8),
- }
- )
- # single column
- result = df.drop_duplicates("AAA")
- expected = df.iloc[[0, 1, 2, 6]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("AAA", keep="last")
- expected = df.iloc[[2, 5, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("AAA", keep=False)
- expected = df.iloc[[2, 6]]
- tm.assert_frame_equal(result, expected)
- # multiple columns
- result = df.drop_duplicates(["AAA", "B"])
- expected = df.iloc[[0, 1, 2, 3, 4, 6]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(["AAA", "B"], keep="last")
- expected = df.iloc[[0, 1, 2, 5, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(["AAA", "B"], keep=False)
- expected = df.iloc[[0, 1, 2, 6]]
- tm.assert_frame_equal(result, expected)
- def test_drop_duplicates_tuple():
- df = DataFrame(
- {
- ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
- "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
- "C": [1, 1, 2, 2, 2, 2, 1, 2],
- "D": range(8),
- }
- )
- # single column
- result = df.drop_duplicates(("AA", "AB"))
- expected = df[:2]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(("AA", "AB"), keep="last")
- expected = df.loc[[6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(("AA", "AB"), keep=False)
- expected = df.loc[[]] # empty df
- assert len(result) == 0
- tm.assert_frame_equal(result, expected)
- # multi column
- expected = df.loc[[0, 1, 2, 3]]
- result = df.drop_duplicates((("AA", "AB"), "B"))
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "df",
- [
- DataFrame(),
- DataFrame(columns=[]),
- DataFrame(columns=["A", "B", "C"]),
- DataFrame(index=[]),
- DataFrame(index=["A", "B", "C"]),
- ],
- )
- def test_drop_duplicates_empty(df):
- # GH 20516
- result = df.drop_duplicates()
- tm.assert_frame_equal(result, df)
- result = df.copy()
- result.drop_duplicates(inplace=True)
- tm.assert_frame_equal(result, df)
- def test_drop_duplicates_NA():
- # none
- df = DataFrame(
- {
- "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
- "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
- "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
- "D": range(8),
- }
- )
- # single column
- result = df.drop_duplicates("A")
- expected = df.loc[[0, 2, 3]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("A", keep="last")
- expected = df.loc[[1, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("A", keep=False)
- expected = df.loc[[]] # empty df
- tm.assert_frame_equal(result, expected)
- assert len(result) == 0
- # multi column
- result = df.drop_duplicates(["A", "B"])
- expected = df.loc[[0, 2, 3, 6]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(["A", "B"], keep="last")
- expected = df.loc[[1, 5, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(["A", "B"], keep=False)
- expected = df.loc[[6]]
- tm.assert_frame_equal(result, expected)
- # nan
- df = DataFrame(
- {
- "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
- "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
- "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
- "D": range(8),
- }
- )
- # single column
- result = df.drop_duplicates("C")
- expected = df[:2]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("C", keep="last")
- expected = df.loc[[3, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("C", keep=False)
- expected = df.loc[[]] # empty df
- tm.assert_frame_equal(result, expected)
- assert len(result) == 0
- # multi column
- result = df.drop_duplicates(["C", "B"])
- expected = df.loc[[0, 1, 2, 4]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(["C", "B"], keep="last")
- expected = df.loc[[1, 3, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates(["C", "B"], keep=False)
- expected = df.loc[[1]]
- tm.assert_frame_equal(result, expected)
- def test_drop_duplicates_NA_for_take_all():
- # none
- df = DataFrame(
- {
- "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
- "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
- }
- )
- # single column
- result = df.drop_duplicates("A")
- expected = df.iloc[[0, 2, 3, 5, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("A", keep="last")
- expected = df.iloc[[1, 4, 5, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("A", keep=False)
- expected = df.iloc[[5, 7]]
- tm.assert_frame_equal(result, expected)
- # nan
- # single column
- result = df.drop_duplicates("C")
- expected = df.iloc[[0, 1, 5, 6]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("C", keep="last")
- expected = df.iloc[[3, 5, 6, 7]]
- tm.assert_frame_equal(result, expected)
- result = df.drop_duplicates("C", keep=False)
- expected = df.iloc[[5, 6]]
- tm.assert_frame_equal(result, expected)
- def test_drop_duplicates_inplace():
- orig = DataFrame(
- {
- "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
- "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
- "C": [1, 1, 2, 2, 2, 2, 1, 2],
- "D": range(8),
- }
- )
- # single column
- df = orig.copy()
- return_value = df.drop_duplicates("A", inplace=True)
- expected = orig[:2]
- result = df
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- df = orig.copy()
- return_value = df.drop_duplicates("A", keep="last", inplace=True)
- expected = orig.loc[[6, 7]]
- result = df
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- df = orig.copy()
- return_value = df.drop_duplicates("A", keep=False, inplace=True)
- expected = orig.loc[[]]
- result = df
- tm.assert_frame_equal(result, expected)
- assert len(df) == 0
- assert return_value is None
- # multi column
- df = orig.copy()
- return_value = df.drop_duplicates(["A", "B"], inplace=True)
- expected = orig.loc[[0, 1, 2, 3]]
- result = df
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- df = orig.copy()
- return_value = df.drop_duplicates(["A", "B"], keep="last", inplace=True)
- expected = orig.loc[[0, 5, 6, 7]]
- result = df
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- df = orig.copy()
- return_value = df.drop_duplicates(["A", "B"], keep=False, inplace=True)
- expected = orig.loc[[0]]
- result = df
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- # consider everything
- orig2 = orig.loc[:, ["A", "B", "C"]].copy()
- df2 = orig2.copy()
- return_value = df2.drop_duplicates(inplace=True)
- # in this case only
- expected = orig2.drop_duplicates(["A", "B"])
- result = df2
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- df2 = orig2.copy()
- return_value = df2.drop_duplicates(keep="last", inplace=True)
- expected = orig2.drop_duplicates(["A", "B"], keep="last")
- result = df2
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- df2 = orig2.copy()
- return_value = df2.drop_duplicates(keep=False, inplace=True)
- expected = orig2.drop_duplicates(["A", "B"], keep=False)
- result = df2
- tm.assert_frame_equal(result, expected)
- assert return_value is None
- @pytest.mark.parametrize("inplace", [True, False])
- @pytest.mark.parametrize(
- "origin_dict, output_dict, ignore_index, output_index",
- [
- ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]),
- ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]),
- ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]),
- ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]),
- ],
- )
- def test_drop_duplicates_ignore_index(
- inplace, origin_dict, output_dict, ignore_index, output_index
- ):
- # GH 30114
- df = DataFrame(origin_dict)
- expected = DataFrame(output_dict, index=output_index)
- if inplace:
- result_df = df.copy()
- result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace)
- else:
- result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace)
- tm.assert_frame_equal(result_df, expected)
- tm.assert_frame_equal(df, DataFrame(origin_dict))
- def test_drop_duplicates_null_in_object_column(nulls_fixture):
- # https://github.com/pandas-dev/pandas/issues/32992
- df = DataFrame([[1, nulls_fixture], [2, "a"]], dtype=object)
- result = df.drop_duplicates()
- tm.assert_frame_equal(result, df)
- def test_drop_duplicates_series_vs_dataframe(keep):
- # GH#14192
- df = DataFrame(
- {
- "a": [1, 1, 1, "one", "one"],
- "b": [2, 2, np.nan, np.nan, np.nan],
- "c": [3, 3, np.nan, np.nan, "three"],
- "d": [1, 2, 3, 4, 4],
- "e": [
- datetime(2015, 1, 1),
- datetime(2015, 1, 1),
- datetime(2015, 2, 1),
- NaT,
- NaT,
- ],
- }
- )
- for column in df.columns:
- dropped_frame = df[[column]].drop_duplicates(keep=keep)
- dropped_series = df[column].drop_duplicates(keep=keep)
- tm.assert_frame_equal(dropped_frame, dropped_series.to_frame())
- @pytest.mark.parametrize("arg", [[1], 1, "True", [], 0])
- def test_drop_duplicates_non_boolean_ignore_index(arg):
- # GH#38274
- df = DataFrame({"a": [1, 2, 1, 3]})
- msg = '^For argument "ignore_index" expected type bool, received type .*.$'
- with pytest.raises(ValueError, match=msg):
- df.drop_duplicates(ignore_index=arg)
|