123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339 |
- from itertools import product
- import numpy as np
- import pytest
- from pandas._libs import hashtable
- from pandas import (
- NA,
- DatetimeIndex,
- MultiIndex,
- Series,
- )
- import pandas._testing as tm
@pytest.mark.parametrize("names", [None, ["first", "second"]])
def test_unique(names):
    # Each entry pairs the input arrays with the arrays expected after
    # MultiIndex.unique(); the expected index reuses the input's names.
    scenarios = [
        ([[1, 2, 1, 2], [1, 1, 1, 2]], [[1, 2, 2], [1, 1, 2]]),
        ([list("aaaa"), list("abab")], [list("aa"), list("ab")]),
        ([list("aaaa"), list("aaaa")], [["a"], ["a"]]),
    ]
    for arrays, unique_arrays in scenarios:
        midx = MultiIndex.from_arrays(arrays, names=names)
        expected = MultiIndex.from_arrays(unique_arrays, names=midx.names)
        tm.assert_index_equal(midx.unique(), expected)

    # GH #20568 - empty MI
    empty = MultiIndex.from_arrays([[], []], names=names)
    tm.assert_index_equal(empty, empty.unique())
def test_unique_datetimelike():
    # One tz-naive and one tz-aware level, both containing NaT entries.
    naive = DatetimeIndex(
        ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", "NaT", "NaT"]
    )
    aware = DatetimeIndex(
        ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"],
        tz="Asia/Tokyo",
    )
    deduplicated = MultiIndex.from_arrays([naive, aware]).unique()

    # Rows 0/1 and rows 2/3 are pairwise identical; the two NaT rows differ
    # in the second level, so both are kept.
    expected = MultiIndex.from_arrays(
        [
            DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"]),
            DatetimeIndex(
                ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo"
            ),
        ]
    )
    tm.assert_index_equal(deduplicated, expected)
@pytest.mark.parametrize("level", [0, "first", 1, "second"])
def test_unique_level(idx, level):
    # GH #17896 - with level= argument
    tm.assert_index_equal(
        idx.unique(level=level), idx.get_level_values(level).unique()
    )

    # For an index whose levels are already unique, and for an empty index,
    # unique(level=...) must match the plain level values.
    for arrays in ([[1, 3, 2, 4], [1, 3, 2, 5]], [[], []]):
        midx = MultiIndex.from_arrays(arrays, names=["first", "second"])
        tm.assert_index_equal(
            midx.unique(level=level), midx.get_level_values(level)
        )
def test_duplicate_multiindex_codes():
    # GH 17464
    # Make sure that a MultiIndex with duplicate levels throws a ValueError
    msg = r"Level values must be unique: \[[A', ]+\] on level 0"
    with pytest.raises(ValueError, match=msg):
        # The constructor is expected to raise, so its result is not bound
        # to a name (the binding would never take effect).
        MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)])

    # And that using set_levels with duplicate levels fails
    mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
    msg = r"Level values must be unique: \[[AB', ]+\] on level 0"
    with pytest.raises(ValueError, match=msg):
        mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]])
@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]])
def test_duplicate_level_names(names):
    # GH18872, GH19029
    level_values = [[0, 1]] * 3

    # Names supplied at construction time
    constructed = MultiIndex.from_product(level_values, names=names)
    assert constructed.names == names

    # With .rename()
    renamed = MultiIndex.from_product(level_values).rename(names)
    assert renamed.names == names

    # With .rename(., level=): once in place for the middle level, then
    # returning a new index for the outer two levels
    first, middle, last = names
    renamed.rename(middle, level=1, inplace=True)
    renamed = renamed.rename([first, last], level=[0, 2])
    assert renamed.names == names
def test_duplicate_meta_data():
    # GH 10115
    base = MultiIndex(
        levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]]
    )

    # The unnamed index plus variants with none, one, or both levels named
    candidates = [base]
    candidates.extend(
        base.set_names(level_names)
        for level_names in ([None, None], [None, "Num"], ["Upper", "Num"])
    )

    for candidate in candidates:
        assert candidate.has_duplicates
        # dropping duplicates must not lose the level names
        assert candidate.drop_duplicates().names == candidate.names
def test_has_duplicates(idx, idx_dup):
    # see fixtures
    assert idx.is_unique is True
    assert idx.has_duplicates is False
    assert idx_dup.is_unique is False
    assert idx_dup.has_duplicates is True

    # (levels, codes, expected is_unique) for indexes built directly from codes
    cases = [
        # a repeated (0, 0) row
        (
            [[0, 1], [0, 1, 2]],
            [[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]],
            False,
        ),
        # single instance of NaN
        (
            [["a", "b"], [0, 1]],
            [[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]],
            True,
        ),
        # multiple instances of NaN
        (
            [["a", "b"], [0, 1]],
            [[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]],
            False,
        ),
    ]
    for levels, codes, unique in cases:
        midx = MultiIndex(levels=levels, codes=codes)
        assert midx.is_unique is unique
        assert midx.has_duplicates is (not unique)
def test_has_duplicates_from_tuples():
    # GH 9075
    # Only the 4th and 8th fields vary between rows; the other six are
    # constant, so the tuples are generated from the varying pairs.
    varying = [
        (5, 169),
        (7, 119),
        (9, 135),
        (13, 145),
        (14, 158),
        (16, 122),
        (17, 160),
        (18, 180),
        (20, 143),
        (21, 128),
        (22, 129),
        (25, 111),
        (28, 114),
        (29, 121),
        (31, 126),
        (32, 155),
        (33, 123),
        (12, 144),
    ]
    rows = [
        ("x", "out", "z", outer, "y", "in", "z", inner) for outer, inner in varying
    ]

    midx = MultiIndex.from_tuples(rows)
    assert not midx.has_duplicates
@pytest.mark.parametrize("nlevels", [4, 8])
@pytest.mark.parametrize("with_nulls", [True, False])
def test_has_duplicates_overflow(nlevels, with_nulls):
    # handle int64 overflow if possible
    # no overflow with 4
    # overflow possible with 8
    base_codes = np.tile(np.arange(500), 2)
    level = np.arange(500)

    if with_nulls:  # inject some null values
        base_codes[500] = -1  # common nan value
        codes = []
        for pos in range(nlevels):
            # each level gets its own copy with one extra null at a
            # level-specific position
            level_codes = base_codes.copy()
            level_codes[500 + pos - nlevels // 2] = -1
            codes.append(level_codes)
        codes.append(np.array([-1, 1]).repeat(500))
    else:
        codes = [base_codes] * nlevels + [np.arange(2).repeat(500)]

    levels = [level] * nlevels + [[0, 1]]

    # no dups
    midx = MultiIndex(levels=levels, codes=codes)
    assert not midx.has_duplicates

    # with a dup
    if with_nulls:
        # append a copy of the first row by extending every code array
        codes = [np.insert(arr, 1000, arr[0]) for arr in codes]
        midx = MultiIndex(levels=levels, codes=codes)
    else:
        rows = midx.values.tolist()
        midx = MultiIndex.from_tuples(rows + [rows[0]])

    assert midx.has_duplicates
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", np.array([False, False, False, True, True, False])),
        ("last", np.array([False, True, True, False, False, False])),
        (False, np.array([False, True, True, True, True, False])),
    ],
)
def test_duplicated(idx_dup, keep, expected):
    # The boolean mask from duplicated() must match the expectation for
    # each ``keep`` policy.
    tm.assert_numpy_array_equal(idx_dup.duplicated(keep=keep), expected)
@pytest.mark.arm_slow
def test_duplicated_large(keep):
    # GH 9125
    n_unique, repeats = 200, 5000
    levels = [
        np.arange(n_unique),
        tm.makeStringIndex(n_unique),
        1000 + np.arange(n_unique),
    ]
    # one independent random code array per level
    codes = [np.random.choice(n_unique, repeats * n_unique) for _ in levels]
    midx = MultiIndex(levels=levels, codes=codes)

    # MultiIndex.duplicated must agree with the hashtable routine applied to
    # the materialized values
    expected = hashtable.duplicated(midx.values, keep=keep)
    tm.assert_numpy_array_equal(midx.duplicated(keep=keep), expected)
def test_duplicated2():
    # TODO: more informative test name
    # GH5873
    for first_value in (101, 102):
        midx = MultiIndex.from_arrays([[101, first_value], [3.5, np.nan]])
        assert not midx.has_duplicates
        tm.assert_numpy_array_equal(midx.duplicated(), np.zeros(2, dtype="bool"))

    # NOTE(review): assumed to run at function level rather than inside the
    # loop above -- confirm against the original layout.
    # n: 1st level shape, m: 2nd level shape
    for n, m in product(range(1, 6), range(1, 5)):
        # all possible unique combinations, including nan
        all_codes = list(product(range(-1, n), range(-1, m)))
        midx = MultiIndex(
            levels=[list("abcde")[:n], list("WXYZ")[:m]],
            codes=np.random.permutation(all_codes).T,
        )
        size = len(midx)
        assert size == (n + 1) * (m + 1)
        assert not midx.has_duplicates
        tm.assert_numpy_array_equal(midx.duplicated(), np.zeros(size, dtype="bool"))
def test_duplicated_drop_duplicates():
    # GH#4060
    idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))

    # (keyword arguments, expected duplicated() mask, arrays of the index
    # expected from drop_duplicates()); the empty dict exercises the default.
    scenarios = [
        (
            {},
            [False, False, False, True, False, False],
            ([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]),
        ),
        (
            {"keep": "last"},
            [True, False, False, False, False, False],
            ([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]),
        ),
        (
            {"keep": False},
            [True, False, False, True, False, False],
            ([2, 3, 2, 3], [1, 1, 2, 2]),
        ),
    ]
    for kwargs, mask, remaining in scenarios:
        flagged = idx.duplicated(**kwargs)
        tm.assert_numpy_array_equal(flagged, np.array(mask, dtype=bool))
        assert flagged.dtype == bool

        tm.assert_index_equal(
            idx.drop_duplicates(**kwargs), MultiIndex.from_arrays(remaining)
        )
@pytest.mark.parametrize(
    "dtype",
    [
        np.complex64,
        np.complex128,
    ],
)
def test_duplicated_series_complex_numbers(dtype):
    # GH 17927
    values = [
        np.nan + np.nan * 1j,
        0,
        1j,
        1j,
        1,
        1 + 1j,
        1 + 2j,
        1 + 1j,
        np.nan,
        np.nan + np.nan * 1j,
    ]
    flagged = Series(values, dtype=dtype).duplicated()

    # positions 3, 7 and 9 repeat an earlier value
    expected = Series(
        [False, False, False, True, False, False, False, True, False, True],
        dtype=bool,
    )
    tm.assert_series_equal(flagged, expected)
def test_midx_unique_ea_dtype():
    # GH#48335
    # First level is a masked Int64 series with repeated NA, second level is
    # a plain numpy array.
    level_a = Series([1, 2, NA, NA], dtype="Int64")
    level_b = np.array([1, 2, 3, 3])
    deduplicated = MultiIndex.from_arrays(
        [level_a, level_b], names=["a", "b"]
    ).unique()

    # the two trailing (NA, 3) rows collapse into one
    expected = MultiIndex.from_arrays(
        [Series([1, 2, NA], dtype="Int64"), np.array([1, 2, 3])],
        names=["a", "b"],
    )
    tm.assert_index_equal(deduplicated, expected)
|