123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277 |
- import re
- import numpy as np
- import pytest
- from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
- import pandas as pd
- from pandas import (
- Index,
- IntervalIndex,
- MultiIndex,
- RangeIndex,
- )
- import pandas._testing as tm
def test_labels_dtypes():
    """Check the integer dtype of MultiIndex codes for several level sizes.

    Also checks that every code of a 1000-element product index is
    non-negative.
    """
    # GH 8456
    from_tuples = MultiIndex.from_tuples([("A", 1), ("A", 2)])
    assert from_tuples.codes[0].dtype == "int8"
    assert from_tuples.codes[1].dtype == "int8"

    # dtype expected for the second level's codes, keyed by that level's size
    dtype_by_size = {40: "int8", 400: "int16", 40000: "int32"}
    for size, expected_dtype in dtype_by_size.items():
        product = MultiIndex.from_product([["a"], range(size)])
        assert product.codes[1].dtype == expected_dtype

    product = MultiIndex.from_product([["a"], range(1000)])
    for level in (0, 1):
        assert (product.codes[level] >= 0).all()
def test_values_boxed():
    """``MultiIndex.values`` equals an object array built from the tuples.

    The second level holds Timestamps plus one ``NaT``.
    """
    keys = [1, 2, 3, 1, 2, 3]
    stamps = [
        pd.Timestamp("2000-01-01"),
        pd.NaT,
        pd.Timestamp("2000-01-03"),
        pd.Timestamp("2000-01-04"),
        pd.Timestamp("2000-01-02"),
        pd.Timestamp("2000-01-03"),
    ]
    tuples = list(zip(keys, stamps))

    mi = MultiIndex.from_tuples(tuples)
    as_object_array = construct_1d_object_array_from_listlike(tuples)
    tm.assert_numpy_array_equal(mi.values, as_object_array)

    # Check that code branches for boxed values produce identical results:
    # slicing the values must match the values of the sliced index.
    sliced_values = mi.values[:4]
    values_of_slice = mi[:4].values
    tm.assert_numpy_array_equal(sliced_values, values_of_slice)
def test_values_multiindex_datetimeindex():
    """Values of a naive/tz-aware datetime MultiIndex rebuild both levels.

    The check runs on the full index and on its first two rows.
    """
    # Test to ensure we hit the boxing / nobox part of MI.values
    ints = np.arange(10**18, 10**18 + 5)
    naive = pd.DatetimeIndex(ints)
    aware = pd.DatetimeIndex(ints, tz="US/Central")
    idx = MultiIndex.from_arrays([naive, aware])

    def _check_levels(mi, expected_outer, expected_inner):
        # Rebuild each level from the tuples in ``mi.values`` and compare.
        rows = mi.values
        rebuilt_outer = pd.DatetimeIndex([row[0] for row in rows])
        tm.assert_index_equal(rebuilt_outer, expected_outer)
        rebuilt_inner = pd.DatetimeIndex([row[1] for row in rows])
        tm.assert_index_equal(rebuilt_inner, expected_inner)

    _check_levels(idx, naive, aware)

    # n_lev > n_lab
    _check_levels(idx[:2], naive[:2], aware[:2])
def test_values_multiindex_periodindex():
    """Values of an int/Period MultiIndex rebuild both original levels.

    The check runs on the full index and on its first two rows.
    """
    # Test to ensure we hit the boxing / nobox part of MI.values
    ints = np.arange(2007, 2012)
    pidx = pd.PeriodIndex(ints, freq="D")
    idx = MultiIndex.from_arrays([ints, pidx])

    def _check_levels(mi, expected_ints, expected_periods):
        # Rebuild each level from the tuples in ``mi.values`` and compare.
        rows = mi.values
        rebuilt_outer = Index([row[0] for row in rows])
        tm.assert_index_equal(rebuilt_outer, Index(expected_ints, dtype=np.int64))
        rebuilt_inner = pd.PeriodIndex([row[1] for row in rows])
        tm.assert_index_equal(rebuilt_inner, expected_periods)

    _check_levels(idx, ints, pidx)

    # n_lev > n_lab
    _check_levels(idx[:2], ints[:2], pidx[:2])
def test_consistency():
    """Build a MultiIndex over 70000 x 10 levels, then one with repeated codes.

    The first construction only has to succeed; the second index repeats the
    code pair (1, 1) and must report ``is_unique is False``.
    """
    # need to construct an overflow
    levels = [list(range(70000)), list(range(10))]

    # the fact that it works means it's consistent
    MultiIndex(
        levels=levels,
        codes=[np.arange(70000), np.repeat(range(10), 7000)],
    )

    # inconsistent
    duplicated_codes = [
        np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]),
        np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]),
    ]
    index = MultiIndex(levels=levels, codes=duplicated_codes)

    assert index.is_unique is False
@pytest.mark.slow
def test_hash_collisions():
    """Look up every entry of a 1000 x 1000 product index in itself.

    ``get_indexer`` must return each row's own position, and ``get_loc`` must
    do the same for the first two and last two entries.
    """
    # non-smoke test that we don't get hash collisions
    index = MultiIndex.from_product(
        [np.arange(1000), np.arange(1000)], names=["one", "two"]
    )

    own_positions = index.get_indexer(index.values)
    tm.assert_numpy_array_equal(own_positions, np.arange(len(index), dtype="intp"))

    last = len(index) - 1
    for position in (0, 1, last - 1, last):
        assert index.get_loc(index[position]) == position
def test_dims():
    """Placeholder test; it performs no checks."""
def test_take_invalid_kwargs():
    """``MultiIndex.take`` raises for ``foo``, ``out`` and ``mode`` keywords."""
    dates = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
    idx = MultiIndex.from_product([["A", "B"], dates], names=["str", "dt"])
    indices = [1, 2]

    # (exception type, expected message pattern, keyword arguments to pass)
    rejected = [
        (
            TypeError,
            r"take\(\) got an unexpected keyword argument 'foo'",
            {"foo": 2},
        ),
        (ValueError, "the 'out' parameter is not supported", {"out": indices}),
        (ValueError, "the 'mode' parameter is not supported", {"mode": "clip"}),
    ]
    for exc_type, msg, kwargs in rejected:
        with pytest.raises(exc_type, match=msg):
            idx.take(indices, **kwargs)
def test_isna_behavior(idx):
    """``pd.isna`` on the ``idx`` fixture raises ``NotImplementedError``."""
    # should not segfault GH5123
    # NOTE: if MI representation changes, may make sense to allow
    # isna(MI)
    expected_failure = pytest.raises(
        NotImplementedError, match="isna is not defined for MultiIndex"
    )
    with expected_failure:
        pd.isna(idx)
def test_large_multiindex_error():
    """Missing keys raise ``KeyError`` just below and just above 1,000,000 rows.

    Two frames are built, with 2 * 499999 and 2 * 500001 rows; on each one,
    ``.loc`` with an absent first-level key must raise ``KeyError`` whose
    message is exactly that key.
    """
    # GH12527
    missing_keys = [((-1, 0), r"^\(-1, 0\)$"), ((3, 0), r"^\(3, 0\)$")]

    for inner_size in (499999, 500001):
        index = MultiIndex.from_product([[1, 2], range(inner_size)])
        frame = pd.DataFrame(1, index=index, columns=["dest"])
        for key, pattern in missing_keys:
            with pytest.raises(KeyError, match=pattern):
                frame.loc[key, "dest"]
def test_million_record_attribute_error():
    """A missing Series attribute raises ``AttributeError`` on a 1M-row frame.

    The frame is indexed by a MultiIndex of one million ``(x, x)`` tuples.
    """
    # GH 18165
    values = list(range(1000000))
    index = MultiIndex.from_tuples(list(zip(values, values)))
    df = pd.DataFrame({"a": values, "b": values}, index=index)

    with pytest.raises(AttributeError, match="'Series' object has no attribute 'foo'"):
        df["a"].foo()
def test_can_hold_identifiers(idx):
    """``_can_hold_identifiers_and_holds_name`` is True for the first entry."""
    first_entry = idx[0]
    outcome = idx._can_hold_identifiers_and_holds_name(first_entry)
    assert outcome is True
def test_metadata_immutable(idx):
    """Assigning into levels, codes or names of the ``idx`` fixture raises."""
    levels = idx.levels
    codes = idx.codes
    mutable_regex = re.compile("does not support mutable operations")

    def rejects_mutation():
        # Fresh context manager for each attempted assignment.
        return pytest.raises(TypeError, match=mutable_regex)

    # shouldn't be able to set at either the top level or base level
    with rejects_mutation():
        levels[0] = levels[0]
    with rejects_mutation():
        levels[0][0] = levels[0][0]

    # ditto for labels; the element write fails with a ValueError instead
    with rejects_mutation():
        codes[0] = codes[0]
    with pytest.raises(ValueError, match="assignment destination is read-only"):
        codes[0][0] = codes[0][0]

    # and for names
    names = idx.names
    with rejects_mutation():
        names[0] = names[0]
def test_level_setting_resets_attributes():
    """``set_levels`` returns an index whose monotonicity is re-evaluated."""
    before = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
    assert before.is_monotonic_increasing

    after = before.set_levels([["A", "B"], [1, 3, 2]])
    # if this fails, probably didn't reset the cache correctly.
    assert not after.is_monotonic_increasing
def test_rangeindex_fallback_coercion_bug():
    """Concatenated stacked frames keep int64 level values in their index.

    Two stacked 10 x 10 frames are concatenated column-wise; the result must
    equal the expected frame, and both index levels must come back as int64
    indexes.
    """
    # GH 12893
    stacked = {
        label: pd.DataFrame(np.arange(100).reshape((10, 10))).stack()
        for label in ("df1", "df2")
    }
    df = pd.concat(stacked, axis=1)
    df.index.names = ["fizz", "buzz"]

    # build the string form; the result itself is not inspected
    str(df)

    expected_index = MultiIndex.from_product(
        [range(10), range(10)], names=["fizz", "buzz"]
    )
    expected_frame = pd.DataFrame(
        {"df2": np.arange(100), "df1": np.arange(100)},
        index=expected_index,
    )
    tm.assert_frame_equal(df, expected_frame, check_like=True)

    fizz_values = df.index.get_level_values("fizz")
    expected_fizz = Index(np.arange(10, dtype=np.int64), name="fizz").repeat(10)
    tm.assert_index_equal(fizz_values, expected_fizz)

    buzz_values = df.index.get_level_values("buzz")
    expected_buzz = Index(np.tile(np.arange(10, dtype=np.int64), 10), name="buzz")
    tm.assert_index_equal(buzz_values, expected_buzz)
def test_memory_usage(idx):
    """Compare ``memory_usage`` before and after a ``get_loc`` lookup.

    An empty index must report 0. Otherwise, usage after the lookup must
    exceed the initial figure (except for RangeIndex / IntervalIndex), and the
    deep figure must exceed the shallow one when ``inferred_type`` is
    ``"object"``.
    """
    baseline = idx.memory_usage()

    if not len(idx):
        # we report 0 for no-length
        assert baseline == 0
        return

    idx.get_loc(idx[0])
    after_lookup = idx.memory_usage()
    deep_usage = idx.memory_usage(deep=True)

    # RangeIndex, IntervalIndex
    # don't have engines
    has_engine = not isinstance(idx, (RangeIndex, IntervalIndex))
    if has_engine:
        assert after_lookup > baseline

    if idx.inferred_type == "object":
        assert deep_usage > after_lookup
def test_nlevels(idx):
    """The ``idx`` fixture must report exactly two levels."""
    level_count = idx.nlevels
    assert level_count == 2
|