123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- import numpy as np
- import pytest
- import pandas._libs.index as _index
- from pandas.errors import PerformanceWarning
- import pandas as pd
- from pandas import (
- CategoricalDtype,
- DataFrame,
- Index,
- MultiIndex,
- Series,
- )
- import pandas._testing as tm
- from pandas.core.arrays.boolean import BooleanDtype
- class TestMultiIndexBasic:
- def test_multiindex_perf_warn(self):
- df = DataFrame(
- {
- "jim": [0, 0, 1, 1],
- "joe": ["x", "x", "z", "y"],
- "jolie": np.random.rand(4),
- }
- ).set_index(["jim", "joe"])
- with tm.assert_produces_warning(PerformanceWarning):
- df.loc[(1, "z")]
- df = df.iloc[[2, 1, 3, 0]]
- with tm.assert_produces_warning(PerformanceWarning):
- df.loc[(0,)]
- def test_indexing_over_hashtable_size_cutoff(self):
- n = 10000
- old_cutoff = _index._SIZE_CUTOFF
- _index._SIZE_CUTOFF = 20000
- s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
- # hai it works!
- assert s[("a", 5)] == 5
- assert s[("a", 6)] == 6
- assert s[("a", 7)] == 7
- _index._SIZE_CUTOFF = old_cutoff
- def test_multi_nan_indexing(self):
- # GH 3588
- df = DataFrame(
- {
- "a": ["R1", "R2", np.nan, "R4"],
- "b": ["C1", "C2", "C3", "C4"],
- "c": [10, 15, np.nan, 20],
- }
- )
- result = df.set_index(["a", "b"], drop=False)
- expected = DataFrame(
- {
- "a": ["R1", "R2", np.nan, "R4"],
- "b": ["C1", "C2", "C3", "C4"],
- "c": [10, 15, np.nan, 20],
- },
- index=[
- Index(["R1", "R2", np.nan, "R4"], name="a"),
- Index(["C1", "C2", "C3", "C4"], name="b"),
- ],
- )
- tm.assert_frame_equal(result, expected)
- def test_exclusive_nat_column_indexing(self):
- # GH 38025
- # test multi indexing when one column exclusively contains NaT values
- df = DataFrame(
- {
- "a": [pd.NaT, pd.NaT, pd.NaT, pd.NaT],
- "b": ["C1", "C2", "C3", "C4"],
- "c": [10, 15, np.nan, 20],
- }
- )
- df = df.set_index(["a", "b"])
- expected = DataFrame(
- {
- "c": [10, 15, np.nan, 20],
- },
- index=[
- Index([pd.NaT, pd.NaT, pd.NaT, pd.NaT], name="a"),
- Index(["C1", "C2", "C3", "C4"], name="b"),
- ],
- )
- tm.assert_frame_equal(df, expected)
- def test_nested_tuples_duplicates(self):
- # GH#30892
- dti = pd.to_datetime(["20190101", "20190101", "20190102"])
- idx = Index(["a", "a", "c"])
- mi = MultiIndex.from_arrays([dti, idx], names=["index1", "index2"])
- df = DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi)
- expected = DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi)
- df2 = df.copy(deep=True)
- df2.loc[(dti[0], "a"), "c2"] = 1.0
- tm.assert_frame_equal(df2, expected)
- df3 = df.copy(deep=True)
- df3.loc[[(dti[0], "a")], "c2"] = 1.0
- tm.assert_frame_equal(df3, expected)
- def test_multiindex_with_datatime_level_preserves_freq(self):
- # https://github.com/pandas-dev/pandas/issues/35563
- idx = Index(range(2), name="A")
- dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
- mi = MultiIndex.from_product([idx, dti])
- df = DataFrame(np.random.randn(14, 2), index=mi)
- result = df.loc[0].index
- tm.assert_index_equal(result, dti)
- assert result.freq == dti.freq
- def test_multiindex_complex(self):
- # GH#42145
- complex_data = [1 + 2j, 4 - 3j, 10 - 1j]
- non_complex_data = [3, 4, 5]
- result = DataFrame(
- {
- "x": complex_data,
- "y": non_complex_data,
- "z": non_complex_data,
- }
- )
- result.set_index(["x", "y"], inplace=True)
- expected = DataFrame(
- {"z": non_complex_data},
- index=MultiIndex.from_arrays(
- [complex_data, non_complex_data],
- names=("x", "y"),
- ),
- )
- tm.assert_frame_equal(result, expected)
- def test_rename_multiindex_with_duplicates(self):
- # GH 38015
- mi = MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")])
- df = DataFrame(index=mi)
- df = df.rename(index={"A": "Apple"}, level=0)
- mi2 = MultiIndex.from_tuples([("Apple", "cat"), ("B", "cat"), ("B", "cat")])
- expected = DataFrame(index=mi2)
- tm.assert_frame_equal(df, expected)
- def test_series_align_multiindex_with_nan_overlap_only(self):
- # GH 38439
- mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
- mi2 = MultiIndex.from_arrays([[np.nan, 82.0], [np.nan, np.nan]])
- ser1 = Series([1, 2], index=mi1)
- ser2 = Series([1, 2], index=mi2)
- result1, result2 = ser1.align(ser2)
- mi = MultiIndex.from_arrays([[81.0, 82.0, np.nan], [np.nan, np.nan, np.nan]])
- expected1 = Series([1.0, np.nan, 2.0], index=mi)
- expected2 = Series([np.nan, 2.0, 1.0], index=mi)
- tm.assert_series_equal(result1, expected1)
- tm.assert_series_equal(result2, expected2)
- def test_series_align_multiindex_with_nan(self):
- # GH 38439
- mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
- mi2 = MultiIndex.from_arrays([[np.nan, 81.0], [np.nan, np.nan]])
- ser1 = Series([1, 2], index=mi1)
- ser2 = Series([1, 2], index=mi2)
- result1, result2 = ser1.align(ser2)
- mi = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
- expected1 = Series([1, 2], index=mi)
- expected2 = Series([2, 1], index=mi)
- tm.assert_series_equal(result1, expected1)
- tm.assert_series_equal(result2, expected2)
- def test_nunique_smoke(self):
- # GH 34019
- n = DataFrame([[1, 2], [1, 2]]).set_index([0, 1]).index.nunique()
- assert n == 1
- def test_multiindex_repeated_keys(self):
- # GH19414
- tm.assert_series_equal(
- Series([1, 2], MultiIndex.from_arrays([["a", "b"]])).loc[
- ["a", "a", "b", "b"]
- ],
- Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])),
- )
- def test_multiindex_with_na_missing_key(self):
- # GH46173
- df = DataFrame.from_dict(
- {
- ("foo",): [1, 2, 3],
- ("bar",): [5, 6, 7],
- (None,): [8, 9, 0],
- }
- )
- with pytest.raises(KeyError, match="missing_key"):
- df[[("missing_key",)]]
- def test_multiindex_dtype_preservation(self):
- # GH51261
- columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"])
- df = DataFrame(["value"], columns=columns).astype("category")
- df_no_multiindex = df["A"]
- assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype)
- # geopandas 1763 analogue
- df = DataFrame(
- [[1, 0], [0, 1]],
- columns=[
- ["foo", "foo"],
- ["location", "location"],
- ["x", "y"],
- ],
- ).assign(bools=Series([True, False], dtype="boolean"))
- assert isinstance(df["bools"].dtype, BooleanDtype)
|