123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393 |
- import numpy as np
- import pytest
- from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- Series,
- )
- import pandas._testing as tm
- from pandas.core.indexing import IndexingError
- # ----------------------------------------------------------------------------
- # test indexing of Series with multi-level Index
- # ----------------------------------------------------------------------------
@pytest.mark.parametrize(
    "access_method",
    [
        lambda s, x: s[:, x],
        lambda s, x: s.loc[:, x],
        lambda s, x: s.xs(x, level=1),
    ],
)
@pytest.mark.parametrize(
    "level1_value, expected",
    [
        (0, Series([1], index=[0])),
        (1, Series([2, 3], index=[1, 2])),
    ],
)
def test_series_getitem_multiindex(access_method, level1_value, expected):
    """Select by a value of the second index level of a two-level Series.

    The same selection is made through ``[]``, ``.loc`` and ``.xs``; each
    must return the matching rows indexed by the first level only.
    """
    # GH 6018
    # series regression getitem with a multi-index
    tuples = [(0, 0), (1, 1), (2, 1)]
    ser = Series([1, 2, 3], index=MultiIndex.from_tuples(tuples, names=["A", "B"]))

    # the level that remains after the selection is the one named "A"
    expected.index.name = "A"

    tm.assert_series_equal(access_method(ser, level1_value), expected)
@pytest.mark.parametrize("level0_value", ["D", "A"])
def test_series_getitem_duplicates_multiindex(level0_value):
    """Look up a first-level label in a MultiIndex with repeated codes."""
    # GH 5725 the 'A' happens to be a valid Timestamp so it doesn't raise
    # the appropriate error, only in PY3 of course!
    tags = [level0_value, "B", "C"]
    days = [0, 26, 27, 37, 57, 67, 75, 82]
    index = MultiIndex(
        levels=[tags, days],
        codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
        names=["tag", "day"],
    )
    values = np.random.randn(len(index), 1)
    df = DataFrame(values, index=index, columns=["val"])

    # confirm indexing on missing value raises KeyError; "A" is only
    # missing when it is not the label under test
    missing = [("X", r"^'X'$")]
    if level0_value != "A":
        missing.insert(0, ("A", r"^'A'$"))
    for key, pattern in missing:
        with pytest.raises(KeyError, match=pattern):
            df.val[key]

    # the first three rows carry the first-level label under test
    expected_index = Index([26, 37, 57], name="day")
    expected = Series(values.ravel()[0:3], name="val", index=expected_index)
    tm.assert_series_equal(df.val[level0_value], expected)
def test_series_getitem(multiindex_year_month_day_dataframe_random_data, indexer_sl):
    """Partial key ``(2000, 3)`` returns the matching rows without the first two levels."""
    ser = multiindex_year_month_day_dataframe_random_data["A"]

    result = indexer_sl(ser)[2000, 3]

    # rows 42..64 are expected to be the ones selected by (2000, 3);
    # the two leading levels are consumed by the key
    expected = ser.reindex(ser.index[42:65])
    expected.index = expected.index.droplevel([0, 1])

    tm.assert_series_equal(result, expected)
def test_series_getitem_returns_scalar(
    multiindex_year_month_day_dataframe_random_data, indexer_sl
):
    """A full three-level key returns the single value stored at that row."""
    ser = multiindex_year_month_day_dataframe_random_data["A"]

    # the full key (2000, 3, 10) is expected to address position 49
    by_label = indexer_sl(ser)[2000, 3, 10]
    by_position = ser.iloc[49]

    assert by_label == by_position
@pytest.mark.parametrize(
    "indexer,expected_error,expected_error_msg",
    [
        # label-based lookups of a key that is not in the index
        (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^\(2000, 3, 4\)$"),
        (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
        (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
        # key longer than the number of index levels
        (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"),
        # TODO: the two matches below should include len(s)
        (lambda s: s.__getitem__(len(s)), KeyError, ""),
        (lambda s: s[len(s)], KeyError, ""),
        # positional lookup one past the end
        (
            lambda s: s.iloc[len(s)],
            IndexError,
            "single positional indexer is out-of-bounds",
        ),
    ],
)
def test_series_getitem_indexing_errors(
    multiindex_year_month_day_dataframe_random_data,
    indexer,
    expected_error,
    expected_error_msg,
):
    """Each invalid lookup raises ``expected_error`` matching ``expected_error_msg``."""
    ser = multiindex_year_month_day_dataframe_random_data["A"]
    with pytest.raises(expected_error, match=expected_error_msg):
        indexer(ser)
def test_series_getitem_corner_generator(
    multiindex_year_month_day_dataframe_random_data,
):
    """Indexing with a generator of booleans matches indexing with a boolean mask."""
    ser = multiindex_year_month_day_dataframe_random_data["A"]

    lazy_mask = (value > 0 for value in ser)

    tm.assert_series_equal(ser[lazy_mask], ser[ser > 0])
- # ----------------------------------------------------------------------------
- # test indexing of DataFrame with multi-level Index
- # ----------------------------------------------------------------------------
def test_getitem_simple(multiindex_dataframe_random_data):
    """Selecting the column ``("foo", "one")`` returns the first column's values."""
    transposed = multiindex_dataframe_random_data.T

    first_column = transposed.values[:, 0]
    selected = transposed["foo", "one"]

    tm.assert_almost_equal(selected.values, first_column)
@pytest.mark.parametrize(
    "indexer,expected_error_msg",
    [
        (lambda df: df[("foo", "four")], r"^\('foo', 'four'\)$"),
        (lambda df: df["foobar"], r"^'foobar'$"),
    ],
)
def test_frame_getitem_simple_key_error(
    multiindex_dataframe_random_data, indexer, expected_error_msg
):
    """A column key absent from the MultiIndex columns raises ``KeyError``."""
    transposed = multiindex_dataframe_random_data.T
    with pytest.raises(KeyError, match=expected_error_msg):
        indexer(transposed)
def test_frame_getitem_multicolumn_empty_level():
    """Select a top-level column whose second-level label is the empty string."""
    top = ["level1 item1", "level1 item2"]
    middle = ["", "level2 item2"]
    bottom = ["level3 item1", "level3 item2"]

    df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]})
    df.columns = [top, middle, bottom]

    # only the third-level label is left on the selected column
    expected = DataFrame(
        [["1"], ["2"], ["3"]], index=df.index, columns=["level3 item1"]
    )

    tm.assert_frame_equal(df["level1 item1"], expected)
@pytest.mark.parametrize(
    "indexer,expected_slice",
    [
        (lambda df: df["foo"], slice(3)),
        (lambda df: df["bar"], slice(3, 5)),
        (lambda df: df.loc[:, "bar"], slice(3, 5)),
    ],
)
def test_frame_getitem_toplevel(
    multiindex_dataframe_random_data, indexer, expected_slice
):
    """Selecting a top-level column label returns its sub-columns, top level dropped."""
    transposed = multiindex_dataframe_random_data.T

    result = indexer(transposed)

    selected_columns = transposed.columns[expected_slice]
    expected = transposed.reindex(columns=selected_columns)
    expected.columns = expected.columns.droplevel(0)

    tm.assert_frame_equal(result, expected)
def test_frame_mixed_depth_get():
    """Partial column keys resolve to columns padded with empty-string labels."""
    arrays = [
        ["a", "top", "top", "routine1", "routine1", "routine2"],
        ["", "OD", "OD", "result1", "result2", "result1"],
        ["", "wx", "wy", "", "", ""],
    ]
    columns = MultiIndex.from_tuples(sorted(zip(*arrays)))
    df = DataFrame(np.random.randn(4, 6), columns=columns)

    # (partial key, the same key padded out to all three levels)
    cases = [
        ("a", ("a", "", "")),
        (("routine1", "result1"), ("routine1", "result1", "")),
    ]
    for partial_key, full_key in cases:
        result = df[partial_key]
        # the result is named after the partial key, not the padded one
        expected = df[full_key].rename(partial_key)
        tm.assert_series_equal(result, expected)
def test_frame_getitem_nan_multiindex(nulls_fixture):
    """Slice with ``.loc`` on a two-level index whose second level is all null."""
    # GH#29751
    # loc on a multiindex containing nan values
    n = nulls_fixture  # for code readability
    cols = ["a", "b", "c"]

    def build(rows):
        # index on ("a", "b") and pin the remaining column to int64
        frame = DataFrame(rows, columns=cols).set_index(["a", "b"])
        frame["c"] = frame["c"].astype("int64")
        return frame

    df = build([[11, n, 13], [21, n, 23], [31, n, 33], [41, n, 43]])

    # (label slice, rows expected back): open start, open end, closed range
    cases = [
        (slice(None, (21, n)), [[11, n, 13], [21, n, 23]]),
        (slice((21, n), None), [[21, n, 23], [31, n, 33], [41, n, 43]]),
        (slice((21, n), (31, n)), [[21, n, 23], [31, n, 33]]),
    ]
    for label_slice, expected_rows in cases:
        result = df.loc[label_slice]
        expected = build(expected_rows)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "indexer,expected",
    [
        (
            (["b"], ["bar", np.nan]),
            DataFrame(
                [[2, 3], [5, 6]],
                columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]),
                dtype="int64",
            ),
        ),
        (
            ["a", "b"],
            DataFrame(
                [[1, 2, 3], [4, 5, 6]],
                columns=MultiIndex.from_tuples(
                    [("a", "foo"), ("b", "bar"), ("b", np.nan)]
                ),
                dtype="int64",
            ),
        ),
        (
            ["b"],
            DataFrame(
                [[2, 3], [5, 6]],
                columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]),
                dtype="int64",
            ),
        ),
        (
            (["b"], ["bar"]),
            DataFrame(
                [[2], [5]],
                columns=MultiIndex.from_tuples([("b", "bar")]),
                dtype="int64",
            ),
        ),
        (
            (["b"], [np.nan]),
            DataFrame(
                [[3], [6]],
                columns=MultiIndex(
                    codes=[[1], [-1]], levels=[["a", "b"], ["bar", "foo"]]
                ),
                dtype="int64",
            ),
        ),
        (("b", np.nan), Series([3, 6], dtype="int64", name=("b", np.nan))),
    ],
)
def test_frame_getitem_nan_cols_multiindex(
    indexer,
    expected,
    nulls_fixture,
):
    """Select columns with ``.loc`` from MultiIndex columns containing a null label."""
    # Slicing MultiIndex including levels with nan values, for more information
    # see GH#25154
    columns = MultiIndex.from_tuples(
        [("a", "foo"), ("b", "bar"), ("b", nulls_fixture)]
    )
    df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=columns, dtype="int64")

    tm.assert_equal(df.loc[:, indexer], expected)
- # ----------------------------------------------------------------------------
- # test indexing of DataFrame with multi-level Index with duplicates
- # ----------------------------------------------------------------------------
@pytest.fixture
def dataframe_with_duplicate_index():
    """Fixture for DataFrame used in tests for gh-4145 and gh-4146"""
    # the column pair ("A", "B2") appears twice (codes 0/3 at positions 1 and 2)
    main_codes = [0, 0, 0, 1, 1, 1]
    sub_codes = [0, 3, 3, 0, 1, 2]
    columns = MultiIndex(
        levels=[["A", "B"], ["A1", "A2", "B1", "B2"]],
        codes=[main_codes, sub_codes],
        names=["main", "sub"],
    )
    rows = [
        ["a", "d", "e", "c", "f", "b"],
        [1, 4, 5, 3, 6, 2],
        [1, 4, 5, 3, 6, 2],
    ]
    return DataFrame(rows, index=["h1", "h3", "h5"], columns=columns)
@pytest.mark.parametrize(
    "indexer",
    [
        lambda df: df[("A", "A1")],
        lambda df: df.loc[:, ("A", "A1")],
    ],
)
def test_frame_mi_access(dataframe_with_duplicate_index, indexer):
    """A full column key on non-unique MultiIndex columns returns a one-column frame."""
    # GH 4145
    df = dataframe_with_duplicate_index

    row_labels = Index(["h1", "h3", "h5"])
    column_key = MultiIndex.from_tuples([("A", "A1")], names=["main", "sub"])
    # built as a single row and transposed into a single column
    expected = DataFrame([["a", 1, 1]], index=column_key, columns=row_labels).T

    tm.assert_frame_equal(indexer(df), expected)
def test_frame_mi_access_returns_series(dataframe_with_duplicate_index):
    """Chained selection of a unique second-level label returns a Series."""
    # GH 4146, not returning a block manager when selecting a unique index
    # from a duplicate index
    # as of 4879, this returns a Series (which is similar to what happens
    # with a non-unique)
    df = dataframe_with_duplicate_index

    result = df["A"]["A1"]

    row_labels = ["h1", "h3", "h5"]
    expected = Series(["a", 1, 1], index=row_labels, name="A1")
    tm.assert_series_equal(result, expected)
def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index):
    """Chained selection of a repeated second-level label returns a DataFrame."""
    # selecting a non_unique from the 2nd level
    df = dataframe_with_duplicate_index

    result = df["A"]["B2"]

    # built row-wise (one row per duplicated "B2" column) and transposed
    duplicated_labels = Index(["B2", "B2"], name="sub")
    expected = DataFrame(
        [["d", 4, 4], ["e", 5, 5]],
        index=duplicated_labels,
        columns=["h1", "h3", "h5"],
    ).T
    tm.assert_frame_equal(result, expected)
def test_frame_mi_empty_slice():
    """Selecting an empty list of columns keeps the rows and the column levels."""
    # GH 15454
    columns = MultiIndex.from_product([[1], [2]])
    df = DataFrame(0, index=range(2), columns=columns)

    result = df[[]]

    # same levels as the source columns, but no codes (no columns selected)
    empty_columns = MultiIndex(levels=[[1], [2]], codes=[[], []])
    expected = DataFrame(index=[0, 1], columns=empty_columns)
    tm.assert_frame_equal(result, expected)
def test_loc_empty_multiindex():
    """``.loc`` with an empty MultiIndex selects nothing and assigns nothing."""
    # GH#36936
    index = MultiIndex.from_arrays(
        [["a", "a", "b", "a"], ["a", "a", "b", "b"]], names=("idx1", "idx2")
    )
    df = DataFrame([1, 2, 3, 4], index=index, columns=["value"])

    # no row holds 0, so this mask is all False
    is_zero = df.loc[:, "value"] == 0

    # loc on empty multiindex == loc with False mask
    empty_multiindex = df.loc[is_zero, :].index
    all_false = [False] * len(df.index)
    tm.assert_frame_equal(df.loc[empty_multiindex, :], df.loc[all_false, :])

    # replacing value with loc on empty multiindex
    df.loc[df.loc[is_zero].index, "value"] = 5
    unchanged = DataFrame([1, 2, 3, 4], index=index, columns=["value"])
    tm.assert_frame_equal(df, unchanged)
|