123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468 |
- import re
- import numpy as np
- import pytest
- from pandas import (
- Categorical,
- CategoricalDtype,
- CategoricalIndex,
- DataFrame,
- DateOffset,
- DatetimeIndex,
- Index,
- MultiIndex,
- Series,
- Timestamp,
- concat,
- date_range,
- get_dummies,
- period_range,
- )
- import pandas._testing as tm
- from pandas.core.arrays import SparseArray
class TestGetitem:
    """Single-key and label-list column lookups via ``DataFrame.__getitem__``."""

    def test_getitem_unused_level_raises(self):
        """Looking up a level value that no column uses raises ``KeyError``."""
        # GH#20410
        columns = MultiIndex(
            levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]],
            codes=[[1, 0], [1, 0]],
        )
        frame = DataFrame(-1, index=range(3), columns=columns)

        # "notevenone" is listed in the levels but no code points at it
        with pytest.raises(KeyError, match="notevenone"):
            frame["notevenone"]

    def test_getitem_periodindex(self):
        """A Period key and a date string both select the first column."""
        periods = period_range("1/1/2000", periods=5)
        frame = DataFrame(np.random.randn(10, 5), columns=periods)

        by_period = frame[periods[0]]
        tm.assert_series_equal(by_period, frame.iloc[:, 0])

        # GH#1211; smoketest unrelated to the rest of this test
        repr(frame)

        by_string = frame["1/1/2000"]
        tm.assert_series_equal(by_string, frame.iloc[:, 0])

    def test_getitem_list_of_labels_categoricalindex_cols(self):
        """Index ``get_dummies`` output by the full list of its own columns."""
        # GH#16115
        cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])
        dummies = get_dummies(cats)
        every_label = list(dummies.columns)

        expected = DataFrame(
            [[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats
        )
        tm.assert_frame_equal(dummies[every_label], expected)

    def test_getitem_sparse_column_return_type_and_dtype(self):
        """``[]``, ``iloc`` and ``loc`` each return the sparse column as a Series."""
        # https://github.com/pandas-dev/pandas/issues/23559
        sparse_values = SparseArray([0, 1])
        frame = DataFrame({"A": sparse_values})
        expected = Series(sparse_values, name="A")

        # Also check iloc and loc while we're here
        for result in (frame["A"], frame.iloc[:, 0], frame.loc[:, "A"]):
            tm.assert_series_equal(result, expected)

    def test_getitem_string_columns(self):
        """Attribute access agrees with ``[]`` for "string"-dtype columns."""
        # GH#46185
        columns = Index(["A", "B"], dtype="string")
        frame = DataFrame([[1, 2]], columns=columns)
        tm.assert_series_equal(frame.A, frame["A"])
- class TestGetitemListLike:
- def test_getitem_list_missing_key(self):
- # GH#13822, incorrect error string with non-unique columns when missing
- # column is accessed
- df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]})
- df.columns = ["x", "x", "z"]
- # Check that we get the correct value in the KeyError
- with pytest.raises(KeyError, match=r"\['y'\] not in index"):
- df[["x", "y", "z"]]
- def test_getitem_list_duplicates(self):
- # GH#1943
- df = DataFrame(np.random.randn(4, 4), columns=list("AABC"))
- df.columns.name = "foo"
- result = df[["B", "C"]]
- assert result.columns.name == "foo"
- expected = df.iloc[:, 2:]
- tm.assert_frame_equal(result, expected)
- def test_getitem_dupe_cols(self):
- df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
- msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
- with pytest.raises(KeyError, match=re.escape(msg)):
- df[["baf"]]
- @pytest.mark.parametrize(
- "idx_type",
- [
- list,
- iter,
- Index,
- set,
- lambda keys: dict(zip(keys, range(len(keys)))),
- lambda keys: dict(zip(keys, range(len(keys)))).keys(),
- ],
- ids=["list", "iter", "Index", "set", "dict", "dict_keys"],
- )
- @pytest.mark.parametrize("levels", [1, 2])
- def test_getitem_listlike(self, idx_type, levels, float_frame):
- # GH#21294
- if levels == 1:
- frame, missing = float_frame, "food"
- else:
- # MultiIndex columns
- frame = DataFrame(
- np.random.randn(8, 3),
- columns=Index(
- [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")],
- name=("sth", "sth2"),
- ),
- )
- missing = ("good", "food")
- keys = [frame.columns[1], frame.columns[0]]
- idx = idx_type(keys)
- idx_check = list(idx_type(keys))
- if isinstance(idx, (set, dict)):
- with pytest.raises(TypeError, match="as an indexer is not supported"):
- frame[idx]
- return
- else:
- result = frame[idx]
- expected = frame.loc[:, idx_check]
- expected.columns.names = frame.columns.names
- tm.assert_frame_equal(result, expected)
- idx = idx_type(keys + [missing])
- with pytest.raises(KeyError, match="not in index"):
- frame[idx]
- def test_getitem_iloc_generator(self):
- # GH#39614
- df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
- indexer = (x for x in [1, 2])
- result = df.iloc[indexer]
- expected = DataFrame({"a": [2, 3], "b": [5, 6]}, index=[1, 2])
- tm.assert_frame_equal(result, expected)
- def test_getitem_iloc_two_dimensional_generator(self):
- df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
- indexer = (x for x in [1, 2])
- result = df.iloc[indexer, 1]
- expected = Series([5, 6], name="b", index=[1, 2])
- tm.assert_series_equal(result, expected)
- def test_getitem_iloc_dateoffset_days(self):
- # GH 46671
- df = DataFrame(
- list(range(10)),
- index=date_range("01-01-2022", periods=10, freq=DateOffset(days=1)),
- )
- result = df.loc["2022-01-01":"2022-01-03"]
- expected = DataFrame(
- [0, 1, 2],
- index=DatetimeIndex(
- ["2022-01-01", "2022-01-02", "2022-01-03"],
- dtype="datetime64[ns]",
- freq=DateOffset(days=1),
- ),
- )
- tm.assert_frame_equal(result, expected)
- df = DataFrame(
- list(range(10)),
- index=date_range(
- "01-01-2022", periods=10, freq=DateOffset(days=1, hours=2)
- ),
- )
- result = df.loc["2022-01-01":"2022-01-03"]
- expected = DataFrame(
- [0, 1, 2],
- index=DatetimeIndex(
- ["2022-01-01 00:00:00", "2022-01-02 02:00:00", "2022-01-03 04:00:00"],
- dtype="datetime64[ns]",
- freq=DateOffset(days=1, hours=2),
- ),
- )
- tm.assert_frame_equal(result, expected)
- df = DataFrame(
- list(range(10)),
- index=date_range("01-01-2022", periods=10, freq=DateOffset(minutes=3)),
- )
- result = df.loc["2022-01-01":"2022-01-03"]
- tm.assert_frame_equal(result, df)
- class TestGetitemCallable:
- def test_getitem_callable(self, float_frame):
- # GH#12533
- result = float_frame[lambda x: "A"]
- expected = float_frame.loc[:, "A"]
- tm.assert_series_equal(result, expected)
- result = float_frame[lambda x: ["A", "B"]]
- expected = float_frame.loc[:, ["A", "B"]]
- tm.assert_frame_equal(result, float_frame.loc[:, ["A", "B"]])
- df = float_frame[:3]
- result = df[lambda x: [True, False, True]]
- expected = float_frame.iloc[[0, 2], :]
- tm.assert_frame_equal(result, expected)
- def test_loc_multiindex_columns_one_level(self):
- # GH#29749
- df = DataFrame([[1, 2]], columns=[["a", "b"]])
- expected = DataFrame([1], columns=[["a"]])
- result = df["a"]
- tm.assert_frame_equal(result, expected)
- result = df.loc[:, "a"]
- tm.assert_frame_equal(result, expected)
- class TestGetitemBooleanMask:
- def test_getitem_bool_mask_categorical_index(self):
- df3 = DataFrame(
- {
- "A": np.arange(6, dtype="int64"),
- },
- index=CategoricalIndex(
- [1, 1, 2, 1, 3, 2],
- dtype=CategoricalDtype([3, 2, 1], ordered=True),
- name="B",
- ),
- )
- df4 = DataFrame(
- {
- "A": np.arange(6, dtype="int64"),
- },
- index=CategoricalIndex(
- [1, 1, 2, 1, 3, 2],
- dtype=CategoricalDtype([3, 2, 1], ordered=False),
- name="B",
- ),
- )
- result = df3[df3.index == "a"]
- expected = df3.iloc[[]]
- tm.assert_frame_equal(result, expected)
- result = df4[df4.index == "a"]
- expected = df4.iloc[[]]
- tm.assert_frame_equal(result, expected)
- result = df3[df3.index == 1]
- expected = df3.iloc[[0, 1, 3]]
- tm.assert_frame_equal(result, expected)
- result = df4[df4.index == 1]
- expected = df4.iloc[[0, 1, 3]]
- tm.assert_frame_equal(result, expected)
- # since we have an ordered categorical
- # CategoricalIndex([1, 1, 2, 1, 3, 2],
- # categories=[3, 2, 1],
- # ordered=True,
- # name='B')
- result = df3[df3.index < 2]
- expected = df3.iloc[[4]]
- tm.assert_frame_equal(result, expected)
- result = df3[df3.index > 1]
- expected = df3.iloc[[]]
- tm.assert_frame_equal(result, expected)
- # unordered
- # cannot be compared
- # CategoricalIndex([1, 1, 2, 1, 3, 2],
- # categories=[3, 2, 1],
- # ordered=False,
- # name='B')
- msg = "Unordered Categoricals can only compare equality or not"
- with pytest.raises(TypeError, match=msg):
- df4[df4.index < 2]
- with pytest.raises(TypeError, match=msg):
- df4[df4.index > 1]
- @pytest.mark.parametrize(
- "data1,data2,expected_data",
- (
- (
- [[1, 2], [3, 4]],
- [[0.5, 6], [7, 8]],
- [[np.nan, 3.0], [np.nan, 4.0], [np.nan, 7.0], [6.0, 8.0]],
- ),
- (
- [[1, 2], [3, 4]],
- [[5, 6], [7, 8]],
- [[np.nan, 3.0], [np.nan, 4.0], [5, 7], [6, 8]],
- ),
- ),
- )
- def test_getitem_bool_mask_duplicate_columns_mixed_dtypes(
- self,
- data1,
- data2,
- expected_data,
- ):
- # GH#31954
- df1 = DataFrame(np.array(data1))
- df2 = DataFrame(np.array(data2))
- df = concat([df1, df2], axis=1)
- result = df[df > 2]
- exdict = {i: np.array(col) for i, col in enumerate(expected_data)}
- expected = DataFrame(exdict).rename(columns={2: 0, 3: 1})
- tm.assert_frame_equal(result, expected)
- @pytest.fixture
- def df_dup_cols(self):
- dups = ["A", "A", "C", "D"]
- df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
- return df
- def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_cols):
- # `df.A > 6` is a DataFrame with a different shape from df
- # boolean with the duplicate raises
- df = df_dup_cols
- msg = "cannot reindex on an axis with duplicate labels"
- with pytest.raises(ValueError, match=msg):
- df[df.A > 6]
- def test_getitem_boolean_series_with_duplicate_columns(self, df_dup_cols):
- # boolean indexing
- # GH#4879
- df = DataFrame(
- np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
- )
- expected = df[df.C > 6]
- expected.columns = df_dup_cols.columns
- df = df_dup_cols
- result = df[df.C > 6]
- tm.assert_frame_equal(result, expected)
- result.dtypes
- str(result)
- def test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols):
- # where
- df = DataFrame(
- np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
- )
- # `df > 6` is a DataFrame with the same shape+alignment as df
- expected = df[df > 6]
- expected.columns = df_dup_cols.columns
- df = df_dup_cols
- result = df[df > 6]
- tm.assert_frame_equal(result, expected)
- result.dtypes
- str(result)
- def test_getitem_empty_frame_with_boolean(self):
- # Test for issue GH#11859
- df = DataFrame()
- df2 = df[df > 0]
- tm.assert_frame_equal(df, df2)
- def test_getitem_returns_view_when_column_is_unique_in_df(
- self, using_copy_on_write
- ):
- # GH#45316
- df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
- df_orig = df.copy()
- view = df["b"]
- view.loc[:] = 100
- if using_copy_on_write:
- expected = df_orig
- else:
- expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"])
- tm.assert_frame_equal(df, expected)
- def test_getitem_frozenset_unique_in_column(self):
- # GH#41062
- df = DataFrame([[1, 2, 3, 4]], columns=[frozenset(["KEY"]), "B", "C", "C"])
- result = df[frozenset(["KEY"])]
- expected = Series([1], name=frozenset(["KEY"]))
- tm.assert_series_equal(result, expected)
- class TestGetitemSlice:
- def test_getitem_slice_float64(self, frame_or_series):
- values = np.arange(10.0, 50.0, 2)
- index = Index(values)
- start, end = values[[5, 15]]
- data = np.random.randn(20, 3)
- if frame_or_series is not DataFrame:
- data = data[:, 0]
- obj = frame_or_series(data, index=index)
- result = obj[start:end]
- expected = obj.iloc[5:16]
- tm.assert_equal(result, expected)
- result = obj.loc[start:end]
- tm.assert_equal(result, expected)
- def test_getitem_datetime_slice(self):
- # GH#43223
- df = DataFrame(
- {"a": 0},
- index=DatetimeIndex(
- [
- "11.01.2011 22:00",
- "11.01.2011 23:00",
- "12.01.2011 00:00",
- "2011-01-13 00:00",
- ]
- ),
- )
- with pytest.raises(
- KeyError, match="Value based partial slicing on non-monotonic"
- ):
- df["2011-01-01":"2011-11-01"]
class TestGetitemDeprecatedIndexers:
    """Indexer types that ``DataFrame.__getitem__`` rejects outright."""

    @pytest.mark.parametrize("key", [{"a", "b"}, {"a": "a"}])
    def test_getitem_dict_and_set_deprecated(self, key):
        """A ``set`` or ``dict`` key raises ``TypeError``."""
        # GH#42825 enforced in 2.0
        columns = MultiIndex.from_tuples([("a", 1), ("b", 2)])
        frame = DataFrame([[1, 2], [3, 4]], columns=columns)

        with pytest.raises(TypeError, match="as an indexer is not supported"):
            frame[key]
|