123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401 |
- """
- This file contains a minimal set of tests for compliance with the extension
- array interface test suite, and should contain no other tests.
- The test suite for the full functionality of the array is located in
- `pandas/tests/arrays/`.
- The tests in this file are inherited from the BaseExtensionTests, and only
- minimal tweaks should be applied to get the tests passing (by overwriting a
- parent method).
- Additional tests should either be added to one of the BaseExtensionTests
- classes (if they are relevant for the extension interface for all dtypes), or
- be added to the array-specific tests in `pandas/tests/arrays/`.
- """
- import numpy as np
- import pytest
- from pandas.core.dtypes.common import is_bool_dtype
- import pandas as pd
- import pandas._testing as tm
- from pandas.core.arrays.boolean import BooleanDtype
- from pandas.tests.extension import base
def make_data():
    """Return the 100 raw values used to build the ``data`` fixture.

    The list alternates ``True``/``False`` and contains two ``np.nan``
    entries, at positions 8 and 97.
    """
    pair = [True, False]
    missing = [np.nan]
    return pair * 4 + missing + pair * 44 + missing + pair
@pytest.fixture
def dtype():
    """Provide a fresh ``BooleanDtype`` instance."""
    boolean_dtype = BooleanDtype()
    return boolean_dtype
@pytest.fixture
def data(dtype):
    """Build an array from the ``make_data()`` values using ``dtype``."""
    raw_values = make_data()
    return pd.array(raw_values, dtype=dtype)
@pytest.fixture
def data_for_twos(dtype):
    """Build a length-100 array from ``np.ones(100)`` using ``dtype``."""
    # NOTE(review): the source values are ones despite the fixture name --
    # presumably because 2 cannot be held in a boolean array; confirm.
    ones = np.ones(100)
    return pd.array(ones, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """Build a two-element array: a missing value followed by ``True``."""
    values = [np.nan, True]
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_for_sorting(dtype):
    """Build the sorting fixture ``[True, True, False]``."""
    values = [True, True, False]
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Build the sorting fixture ``[True, NA, False]`` (NA given as ``np.nan``)."""
    values = [True, np.nan, False]
    return pd.array(values, dtype=dtype)
@pytest.fixture
def na_cmp():
    """Provide a two-argument predicate that holds only when both are ``pd.NA``."""

    def both_are_na(x, y):
        # we are pd.NA: compare by identity against the singleton
        return x is pd.NA and y is pd.NA

    return both_are_na
@pytest.fixture
def na_value():
    """Provide the missing-value marker used by these tests, ``pd.NA``."""
    missing = pd.NA
    return missing
@pytest.fixture
def data_for_grouping(dtype):
    """Build the grouping fixture laid out as ``[B, B, NA, NA, A, A, B]``.

    ``A`` is ``False``, ``B`` is ``True`` and ``NA`` is given as ``np.nan``.
    """
    low, high, missing = False, True, np.nan
    values = [high, high, missing, missing, low, low, high]
    return pd.array(values, dtype=dtype)
class TestDtype(base.BaseDtypeTests):
    """Inherit every test from ``base.BaseDtypeTests`` without overrides."""
class TestInterface(base.BaseInterfaceTests):
    """Inherit every test from ``base.BaseInterfaceTests`` without overrides."""
class TestConstructors(base.BaseConstructorsTests):
    """Inherit every test from ``base.BaseConstructorsTests`` without overrides."""
class TestGetitem(base.BaseGetitemTests):
    """Inherit every test from ``base.BaseGetitemTests`` without overrides."""
class TestSetitem(base.BaseSetitemTests):
    """Inherit every test from ``base.BaseSetitemTests`` without overrides."""
class TestIndex(base.BaseIndexTests):
    """Inherit every test from ``base.BaseIndexTests`` without overrides."""
class TestMissing(base.BaseMissingTests):
    """Inherit every test from ``base.BaseMissingTests`` without overrides."""
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Arithmetic-op tests with expectations adjusted for boolean data."""

    # Dunder names for which ``_check_op`` expects a ``TypeError`` whose
    # message matches "numpy boolean subtract".
    implements = {"__sub__", "__rsub__"}

    def check_opname(self, s, op_name, other, exc=None):
        """Delegate to the parent check with an exception chosen per op.

        The incoming ``exc`` is ignored (overwriting to indicate ops don't
        raise an error), except that ``NotImplementedError`` is expected for
        pow, truediv and floordiv, including their reflected forms.
        """
        bare_name = op_name.strip("_").lstrip("r")
        # match behavior with non-masked bool dtype
        raising_ops = ["pow", "truediv", "floordiv"]
        exc = NotImplementedError if bare_name in raising_ops else None
        super().check_opname(s, op_name, other, exc=exc)

    def _check_op(self, obj, op, other, op_name, exc=NotImplementedError):
        """Apply ``op`` and verify either the result or the raised error.

        When ``exc`` is not ``None`` the call must raise ``exc``. Otherwise
        ops listed in ``implements`` must raise ``TypeError``, and every
        other op is compared against an adjusted ``self._combine`` result.
        """
        if exc is not None:
            with pytest.raises(exc):
                op(obj, other)
            return

        if op_name in self.implements:
            with pytest.raises(TypeError, match=r"numpy boolean subtract"):
                op(obj, other)
            return

        result = op(obj, other)
        expected = self._combine(obj, other, op)

        int8_ops = (
            "__floordiv__",
            "__rfloordiv__",
            "__pow__",
            "__rpow__",
            "__mod__",
            "__rmod__",
        )
        float_ops = ("__truediv__", "__rtruediv__")

        if op_name in int8_ops:
            # combine keeps boolean type
            expected = expected.astype("Int8")
        elif op_name in float_ops:
            # combine with bools does not generate the correct result
            # (numpy behaviour for div is to regard the bools as numeric)
            float_combined = self._combine(obj.astype(float), other, op)
            expected = float_combined.astype("Float64")

        if op_name == "__rpow__":
            # for rpow, combine does not propagate NaN
            na_mask = result.isna()
            expected[na_mask] = np.nan

        self.assert_equal(result, expected)

    @pytest.mark.xfail(
        reason="Inconsistency between floordiv and divmod; we raise for floordiv "
        "but not for divmod. This matches what we do for non-masked bool dtype."
    )
    def test_divmod_series_array(self, data, data_for_twos):
        """Run the parent divmod series/array test; marked as expected failure."""
        super().test_divmod_series_array(data, data_for_twos)

    @pytest.mark.xfail(
        reason="Inconsistency between floordiv and divmod; we raise for floordiv "
        "but not for divmod. This matches what we do for non-masked bool dtype."
    )
    def test_divmod(self, data):
        """Run the parent divmod test; marked as expected failure."""
        super().test_divmod(data)
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison-op tests; no comparison is expected to raise."""

    def check_opname(self, s, op_name, other, exc=None):
        """Delegate to the parent check, always passing ``exc=None``."""
        # overwriting to indicate ops don't raise an error
        no_error = None
        super().check_opname(s, op_name, other, exc=no_error)
class TestReshaping(base.BaseReshapingTests):
    """Inherit every test from ``base.BaseReshapingTests`` without overrides."""
class TestMethods(base.BaseMethodsTests):
    """Method tests, overridden where only two distinct values are available."""

    _combine_le_expected_dtype = "boolean"

    def test_factorize(self, data_for_grouping):
        """Check factorize codes and uniques with NA mapped to ``-1``."""
        # override because we only have 2 unique values
        codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True)

        want_codes = np.array([0, 0, -1, -1, 1, 1, 0], dtype=np.intp)
        want_uniques = data_for_grouping.take([0, 4])

        tm.assert_numpy_array_equal(codes, want_codes)
        self.assert_extension_array_equal(uniques, want_uniques)

    def test_searchsorted(self, data_for_sorting, as_series):
        """Check searchsorted on a two-element ``[False, True]`` array."""
        # override because we only have 2 unique values
        # (the incoming fixture value is replaced by a local two-value array)
        two_values = pd.array([True, False], dtype="boolean")
        high, low = two_values
        ascending = type(two_values)._from_sequence([low, high])
        if as_series:
            ascending = pd.Series(ascending)

        assert ascending.searchsorted(low) == 0
        assert ascending.searchsorted(low, side="right") == 1
        assert ascending.searchsorted(high) == 1
        assert ascending.searchsorted(high, side="right") == 2

        positions = ascending.searchsorted(ascending.take([0, 1]))
        tm.assert_numpy_array_equal(positions, np.array([0, 1], dtype=np.intp))

        # sorter
        order = np.array([1, 0])
        assert two_values.searchsorted(low, sorter=order) == 0

    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
        """Check argmin/argmax positions, with repeats and with a missing value."""
        # override because there are only 2 unique values
        # data_for_sorting -> [B, C, A] with A < B < C -> here True, True, False
        assert data_for_sorting.argmax() == 0
        assert data_for_sorting.argmin() == 2

        # with repeated values -> first occurrence
        repeated = data_for_sorting.take([2, 0, 0, 1, 1, 2])
        assert repeated.argmax() == 1
        assert repeated.argmin() == 0

        # with missing values
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        assert data_missing_for_sorting.argmax() == 0
        assert data_missing_for_sorting.argmin() == 2
class TestCasting(base.BaseCastingTests):
    """Inherit every test from ``base.BaseCastingTests`` without overrides."""
class TestGroupby(base.BaseGroupbyTests):
    """
    Groupby tests rewritten for two groups: boolean data has only two
    unique values, while the base tests use three groups.
    """

    def test_grouping_grouper(self, data_for_grouping):
        """Check the grouping vectors for an object key and an extension key."""
        keys = ["B", "B", None, None, "A", "A", "B"]
        df = pd.DataFrame({"A": keys, "B": data_for_grouping})

        by_object = df.groupby("A").grouper.groupings[0]
        by_extension = df.groupby("B").grouper.groupings[0]

        tm.assert_numpy_array_equal(by_object.grouping_vector, df.A.values)
        tm.assert_extension_array_equal(
            by_extension.grouping_vector, data_for_grouping
        )

    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        """Check the mean of ``A`` grouped by the extension column ``B``."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = df.groupby("B", as_index=as_index).A.mean()
        uniques = pd.factorize(data_for_grouping, sort=True)[1]

        if not as_index:
            expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0]})
            self.assert_frame_equal(result, expected)
            return

        expected = pd.Series(
            [3.0, 1.0], index=pd.Index(uniques, name="B"), name="A"
        )
        self.assert_series_equal(result, expected)

    def test_groupby_agg_extension(self, data_for_grouping):
        """Check three spellings of a "first" aggregation against one result."""
        # GH#38980 groupby agg on extension type fails for non-numeric types
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        expected = df.iloc[[0, 2, 4]].set_index("A")

        aggregations = (
            lambda frame: frame.groupby("A").agg({"B": "first"}),
            lambda frame: frame.groupby("A").agg("first"),
            lambda frame: frame.groupby("A").first(),
        )
        for aggregate in aggregations:
            self.assert_frame_equal(aggregate(df), expected)

    def test_groupby_extension_no_sort(self, data_for_grouping):
        """Check the unsorted group mean of ``A`` keyed by column ``B``."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = df.groupby("B", sort=False).A.mean()

        uniques = pd.factorize(data_for_grouping, sort=False)[1]
        expected = pd.Series(
            [1.0, 3.0], index=pd.Index(uniques, name="B"), name="A"
        )
        self.assert_series_equal(result, expected)

    def test_groupby_extension_transform(self, data_for_grouping):
        """Check ``transform(len)`` over the non-missing grouping values."""
        not_missing = ~data_for_grouping.isna()
        valid = data_for_grouping[not_missing]
        df = pd.DataFrame({"A": [1, 1, 3, 3, 1], "B": valid})

        result = df.groupby("B").A.transform(len)

        expected = pd.Series([3, 3, 2, 2, 3], name="A")
        self.assert_series_equal(result, expected)

    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
        """Call ``apply`` on frame and column groupbys; results are not checked."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        for key, column in (("B", "A"), ("A", "B")):
            df.groupby(key, group_keys=False).apply(groupby_apply_op)
            df.groupby(key, group_keys=False)[column].apply(groupby_apply_op)

    def test_groupby_apply_identity(self, data_for_grouping):
        """Check that applying ``lambda x: x.array`` yields each group's array."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = df.groupby("A").B.apply(lambda x: x.array)

        group_positions = ([0, 1, 6], [2, 3], [4, 5])
        expected = pd.Series(
            [df.B.iloc[positions].array for positions in group_positions],
            index=pd.Index([1, 2, 3], name="A"),
            name="B",
        )
        self.assert_series_equal(result, expected)

    def test_in_numeric_groupby(self, data_for_grouping):
        """Check which columns are kept by ``groupby("A").sum()``."""
        frame_data = {
            "A": [1, 1, 2, 2, 3, 3, 1],
            "B": data_for_grouping,
            "C": [1, 1, 1, 1, 1, 1, 1],
        }
        df = pd.DataFrame(frame_data)

        result = df.groupby("A").sum().columns

        kept = ["B", "C"] if data_for_grouping.dtype._is_numeric else ["C"]
        tm.assert_index_equal(result, pd.Index(kept))

    @pytest.mark.parametrize("min_count", [0, 10])
    def test_groupby_sum_mincount(self, data_for_grouping, min_count):
        """Check grouped sums with ``min_count``: all NA when it is not met."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = df.groupby("A").sum(min_count=min_count)

        sums = [3, 0, 0] if min_count == 0 else [pd.NA] * 3
        expected = pd.DataFrame(
            {"B": pd.array(sums, dtype="Int64")},
            index=pd.Index([1, 2, 3], name="A"),
        )
        tm.assert_frame_equal(result, expected)
class TestNumericReduce(base.BaseNumericReduceTests):
    """Numeric reduction tests compared against a float64 cast of the data."""

    def check_reduce(self, s, op_name, skipna):
        """Compare reduction ``op_name`` on ``s`` with its float64 equivalent.

        ``count`` is called without arguments; every other reduction is
        called with ``skipna``.
        """
        kwargs = {} if op_name == "count" else {"skipna": skipna}
        result = getattr(s, op_name)(**kwargs)
        expected = getattr(s.astype("float64"), op_name)(**kwargs)

        # override parent function to cast to bool for min/max
        if np.isnan(expected):
            expected = pd.NA
        elif op_name in ("min", "max"):
            expected = bool(expected)

        tm.assert_almost_equal(result, expected)
class TestBooleanReduce(base.BaseBooleanReduceTests):
    """Inherit every test from ``base.BaseBooleanReduceTests`` without overrides."""
class TestPrinting(base.BasePrintingTests):
    """Inherit every test from ``base.BasePrintingTests`` without overrides."""
class TestUnaryOps(base.BaseUnaryOpsTests):
    """Inherit every test from ``base.BaseUnaryOpsTests`` without overrides."""
class TestAccumulation(base.BaseAccumulateTests):
    """Accumulation tests compared against a float64 cast of the data."""

    def check_accumulate(self, s, op_name, skipna):
        """Compare accumulation ``op_name`` on ``s`` with its float64 equivalent.

        Dtypes are not compared, but ``cummin``/``cummax`` results must
        still be of bool dtype.
        """
        result = getattr(s, op_name)(skipna=skipna)

        as_float = pd.Series(s.astype("float64"))
        expected = getattr(as_float, op_name)(skipna=skipna)

        tm.assert_series_equal(result, expected, check_dtype=False)
        if op_name in ("cummin", "cummax"):
            assert is_bool_dtype(result)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
        """Override the parent test with a deliberately empty body."""
class TestParsing(base.BaseParsingTests):
    """Inherit every test from ``base.BaseParsingTests`` without overrides."""
class Test2DCompat(base.Dim2CompatTests):
    """Inherit every test from ``base.Dim2CompatTests`` without overrides."""
|