123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- """
- This file contains a minimal set of tests for compliance with the extension
- array interface test suite, and should contain no other tests.
- The test suite for the full functionality of the array is located in
- `pandas/tests/arrays/`.
- The tests in this file are inherited from the BaseExtensionTests, and only
- minimal tweaks should be applied to get the tests passing (by overwriting a
- parent method).
- Additional tests should either be added to one of the BaseExtensionTests
- classes (if they are relevant for the extension interface for all dtypes), or
- be added to the array-specific tests in `pandas/tests/arrays/`.
- """
- import string
- import numpy as np
- import pytest
- from pandas.errors import PerformanceWarning
- import pandas as pd
- import pandas._testing as tm
- from pandas.api.types import is_string_dtype
- from pandas.core.arrays import ArrowStringArray
- from pandas.core.arrays.string_ import StringDtype
- from pandas.tests.extension import base
def split_array(arr):
    """
    Rebuild a pyarrow-backed string array so its data sits in two chunks.

    The calling test is skipped when ``arr.dtype.storage`` is not
    ``"pyarrow"``. Otherwise the underlying arrow data is cut at the
    midpoint, the chunks of both halves are combined into a new chunked
    array, and a new array of the same type as ``arr`` is returned.
    """
    if arr.dtype.storage != "pyarrow":
        pytest.skip("only applicable for pyarrow chunked array n/a")

    import pyarrow as pa

    original = arr._data
    midpoint = len(original) // 2
    first_half = original[:midpoint]
    second_half = original[midpoint:]
    rechunked = pa.chunked_array([*first_half.chunks, *second_half.chunks])
    # The fixtures rely on the result really being split in two.
    assert rechunked.num_chunks == 2
    return type(arr)(rechunked)
@pytest.fixture(params=[True, False])
def chunked(request):
    """Parametrized flag (True, then False) read by the data fixtures."""
    use_chunks = request.param
    return use_chunks
@pytest.fixture
def dtype(string_storage):
    """
    StringDtype built with the requested storage.

    ``string_storage`` comes from a fixture defined outside this file.
    """
    string_dtype = StringDtype(storage=string_storage)
    return string_dtype
@pytest.fixture
def data(dtype, chunked):
    """
    Length-100 array of random ASCII letters.

    The draw is repeated until the first two elements differ. When
    ``chunked`` is true the array is passed through ``split_array``.
    """
    alphabet = list(string.ascii_letters)
    while True:
        strings = np.random.choice(alphabet, size=100)
        if strings[0] != strings[1]:
            break

    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(strings)
    if chunked:
        return split_array(arr)
    return arr
@pytest.fixture
def data_missing(dtype, chunked):
    """Two-element array: a missing value followed by the valid string "A"."""
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence([pd.NA, "A"])
    if chunked:
        return split_array(arr)
    return arr
@pytest.fixture
def data_for_sorting(dtype, chunked):
    """Three-element array ["B", "C", "A"], optionally split into chunks."""
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(["B", "C", "A"])
    if chunked:
        return split_array(arr)
    return arr
@pytest.fixture
def data_missing_for_sorting(dtype, chunked):
    """Three-element array ["B", NA, "A"], optionally split into chunks."""
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(["B", pd.NA, "A"])
    if chunked:
        return split_array(arr)
    return arr
@pytest.fixture
def na_value():
    """Missing-value scalar used by these tests: ``pd.NA``."""
    missing = pd.NA
    return missing
@pytest.fixture
def data_for_grouping(dtype, chunked):
    """
    Eight-element array laid out as [B, B, NA, NA, A, A, B, C].

    When ``chunked`` is true the array is passed through ``split_array``.
    """
    values = ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(values)
    if chunked:
        return split_array(arr)
    return arr
class TestDtype(base.BaseDtypeTests):
    """Dtype interface tests, with two overrides specific to StringDtype."""

    def test_eq_with_str(self, dtype):
        """The dtype equals its ``string[<storage>]`` alias; then run the base test."""
        alias = f"string[{dtype.storage}]"
        assert dtype == alias
        super().test_eq_with_str(dtype)

    def test_is_not_string_type(self, dtype):
        """Replace the base check: here the dtype must be a string dtype."""
        # Different from BaseDtypeTests.test_is_not_string_type
        # because StringDtype is a string type
        recognised_as_string = is_string_dtype(dtype)
        assert recognised_as_string
class TestInterface(base.BaseInterfaceTests):
    """Interface tests; ``test_view`` is skipped for pyarrow storage."""

    def test_view(self, data, request):
        """Run the base view test unless the array uses pyarrow storage."""
        uses_pyarrow = data.dtype.storage == "pyarrow"
        if uses_pyarrow:
            pytest.skip(reason="2D support not implemented for ArrowStringArray")
        super().test_view(data)
class TestConstructors(base.BaseConstructorsTests):
    """Constructor tests, with one disabled override and one extra check."""

    def test_from_dtype(self, data):
        """Deliberately empty override of the base test."""
        # base test uses string representation of dtype

    def test_constructor_from_list(self):
        """A Series built from a list keeps the requested pyarrow StringDtype."""
        # GH 27673
        pytest.importorskip("pyarrow", minversion="1.0.0")
        requested = StringDtype(storage="pyarrow")
        result = pd.Series(["E"], dtype=requested)
        result_dtype = result.dtype
        assert isinstance(result_dtype, StringDtype)
        assert result_dtype.storage == "pyarrow"
class TestReshaping(base.BaseReshapingTests):
    """Reshaping tests; ``test_transpose`` is skipped for pyarrow storage."""

    def test_transpose(self, data, request):
        """Run the base transpose test unless the array uses pyarrow storage."""
        uses_pyarrow = data.dtype.storage == "pyarrow"
        if uses_pyarrow:
            pytest.skip(reason="2D support not implemented for ArrowStringArray")
        super().test_transpose(data)
class TestGetitem(base.BaseGetitemTests):
    """Run the inherited getitem tests without any overrides."""
class TestSetitem(base.BaseSetitemTests):
    """Setitem tests; the view-preservation test is skipped for pyarrow."""

    def test_setitem_preserves_views(self, data, request):
        """Run the base test unless the array uses pyarrow storage."""
        uses_pyarrow = data.dtype.storage == "pyarrow"
        if uses_pyarrow:
            pytest.skip(reason="2D support not implemented for ArrowStringArray")
        super().test_setitem_preserves_views(data)
class TestIndex(base.BaseIndexTests):
    """Run the inherited index tests without any overrides."""
class TestMissing(base.BaseMissingTests):
    """Missing-data tests, with overrides that allow for pyarrow warnings."""

    def test_dropna_array(self, data_missing):
        """Dropping NA from [NA, valid] leaves only the element at position 1."""
        dropped = data_missing.dropna()
        only_valid = data_missing[[1]]
        self.assert_extension_array_equal(dropped, only_valid)

    def test_fillna_no_op_returns_copy(self, data):
        """Filling an array with no missing values returns an equal, new object."""
        data = data[~data.isna()]
        fill_value = data[0]

        filled = data.fillna(fill_value)
        assert filled is not data
        self.assert_extension_array_equal(filled, data)

        # A PerformanceWarning is permitted only for pyarrow storage.
        warns_on_pyarrow = data.dtype.storage == "pyarrow"
        with tm.maybe_produces_warning(PerformanceWarning, warns_on_pyarrow):
            filled = data.fillna(method="backfill")
        assert filled is not data
        self.assert_extension_array_equal(filled, data)

    def test_fillna_series_method(self, data_missing, fillna_method):
        """Run the base test, permitting a PerformanceWarning for pyarrow."""
        may_warn = (
            fillna_method is not None and data_missing.dtype.storage == "pyarrow"
        )
        with tm.maybe_produces_warning(
            PerformanceWarning, may_warn, check_stacklevel=False
        ):
            super().test_fillna_series_method(data_missing, fillna_method)
class TestNoReduce(base.BaseNoReduceTests):
    """Reduction tests: numeric reductions other than min/max must raise."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        """Every numeric reduction except ``min``/``max`` raises TypeError."""
        reduction = all_numeric_reductions
        # min and max are not checked here; the test returns without asserting.
        if reduction == "min" or reduction == "max":
            return

        series = pd.Series(data)
        reducer = getattr(series, reduction)
        with pytest.raises(TypeError):
            reducer(skipna=skipna)
class TestMethods(base.BaseMethodsTests):
    """Method tests, with a storage-aware override for normalized counts."""

    def test_value_counts_with_normalize(self, data):
        """Normalized counts of unique values are uniform with a storage-specific dtype."""
        data = data[:10].unique()
        values = np.array(data[~data.isna()])
        series = pd.Series(data, dtype=data.dtype)

        result = series.value_counts(normalize=True).sort_index()

        n_valid = len(values)
        uniform = pd.Series(
            [1 / n_valid] * n_valid, index=result.index, name="proportion"
        )
        storage = getattr(data.dtype, "storage", "")
        result_dtype = "double[pyarrow]" if storage == "pyarrow" else "Float64"
        expected = uniform.astype(result_dtype)

        self.assert_series_equal(result, expected)
class TestCasting(base.BaseCastingTests):
    """Run the inherited casting tests without any overrides."""
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison tests whose expected result dtype depends on the storage."""

    def _compare_other(self, ser, data, op, other):
        """
        Compare ``ser`` with ``other`` through the dunder method for ``op``.

        The expected result is the same comparison done on an object-dtype
        copy of ``ser``, cast to ``boolean[pyarrow]`` for pyarrow storage
        and to ``boolean`` otherwise.
        """
        dunder = f"__{op.__name__}__"
        result = getattr(ser, dunder)(other)

        if ser.dtype.storage == "pyarrow":
            mask_dtype = "boolean[pyarrow]"
        else:
            mask_dtype = "boolean"

        object_series = ser.astype(object)
        expected = getattr(object_series, dunder)(other).astype(mask_dtype)
        self.assert_series_equal(result, expected)

    def test_compare_scalar(self, data, comparison_op):
        """Compare a Series of the data against the scalar "abc"."""
        series = pd.Series(data)
        self._compare_other(series, data, comparison_op, "abc")
class TestParsing(base.BaseParsingTests):
    """Run the inherited parsing tests without any overrides."""
class TestPrinting(base.BasePrintingTests):
    """Run the inherited printing tests without any overrides."""
class TestGroupBy(base.BaseGroupbyTests):
    """Groupby tests with overrides for aggregation and apply."""

    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        """Mean of column A grouped by the string column B, in both layouts."""
        frame = pd.DataFrame(
            {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}
        )
        result = frame.groupby("B", as_index=as_index).A.mean()
        uniques = pd.factorize(data_for_grouping, sort=True)[1]

        if not as_index:
            # The group keys come back as an ordinary column.
            expected_frame = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
            self.assert_frame_equal(result, expected_frame)
            return

        group_index = pd.Index(uniques, name="B")
        expected_series = pd.Series([3.0, 1.0, 4.0], index=group_index, name="A")
        self.assert_series_equal(result, expected_series)

    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
        """Run the base apply test with the "Falling back" warning filtered out."""
        super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
class Test2DCompat(base.Dim2CompatTests):
    """2D compatibility tests, skipped whenever the data is an ArrowStringArray."""

    @pytest.fixture(autouse=True)
    def arrow_not_supported(self, data, request):
        """Autouse guard that skips the test for ArrowStringArray data."""
        is_arrow_backed = isinstance(data, ArrowStringArray)
        if is_arrow_backed:
            pytest.skip(reason="2D support not implemented for ArrowStringArray")
def test_searchsorted_with_na_raises(data_for_sorting, as_series):
    """searchsorted raises ValueError when the array contains a missing value."""
    # GH50447
    needle, _, _ = data_for_sorting
    # Reorder to [a, b, c], then overwrite the last element with NA.
    haystack = data_for_sorting.take([2, 0, 1])
    haystack[-1] = pd.NA

    if as_series:
        haystack = pd.Series(haystack)

    expected_message = (
        "searchsorted requires array to be sorted, "
        "which is impossible with NAs present."
    )
    with pytest.raises(ValueError, match=expected_message):
        haystack.searchsorted(needle)
|