123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609 |
- import inspect
- import operator
- import numpy as np
- import pytest
- from pandas._typing import Dtype
- from pandas.core.dtypes.common import is_bool_dtype
- from pandas.core.dtypes.missing import na_value_for_dtype
- import pandas as pd
- import pandas._testing as tm
- from pandas.core.sorting import nargsort
- from pandas.tests.extension.base.base import BaseExtensionTests
class BaseMethodsTests(BaseExtensionTests):
    """Various Series and DataFrame methods.

    Every test argument other than ``self`` (``data``, ``data_missing``,
    ``data_for_sorting``, ``na_value``, ``as_series``, ...) is a pytest
    fixture; none of them is defined in this module, so they are presumably
    supplied by the conftest of the concrete extension-array test suite.
    """

    def test_value_counts_default_dropna(self, data):
        """``value_counts`` must default to ``dropna=True`` when implemented."""
        # make sure we have consistent default dropna kwarg
        if not hasattr(data, "value_counts"):
            pytest.skip(f"value_counts is not implemented for {type(data)}")
        sig = inspect.signature(data.value_counts)
        kwarg = sig.parameters["dropna"]
        assert kwarg.default is True

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        """Counting with ``dropna`` matches counting the pre-filtered values."""
        all_data = all_data[:10]
        if dropna:
            other = all_data[~all_data.isna()]
        else:
            other = all_data

        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()

        self.assert_series_equal(result, expected)

    def test_value_counts_with_normalize(self, data):
        """Normalized counts of unique values are uniform proportions."""
        # GH 33172
        data = data[:10].unique()
        values = np.array(data[~data.isna()])
        ser = pd.Series(data, dtype=data.dtype)

        result = ser.value_counts(normalize=True).sort_index()

        if not isinstance(data, pd.Categorical):
            expected = pd.Series(
                [1 / len(values)] * len(values), index=result.index, name="proportion"
            )
        else:
            # Categorical results may contain zero-count entries; only the
            # entries that were actually observed get a non-zero proportion.
            expected = pd.Series(0.0, index=result.index, name="proportion")
            expected[result > 0] = 1 / len(values)
        if na_value_for_dtype(data.dtype) is pd.NA:
            # TODO(GH#44692): avoid special-casing
            expected = expected.astype("Float64")

        self.assert_series_equal(result, expected)

    def test_count(self, data_missing):
        """Row-wise ``DataFrame.count`` ignores missing values."""
        # The expected [0, 1] implies data_missing is [NA, valid].
        df = pd.DataFrame({"A": data_missing})
        result = df.count(axis="columns")
        expected = pd.Series([0, 1])
        self.assert_series_equal(result, expected)

    def test_series_count(self, data_missing):
        """``Series.count`` counts only the non-missing values."""
        # GH#26835
        ser = pd.Series(data_missing)
        result = ser.count()
        expected = 1
        assert result == expected

    def test_apply_simple_series(self, data):
        """``Series.apply`` with a scalar function returns a Series."""
        result = pd.Series(data).apply(id)
        assert isinstance(result, pd.Series)

    def test_argsort(self, data_for_sorting):
        """``Series.argsort`` returns the sorting positions as ``np.intp``."""
        result = pd.Series(data_for_sorting).argsort()
        # argsort result gets passed to take, so should be np.intp
        expected = pd.Series(np.array([2, 0, 1], dtype=np.intp))
        self.assert_series_equal(result, expected)

    def test_argsort_missing_array(self, data_missing_for_sorting):
        """Array-level ``argsort`` places the missing value last."""
        result = data_missing_for_sorting.argsort()
        # argsort result gets passed to take, so should be np.intp
        expected = np.array([2, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    def test_argsort_missing(self, data_missing_for_sorting):
        """``Series.argsort`` reports -1 at the position of the missing value."""
        result = pd.Series(data_missing_for_sorting).argsort()
        expected = pd.Series(np.array([1, -1, 0], dtype=np.intp))
        self.assert_series_equal(result, expected)

    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
        """``argmin``/``argmax`` return the first extreme and skip missing values."""
        # NOTE: ``na_value`` is unused here but kept in the signature so that
        # subclasses calling ``super().test_argmin_argmax(...)`` keep working.
        # GH 24382

        # data_for_sorting -> [B, C, A] with A < B < C
        assert data_for_sorting.argmax() == 1
        assert data_for_sorting.argmin() == 2

        # with repeated values -> first occurrence
        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
        assert data.argmax() == 3
        assert data.argmin() == 0

        # with missing values
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        assert data_missing_for_sorting.argmax() == 0
        assert data_missing_for_sorting.argmin() == 2

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_empty_array(self, method, data):
        """``argmin``/``argmax`` on an empty array raise ``ValueError``."""
        # GH 24382
        err_msg = "attempt to get"
        with pytest.raises(ValueError, match=err_msg):
            getattr(data[:0], method)()

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_all_na(self, method, data, na_value):
        """``argmin``/``argmax`` on an all-missing array raise ``ValueError``."""
        # all missing with skipna=True is the same as empty
        err_msg = "attempt to get"
        data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
        with pytest.raises(ValueError, match=err_msg):
            getattr(data_na, method)()

    @pytest.mark.parametrize(
        "op_name, skipna, expected",
        [
            ("idxmax", True, 0),
            ("idxmin", True, 2),
            ("argmax", True, 0),
            ("argmin", True, 2),
            ("idxmax", False, np.nan),
            ("idxmin", False, np.nan),
            ("argmax", False, -1),
            ("argmin", False, -1),
        ],
    )
    def test_argreduce_series(
        self, data_missing_for_sorting, op_name, skipna, expected
    ):
        """Series arg-reductions honour ``skipna`` in the presence of a missing value."""
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        ser = pd.Series(data_missing_for_sorting)
        result = getattr(ser, op_name)(skipna=skipna)
        tm.assert_almost_equal(result, expected)

    def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting):
        """Array-level ``argmin``/``argmax`` with ``skipna=False`` are not implemented."""
        # GH#38733
        data = data_missing_for_sorting

        # match="" accepts any message; only the exception type is checked.
        with pytest.raises(NotImplementedError, match=""):
            data.argmin(skipna=False)

        with pytest.raises(NotImplementedError, match=""):
            data.argmax(skipna=False)

    @pytest.mark.parametrize(
        "na_position, expected",
        [
            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
        ],
    )
    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
        """``nargsort`` places the missing value according to ``na_position``."""
        # GH 25439
        result = nargsort(data_missing_for_sorting, na_position=na_position)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
        """``Series.sort_values`` orders the values in both directions."""
        ser = pd.Series(data_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        expected = ser.iloc[[2, 0, 1]]
        if not ascending:
            # GH 35922. Expect stable sort
            if ser.nunique() == 2:
                expected = ser.iloc[[0, 1, 2]]
            else:
                expected = ser.iloc[[1, 0, 2]]

        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(
        self, data_missing_for_sorting, ascending, sort_by_key
    ):
        """``Series.sort_values`` keeps the missing value last in both directions."""
        ser = pd.Series(data_missing_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        if ascending:
            expected = ser.iloc[[2, 0, 1]]
        else:
            expected = ser.iloc[[0, 2, 1]]
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_frame(self, data_for_sorting, ascending):
        """``DataFrame.sort_values`` on two keys, with the extension column as tie-breaker."""
        # data_for_sorting -> [B, C, A] with A < B < C, so the rows are
        # (1, B), (2, C), (1, A) at index 0, 1, 2.
        df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
        # ``ascending`` used to be parametrized but never forwarded, so both
        # parametrizations exercised the ascending sort only.
        result = df.sort_values(["A", "B"], ascending=ascending)
        if ascending:
            order = [2, 0, 1]
            expected_a = [1, 1, 2]
        else:
            order = [1, 0, 2]
            expected_a = [2, 1, 1]
        expected = pd.DataFrame(
            {"A": expected_a, "B": data_for_sorting.take(order)}, index=order
        )
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
    def test_unique(self, data, box, method):
        """``unique`` collapses duplicates and preserves the array type."""
        # Pass the dtype explicitly so that inference cannot pick a different
        # dtype than the one under test (consistent with the other
        # ``_from_sequence`` calls in this class).
        duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype))

        result = method(duplicated)

        assert len(result) == 1
        assert isinstance(result, type(data))
        assert result[0] == duplicated[0]

    def test_factorize(self, data_for_grouping):
        """``pd.factorize`` yields intp codes with -1 for missing values."""
        codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True)
        expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp)
        expected_uniques = data_for_grouping.take([0, 4, 7])

        tm.assert_numpy_array_equal(codes, expected_codes)
        self.assert_extension_array_equal(uniques, expected_uniques)

    def test_factorize_equivalence(self, data_for_grouping):
        """``pd.factorize`` and the array's own ``factorize`` agree."""
        codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True)
        codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True)

        tm.assert_numpy_array_equal(codes_1, codes_2)
        self.assert_extension_array_equal(uniques_1, uniques_2)
        assert len(uniques_1) == len(pd.unique(uniques_1))
        assert uniques_1.dtype == data_for_grouping.dtype

    def test_factorize_empty(self, data):
        """Factorizing an empty array gives empty codes and empty uniques."""
        codes, uniques = pd.factorize(data[:0])
        expected_codes = np.array([], dtype=np.intp)
        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

        tm.assert_numpy_array_equal(codes, expected_codes)
        self.assert_extension_array_equal(uniques, expected_uniques)

    def test_fillna_copy_frame(self, data_missing):
        """Writing into the result of ``DataFrame.fillna`` leaves the original intact."""
        arr = data_missing.take([1, 1])
        df = pd.DataFrame({"A": arr})
        df_orig = df.copy()

        filled_val = df.iloc[0, 0]
        result = df.fillna(filled_val)

        # Mutate the result; the original frame must not change.
        result.iloc[0, 0] = filled_val

        self.assert_frame_equal(df, df_orig)

    def test_fillna_copy_series(self, data_missing):
        """Writing into the result of ``Series.fillna`` leaves the original intact."""
        arr = data_missing.take([1, 1])
        ser = pd.Series(arr, copy=False)
        ser_orig = ser.copy()

        filled_val = ser[0]
        result = ser.fillna(filled_val)
        # Mutate the result; the original series must not change.
        result.iloc[0] = filled_val

        self.assert_series_equal(ser, ser_orig)

    def test_fillna_length_mismatch(self, data_missing):
        """``fillna`` with an array of the wrong length raises ``ValueError``."""
        msg = "Length of 'value' does not match."
        with pytest.raises(ValueError, match=msg):
            data_missing.fillna(data_missing.take([1]))

    # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool]
    _combine_le_expected_dtype: Dtype = np.dtype(bool)

    def test_combine_le(self, data_repeated):
        """``Series.combine`` with ``<=`` works against a Series and a scalar."""
        # GH 20825
        # Test that combine works when doing a <= (le) comparison
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
        expected = pd.Series(
            [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
            dtype=self._combine_le_expected_dtype,
        )
        self.assert_series_equal(result, expected)

        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 <= x2)
        expected = pd.Series(
            [a <= val for a in list(orig_data1)],
            dtype=self._combine_le_expected_dtype,
        )
        self.assert_series_equal(result, expected)

    def test_combine_add(self, data_repeated):
        """``Series.combine`` with ``+`` works against a Series and a scalar."""
        # GH 20825
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 + x2)
        # Overflow warnings from the element-wise additions are silenced
        # while building the expected values.
        with np.errstate(over="ignore"):
            expected = pd.Series(
                orig_data1._from_sequence(
                    [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
                )
            )
        self.assert_series_equal(result, expected)

        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 + x2)
        expected = pd.Series(
            orig_data1._from_sequence([a + val for a in list(orig_data1)])
        )
        self.assert_series_equal(result, expected)

    def test_combine_first(self, data):
        """``combine_first`` of two overlapping slices reproduces their union."""
        # https://github.com/pandas-dev/pandas/issues/24147
        a = pd.Series(data[:3])
        b = pd.Series(data[2:5], index=[2, 3, 4])
        result = a.combine_first(b)
        expected = pd.Series(data[:5])
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("frame", [True, False])
    @pytest.mark.parametrize(
        "periods, indices",
        [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
    )
    def test_container_shift(self, data, frame, periods, indices):
        """Series/DataFrame ``shift`` equals ``take`` with fill on the array."""
        # https://github.com/pandas-dev/pandas/issues/22386
        subset = data[:5]
        data = pd.Series(subset, name="A")
        # -1 entries in ``indices`` become missing values via allow_fill.
        expected = pd.Series(subset.take(indices, allow_fill=True), name="A")

        if frame:
            result = data.to_frame(name="A").assign(B=1).shift(periods)
            expected = pd.concat(
                [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
            )
            compare = self.assert_frame_equal
        else:
            result = data.shift(periods)
            compare = self.assert_series_equal

        compare(result, expected)

    def test_shift_0_periods(self, data):
        """``shift(0)`` returns data that is independent of the original."""
        # GH#33856 shifting with periods=0 should return a copy, not same obj
        result = data.shift(0)
        assert data[0] != data[1]  # otherwise below is invalid
        data[0] = data[1]
        assert result[0] != result[1]  # i.e. not the same object/view

    @pytest.mark.parametrize("periods", [1, -2])
    def test_diff(self, data, periods):
        """``diff`` equals the array op applied to the data and its shift."""
        data = data[:5]
        if is_bool_dtype(data.dtype):
            op = operator.xor
        else:
            op = operator.sub
        try:
            # does this array implement ops?
            op(data, data)
        except Exception:
            # Deliberately broad: any failure means the op is unsupported.
            pytest.skip(f"{type(data)} does not support diff")
        s = pd.Series(data)
        result = s.diff(periods)
        expected = pd.Series(op(data, data.shift(periods)))
        self.assert_series_equal(result, expected)

        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
        result = df.diff(periods)
        if periods == 1:
            b = [np.nan, 0, 0, 0, 0]
        else:
            b = [0, 0, 0, np.nan, np.nan]
        expected = pd.DataFrame({"A": expected, "B": b})
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "periods, indices",
        [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
    )
    def test_shift_non_empty_array(self, data, periods, indices):
        """Array ``shift`` equals ``take`` with fill, even past the array length."""
        # https://github.com/pandas-dev/pandas/issues/23911
        subset = data[:2]
        result = subset.shift(periods)
        expected = subset.take(indices, allow_fill=True)
        self.assert_extension_array_equal(result, expected)

    @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
    def test_shift_empty_array(self, data, periods):
        """Shifting an empty array yields an equal empty array."""
        # https://github.com/pandas-dev/pandas/issues/23911
        empty = data[:0]
        result = empty.shift(periods)
        expected = empty
        self.assert_extension_array_equal(result, expected)

    def test_shift_zero_copies(self, data):
        """``shift`` returns a new object for zero periods and for empty arrays."""
        # GH#31502
        result = data.shift(0)
        assert result is not data

        # Compare against the empty slice that was actually shifted; comparing
        # against ``data`` is always true because slicing already creates a
        # distinct object.
        empty = data[:0]
        result = empty.shift(2)
        assert result is not empty

    def test_shift_fill_value(self, data):
        """``shift`` fills the vacated positions with ``fill_value``."""
        arr = data[:4]
        fill_value = data[0]
        result = arr.shift(1, fill_value=fill_value)
        expected = data.take([0, 0, 1, 2])
        self.assert_extension_array_equal(result, expected)

        result = arr.shift(-2, fill_value=fill_value)
        expected = data.take([2, 3, 0, 0])
        self.assert_extension_array_equal(result, expected)

    def test_not_hashable(self, data):
        """Extension arrays raise ``TypeError`` when hashed."""
        # We are in general mutable, so not hashable
        with pytest.raises(TypeError, match="unhashable type"):
            hash(data)

    def test_hash_pandas_object_works(self, data, as_frame):
        """``hash_pandas_object`` is deterministic for extension-backed objects."""
        # https://github.com/pandas-dev/pandas/issues/23066
        data = pd.Series(data)
        if as_frame:
            data = data.to_frame()
        a = pd.util.hash_pandas_object(data)
        b = pd.util.hash_pandas_object(data)
        self.assert_equal(a, b)

    def test_searchsorted(self, data_for_sorting, as_series):
        """``searchsorted`` supports both sides, array-like input and ``sorter``."""
        b, c, a = data_for_sorting
        arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]

        if as_series:
            arr = pd.Series(arr)
        assert arr.searchsorted(a) == 0
        assert arr.searchsorted(a, side="right") == 1

        assert arr.searchsorted(b) == 1
        assert arr.searchsorted(b, side="right") == 2

        assert arr.searchsorted(c) == 2
        assert arr.searchsorted(c, side="right") == 3

        result = arr.searchsorted(arr.take([0, 2]))
        expected = np.array([0, 2], dtype=np.intp)

        tm.assert_numpy_array_equal(result, expected)

        # sorter
        sorter = np.array([1, 2, 0])
        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0

    def test_where_series(self, data, na_value, as_frame):
        """``where`` and in-place ``mask`` agree, with and without ``other``."""
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        orig = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
        ser = orig.copy()
        cond = np.array([True, True, False, False])

        if as_frame:
            ser = ser.to_frame(name="a")
            cond = cond.reshape(-1, 1)

        result = ser.where(cond)
        expected = pd.Series(
            cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
        )

        if as_frame:
            expected = expected.to_frame(name="a")
        self.assert_equal(result, expected)

        # mask with the inverted condition must match where
        ser.mask(~cond, inplace=True)
        self.assert_equal(ser, expected)

        # array other
        ser = orig.copy()
        if as_frame:
            ser = ser.to_frame(name="a")
        cond = np.array([True, False, True, True])
        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        if as_frame:
            other = pd.DataFrame({"a": other})
            cond = pd.DataFrame({"a": cond})
        result = ser.where(cond, other)
        expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
        if as_frame:
            expected = expected.to_frame(name="a")
        self.assert_equal(result, expected)

        ser.mask(~cond, other, inplace=True)
        self.assert_equal(ser, expected)

    @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
    def test_repeat(self, data, repeats, as_series, use_numpy):
        """``repeat`` (method or ``np.repeat``) repeats each element as requested."""
        arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
        if as_series:
            arr = pd.Series(arr)

        result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)

        # Broadcast a scalar count to one count per element.
        repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
        expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
        expected = type(data)._from_sequence(expected, dtype=data.dtype)
        if as_series:
            expected = pd.Series(expected, index=arr.index.repeat(repeats))

        self.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "repeats, kwargs, error, msg",
        [
            (2, {"axis": 1}, ValueError, "axis"),
            (-1, {}, ValueError, "negative"),
            ([1, 2], {}, ValueError, "shape"),
            (2, {"foo": "bar"}, TypeError, "'foo'"),
        ],
    )
    def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
        """Invalid ``repeat`` arguments raise the expected exception."""
        with pytest.raises(error, match=msg):
            if use_numpy:
                np.repeat(data, repeats, **kwargs)
            else:
                data.repeat(repeats, **kwargs)

    def test_delete(self, data):
        """``delete`` removes a single position or a list of positions."""
        result = data.delete(0)
        expected = data[1:]
        self.assert_extension_array_equal(result, expected)

        result = data.delete([1, 3])
        expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
        self.assert_extension_array_equal(result, expected)

    def test_insert(self, data):
        """``insert`` works at the start (positive or negative loc) and in the middle."""
        # insert at the beginning
        result = data[1:].insert(0, data[0])
        self.assert_extension_array_equal(result, data)

        result = data[1:].insert(-len(data[1:]), data[0])
        self.assert_extension_array_equal(result, data)

        # insert at the middle
        result = data[:-1].insert(4, data[-1])

        # Build the expected order: the last element moves to position 4 and
        # everything from position 4 onwards shifts right by one.
        taker = np.arange(len(data))
        taker[5:] = taker[4:-1]
        taker[4] = len(data) - 1
        expected = data.take(taker)
        self.assert_extension_array_equal(result, expected)

    def test_insert_invalid(self, data, invalid_scalar):
        """Inserting an invalid scalar raises ``TypeError`` or ``ValueError``."""
        item = invalid_scalar

        with pytest.raises((TypeError, ValueError)):
            data.insert(0, item)

        with pytest.raises((TypeError, ValueError)):
            data.insert(4, item)

        with pytest.raises((TypeError, ValueError)):
            data.insert(len(data) - 1, item)

    def test_insert_invalid_loc(self, data):
        """Out-of-bounds or non-integer insert locations raise."""
        ub = len(data)

        with pytest.raises(IndexError):
            data.insert(ub + 1, data[0])

        with pytest.raises(IndexError):
            data.insert(-ub - 1, data[0])

        with pytest.raises(TypeError):
            # we expect TypeError here instead of IndexError to match np.insert
            data.insert(1.5, data[0])

    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
    def test_equals(self, data, na_value, as_series, box):
        """``equals`` returns a real ``bool`` for equal and unequal inputs."""
        # NOTE: ``as_series`` is unused here but kept so the signature (and
        # therefore the parametrization seen by subclasses) is unchanged.
        data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
        data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)

        data = tm.box_expected(data, box, transpose=False)
        data2 = tm.box_expected(data2, box, transpose=False)
        data_na = tm.box_expected(data_na, box, transpose=False)

        # we are asserting with `is True/False` explicitly, to test that the
        # result is an actual Python bool, and not something "truthy"

        assert data.equals(data) is True
        assert data.equals(data.copy()) is True

        # unequal other data
        assert data.equals(data2) is False
        assert data.equals(data_na) is False

        # different length
        assert data[:2].equals(data[:3]) is False

        # empty are equal
        assert data[:0].equals(data[:0]) is True

        # other types
        assert data.equals(None) is False
        assert data[[0]].equals(data[0]) is False
|