123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516 |
- import operator
- import numpy as np
- import pytest
- import pandas as pd
- import pandas._testing as tm
- from pandas.core.arrays.sparse import (
- SparseArray,
- SparseDtype,
- )
@pytest.fixture(params=["integer", "block"])
def kind(request):
    """
    Fixture returning the ``kind`` keyword to pass to SparseArray.

    Parametrized over ``"integer"`` and ``"block"``, so a test requesting
    this fixture runs once for each value.
    """
    return request.param
@pytest.fixture(params=[True, False])
def mix(request):
    """
    Fixture returning True or False, determining whether to operate
    op(sparse, dense) instead of op(sparse, sparse).

    Parametrized over both values, so a test requesting this fixture runs
    once with a dense right-hand operand and once with a sparse one.
    """
    return request.param
class TestSparseArrayArithmetics:
    """
    Compare SparseArray arithmetic, comparison and logical operators with
    the results of the same operators applied to the dense data.
    """

    def _assert(self, a, b):
        """Assert that the two dense arrays ``a`` and ``b`` are equal."""
        # We have to use tm.assert_sp_array_equal. See GH #45126
        # NOTE(review): the comment above is kept from the original, yet the
        # dense comparison below is what actually runs -- confirm the intent.
        tm.assert_numpy_array_equal(a, b)

    def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op):
        """
        Check that arithmetic behavior matches non-Sparse Series arithmetic.

        Parameters
        ----------
        a : SparseArray
            Left operand.
        b : SparseArray or scalar
            Right operand used when ``mix`` is False.
        a_dense, b_dense : np.ndarray or scalar
            Dense counterparts of ``a`` and ``b``; at least one of them has
            to be an ndarray so the expected value can be computed.
        mix : bool
            If True, evaluate ``op(a, b_dense)`` instead of ``op(a, b)``.
        op : callable
            Binary arithmetic function taking two operands.

        Raises
        ------
        NotImplementedError
            If neither ``a_dense`` nor ``b_dense`` is an ndarray.
        """
        if isinstance(a_dense, np.ndarray):
            expected = op(pd.Series(a_dense), b_dense).values
        elif isinstance(b_dense, np.ndarray):
            expected = op(a_dense, pd.Series(b_dense)).values
        else:
            raise NotImplementedError

        with np.errstate(invalid="ignore", divide="ignore"):
            rhs = b_dense if mix else b
            result = op(a, rhs).to_dense()

        self._assert(result, expected)

    def _check_bool_result(self, res):
        """Assert ``res`` is a boolean SparseArray with a ``bool`` fill value."""
        assert isinstance(res, SparseArray)
        assert isinstance(res.dtype, SparseDtype)
        assert res.dtype.subtype == np.bool_
        assert isinstance(res.fill_value, bool)

    def _check_comparison_ops(self, a, b, a_dense, b_dense):
        """
        Check ``==, !=, >=, <=, >, <`` of ``a`` against both ``b`` and
        ``b_dense``, comparing each result with the dense comparison
        ``a_dense <op> b_dense``.
        """
        comparisons = (
            operator.eq,
            operator.ne,
            operator.ge,
            operator.le,
            operator.gt,
            operator.lt,
        )
        with np.errstate(invalid="ignore"):
            # Unfortunately, trying to wrap the computation of each expected
            # value with np.errstate() is too tedious.
            #
            # First pass: sparse & sparse; second pass: sparse & dense.
            for other in (b, b_dense):
                for compare in comparisons:
                    result = compare(a, other)
                    self._check_bool_result(result)
                    self._assert(result.to_dense(), compare(a_dense, b_dense))

    def _check_logical_ops(self, a, b, a_dense, b_dense):
        """
        Check ``&`` and ``|`` of ``a`` against both ``b`` and ``b_dense``,
        comparing each result with the dense ``a_dense <op> b_dense``.
        """
        # First pass: sparse & sparse; second pass: sparse & dense.
        for other in (b, b_dense):
            for logical in (operator.and_, operator.or_):
                result = logical(a, other)
                self._check_bool_result(result)
                self._assert(result.to_dense(), logical(a_dense, b_dense))

    @pytest.mark.parametrize("scalar", [0, 1, 3])
    @pytest.mark.parametrize("fill_value", [None, 0, 2])
    def test_float_scalar(
        self, kind, mix, all_arithmetic_functions, fill_value, scalar
    ):
        """Arithmetic between a float SparseArray and an integer scalar."""
        op = all_arithmetic_functions
        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        a = SparseArray(values, kind=kind, fill_value=fill_value)
        self._check_numeric_ops(a, scalar, values, scalar, mix, op)

    def test_float_scalar_comparison(self, kind):
        """Comparisons between a float SparseArray and an integer scalar."""
        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])

        # fill_value=None is the constructor default, i.e. "not specified".
        for fill_value in (None, 0, 2):
            a = SparseArray(values, kind=kind, fill_value=fill_value)
            for scalar in (1, 0, 3):
                self._check_comparison_ops(a, scalar, values, scalar)

    def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions):
        """Arithmetic on NaN-free float arrays with a shared fill value of 0."""
        # when sp_index are the same
        op = all_arithmetic_functions

        values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
        rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind, fill_value=0)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

    def test_float_same_index_with_nans(self, kind, mix, all_arithmetic_functions):
        """Arithmetic on float arrays holding NaN at the same positions."""
        # when sp_index are the same
        op = all_arithmetic_functions
        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])

        a = SparseArray(values, kind=kind)
        b = SparseArray(rvalues, kind=kind)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

    def test_float_same_index_comparison(self, kind):
        """Comparisons on float arrays, with NaNs and with fill value 0."""
        # when sp_index are the same
        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])

        a = SparseArray(values, kind=kind)
        b = SparseArray(rvalues, kind=kind)
        self._check_comparison_ops(a, b, values, rvalues)

        values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
        rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind, fill_value=0)
        self._check_comparison_ops(a, b, values, rvalues)

    def test_float_array(self, kind, mix, all_arithmetic_functions):
        """Arithmetic on float arrays for several fill-value combinations."""
        op = all_arithmetic_functions

        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])

        a = SparseArray(values, kind=kind)
        b = SparseArray(rvalues, kind=kind)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)
        self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind, fill_value=0)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, kind=kind, fill_value=1)
        b = SparseArray(rvalues, kind=kind, fill_value=2)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

    def test_float_array_different_kind(self, mix, all_arithmetic_functions):
        """Arithmetic between an "integer"-kind and a "block"-kind array."""
        op = all_arithmetic_functions

        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])

        a = SparseArray(values, kind="integer")
        b = SparseArray(rvalues, kind="block")
        self._check_numeric_ops(a, b, values, rvalues, mix, op)
        self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)

        a = SparseArray(values, kind="integer", fill_value=0)
        b = SparseArray(rvalues, kind="block")
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, kind="integer", fill_value=0)
        b = SparseArray(rvalues, kind="block", fill_value=0)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, kind="integer", fill_value=1)
        b = SparseArray(rvalues, kind="block", fill_value=2)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

    def test_float_array_comparison(self, kind):
        """Comparisons on float arrays for several fill-value combinations."""
        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])

        a = SparseArray(values, kind=kind)
        b = SparseArray(rvalues, kind=kind)
        self._check_comparison_ops(a, b, values, rvalues)
        self._check_comparison_ops(a, b * 0, values, rvalues * 0)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind)
        self._check_comparison_ops(a, b, values, rvalues)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind, fill_value=0)
        self._check_comparison_ops(a, b, values, rvalues)

        a = SparseArray(values, kind=kind, fill_value=1)
        b = SparseArray(rvalues, kind=kind, fill_value=2)
        self._check_comparison_ops(a, b, values, rvalues)

    def test_int_array(self, kind, mix, all_arithmetic_functions):
        """Arithmetic on int64 arrays, also checking the resulting SparseDtype."""
        op = all_arithmetic_functions

        # have to specify dtype explicitly until fixing GH 667
        dtype = np.int64

        values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
        rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)

        a = SparseArray(values, dtype=dtype, kind=kind)
        assert a.dtype == SparseDtype(dtype)
        b = SparseArray(rvalues, dtype=dtype, kind=kind)
        assert b.dtype == SparseDtype(dtype)

        self._check_numeric_ops(a, b, values, rvalues, mix, op)
        self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)

        a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind)
        assert a.dtype == SparseDtype(dtype)
        b = SparseArray(rvalues, dtype=dtype, kind=kind)
        assert b.dtype == SparseDtype(dtype)

        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind)
        assert a.dtype == SparseDtype(dtype)
        b = SparseArray(rvalues, fill_value=0, dtype=dtype, kind=kind)
        assert b.dtype == SparseDtype(dtype)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, fill_value=1, dtype=dtype, kind=kind)
        assert a.dtype == SparseDtype(dtype, fill_value=1)
        b = SparseArray(rvalues, fill_value=2, dtype=dtype, kind=kind)
        assert b.dtype == SparseDtype(dtype, fill_value=2)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

    def test_int_array_comparison(self, kind):
        """Comparisons on int64 arrays for several fill-value combinations."""
        dtype = "int64"
        # int32 NI ATM

        values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
        rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)

        a = SparseArray(values, dtype=dtype, kind=kind)
        b = SparseArray(rvalues, dtype=dtype, kind=kind)
        self._check_comparison_ops(a, b, values, rvalues)
        self._check_comparison_ops(a, b * 0, values, rvalues * 0)

        a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0)
        b = SparseArray(rvalues, dtype=dtype, kind=kind)
        self._check_comparison_ops(a, b, values, rvalues)

        a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0)
        b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=0)
        self._check_comparison_ops(a, b, values, rvalues)

        a = SparseArray(values, dtype=dtype, kind=kind, fill_value=1)
        b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=2)
        self._check_comparison_ops(a, b, values, rvalues)

    @pytest.mark.parametrize("fill_value", [True, False, np.nan])
    def test_bool_same_index(self, kind, fill_value):
        """Logical ops on two boolean arrays holding identical values."""
        # GH 14000
        # when sp_index are the same
        values = np.array([True, False, True, True], dtype=np.bool_)
        rvalues = np.array([True, False, True, True], dtype=np.bool_)

        a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
        b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
        self._check_logical_ops(a, b, values, rvalues)

    @pytest.mark.parametrize("fill_value", [True, False, np.nan])
    def test_bool_array_logical(self, kind, fill_value):
        """Logical ops on two boolean arrays holding different values."""
        # GH 14000
        values = np.array([True, False, True, False, True, True], dtype=np.bool_)
        rvalues = np.array([True, False, False, True, False, True], dtype=np.bool_)

        a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
        b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
        self._check_logical_ops(a, b, values, rvalues)

    def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions):
        """Arithmetic between a float array and an int64 array."""
        op = all_arithmetic_functions
        rdtype = "int64"
        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)

        a = SparseArray(values, kind=kind)
        b = SparseArray(rvalues, kind=kind)
        assert b.dtype == SparseDtype(rdtype)

        self._check_numeric_ops(a, b, values, rvalues, mix, op)
        self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind)
        assert b.dtype == SparseDtype(rdtype)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind, fill_value=0)
        assert b.dtype == SparseDtype(rdtype)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

        a = SparseArray(values, kind=kind, fill_value=1)
        b = SparseArray(rvalues, kind=kind, fill_value=2)
        assert b.dtype == SparseDtype(rdtype, fill_value=2)
        self._check_numeric_ops(a, b, values, rvalues, mix, op)

    def test_mixed_array_comparison(self, kind):
        """Comparisons between a float array and an int64 array."""
        rdtype = "int64"
        # int32 NI ATM

        values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
        rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)

        a = SparseArray(values, kind=kind)
        b = SparseArray(rvalues, kind=kind)
        assert b.dtype == SparseDtype(rdtype)

        self._check_comparison_ops(a, b, values, rvalues)
        self._check_comparison_ops(a, b * 0, values, rvalues * 0)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind)
        assert b.dtype == SparseDtype(rdtype)
        self._check_comparison_ops(a, b, values, rvalues)

        a = SparseArray(values, kind=kind, fill_value=0)
        b = SparseArray(rvalues, kind=kind, fill_value=0)
        assert b.dtype == SparseDtype(rdtype)
        self._check_comparison_ops(a, b, values, rvalues)

        a = SparseArray(values, kind=kind, fill_value=1)
        b = SparseArray(rvalues, kind=kind, fill_value=2)
        assert b.dtype == SparseDtype(rdtype, fill_value=2)
        self._check_comparison_ops(a, b, values, rvalues)

    def test_xor(self):
        """``^`` on two boolean SparseArrays gives the expected sparse result."""
        s = SparseArray([True, True, False, False])
        t = SparseArray([True, False, True, False])
        result = s ^ t
        sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
        expected = SparseArray([False, True, True], sparse_index=sp_index)
        tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
    """A plain list operand gives the same result as an equal SparseArray."""
    arr = SparseArray([0, 1], fill_value=0)
    result = op(arr, [0, 1])
    expected = op(arr, SparseArray([0, 1]))
    tm.assert_sp_array_equal(result, expected)
def test_with_dataframe():
    """``SparseArray.__add__`` returns NotImplemented for a DataFrame operand."""
    # GH#27910
    arr = SparseArray([0, 1], fill_value=0)
    df = pd.DataFrame([[1, 2], [3, 4]])
    result = arr.__add__(df)
    assert result is NotImplemented
def test_with_zerodim_ndarray():
    """Multiplying by a zero-dimensional ndarray matches multiplying by a scalar."""
    # GH#27910
    arr = SparseArray([0, 1], fill_value=0)

    result = arr * np.array(2)
    expected = arr * 2
    tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.abs, np.exp])
@pytest.mark.parametrize(
    "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])]
)
def test_ufuncs(ufunc, arr):
    """
    A unary ufunc on a SparseArray equals a SparseArray built from the ufunc
    applied to the dense values, with the ufunc applied to the fill value.
    """
    result = ufunc(arr)
    fill_value = ufunc(arr.fill_value)
    expected = SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value)
    tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize(
    "a, b",
    [
        (SparseArray([0, 0, 0]), np.array([0, 1, 2])),
        # The original listed this case four times verbatim; the duplicates
        # added no coverage, so a single copy is kept.
        (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
    ],
)
@pytest.mark.parametrize("ufunc", [np.add, np.greater])
def test_binary_ufuncs(ufunc, a, b):
    """
    A binary ufunc on (SparseArray, ndarray) returns a SparseArray whose
    dense values equal the ufunc applied to the dense operands.
    """
    # can't say anything about fill value here.
    result = ufunc(a, b)
    expected = ufunc(np.asarray(a), np.asarray(b))
    assert isinstance(result, SparseArray)
    tm.assert_numpy_array_equal(np.asarray(result), expected)
def test_ndarray_inplace():
    """``ndarray += SparseArray`` leaves an ndarray holding the element-wise sum."""
    sparray = SparseArray([0, 2, 0, 0])
    ndarray = np.array([0, 1, 2, 3])
    ndarray += sparray
    expected = np.array([0, 3, 2, 3])
    tm.assert_numpy_array_equal(ndarray, expected)
def test_sparray_inplace():
    """``SparseArray += ndarray`` leaves a SparseArray holding the element-wise sum."""
    sparray = SparseArray([0, 2, 0, 0])
    ndarray = np.array([0, 1, 2, 3])
    sparray += ndarray
    expected = SparseArray([0, 3, 2, 3], fill_value=0)
    tm.assert_sp_array_equal(sparray, expected)
@pytest.mark.parametrize("cons", [list, np.array, SparseArray])
def test_mismatched_length_cmp_op(cons):
    """``&`` with an operand of a different length raises ValueError."""
    left = SparseArray([True, True])
    right = cons([True, True, True])
    with pytest.raises(ValueError, match="operands have mismatched length"):
        # The expression is evaluated only for the exception it raises.
        left & right
@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
@pytest.mark.parametrize("fill_value", [np.nan, 3])
def test_binary_operators(op, fill_value):
    """
    Binary operators on SparseArrays agree with the dense computation.

    Checks sparse-sparse, sparse-dense, dense-sparse and sparse-scalar
    operands for the operator named by ``op`` (an attribute of the
    ``operator`` module).
    """
    op_func = getattr(operator, op)

    # A seeded generator keeps the test data identical on every run.
    rng = np.random.default_rng(2)
    data1 = rng.standard_normal(20)
    data2 = rng.standard_normal(20)

    data1[::2] = fill_value
    data2[::3] = fill_value

    first = SparseArray(data1, fill_value=fill_value)
    second = SparseArray(data2, fill_value=fill_value)

    with np.errstate(all="ignore"):
        res = op_func(first, second)
        expected = SparseArray(
            op_func(first.to_dense(), second.to_dense()),
            fill_value=first.fill_value,
        )
        assert isinstance(res, SparseArray)
        tm.assert_almost_equal(res.to_dense(), expected.to_dense())

        res2 = op_func(first, second.to_dense())
        assert isinstance(res2, SparseArray)
        tm.assert_sp_array_equal(res, res2)

        res3 = op_func(first.to_dense(), second)
        assert isinstance(res3, SparseArray)
        tm.assert_sp_array_equal(res, res3)

        res4 = op_func(first, 4)
        assert isinstance(res4, SparseArray)

        # Ignore this if the actual op raises (e.g. pow).
        try:
            exp_dense = op_func(first.to_dense(), 4)
            exp_fv = op_func(first.fill_value, 4)
        except ValueError:
            pass
        else:
            tm.assert_almost_equal(res4.fill_value, exp_fv)
            tm.assert_almost_equal(res4.to_dense(), exp_dense)
|