- """
- SparseArray data structure
- """
- from __future__ import annotations
- from collections import abc
- import numbers
- import operator
- from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Literal,
- Sequence,
- TypeVar,
- cast,
- overload,
- )
- import warnings
- import numpy as np
- from pandas._libs import lib
- import pandas._libs.sparse as splib
- from pandas._libs.sparse import (
- BlockIndex,
- IntIndex,
- SparseIndex,
- )
- from pandas._libs.tslibs import NaT
- from pandas._typing import (
- ArrayLike,
- AstypeArg,
- Axis,
- AxisInt,
- Dtype,
- NpDtype,
- PositionalIndexer,
- Scalar,
- ScalarIndexer,
- SequenceIndexer,
- npt,
- )
- from pandas.compat.numpy import function as nv
- from pandas.errors import PerformanceWarning
- from pandas.util._exceptions import find_stack_level
- from pandas.util._validators import (
- validate_bool_kwarg,
- validate_insert_loc,
- )
- from pandas.core.dtypes.astype import astype_array
- from pandas.core.dtypes.cast import (
- construct_1d_arraylike_from_scalar,
- find_common_type,
- maybe_box_datetimelike,
- )
- from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_datetime64_any_dtype,
- is_datetime64tz_dtype,
- is_dtype_equal,
- is_integer,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
- )
- from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
- )
- from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
- notna,
- )
- from pandas.core import (
- arraylike,
- ops,
- )
- import pandas.core.algorithms as algos
- from pandas.core.arraylike import OpsMixin
- from pandas.core.arrays import ExtensionArray
- from pandas.core.arrays.sparse.dtype import SparseDtype
- from pandas.core.base import PandasObject
- import pandas.core.common as com
- from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
- sanitize_array,
- )
- from pandas.core.indexers import (
- check_array_indexer,
- unpack_tuple_and_ellipses,
- )
- from pandas.core.missing import interpolate_2d
- from pandas.core.nanops import check_below_min_count
- from pandas.io.formats import printing
- # See https://github.com/python/typing/issues/684
- if TYPE_CHECKING:
- from enum import Enum
- class ellipsis(Enum):
- Ellipsis = "..."
- Ellipsis = ellipsis.Ellipsis
- from scipy.sparse import spmatrix
- from pandas._typing import (
- FillnaOptions,
- NumpySorter,
- )
- SparseIndexKind = Literal["integer", "block"]
- from pandas import Series
- else:
- ellipsis = type(Ellipsis)
- # ----------------------------------------------------------------------------
- # Array
- SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")
- _sparray_doc_kwargs = {"klass": "SparseArray"}
- def _get_fill(arr: SparseArray) -> np.ndarray:
- """
- Create a 0-dim ndarray containing the fill value
- Parameters
- ----------
- arr : SparseArray
- Returns
- -------
- fill_value : ndarray
- 0-dim ndarray with just the fill value.
- Notes
- -----
- coerce fill_value to arr dtype if possible
- int64 SparseArray can have NaN as fill_value if there is no missing
- """
- try:
- return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
- except ValueError:
- return np.asarray(arr.fill_value)
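- # A doctest-style sketch of the coercion above (illustrative; assumes a
- # 64-bit platform where the default integer subtype is int64, and uses the
- # public SparseArray constructor defined later in this module):
- # >>> from pandas.arrays import SparseArray
- # >>> _get_fill(SparseArray([0, 1], fill_value=0)).dtype
- # dtype('int64')
- # >>> _get_fill(SparseArray([0.0, 1.0])).dtype  # NaN fill stays float
- # dtype('float64')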
- def _sparse_array_op(
- left: SparseArray, right: SparseArray, op: Callable, name: str
- ) -> SparseArray:
- """
- Perform a binary operation between two arrays.
- Parameters
- ----------
- left : SparseArray
- right : SparseArray
- op : Callable
- The binary operation to perform
- name : str
- Name of the callable.
- Returns
- -------
- SparseArray
- """
- if name.startswith("__"):
- # For lookups in _libs.sparse we need non-dunder op name
- name = name[2:-2]
- # dtype used to find corresponding sparse method
- ltype = left.dtype.subtype
- rtype = right.dtype.subtype
- if not is_dtype_equal(ltype, rtype):
- subtype = find_common_type([ltype, rtype])
- ltype = SparseDtype(subtype, left.fill_value)
- rtype = SparseDtype(subtype, right.fill_value)
- left = left.astype(ltype, copy=False)
- right = right.astype(rtype, copy=False)
- dtype = ltype.subtype
- else:
- dtype = ltype
- # dtype the result must have
- result_dtype = None
- if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
- with np.errstate(all="ignore"):
- result = op(left.to_dense(), right.to_dense())
- fill = op(_get_fill(left), _get_fill(right))
- if left.sp_index.ngaps == 0:
- index = left.sp_index
- else:
- index = right.sp_index
- elif left.sp_index.equals(right.sp_index):
- with np.errstate(all="ignore"):
- result = op(left.sp_values, right.sp_values)
- fill = op(_get_fill(left), _get_fill(right))
- index = left.sp_index
- else:
- if name[0] == "r":
- left, right = right, left
- name = name[1:]
- if name in ("and", "or", "xor") and dtype == "bool":
- opname = f"sparse_{name}_uint8"
- # to make template simple, cast here
- left_sp_values = left.sp_values.view(np.uint8)
- right_sp_values = right.sp_values.view(np.uint8)
- result_dtype = bool
- else:
- opname = f"sparse_{name}_{dtype}"
- left_sp_values = left.sp_values
- right_sp_values = right.sp_values
- if (
- name in ["floordiv", "mod"]
- and (right == 0).any()
- and left.dtype.kind in ["i", "u"]
- ):
- # Match the non-Sparse Series behavior
- opname = f"sparse_{name}_float64"
- left_sp_values = left_sp_values.astype("float64")
- right_sp_values = right_sp_values.astype("float64")
- sparse_op = getattr(splib, opname)
- with np.errstate(all="ignore"):
- result, index, fill = sparse_op(
- left_sp_values,
- left.sp_index,
- left.fill_value,
- right_sp_values,
- right.sp_index,
- right.fill_value,
- )
- if name == "divmod":
- # result is a 2-tuple
- # error: Incompatible return value type (got "Tuple[SparseArray,
- # SparseArray]", expected "SparseArray")
- return ( # type: ignore[return-value]
- _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
- _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
- )
- if result_dtype is None:
- result_dtype = result.dtype
- return _wrap_result(name, result, index, fill, dtype=result_dtype)
- def _wrap_result(
- name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
- ) -> SparseArray:
- """
- wrap op result to have correct dtype
- """
- if name.startswith("__"):
- # e.g. __eq__ --> eq
- name = name[2:-2]
- if name in ("eq", "ne", "lt", "gt", "le", "ge"):
- dtype = bool
- fill_value = lib.item_from_zerodim(fill_value)
- if is_bool_dtype(dtype):
- # fill_value may be np.bool_
- fill_value = bool(fill_value)
- return SparseArray(
- data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
- )
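- # For intuition, a small sketch of the dispatch above, exercised through the
- # public arithmetic path (output shown for a 64-bit build, where the result
- # subtype is int64):
- # >>> from pandas.arrays import SparseArray
- # >>> left = SparseArray([0, 1, 0, 2], fill_value=0)
- # >>> right = SparseArray([0, 1, 1, 0], fill_value=0)
- # >>> (left + right).to_dense()
- # array([0, 2, 1, 2])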
- class SparseArray(OpsMixin, PandasObject, ExtensionArray):
- """
- An ExtensionArray for storing sparse data.
- Parameters
- ----------
- data : array-like or scalar
- A dense array of values to store in the SparseArray. This may contain
- `fill_value`.
- sparse_index : SparseIndex, optional
- fill_value : scalar, optional
- Elements in data that are ``fill_value`` are not stored in the
- SparseArray. For memory savings, this should be the most common value
- in `data`. By default, `fill_value` depends on the dtype of `data`:
- =========== ==========
- data.dtype na_value
- =========== ==========
- float ``np.nan``
- int ``0``
- bool ``False``
- datetime64 ``pd.NaT``
- timedelta64 ``pd.NaT``
- =========== ==========
- The fill value is potentially specified in three ways. In order of
- precedence, these are
- 1. The `fill_value` argument
- 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
- a ``SparseDtype``
- 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
- is not a ``SparseDtype`` and `data` is a ``SparseArray``.
- kind : str
- Can be 'integer' or 'block', default is 'integer'.
- The type of storage for sparse locations.
- * 'block': Stores a `block` and `block_length` for each
- contiguous *span* of sparse values. This is best when
- sparse data tends to be clumped together, with large
- regions of ``fill_value`` values between sparse values.
- * 'integer': uses an integer to store the location of
- each sparse value.
- dtype : np.dtype or SparseDtype, optional
- The dtype to use for the SparseArray. For numpy dtypes, this
- determines the dtype of ``self.sp_values``. For SparseDtype,
- this determines ``self.sp_values`` and ``self.fill_value``.
- copy : bool, default False
- Whether to explicitly copy the incoming `data` array.
- Attributes
- ----------
- None
- Methods
- -------
- None
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> arr = SparseArray([0, 0, 1, 2])
- >>> arr
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- """
- _subtyp = "sparse_array" # register ABCSparseArray
- _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
- _sparse_index: SparseIndex
- _sparse_values: np.ndarray
- _dtype: SparseDtype
- def __init__(
- self,
- data,
- sparse_index=None,
- fill_value=None,
- kind: SparseIndexKind = "integer",
- dtype: Dtype | None = None,
- copy: bool = False,
- ) -> None:
- if fill_value is None and isinstance(dtype, SparseDtype):
- fill_value = dtype.fill_value
- if isinstance(data, type(self)):
- # disable normal inference on dtype, sparse_index, & fill_value
- if sparse_index is None:
- sparse_index = data.sp_index
- if fill_value is None:
- fill_value = data.fill_value
- if dtype is None:
- dtype = data.dtype
- # TODO: make kind=None, and use data.kind?
- data = data.sp_values
- # Handle user-provided dtype
- if isinstance(dtype, str):
- # Two options: dtype='int', regular numpy dtype
- # or dtype='Sparse[int]', a sparse dtype
- try:
- dtype = SparseDtype.construct_from_string(dtype)
- except TypeError:
- dtype = pandas_dtype(dtype)
- if isinstance(dtype, SparseDtype):
- if fill_value is None:
- fill_value = dtype.fill_value
- dtype = dtype.subtype
- if is_scalar(data):
- if sparse_index is None:
- npoints = 1
- else:
- npoints = sparse_index.length
- data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
- dtype = data.dtype
- if dtype is not None:
- dtype = pandas_dtype(dtype)
- # TODO: disentangle the fill_value dtype inference from
- # dtype inference
- if data is None:
- # TODO: What should the empty dtype be? Object or float?
- # error: Argument "dtype" to "array" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
- # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
- # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
- data = np.array([], dtype=dtype) # type: ignore[arg-type]
- if not is_array_like(data):
- try:
- # probably shared code in sanitize_series
- data = sanitize_array(data, index=None)
- except ValueError:
- # NumPy may raise a ValueError on data like [1, []]
- # we retry with object dtype here.
- if dtype is None:
- dtype = np.dtype(object)
- data = np.atleast_1d(np.asarray(data, dtype=dtype))
- else:
- raise
- if copy:
- # TODO: avoid double copy when dtype forces cast.
- data = data.copy()
- if fill_value is None:
- fill_value_dtype = data.dtype if dtype is None else dtype
- if fill_value_dtype is None:
- fill_value = np.nan
- else:
- fill_value = na_value_for_dtype(fill_value_dtype)
- if isinstance(data, type(self)) and sparse_index is None:
- sparse_index = data._sparse_index
- # error: Argument "dtype" to "asarray" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
- sparse_values = np.asarray(
- data.sp_values, dtype=dtype # type: ignore[arg-type]
- )
- elif sparse_index is None:
- data = extract_array(data, extract_numpy=True)
- if not isinstance(data, np.ndarray):
- # EA
- if is_datetime64tz_dtype(data.dtype):
- warnings.warn(
- f"Creating SparseArray from {data.dtype} data "
- "loses timezone information. Cast to object before "
- "sparse to retain timezone information.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- data = np.asarray(data, dtype="datetime64[ns]")
- if fill_value is NaT:
- fill_value = np.datetime64("NaT", "ns")
- data = np.asarray(data)
- sparse_values, sparse_index, fill_value = _make_sparse(
- # error: Argument "dtype" to "_make_sparse" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected
- # "Optional[dtype[Any]]"
- data,
- kind=kind,
- fill_value=fill_value,
- dtype=dtype, # type: ignore[arg-type]
- )
- else:
- # error: Argument "dtype" to "asarray" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
- sparse_values = np.asarray(data, dtype=dtype) # type: ignore[arg-type]
- if len(sparse_values) != sparse_index.npoints:
- raise AssertionError(
- f"Non array-like type {type(sparse_values)} must "
- "have the same length as the index"
- )
- self._sparse_index = sparse_index
- self._sparse_values = sparse_values
- self._dtype = SparseDtype(sparse_values.dtype, fill_value)
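- # A minimal construction sketch for the paths above: an explicit fill_value
- # plus kind="block" stores runs of adjacent points as blocks:
- # >>> from pandas.arrays import SparseArray
- # >>> arr = SparseArray([1, 1, 0, 0, 1], fill_value=0, kind="block")
- # >>> arr.kind
- # 'block'
- # >>> arr.npoints
- # 3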
- @classmethod
- def _simple_new(
- cls: type[SparseArrayT],
- sparse_array: np.ndarray,
- sparse_index: SparseIndex,
- dtype: SparseDtype,
- ) -> SparseArrayT:
- new = object.__new__(cls)
- new._sparse_index = sparse_index
- new._sparse_values = sparse_array
- new._dtype = dtype
- return new
- @classmethod
- def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
- """
- Create a SparseArray from a scipy.sparse matrix.
- Parameters
- ----------
- data : scipy.sparse.spmatrix
- This should be a SciPy sparse matrix where the size
- of the second dimension is 1. In other words, a
- sparse matrix with a single column.
- Returns
- -------
- SparseArray
- Examples
- --------
- >>> import scipy.sparse
- >>> mat = scipy.sparse.coo_matrix((4, 1))
- >>> pd.arrays.SparseArray.from_spmatrix(mat)
- [0.0, 0.0, 0.0, 0.0]
- Fill: 0.0
- IntIndex
- Indices: array([], dtype=int32)
- """
- length, ncol = data.shape
- if ncol != 1:
- raise ValueError(f"'data' must have a single column, not '{ncol}'")
- # our sparse index classes require that the positions be strictly
- increasing. So we need to sort loc and arr accordingly.
- data = data.tocsc()
- data.sort_indices()
- arr = data.data
- idx = data.indices
- zero = np.array(0, dtype=arr.dtype).item()
- dtype = SparseDtype(arr.dtype, zero)
- index = IntIndex(length, idx)
- return cls._simple_new(arr, index, dtype)
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- fill_value = self.fill_value
- if self.sp_index.ngaps == 0:
- # Compat for na dtype and int values.
- return self.sp_values
- if dtype is None:
- # Can NumPy represent this type?
- # If not, `np.result_type` will raise. We catch that
- # and return object.
- if is_datetime64_any_dtype(self.sp_values.dtype):
- # However, we *do* special-case the common case of
- # a datetime64 with pandas NaT.
- if fill_value is NaT:
- # Can't put pd.NaT in a datetime64[ns]
- fill_value = np.datetime64("NaT")
- try:
- dtype = np.result_type(self.sp_values.dtype, type(fill_value))
- except TypeError:
- dtype = object
- out = np.full(self.shape, fill_value, dtype=dtype)
- out[self.sp_index.indices] = self.sp_values
- return out
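- # Densification sketch: np.asarray routes through __array__ above and fills
- # the gaps with fill_value:
- # >>> import numpy as np
- # >>> from pandas.arrays import SparseArray
- # >>> np.asarray(SparseArray([0, 0, 1, 2]))
- # array([0, 0, 1, 2])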
- def __setitem__(self, key, value):
- # I suppose we could allow setting of non-fill_value elements.
- # TODO(SparseArray.__setitem__): remove special cases in
- # ExtensionBlock.where
- msg = "SparseArray does not support item assignment via setitem"
- raise TypeError(msg)
- @classmethod
- def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
- return cls(scalars, dtype=dtype)
- @classmethod
- def _from_factorized(cls, values, original):
- return cls(values, dtype=original.dtype)
- # ------------------------------------------------------------------------
- # Data
- # ------------------------------------------------------------------------
- @property
- def sp_index(self) -> SparseIndex:
- """
- The SparseIndex containing the locations of the non-``fill_value`` points.
- """
- return self._sparse_index
- @property
- def sp_values(self) -> np.ndarray:
- """
- An ndarray containing the non-``fill_value`` values.
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
- >>> s.sp_values
- array([1, 2])
- """
- return self._sparse_values
- @property
- def dtype(self) -> SparseDtype:
- return self._dtype
- @property
- def fill_value(self):
- """
- Elements in `data` that are `fill_value` are not stored.
- For memory savings, this should be the most common value in the array.
- """
- return self.dtype.fill_value
- @fill_value.setter
- def fill_value(self, value) -> None:
- self._dtype = SparseDtype(self.dtype.subtype, value)
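- # Note that assigning fill_value only swaps the dtype; the stored points and
- # sparse index are untouched, so the dense view changes meaning (int64 shown
- # assuming a 64-bit platform):
- # >>> from pandas.arrays import SparseArray
- # >>> arr = SparseArray([0, 0, 1, 2])
- # >>> arr.fill_value = 1
- # >>> arr.dtype
- # Sparse[int64, 1]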
- @property
- def kind(self) -> SparseIndexKind:
- """
- The kind of sparse index for this array. One of {'integer', 'block'}.
- """
- if isinstance(self.sp_index, IntIndex):
- return "integer"
- else:
- return "block"
- @property
- def _valid_sp_values(self) -> np.ndarray:
- sp_vals = self.sp_values
- mask = notna(sp_vals)
- return sp_vals[mask]
- def __len__(self) -> int:
- return self.sp_index.length
- @property
- def _null_fill_value(self) -> bool:
- return self._dtype._is_na_fill_value
- def _fill_value_matches(self, fill_value) -> bool:
- if self._null_fill_value:
- return isna(fill_value)
- else:
- return self.fill_value == fill_value
- @property
- def nbytes(self) -> int:
- return self.sp_values.nbytes + self.sp_index.nbytes
- @property
- def density(self) -> float:
- """
- The percent of non-``fill_value`` points, as a decimal.
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
- >>> s.density
- 0.6
- """
- return self.sp_index.npoints / self.sp_index.length
- @property
- def npoints(self) -> int:
- """
- The number of non-``fill_value`` points.
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
- >>> s.npoints
- 3
- """
- return self.sp_index.npoints
- def isna(self):
- # If null fill value, we want SparseDtype[bool, true]
- # to preserve the same memory usage.
- dtype = SparseDtype(bool, self._null_fill_value)
- if self._null_fill_value:
- return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
- mask = np.full(len(self), False, dtype=np.bool_)
- mask[self.sp_index.indices] = isna(self.sp_values)
- return type(self)(mask, fill_value=False, dtype=dtype)
- def fillna(
- self: SparseArrayT,
- value=None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- ) -> SparseArrayT:
- """
- Fill missing values with `value`.
- Parameters
- ----------
- value : scalar, optional
- method : str, optional
- .. warning::
- Using 'method' will result in high memory use,
- as the entire array will be converted to
- an in-memory ndarray before filling.
- limit : int, optional
- Returns
- -------
- SparseArray
- Notes
- -----
- When `value` is specified, the result's ``fill_value`` depends on
- ``self.fill_value``. The goal is to maintain low-memory use.
- If ``self.fill_value`` is NA, the result dtype will be
- ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
- the amount of memory used before and after filling.
- When ``self.fill_value`` is not NA, the result dtype will be
- ``self.dtype``. Again, this preserves the amount of memory used.
- """
- if (method is None and value is None) or (
- method is not None and value is not None
- ):
- raise ValueError("Must specify one of 'method' or 'value'.")
- if method is not None:
- msg = "fillna with 'method' requires high memory usage."
- warnings.warn(
- msg,
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
- new_values = np.asarray(self)
- # interpolate_2d modifies new_values inplace
- interpolate_2d(new_values, method=method, limit=limit)
- return type(self)(new_values, fill_value=self.fill_value)
- else:
- new_values = np.where(isna(self.sp_values), value, self.sp_values)
- if self._null_fill_value:
- # This is essentially just updating the dtype.
- new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
- else:
- new_dtype = self.dtype
- return self._simple_new(new_values, self._sparse_index, new_dtype)
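- # A sketch of the value path above with a null fill_value: only the dtype's
- # fill_value changes, and the sparse index is reused as-is:
- # >>> import numpy as np
- # >>> from pandas.arrays import SparseArray
- # >>> SparseArray([np.nan, 1.0, np.nan]).fillna(0.0)
- # [0.0, 1.0, 0.0]
- # Fill: 0.0
- # IntIndex
- # Indices: array([1], dtype=int32)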
- def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
- if not len(self) or periods == 0:
- return self.copy()
- if isna(fill_value):
- fill_value = self.dtype.na_value
- subtype = np.result_type(fill_value, self.dtype.subtype)
- if subtype != self.dtype.subtype:
- # just coerce up front
- arr = self.astype(SparseDtype(subtype, self.fill_value))
- else:
- arr = self
- empty = self._from_sequence(
- [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
- )
- if periods > 0:
- a = empty
- b = arr[:-periods]
- else:
- a = arr[abs(periods) :]
- b = empty
- return arr._concat_same_type([a, b])
- def _first_fill_value_loc(self):
- """
- Get the location of the first fill value.
- Returns
- -------
- int
- """
- if len(self) == 0 or self.sp_index.npoints == len(self):
- return -1
- indices = self.sp_index.indices
- if not len(indices) or indices[0] > 0:
- return 0
- # append a number larger than 1 at the end so that the case
- # where the fill value appears only in the tail of the array
- # is still detected
- diff = np.r_[np.diff(indices), 2]
- return indices[(diff > 1).argmax()] + 1
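- # Worked example for the diff trick above: indices [0, 1, 3] over a length-5
- # array give diff -> [1, 2, 2] (with the sentinel appended); the first gap
- # (diff > 1) follows indices[1] == 1, so the first fill value sits at
- # location 1 + 1 == 2.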
- def unique(self: SparseArrayT) -> SparseArrayT:
- uniques = algos.unique(self.sp_values)
- if len(self.sp_values) != len(self):
- fill_loc = self._first_fill_value_loc()
- # To match the behavior of pd.unique and
- # pd.Series.unique, we must keep the original
- # order, so we use unique again to find the
- # insertion position. Since sp_values is
- # typically short, the minor performance cost
- # is worth the correctness.
- insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
- uniques = np.insert(uniques, insert_loc, self.fill_value)
- return type(self)._from_sequence(uniques, dtype=self.dtype)
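- # Order-preserving sketch of unique above (the fill value is inserted where
- # it first appears in the dense array):
- # >>> from pandas.arrays import SparseArray
- # >>> SparseArray([2, 0, 0, 1], fill_value=0).unique()
- # [2, 0, 1]
- # Fill: 0
- # IntIndex
- # Indices: array([0, 2], dtype=int32)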
- def _values_for_factorize(self):
- # Still override this for hash_pandas_object
- return np.asarray(self), self.fill_value
- def factorize(
- self,
- use_na_sentinel: bool = True,
- ) -> tuple[np.ndarray, SparseArray]:
- # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
- # The sparsity on this is backwards from what Sparse would want. Want
- # ExtensionArray.factorize -> Tuple[EA, EA]
- # Given that we have to return a dense array of codes, why bother
- # implementing an efficient factorize?
- codes, uniques = algos.factorize(
- np.asarray(self), use_na_sentinel=use_na_sentinel
- )
- uniques_sp = SparseArray(uniques, dtype=self.dtype)
- return codes, uniques_sp
- def value_counts(self, dropna: bool = True) -> Series:
- """
- Returns a Series containing counts of unique values.
- Parameters
- ----------
- dropna : bool, default True
- Don't include counts of NaN, even if NaN is in sp_values.
- Returns
- -------
- counts : Series
- """
- from pandas import (
- Index,
- Series,
- )
- keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
- fcounts = self.sp_index.ngaps
- if fcounts > 0 and (not self._null_fill_value or not dropna):
- mask = isna(keys) if self._null_fill_value else keys == self.fill_value
- if mask.any():
- counts[mask] += fcounts
- else:
- # error: Argument 1 to "insert" has incompatible type "Union[
- # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
- # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
- # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
- # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
- # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
- keys = np.insert(keys, 0, self.fill_value) # type: ignore[arg-type]
- counts = np.insert(counts, 0, fcounts)
- if not isinstance(keys, ABCIndex):
- index = Index(keys)
- else:
- index = keys
- return Series(counts, index=index, copy=False)
- # --------
- # Indexing
- # --------
- @overload
- def __getitem__(self, key: ScalarIndexer) -> Any:
- ...
- @overload
- def __getitem__(
- self: SparseArrayT,
- key: SequenceIndexer | tuple[int | ellipsis, ...],
- ) -> SparseArrayT:
- ...
- def __getitem__(
- self: SparseArrayT,
- key: PositionalIndexer | tuple[int | ellipsis, ...],
- ) -> SparseArrayT | Any:
- if isinstance(key, tuple):
- key = unpack_tuple_and_ellipses(key)
- if key is Ellipsis:
- raise ValueError("Cannot slice with Ellipsis")
- if is_integer(key):
- return self._get_val_at(key)
- elif isinstance(key, tuple):
- # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
- # for "ndarray[Any, Any]"; expected type
- # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
- # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
- # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
- # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
- # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
- # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
- # _NestedSequence[Union[bool, int]]], ...]]"
- data_slice = self.to_dense()[key] # type: ignore[index]
- elif isinstance(key, slice):
- # Avoid densifying when handling contiguous slices
- if key.step is None or key.step == 1:
- start = 0 if key.start is None else key.start
- if start < 0:
- start += len(self)
- end = len(self) if key.stop is None else key.stop
- if end < 0:
- end += len(self)
- indices = self.sp_index.indices
- keep_inds = np.flatnonzero((indices >= start) & (indices < end))
- sp_vals = self.sp_values[keep_inds]
- sp_index = indices[keep_inds].copy()
- # If we've sliced to not include the start of the array, all our indices
- # should be shifted. NB: here we are careful to also not shift by a
- # negative value for a case like [0, 1][-100:] where the start index
- # should be treated like 0
- if start > 0:
- sp_index -= start
- # Length of our result should match applying this slice to a range
- # of the length of our original array
- new_len = len(range(len(self))[key])
- new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
- return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
- else:
- indices = np.arange(len(self), dtype=np.int32)[key]
- return self.take(indices)
- elif not is_list_like(key):
- # e.g. "foo" or 2.5
- # exception message copied from numpy
- raise IndexError(
- r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
- r"(`None`) and integer or boolean arrays are valid indices"
- )
- else:
- if isinstance(key, SparseArray):
- # NOTE: If we guarantee that SparseDtype(bool)
- # has only fill_value - true, false or nan
- # (see GH PR 44955)
- # we can apply mask very fast:
- if is_bool_dtype(key):
- if isna(key.fill_value):
- return self.take(key.sp_index.indices[key.sp_values])
- if not key.fill_value:
- return self.take(key.sp_index.indices)
- n = len(self)
- mask = np.full(n, True, dtype=np.bool_)
- mask[key.sp_index.indices] = False
- return self.take(np.arange(n)[mask])
- else:
- key = np.asarray(key)
- key = check_array_indexer(self, key)
- if com.is_bool_indexer(key):
- # mypy doesn't know we have an array here
- key = cast(np.ndarray, key)
- return self.take(np.arange(len(key), dtype=np.int32)[key])
- elif hasattr(key, "__len__"):
- return self.take(key)
- else:
- raise ValueError(f"Cannot slice with '{key}'")
- return type(self)(data_slice, kind=self.kind)
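- # Slicing sketch: a contiguous slice keeps the data sparse (no
- # densification), shifting the stored indices by the slice start:
- # >>> from pandas.arrays import SparseArray
- # >>> SparseArray([0, 1, 0, 2, 0])[1:4]
- # [1, 0, 2]
- # Fill: 0
- # IntIndex
- # Indices: array([0, 2], dtype=int32)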
- def _get_val_at(self, loc):
- loc = validate_insert_loc(loc, len(self))
- sp_loc = self.sp_index.lookup(loc)
- if sp_loc == -1:
- return self.fill_value
- else:
- val = self.sp_values[sp_loc]
- val = maybe_box_datetimelike(val, self.sp_values.dtype)
- return val
- def take(
- self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
- ) -> SparseArrayT:
- if is_scalar(indices):
- raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
- indices = np.asarray(indices, dtype=np.int32)
- dtype = None
- if indices.size == 0:
- result = np.array([], dtype="object")
- dtype = self.dtype
- elif allow_fill:
- result = self._take_with_fill(indices, fill_value=fill_value)
- else:
- return self._take_without_fill(indices)
- return type(self)(
- result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
- )
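- # take sketch: with allow_fill=True a -1 index means "insert fill value",
- # which here is the dtype's na_value (np.nan), not self.fill_value:
- # >>> from pandas.arrays import SparseArray
- # >>> arr = SparseArray([0.0, 1.0, 2.0], fill_value=0.0)
- # >>> arr.take([0, -1], allow_fill=True)
- # [0.0, nan]
- # Fill: 0.0
- # IntIndex
- # Indices: array([1], dtype=int32)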
- def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
- if fill_value is None:
- fill_value = self.dtype.na_value
- if indices.min() < -1:
- raise ValueError(
- "Invalid value in 'indices'. Must be between -1 "
- "and the length of the array."
- )
- if indices.max() >= len(self):
- raise IndexError("out of bounds value in 'indices'.")
- if len(self) == 0:
- # Empty... Allow taking only if all empty
- if (indices == -1).all():
- dtype = np.result_type(self.sp_values, type(fill_value))
- taken = np.empty_like(indices, dtype=dtype)
- taken.fill(fill_value)
- return taken
- else:
- raise IndexError("cannot do a non-empty take from an empty axes.")
- # sp_indexer may be -1 for two reasons
- # 1.) we took for an index of -1 (new)
- # 2.) we took a value that was self.fill_value (old)
- sp_indexer = self.sp_index.lookup_array(indices)
- new_fill_indices = indices == -1
- old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
- if self.sp_index.npoints == 0 and old_fill_indices.all():
- # We've looked up all valid points on an all-sparse array.
- taken = np.full(
- sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
- )
- elif self.sp_index.npoints == 0:
- # Avoid taking from the empty self.sp_values
- _dtype = np.result_type(self.dtype.subtype, type(fill_value))
- taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
- else:
- taken = self.sp_values.take(sp_indexer)
- # Fill in two steps.
- # Old fill values
- # New fill values
- # potentially coercing to a new dtype at each stage.
- m0 = sp_indexer[old_fill_indices] < 0
- m1 = sp_indexer[new_fill_indices] < 0
- result_type = taken.dtype
- if m0.any():
- result_type = np.result_type(result_type, type(self.fill_value))
- taken = taken.astype(result_type)
- taken[old_fill_indices] = self.fill_value
- if m1.any():
- result_type = np.result_type(result_type, type(fill_value))
- taken = taken.astype(result_type)
- taken[new_fill_indices] = fill_value
- return taken
- def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
- to_shift = indices < 0
- n = len(self)
- if (indices.max() >= n) or (indices.min() < -n):
- if n == 0:
- raise IndexError("cannot do a non-empty take from an empty axes.")
- raise IndexError("out of bounds value in 'indices'.")
- if to_shift.any():
- indices = indices.copy()
- indices[to_shift] += n
- sp_indexer = self.sp_index.lookup_array(indices)
- value_mask = sp_indexer != -1
- new_sp_values = self.sp_values[sp_indexer[value_mask]]
- value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)
- new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
- return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
- def searchsorted(
- self,
- v: ArrayLike | object,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- msg = "searchsorted requires high memory usage."
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
- if not is_scalar(v):
- v = np.asarray(v)
- v = np.asarray(v)
- return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
- def copy(self: SparseArrayT) -> SparseArrayT:
- values = self.sp_values.copy()
- return self._simple_new(values, self.sp_index, self.dtype)
- @classmethod
- def _concat_same_type(
- cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
- ) -> SparseArrayT:
- fill_value = to_concat[0].fill_value
- values = []
- length = 0
- if to_concat:
- sp_kind = to_concat[0].kind
- else:
- sp_kind = "integer"
- sp_index: SparseIndex
- if sp_kind == "integer":
- indices = []
- for arr in to_concat:
- int_idx = arr.sp_index.indices.copy()
- int_idx += length # TODO: wraparound
- length += arr.sp_index.length
- values.append(arr.sp_values)
- indices.append(int_idx)
- data = np.concatenate(values)
- indices_arr = np.concatenate(indices)
- # error: Argument 2 to "IntIndex" has incompatible type
- # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
- # expected "Sequence[int]"
- sp_index = IntIndex(length, indices_arr) # type: ignore[arg-type]
- else:
- # when concatenating block indices, we don't claim that you'll
- # get an identical index as concatenating the values and then
- # creating a new index. We don't want to spend the time trying
- # to merge blocks across arrays in `to_concat`, so the resulting
- # BlockIndex may have more blocks.
- blengths = []
- blocs = []
- for arr in to_concat:
- block_idx = arr.sp_index.to_block_index()
- values.append(arr.sp_values)
- blocs.append(block_idx.blocs.copy() + length)
- blengths.append(block_idx.blengths)
- length += arr.sp_index.length
- data = np.concatenate(values)
- blocs_arr = np.concatenate(blocs)
- blengths_arr = np.concatenate(blengths)
- sp_index = BlockIndex(length, blocs_arr, blengths_arr)
- return cls(data, sparse_index=sp_index, fill_value=fill_value)
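- # Concatenation sketch (internal classmethod, shown for illustration):
- # integer indices are shifted by the running length and concatenated:
- # >>> from pandas.arrays import SparseArray
- # >>> a = SparseArray([0, 1], fill_value=0)
- # >>> b = SparseArray([2, 0], fill_value=0)
- # >>> SparseArray._concat_same_type([a, b])
- # [0, 1, 2, 0]
- # Fill: 0
- # IntIndex
- # Indices: array([1, 2], dtype=int32)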
- def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
- """
- Change the dtype of a SparseArray.
- The output will always be a SparseArray. To convert to a dense
- ndarray with a certain dtype, use :meth:`numpy.asarray`.
- Parameters
- ----------
- dtype : np.dtype or ExtensionDtype
- For SparseDtype, this changes the dtype of
- ``self.sp_values`` and the ``self.fill_value``.
- For other dtypes, this only changes the dtype of
- ``self.sp_values``.
- copy : bool, default True
- Whether to ensure a copy is made, even if not necessary.
- Returns
- -------
- SparseArray
- Examples
- --------
- >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
- >>> arr
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- >>> arr.astype(SparseDtype(np.dtype('int32')))
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- Using a NumPy dtype with a different kind (e.g. float) will coerce
- just ``self.sp_values``.
- >>> arr.astype(SparseDtype(np.dtype('float64')))
- ... # doctest: +NORMALIZE_WHITESPACE
- [nan, nan, 1.0, 2.0]
- Fill: nan
- IntIndex
- Indices: array([2, 3], dtype=int32)
- Using a SparseDtype, you can also change the fill value as well.
- >>> arr.astype(SparseDtype("float64", fill_value=0.0))
- ... # doctest: +NORMALIZE_WHITESPACE
- [0.0, 0.0, 1.0, 2.0]
- Fill: 0.0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- """
- if is_dtype_equal(dtype, self._dtype):
- if not copy:
- return self
- else:
- return self.copy()
- future_dtype = pandas_dtype(dtype)
- if not isinstance(future_dtype, SparseDtype):
- # GH#34457
- values = np.asarray(self)
- values = ensure_wrapped_if_datetimelike(values)
- return astype_array(values, dtype=future_dtype, copy=False)
- dtype = self.dtype.update_dtype(dtype)
- subtype = pandas_dtype(dtype._subtype_with_str)
- subtype = cast(np.dtype, subtype) # ensured by update_dtype
- values = ensure_wrapped_if_datetimelike(self.sp_values)
- sp_values = astype_array(values, subtype, copy=copy)
- sp_values = np.asarray(sp_values)
- return self._simple_new(sp_values, self.sp_index, dtype)
- def map(self: SparseArrayT, mapper) -> SparseArrayT:
- """
- Map categories using an input mapping or function.
- Parameters
- ----------
- mapper : dict, Series, callable
- The correspondence from old values to new.
- Returns
- -------
- SparseArray
- The output array will have the same density as the input.
- The output fill value will be the result of applying the
- mapping to ``self.fill_value``
- Examples
- --------
- >>> arr = pd.arrays.SparseArray([0, 1, 2])
- >>> arr.map(lambda x: x + 10)
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
- >>> arr.map({0: 10, 1: 11, 2: 12})
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
- >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
- """
- # this is used in apply.
- # We get hit since we're an "is_extension_array_dtype" but regular extension
- # types are not hit. This may be worth adding to the interface.
- if isinstance(mapper, ABCSeries):
- mapper = mapper.to_dict()
- if isinstance(mapper, abc.Mapping):
- fill_value = mapper.get(self.fill_value, self.fill_value)
- sp_values = [mapper.get(x, None) for x in self.sp_values]
- else:
- fill_value = mapper(self.fill_value)
- sp_values = [mapper(x) for x in self.sp_values]
- return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
- def to_dense(self) -> np.ndarray:
- """
- Convert SparseArray to a NumPy array.
- Returns
- -------
- arr : NumPy array
- """
- return np.asarray(self, dtype=self.sp_values.dtype)
- def _where(self, mask, value):
- # NB: may not preserve dtype, e.g. result may be Sparse[float64]
- # while self is Sparse[int64]
- naive_implementation = np.where(mask, self, value)
- dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
- result = type(self)._from_sequence(naive_implementation, dtype=dtype)
- return result
- # ------------------------------------------------------------------------
- # IO
- # ------------------------------------------------------------------------
- def __setstate__(self, state) -> None:
- """Necessary for making this object picklable"""
- if isinstance(state, tuple):
- # Compat for pandas < 0.24.0
- nd_state, (fill_value, sp_index) = state
- sparse_values = np.array([])
- sparse_values.__setstate__(nd_state)
- self._sparse_values = sparse_values
- self._sparse_index = sp_index
- self._dtype = SparseDtype(sparse_values.dtype, fill_value)
- else:
- self.__dict__.update(state)
- def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
- if self.fill_value == 0:
- return (self.sp_index.indices,)
- else:
- return (self.sp_index.indices[self.sp_values != 0],)
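- # nonzero sketch: with a zero fill_value the stored indices already are the
- # nonzero positions:
- # >>> from pandas.arrays import SparseArray
- # >>> SparseArray([0, 1, 0, 2]).nonzero()
- # (array([1, 3], dtype=int32),)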
- # ------------------------------------------------------------------------
- # Reductions
- # ------------------------------------------------------------------------
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
- method = getattr(self, name, None)
- if method is None:
- raise TypeError(f"cannot perform {name} with type {self.dtype}")
- if skipna:
- arr = self
- else:
- arr = self.dropna()
- return getattr(arr, name)(**kwargs)
- def all(self, axis=None, *args, **kwargs):
- """
- Tests whether all elements evaluate to True.
- Returns
- -------
- all : bool
- See Also
- --------
- numpy.all
- """
- nv.validate_all(args, kwargs)
- values = self.sp_values
- if len(values) != len(self) and not np.all(self.fill_value):
- return False
- return values.all()
- def any(self, axis: AxisInt = 0, *args, **kwargs):
- """
- Tests whether at least one element evaluates to True.
- Returns
- -------
- any : bool
- See Also
- --------
- numpy.any
- """
- nv.validate_any(args, kwargs)
- values = self.sp_values
- if len(values) != len(self) and np.any(self.fill_value):
- return True
- return values.any().item()
- def sum(
- self,
- axis: AxisInt = 0,
- min_count: int = 0,
- skipna: bool = True,
- *args,
- **kwargs,
- ) -> Scalar:
- """
- Sum of non-NA/null values
- Parameters
- ----------
- axis : int, default 0
- Not Used. NumPy compatibility.
- min_count : int, default 0
- The required number of valid values to perform the summation. If fewer
- than ``min_count`` valid values are present, the result will be the missing
- value indicator for the subarray type.
- *args, **kwargs
- Not Used. NumPy compatibility.
- Returns
- -------
- scalar
- """
- nv.validate_sum(args, kwargs)
- valid_vals = self._valid_sp_values
- sp_sum = valid_vals.sum()
- has_na = self.sp_index.ngaps > 0 and not self._null_fill_value
- if has_na and not skipna:
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- if self._null_fill_value:
- if check_below_min_count(valid_vals.shape, None, min_count):
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- return sp_sum
- else:
- nsparse = self.sp_index.ngaps
- if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- return sp_sum + self.fill_value * nsparse
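- # sum sketch: a non-null fill_value contributes once per gap, so both the
- # stored points and the implied fills are counted:
- # >>> from pandas.arrays import SparseArray
- # >>> SparseArray([1, 1, 2, 2], fill_value=1).sum()
- # 6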
- def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
- """
- Cumulative sum of non-NA/null values.
- When performing the cumulative summation, any NA/null values will
- be skipped. The resulting SparseArray will preserve the locations of
- NaN values, but the fill value will be `np.nan` regardless.
- Parameters
- ----------
- axis : int or None
- Axis over which to perform the cumulative summation. If None,
- perform cumulative summation over flattened array.
- Returns
- -------
- cumsum : SparseArray
- """
- nv.validate_cumsum(args, kwargs)
- if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
- raise ValueError(f"axis(={axis}) out of bounds")
- if not self._null_fill_value:
- return SparseArray(self.to_dense()).cumsum()
- return SparseArray(
- self.sp_values.cumsum(),
- sparse_index=self.sp_index,
- fill_value=self.fill_value,
- )
- def mean(self, axis: Axis = 0, *args, **kwargs):
- """
- Mean of non-NA/null values
- Returns
- -------
- mean : float
- """
- nv.validate_mean(args, kwargs)
- valid_vals = self._valid_sp_values
- sp_sum = valid_vals.sum()
- ct = len(valid_vals)
- if self._null_fill_value:
- return sp_sum / ct
- else:
- nsparse = self.sp_index.ngaps
- return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
- def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
- """
- Max of array values, ignoring NA values if specified.
- Parameters
- ----------
- axis : int, optional
- Not Used. NumPy compatibility.
- skipna : bool, default True
- Whether to ignore NA values.
- Returns
- -------
- scalar
- """
- nv.validate_minmax_axis(axis, self.ndim)
- return self._min_max("max", skipna=skipna)
- def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
- """
- Min of array values, ignoring NA values if specified.
- Parameters
- ----------
- axis : int, optional
- Not Used. NumPy compatibility.
- skipna : bool, default True
- Whether to ignore NA values.
- Returns
- -------
- scalar
- """
- nv.validate_minmax_axis(axis, self.ndim)
- return self._min_max("min", skipna=skipna)
- def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
- """
- Min/max of non-NA/null values
- Parameters
- ----------
- kind : {"min", "max"}
- skipna : bool
- Returns
- -------
- scalar
- """
- valid_vals = self._valid_sp_values
- has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
- if len(valid_vals) > 0:
- sp_min_max = getattr(valid_vals, kind)()
- # If a non-null fill value is currently present, it might be the min/max
- if has_nonnull_fill_vals:
- func = max if kind == "max" else min
- return func(sp_min_max, self.fill_value)
- elif skipna:
- return sp_min_max
- elif self.sp_index.ngaps == 0:
- # No NAs present
- return sp_min_max
- else:
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- elif has_nonnull_fill_vals:
- return self.fill_value
- else:
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
- values = self._sparse_values
- index = self._sparse_index.indices
- mask = np.asarray(isna(values))
- func = np.argmax if kind == "argmax" else np.argmin
- idx = np.arange(values.shape[0])
- non_nans = values[~mask]
- non_nan_idx = idx[~mask]
- _candidate = non_nan_idx[func(non_nans)]
- candidate = index[_candidate]
- if isna(self.fill_value):
- return candidate
- if kind == "argmin" and self[candidate] < self.fill_value:
- return candidate
- if kind == "argmax" and self[candidate] > self.fill_value:
- return candidate
- _loc = self._first_fill_value_loc()
- if _loc == -1:
- # fill_value doesn't exist
- return candidate
- else:
- return _loc
- def argmax(self, skipna: bool = True) -> int:
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return self._argmin_argmax("argmax")
- def argmin(self, skipna: bool = True) -> int:
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return self._argmin_argmax("argmin")
- # ------------------------------------------------------------------------
- # Ufuncs
- # ------------------------------------------------------------------------
- _HANDLED_TYPES = (np.ndarray, numbers.Number)
- def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- out = kwargs.get("out", ())
- for x in inputs + out:
- if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
- return NotImplemented
- # for binary ops, use our custom dunder methods
- result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
- if "out" in kwargs:
- # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
- res = arraylike.dispatch_ufunc_with_out(
- self, ufunc, method, *inputs, **kwargs
- )
- return res
- if method == "reduce":
- result = arraylike.dispatch_reduction_ufunc(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- # e.g. tests.series.test_ufunc.TestNumpyReductions
- return result
- if len(inputs) == 1:
- # No alignment necessary.
- sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
- fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
- if ufunc.nout > 1:
- # multiple outputs. e.g. modf
- arrays = tuple(
- self._simple_new(
- sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
- )
- for sp_value, fv in zip(sp_values, fill_value)
- )
- return arrays
- elif method == "reduce":
- # e.g. reductions
- return sp_values
- return self._simple_new(
- sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
- )
- new_inputs = tuple(np.asarray(x) for x in inputs)
- result = getattr(ufunc, method)(*new_inputs, **kwargs)
- if out:
- if len(out) == 1:
- out = out[0]
- return out
- if ufunc.nout > 1:
- return tuple(type(self)(x) for x in result)
- elif method == "at":
- # no return value
- return None
- else:
- return type(self)(result)
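- # Ufunc sketch: a unary ufunc is applied separately to sp_values and to
- # fill_value, so sparsity is preserved without densifying:
- # >>> import numpy as np
- # >>> from pandas.arrays import SparseArray
- # >>> np.abs(SparseArray([0, -1, 0, 2]))
- # [0, 1, 0, 2]
- # Fill: 0
- # IntIndex
- # Indices: array([1, 3], dtype=int32)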
- # ------------------------------------------------------------------------
- # Ops
- # ------------------------------------------------------------------------
- def _arith_method(self, other, op):
- op_name = op.__name__
- if isinstance(other, SparseArray):
- return _sparse_array_op(self, other, op, op_name)
- elif is_scalar(other):
- with np.errstate(all="ignore"):
- fill = op(_get_fill(self), np.asarray(other))
- result = op(self.sp_values, other)
- if op_name == "divmod":
- left, right = result
- lfill, rfill = fill
- return (
- _wrap_result(op_name, left, self.sp_index, lfill),
- _wrap_result(op_name, right, self.sp_index, rfill),
- )
- return _wrap_result(op_name, result, self.sp_index, fill)
- else:
- other = np.asarray(other)
- with np.errstate(all="ignore"):
- if len(self) != len(other):
- raise AssertionError(
- f"length mismatch: {len(self)} vs. {len(other)}"
- )
- if not isinstance(other, SparseArray):
- dtype = getattr(other, "dtype", None)
- other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
- return _sparse_array_op(self, other, op, op_name)
- def _cmp_method(self, other, op) -> SparseArray:
- if not is_scalar(other) and not isinstance(other, type(self)):
- # convert list-like to ndarray
- other = np.asarray(other)
- if isinstance(other, np.ndarray):
- # TODO: make this more flexible than just ndarray...
- other = SparseArray(other, fill_value=self.fill_value)
- if isinstance(other, SparseArray):
- if len(self) != len(other):
- raise ValueError(
- f"operands have mismatched length {len(self)} and {len(other)}"
- )
- op_name = op.__name__.strip("_")
- return _sparse_array_op(self, other, op, op_name)
- else:
- # scalar
- with np.errstate(all="ignore"):
- fill_value = op(self.fill_value, other)
- result = np.full(len(self), fill_value, dtype=np.bool_)
- result[self.sp_index.indices] = op(self.sp_values, other)
- return type(self)(
- result,
- fill_value=fill_value,
- dtype=np.bool_,
- )
- _logical_method = _cmp_method
- def _unary_method(self, op) -> SparseArray:
- fill_value = op(np.array(self.fill_value)).item()
- dtype = SparseDtype(self.dtype.subtype, fill_value)
- # NOTE: if fill_value doesn't change
- # we just have to apply op to sp_values
- if isna(self.fill_value) or fill_value == self.fill_value:
- values = op(self.sp_values)
- return type(self)._simple_new(values, self.sp_index, self.dtype)
- # In the other case we have to recalc indexes
- return type(self)(op(self.to_dense()), dtype=dtype)
- def __pos__(self) -> SparseArray:
- return self._unary_method(operator.pos)
- def __neg__(self) -> SparseArray:
- return self._unary_method(operator.neg)
- def __invert__(self) -> SparseArray:
- return self._unary_method(operator.invert)
- def __abs__(self) -> SparseArray:
- return self._unary_method(operator.abs)
- # ------------------------------------------------------------------------
- # Formatting
- # ------------------------------------------------------------------------
- def __repr__(self) -> str:
- pp_str = printing.pprint_thing(self)
- pp_fill = printing.pprint_thing(self.fill_value)
- pp_index = printing.pprint_thing(self.sp_index)
- return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
- def _formatter(self, boxed: bool = False):
- # Defer to the formatter from the GenericArrayFormatter calling us.
- # This will infer the correct formatter from the dtype of the values.
- return None
- def _make_sparse(
- arr: np.ndarray,
- kind: SparseIndexKind = "block",
- fill_value=None,
- dtype: np.dtype | None = None,
- ):
- """
- Convert ndarray to sparse format
- Parameters
- ----------
- arr : ndarray
- kind : {'block', 'integer'}
- fill_value : NaN or another value
- dtype : np.dtype, optional
- Returns
- -------
- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
- """
- assert isinstance(arr, np.ndarray)
- if arr.ndim > 1:
- raise TypeError("expected dimension <= 1 data")
- if fill_value is None:
- fill_value = na_value_for_dtype(arr.dtype)
- if isna(fill_value):
- mask = notna(arr)
- else:
- # cast to object for a safe comparison
- if is_string_dtype(arr.dtype):
- arr = arr.astype(object)
- if is_object_dtype(arr.dtype):
- # NumPy's element-wise equality does not distinguish element
- # types, e.g. 0, 0.0, and False are treated as equal, so we
- # have to check both the type and the value.
- mask = splib.make_mask_object_ndarray(arr, fill_value)
- else:
- mask = arr != fill_value
- length = len(arr)
- if length != len(mask):
- # the arr is a SparseArray
- indices = mask.sp_index.indices
- else:
- indices = mask.nonzero()[0].astype(np.int32)
- index = make_sparse_index(length, indices, kind)
- sparsified_values = arr[mask]
- if dtype is not None:
- sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
- sparsified_values = astype_array(sparsified_values, dtype=dtype)
- sparsified_values = np.asarray(sparsified_values)
- # TODO: copy
- return sparsified_values, index, fill_value
- @overload
- def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
- ...
- @overload
- def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
- ...
- def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
- index: SparseIndex
- if kind == "block":
- locs, lens = splib.get_blocks(indices)
- index = BlockIndex(length, locs, lens)
- elif kind == "integer":
- index = IntIndex(length, indices)
- else: # pragma: no cover
- raise ValueError("must be block or integer type")
- return index
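- # Kind sketch: "block" groups consecutive positions into (location, length)
- # pairs, while "integer" stores each position individually:
- # >>> import numpy as np
- # >>> make_sparse_index(4, np.array([1, 2], dtype=np.int32), "block")
- # BlockIndex
- # Block locations: array([1], dtype=int32)
- # Block lengths: array([2], dtype=int32)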