array.py 61 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892
  1. """
  2. SparseArray data structure
  3. """
  4. from __future__ import annotations
  5. from collections import abc
  6. import numbers
  7. import operator
  8. from typing import (
  9. TYPE_CHECKING,
  10. Any,
  11. Callable,
  12. Literal,
  13. Sequence,
  14. TypeVar,
  15. cast,
  16. overload,
  17. )
  18. import warnings
  19. import numpy as np
  20. from pandas._libs import lib
  21. import pandas._libs.sparse as splib
  22. from pandas._libs.sparse import (
  23. BlockIndex,
  24. IntIndex,
  25. SparseIndex,
  26. )
  27. from pandas._libs.tslibs import NaT
  28. from pandas._typing import (
  29. ArrayLike,
  30. AstypeArg,
  31. Axis,
  32. AxisInt,
  33. Dtype,
  34. NpDtype,
  35. PositionalIndexer,
  36. Scalar,
  37. ScalarIndexer,
  38. SequenceIndexer,
  39. npt,
  40. )
  41. from pandas.compat.numpy import function as nv
  42. from pandas.errors import PerformanceWarning
  43. from pandas.util._exceptions import find_stack_level
  44. from pandas.util._validators import (
  45. validate_bool_kwarg,
  46. validate_insert_loc,
  47. )
  48. from pandas.core.dtypes.astype import astype_array
  49. from pandas.core.dtypes.cast import (
  50. construct_1d_arraylike_from_scalar,
  51. find_common_type,
  52. maybe_box_datetimelike,
  53. )
  54. from pandas.core.dtypes.common import (
  55. is_array_like,
  56. is_bool_dtype,
  57. is_datetime64_any_dtype,
  58. is_datetime64tz_dtype,
  59. is_dtype_equal,
  60. is_integer,
  61. is_list_like,
  62. is_object_dtype,
  63. is_scalar,
  64. is_string_dtype,
  65. pandas_dtype,
  66. )
  67. from pandas.core.dtypes.generic import (
  68. ABCIndex,
  69. ABCSeries,
  70. )
  71. from pandas.core.dtypes.missing import (
  72. isna,
  73. na_value_for_dtype,
  74. notna,
  75. )
  76. from pandas.core import (
  77. arraylike,
  78. ops,
  79. )
  80. import pandas.core.algorithms as algos
  81. from pandas.core.arraylike import OpsMixin
  82. from pandas.core.arrays import ExtensionArray
  83. from pandas.core.arrays.sparse.dtype import SparseDtype
  84. from pandas.core.base import PandasObject
  85. import pandas.core.common as com
  86. from pandas.core.construction import (
  87. ensure_wrapped_if_datetimelike,
  88. extract_array,
  89. sanitize_array,
  90. )
  91. from pandas.core.indexers import (
  92. check_array_indexer,
  93. unpack_tuple_and_ellipses,
  94. )
  95. from pandas.core.missing import interpolate_2d
  96. from pandas.core.nanops import check_below_min_count
  97. from pandas.io.formats import printing
  98. # See https://github.com/python/typing/issues/684
  99. if TYPE_CHECKING:
  100. from enum import Enum
  101. class ellipsis(Enum):
  102. Ellipsis = "..."
  103. Ellipsis = ellipsis.Ellipsis
  104. from scipy.sparse import spmatrix
  105. from pandas._typing import (
  106. FillnaOptions,
  107. NumpySorter,
  108. )
  109. SparseIndexKind = Literal["integer", "block"]
  110. from pandas import Series
  111. else:
  112. ellipsis = type(Ellipsis)
  113. # ----------------------------------------------------------------------------
  114. # Array
  115. SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")
  116. _sparray_doc_kwargs = {"klass": "SparseArray"}
  117. def _get_fill(arr: SparseArray) -> np.ndarray:
  118. """
  119. Create a 0-dim ndarray containing the fill value
  120. Parameters
  121. ----------
  122. arr : SparseArray
  123. Returns
  124. -------
  125. fill_value : ndarray
  126. 0-dim ndarray with just the fill value.
  127. Notes
  128. -----
  129. coerce fill_value to arr dtype if possible
  130. int64 SparseArray can have NaN as fill_value if there is no missing
  131. """
  132. try:
  133. return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
  134. except ValueError:
  135. return np.asarray(arr.fill_value)
def _sparse_array_op(
    left: SparseArray, right: SparseArray, op: Callable, name: str
) -> SparseArray:
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : Union[SparseArray, ndarray]
    right : Union[SparseArray, ndarray]
    op : Callable
        The binary operation to perform
    name str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        # Mixed subtypes: cast both operands to a common subtype up front,
        # keeping each side's own fill_value.
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        left = left.astype(ltype, copy=False)
        right = right.astype(rtype, copy=False)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        # At least one side has no gaps (fully dense): operate densely.
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        # Reuse the sparse index of whichever side is gapless.
        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        # Identical sparsity layout: operate elementwise on sp_values only.
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        # General case: dispatch to a C-implemented sparse op in _libs.sparse.
        if name[0] == "r":
            # Reflected op: swap operands and use the forward op name.
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        if (
            name in ["floordiv", "mod"]
            and (right == 0).any()
            and left.dtype.kind in ["i", "u"]
        ):
            # Match the non-Sparse Series behavior
            opname = f"sparse_{name}_float64"
            left_sp_values = left_sp_values.astype("float64")
            right_sp_values = right_sp_values.astype("float64")

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if name == "divmod":
        # result is a 2-tuple
        # error: Incompatible return value type (got "Tuple[SparseArray,
        # SparseArray]", expected "SparseArray")
        return (  # type: ignore[return-value]
            _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
            _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
        )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
  227. def _wrap_result(
  228. name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
  229. ) -> SparseArray:
  230. """
  231. wrap op result to have correct dtype
  232. """
  233. if name.startswith("__"):
  234. # e.g. __eq__ --> eq
  235. name = name[2:-2]
  236. if name in ("eq", "ne", "lt", "gt", "le", "ge"):
  237. dtype = bool
  238. fill_value = lib.item_from_zerodim(fill_value)
  239. if is_bool_dtype(dtype):
  240. # fill_value may be np.bool_
  241. fill_value = bool(fill_value)
  242. return SparseArray(
  243. data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
  244. )
  245. class SparseArray(OpsMixin, PandasObject, ExtensionArray):
  246. """
  247. An ExtensionArray for storing sparse data.
  248. Parameters
  249. ----------
  250. data : array-like or scalar
  251. A dense array of values to store in the SparseArray. This may contain
  252. `fill_value`.
  253. sparse_index : SparseIndex, optional
  254. fill_value : scalar, optional
  255. Elements in data that are ``fill_value`` are not stored in the
  256. SparseArray. For memory savings, this should be the most common value
  257. in `data`. By default, `fill_value` depends on the dtype of `data`:
  258. =========== ==========
  259. data.dtype na_value
  260. =========== ==========
  261. float ``np.nan``
  262. int ``0``
  263. bool False
  264. datetime64 ``pd.NaT``
  265. timedelta64 ``pd.NaT``
  266. =========== ==========
  267. The fill value is potentially specified in three ways. In order of
  268. precedence, these are
  269. 1. The `fill_value` argument
  270. 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
  271. a ``SparseDtype``
  272. 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
  273. is not a ``SparseDtype`` and `data` is a ``SparseArray``.
  274. kind : str
  275. Can be 'integer' or 'block', default is 'integer'.
  276. The type of storage for sparse locations.
  277. * 'block': Stores a `block` and `block_length` for each
  278. contiguous *span* of sparse values. This is best when
  279. sparse data tends to be clumped together, with large
  280. regions of ``fill-value`` values between sparse values.
  281. * 'integer': uses an integer to store the location of
  282. each sparse value.
  283. dtype : np.dtype or SparseDtype, optional
  284. The dtype to use for the SparseArray. For numpy dtypes, this
  285. determines the dtype of ``self.sp_values``. For SparseDtype,
  286. this determines ``self.sp_values`` and ``self.fill_value``.
  287. copy : bool, default False
  288. Whether to explicitly copy the incoming `data` array.
  289. Attributes
  290. ----------
  291. None
  292. Methods
  293. -------
  294. None
  295. Examples
  296. --------
  297. >>> from pandas.arrays import SparseArray
  298. >>> arr = SparseArray([0, 0, 1, 2])
  299. >>> arr
  300. [0, 0, 1, 2]
  301. Fill: 0
  302. IntIndex
  303. Indices: array([2, 3], dtype=int32)
  304. """
  305. _subtyp = "sparse_array" # register ABCSparseArray
  306. _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
  307. _sparse_index: SparseIndex
  308. _sparse_values: np.ndarray
  309. _dtype: SparseDtype
    def __init__(
        self,
        data,
        sparse_index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:
        # See the class docstring for parameter semantics, in particular the
        # three-way precedence used to resolve `fill_value`.
        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle use-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            # Unwrap: fill_value comes from the SparseDtype, the subtype is
            # what sp_values will be stored as.
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if is_scalar(data):
            # Broadcast the scalar to the length implied by sparse_index
            # (or length 1 when no index was given).
            if sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series
                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = np.dtype(object)
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            # No fill_value given anywhere: derive the NA value for the dtype.
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            # Already sparse: reuse the index, only recast the values.
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            # Dense input: densify EAs, then split into values + index.
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if is_datetime64tz_dtype(data.dtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = _make_sparse(
                # error: Argument "dtype" to "_make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], None]"; expected
                # "Optional[dtype[Any]]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # Values plus an explicit sparse_index: lengths must agree.
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
  426. @classmethod
  427. def _simple_new(
  428. cls: type[SparseArrayT],
  429. sparse_array: np.ndarray,
  430. sparse_index: SparseIndex,
  431. dtype: SparseDtype,
  432. ) -> SparseArrayT:
  433. new = object.__new__(cls)
  434. new._sparse_index = sparse_index
  435. new._sparse_values = sparse_array
  436. new._dtype = dtype
  437. return new
    @classmethod
    def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
        """
        Create a SparseArray from a scipy.sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.sp_matrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        # The implicit fill is zero in the matrix's own dtype.
        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)
    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        """Densify: return an ndarray with gaps filled by ``fill_value``."""
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object
        # Start from an all-fill array, then scatter the stored points in.
        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out
  496. def __setitem__(self, key, value):
  497. # I suppose we could allow setting of non-fill_value elements.
  498. # TODO(SparseArray.__setitem__): remove special cases in
  499. # ExtensionBlock.where
  500. msg = "SparseArray does not support item assignment via setitem"
  501. raise TypeError(msg)
  502. @classmethod
  503. def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
  504. return cls(scalars, dtype=dtype)
  505. @classmethod
  506. def _from_factorized(cls, values, original):
  507. return cls(values, dtype=original.dtype)
  508. # ------------------------------------------------------------------------
  509. # Data
  510. # ------------------------------------------------------------------------
  511. @property
  512. def sp_index(self) -> SparseIndex:
  513. """
  514. The SparseIndex containing the location of non- ``fill_value`` points.
  515. """
  516. return self._sparse_index
  517. @property
  518. def sp_values(self) -> np.ndarray:
  519. """
  520. An ndarray containing the non- ``fill_value`` values.
  521. Examples
  522. --------
  523. >>> from pandas.arrays import SparseArray
  524. >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
  525. >>> s.sp_values
  526. array([1, 2])
  527. """
  528. return self._sparse_values
  529. @property
  530. def dtype(self) -> SparseDtype:
  531. return self._dtype
  532. @property
  533. def fill_value(self):
  534. """
  535. Elements in `data` that are `fill_value` are not stored.
  536. For memory savings, this should be the most common value in the array.
  537. """
  538. return self.dtype.fill_value
  539. @fill_value.setter
  540. def fill_value(self, value) -> None:
  541. self._dtype = SparseDtype(self.dtype.subtype, value)
  542. @property
  543. def kind(self) -> SparseIndexKind:
  544. """
  545. The kind of sparse index for this array. One of {'integer', 'block'}.
  546. """
  547. if isinstance(self.sp_index, IntIndex):
  548. return "integer"
  549. else:
  550. return "block"
  551. @property
  552. def _valid_sp_values(self) -> np.ndarray:
  553. sp_vals = self.sp_values
  554. mask = notna(sp_vals)
  555. return sp_vals[mask]
  556. def __len__(self) -> int:
  557. return self.sp_index.length
  558. @property
  559. def _null_fill_value(self) -> bool:
  560. return self._dtype._is_na_fill_value
  561. def _fill_value_matches(self, fill_value) -> bool:
  562. if self._null_fill_value:
  563. return isna(fill_value)
  564. else:
  565. return self.fill_value == fill_value
  566. @property
  567. def nbytes(self) -> int:
  568. return self.sp_values.nbytes + self.sp_index.nbytes
  569. @property
  570. def density(self) -> float:
  571. """
  572. The percent of non- ``fill_value`` points, as decimal.
  573. Examples
  574. --------
  575. >>> from pandas.arrays import SparseArray
  576. >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
  577. >>> s.density
  578. 0.6
  579. """
  580. return self.sp_index.npoints / self.sp_index.length
  581. @property
  582. def npoints(self) -> int:
  583. """
  584. The number of non- ``fill_value`` points.
  585. Examples
  586. --------
  587. >>> from pandas.arrays import SparseArray
  588. >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
  589. >>> s.npoints
  590. 3
  591. """
  592. return self.sp_index.npoints
  593. def isna(self):
  594. # If null fill value, we want SparseDtype[bool, true]
  595. # to preserve the same memory usage.
  596. dtype = SparseDtype(bool, self._null_fill_value)
  597. if self._null_fill_value:
  598. return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
  599. mask = np.full(len(self), False, dtype=np.bool_)
  600. mask[self.sp_index.indices] = isna(self.sp_values)
  601. return type(self)(mask, fill_value=False, dtype=dtype)
    def fillna(
        self: SparseArrayT,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> SparseArrayT:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as all `fill_value` methods will be converted to
               an in-memory ndarray

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        # Exactly one of `value` / `method` must be supplied.
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        if method is not None:
            # Method-based filling needs positional context, so densify first.
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(
                msg,
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
            new_values = np.asarray(self)
            # interpolate_2d modifies new_values inplace
            interpolate_2d(new_values, method=method, limit=limit)
            return type(self)(new_values, fill_value=self.fill_value)

        else:
            # Value-based filling only touches the stored sp_values.
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)
    def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
        """Shift values by `periods` positions, filling vacated slots with `fill_value`."""
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        # The fill value may not fit the current subtype (e.g. NaN into int).
        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        # Block of `fill_value` entries that slides in from either end.
        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])
    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
            Position of the first implicit ``fill_value``, or -1 when the
            array is empty or holds no fill values at all.
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            # Empty, or every position is stored explicitly (no fill values).
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            # Either nothing is stored (all fill), or position 0 is a gap.
            return 0

        # a number larger than 1 should be appended to
        # the last in case of fill value only appears
        # in the tail of array
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1
    def unique(self: SparseArrayT) -> SparseArrayT:
        """Return unique values in first-appearance order, including fill_value."""
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            # At least one position holds the fill value, so it belongs in
            # the result at its first occurrence.
            fill_loc = self._first_fill_value_loc()
            # Inorder to align the behavior of pd.unique or
            # pd.Series.unique, we should keep the original
            # order, here we use unique again to find the
            # insertion place. Since the length of sp_values
            # is not large, maybe minor performance hurt
            # is worthwhile to the correctness.
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)
  706. def _values_for_factorize(self):
  707. # Still override this for hash_pandas_object
  708. return np.asarray(self), self.fill_value
  709. def factorize(
  710. self,
  711. use_na_sentinel: bool = True,
  712. ) -> tuple[np.ndarray, SparseArray]:
  713. # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
  714. # The sparsity on this is backwards from what Sparse would want. Want
  715. # ExtensionArray.factorize -> Tuple[EA, EA]
  716. # Given that we have to return a dense array of codes, why bother
  717. # implementing an efficient factorize?
  718. codes, uniques = algos.factorize(
  719. np.asarray(self), use_na_sentinel=use_na_sentinel
  720. )
  721. uniques_sp = SparseArray(uniques, dtype=self.dtype)
  722. return codes, uniques_sp
def value_counts(self, dropna: bool = True) -> Series:
    """
    Returns a Series containing counts of unique values.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN, even if NaN is in sp_values.

    Returns
    -------
    counts : Series
    """
    from pandas import (
        Index,
        Series,
    )

    # Count only the explicitly stored values first.
    keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
    fcounts = self.sp_index.ngaps
    # Every gap holds the fill value; add those counts unless the fill
    # value is NA and dropna is requested.
    if fcounts > 0 and (not self._null_fill_value or not dropna):
        mask = isna(keys) if self._null_fill_value else keys == self.fill_value
        if mask.any():
            # fill_value already appears among the stored values.
            counts[mask] += fcounts
        else:
            # fill_value is not among the stored keys; prepend it.
            # error: Argument 1 to "insert" has incompatible type "Union[
            # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
            # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
            # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
            # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
            # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
            keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
            counts = np.insert(counts, 0, fcounts)

    if not isinstance(keys, ABCIndex):
        index = Index(keys)
    else:
        index = keys
    return Series(counts, index=index, copy=False)
# --------
# Indexing
# --------

@overload
def __getitem__(self, key: ScalarIndexer) -> Any:
    ...

@overload
def __getitem__(
    self: SparseArrayT,
    key: SequenceIndexer | tuple[int | ellipsis, ...],
) -> SparseArrayT:
    ...

def __getitem__(
    self: SparseArrayT,
    key: PositionalIndexer | tuple[int | ellipsis, ...],
) -> SparseArrayT | Any:
    """Scalar / slice / array indexing; avoids densifying where practical."""
    if isinstance(key, tuple):
        # Unwrap e.g. (Ellipsis, 0) -> 0 for 1-D compatibility.
        key = unpack_tuple_and_ellipses(key)
        if key is Ellipsis:
            raise ValueError("Cannot slice with Ellipsis")

    if is_integer(key):
        return self._get_val_at(key)
    elif isinstance(key, tuple):
        # A tuple remaining after unpacking: index the dense array and
        # re-sparsify at the bottom of the method.
        # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
        # for "ndarray[Any, Any]"; expected type
        # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
        # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
        # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
        # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
        # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
        # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
        # _NestedSequence[Union[bool, int]]], ...]]"
        data_slice = self.to_dense()[key]  # type: ignore[index]
    elif isinstance(key, slice):
        # Avoid densifying when handling contiguous slices
        if key.step is None or key.step == 1:
            start = 0 if key.start is None else key.start
            if start < 0:
                start += len(self)

            end = len(self) if key.stop is None else key.stop
            if end < 0:
                end += len(self)

            indices = self.sp_index.indices
            # Keep only stored points that fall inside [start, end).
            keep_inds = np.flatnonzero((indices >= start) & (indices < end))
            sp_vals = self.sp_values[keep_inds]

            sp_index = indices[keep_inds].copy()

            # If we've sliced to not include the start of the array, all our indices
            # should be shifted. NB: here we are careful to also not shift by a
            # negative value for a case like [0, 1][-100:] where the start index
            # should be treated like 0
            if start > 0:
                sp_index -= start

            # Length of our result should match applying this slice to a range
            # of the length of our original array
            new_len = len(range(len(self))[key])
            new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
            return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
        else:
            # Non-unit step: fall back to a positional take.
            indices = np.arange(len(self), dtype=np.int32)[key]
            return self.take(indices)

    elif not is_list_like(key):
        # e.g. "foo" or 2.5
        # exception message copied from numpy
        raise IndexError(
            r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
            r"(`None`) and integer or boolean arrays are valid indices"
        )

    else:
        if isinstance(key, SparseArray):
            # NOTE: If we guarantee that SparseDType(bool)
            # has only fill_value - true, false or nan
            # (see GH PR 44955)
            # we can apply mask very fast:
            if is_bool_dtype(key):
                if isna(key.fill_value):
                    return self.take(key.sp_index.indices[key.sp_values])
                if not key.fill_value:
                    return self.take(key.sp_index.indices)
                n = len(self)
                mask = np.full(n, True, dtype=np.bool_)
                mask[key.sp_index.indices] = False
                return self.take(np.arange(n)[mask])
            else:
                key = np.asarray(key)

        key = check_array_indexer(self, key)

        if com.is_bool_indexer(key):
            # mypy doesn't know we have an array here
            key = cast(np.ndarray, key)
            return self.take(np.arange(len(key), dtype=np.int32)[key])
        elif hasattr(key, "__len__"):
            return self.take(key)
        else:
            raise ValueError(f"Cannot slice with '{key}'")

    return type(self)(data_slice, kind=self.kind)
  852. def _get_val_at(self, loc):
  853. loc = validate_insert_loc(loc, len(self))
  854. sp_loc = self.sp_index.lookup(loc)
  855. if sp_loc == -1:
  856. return self.fill_value
  857. else:
  858. val = self.sp_values[sp_loc]
  859. val = maybe_box_datetimelike(val, self.sp_values.dtype)
  860. return val
  861. def take(
  862. self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
  863. ) -> SparseArrayT:
  864. if is_scalar(indices):
  865. raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
  866. indices = np.asarray(indices, dtype=np.int32)
  867. dtype = None
  868. if indices.size == 0:
  869. result = np.array([], dtype="object")
  870. dtype = self.dtype
  871. elif allow_fill:
  872. result = self._take_with_fill(indices, fill_value=fill_value)
  873. else:
  874. return self._take_without_fill(indices)
  875. return type(self)(
  876. result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
  877. )
def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
    """
    Take positions, with -1 meaning "insert fill_value"; returns dense ndarray.

    The result dtype is upcast as needed so both the array's own fill value
    and the requested fill_value fit.
    """
    if fill_value is None:
        fill_value = self.dtype.na_value

    if indices.min() < -1:
        raise ValueError(
            "Invalid value in 'indices'. Must be between -1 "
            "and the length of the array."
        )

    if indices.max() >= len(self):
        raise IndexError("out of bounds value in 'indices'.")

    if len(self) == 0:
        # Empty... Allow taking only if all empty
        if (indices == -1).all():
            dtype = np.result_type(self.sp_values, type(fill_value))
            taken = np.empty_like(indices, dtype=dtype)
            taken.fill(fill_value)
            return taken
        else:
            raise IndexError("cannot do a non-empty take from an empty axes.")

    # sp_indexer may be -1 for two reasons
    # 1.) we took for an index of -1 (new)
    # 2.) we took a value that was self.fill_value (old)
    sp_indexer = self.sp_index.lookup_array(indices)
    new_fill_indices = indices == -1
    old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

    if self.sp_index.npoints == 0 and old_fill_indices.all():
        # We've looked up all valid points on an all-sparse array.
        taken = np.full(
            sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
        )

    elif self.sp_index.npoints == 0:
        # Avoid taking from the empty self.sp_values
        _dtype = np.result_type(self.dtype.subtype, type(fill_value))
        taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
    else:
        taken = self.sp_values.take(sp_indexer)

        # Fill in two steps.
        # Old fill values
        # New fill values
        # potentially coercing to a new dtype at each stage.

        m0 = sp_indexer[old_fill_indices] < 0
        m1 = sp_indexer[new_fill_indices] < 0

        result_type = taken.dtype

        if m0.any():
            # Upcast so self.fill_value can be stored, then fill gaps.
            result_type = np.result_type(result_type, type(self.fill_value))
            taken = taken.astype(result_type)
            taken[old_fill_indices] = self.fill_value

        if m1.any():
            # Upcast so the requested fill_value can be stored, then fill.
            result_type = np.result_type(result_type, type(fill_value))
            taken = taken.astype(result_type)
            taken[new_fill_indices] = fill_value

    return taken
def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
    """Positional take with numpy semantics (negative indices wrap around)."""
    to_shift = indices < 0

    n = len(self)

    if (indices.max() >= n) or (indices.min() < -n):
        if n == 0:
            raise IndexError("cannot do a non-empty take from an empty axes.")
        raise IndexError("out of bounds value in 'indices'.")

    if to_shift.any():
        # Normalize negatives; copy first to avoid mutating the caller's array.
        indices = indices.copy()
        indices[to_shift] += n

    sp_indexer = self.sp_index.lookup_array(indices)
    # -1 lookups correspond to gap positions (fill value); keep them sparse.
    value_mask = sp_indexer != -1
    new_sp_values = self.sp_values[sp_indexer[value_mask]]

    value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

    new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
    return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
  946. def searchsorted(
  947. self,
  948. v: ArrayLike | object,
  949. side: Literal["left", "right"] = "left",
  950. sorter: NumpySorter = None,
  951. ) -> npt.NDArray[np.intp] | np.intp:
  952. msg = "searchsorted requires high memory usage."
  953. warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
  954. if not is_scalar(v):
  955. v = np.asarray(v)
  956. v = np.asarray(v)
  957. return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
  958. def copy(self: SparseArrayT) -> SparseArrayT:
  959. values = self.sp_values.copy()
  960. return self._simple_new(values, self.sp_index, self.dtype)
@classmethod
def _concat_same_type(
    cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
) -> SparseArrayT:
    """Concatenate SparseArrays, adopting the first array's fill value and kind."""
    # NOTE(review): to_concat[0] is indexed before the emptiness check just
    # below, so an empty sequence raises IndexError here — confirm callers
    # always pass at least one array.
    fill_value = to_concat[0].fill_value

    values = []
    length = 0

    if to_concat:
        sp_kind = to_concat[0].kind
    else:
        sp_kind = "integer"

    sp_index: SparseIndex
    if sp_kind == "integer":
        indices = []

        for arr in to_concat:
            # Shift each array's indices by the running total length.
            int_idx = arr.sp_index.indices.copy()
            int_idx += length  # TODO: wraparound
            length += arr.sp_index.length

            values.append(arr.sp_values)
            indices.append(int_idx)

        data = np.concatenate(values)
        indices_arr = np.concatenate(indices)
        # error: Argument 2 to "IntIndex" has incompatible type
        # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
        # expected "Sequence[int]"
        sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

    else:
        # when concatenating block indices, we don't claim that you'll
        # get an identical index as concatenating the values and then
        # creating a new index. We don't want to spend the time trying
        # to merge blocks across arrays in `to_concat`, so the resulting
        # BlockIndex may have more blocks.
        blengths = []
        blocs = []

        for arr in to_concat:
            block_idx = arr.sp_index.to_block_index()

            values.append(arr.sp_values)
            blocs.append(block_idx.blocs.copy() + length)
            blengths.append(block_idx.blengths)
            length += arr.sp_index.length

        data = np.concatenate(values)
        blocs_arr = np.concatenate(blocs)
        blengths_arr = np.concatenate(blengths)

        sp_index = BlockIndex(length, blocs_arr, blengths_arr)

    return cls(data, sparse_index=sp_index, fill_value=fill_value)
def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
    """
    Change the dtype of a SparseArray.

    The output will always be a SparseArray. To convert to a dense
    ndarray with a certain dtype, use :meth:`numpy.asarray`.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
        For SparseDtype, this changes the dtype of
        ``self.sp_values`` and the ``self.fill_value``.

        For other dtypes, this only changes the dtype of
        ``self.sp_values``.

    copy : bool, default True
        Whether to ensure a copy is made, even if not necessary.

    Returns
    -------
    SparseArray

    Examples
    --------
    >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    >>> arr.astype(SparseDtype(np.dtype('int32')))
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    Using a NumPy dtype with a different kind (e.g. float) will coerce
    just ``self.sp_values``.

    >>> arr.astype(SparseDtype(np.dtype('float64')))
    ... # doctest: +NORMALIZE_WHITESPACE
    [nan, nan, 1.0, 2.0]
    Fill: nan
    IntIndex
    Indices: array([2, 3], dtype=int32)

    Using a SparseDtype, you can also change the fill value as well.

    >>> arr.astype(SparseDtype("float64", fill_value=0.0))
    ... # doctest: +NORMALIZE_WHITESPACE
    [0.0, 0.0, 1.0, 2.0]
    Fill: 0.0
    IntIndex
    Indices: array([2, 3], dtype=int32)
    """
    # Same dtype: return self, or a copy when one was requested.
    if is_dtype_equal(dtype, self._dtype):
        if not copy:
            return self
        else:
            return self.copy()

    future_dtype = pandas_dtype(dtype)
    if not isinstance(future_dtype, SparseDtype):
        # GH#34457: a non-sparse target dtype densifies the array.
        values = np.asarray(self)
        values = ensure_wrapped_if_datetimelike(values)
        return astype_array(values, dtype=future_dtype, copy=False)

    # Sparse target: cast only sp_values; the index layout is reused.
    dtype = self.dtype.update_dtype(dtype)
    subtype = pandas_dtype(dtype._subtype_with_str)
    subtype = cast(np.dtype, subtype)  # ensured by update_dtype
    values = ensure_wrapped_if_datetimelike(self.sp_values)
    sp_values = astype_array(values, subtype, copy=copy)
    sp_values = np.asarray(sp_values)

    return self._simple_new(sp_values, self.sp_index, dtype)
def map(self: SparseArrayT, mapper) -> SparseArrayT:
    """
    Map categories using an input mapping or function.

    Parameters
    ----------
    mapper : dict, Series, callable
        The correspondence from old values to new.

    Returns
    -------
    SparseArray
        The output array will have the same density as the input.
        The output fill value will be the result of applying the
        mapping to ``self.fill_value``

    Examples
    --------
    >>> arr = pd.arrays.SparseArray([0, 1, 2])
    >>> arr.map(lambda x: x + 10)
    [10, 11, 12]
    Fill: 10
    IntIndex
    Indices: array([1, 2], dtype=int32)

    >>> arr.map({0: 10, 1: 11, 2: 12})
    [10, 11, 12]
    Fill: 10
    IntIndex
    Indices: array([1, 2], dtype=int32)

    >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
    [10, 11, 12]
    Fill: 10
    IntIndex
    Indices: array([1, 2], dtype=int32)
    """
    # this is used in apply.
    # We get hit since we're an "is_extension_array_dtype" but regular extension
    # types are not hit. This may be worth adding to the interface.
    if isinstance(mapper, ABCSeries):
        mapper = mapper.to_dict()

    if isinstance(mapper, abc.Mapping):
        # Stored values missing from the mapping become None; a missing
        # fill value maps to itself.
        fill_value = mapper.get(self.fill_value, self.fill_value)
        sp_values = [mapper.get(x, None) for x in self.sp_values]
    else:
        fill_value = mapper(self.fill_value)
        sp_values = [mapper(x) for x in self.sp_values]

    return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
  1114. def to_dense(self) -> np.ndarray:
  1115. """
  1116. Convert SparseArray to a NumPy array.
  1117. Returns
  1118. -------
  1119. arr : NumPy array
  1120. """
  1121. return np.asarray(self, dtype=self.sp_values.dtype)
  1122. def _where(self, mask, value):
  1123. # NB: may not preserve dtype, e.g. result may be Sparse[float64]
  1124. # while self is Sparse[int64]
  1125. naive_implementation = np.where(mask, self, value)
  1126. dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
  1127. result = type(self)._from_sequence(naive_implementation, dtype=dtype)
  1128. return result
# ------------------------------------------------------------------------
# IO
# ------------------------------------------------------------------------
def __setstate__(self, state) -> None:
    """Necessary for making this object picklable"""
    if isinstance(state, tuple):
        # Compat for pandas < 0.24.0
        nd_state, (fill_value, sp_index) = state
        # Rebuild the values ndarray from its own pickled state, then
        # restore the SparseArray attributes from the pieces.
        sparse_values = np.array([])
        sparse_values.__setstate__(nd_state)

        self._sparse_values = sparse_values
        self._sparse_index = sp_index
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
    else:
        self.__dict__.update(state)
  1144. def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
  1145. if self.fill_value == 0:
  1146. return (self.sp_index.indices,)
  1147. else:
  1148. return (self.sp_index.indices[self.sp_values != 0],)
# ------------------------------------------------------------------------
# Reductions
# ------------------------------------------------------------------------
def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
    """Dispatch a named reduction (e.g. "sum", "mean") to the matching method."""
    method = getattr(self, name, None)

    if method is None:
        raise TypeError(f"cannot perform {name} with type {self.dtype}")

    if skipna:
        arr = self
    else:
        # NOTE(review): calling dropna() on the skipna=False path looks
        # inverted at first glance; presumably the individual reduction
        # methods handle NA-skipping themselves — confirm before changing.
        arr = self.dropna()

    return getattr(arr, name)(**kwargs)
  1161. def all(self, axis=None, *args, **kwargs):
  1162. """
  1163. Tests whether all elements evaluate True
  1164. Returns
  1165. -------
  1166. all : bool
  1167. See Also
  1168. --------
  1169. numpy.all
  1170. """
  1171. nv.validate_all(args, kwargs)
  1172. values = self.sp_values
  1173. if len(values) != len(self) and not np.all(self.fill_value):
  1174. return False
  1175. return values.all()
  1176. def any(self, axis: AxisInt = 0, *args, **kwargs):
  1177. """
  1178. Tests whether at least one of elements evaluate True
  1179. Returns
  1180. -------
  1181. any : bool
  1182. See Also
  1183. --------
  1184. numpy.any
  1185. """
  1186. nv.validate_any(args, kwargs)
  1187. values = self.sp_values
  1188. if len(values) != len(self) and np.any(self.fill_value):
  1189. return True
  1190. return values.any().item()
def sum(
    self,
    axis: AxisInt = 0,
    min_count: int = 0,
    skipna: bool = True,
    *args,
    **kwargs,
) -> Scalar:
    """
    Sum of non-NA/null values

    Parameters
    ----------
    axis : int, default 0
        Not Used. NumPy compatibility.
    min_count : int, default 0
        The required number of valid values to perform the summation. If fewer
        than ``min_count`` valid values are present, the result will be the missing
        value indicator for subarray type.
    *args, **kwargs
        Not Used. NumPy compatibility.

    Returns
    -------
    scalar
    """
    nv.validate_sum(args, kwargs)
    valid_vals = self._valid_sp_values
    sp_sum = valid_vals.sum()
    # NOTE(review): despite the name, this flags "gaps with a non-null
    # fill value"; confirm the intended skipna semantics before renaming.
    has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

    if has_na and not skipna:
        return na_value_for_dtype(self.dtype.subtype, compat=False)

    if self._null_fill_value:
        # NA fill value: gaps contribute nothing to the sum.
        if check_below_min_count(valid_vals.shape, None, min_count):
            return na_value_for_dtype(self.dtype.subtype, compat=False)
        return sp_sum
    else:
        # Non-NA fill value: each gap contributes fill_value, and gaps
        # also count toward min_count (hence min_count - nsparse).
        nsparse = self.sp_index.ngaps
        if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
            return na_value_for_dtype(self.dtype.subtype, compat=False)
        return sp_sum + self.fill_value * nsparse
  1230. def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
  1231. """
  1232. Cumulative sum of non-NA/null values.
  1233. When performing the cumulative summation, any non-NA/null values will
  1234. be skipped. The resulting SparseArray will preserve the locations of
  1235. NaN values, but the fill value will be `np.nan` regardless.
  1236. Parameters
  1237. ----------
  1238. axis : int or None
  1239. Axis over which to perform the cumulative summation. If None,
  1240. perform cumulative summation over flattened array.
  1241. Returns
  1242. -------
  1243. cumsum : SparseArray
  1244. """
  1245. nv.validate_cumsum(args, kwargs)
  1246. if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
  1247. raise ValueError(f"axis(={axis}) out of bounds")
  1248. if not self._null_fill_value:
  1249. return SparseArray(self.to_dense()).cumsum()
  1250. return SparseArray(
  1251. self.sp_values.cumsum(),
  1252. sparse_index=self.sp_index,
  1253. fill_value=self.fill_value,
  1254. )
  1255. def mean(self, axis: Axis = 0, *args, **kwargs):
  1256. """
  1257. Mean of non-NA/null values
  1258. Returns
  1259. -------
  1260. mean : float
  1261. """
  1262. nv.validate_mean(args, kwargs)
  1263. valid_vals = self._valid_sp_values
  1264. sp_sum = valid_vals.sum()
  1265. ct = len(valid_vals)
  1266. if self._null_fill_value:
  1267. return sp_sum / ct
  1268. else:
  1269. nsparse = self.sp_index.ngaps
  1270. return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
    """
    Max of array values, ignoring NA values if specified.

    Parameters
    ----------
    axis : int, default 0
        Not Used. NumPy compatibility.
    skipna : bool, default True
        Whether to ignore NA values.

    Returns
    -------
    scalar
    """
    # Validate axis for NumPy compatibility, then defer to the shared helper.
    nv.validate_minmax_axis(axis, self.ndim)
    return self._min_max("max", skipna=skipna)
def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
    """
    Min of array values, ignoring NA values if specified.

    Parameters
    ----------
    axis : int, default 0
        Not Used. NumPy compatibility.
    skipna : bool, default True
        Whether to ignore NA values.

    Returns
    -------
    scalar
    """
    # Validate axis for NumPy compatibility, then defer to the shared helper.
    nv.validate_minmax_axis(axis, self.ndim)
    return self._min_max("min", skipna=skipna)
  1301. def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
  1302. """
  1303. Min/max of non-NA/null values
  1304. Parameters
  1305. ----------
  1306. kind : {"min", "max"}
  1307. skipna : bool
  1308. Returns
  1309. -------
  1310. scalar
  1311. """
  1312. valid_vals = self._valid_sp_values
  1313. has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
  1314. if len(valid_vals) > 0:
  1315. sp_min_max = getattr(valid_vals, kind)()
  1316. # If a non-null fill value is currently present, it might be the min/max
  1317. if has_nonnull_fill_vals:
  1318. func = max if kind == "max" else min
  1319. return func(sp_min_max, self.fill_value)
  1320. elif skipna:
  1321. return sp_min_max
  1322. elif self.sp_index.ngaps == 0:
  1323. # No NAs present
  1324. return sp_min_max
  1325. else:
  1326. return na_value_for_dtype(self.dtype.subtype, compat=False)
  1327. elif has_nonnull_fill_vals:
  1328. return self.fill_value
  1329. else:
  1330. return na_value_for_dtype(self.dtype.subtype, compat=False)
def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
    """Shared implementation of argmin/argmax over the sparse layout."""
    values = self._sparse_values
    index = self._sparse_index.indices
    mask = np.asarray(isna(values))
    func = np.argmax if kind == "argmax" else np.argmin

    idx = np.arange(values.shape[0])
    non_nans = values[~mask]
    non_nan_idx = idx[~mask]

    # Best candidate among explicitly stored (non-NA) values, translated
    # back to a position in the dense array.
    _candidate = non_nan_idx[func(non_nans)]
    candidate = index[_candidate]

    if isna(self.fill_value):
        return candidate
    if kind == "argmin" and self[candidate] < self.fill_value:
        return candidate
    if kind == "argmax" and self[candidate] > self.fill_value:
        return candidate
    # Otherwise the fill value ties or beats the stored candidate, so the
    # answer is the first position holding the fill value — if any exists.
    _loc = self._first_fill_value_loc()
    if _loc == -1:
        # fill_value doesn't exist
        return candidate
    else:
        return _loc
def argmax(self, skipna: bool = True) -> int:
    """Position of the maximum value; NotImplementedError if skipna=False with NAs."""
    validate_bool_kwarg(skipna, "skipna")
    if not skipna and self._hasna:
        raise NotImplementedError
    return self._argmin_argmax("argmax")

def argmin(self, skipna: bool = True) -> int:
    """Position of the minimum value; NotImplementedError if skipna=False with NAs."""
    validate_bool_kwarg(skipna, "skipna")
    if not skipna and self._hasna:
        raise NotImplementedError
    return self._argmin_argmax("argmin")
# ------------------------------------------------------------------------
# Ufuncs
# ------------------------------------------------------------------------

# Input types (besides SparseArray itself) accepted by __array_ufunc__.
_HANDLED_TYPES = (np.ndarray, numbers.Number)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """NumPy ufunc protocol; keeps single-input results sparse when possible."""
    out = kwargs.get("out", ())

    for x in inputs + out:
        if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
            return NotImplemented

    # for binary ops, use our custom dunder methods
    result = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if result is not NotImplemented:
        return result

    if "out" in kwargs:
        # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
        res = arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )
        return res

    if method == "reduce":
        result = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            # e.g. tests.series.test_ufunc.TestNumpyReductions
            return result

    if len(inputs) == 1:
        # No alignment necessary.
        # Apply the ufunc to stored values and fill value separately.
        sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
        fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

        if ufunc.nout > 1:
            # multiple outputs. e.g. modf
            arrays = tuple(
                self._simple_new(
                    sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                )
                for sp_value, fv in zip(sp_values, fill_value)
            )
            return arrays
        elif method == "reduce":
            # e.g. reductions
            return sp_values

        return self._simple_new(
            sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
        )

    # Multiple inputs: densify everything and apply the ufunc directly.
    new_inputs = tuple(np.asarray(x) for x in inputs)
    result = getattr(ufunc, method)(*new_inputs, **kwargs)
    if out:
        if len(out) == 1:
            out = out[0]
        return out

    if ufunc.nout > 1:
        return tuple(type(self)(x) for x in result)
    elif method == "at":
        # no return value
        return None
    else:
        return type(self)(result)
# ------------------------------------------------------------------------
# Ops
# ------------------------------------------------------------------------
def _arith_method(self, other, op):
    """Arithmetic ops: operate on sp_values and fill_value separately."""
    op_name = op.__name__

    if isinstance(other, SparseArray):
        return _sparse_array_op(self, other, op, op_name)

    elif is_scalar(other):
        with np.errstate(all="ignore"):
            fill = op(_get_fill(self), np.asarray(other))
            result = op(self.sp_values, other)

        if op_name == "divmod":
            # divmod yields a pair; wrap quotient and remainder separately.
            left, right = result
            lfill, rfill = fill
            return (
                _wrap_result(op_name, left, self.sp_index, lfill),
                _wrap_result(op_name, right, self.sp_index, rfill),
            )

        return _wrap_result(op_name, result, self.sp_index, fill)

    else:
        # Array-like: coerce to SparseArray and dispatch to the shared op.
        other = np.asarray(other)
        with np.errstate(all="ignore"):
            if len(self) != len(other):
                raise AssertionError(
                    f"length mismatch: {len(self)} vs. {len(other)}"
                )
            if not isinstance(other, SparseArray):
                dtype = getattr(other, "dtype", None)
                other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
            return _sparse_array_op(self, other, op, op_name)
def _cmp_method(self, other, op) -> SparseArray:
    """Comparison ops; the result is a boolean SparseArray."""
    if not is_scalar(other) and not isinstance(other, type(self)):
        # convert list-like to ndarray
        other = np.asarray(other)

    if isinstance(other, np.ndarray):
        # TODO: make this more flexible than just ndarray...
        other = SparseArray(other, fill_value=self.fill_value)

    if isinstance(other, SparseArray):
        if len(self) != len(other):
            raise ValueError(
                f"operands have mismatched length {len(self)} and {len(other)}"
            )

        op_name = op.__name__.strip("_")
        return _sparse_array_op(self, other, op, op_name)
    else:
        # scalar
        # Compare the fill value once, broadcast, then overwrite stored
        # positions with their individually compared results.
        with np.errstate(all="ignore"):
            fill_value = op(self.fill_value, other)
            result = np.full(len(self), fill_value, dtype=np.bool_)
            result[self.sp_index.indices] = op(self.sp_values, other)

        return type(self)(
            result,
            fill_value=fill_value,
            dtype=np.bool_,
        )

# Logical ops share the comparison implementation.
_logical_method = _cmp_method
  1479. def _unary_method(self, op) -> SparseArray:
  1480. fill_value = op(np.array(self.fill_value)).item()
  1481. dtype = SparseDtype(self.dtype.subtype, fill_value)
  1482. # NOTE: if fill_value doesn't change
  1483. # we just have to apply op to sp_values
  1484. if isna(self.fill_value) or fill_value == self.fill_value:
  1485. values = op(self.sp_values)
  1486. return type(self)._simple_new(values, self.sp_index, self.dtype)
  1487. # In the other case we have to recalc indexes
  1488. return type(self)(op(self.to_dense()), dtype=dtype)
def __pos__(self) -> SparseArray:
    # Unary +: delegate to the shared unary-op helper.
    return self._unary_method(operator.pos)

def __neg__(self) -> SparseArray:
    # Unary -: delegate to the shared unary-op helper.
    return self._unary_method(operator.neg)

def __invert__(self) -> SparseArray:
    # Bitwise inversion (~): delegate to the shared unary-op helper.
    return self._unary_method(operator.invert)

def __abs__(self) -> SparseArray:
    # Absolute value: delegate to the shared unary-op helper.
    return self._unary_method(operator.abs)
  1497. # ----------
  1498. # Formatting
  1499. # -----------
  1500. def __repr__(self) -> str:
  1501. pp_str = printing.pprint_thing(self)
  1502. pp_fill = printing.pprint_thing(self.fill_value)
  1503. pp_index = printing.pprint_thing(self.sp_index)
  1504. return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
def _formatter(self, boxed: bool = False):
    """Return None so the calling formatter infers one from the values' dtype."""
    # Defer to the formatter from the GenericArrayFormatter calling us.
    # This will infer the correct formatter from the dtype of the values.
    return None
def _make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: np.dtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    # Build the "keep this element" mask: everything not equal to fill_value.
    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # element-wise equality check method in numpy doesn't treat
            # each element type, eg. 0, 0.0, and False are treated as
            # same. So we have to check the both of its type and value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
        sparsified_values = astype_array(sparsified_values, dtype=dtype)
        sparsified_values = np.asarray(sparsified_values)

    # TODO: copy
    return sparsified_values, index, fill_value
  1560. @overload
  1561. def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
  1562. ...
  1563. @overload
  1564. def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
  1565. ...
  1566. def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
  1567. index: SparseIndex
  1568. if kind == "block":
  1569. locs, lens = splib.get_blocks(indices)
  1570. index = BlockIndex(length, locs, lens)
  1571. elif kind == "integer":
  1572. index = IntIndex(length, indices)
  1573. else: # pragma: no cover
  1574. raise ValueError("must be block or integer type")
  1575. return index