# array_manager.py (44 KB)
  1. """
  2. Experimental manager based on storing a collection of 1D arrays
  3. """
  4. from __future__ import annotations
  5. from typing import (
  6. Any,
  7. Callable,
  8. Hashable,
  9. Literal,
  10. TypeVar,
  11. )
  12. import numpy as np
  13. from pandas._libs import (
  14. NaT,
  15. algos as libalgos,
  16. lib,
  17. )
  18. from pandas._typing import (
  19. ArrayLike,
  20. AxisInt,
  21. DtypeObj,
  22. QuantileInterpolation,
  23. npt,
  24. )
  25. from pandas.util._validators import validate_bool_kwarg
  26. from pandas.core.dtypes.astype import astype_array_safe
  27. from pandas.core.dtypes.cast import (
  28. ensure_dtype_can_hold_na,
  29. infer_dtype_from_scalar,
  30. )
  31. from pandas.core.dtypes.common import (
  32. ensure_platform_int,
  33. is_datetime64_ns_dtype,
  34. is_dtype_equal,
  35. is_extension_array_dtype,
  36. is_integer,
  37. is_numeric_dtype,
  38. is_object_dtype,
  39. is_timedelta64_ns_dtype,
  40. )
  41. from pandas.core.dtypes.dtypes import (
  42. ExtensionDtype,
  43. PandasDtype,
  44. )
  45. from pandas.core.dtypes.generic import (
  46. ABCDataFrame,
  47. ABCSeries,
  48. )
  49. from pandas.core.dtypes.missing import (
  50. array_equals,
  51. isna,
  52. na_value_for_dtype,
  53. )
  54. import pandas.core.algorithms as algos
  55. from pandas.core.array_algos.quantile import quantile_compat
  56. from pandas.core.array_algos.take import take_1d
  57. from pandas.core.arrays import (
  58. DatetimeArray,
  59. ExtensionArray,
  60. PandasArray,
  61. TimedeltaArray,
  62. )
  63. from pandas.core.arrays.sparse import SparseDtype
  64. from pandas.core.construction import (
  65. ensure_wrapped_if_datetimelike,
  66. extract_array,
  67. sanitize_array,
  68. )
  69. from pandas.core.indexers import (
  70. maybe_convert_indices,
  71. validate_indices,
  72. )
  73. from pandas.core.indexes.api import (
  74. Index,
  75. ensure_index,
  76. )
  77. from pandas.core.internals.base import (
  78. DataManager,
  79. SingleDataManager,
  80. interleaved_dtype,
  81. )
  82. from pandas.core.internals.blocks import (
  83. ensure_block_shape,
  84. external_values,
  85. extract_pandas_array,
  86. maybe_coerce_values,
  87. new_block,
  88. to_native_types,
  89. )
  90. T = TypeVar("T", bound="BaseArrayManager")
  91. class BaseArrayManager(DataManager):
  92. """
  93. Core internal data structure to implement DataFrame and Series.
  94. Alternative to the BlockManager, storing a list of 1D arrays instead of
  95. Blocks.
  96. This is *not* a public API class
  97. Parameters
  98. ----------
  99. arrays : Sequence of arrays
  100. axes : Sequence of Index
  101. verify_integrity : bool, default True
  102. """
  103. __slots__ = [
  104. "_axes", # private attribute, because 'axes' has different order, see below
  105. "arrays",
  106. ]
  107. arrays: list[np.ndarray | ExtensionArray]
  108. _axes: list[Index]
  109. def __init__(
  110. self,
  111. arrays: list[np.ndarray | ExtensionArray],
  112. axes: list[Index],
  113. verify_integrity: bool = True,
  114. ) -> None:
  115. raise NotImplementedError
  116. def make_empty(self: T, axes=None) -> T:
  117. """Return an empty ArrayManager with the items axis of len 0 (no columns)"""
  118. if axes is None:
  119. axes = [self.axes[1:], Index([])]
  120. arrays: list[np.ndarray | ExtensionArray] = []
  121. return type(self)(arrays, axes)
  122. @property
  123. def items(self) -> Index:
  124. return self._axes[-1]
  125. @property
  126. # error: Signature of "axes" incompatible with supertype "DataManager"
  127. def axes(self) -> list[Index]: # type: ignore[override]
  128. # mypy doesn't work to override attribute with property
  129. # see https://github.com/python/mypy/issues/4125
  130. """Axes is BlockManager-compatible order (columns, rows)"""
  131. return [self._axes[1], self._axes[0]]
  132. @property
  133. def shape_proper(self) -> tuple[int, ...]:
  134. # this returns (n_rows, n_columns)
  135. return tuple(len(ax) for ax in self._axes)
  136. @staticmethod
  137. def _normalize_axis(axis: AxisInt) -> int:
  138. # switch axis
  139. axis = 1 if axis == 0 else 0
  140. return axis
  141. def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
  142. # Caller is responsible for ensuring we have an Index object.
  143. self._validate_set_axis(axis, new_labels)
  144. axis = self._normalize_axis(axis)
  145. self._axes[axis] = new_labels
  146. def get_dtypes(self) -> np.ndarray:
  147. return np.array([arr.dtype for arr in self.arrays], dtype="object")
  148. def add_references(self, mgr: BaseArrayManager) -> None:
  149. """
  150. Only implemented on the BlockManager level
  151. """
  152. return
  153. def __getstate__(self):
  154. return self.arrays, self._axes
  155. def __setstate__(self, state) -> None:
  156. self.arrays = state[0]
  157. self._axes = state[1]
  158. def __repr__(self) -> str:
  159. output = type(self).__name__
  160. output += f"\nIndex: {self._axes[0]}"
  161. if self.ndim == 2:
  162. output += f"\nColumns: {self._axes[1]}"
  163. output += f"\n{len(self.arrays)} arrays:"
  164. for arr in self.arrays:
  165. output += f"\n{arr.dtype}"
  166. return output
  167. def apply(
  168. self: T,
  169. f,
  170. align_keys: list[str] | None = None,
  171. **kwargs,
  172. ) -> T:
  173. """
  174. Iterate over the arrays, collect and create a new ArrayManager.
  175. Parameters
  176. ----------
  177. f : str or callable
  178. Name of the Array method to apply.
  179. align_keys: List[str] or None, default None
  180. **kwargs
  181. Keywords to pass to `f`
  182. Returns
  183. -------
  184. ArrayManager
  185. """
  186. assert "filter" not in kwargs
  187. align_keys = align_keys or []
  188. result_arrays: list[np.ndarray] = []
  189. # fillna: Series/DataFrame is responsible for making sure value is aligned
  190. aligned_args = {k: kwargs[k] for k in align_keys}
  191. if f == "apply":
  192. f = kwargs.pop("func")
  193. for i, arr in enumerate(self.arrays):
  194. if aligned_args:
  195. for k, obj in aligned_args.items():
  196. if isinstance(obj, (ABCSeries, ABCDataFrame)):
  197. # The caller is responsible for ensuring that
  198. # obj.axes[-1].equals(self.items)
  199. if obj.ndim == 1:
  200. kwargs[k] = obj.iloc[i]
  201. else:
  202. kwargs[k] = obj.iloc[:, i]._values
  203. else:
  204. # otherwise we have an array-like
  205. kwargs[k] = obj[i]
  206. if callable(f):
  207. applied = f(arr, **kwargs)
  208. else:
  209. applied = getattr(arr, f)(**kwargs)
  210. # if not isinstance(applied, ExtensionArray):
  211. # # TODO not all EA operations return new EAs (eg astype)
  212. # applied = array(applied)
  213. result_arrays.append(applied)
  214. new_axes = self._axes
  215. # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
  216. # expected "List[Union[ndarray, ExtensionArray]]"
  217. return type(self)(result_arrays, new_axes) # type: ignore[arg-type]
  218. def apply_with_block(
  219. self: T, f, align_keys=None, swap_axis: bool = True, **kwargs
  220. ) -> T:
  221. # switch axis to follow BlockManager logic
  222. if swap_axis and "axis" in kwargs and self.ndim == 2:
  223. kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0
  224. align_keys = align_keys or []
  225. aligned_args = {k: kwargs[k] for k in align_keys}
  226. result_arrays = []
  227. for i, arr in enumerate(self.arrays):
  228. if aligned_args:
  229. for k, obj in aligned_args.items():
  230. if isinstance(obj, (ABCSeries, ABCDataFrame)):
  231. # The caller is responsible for ensuring that
  232. # obj.axes[-1].equals(self.items)
  233. if obj.ndim == 1:
  234. if self.ndim == 2:
  235. kwargs[k] = obj.iloc[slice(i, i + 1)]._values
  236. else:
  237. kwargs[k] = obj.iloc[:]._values
  238. else:
  239. kwargs[k] = obj.iloc[:, [i]]._values
  240. else:
  241. # otherwise we have an ndarray
  242. if obj.ndim == 2:
  243. kwargs[k] = obj[[i]]
  244. if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray):
  245. # i.e. TimedeltaArray, DatetimeArray with tz=None. Need to
  246. # convert for the Block constructors.
  247. arr = np.asarray(arr)
  248. if self.ndim == 2:
  249. arr = ensure_block_shape(arr, 2)
  250. block = new_block(arr, placement=slice(0, 1, 1), ndim=2)
  251. else:
  252. block = new_block(arr, placement=slice(0, len(self), 1), ndim=1)
  253. applied = getattr(block, f)(**kwargs)
  254. if isinstance(applied, list):
  255. applied = applied[0]
  256. arr = applied.values
  257. if self.ndim == 2 and arr.ndim == 2:
  258. # 2D for np.ndarray or DatetimeArray/TimedeltaArray
  259. assert len(arr) == 1
  260. # error: No overload variant of "__getitem__" of "ExtensionArray"
  261. # matches argument type "Tuple[int, slice]"
  262. arr = arr[0, :] # type: ignore[call-overload]
  263. result_arrays.append(arr)
  264. return type(self)(result_arrays, self._axes)
  265. def where(self: T, other, cond, align: bool) -> T:
  266. if align:
  267. align_keys = ["other", "cond"]
  268. else:
  269. align_keys = ["cond"]
  270. other = extract_array(other, extract_numpy=True)
  271. return self.apply_with_block(
  272. "where",
  273. align_keys=align_keys,
  274. other=other,
  275. cond=cond,
  276. )
  277. def round(self: T, decimals: int, using_cow: bool = False) -> T:
  278. return self.apply_with_block("round", decimals=decimals, using_cow=using_cow)
  279. def setitem(self: T, indexer, value) -> T:
  280. return self.apply_with_block("setitem", indexer=indexer, value=value)
  281. def putmask(self: T, mask, new, align: bool = True) -> T:
  282. if align:
  283. align_keys = ["new", "mask"]
  284. else:
  285. align_keys = ["mask"]
  286. new = extract_array(new, extract_numpy=True)
  287. return self.apply_with_block(
  288. "putmask",
  289. align_keys=align_keys,
  290. mask=mask,
  291. new=new,
  292. )
  293. def diff(self: T, n: int, axis: AxisInt) -> T:
  294. assert self.ndim == 2 and axis == 0 # caller ensures
  295. return self.apply(algos.diff, n=n, axis=axis)
  296. def interpolate(self: T, **kwargs) -> T:
  297. return self.apply_with_block("interpolate", swap_axis=False, **kwargs)
  298. def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
  299. if fill_value is lib.no_default:
  300. fill_value = None
  301. if axis == 1 and self.ndim == 2:
  302. # TODO column-wise shift
  303. raise NotImplementedError
  304. return self.apply_with_block(
  305. "shift", periods=periods, axis=axis, fill_value=fill_value
  306. )
  307. def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
  308. if limit is not None:
  309. # Do this validation even if we go through one of the no-op paths
  310. limit = libalgos.validate_limit(None, limit=limit)
  311. return self.apply_with_block(
  312. "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
  313. )
  314. def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
  315. if copy is None:
  316. copy = True
  317. return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)
  318. def convert(self: T, copy: bool | None) -> T:
  319. if copy is None:
  320. copy = True
  321. def _convert(arr):
  322. if is_object_dtype(arr.dtype):
  323. # extract PandasArray for tests that patch PandasArray._typ
  324. arr = np.asarray(arr)
  325. result = lib.maybe_convert_objects(
  326. arr,
  327. convert_datetime=True,
  328. convert_timedelta=True,
  329. convert_period=True,
  330. convert_interval=True,
  331. )
  332. if result is arr and copy:
  333. return arr.copy()
  334. return result
  335. else:
  336. return arr.copy() if copy else arr
  337. return self.apply(_convert)
  338. def replace_regex(self: T, **kwargs) -> T:
  339. return self.apply_with_block("_replace_regex", **kwargs)
  340. def replace(self: T, to_replace, value, inplace: bool) -> T:
  341. inplace = validate_bool_kwarg(inplace, "inplace")
  342. assert np.ndim(value) == 0, value
  343. # TODO "replace" is right now implemented on the blocks, we should move
  344. # it to general array algos so it can be reused here
  345. return self.apply_with_block(
  346. "replace", value=value, to_replace=to_replace, inplace=inplace
  347. )
  348. def replace_list(
  349. self: T,
  350. src_list: list[Any],
  351. dest_list: list[Any],
  352. inplace: bool = False,
  353. regex: bool = False,
  354. ) -> T:
  355. """do a list replace"""
  356. inplace = validate_bool_kwarg(inplace, "inplace")
  357. return self.apply_with_block(
  358. "replace_list",
  359. src_list=src_list,
  360. dest_list=dest_list,
  361. inplace=inplace,
  362. regex=regex,
  363. )
  364. def to_native_types(self: T, **kwargs) -> T:
  365. return self.apply(to_native_types, **kwargs)
  366. @property
  367. def is_mixed_type(self) -> bool:
  368. return True
  369. @property
  370. def is_numeric_mixed_type(self) -> bool:
  371. return all(is_numeric_dtype(t) for t in self.get_dtypes())
  372. @property
  373. def any_extension_types(self) -> bool:
  374. """Whether any of the blocks in this manager are extension blocks"""
  375. return False # any(block.is_extension for block in self.blocks)
  376. @property
  377. def is_view(self) -> bool:
  378. """return a boolean if we are a single block and are a view"""
  379. # TODO what is this used for?
  380. return False
  381. @property
  382. def is_single_block(self) -> bool:
  383. return len(self.arrays) == 1
  384. def _get_data_subset(self: T, predicate: Callable) -> T:
  385. indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
  386. arrays = [self.arrays[i] for i in indices]
  387. # TODO copy?
  388. # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq,
  389. # see test_describe_datetime_columns
  390. taker = np.array(indices, dtype="intp")
  391. new_cols = self._axes[1].take(taker)
  392. new_axes = [self._axes[0], new_cols]
  393. return type(self)(arrays, new_axes, verify_integrity=False)
  394. def get_bool_data(self: T, copy: bool = False) -> T:
  395. """
  396. Select columns that are bool-dtype and object-dtype columns that are all-bool.
  397. Parameters
  398. ----------
  399. copy : bool, default False
  400. Whether to copy the blocks
  401. """
  402. return self._get_data_subset(lambda x: x.dtype == np.dtype(bool))
  403. def get_numeric_data(self: T, copy: bool = False) -> T:
  404. """
  405. Select columns that have a numeric dtype.
  406. Parameters
  407. ----------
  408. copy : bool, default False
  409. Whether to copy the blocks
  410. """
  411. return self._get_data_subset(
  412. lambda arr: is_numeric_dtype(arr.dtype)
  413. or getattr(arr.dtype, "_is_numeric", False)
  414. )
  415. def copy(self: T, deep: bool | Literal["all"] | None = True) -> T:
  416. """
  417. Make deep or shallow copy of ArrayManager
  418. Parameters
  419. ----------
  420. deep : bool or string, default True
  421. If False, return shallow copy (do not copy data)
  422. If 'all', copy data and a deep copy of the index
  423. Returns
  424. -------
  425. BlockManager
  426. """
  427. if deep is None:
  428. # ArrayManager does not yet support CoW, so deep=None always means
  429. # deep=True for now
  430. deep = True
  431. # this preserves the notion of view copying of axes
  432. if deep:
  433. # hit in e.g. tests.io.json.test_pandas
  434. def copy_func(ax):
  435. return ax.copy(deep=True) if deep == "all" else ax.view()
  436. new_axes = [copy_func(ax) for ax in self._axes]
  437. else:
  438. new_axes = list(self._axes)
  439. if deep:
  440. new_arrays = [arr.copy() for arr in self.arrays]
  441. else:
  442. new_arrays = list(self.arrays)
  443. return type(self)(new_arrays, new_axes, verify_integrity=False)
  444. def reindex_indexer(
  445. self: T,
  446. new_axis,
  447. indexer,
  448. axis: AxisInt,
  449. fill_value=None,
  450. allow_dups: bool = False,
  451. copy: bool | None = True,
  452. # ignored keywords
  453. only_slice: bool = False,
  454. # ArrayManager specific keywords
  455. use_na_proxy: bool = False,
  456. ) -> T:
  457. axis = self._normalize_axis(axis)
  458. return self._reindex_indexer(
  459. new_axis,
  460. indexer,
  461. axis,
  462. fill_value,
  463. allow_dups,
  464. copy,
  465. use_na_proxy,
  466. )
  467. def _reindex_indexer(
  468. self: T,
  469. new_axis,
  470. indexer: npt.NDArray[np.intp] | None,
  471. axis: AxisInt,
  472. fill_value=None,
  473. allow_dups: bool = False,
  474. copy: bool | None = True,
  475. use_na_proxy: bool = False,
  476. ) -> T:
  477. """
  478. Parameters
  479. ----------
  480. new_axis : Index
  481. indexer : ndarray[intp] or None
  482. axis : int
  483. fill_value : object, default None
  484. allow_dups : bool, default False
  485. copy : bool, default True
  486. pandas-indexer with -1's only.
  487. """
  488. if copy is None:
  489. # ArrayManager does not yet support CoW, so deep=None always means
  490. # deep=True for now
  491. copy = True
  492. if indexer is None:
  493. if new_axis is self._axes[axis] and not copy:
  494. return self
  495. result = self.copy(deep=copy)
  496. result._axes = list(self._axes)
  497. result._axes[axis] = new_axis
  498. return result
  499. # some axes don't allow reindexing with dups
  500. if not allow_dups:
  501. self._axes[axis]._validate_can_reindex(indexer)
  502. if axis >= self.ndim:
  503. raise IndexError("Requested axis not found in manager")
  504. if axis == 1:
  505. new_arrays = []
  506. for i in indexer:
  507. if i == -1:
  508. arr = self._make_na_array(
  509. fill_value=fill_value, use_na_proxy=use_na_proxy
  510. )
  511. else:
  512. arr = self.arrays[i]
  513. if copy:
  514. arr = arr.copy()
  515. new_arrays.append(arr)
  516. else:
  517. validate_indices(indexer, len(self._axes[0]))
  518. indexer = ensure_platform_int(indexer)
  519. mask = indexer == -1
  520. needs_masking = mask.any()
  521. new_arrays = [
  522. take_1d(
  523. arr,
  524. indexer,
  525. allow_fill=needs_masking,
  526. fill_value=fill_value,
  527. mask=mask,
  528. # if fill_value is not None else blk.fill_value
  529. )
  530. for arr in self.arrays
  531. ]
  532. new_axes = list(self._axes)
  533. new_axes[axis] = new_axis
  534. return type(self)(new_arrays, new_axes, verify_integrity=False)
  535. def take(
  536. self: T,
  537. indexer,
  538. axis: AxisInt = 1,
  539. verify: bool = True,
  540. convert_indices: bool = True,
  541. ) -> T:
  542. """
  543. Take items along any axis.
  544. """
  545. axis = self._normalize_axis(axis)
  546. indexer = (
  547. np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64")
  548. if isinstance(indexer, slice)
  549. else np.asanyarray(indexer, dtype="int64")
  550. )
  551. if not indexer.ndim == 1:
  552. raise ValueError("indexer should be 1-dimensional")
  553. n = self.shape_proper[axis]
  554. if convert_indices:
  555. indexer = maybe_convert_indices(indexer, n, verify=verify)
  556. new_labels = self._axes[axis].take(indexer)
  557. return self._reindex_indexer(
  558. new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
  559. )
  560. def _make_na_array(self, fill_value=None, use_na_proxy: bool = False):
  561. if use_na_proxy:
  562. assert fill_value is None
  563. return NullArrayProxy(self.shape_proper[0])
  564. if fill_value is None:
  565. fill_value = np.nan
  566. dtype, fill_value = infer_dtype_from_scalar(fill_value)
  567. # error: Argument "dtype" to "empty" has incompatible type "Union[dtype[Any],
  568. # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
  569. # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
  570. # _DTypeDict, Tuple[Any, Any]]]"
  571. values = np.empty(self.shape_proper[0], dtype=dtype) # type: ignore[arg-type]
  572. values.fill(fill_value)
  573. return values
  574. def _equal_values(self, other) -> bool:
  575. """
  576. Used in .equals defined in base class. Only check the column values
  577. assuming shape and indexes have already been checked.
  578. """
  579. for left, right in zip(self.arrays, other.arrays):
  580. if not array_equals(left, right):
  581. return False
  582. return True
  583. # TODO
  584. # to_dict
  585. class ArrayManager(BaseArrayManager):
  586. @property
  587. def ndim(self) -> Literal[2]:
  588. return 2
  589. def __init__(
  590. self,
  591. arrays: list[np.ndarray | ExtensionArray],
  592. axes: list[Index],
  593. verify_integrity: bool = True,
  594. ) -> None:
  595. # Note: we are storing the axes in "_axes" in the (row, columns) order
  596. # which contrasts the order how it is stored in BlockManager
  597. self._axes = axes
  598. self.arrays = arrays
  599. if verify_integrity:
  600. self._axes = [ensure_index(ax) for ax in axes]
  601. arrays = [extract_pandas_array(x, None, 1)[0] for x in arrays]
  602. self.arrays = [maybe_coerce_values(arr) for arr in arrays]
  603. self._verify_integrity()
  604. def _verify_integrity(self) -> None:
  605. n_rows, n_columns = self.shape_proper
  606. if not len(self.arrays) == n_columns:
  607. raise ValueError(
  608. "Number of passed arrays must equal the size of the column Index: "
  609. f"{len(self.arrays)} arrays vs {n_columns} columns."
  610. )
  611. for arr in self.arrays:
  612. if not len(arr) == n_rows:
  613. raise ValueError(
  614. "Passed arrays should have the same length as the rows Index: "
  615. f"{len(arr)} vs {n_rows} rows"
  616. )
  617. if not isinstance(arr, (np.ndarray, ExtensionArray)):
  618. raise ValueError(
  619. "Passed arrays should be np.ndarray or ExtensionArray instances, "
  620. f"got {type(arr)} instead"
  621. )
  622. if not arr.ndim == 1:
  623. raise ValueError(
  624. "Passed arrays should be 1-dimensional, got array with "
  625. f"{arr.ndim} dimensions instead."
  626. )
  627. # --------------------------------------------------------------------
  628. # Indexing
  629. def fast_xs(self, loc: int) -> SingleArrayManager:
  630. """
  631. Return the array corresponding to `frame.iloc[loc]`.
  632. Parameters
  633. ----------
  634. loc : int
  635. Returns
  636. -------
  637. np.ndarray or ExtensionArray
  638. """
  639. dtype = interleaved_dtype([arr.dtype for arr in self.arrays])
  640. values = [arr[loc] for arr in self.arrays]
  641. if isinstance(dtype, ExtensionDtype):
  642. result = dtype.construct_array_type()._from_sequence(values, dtype=dtype)
  643. # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT
  644. elif is_datetime64_ns_dtype(dtype):
  645. result = DatetimeArray._from_sequence(values, dtype=dtype)._ndarray
  646. elif is_timedelta64_ns_dtype(dtype):
  647. result = TimedeltaArray._from_sequence(values, dtype=dtype)._ndarray
  648. else:
  649. result = np.array(values, dtype=dtype)
  650. return SingleArrayManager([result], [self._axes[1]])
  651. def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
  652. axis = self._normalize_axis(axis)
  653. if axis == 0:
  654. arrays = [arr[slobj] for arr in self.arrays]
  655. elif axis == 1:
  656. arrays = self.arrays[slobj]
  657. new_axes = list(self._axes)
  658. new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
  659. return type(self)(arrays, new_axes, verify_integrity=False)
  660. def iget(self, i: int) -> SingleArrayManager:
  661. """
  662. Return the data as a SingleArrayManager.
  663. """
  664. values = self.arrays[i]
  665. return SingleArrayManager([values], [self._axes[0]])
  666. def iget_values(self, i: int) -> ArrayLike:
  667. """
  668. Return the data for column i as the values (ndarray or ExtensionArray).
  669. """
  670. return self.arrays[i]
  671. @property
  672. def column_arrays(self) -> list[ArrayLike]:
  673. """
  674. Used in the JSON C code to access column arrays.
  675. """
  676. return [np.asarray(arr) for arr in self.arrays]
  677. def iset(
  678. self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
  679. ) -> None:
  680. """
  681. Set new column(s).
  682. This changes the ArrayManager in-place, but replaces (an) existing
  683. column(s), not changing column values in-place).
  684. Parameters
  685. ----------
  686. loc : integer, slice or boolean mask
  687. Positional location (already bounds checked)
  688. value : np.ndarray or ExtensionArray
  689. inplace : bool, default False
  690. Whether overwrite existing array as opposed to replacing it.
  691. """
  692. # single column -> single integer index
  693. if lib.is_integer(loc):
  694. # TODO can we avoid needing to unpack this here? That means converting
  695. # DataFrame into 1D array when loc is an integer
  696. if isinstance(value, np.ndarray) and value.ndim == 2:
  697. assert value.shape[1] == 1
  698. value = value[:, 0]
  699. # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
  700. # but we should avoid that and pass directly the proper array
  701. value = maybe_coerce_values(value)
  702. assert isinstance(value, (np.ndarray, ExtensionArray))
  703. assert value.ndim == 1
  704. assert len(value) == len(self._axes[0])
  705. self.arrays[loc] = value
  706. return
  707. # multiple columns -> convert slice or array to integer indices
  708. elif isinstance(loc, slice):
  709. indices = range(
  710. loc.start if loc.start is not None else 0,
  711. loc.stop if loc.stop is not None else self.shape_proper[1],
  712. loc.step if loc.step is not None else 1,
  713. )
  714. else:
  715. assert isinstance(loc, np.ndarray)
  716. assert loc.dtype == "bool"
  717. # error: Incompatible types in assignment (expression has type "ndarray",
  718. # variable has type "range")
  719. indices = np.nonzero(loc)[0] # type: ignore[assignment]
  720. assert value.ndim == 2
  721. assert value.shape[0] == len(self._axes[0])
  722. for value_idx, mgr_idx in enumerate(indices):
  723. # error: No overload variant of "__getitem__" of "ExtensionArray" matches
  724. # argument type "Tuple[slice, int]"
  725. value_arr = value[:, value_idx] # type: ignore[call-overload]
  726. self.arrays[mgr_idx] = value_arr
  727. return
  728. def column_setitem(
  729. self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
  730. ) -> None:
  731. """
  732. Set values ("setitem") into a single column (not setting the full column).
  733. This is a method on the ArrayManager level, to avoid creating an
  734. intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
  735. """
  736. if not is_integer(loc):
  737. raise TypeError("The column index should be an integer")
  738. arr = self.arrays[loc]
  739. mgr = SingleArrayManager([arr], [self._axes[0]])
  740. if inplace_only:
  741. mgr.setitem_inplace(idx, value)
  742. else:
  743. new_mgr = mgr.setitem((idx,), value)
  744. # update existing ArrayManager in-place
  745. self.arrays[loc] = new_mgr.arrays[0]
  746. def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
  747. """
  748. Insert item at selected position.
  749. Parameters
  750. ----------
  751. loc : int
  752. item : hashable
  753. value : np.ndarray or ExtensionArray
  754. """
  755. # insert to the axis; this could possibly raise a TypeError
  756. new_axis = self.items.insert(loc, item)
  757. value = extract_array(value, extract_numpy=True)
  758. if value.ndim == 2:
  759. if value.shape[0] == 1:
  760. # error: No overload variant of "__getitem__" of "ExtensionArray"
  761. # matches argument type "Tuple[int, slice]"
  762. value = value[0, :] # type: ignore[call-overload]
  763. else:
  764. raise ValueError(
  765. f"Expected a 1D array, got an array with shape {value.shape}"
  766. )
  767. value = maybe_coerce_values(value)
  768. # TODO self.arrays can be empty
  769. # assert len(value) == len(self.arrays[0])
  770. # TODO is this copy needed?
  771. arrays = self.arrays.copy()
  772. arrays.insert(loc, value)
  773. self.arrays = arrays
  774. self._axes[1] = new_axis
  775. def idelete(self, indexer) -> ArrayManager:
  776. """
  777. Delete selected locations in-place (new block and array, same BlockManager)
  778. """
  779. to_keep = np.ones(self.shape[0], dtype=np.bool_)
  780. to_keep[indexer] = False
  781. self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]]
  782. self._axes = [self._axes[0], self._axes[1][to_keep]]
  783. return self
  784. # --------------------------------------------------------------------
  785. # Array-wise Operation
  786. def grouped_reduce(self: T, func: Callable) -> T:
  787. """
  788. Apply grouped reduction function columnwise, returning a new ArrayManager.
  789. Parameters
  790. ----------
  791. func : grouped reduction function
  792. Returns
  793. -------
  794. ArrayManager
  795. """
  796. result_arrays: list[np.ndarray] = []
  797. result_indices: list[int] = []
  798. for i, arr in enumerate(self.arrays):
  799. # grouped_reduce functions all expect 2D arrays
  800. arr = ensure_block_shape(arr, ndim=2)
  801. res = func(arr)
  802. if res.ndim == 2:
  803. # reverse of ensure_block_shape
  804. assert res.shape[0] == 1
  805. res = res[0]
  806. result_arrays.append(res)
  807. result_indices.append(i)
  808. if len(result_arrays) == 0:
  809. nrows = 0
  810. else:
  811. nrows = result_arrays[0].shape[0]
  812. index = Index(range(nrows))
  813. columns = self.items
  814. # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
  815. # expected "List[Union[ndarray, ExtensionArray]]"
  816. return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
  817. def reduce(self: T, func: Callable) -> T:
  818. """
  819. Apply reduction function column-wise, returning a single-row ArrayManager.
  820. Parameters
  821. ----------
  822. func : reduction function
  823. Returns
  824. -------
  825. ArrayManager
  826. """
  827. result_arrays: list[np.ndarray] = []
  828. for i, arr in enumerate(self.arrays):
  829. res = func(arr, axis=0)
  830. # TODO NaT doesn't preserve dtype, so we need to ensure to create
  831. # a timedelta result array if original was timedelta
  832. # what if datetime results in timedelta? (eg std)
  833. dtype = arr.dtype if res is NaT else None
  834. result_arrays.append(
  835. sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type]
  836. )
  837. index = Index._simple_new(np.array([None], dtype=object)) # placeholder
  838. columns = self.items
  839. # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
  840. # expected "List[Union[ndarray, ExtensionArray]]"
  841. new_mgr = type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
  842. return new_mgr
  843. def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
  844. """
  845. Apply array_op blockwise with another (aligned) BlockManager.
  846. """
  847. # TODO what if `other` is BlockManager ?
  848. left_arrays = self.arrays
  849. right_arrays = other.arrays
  850. result_arrays = [
  851. array_op(left, right) for left, right in zip(left_arrays, right_arrays)
  852. ]
  853. return type(self)(result_arrays, self._axes)
  854. def quantile(
  855. self,
  856. *,
  857. qs: Index, # with dtype float64
  858. axis: AxisInt = 0,
  859. transposed: bool = False,
  860. interpolation: QuantileInterpolation = "linear",
  861. ) -> ArrayManager:
  862. arrs = [ensure_block_shape(x, 2) for x in self.arrays]
  863. assert axis == 1
  864. new_arrs = [
  865. quantile_compat(x, np.asarray(qs._values), interpolation) for x in arrs
  866. ]
  867. for i, arr in enumerate(new_arrs):
  868. if arr.ndim == 2:
  869. assert arr.shape[0] == 1, arr.shape
  870. new_arrs[i] = arr[0]
  871. axes = [qs, self._axes[1]]
  872. return type(self)(new_arrs, axes)
  873. # ----------------------------------------------------------------
  874. def unstack(self, unstacker, fill_value) -> ArrayManager:
  875. """
  876. Return a BlockManager with all blocks unstacked.
  877. Parameters
  878. ----------
  879. unstacker : reshape._Unstacker
  880. fill_value : Any
  881. fill_value for newly introduced missing values.
  882. Returns
  883. -------
  884. unstacked : BlockManager
  885. """
  886. indexer, _ = unstacker._indexer_and_to_sort
  887. if unstacker.mask.all():
  888. new_indexer = indexer
  889. allow_fill = False
  890. new_mask2D = None
  891. needs_masking = None
  892. else:
  893. new_indexer = np.full(unstacker.mask.shape, -1)
  894. new_indexer[unstacker.mask] = indexer
  895. allow_fill = True
  896. # calculating the full mask once and passing it to take_1d is faster
  897. # than letting take_1d calculate it in each repeated call
  898. new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
  899. needs_masking = new_mask2D.any(axis=0)
  900. new_indexer2D = new_indexer.reshape(*unstacker.full_shape)
  901. new_indexer2D = ensure_platform_int(new_indexer2D)
  902. new_arrays = []
  903. for arr in self.arrays:
  904. for i in range(unstacker.full_shape[1]):
  905. if allow_fill:
  906. # error: Value of type "Optional[Any]" is not indexable [index]
  907. new_arr = take_1d(
  908. arr,
  909. new_indexer2D[:, i],
  910. allow_fill=needs_masking[i], # type: ignore[index]
  911. fill_value=fill_value,
  912. mask=new_mask2D[:, i], # type: ignore[index]
  913. )
  914. else:
  915. new_arr = take_1d(arr, new_indexer2D[:, i], allow_fill=False)
  916. new_arrays.append(new_arr)
  917. new_index = unstacker.new_index
  918. new_columns = unstacker.get_new_columns(self._axes[1])
  919. new_axes = [new_index, new_columns]
  920. return type(self)(new_arrays, new_axes, verify_integrity=False)
  921. def as_array(
  922. self,
  923. dtype=None,
  924. copy: bool = False,
  925. na_value: object = lib.no_default,
  926. ) -> np.ndarray:
  927. """
  928. Convert the blockmanager data into an numpy array.
  929. Parameters
  930. ----------
  931. dtype : object, default None
  932. Data type of the return array.
  933. copy : bool, default False
  934. If True then guarantee that a copy is returned. A value of
  935. False does not guarantee that the underlying data is not
  936. copied.
  937. na_value : object, default lib.no_default
  938. Value to be used as the missing value sentinel.
  939. Returns
  940. -------
  941. arr : ndarray
  942. """
  943. if len(self.arrays) == 0:
  944. empty_arr = np.empty(self.shape, dtype=float)
  945. return empty_arr.transpose()
  946. # We want to copy when na_value is provided to avoid
  947. # mutating the original object
  948. copy = copy or na_value is not lib.no_default
  949. if not dtype:
  950. dtype = interleaved_dtype([arr.dtype for arr in self.arrays])
  951. if isinstance(dtype, SparseDtype):
  952. dtype = dtype.subtype
  953. elif isinstance(dtype, PandasDtype):
  954. dtype = dtype.numpy_dtype
  955. elif is_extension_array_dtype(dtype):
  956. dtype = "object"
  957. elif is_dtype_equal(dtype, str):
  958. dtype = "object"
  959. result = np.empty(self.shape_proper, dtype=dtype)
  960. for i, arr in enumerate(self.arrays):
  961. arr = arr.astype(dtype, copy=copy)
  962. result[:, i] = arr
  963. if na_value is not lib.no_default:
  964. result[isna(result)] = na_value
  965. return result
  966. class SingleArrayManager(BaseArrayManager, SingleDataManager):
  967. __slots__ = [
  968. "_axes", # private attribute, because 'axes' has different order, see below
  969. "arrays",
  970. ]
  971. arrays: list[np.ndarray | ExtensionArray]
  972. _axes: list[Index]
  973. @property
  974. def ndim(self) -> Literal[1]:
  975. return 1
  976. def __init__(
  977. self,
  978. arrays: list[np.ndarray | ExtensionArray],
  979. axes: list[Index],
  980. verify_integrity: bool = True,
  981. ) -> None:
  982. self._axes = axes
  983. self.arrays = arrays
  984. if verify_integrity:
  985. assert len(axes) == 1
  986. assert len(arrays) == 1
  987. self._axes = [ensure_index(ax) for ax in self._axes]
  988. arr = arrays[0]
  989. arr = maybe_coerce_values(arr)
  990. arr = extract_pandas_array(arr, None, 1)[0]
  991. self.arrays = [arr]
  992. self._verify_integrity()
  993. def _verify_integrity(self) -> None:
  994. (n_rows,) = self.shape
  995. assert len(self.arrays) == 1
  996. arr = self.arrays[0]
  997. assert len(arr) == n_rows
  998. if not arr.ndim == 1:
  999. raise ValueError(
  1000. "Passed array should be 1-dimensional, got array with "
  1001. f"{arr.ndim} dimensions instead."
  1002. )
  1003. @staticmethod
  1004. def _normalize_axis(axis):
  1005. return axis
  1006. def make_empty(self, axes=None) -> SingleArrayManager:
  1007. """Return an empty ArrayManager with index/array of length 0"""
  1008. if axes is None:
  1009. axes = [Index([], dtype=object)]
  1010. array: np.ndarray = np.array([], dtype=self.dtype)
  1011. return type(self)([array], axes)
  1012. @classmethod
  1013. def from_array(cls, array, index) -> SingleArrayManager:
  1014. return cls([array], [index])
  1015. @property
  1016. def axes(self):
  1017. return self._axes
  1018. @property
  1019. def index(self) -> Index:
  1020. return self._axes[0]
  1021. @property
  1022. def dtype(self):
  1023. return self.array.dtype
  1024. def external_values(self):
  1025. """The array that Series.values returns"""
  1026. return external_values(self.array)
  1027. def internal_values(self):
  1028. """The array that Series._values returns"""
  1029. return self.array
  1030. def array_values(self):
  1031. """The array that Series.array returns"""
  1032. arr = self.array
  1033. if isinstance(arr, np.ndarray):
  1034. arr = PandasArray(arr)
  1035. return arr
  1036. @property
  1037. def _can_hold_na(self) -> bool:
  1038. if isinstance(self.array, np.ndarray):
  1039. return self.array.dtype.kind not in ["b", "i", "u"]
  1040. else:
  1041. # ExtensionArray
  1042. return self.array._can_hold_na
  1043. @property
  1044. def is_single_block(self) -> bool:
  1045. return True
  1046. def fast_xs(self, loc: int) -> SingleArrayManager:
  1047. raise NotImplementedError("Use series._values[loc] instead")
  1048. def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager:
  1049. if axis >= self.ndim:
  1050. raise IndexError("Requested axis not found in manager")
  1051. new_array = self.array[slobj]
  1052. new_index = self.index._getitem_slice(slobj)
  1053. return type(self)([new_array], [new_index], verify_integrity=False)
  1054. def getitem_mgr(self, indexer) -> SingleArrayManager:
  1055. new_array = self.array[indexer]
  1056. new_index = self.index[indexer]
  1057. return type(self)([new_array], [new_index])
  1058. def apply(self, func, **kwargs):
  1059. if callable(func):
  1060. new_array = func(self.array, **kwargs)
  1061. else:
  1062. new_array = getattr(self.array, func)(**kwargs)
  1063. return type(self)([new_array], self._axes)
  1064. def setitem(self, indexer, value) -> SingleArrayManager:
  1065. """
  1066. Set values with indexer.
  1067. For SingleArrayManager, this backs s[indexer] = value
  1068. See `setitem_inplace` for a version that works inplace and doesn't
  1069. return a new Manager.
  1070. """
  1071. if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
  1072. raise ValueError(f"Cannot set values with ndim > {self.ndim}")
  1073. return self.apply_with_block("setitem", indexer=indexer, value=value)
  1074. def idelete(self, indexer) -> SingleArrayManager:
  1075. """
  1076. Delete selected locations in-place (new array, same ArrayManager)
  1077. """
  1078. to_keep = np.ones(self.shape[0], dtype=np.bool_)
  1079. to_keep[indexer] = False
  1080. self.arrays = [self.arrays[0][to_keep]]
  1081. self._axes = [self._axes[0][to_keep]]
  1082. return self
  1083. def _get_data_subset(self, predicate: Callable) -> SingleArrayManager:
  1084. # used in get_numeric_data / get_bool_data
  1085. if predicate(self.array):
  1086. return type(self)(self.arrays, self._axes, verify_integrity=False)
  1087. else:
  1088. return self.make_empty()
  1089. def set_values(self, values: ArrayLike) -> None:
  1090. """
  1091. Set (replace) the values of the SingleArrayManager in place.
  1092. Use at your own risk! This does not check if the passed values are
  1093. valid for the current SingleArrayManager (length, dtype, etc).
  1094. """
  1095. self.arrays[0] = values
  1096. def to_2d_mgr(self, columns: Index) -> ArrayManager:
  1097. """
  1098. Manager analogue of Series.to_frame
  1099. """
  1100. arrays = [self.arrays[0]]
  1101. axes = [self.axes[0], columns]
  1102. return ArrayManager(arrays, axes, verify_integrity=False)
  1103. class NullArrayProxy:
  1104. """
  1105. Proxy object for an all-NA array.
  1106. Only stores the length of the array, and not the dtype. The dtype
  1107. will only be known when actually concatenating (after determining the
  1108. common dtype, for which this proxy is ignored).
  1109. Using this object avoids that the internals/concat.py needs to determine
  1110. the proper dtype and array type.
  1111. """
  1112. ndim = 1
  1113. def __init__(self, n: int) -> None:
  1114. self.n = n
  1115. @property
  1116. def shape(self) -> tuple[int]:
  1117. return (self.n,)
  1118. def to_array(self, dtype: DtypeObj) -> ArrayLike:
  1119. """
  1120. Helper function to create the actual all-NA array from the NullArrayProxy
  1121. object.
  1122. Parameters
  1123. ----------
  1124. arr : NullArrayProxy
  1125. dtype : the dtype for the resulting array
  1126. Returns
  1127. -------
  1128. np.ndarray or ExtensionArray
  1129. """
  1130. if isinstance(dtype, ExtensionDtype):
  1131. empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
  1132. indexer = -np.ones(self.n, dtype=np.intp)
  1133. return empty.take(indexer, allow_fill=True)
  1134. else:
  1135. # when introducing missing values, int becomes float, bool becomes object
  1136. dtype = ensure_dtype_can_hold_na(dtype)
  1137. fill_value = na_value_for_dtype(dtype)
  1138. arr = np.empty(self.n, dtype=dtype)
  1139. arr.fill(fill_value)
  1140. return ensure_wrapped_if_datetimelike(arr)