managers.py 78 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343
  1. from __future__ import annotations
  2. import itertools
  3. from typing import (
  4. Any,
  5. Callable,
  6. Hashable,
  7. Literal,
  8. Sequence,
  9. TypeVar,
  10. cast,
  11. )
  12. import warnings
  13. import weakref
  14. import numpy as np
  15. from pandas._config import using_copy_on_write
  16. from pandas._libs import (
  17. algos as libalgos,
  18. internals as libinternals,
  19. lib,
  20. )
  21. from pandas._libs.internals import (
  22. BlockPlacement,
  23. BlockValuesRefs,
  24. )
  25. from pandas._typing import (
  26. ArrayLike,
  27. AxisInt,
  28. DtypeObj,
  29. QuantileInterpolation,
  30. Shape,
  31. npt,
  32. type_t,
  33. )
  34. from pandas.errors import PerformanceWarning
  35. from pandas.util._decorators import cache_readonly
  36. from pandas.util._exceptions import find_stack_level
  37. from pandas.util._validators import validate_bool_kwarg
  38. from pandas.core.dtypes.cast import infer_dtype_from_scalar
  39. from pandas.core.dtypes.common import (
  40. ensure_platform_int,
  41. is_1d_only_ea_dtype,
  42. is_dtype_equal,
  43. is_list_like,
  44. )
  45. from pandas.core.dtypes.dtypes import ExtensionDtype
  46. from pandas.core.dtypes.generic import (
  47. ABCDataFrame,
  48. ABCSeries,
  49. )
  50. from pandas.core.dtypes.missing import (
  51. array_equals,
  52. isna,
  53. )
  54. import pandas.core.algorithms as algos
  55. from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
  56. from pandas.core.arrays.sparse import SparseDtype
  57. import pandas.core.common as com
  58. from pandas.core.construction import (
  59. ensure_wrapped_if_datetimelike,
  60. extract_array,
  61. )
  62. from pandas.core.indexers import maybe_convert_indices
  63. from pandas.core.indexes.api import (
  64. Index,
  65. ensure_index,
  66. )
  67. from pandas.core.internals.base import (
  68. DataManager,
  69. SingleDataManager,
  70. interleaved_dtype,
  71. )
  72. from pandas.core.internals.blocks import (
  73. Block,
  74. NumpyBlock,
  75. ensure_block_shape,
  76. extend_blocks,
  77. get_block_type,
  78. new_block,
  79. new_block_2d,
  80. )
  81. from pandas.core.internals.ops import (
  82. blockwise_all,
  83. operate_blockwise,
  84. )
  85. T = TypeVar("T", bound="BaseBlockManager")
  86. class BaseBlockManager(DataManager):
  87. """
  88. Core internal data structure to implement DataFrame, Series, etc.
  89. Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
  90. lightweight blocked set of labeled data to be manipulated by the DataFrame
  91. public API class
  92. Attributes
  93. ----------
  94. shape
  95. ndim
  96. axes
  97. values
  98. items
  99. Methods
  100. -------
  101. set_axis(axis, new_labels)
  102. copy(deep=True)
  103. get_dtypes
  104. apply(func, axes, block_filter_fn)
  105. get_bool_data
  106. get_numeric_data
  107. get_slice(slice_like, axis)
  108. get(label)
  109. iget(loc)
  110. take(indexer, axis)
  111. reindex_axis(new_labels, axis)
  112. reindex_indexer(new_labels, indexer, axis)
  113. delete(label)
  114. insert(loc, label, value)
  115. set(label, value)
  116. Parameters
  117. ----------
  118. blocks: Sequence of Block
  119. axes: Sequence of Index
  120. verify_integrity: bool, default True
  121. Notes
  122. -----
  123. This is *not* a public API class
  124. """
  125. __slots__ = ()
  126. _blknos: npt.NDArray[np.intp]
  127. _blklocs: npt.NDArray[np.intp]
  128. blocks: tuple[Block, ...]
  129. axes: list[Index]
  130. @property
  131. def ndim(self) -> int:
  132. raise NotImplementedError
  133. _known_consolidated: bool
  134. _is_consolidated: bool
  135. def __init__(self, blocks, axes, verify_integrity: bool = True) -> None:
  136. raise NotImplementedError
  137. @classmethod
  138. def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T:
  139. raise NotImplementedError
  140. @property
  141. def blknos(self) -> npt.NDArray[np.intp]:
  142. """
  143. Suppose we want to find the array corresponding to our i'th column.
  144. blknos[i] identifies the block from self.blocks that contains this column.
  145. blklocs[i] identifies the column of interest within
  146. self.blocks[self.blknos[i]]
  147. """
  148. if self._blknos is None:
  149. # Note: these can be altered by other BlockManager methods.
  150. self._rebuild_blknos_and_blklocs()
  151. return self._blknos
  152. @property
  153. def blklocs(self) -> npt.NDArray[np.intp]:
  154. """
  155. See blknos.__doc__
  156. """
  157. if self._blklocs is None:
  158. # Note: these can be altered by other BlockManager methods.
  159. self._rebuild_blknos_and_blklocs()
  160. return self._blklocs
  161. def make_empty(self: T, axes=None) -> T:
  162. """return an empty BlockManager with the items axis of len 0"""
  163. if axes is None:
  164. axes = [Index([])] + self.axes[1:]
  165. # preserve dtype if possible
  166. if self.ndim == 1:
  167. assert isinstance(self, SingleBlockManager) # for mypy
  168. blk = self.blocks[0]
  169. arr = blk.values[:0]
  170. bp = BlockPlacement(slice(0, 0))
  171. nb = blk.make_block_same_class(arr, placement=bp)
  172. blocks = [nb]
  173. else:
  174. blocks = []
  175. return type(self).from_blocks(blocks, axes)
  176. def __nonzero__(self) -> bool:
  177. return True
  178. # Python3 compat
  179. __bool__ = __nonzero__
  180. def _normalize_axis(self, axis: AxisInt) -> int:
  181. # switch axis to follow BlockManager logic
  182. if self.ndim == 2:
  183. axis = 1 if axis == 0 else 0
  184. return axis
  185. def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
  186. # Caller is responsible for ensuring we have an Index object.
  187. self._validate_set_axis(axis, new_labels)
  188. self.axes[axis] = new_labels
  189. @property
  190. def is_single_block(self) -> bool:
  191. # Assumes we are 2D; overridden by SingleBlockManager
  192. return len(self.blocks) == 1
  193. @property
  194. def items(self) -> Index:
  195. return self.axes[0]
  196. def _has_no_reference(self, i: int) -> bool:
  197. """
  198. Check for column `i` if it has references.
  199. (whether it references another array or is itself being referenced)
  200. Returns True if the column has no references.
  201. """
  202. blkno = self.blknos[i]
  203. return self._has_no_reference_block(blkno)
  204. def _has_no_reference_block(self, blkno: int) -> bool:
  205. """
  206. Check for block `i` if it has references.
  207. (whether it references another array or is itself being referenced)
  208. Returns True if the block has no references.
  209. """
  210. return not self.blocks[blkno].refs.has_reference()
  211. def add_references(self, mgr: BaseBlockManager) -> None:
  212. """
  213. Adds the references from one manager to another. We assume that both
  214. managers have the same block structure.
  215. """
  216. if len(self.blocks) != len(mgr.blocks):
  217. # If block structure changes, then we made a copy
  218. return
  219. for i, blk in enumerate(self.blocks):
  220. blk.refs = mgr.blocks[i].refs
  221. # Argument 1 to "add_reference" of "BlockValuesRefs" has incompatible type
  222. # "Block"; expected "SharedBlock"
  223. blk.refs.add_reference(blk) # type: ignore[arg-type]
  224. def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
  225. """
  226. Checks if two blocks from two different block managers reference the
  227. same underlying values.
  228. """
  229. ref = weakref.ref(self.blocks[blkno])
  230. return ref in mgr.blocks[blkno].refs.referenced_blocks
  231. def get_dtypes(self):
  232. dtypes = np.array([blk.dtype for blk in self.blocks])
  233. return dtypes.take(self.blknos)
  234. @property
  235. def arrays(self) -> list[ArrayLike]:
  236. """
  237. Quick access to the backing arrays of the Blocks.
  238. Only for compatibility with ArrayManager for testing convenience.
  239. Not to be used in actual code, and return value is not the same as the
  240. ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).
  241. Warning! The returned arrays don't handle Copy-on-Write, so this should
  242. be used with caution (only in read-mode).
  243. """
  244. return [blk.values for blk in self.blocks]
  245. def __repr__(self) -> str:
  246. output = type(self).__name__
  247. for i, ax in enumerate(self.axes):
  248. if i == 0:
  249. output += f"\nItems: {ax}"
  250. else:
  251. output += f"\nAxis {i}: {ax}"
  252. for block in self.blocks:
  253. output += f"\n{block}"
  254. return output
  255. def apply(
  256. self: T,
  257. f,
  258. align_keys: list[str] | None = None,
  259. **kwargs,
  260. ) -> T:
  261. """
  262. Iterate over the blocks, collect and create a new BlockManager.
  263. Parameters
  264. ----------
  265. f : str or callable
  266. Name of the Block method to apply.
  267. align_keys: List[str] or None, default None
  268. **kwargs
  269. Keywords to pass to `f`
  270. Returns
  271. -------
  272. BlockManager
  273. """
  274. assert "filter" not in kwargs
  275. align_keys = align_keys or []
  276. result_blocks: list[Block] = []
  277. # fillna: Series/DataFrame is responsible for making sure value is aligned
  278. aligned_args = {k: kwargs[k] for k in align_keys}
  279. for b in self.blocks:
  280. if aligned_args:
  281. for k, obj in aligned_args.items():
  282. if isinstance(obj, (ABCSeries, ABCDataFrame)):
  283. # The caller is responsible for ensuring that
  284. # obj.axes[-1].equals(self.items)
  285. if obj.ndim == 1:
  286. kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
  287. else:
  288. kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
  289. else:
  290. # otherwise we have an ndarray
  291. kwargs[k] = obj[b.mgr_locs.indexer]
  292. if callable(f):
  293. applied = b.apply(f, **kwargs)
  294. else:
  295. applied = getattr(b, f)(**kwargs)
  296. result_blocks = extend_blocks(applied, result_blocks)
  297. out = type(self).from_blocks(result_blocks, self.axes)
  298. return out
  def where(self: T, other, cond, align: bool) -> T:
      """
      Apply the blocks' ``where`` method.

      When ``align`` is true, both ``other`` and ``cond`` are sliced per
      block by ``apply``; otherwise only ``cond`` is, and ``other`` is first
      unwrapped with ``extract_array``.
      """
      align_keys = ["other", "cond"] if align else ["cond"]
      if not align:
          other = extract_array(other, extract_numpy=True)

      using_cow = using_copy_on_write()
      return self.apply(
          "where",
          align_keys=align_keys,
          other=other,
          cond=cond,
          using_cow=using_cow,
      )
  312. def round(self: T, decimals: int, using_cow: bool = False) -> T:
  313. return self.apply(
  314. "round",
  315. decimals=decimals,
  316. using_cow=using_cow,
  317. )
  318. def setitem(self: T, indexer, value) -> T:
  319. """
  320. Set values with indexer.
  321. For SingleBlockManager, this backs s[indexer] = value
  322. """
  323. if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
  324. raise ValueError(f"Cannot set values with ndim > {self.ndim}")
  325. if using_copy_on_write() and not self._has_no_reference(0):
  326. # if being referenced -> perform Copy-on-Write and clear the reference
  327. # this method is only called if there is a single block -> hardcoded 0
  328. self = self.copy()
  329. return self.apply("setitem", indexer=indexer, value=value)
  def putmask(self, mask, new, align: bool = True):
      """
      Apply the blocks' ``putmask`` method.

      When ``align`` is true, both ``new`` and ``mask`` are sliced per block
      by ``apply``; otherwise only ``mask`` is, and ``new`` is first
      unwrapped with ``extract_array``.
      """
      align_keys = ["new", "mask"] if align else ["mask"]
      if not align:
          new = extract_array(new, extract_numpy=True)

      using_cow = using_copy_on_write()
      return self.apply(
          "putmask",
          align_keys=align_keys,
          mask=mask,
          new=new,
          using_cow=using_cow,
      )
  343. def diff(self: T, n: int, axis: AxisInt) -> T:
  344. # only reached with self.ndim == 2 and axis == 1
  345. axis = self._normalize_axis(axis)
  346. return self.apply("diff", n=n, axis=axis)
  def interpolate(self: T, inplace: bool, **kwargs) -> T:
      """
      Apply the blocks' ``interpolate`` method, forwarding ``inplace``, any
      extra keywords, and the current Copy-on-Write setting.
      """
      using_cow = using_copy_on_write()
      return self.apply("interpolate", inplace=inplace, **kwargs, using_cow=using_cow)
  def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
      """
      Apply the blocks' ``shift`` method.

      ``axis`` is translated with ``_normalize_axis`` and a ``fill_value``
      of ``lib.no_default`` is passed on as None.
      """
      block_axis = self._normalize_axis(axis)
      fill = None if fill_value is lib.no_default else fill_value
      return self.apply("shift", periods=periods, axis=block_axis, fill_value=fill)
  def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
      """
      Apply the blocks' ``fillna`` method.

      A non-None ``limit`` is passed through ``libalgos.validate_limit``
      first, so the validation happens even if we go through one of the
      no-op paths.
      """
      checked_limit = limit
      if checked_limit is not None:
          checked_limit = libalgos.validate_limit(None, limit=checked_limit)

      using_cow = using_copy_on_write()
      return self.apply(
          "fillna",
          value=value,
          limit=checked_limit,
          inplace=inplace,
          downcast=downcast,
          using_cow=using_cow,
      )
  def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
      """
      Apply the blocks' ``astype`` method.

      The ``copy`` flag handed to the blocks is resolved as follows: with
      Copy-on-Write enabled it is always False; otherwise None becomes True
      and an explicit value is kept as given.
      """
      using_cow = using_copy_on_write()
      if using_cow:
          copy = False
      elif copy is None:
          copy = True

      return self.apply(
          "astype",
          dtype=dtype,
          copy=copy,
          errors=errors,
          using_cow=using_cow,
      )
  def convert(self: T, copy: bool | None) -> T:
      """
      Apply the blocks' ``convert`` method.

      The ``copy`` flag handed to the blocks is resolved as follows: with
      Copy-on-Write enabled it is always False; otherwise None becomes True
      and an explicit value is kept as given.
      """
      using_cow = using_copy_on_write()
      if using_cow:
          copy = False
      elif copy is None:
          copy = True

      return self.apply("convert", copy=copy, using_cow=using_cow)
  def replace(self: T, to_replace, value, inplace: bool) -> T:
      """
      Apply the blocks' ``replace`` method for a scalar replacement.

      ``inplace`` is validated as a boolean keyword; ``to_replace`` and
      ``value`` must not be list-like.
      """
      inplace = validate_bool_kwarg(inplace, "inplace")

      # NDFrame.replace ensures the not-is_list_likes here
      assert not is_list_like(to_replace)
      assert not is_list_like(value)

      using_cow = using_copy_on_write()
      return self.apply(
          "replace",
          to_replace=to_replace,
          value=value,
          inplace=inplace,
          using_cow=using_cow,
      )
  def replace_regex(self, **kwargs):
      """
      Apply the blocks' ``_replace_regex`` method, forwarding all keywords
      together with the current Copy-on-Write setting.
      """
      using_cow = using_copy_on_write()
      return self.apply("_replace_regex", **kwargs, using_cow=using_cow)
  def replace_list(
      self: T,
      src_list: list[Any],
      dest_list: list[Any],
      inplace: bool = False,
      regex: bool = False,
  ) -> T:
      """
      Do a list replace.

      Applies the blocks' ``replace_list`` method and then consolidates the
      resulting manager in place before returning it.
      """
      inplace = validate_bool_kwarg(inplace, "inplace")
      using_cow = using_copy_on_write()

      result = self.apply(
          "replace_list",
          src_list=src_list,
          dest_list=dest_list,
          inplace=inplace,
          regex=regex,
          using_cow=using_cow,
      )
      result._consolidate_inplace()
      return result
  425. def to_native_types(self: T, **kwargs) -> T:
  426. """
  427. Convert values to native types (strings / python objects) that are used
  428. in formatting (repr / csv).
  429. """
  430. return self.apply("to_native_types", **kwargs)
  431. @property
  432. def is_numeric_mixed_type(self) -> bool:
  433. return all(block.is_numeric for block in self.blocks)
  434. @property
  435. def any_extension_types(self) -> bool:
  436. """Whether any of the blocks in this manager are extension blocks"""
  437. return any(block.is_extension for block in self.blocks)
  438. @property
  439. def is_view(self) -> bool:
  440. """return a boolean if we are a single block and are a view"""
  441. if len(self.blocks) == 1:
  442. return self.blocks[0].is_view
  443. # It is technically possible to figure out which blocks are views
  444. # e.g. [ b.values.base is not None for b in self.blocks ]
  445. # but then we have the case of possibly some blocks being a view
  446. # and some blocks not. setting in theory is possible on the non-view
  447. # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
  448. # complicated
  449. return False
  450. def _get_data_subset(self: T, predicate: Callable) -> T:
  451. blocks = [blk for blk in self.blocks if predicate(blk.values)]
  452. return self._combine(blocks, copy=False)
  453. def get_bool_data(self: T, copy: bool = False) -> T:
  454. """
  455. Select blocks that are bool-dtype and columns from object-dtype blocks
  456. that are all-bool.
  457. Parameters
  458. ----------
  459. copy : bool, default False
  460. Whether to copy the blocks
  461. """
  462. new_blocks = []
  463. for blk in self.blocks:
  464. if blk.dtype == bool:
  465. new_blocks.append(blk)
  466. elif blk.is_object:
  467. nbs = blk._split()
  468. for nb in nbs:
  469. if nb.is_bool:
  470. new_blocks.append(nb)
  471. return self._combine(new_blocks, copy)
  472. def get_numeric_data(self: T, copy: bool = False) -> T:
  473. """
  474. Parameters
  475. ----------
  476. copy : bool, default False
  477. Whether to copy the blocks
  478. """
  479. numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
  480. if len(numeric_blocks) == len(self.blocks):
  481. # Avoid somewhat expensive _combine
  482. if copy:
  483. return self.copy(deep=True)
  484. return self
  485. return self._combine(numeric_blocks, copy)
  486. def _combine(
  487. self: T, blocks: list[Block], copy: bool = True, index: Index | None = None
  488. ) -> T:
  489. """return a new manager with the blocks"""
  490. if len(blocks) == 0:
  491. if self.ndim == 2:
  492. # retain our own Index dtype
  493. if index is not None:
  494. axes = [self.items[:0], index]
  495. else:
  496. axes = [self.items[:0]] + self.axes[1:]
  497. return self.make_empty(axes)
  498. return self.make_empty()
  499. # FIXME: optimization potential
  500. indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
  501. inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
  502. new_blocks: list[Block] = []
  503. # TODO(CoW) we could optimize here if we know that the passed blocks
  504. # are fully "owned" (eg created from an operation, not coming from
  505. # an existing manager)
  506. for b in blocks:
  507. nb = b.copy(deep=copy)
  508. nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
  509. new_blocks.append(nb)
  510. axes = list(self.axes)
  511. if index is not None:
  512. axes[-1] = index
  513. axes[0] = self.items.take(indexer)
  514. return type(self).from_blocks(new_blocks, axes)
  515. @property
  516. def nblocks(self) -> int:
  517. return len(self.blocks)
  518. def copy(self: T, deep: bool | None | Literal["all"] = True) -> T:
  519. """
  520. Make deep or shallow copy of BlockManager
  521. Parameters
  522. ----------
  523. deep : bool, string or None, default True
  524. If False or None, return a shallow copy (do not copy data)
  525. If 'all', copy data and a deep copy of the index
  526. Returns
  527. -------
  528. BlockManager
  529. """
  530. if deep is None:
  531. if using_copy_on_write():
  532. # use shallow copy
  533. deep = False
  534. else:
  535. # preserve deep copy for BlockManager with copy=None
  536. deep = True
  537. # this preserves the notion of view copying of axes
  538. if deep:
  539. # hit in e.g. tests.io.json.test_pandas
  540. def copy_func(ax):
  541. return ax.copy(deep=True) if deep == "all" else ax.view()
  542. new_axes = [copy_func(ax) for ax in self.axes]
  543. else:
  544. new_axes = list(self.axes)
  545. res = self.apply("copy", deep=deep)
  546. res.axes = new_axes
  547. if self.ndim > 1:
  548. # Avoid needing to re-compute these
  549. blknos = self._blknos
  550. if blknos is not None:
  551. res._blknos = blknos.copy()
  552. res._blklocs = self._blklocs.copy()
  553. if deep:
  554. res._consolidate_inplace()
  555. return res
  556. def consolidate(self: T) -> T:
  557. """
  558. Join together blocks having same dtype
  559. Returns
  560. -------
  561. y : BlockManager
  562. """
  563. if self.is_consolidated():
  564. return self
  565. bm = type(self)(self.blocks, self.axes, verify_integrity=False)
  566. bm._is_consolidated = False
  567. bm._consolidate_inplace()
  568. return bm
  569. def reindex_indexer(
  570. self: T,
  571. new_axis: Index,
  572. indexer: npt.NDArray[np.intp] | None,
  573. axis: AxisInt,
  574. fill_value=None,
  575. allow_dups: bool = False,
  576. copy: bool | None = True,
  577. only_slice: bool = False,
  578. *,
  579. use_na_proxy: bool = False,
  580. ) -> T:
  581. """
  582. Parameters
  583. ----------
  584. new_axis : Index
  585. indexer : ndarray[intp] or None
  586. axis : int
  587. fill_value : object, default None
  588. allow_dups : bool, default False
  589. copy : bool or None, default True
  590. If None, regard as False to get shallow copy.
  591. only_slice : bool, default False
  592. Whether to take views, not copies, along columns.
  593. use_na_proxy : bool, default False
  594. Whether to use a np.void ndarray for newly introduced columns.
  595. pandas-indexer with -1's only.
  596. """
  597. if copy is None:
  598. if using_copy_on_write():
  599. # use shallow copy
  600. copy = False
  601. else:
  602. # preserve deep copy for BlockManager with copy=None
  603. copy = True
  604. if indexer is None:
  605. if new_axis is self.axes[axis] and not copy:
  606. return self
  607. result = self.copy(deep=copy)
  608. result.axes = list(self.axes)
  609. result.axes[axis] = new_axis
  610. return result
  611. # Should be intp, but in some cases we get int64 on 32bit builds
  612. assert isinstance(indexer, np.ndarray)
  613. # some axes don't allow reindexing with dups
  614. if not allow_dups:
  615. self.axes[axis]._validate_can_reindex(indexer)
  616. if axis >= self.ndim:
  617. raise IndexError("Requested axis not found in manager")
  618. if axis == 0:
  619. new_blocks = self._slice_take_blocks_ax0(
  620. indexer,
  621. fill_value=fill_value,
  622. only_slice=only_slice,
  623. use_na_proxy=use_na_proxy,
  624. )
  625. else:
  626. new_blocks = [
  627. blk.take_nd(
  628. indexer,
  629. axis=1,
  630. fill_value=(
  631. fill_value if fill_value is not None else blk.fill_value
  632. ),
  633. )
  634. for blk in self.blocks
  635. ]
  636. new_axes = list(self.axes)
  637. new_axes[axis] = new_axis
  638. new_mgr = type(self).from_blocks(new_blocks, new_axes)
  639. if axis == 1:
  640. # We can avoid the need to rebuild these
  641. new_mgr._blknos = self.blknos.copy()
  642. new_mgr._blklocs = self.blklocs.copy()
  643. return new_mgr
    def _slice_take_blocks_ax0(
        self,
        slice_or_indexer: slice | np.ndarray,
        fill_value=lib.no_default,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> list[Block]:
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice or np.ndarray[int64]
            Positions along axis 0 to select.
        fill_value : scalar, default lib.no_default
            Passing anything other than lib.no_default enables filling:
            positions that map to block number -1 get a newly created block
            built by ``_make_na_block``.
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.

        Returns
        -------
        new_blocks : list of Block
        """
        # Filling is requested implicitly by passing any fill_value.
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type == "slice":
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return []
                bp = BlockPlacement(slice(0, sllen))
                return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    fill_value = blk.fill_value

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    #  views instead of copies
                    blocks = [
                        blk.getitem_block_columns(
                            slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
                        )
                        for i, ml in enumerate(slobj)
                    ]
                    return blocks
                else:
                    bp = BlockPlacement(slice(0, sllen))
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=bp,
                            fill_value=fill_value,
                        )
                    ]
            # A single block with allow_fill and ndim != 1 falls through to
            # the general path below.

        # Map every requested position to (block number, location in block);
        # when filling is allowed, missing positions are marked with -1.
        if sl_type == "slice":
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_nd(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_nd(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        # blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(
                        placement=mgr_locs,
                        fill_value=fill_value,
                        use_na_proxy=use_na_proxy,
                    )
                )
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate and not blk._validate_ndim:
                    # i.e. we dont go through here for DatetimeTZBlock
                    # A non-consolidatable block, it's easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    # Shallow copies suffice when only slicing or when
                    # using_copy_on_write() is True.
                    deep = not (only_slice or using_copy_on_write())
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=deep)
                        newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
                        blocks.append(newblk)

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    #  we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice or using_copy_on_write():
                        # May turn the integer taker into an equivalent slice.
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        #  views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            slc = slice(i, i + 1)
                            bp = BlockPlacement(ml)
                            nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
                            # We have np.shares_memory(nb.values, blk.values)
                            blocks.append(nb)
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)

        return blocks
  764. def _make_na_block(
  765. self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
  766. ) -> Block:
  767. # Note: we only get here with self.ndim == 2
  768. if use_na_proxy:
  769. assert fill_value is None
  770. shape = (len(placement), self.shape[1])
  771. vals = np.empty(shape, dtype=np.void)
  772. nb = NumpyBlock(vals, placement, ndim=2)
  773. return nb
  774. if fill_value is None:
  775. fill_value = np.nan
  776. block_shape = list(self.shape)
  777. block_shape[0] = len(placement)
  778. dtype, fill_value = infer_dtype_from_scalar(fill_value)
  779. # error: Argument "dtype" to "empty" has incompatible type "Union[dtype,
  780. # ExtensionDtype]"; expected "Union[dtype, None, type, _SupportsDtype, str,
  781. # Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], _DtypeDict,
  782. # Tuple[Any, Any]]"
  783. block_values = np.empty(block_shape, dtype=dtype) # type: ignore[arg-type]
  784. block_values.fill(fill_value)
  785. return new_block_2d(block_values, placement=placement)
  786. def take(
  787. self: T,
  788. indexer,
  789. axis: AxisInt = 1,
  790. verify: bool = True,
  791. convert_indices: bool = True,
  792. ) -> T:
  793. """
  794. Take items along any axis.
  795. indexer : np.ndarray or slice
  796. axis : int, default 1
  797. verify : bool, default True
  798. Check that all entries are between 0 and len(self) - 1, inclusive.
  799. Pass verify=False if this check has been done by the caller.
  800. convert_indices : bool, default True
  801. Whether to attempt to convert indices to positive values.
  802. Returns
  803. -------
  804. BlockManager
  805. """
  806. # We have 6 tests that get here with a slice
  807. indexer = (
  808. np.arange(indexer.start, indexer.stop, indexer.step, dtype=np.intp)
  809. if isinstance(indexer, slice)
  810. else np.asanyarray(indexer, dtype=np.intp)
  811. )
  812. n = self.shape[axis]
  813. if convert_indices:
  814. indexer = maybe_convert_indices(indexer, n, verify=verify)
  815. new_labels = self.axes[axis].take(indexer)
  816. return self.reindex_indexer(
  817. new_axis=new_labels,
  818. indexer=indexer,
  819. axis=axis,
  820. allow_dups=True,
  821. copy=None,
  822. )
  823. class BlockManager(libinternals.BlockManager, BaseBlockManager):
  824. """
  825. BaseBlockManager that holds 2D blocks.
  826. """
  827. ndim = 2
  828. # ----------------------------------------------------------------
  829. # Constructors
  830. def __init__(
  831. self,
  832. blocks: Sequence[Block],
  833. axes: Sequence[Index],
  834. verify_integrity: bool = True,
  835. ) -> None:
  836. if verify_integrity:
  837. # Assertion disabled for performance
  838. # assert all(isinstance(x, Index) for x in axes)
  839. for block in blocks:
  840. if self.ndim != block.ndim:
  841. raise AssertionError(
  842. f"Number of Block dimensions ({block.ndim}) must equal "
  843. f"number of axes ({self.ndim})"
  844. )
  845. # As of 2.0, the caller is responsible for ensuring that
  846. # DatetimeTZBlock with block.ndim == 2 has block.values.ndim ==2;
  847. # previously there was a special check for fastparquet compat.
  848. self._verify_integrity()
  849. def _verify_integrity(self) -> None:
  850. mgr_shape = self.shape
  851. tot_items = sum(len(x.mgr_locs) for x in self.blocks)
  852. for block in self.blocks:
  853. if block.shape[1:] != mgr_shape[1:]:
  854. raise_construction_error(tot_items, block.shape[1:], self.axes)
  855. if len(self.items) != tot_items:
  856. raise AssertionError(
  857. "Number of manager items must equal union of "
  858. f"block items\n# manager items: {len(self.items)}, # "
  859. f"tot_items: {tot_items}"
  860. )
  861. @classmethod
  862. def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager:
  863. """
  864. Constructor for BlockManager and SingleBlockManager with same signature.
  865. """
  866. return cls(blocks, axes, verify_integrity=False)
  867. # ----------------------------------------------------------------
  868. # Indexing
  869. def fast_xs(self, loc: int) -> SingleBlockManager:
  870. """
  871. Return the array corresponding to `frame.iloc[loc]`.
  872. Parameters
  873. ----------
  874. loc : int
  875. Returns
  876. -------
  877. np.ndarray or ExtensionArray
  878. """
  879. if len(self.blocks) == 1:
  880. # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
  881. # is this ruled out in the general case?
  882. result = self.blocks[0].iget((slice(None), loc))
  883. # in the case of a single block, the new block is a view
  884. block = new_block(
  885. result,
  886. placement=slice(0, len(result)),
  887. ndim=1,
  888. refs=self.blocks[0].refs,
  889. )
  890. return SingleBlockManager(block, self.axes[0])
  891. dtype = interleaved_dtype([blk.dtype for blk in self.blocks])
  892. n = len(self)
  893. # GH#46406
  894. immutable_ea = isinstance(dtype, SparseDtype)
  895. if isinstance(dtype, ExtensionDtype) and not immutable_ea:
  896. cls = dtype.construct_array_type()
  897. result = cls._empty((n,), dtype=dtype)
  898. else:
  899. # error: Argument "dtype" to "empty" has incompatible type
  900. # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected
  901. # "None"
  902. result = np.empty(
  903. n, dtype=object if immutable_ea else dtype # type: ignore[arg-type]
  904. )
  905. result = ensure_wrapped_if_datetimelike(result)
  906. for blk in self.blocks:
  907. # Such assignment may incorrectly coerce NaT to None
  908. # result[blk.mgr_locs] = blk._slice((slice(None), loc))
  909. for i, rl in enumerate(blk.mgr_locs):
  910. result[rl] = blk.iget((i, loc))
  911. if immutable_ea:
  912. dtype = cast(ExtensionDtype, dtype)
  913. result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)
  914. block = new_block(result, placement=slice(0, len(result)), ndim=1)
  915. return SingleBlockManager(block, self.axes[0])
  916. def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
  917. """
  918. Return the data as a SingleBlockManager.
  919. """
  920. block = self.blocks[self.blknos[i]]
  921. values = block.iget(self.blklocs[i])
  922. # shortcut for select a single-dim from a 2-dim BM
  923. bp = BlockPlacement(slice(0, len(values)))
  924. nb = type(block)(
  925. values, placement=bp, ndim=1, refs=block.refs if track_ref else None
  926. )
  927. return SingleBlockManager(nb, self.axes[1])
  928. def iget_values(self, i: int) -> ArrayLike:
  929. """
  930. Return the data for column i as the values (ndarray or ExtensionArray).
  931. Warning! The returned array is a view but doesn't handle Copy-on-Write,
  932. so this should be used with caution.
  933. """
  934. # TODO(CoW) making the arrays read-only might make this safer to use?
  935. block = self.blocks[self.blknos[i]]
  936. values = block.iget(self.blklocs[i])
  937. return values
  938. @property
  939. def column_arrays(self) -> list[np.ndarray]:
  940. """
  941. Used in the JSON C code to access column arrays.
  942. This optimizes compared to using `iget_values` by converting each
  943. Warning! This doesn't handle Copy-on-Write, so should be used with
  944. caution (current use case of consuming this in the JSON code is fine).
  945. """
  946. # This is an optimized equivalent to
  947. # result = [self.iget_values(i) for i in range(len(self.items))]
  948. result: list[np.ndarray | None] = [None] * len(self.items)
  949. for blk in self.blocks:
  950. mgr_locs = blk._mgr_locs
  951. values = blk.values_for_json()
  952. if values.ndim == 1:
  953. # TODO(EA2D): special casing not needed with 2D EAs
  954. result[mgr_locs[0]] = values
  955. else:
  956. for i, loc in enumerate(mgr_locs):
  957. result[loc] = values[i]
  958. # error: Incompatible return value type (got "List[None]",
  959. # expected "List[ndarray[Any, Any]]")
  960. return result # type: ignore[return-value]
  961. def iset(
  962. self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
  963. ):
  964. """
  965. Set new item in-place. Does not consolidate. Adds new Block if not
  966. contained in the current set of items
  967. """
  968. # FIXME: refactor, clearly separate broadcasting & zip-like assignment
  969. # can prob also fix the various if tests for sparse/categorical
  970. if self._blklocs is None and self.ndim > 1:
  971. self._rebuild_blknos_and_blklocs()
  972. # Note: we exclude DTA/TDA here
  973. value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
  974. if not value_is_extension_type:
  975. if value.ndim == 2:
  976. value = value.T
  977. else:
  978. value = ensure_block_shape(value, ndim=2)
  979. if value.shape[1:] != self.shape[1:]:
  980. raise AssertionError(
  981. "Shape of new values must be compatible with manager shape"
  982. )
  983. if lib.is_integer(loc):
  984. # We have 6 tests where loc is _not_ an int.
  985. # In this case, get_blkno_placements will yield only one tuple,
  986. # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))
  987. # Check if we can use _iset_single fastpath
  988. loc = cast(int, loc)
  989. blkno = self.blknos[loc]
  990. blk = self.blocks[blkno]
  991. if len(blk._mgr_locs) == 1: # TODO: fastest way to check this?
  992. return self._iset_single(
  993. loc,
  994. value,
  995. inplace=inplace,
  996. blkno=blkno,
  997. blk=blk,
  998. )
  999. # error: Incompatible types in assignment (expression has type
  1000. # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
  1001. # slice, ndarray]")
  1002. loc = [loc] # type: ignore[assignment]
  1003. # categorical/sparse/datetimetz
  1004. if value_is_extension_type:
  1005. def value_getitem(placement):
  1006. return value
  1007. else:
  1008. def value_getitem(placement):
  1009. return value[placement.indexer]
  1010. # Accessing public blknos ensures the public versions are initialized
  1011. blknos = self.blknos[loc]
  1012. blklocs = self.blklocs[loc].copy()
  1013. unfit_mgr_locs = []
  1014. unfit_val_locs = []
  1015. removed_blknos = []
  1016. for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
  1017. blk = self.blocks[blkno_l]
  1018. blk_locs = blklocs[val_locs.indexer]
  1019. if inplace and blk.should_store(value):
  1020. # Updating inplace -> check if we need to do Copy-on-Write
  1021. if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
  1022. self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs))
  1023. else:
  1024. blk.set_inplace(blk_locs, value_getitem(val_locs))
  1025. continue
  1026. else:
  1027. unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
  1028. unfit_val_locs.append(val_locs)
  1029. # If all block items are unfit, schedule the block for removal.
  1030. if len(val_locs) == len(blk.mgr_locs):
  1031. removed_blknos.append(blkno_l)
  1032. continue
  1033. else:
  1034. # Defer setting the new values to enable consolidation
  1035. self._iset_split_block(blkno_l, blk_locs)
  1036. if len(removed_blknos):
  1037. # Remove blocks & update blknos accordingly
  1038. is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
  1039. is_deleted[removed_blknos] = True
  1040. new_blknos = np.empty(self.nblocks, dtype=np.intp)
  1041. new_blknos.fill(-1)
  1042. new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
  1043. self._blknos = new_blknos[self._blknos]
  1044. self.blocks = tuple(
  1045. blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
  1046. )
  1047. if unfit_val_locs:
  1048. unfit_idxr = np.concatenate(unfit_mgr_locs)
  1049. unfit_count = len(unfit_idxr)
  1050. new_blocks: list[Block] = []
  1051. # TODO(CoW) is this always correct to assume that the new_blocks
  1052. # are not referencing anything else?
  1053. if value_is_extension_type:
  1054. # This code (ab-)uses the fact that EA blocks contain only
  1055. # one item.
  1056. # TODO(EA2D): special casing unnecessary with 2D EAs
  1057. new_blocks.extend(
  1058. new_block_2d(
  1059. values=value,
  1060. placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
  1061. )
  1062. for mgr_loc in unfit_idxr
  1063. )
  1064. self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
  1065. self._blklocs[unfit_idxr] = 0
  1066. else:
  1067. # unfit_val_locs contains BlockPlacement objects
  1068. unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
  1069. new_blocks.append(
  1070. new_block_2d(
  1071. values=value_getitem(unfit_val_items),
  1072. placement=BlockPlacement(unfit_idxr),
  1073. )
  1074. )
  1075. self._blknos[unfit_idxr] = len(self.blocks)
  1076. self._blklocs[unfit_idxr] = np.arange(unfit_count)
  1077. self.blocks += tuple(new_blocks)
  1078. # Newly created block's dtype may already be present.
  1079. self._known_consolidated = False
    def _iset_split_block(
        self,
        blkno_l: int,
        blk_locs: np.ndarray | list[int],
        value: ArrayLike | None = None,
    ) -> None:
        """Removes columns from a block by splitting the block.

        Avoids copying the whole block through slicing and updates the manager
        after determining the new block structure. Optionally adds a new block,
        otherwise has to be done by the caller.

        Parameters
        ----------
        blkno_l: The block number to operate on, relevant for updating the manager
        blk_locs: The locations of our block that should be deleted.
        value: The value to set as a replacement.
        """
        blk = self.blocks[blkno_l]

        if self._blklocs is None:
            self._rebuild_blknos_and_blklocs()

        # Blocks that remain after dropping `blk_locs` from the block.
        nbs_tup = tuple(blk.delete(blk_locs))
        if value is not None:
            # The replacement block is placed at the manager locations of the
            # deleted columns.
            locs = blk.mgr_locs.as_array[blk_locs]
            first_nb = new_block_2d(value, BlockPlacement(locs))
        else:
            # No replacement: the first remaining piece takes the old slot.
            first_nb = nbs_tup[0]
            nbs_tup = tuple(nbs_tup[1:])

        # Captured before self.blocks is reassigned: the extra pieces are
        # appended at the end, so piece i ends up at index nr_blocks + i.
        nr_blocks = len(self.blocks)
        blocks_tup = (
            self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup
        )
        self.blocks = blocks_tup

        if not nbs_tup and value is not None:
            # No need to update anything if split did not happen
            return

        # first_nb keeps block number blkno_l; only its block-local positions
        # need renumbering.
        self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

        for i, nb in enumerate(nbs_tup):
            self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
            self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
  1118. def _iset_single(
  1119. self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
  1120. ) -> None:
  1121. """
  1122. Fastpath for iset when we are only setting a single position and
  1123. the Block currently in that position is itself single-column.
  1124. In this case we can swap out the entire Block and blklocs and blknos
  1125. are unaffected.
  1126. """
  1127. # Caller is responsible for verifying value.shape
  1128. if inplace and blk.should_store(value):
  1129. copy = False
  1130. if using_copy_on_write() and not self._has_no_reference_block(blkno):
  1131. # perform Copy-on-Write and clear the reference
  1132. copy = True
  1133. iloc = self.blklocs[loc]
  1134. blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
  1135. return
  1136. nb = new_block_2d(value, placement=blk._mgr_locs)
  1137. old_blocks = self.blocks
  1138. new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
  1139. self.blocks = new_blocks
  1140. return
  1141. def column_setitem(
  1142. self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
  1143. ) -> None:
  1144. """
  1145. Set values ("setitem") into a single column (not setting the full column).
  1146. This is a method on the BlockManager level, to avoid creating an
  1147. intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
  1148. """
  1149. if using_copy_on_write() and not self._has_no_reference(loc):
  1150. blkno = self.blknos[loc]
  1151. # Split blocks to only copy the column we want to modify
  1152. blk_loc = self.blklocs[loc]
  1153. # Copy our values
  1154. values = self.blocks[blkno].values
  1155. if values.ndim == 1:
  1156. values = values.copy()
  1157. else:
  1158. # Use [blk_loc] as indexer to keep ndim=2, this already results in a
  1159. # copy
  1160. values = values[[blk_loc]]
  1161. self._iset_split_block(blkno, [blk_loc], values)
  1162. # this manager is only created temporarily to mutate the values in place
  1163. # so don't track references, otherwise the `setitem` would perform CoW again
  1164. col_mgr = self.iget(loc, track_ref=False)
  1165. if inplace_only:
  1166. col_mgr.setitem_inplace(idx, value)
  1167. else:
  1168. new_mgr = col_mgr.setitem((idx,), value)
  1169. self.iset(loc, new_mgr._block.values, inplace=True)
  1170. def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
  1171. """
  1172. Insert item at selected position.
  1173. Parameters
  1174. ----------
  1175. loc : int
  1176. item : hashable
  1177. value : np.ndarray or ExtensionArray
  1178. """
  1179. # insert to the axis; this could possibly raise a TypeError
  1180. new_axis = self.items.insert(loc, item)
  1181. if value.ndim == 2:
  1182. value = value.T
  1183. if len(value) > 1:
  1184. raise ValueError(
  1185. f"Expected a 1D array, got an array with shape {value.T.shape}"
  1186. )
  1187. else:
  1188. value = ensure_block_shape(value, ndim=self.ndim)
  1189. bp = BlockPlacement(slice(loc, loc + 1))
  1190. # TODO(CoW) do we always "own" the passed `value`?
  1191. block = new_block_2d(values=value, placement=bp)
  1192. if not len(self.blocks):
  1193. # Fastpath
  1194. self._blklocs = np.array([0], dtype=np.intp)
  1195. self._blknos = np.array([0], dtype=np.intp)
  1196. else:
  1197. self._insert_update_mgr_locs(loc)
  1198. self._insert_update_blklocs_and_blknos(loc)
  1199. self.axes[0] = new_axis
  1200. self.blocks += (block,)
  1201. self._known_consolidated = False
  1202. if sum(not block.is_extension for block in self.blocks) > 100:
  1203. warnings.warn(
  1204. "DataFrame is highly fragmented. This is usually the result "
  1205. "of calling `frame.insert` many times, which has poor performance. "
  1206. "Consider joining all columns at once using pd.concat(axis=1) "
  1207. "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
  1208. PerformanceWarning,
  1209. stacklevel=find_stack_level(),
  1210. )
  1211. def _insert_update_mgr_locs(self, loc) -> None:
  1212. """
  1213. When inserting a new Block at location 'loc', we increment
  1214. all of the mgr_locs of blocks above that by one.
  1215. """
  1216. for blkno, count in _fast_count_smallints(self.blknos[loc:]):
  1217. # .620 this way, .326 of which is in increment_above
  1218. blk = self.blocks[blkno]
  1219. blk._mgr_locs = blk._mgr_locs.increment_above(loc)
  1220. def _insert_update_blklocs_and_blknos(self, loc) -> None:
  1221. """
  1222. When inserting a new Block at location 'loc', we update our
  1223. _blklocs and _blknos.
  1224. """
  1225. # Accessing public blklocs ensures the public versions are initialized
  1226. if loc == self.blklocs.shape[0]:
  1227. # np.append is a lot faster, let's use it if we can.
  1228. self._blklocs = np.append(self._blklocs, 0)
  1229. self._blknos = np.append(self._blknos, len(self.blocks))
  1230. elif loc == 0:
  1231. # np.append is a lot faster, let's use it if we can.
  1232. self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
  1233. self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
  1234. else:
  1235. new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
  1236. self.blklocs, self.blknos, loc, len(self.blocks)
  1237. )
  1238. self._blklocs = new_blklocs
  1239. self._blknos = new_blknos
  1240. def idelete(self, indexer) -> BlockManager:
  1241. """
  1242. Delete selected locations, returning a new BlockManager.
  1243. """
  1244. is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
  1245. is_deleted[indexer] = True
  1246. taker = (~is_deleted).nonzero()[0]
  1247. nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
  1248. new_columns = self.items[~is_deleted]
  1249. axes = [new_columns, self.axes[1]]
  1250. return type(self)(tuple(nbs), axes, verify_integrity=False)
  1251. # ----------------------------------------------------------------
  1252. # Block-wise Operation
  1253. def grouped_reduce(self: T, func: Callable) -> T:
  1254. """
  1255. Apply grouped reduction function blockwise, returning a new BlockManager.
  1256. Parameters
  1257. ----------
  1258. func : grouped reduction function
  1259. Returns
  1260. -------
  1261. BlockManager
  1262. """
  1263. result_blocks: list[Block] = []
  1264. for blk in self.blocks:
  1265. if blk.is_object:
  1266. # split on object-dtype blocks bc some columns may raise
  1267. # while others do not.
  1268. for sb in blk._split():
  1269. applied = sb.apply(func)
  1270. result_blocks = extend_blocks(applied, result_blocks)
  1271. else:
  1272. applied = blk.apply(func)
  1273. result_blocks = extend_blocks(applied, result_blocks)
  1274. if len(result_blocks) == 0:
  1275. nrows = 0
  1276. else:
  1277. nrows = result_blocks[0].values.shape[-1]
  1278. index = Index(range(nrows))
  1279. return type(self).from_blocks(result_blocks, [self.axes[0], index])
  1280. def reduce(self: T, func: Callable) -> T:
  1281. """
  1282. Apply reduction function blockwise, returning a single-row BlockManager.
  1283. Parameters
  1284. ----------
  1285. func : reduction function
  1286. Returns
  1287. -------
  1288. BlockManager
  1289. """
  1290. # If 2D, we assume that we're operating column-wise
  1291. assert self.ndim == 2
  1292. res_blocks: list[Block] = []
  1293. for blk in self.blocks:
  1294. nbs = blk.reduce(func)
  1295. res_blocks.extend(nbs)
  1296. index = Index([None]) # placeholder
  1297. new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
  1298. return new_mgr
  1299. def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
  1300. """
  1301. Apply array_op blockwise with another (aligned) BlockManager.
  1302. """
  1303. return operate_blockwise(self, other, array_op)
  1304. def _equal_values(self: BlockManager, other: BlockManager) -> bool:
  1305. """
  1306. Used in .equals defined in base class. Only check the column values
  1307. assuming shape and indexes have already been checked.
  1308. """
  1309. return blockwise_all(self, other, array_equals)
  1310. def quantile(
  1311. self: T,
  1312. *,
  1313. qs: Index, # with dtype float 64
  1314. axis: AxisInt = 0,
  1315. interpolation: QuantileInterpolation = "linear",
  1316. ) -> T:
  1317. """
  1318. Iterate over blocks applying quantile reduction.
  1319. This routine is intended for reduction type operations and
  1320. will do inference on the generated blocks.
  1321. Parameters
  1322. ----------
  1323. axis: reduction axis, default 0
  1324. consolidate: bool, default True. Join together blocks having same
  1325. dtype
  1326. interpolation : type of interpolation, default 'linear'
  1327. qs : list of the quantiles to be computed
  1328. Returns
  1329. -------
  1330. BlockManager
  1331. """
  1332. # Series dispatches to DataFrame for quantile, which allows us to
  1333. # simplify some of the code here and in the blocks
  1334. assert self.ndim >= 2
  1335. assert is_list_like(qs) # caller is responsible for this
  1336. assert axis == 1 # only ever called this way
  1337. new_axes = list(self.axes)
  1338. new_axes[1] = Index(qs, dtype=np.float64)
  1339. blocks = [
  1340. blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
  1341. for blk in self.blocks
  1342. ]
  1343. return type(self)(blocks, new_axes)
  1344. # ----------------------------------------------------------------
  1345. def unstack(self, unstacker, fill_value) -> BlockManager:
  1346. """
  1347. Return a BlockManager with all blocks unstacked.
  1348. Parameters
  1349. ----------
  1350. unstacker : reshape._Unstacker
  1351. fill_value : Any
  1352. fill_value for newly introduced missing values.
  1353. Returns
  1354. -------
  1355. unstacked : BlockManager
  1356. """
  1357. new_columns = unstacker.get_new_columns(self.items)
  1358. new_index = unstacker.new_index
  1359. allow_fill = not unstacker.mask_all
  1360. if allow_fill:
  1361. # calculating the full mask once and passing it to Block._unstack is
  1362. # faster than letting calculating it in each repeated call
  1363. new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
  1364. needs_masking = new_mask2D.any(axis=0)
  1365. else:
  1366. needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)
  1367. new_blocks: list[Block] = []
  1368. columns_mask: list[np.ndarray] = []
  1369. if len(self.items) == 0:
  1370. factor = 1
  1371. else:
  1372. fac = len(new_columns) / len(self.items)
  1373. assert fac == int(fac)
  1374. factor = int(fac)
  1375. for blk in self.blocks:
  1376. mgr_locs = blk.mgr_locs
  1377. new_placement = mgr_locs.tile_for_unstack(factor)
  1378. blocks, mask = blk._unstack(
  1379. unstacker,
  1380. fill_value,
  1381. new_placement=new_placement,
  1382. needs_masking=needs_masking,
  1383. )
  1384. new_blocks.extend(blocks)
  1385. columns_mask.extend(mask)
  1386. # Block._unstack should ensure this holds,
  1387. assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
  1388. # In turn this ensures that in the BlockManager call below
  1389. # we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
  1390. # which suffices to allow us to pass verify_inegrity=False
  1391. new_columns = new_columns[columns_mask]
  1392. bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
  1393. return bm
  1394. def to_dict(self, copy: bool = True):
  1395. """
  1396. Return a dict of str(dtype) -> BlockManager
  1397. Parameters
  1398. ----------
  1399. copy : bool, default True
  1400. Returns
  1401. -------
  1402. values : a dict of dtype -> BlockManager
  1403. """
  1404. bd: dict[str, list[Block]] = {}
  1405. for b in self.blocks:
  1406. bd.setdefault(str(b.dtype), []).append(b)
  1407. # TODO(EA2D): the combine will be unnecessary with 2D EAs
  1408. return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
  1409. def as_array(
  1410. self,
  1411. dtype: np.dtype | None = None,
  1412. copy: bool = False,
  1413. na_value: object = lib.no_default,
  1414. ) -> np.ndarray:
  1415. """
  1416. Convert the blockmanager data into an numpy array.
  1417. Parameters
  1418. ----------
  1419. dtype : np.dtype or None, default None
  1420. Data type of the return array.
  1421. copy : bool, default False
  1422. If True then guarantee that a copy is returned. A value of
  1423. False does not guarantee that the underlying data is not
  1424. copied.
  1425. na_value : object, default lib.no_default
  1426. Value to be used as the missing value sentinel.
  1427. Returns
  1428. -------
  1429. arr : ndarray
  1430. """
  1431. # TODO(CoW) handle case where resulting array is a view
  1432. if len(self.blocks) == 0:
  1433. arr = np.empty(self.shape, dtype=float)
  1434. return arr.transpose()
  1435. # We want to copy when na_value is provided to avoid
  1436. # mutating the original object
  1437. copy = copy or na_value is not lib.no_default
  1438. if self.is_single_block:
  1439. blk = self.blocks[0]
  1440. if blk.is_extension:
  1441. # Avoid implicit conversion of extension blocks to object
  1442. # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
  1443. # attribute "to_numpy"
  1444. arr = blk.values.to_numpy( # type: ignore[union-attr]
  1445. dtype=dtype,
  1446. na_value=na_value,
  1447. ).reshape(blk.shape)
  1448. else:
  1449. arr = np.asarray(blk.get_values())
  1450. if dtype:
  1451. arr = arr.astype(dtype, copy=False)
  1452. if copy:
  1453. arr = arr.copy()
  1454. elif using_copy_on_write():
  1455. arr = arr.view()
  1456. arr.flags.writeable = False
  1457. else:
  1458. arr = self._interleave(dtype=dtype, na_value=na_value)
  1459. # The underlying data was copied within _interleave, so no need
  1460. # to further copy if copy=True or setting na_value
  1461. if na_value is not lib.no_default:
  1462. arr[isna(arr)] = na_value
  1463. return arr.transpose()
  def _interleave(
      self,
      dtype: np.dtype | None = None,
      na_value: object = lib.no_default,
  ) -> np.ndarray:
      """
      Return a single ndarray assembled from the values of all blocks.

      Each block's values are written into the result at the positions given
      by that block's ``mgr_locs``. Every row of the result must be written by
      some block.

      Parameters
      ----------
      dtype : np.dtype or None, default None
          Target dtype. When not given, it is derived from the block dtypes
          via ``interleaved_dtype``.
      na_value : object, default lib.no_default
          Forwarded to ``to_numpy`` for extension blocks.

      Returns
      -------
      np.ndarray
          Array of shape ``self.shape``.

      Raises
      ------
      AssertionError
          If some rows of the result were not written by any block.
      """
      if not dtype:
          # Incompatible types in assignment (expression has type
          # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
          # type "Optional[dtype[Any]]")
          dtype = interleaved_dtype(  # type: ignore[assignment]
              [blk.dtype for blk in self.blocks]
          )

      # TODO: https://github.com/pandas-dev/pandas/issues/22791
      # Give EAs some input on what happens here. Sparse needs this.
      if isinstance(dtype, SparseDtype):
          dtype = dtype.subtype
          dtype = cast(np.dtype, dtype)
      elif isinstance(dtype, ExtensionDtype):
          dtype = np.dtype("object")
      elif is_dtype_equal(dtype, str):
          dtype = np.dtype("object")

      result = np.empty(self.shape, dtype=dtype)
      # One flag per row of ``result``; set once a block has written that row.
      itemmask = np.zeros(self.shape[0], dtype=bool)

      # For an object result without an explicit na_value, ``get_values`` is
      # used for every block: much more performant than ``to_numpy``.
      object_fastpath = dtype == np.dtype("object") and na_value is lib.no_default

      for blk in self.blocks:
          rl = blk.mgr_locs
          if blk.is_extension and not object_fastpath:
              # Avoid implicit conversion of extension blocks to object
              # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
              # attribute "to_numpy"
              arr = blk.values.to_numpy(  # type: ignore[union-attr]
                  dtype=dtype,
                  na_value=na_value,
              )
          else:
              arr = blk.get_values(dtype)
          result[rl.indexer] = arr
          itemmask[rl.indexer] = True

      # ``result`` comes from np.empty, so a row no block wrote to would hold
      # arbitrary content. Verify coverage on every path, fast path included.
      if not itemmask.all():
          raise AssertionError("Some items were not contained in blocks")

      return result
  1516. # ----------------------------------------------------------------
  1517. # Consolidation
  def is_consolidated(self) -> bool:
      """
      Return the manager's consolidation flag.

      If the flag is not yet known, ``_consolidate_check`` is run first to
      compute it.
      """
      if self._known_consolidated:
          return self._is_consolidated
      self._consolidate_check()
      return self._is_consolidated
  1525. def _consolidate_check(self) -> None:
  1526. if len(self.blocks) == 1:
  1527. # fastpath
  1528. self._is_consolidated = True
  1529. self._known_consolidated = True
  1530. return
  1531. dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
  1532. self._is_consolidated = len(dtypes) == len(set(dtypes))
  1533. self._known_consolidated = True
  1534. def _consolidate_inplace(self) -> None:
  1535. # In general, _consolidate_inplace should only be called via
  1536. # DataFrame._consolidate_inplace, otherwise we will fail to invalidate
  1537. # the DataFrame's _item_cache. The exception is for newly-created
  1538. # BlockManager objects not yet attached to a DataFrame.
  1539. if not self.is_consolidated():
  1540. self.blocks = _consolidate(self.blocks)
  1541. self._is_consolidated = True
  1542. self._known_consolidated = True
  1543. self._rebuild_blknos_and_blklocs()
  class SingleBlockManager(BaseBlockManager, SingleDataManager):
      """Manager holding exactly one 1-dimensional block and a single axis."""

      _is_consolidated = True
      _known_consolidated = True
      __slots__ = ()
      is_single_block = True

      @property
      def ndim(self) -> Literal[1]:
          """Number of dimensions; always 1 for this manager."""
          return 1

      def __init__(
          self,
          block: Block,
          axis: Index,
          verify_integrity: bool = False,
      ) -> None:
          """
          Store ``axis`` as the only axis and ``block`` as the only block.

          ``verify_integrity`` is accepted but not used here.
          """
          # Assertions disabled for performance
          # assert isinstance(block, Block), type(block)
          # assert isinstance(axis, Index), type(axis)
          self.axes = [axis]
          self.blocks = (block,)

      @classmethod
      def from_blocks(
          cls,
          blocks: list[Block],
          axes: list[Index],
      ) -> SingleBlockManager:
          """
          Constructor for BlockManager and SingleBlockManager with same signature.

          Both ``blocks`` and ``axes`` must have length 1.
          """
          assert len(blocks) == 1
          assert len(axes) == 1
          only_block = blocks[0]
          only_axis = axes[0]
          return cls(only_block, only_axis, verify_integrity=False)

      @classmethod
      def from_array(
          cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
      ) -> SingleBlockManager:
          """
          Constructor for if we have an array that is not yet a Block.

          The array is wrapped in a 1-dimensional block placed at
          ``slice(0, len(index))``.
          """
          placement = slice(0, len(index))
          wrapped = new_block(array, placement=placement, ndim=1, refs=refs)
          return cls(wrapped, index)

      def to_2d_mgr(self, columns: Index) -> BlockManager:
          """
          Manager analogue of Series.to_frame
          """
          src = self.blocks[0]
          values_2d = ensure_block_shape(src.values, ndim=2)
          placement = BlockPlacement(0)
          # Same block type, reshaped values, and the source block's refs.
          block_2d = type(src)(values_2d, placement=placement, ndim=2, refs=src.refs)
          return BlockManager(
              [block_2d], axes=[columns, self.axes[0]], verify_integrity=False
          )

      def _has_no_reference(self, i: int = 0) -> bool:
          """
          Check for column `i` if it has references.
          (whether it references another array or is itself being referenced)
          Returns True if the column has no references.

          ``i`` is not used: there is only one block to inspect.
          """
          referenced = self.blocks[0].refs.has_reference()
          return not referenced

      def __getstate__(self):
          """
          Return the pickle state as
          ``(axes, block_values, block_items, extra_state)``.
          """
          axes = list(self.axes)
          values = [blk.values for blk in self.blocks]
          items = [self.items[blk.mgr_locs.indexer] for blk in self.blocks]
          blocks_state = [
              {"values": blk.values, "mgr_locs": blk.mgr_locs.indexer}
              for blk in self.blocks
          ]
          extra = {"0.14.1": {"axes": axes, "blocks": blocks_state}}

          # First three elements of the state are to maintain forward
          # compatibility with 0.13.1.
          return axes, values, items, extra

      def __setstate__(self, state):
          """
          Restore axes and blocks from the ``"0.14.1"`` entry of a pickled state.

          Raises
          ------
          NotImplementedError
              If ``state`` is not a tuple of length >= 4 whose fourth element
              contains the ``"0.14.1"`` key.
          """

          def _rebuild(values, mgr_locs, ndim: int) -> Block:
              # TODO(EA2D): ndim would be unnecessary with 2D EAs
              # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
              unwrapped = extract_array(values, extract_numpy=True)
              return new_block(unwrapped, placement=mgr_locs, ndim=ndim)

          supported = (
              isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]
          )
          if not supported:
              raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

          payload = state[3]["0.14.1"]
          self.axes = [ensure_index(ax) for ax in payload["axes"]]
          ndim = len(self.axes)
          self.blocks = tuple(
              _rebuild(entry["values"], entry["mgr_locs"], ndim=ndim)
              for entry in payload["blocks"]
          )

          self._post_setstate()

      def _post_setstate(self) -> None:
          """Called at the end of ``__setstate__``; does nothing here."""

      @cache_readonly
      def _block(self) -> Block:
          """The manager's only block."""
          return self.blocks[0]

      @property
      def _blknos(self):
          """compat with BlockManager"""
          return None

      @property
      def _blklocs(self):
          """compat with BlockManager"""
          return None

      def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager:
          """
          Return a new manager for ``indexer``.

          Similar to get_slice, but not restricted to slice indexer.

          Raises
          ------
          ValueError
              If slicing the block yields an array with more than one dimension.
          """
          blk = self._block
          selects_everything = (
              using_copy_on_write()
              and isinstance(indexer, np.ndarray)
              and len(indexer) > 0
              and com.is_bool_indexer(indexer)
              and indexer.all()
          )
          if selects_everything:
              # All-True boolean mask: a shallow block copy with the same index.
              return type(self)(blk.copy(deep=False), self.index)

          sliced = blk._slice(indexer)
          if sliced.ndim > 1:
              # This will be caught by Series._get_values
              raise ValueError("dimension-expanding indexing not allowed")

          placement = BlockPlacement(slice(0, len(sliced)))
          # TODO(CoW) in theory only need to track reference if new_array is a view
          taken = type(blk)(sliced, placement=placement, ndim=1, refs=blk.refs)
          return type(self)(taken, self.index[indexer])

      def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
          """
          Return a new manager holding the ``slobj`` slice of block and index.

          Raises
          ------
          IndexError
              If ``axis`` is not smaller than ``self.ndim``.
          """
          # Assertion disabled for performance
          # assert isinstance(slobj, slice), type(slobj)
          if axis >= self.ndim:
              raise IndexError("Requested axis not found in manager")

          blk = self._block
          sliced = blk._slice(slobj)
          placement = BlockPlacement(slice(0, len(sliced)))
          # TODO this method is only used in groupby SeriesSplitter at the moment,
          # so passing refs is not yet covered by the tests
          taken = type(blk)(sliced, placement=placement, ndim=1, refs=blk.refs)
          return type(self)(taken, self.index._getitem_slice(slobj))

      @property
      def index(self) -> Index:
          """The manager's only axis."""
          return self.axes[0]

      @property
      def dtype(self) -> DtypeObj:
          """dtype of the manager's block."""
          return self._block.dtype

      def get_dtypes(self) -> np.ndarray:
          """Return a one-element array containing the block's dtype."""
          return np.array([self._block.dtype])

      def external_values(self):
          """The array that Series.values returns"""
          return self._block.external_values()

      def internal_values(self):
          """The array that Series._values returns"""
          return self._block.values

      def array_values(self):
          """The array that Series.array returns"""
          return self._block.array_values

      def get_numeric_data(self, copy: bool = False):
          """
          Return ``self.copy(deep=copy)`` if the block is numeric, otherwise
          the result of ``self.make_empty()``.
          """
          if not self._block.is_numeric:
              return self.make_empty()
          return self.copy(deep=copy)

      @property
      def _can_hold_na(self) -> bool:
          """Whether the manager's block can hold NA values."""
          return self._block._can_hold_na

      def setitem_inplace(self, indexer, value) -> None:
          """
          Set values with indexer.

          For Single[Block/Array]Manager, this backs s[indexer] = value

          This is an inplace version of `setitem()`, mutating the manager/values
          in place, not returning a new Manager (and Block), and thus never changing
          the dtype.
          """
          must_detach = using_copy_on_write() and not self._has_no_reference(0)
          if must_detach:
              # The block has references: replace it with a copy before writing
              # and drop cached attributes that point at the old block.
              self.blocks = (self._block.copy(),)
              self._cache.clear()

          super().setitem_inplace(indexer, value)

      def idelete(self, indexer) -> SingleBlockManager:
          """
          Delete single location from SingleBlockManager.

          Ensures that self.blocks doesn't become empty.
          """
          remaining = self._block.delete(indexer)
          self.blocks = (remaining[0],)
          self.axes[0] = self.axes[0].delete(indexer)
          self._cache.clear()
          return self

      def fast_xs(self, loc):
          """
          Cross-section fast path; not supported by this manager.

          Raises
          ------
          NotImplementedError
              Always.
          """
          raise NotImplementedError("Use series._values[loc] instead")

      def set_values(self, values: ArrayLike) -> None:
          """
          Set the values of the single block in place.

          Use at your own risk! This does not check if the passed values are
          valid for the current Block/SingleBlockManager (length, dtype, etc).
          """
          # TODO(CoW) do we need to handle copy on write here? Currently this is
          # only used for FrameColumnApply.series_generator (what if apply is
          # mutating inplace?)
          blk = self.blocks[0]
          blk.values = values
          blk._mgr_locs = BlockPlacement(slice(len(values)))

      def _equal_values(self: T, other: T) -> bool:
          """
          Used in .equals defined in base class. Only check the column values
          assuming shape and indexes have already been checked.
          """
          # For SingleBlockManager (i.e.Series)
          if other.ndim != 1:
              return False
          return array_equals(self.blocks[0].values, other.blocks[0].values)
  1755. # --------------------------------------------------------------------
  1756. # Constructor Helpers
  def create_block_manager_from_blocks(
      blocks: list[Block],
      axes: list[Index],
      consolidate: bool = True,
      verify_integrity: bool = True,
  ) -> BlockManager:
      """
      Build a BlockManager from existing blocks, optionally consolidating it.

      A ValueError raised by the BlockManager constructor is routed through
      ``raise_construction_error`` so that the message describes the shapes.

      If verify_integrity=False, then caller is responsible for checking
          all(x.shape[-1] == len(axes[1]) for x in blocks)
          sum(x.shape[0] for x in blocks) == len(axes[0])
          set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
          all(blk.ndim == 2 for blk in blocks)
      This allows us to safely pass verify_integrity=False
      """
      try:
          manager = BlockManager(blocks, axes, verify_integrity=verify_integrity)
      except ValueError as err:
          block_values = [blk.values for blk in blocks]
          n_items = sum(vals.shape[0] for vals in block_values)
          raise_construction_error(n_items, block_values[0].shape[1:], axes, err)

      if consolidate:
          manager._consolidate_inplace()
      return manager
  def create_block_manager_from_column_arrays(
      arrays: list[ArrayLike],
      axes: list[Index],
      consolidate: bool,
      refs: list,
  ) -> BlockManager:
      """
      Build a BlockManager from one array per column.

      Blocks are formed with ``_form_blocks``. A ValueError raised while forming
      blocks or constructing the manager is routed through
      ``raise_construction_error``.
      """
      # Assertions disabled for performance (caller is responsible for verifying)
      #  assert isinstance(axes, list)
      #  assert all(isinstance(x, Index) for x in axes)
      #  assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
      #  assert all(type(x) is not PandasArray for x in arrays)
      #  assert all(x.ndim == 1 for x in arrays)
      #  assert all(len(x) == len(axes[1]) for x in arrays)
      #  assert len(arrays) == len(axes[0])
      # These last three are sufficient to allow us to safely pass
      #  verify_integrity=False below.
      try:
          formed = _form_blocks(arrays, consolidate, refs)
          manager = BlockManager(formed, axes, verify_integrity=False)
      except ValueError as err:
          raise_construction_error(len(arrays), arrays[0].shape, axes, err)

      if consolidate:
          manager._consolidate_inplace()
      return manager
  def raise_construction_error(
      tot_items: int,
      block_shape: Shape,
      axes: list[Index],
      e: ValueError | None = None,
  ):
      """
      Raise a helpful message about our construction.

      This function never returns normally.

      Parameters
      ----------
      tot_items : int
          Number of items passed; reported as the leading dimension.
      block_shape : Shape
          Remaining dimensions of the passed values. May be empty.
      axes : list[Index]
          Axes whose lengths give the shape implied by the indices.
      e : ValueError or None, default None
          Original error; re-raised unchanged when the passed and implied
          shapes agree, since the shape message would then be misleading.

      Raises
      ------
      ValueError
          Either ``e`` itself, an "Empty data" error when the first entry of
          ``block_shape`` is 0, or a message contrasting the two shapes.
      """
      passed = tuple(map(int, [tot_items, *block_shape]))
      # Correcting the user facing error message during dataframe construction
      if len(passed) <= 2:
          passed = passed[::-1]

      implied = tuple(len(ax) for ax in axes)
      # Correcting the user facing error message during dataframe construction
      if len(implied) <= 2:
          implied = implied[::-1]

      if passed == implied and e is not None:
          raise e
      # Guard the lookup: an empty block_shape would otherwise raise IndexError
      # and hide the shape message below.
      if len(block_shape) > 0 and block_shape[0] == 0:
          raise ValueError("Empty data passed with indices specified.")
      raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
  1824. # -----------------------------------------------------------------------
  def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
      """
      Return the ``(sep, is_numpy_dtype, dtype)`` grouping key for an
      ``(index, array)`` pair.
      """
      # compat for numpy<1.21, in which comparing a np.dtype with an ExtensionDtype
      # raises instead of returning False. Once earlier numpy versions are dropped,
      # this can be simplified to `return tup[1].dtype`
      dtype = tup[1].dtype

      # We know 1D-only EA dtypes won't be consolidated, so don't need to group
      # these. Keying on id() avoids expensive comparisons of CategoricalDtype
      # objects.
      sep = id(dtype) if is_1d_only_ea_dtype(dtype) else 0

      return sep, isinstance(dtype, np.dtype), dtype
  def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
      """
      Turn per-column arrays into a list of 2D blocks.

      Without consolidation each array becomes its own block (keeping its ref).
      With consolidation, consecutive arrays sharing a grouping key are stacked
      into one block when their dtype is a numpy dtype; extension arrays still
      get one block each.
      """
      indexed = list(enumerate(arrays))
      if not consolidate:
          return _tuples_to_blocks_no_consolidate(indexed, refs)

      # when consolidating, we can ignore refs (either stacking always copies,
      # or the EA is already copied in the calling dict_to_mgr)
      # TODO(CoW) check if this is also valid for rec_array_to_mgr

      blocks = []
      # group by dtype
      for (_, _, dtype), group in itertools.groupby(indexed, _grouping_func):
          # Resolved before ``dtype`` is possibly replaced by object below.
          block_type = get_block_type(dtype)

          if isinstance(dtype, np.dtype):
              is_dtlike = dtype.kind in ["m", "M"]

              if issubclass(dtype.type, (str, bytes)):
                  dtype = np.dtype(object)

              stacked, locs = _stack_arrays(list(group), dtype)
              if is_dtlike:
                  stacked = ensure_wrapped_if_datetimelike(stacked)
              blocks.append(
                  block_type(stacked, placement=BlockPlacement(locs), ndim=2)
              )
          elif is_1d_only_ea_dtype(dtype):
              blocks.extend(
                  [
                      block_type(arr, placement=BlockPlacement(loc), ndim=2)
                      for loc, arr in group
                  ]
              )
          else:
              blocks.extend(
                  [
                      block_type(
                          ensure_block_shape(arr, 2),
                          placement=BlockPlacement(loc),
                          ndim=2,
                      )
                      for loc, arr in group
                  ]
              )
      return blocks
  def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
      """
      Build one 2D block per ``(placement, array)`` tuple, pairing each tuple
      with the entry of ``refs`` at the same position.
      """
      # tuples produced within _form_blocks are of the form (placement, array)
      blocks = []
      for (loc, arr), ref in zip(tuples, refs):
          values_2d = ensure_block_shape(arr, ndim=2)
          blocks.append(
              new_block_2d(values_2d, placement=BlockPlacement(loc), refs=ref)
          )
      return blocks
  1882. def _stack_arrays(tuples, dtype: np.dtype):
  1883. placement, arrays = zip(*tuples)
  1884. first = arrays[0]
  1885. shape = (len(arrays),) + first.shape
  1886. stacked = np.empty(shape, dtype=dtype)
  1887. for i, arr in enumerate(arrays):
  1888. stacked[i] = arr
  1889. return stacked, placement
  def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
      """
      Merge blocks having same dtype, exclude non-consolidating blocks
      """

      def _key(blk):
          return blk._consolidate_key

      # sort by _can_consolidate, dtype
      ordered = sorted(blocks, key=_key)

      result: list[Block] = []
      for (can_merge, dtype), members in itertools.groupby(ordered, _key):
          merged, _ = _merge_blocks(
              list(members), dtype=dtype, can_consolidate=can_merge
          )
          result = extend_blocks(merged, result)
      return tuple(result)
  1904. def _merge_blocks(
  1905. blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
  1906. ) -> tuple[list[Block], bool]:
  1907. if len(blocks) == 1:
  1908. return blocks, False
  1909. if can_consolidate:
  1910. # TODO: optimization potential in case all mgrs contain slices and
  1911. # combination of those slices is a slice, too.
  1912. new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
  1913. new_values: ArrayLike
  1914. if isinstance(blocks[0].dtype, np.dtype):
  1915. # error: List comprehension has incompatible type List[Union[ndarray,
  1916. # ExtensionArray]]; expected List[Union[complex, generic,
  1917. # Sequence[Union[int, float, complex, str, bytes, generic]],
  1918. # Sequence[Sequence[Any]], SupportsArray]]
  1919. new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc]
  1920. else:
  1921. bvals = [blk.values for blk in blocks]
  1922. bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
  1923. new_values = bvals2[0]._concat_same_type(bvals2, axis=0)
  1924. argsort = np.argsort(new_mgr_locs)
  1925. new_values = new_values[argsort]
  1926. new_mgr_locs = new_mgr_locs[argsort]
  1927. bp = BlockPlacement(new_mgr_locs)
  1928. return [new_block_2d(new_values, placement=bp)], True
  1929. # can't consolidate --> no merge
  1930. return blocks, False
  1931. def _fast_count_smallints(arr: npt.NDArray[np.intp]):
  1932. """Faster version of set(arr) for sequences of small numbers."""
  1933. counts = np.bincount(arr)
  1934. nz = counts.nonzero()[0]
  1935. # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here,
  1936. # in one benchmark by a factor of 11
  1937. return zip(nz, counts[nz])
  1938. def _preprocess_slice_or_indexer(
  1939. slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
  1940. ):
  1941. if isinstance(slice_or_indexer, slice):
  1942. return (
  1943. "slice",
  1944. slice_or_indexer,
  1945. libinternals.slice_len(slice_or_indexer, length),
  1946. )
  1947. else:
  1948. if (
  1949. not isinstance(slice_or_indexer, np.ndarray)
  1950. or slice_or_indexer.dtype.kind != "i"
  1951. ):
  1952. dtype = getattr(slice_or_indexer, "dtype", None)
  1953. raise TypeError(type(slice_or_indexer), dtype)
  1954. indexer = ensure_platform_int(slice_or_indexer)
  1955. if not allow_fill:
  1956. indexer = maybe_convert_indices(indexer, length)
  1957. return "fancy", indexer, len(indexer)