from __future__ import annotations

import copy as cp
import itertools
from typing import (
    TYPE_CHECKING,
    Sequence,
    cast,
)

import numpy as np

from pandas._libs import (
    NaT,
    internals as libinternals,
)
from pandas._libs.missing import NA
from pandas._typing import (
    ArrayLike,
    AxisInt,
    DtypeObj,
    Manager,
    Shape,
)
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
    ensure_dtype_can_hold_na,
    find_common_type,
    np_find_common_type,
)
from pandas.core.dtypes.common import (
    is_1d_only_ea_dtype,
    is_dtype_equal,
    is_scalar,
    needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.missing import (
    is_valid_na_for_dtype,
    isna,
    isna_all,
)

import pandas.core.algorithms as algos
from pandas.core.arrays import (
    DatetimeArray,
    ExtensionArray,
)
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import (
    ArrayManager,
    NullArrayProxy,
)
from pandas.core.internals.blocks import (
    ensure_block_shape,
    new_block_2d,
)
from pandas.core.internals.managers import BlockManager

if TYPE_CHECKING:
    from pandas import Index
    from pandas.core.internals.blocks import Block


def _concatenate_array_managers(
    mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager:
    """
    Concatenate array managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    ArrayManager
    """
    # reindex all arrays
    mgrs = []
    for mgr, indexers in mgrs_indexers:
        axis1_made_copy = False
        for ax, indexer in indexers.items():
            mgr = mgr.reindex_indexer(
                axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
            )
            if ax == 1 and indexer is not None:
                axis1_made_copy = True
        if copy and concat_axis == 0 and not axis1_made_copy:
            # for concat_axis 1 we will always get a copy through concat_arrays
            mgr = mgr.copy()
        mgrs.append(mgr)

    if concat_axis == 1:
        # concatting along the rows -> concat the reindexed arrays
        # TODO(ArrayManager) doesn't yet preserve the correct dtype
        arrays = [
            concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
            for j in range(len(mgrs[0].arrays))
        ]
    else:
        # concatting along the columns -> combine reindexed arrays in a single manager
        assert concat_axis == 0
        arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))

    new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
    return new_mgr
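
# Illustrative note (an addition for clarity, not in the original source):
# each element of ``mgrs_indexers`` pairs a manager with the reindexing needed
# to align it to the result axes; a -1 in an indexer means "insert an all-NA
# position", which use_na_proxy materializes as a NullArrayProxy.  A minimal
# sketch of a call, assuming two already-aligned single-column managers
# ``m1`` and ``m2``:
#
#   _concatenate_array_managers([(m1, {}), (m2, {})], axes, concat_axis=1, copy=True)
#
# would return a new ArrayManager whose single array is the row-wise
# concatenation produced by concat_arrays below.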


def concat_arrays(to_concat: list) -> ArrayLike:
    """
    Alternative to concat_compat, specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword); assumes the
    caller has already applied ensure_wrapped_if_datetimelike; and does not
    skip empty arrays when determining the dtype.
    In addition, ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # ignore the all-NA proxies to determine the resulting dtype
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

    dtypes = {x.dtype for x in to_concat_no_proxy}
    single_dtype = len(dtypes) == 1

    if single_dtype:
        target_dtype = to_concat_no_proxy[0].dtype
    elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes):
        # GH#42092
        target_dtype = np_find_common_type(*dtypes)
    else:
        target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])

    to_concat = [
        arr.to_array(target_dtype)
        if isinstance(arr, NullArrayProxy)
        else astype_array(arr, target_dtype, copy=False)
        for arr in to_concat
    ]

    if isinstance(to_concat[0], ExtensionArray):
        cls = type(to_concat[0])
        return cls._concat_same_type(to_concat)

    result = np.concatenate(to_concat)

    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
    # see https://github.com/pandas-dev/pandas/issues/39817
    if len(result) == 0:
        # all empties -> check for bool to not coerce to float
        kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
        if len(kinds) != 1:
            if "b" in kinds:
                result = result.astype(object)
    return result
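
# Illustrative sketch (an addition, not from the original source): with plain
# numpy integer inputs the target dtype comes from numpy's promotion rules,
# e.g.
#
#   concat_arrays([np.array([1, 2], dtype="int8"), np.array([3], dtype="int16")])
#
# should return ``array([1, 2, 3], dtype=int16)``, while concatenating an
# empty bool array with an empty int array hits the GH#39817 special case
# above and returns an object-dtype result instead of silently coercing the
# bools to a numeric dtype.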


def concatenate_managers(
    mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    # TODO(ArrayManager) this assumes that all managers are of the same type
    if isinstance(mgrs_indexers[0][0], ArrayManager):
        return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)

    # Assertions disabled for performance
    # for tup in mgrs_indexers:
    #    # caller is responsible for ensuring this
    #    indexers = tup[1]
    #    assert concat_axis not in indexers

    if concat_axis == 0:
        return _concat_managers_axis0(mgrs_indexers, axes, copy)

    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)

    concat_plans = [
        _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
    ]
    concat_plan = _combine_concat_plans(concat_plans)
    blocks = []

    for placement, join_units in concat_plan:
        unit = join_units[0]
        blk = unit.block

        if len(join_units) == 1 and not join_units[0].indexers:
            values = blk.values
            if copy:
                values = values.copy()
            else:
                values = values.view()
            fastpath = True
        elif _is_uniform_join_units(join_units):
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                # _is_uniform_join_units ensures a single dtype, so
                #  we can use np.concatenate, which is more performant
                #  than concat_compat
                values = np.concatenate(vals, axis=1)
            else:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals, axis=1)
                values = ensure_block_shape(values, ndim=2)

            values = ensure_wrapped_if_datetimelike(values)

            fastpath = blk.values.dtype == values.dtype
        else:
            values = _concatenate_join_units(join_units, copy=copy)
            fastpath = False

        if fastpath:
            b = blk.make_block_same_class(values, placement=placement)
        else:
            b = new_block_2d(values, placement=placement)

        blocks.append(b)

    return BlockManager(tuple(blocks), axes)
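
# Illustrative note (an addition): this is the entry point that pd.concat
# ultimately reaches for BlockManager-backed frames.  For example, a sketch
# like
#
#   df1 = pd.DataFrame({"a": [1, 2]})
#   df2 = pd.DataFrame({"a": [3, 4]})
#   pd.concat([df1, df2], ignore_index=True)
#
# arrives here with concat_axis=1 (the manager's row axis; DataFrame axis 0
# maps to block-manager axis 1), where the two uniform int64 join units take
# the np.concatenate fast path above.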


def _concat_managers_axis0(
    mgrs_indexers, axes: list[Index], copy: bool
) -> BlockManager:
    """
    concat_managers specialized to concat_axis=0, with reindexing already
    having been done in _maybe_reindex_columns_na_proxy.
    """
    had_reindexers = {
        i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers))
    }
    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)

    mgrs = [x[0] for x in mgrs_indexers]

    offset = 0
    blocks = []
    for i, mgr in enumerate(mgrs):
        # If we already reindexed, then we definitely don't need another copy
        made_copy = had_reindexers[i]

        for blk in mgr.blocks:
            if made_copy:
                nb = blk.copy(deep=False)
            elif copy:
                nb = blk.copy()
            else:
                # by slicing instead of copy(deep=False), we get a new array
                #  object, see test_concat_copy
                nb = blk.getitem_block(slice(None))
            nb._mgr_locs = nb._mgr_locs.add(offset)
            blocks.append(nb)

        offset += len(mgr.items)

    result = BlockManager(tuple(blocks), axes)
    return result


def _maybe_reindex_columns_na_proxy(
    axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
    """
    Reindex along columns so that all of the BlockManagers being concatenated
    have matching columns.

    Columns added in this reindexing have dtype=np.void, indicating they
    should be ignored when choosing a column's final dtype.
    """
    new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = []

    for mgr, indexers in mgrs_indexers:
        # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
        #  is a cheap reindexing.
        for i, indexer in indexers.items():
            mgr = mgr.reindex_indexer(
                axes[i],
                indexers[i],
                axis=i,
                copy=False,
                only_slice=True,  # only relevant for i==0
                allow_dups=True,
                use_na_proxy=True,  # only relevant for i==0
            )
        new_mgrs_indexers.append((mgr, {}))
    return new_mgrs_indexers
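
# Illustrative note (an addition): when concatenating frames with mismatched
# columns, e.g. pd.concat([pd.DataFrame({"a": [1]}), pd.DataFrame({"b": [2]})]),
# each manager receives a placeholder block of dtype np.void (kind "V") for
# the column it lacks.  _get_empty_dtype below skips those "V" blocks when
# picking a column's result dtype, so column "a" ends up float64 (int64
# upcast so it can hold NaN) rather than degrading to object.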


def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]):
    """
    Construct concatenation plan for given block manager and indexers.

    Parameters
    ----------
    mgr : BlockManager
    indexers : dict of {axis: indexer}

    Returns
    -------
    plan : list of (BlockPlacement, JoinUnit) tuples
    """
    assert len(indexers) == 0

    # Calculate post-reindex shape, save for item axis which will be separate
    #  for each block anyway.
    mgr_shape_list = list(mgr.shape)
    for ax, indexer in indexers.items():
        mgr_shape_list[ax] = len(indexer)
    mgr_shape = tuple(mgr_shape_list)

    assert 0 not in indexers

    if mgr.is_single_block:
        blk = mgr.blocks[0]
        return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]

    blknos = mgr.blknos
    blklocs = mgr.blklocs

    plan = []
    for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
        assert placements.is_slice_like
        assert blkno != -1

        join_unit_indexers = indexers.copy()

        shape_list = list(mgr_shape)
        shape_list[0] = len(placements)
        shape = tuple(shape_list)

        blk = mgr.blocks[blkno]
        ax0_blk_indexer = blklocs[placements.indexer]

        unit_no_ax0_reindexing = (
            len(placements) == len(blk.mgr_locs)
            and
            # Fastpath detection of join unit not
            # needing to reindex its block: no ax0
            # reindexing took place and block
            # placement was sequential before.
            (
                (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1)
                or
                # Slow-ish detection: all indexer locs
                # are sequential (and length match is
                # checked above).
                (np.diff(ax0_blk_indexer) == 1).all()
            )
        )

        # Omit indexer if no item reindexing is required.
        if unit_no_ax0_reindexing:
            join_unit_indexers.pop(0, None)
        else:
            join_unit_indexers[0] = ax0_blk_indexer

        unit = JoinUnit(blk, shape, join_unit_indexers)

        plan.append((placements, unit))

    return plan
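
# Illustrative note (an addition): for a manager holding one int64 block over
# columns 0-1 and one float64 block over column 2, the plan is roughly
#
#   [(placement covering columns 0-1, JoinUnit(int64 block, ...)),
#    (placement covering column 2,    JoinUnit(float64 block, ...))]
#
# i.e. one (BlockPlacement, JoinUnit) pair per contiguous run of columns
# backed by the same block.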


class JoinUnit:
    def __init__(self, block: Block, shape: Shape, indexers=None) -> None:
        # Passing shape explicitly is required for cases when block is None.
        # Note: block is None implies indexers is None, but not vice-versa
        if indexers is None:
            indexers = {}
        self.block = block
        self.indexers = indexers
        self.shape = shape

    def __repr__(self) -> str:
        return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"

    @cache_readonly
    def needs_filling(self) -> bool:
        for indexer in self.indexers.values():
            # FIXME: cache results of indexer == -1 checks.
            if (indexer == -1).any():
                return True

        return False

    @cache_readonly
    def dtype(self) -> DtypeObj:
        blk = self.block
        if blk.values.dtype.kind == "V":
            raise AssertionError("Block is None, no dtype")

        if not self.needs_filling:
            return blk.dtype
        return ensure_dtype_can_hold_na(blk.dtype)

    def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
        """
        Check that we are all-NA of a type/dtype that is compatible with this dtype.
        Augments `self.is_na` with an additional check of the type of NA values.
        """
        if not self.is_na:
            return False
        if self.block.dtype.kind == "V":
            return True

        if self.dtype == object:
            values = self.block.values
            return all(
                is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K")
            )

        na_value = self.block.fill_value
        if na_value is NaT and not is_dtype_equal(self.dtype, dtype):
            # e.g. we are dt64 and other is td64
            # fill_values match but we should not cast self.block.values to dtype
            # TODO: this will need updating if we ever have non-nano dt64/td64
            return False

        if na_value is NA and needs_i8_conversion(dtype):
            # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
            # e.g. self.dtype == "Int64" and dtype is td64, we don't want
            #  to consider these as matching
            return False

        # TODO: better to use can_hold_element?
        return is_valid_na_for_dtype(na_value, dtype)

    @cache_readonly
    def is_na(self) -> bool:
        blk = self.block
        if blk.dtype.kind == "V":
            return True

        if not blk._can_hold_na:
            return False

        values = blk.values
        if values.size == 0:
            return True
        if isinstance(values.dtype, SparseDtype):
            return False

        if values.ndim == 1:
            # TODO(EA2D): no need for special case with 2D EAs
            val = values[0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return isna_all(values)
        else:
            val = values[0][0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return all(isna_all(row) for row in values)

    def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
        values: ArrayLike

        if upcasted_na is None and self.block.dtype.kind != "V":
            # No upcasting is necessary
            fill_value = self.block.fill_value
            values = self.block.values
        else:
            fill_value = upcasted_na

            if self._is_valid_na_for(empty_dtype):
                # note: always holds when self.block.dtype.kind == "V"
                blk_dtype = self.block.dtype

                if blk_dtype == np.dtype("object"):
                    # we want to avoid filling with np.nan if we are
                    # using None; we already know that we are all
                    # nulls
                    values = self.block.values.ravel(order="K")
                    if len(values) and values[0] is None:
                        fill_value = None

                if isinstance(empty_dtype, DatetimeTZDtype):
                    # NB: exclude e.g. pyarrow[dt64tz] dtypes
                    i8values = np.full(self.shape, fill_value._value)
                    return DatetimeArray(i8values, dtype=empty_dtype)

                elif is_1d_only_ea_dtype(empty_dtype):
                    if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
                        # avoid creating new empty array if we already have an array
                        # with correct dtype that can be reindexed
                        pass
                    else:
                        empty_dtype = cast(ExtensionDtype, empty_dtype)
                        cls = empty_dtype.construct_array_type()

                        missing_arr = cls._from_sequence([], dtype=empty_dtype)
                        ncols, nrows = self.shape
                        assert ncols == 1, ncols
                        empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                        return missing_arr.take(
                            empty_arr, allow_fill=True, fill_value=fill_value
                        )
                elif isinstance(empty_dtype, ExtensionDtype):
                    # TODO: no tests get here, a handful would if we disabled
                    #  the dt64tz special-case above (which is faster)
                    cls = empty_dtype.construct_array_type()
                    missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
                    missing_arr[:] = fill_value
                    return missing_arr
                else:
                    # NB: we should never get here with empty_dtype integer or bool;
                    #  if we did, the missing_arr.fill would cast to gibberish
                    missing_arr = np.empty(self.shape, dtype=empty_dtype)
                    missing_arr.fill(fill_value)
                    return missing_arr

        if (not self.indexers) and (not self.block._can_consolidate):
            # preserve these for validation in concat_compat
            return self.block.values

        if self.block.is_bool:
            # External code requested filling/upcasting, bool values must
            # be upcasted to object to avoid being upcasted to numeric.
            values = self.block.astype(np.dtype("object")).values
        else:
            # No dtype upcasting is done here, it will be performed during
            # concatenation itself.
            values = self.block.values

        if not self.indexers:
            # If there's no indexing to be done, we want to signal outside
            # code that this array must be copied explicitly. This is done
            # by returning a view and checking `retval.base`.
            values = values.view()
        else:
            for ax, indexer in self.indexers.items():
                values = algos.take_nd(values, indexer, axis=ax)

        return values
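
# Illustrative note (an addition): a JoinUnit bundles one block with any
# reindexing it still needs and its post-reindex shape, e.g. a sketch like
# JoinUnit(block, shape=(1, 3), indexers={}) for a single 3-row column that
# needs no reindexing.  is_na and needs_filling then decide whether
# get_reindexed_values returns the block's own values or a freshly built
# all-NA array of the target dtype.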


def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
    """
    Concatenate values from several join units along axis=1.
    """
    empty_dtype = _get_empty_dtype(join_units)

    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
    upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)

    to_concat = [
        ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
        for ju in join_units
    ]

    if len(to_concat) == 1:
        # Only one block, nothing to concatenate.
        concat_values = to_concat[0]

        if copy:
            if isinstance(concat_values, np.ndarray):
                # non-reindexed (=not yet copied) arrays are made into a view
                # in JoinUnit.get_reindexed_values
                if concat_values.base is not None:
                    concat_values = concat_values.copy()
            else:
                concat_values = concat_values.copy()

    elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
        # TODO(EA2D): special case not needed if all EAs used HybridBlocks
        # error: No overload variant of "__getitem__" of "ExtensionArray" matches
        # argument type "Tuple[int, slice]"
        to_concat = [
            t
            if is_1d_only_ea_dtype(t.dtype)
            else t[0, :]  # type: ignore[call-overload]
            for t in to_concat
        ]
        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
        concat_values = ensure_block_shape(concat_values, 2)

    else:
        concat_values = concat_compat(to_concat, axis=1)

    return concat_values
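
# Illustrative note (an addition): the steps above mirror what happens when,
# say, an int64 unit meets a missing-column (np.void) unit: _get_empty_dtype
# picks float64, _dtype_to_na_value maps that to np.nan, and each unit then
# materializes its values (or an all-NaN filler of the right shape) before
# the final concat_compat call stitches them together along axis=1.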


def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
    """
    Find the NA value to go with this dtype.
    """
    if isinstance(dtype, ExtensionDtype):
        return dtype.na_value
    elif dtype.kind in ["m", "M"]:
        return dtype.type("NaT")
    elif dtype.kind in ["f", "c"]:
        return dtype.type("NaN")
    elif dtype.kind == "b":
        # different from missing.na_value_for_dtype
        return None
    elif dtype.kind in ["i", "u"]:
        if not has_none_blocks:
            # different from missing.na_value_for_dtype
            return None
        return np.nan
    elif dtype.kind == "O":
        return np.nan
    raise NotImplementedError
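
# Illustrative sketch (an addition): a few concrete mappings, assuming
# standard numpy/pandas dtypes:
#
#   _dtype_to_na_value(np.dtype("M8[ns]"), False)   -> np.datetime64("NaT")
#   _dtype_to_na_value(np.dtype("float64"), False)  -> nan (np.float64)
#   _dtype_to_na_value(pd.Int64Dtype(), False)      -> pd.NA
#
# None is returned for int/uint/bool results precisely so that
# JoinUnit.get_reindexed_values takes its "no upcasting is necessary" branch
# when no filling is needed.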


def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
    """
    Return the dtype to use when concatenating the specified units.

    The matching NA value is determined separately by _dtype_to_na_value.

    Returns
    -------
    dtype
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        return blk.dtype

    if _is_uniform_reindex(join_units):
        empty_dtype = join_units[0].block.dtype
        return empty_dtype

    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)

    dtypes = [unit.dtype for unit in join_units if not unit.is_na]
    if not len(dtypes):
        dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"]

    dtype = find_common_type(dtypes)
    if has_none_blocks:
        dtype = ensure_dtype_can_hold_na(dtype)
    return dtype
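
# Illustrative note (an addition): when an int64 unit is joined with a unit
# whose block is a np.void placeholder (a column missing from one frame),
# dtypes is [int64] but has_none_blocks is True, so ensure_dtype_can_hold_na
# upcasts the result to float64; a bool unit in the same position would be
# upcast to object instead.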


def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
    """
    Check if the join units consist of blocks of uniform type that can
    be concatenated using Block.concat_same_type instead of the generic
    _concatenate_join_units (which uses `concat_compat`).
    """
    first = join_units[0].block
    if first.dtype.kind == "V":
        return False
    return (
        # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
        all(type(ju.block) is type(first) for ju in join_units)
        and
        # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
        all(
            is_dtype_equal(ju.block.dtype, first.dtype)
            # GH#42092 we only want the dtype_equal check for non-numeric blocks
            #  (for now, may change but that would need a deprecation)
            or ju.block.dtype.kind in ["b", "i", "u"]
            for ju in join_units
        )
        and
        # no blocks that would get missing values (can lead to type upcasts)
        # unless we're an extension dtype.
        all(not ju.is_na or ju.block.is_extension for ju in join_units)
        and
        # no blocks with indexers (as then the dimensions do not fit)
        all(not ju.indexers for ju in join_units)
        and
        # only use this path when there is something to concatenate
        len(join_units) > 1
    )
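
# Illustrative note (an addition): two plain float64 blocks with no pending
# reindexing satisfy every clause above, so concatenate_managers takes the
# np.concatenate fast path for them; a unit that is all-NA (and not backed by
# an extension array) fails the is_na clause and falls back to the generic
# _concatenate_join_units path.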


def _is_uniform_reindex(join_units) -> bool:
    return (
        # TODO: should this be ju.block._can_hold_na?
        all(ju.block.is_extension for ju in join_units)
        and len({ju.block.dtype.name for ju in join_units}) == 1
    )


def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
    """
    Reduce join_unit's shape along item axis to length.

    Extra items that didn't fit are returned as a separate block.
    """
    if 0 not in join_unit.indexers:
        extra_indexers = join_unit.indexers

        if join_unit.block is None:
            extra_block = None
        else:
            extra_block = join_unit.block.getitem_block(slice(length, None))
            join_unit.block = join_unit.block.getitem_block(slice(length))
    else:
        extra_block = join_unit.block

        extra_indexers = cp.copy(join_unit.indexers)
        extra_indexers[0] = extra_indexers[0][length:]
        join_unit.indexers[0] = join_unit.indexers[0][:length]

    extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
    join_unit.shape = (length,) + join_unit.shape[1:]

    return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape)
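
# Illustrative sketch (an addition): given a unit covering 5 items,
# _trim_join_unit(unit, 2) shrinks the original unit in place to its first
# 2 items and returns a new JoinUnit for the remaining 3, which is how
# _combine_concat_plans below aligns placements of different lengths.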


def _combine_concat_plans(plans):
    """
    Combine multiple concatenation plans into one.

    The units in the input plans are updated in-place via _trim_join_unit.

    We only get here with concat_axis == 1.
    """
    if len(plans) == 1:
        for p in plans[0]:
            yield p[0], [p[1]]

    else:
        # singleton list so we can modify it as a side-effect within _next_or_none
        num_ended = [0]

        def _next_or_none(seq):
            retval = next(seq, None)
            if retval is None:
                num_ended[0] += 1
            return retval

        plans = list(map(iter, plans))
        next_items = list(map(_next_or_none, plans))

        while num_ended[0] != len(next_items):
            if num_ended[0] > 0:
                raise ValueError("Plan shapes are not aligned")

            placements, units = zip(*next_items)

            lengths = list(map(len, placements))
            min_len, max_len = min(lengths), max(lengths)

            if min_len == max_len:
                yield placements[0], units
                next_items[:] = map(_next_or_none, plans)
            else:
                yielded_placement = None
                yielded_units = [None] * len(next_items)
                for i, (plc, unit) in enumerate(next_items):
                    yielded_units[i] = unit
                    if len(plc) > min_len:
                        # _trim_join_unit updates unit in place, so only
                        # placement needs to be sliced to skip min_len.
                        next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len))
                    else:
                        yielded_placement = plc
                        next_items[i] = _next_or_none(plans[i])

                yield yielded_placement, yielded_units
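
# Illustrative note (an addition): if one manager's plan has a single unit
# spanning items 0-2 while another's has units over items 0-1 and 2, the
# longer unit is trimmed at length 2 so both plans yield aligned
# (placement, [unit_from_mgr1, unit_from_mgr2, ...]) pairs; if one plan
# runs out of units before the others, the "Plan shapes are not aligned"
# ValueError above is raised.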