  1. """
  2. Functions for preparing various inputs passed to the DataFrame or Series
  3. constructors before passing them to a BlockManager.
  4. """
  5. from __future__ import annotations
  6. from collections import abc
  7. from typing import (
  8. Any,
  9. Hashable,
  10. Sequence,
  11. )
  12. import numpy as np
  13. from numpy import ma
  14. from pandas._libs import lib
  15. from pandas._typing import (
  16. ArrayLike,
  17. DtypeObj,
  18. Manager,
  19. npt,
  20. )
  21. from pandas.core.dtypes.astype import astype_is_view
  22. from pandas.core.dtypes.cast import (
  23. construct_1d_arraylike_from_scalar,
  24. dict_compat,
  25. maybe_cast_to_datetime,
  26. maybe_convert_platform,
  27. maybe_infer_to_datetimelike,
  28. )
  29. from pandas.core.dtypes.common import (
  30. is_1d_only_ea_dtype,
  31. is_bool_dtype,
  32. is_datetime_or_timedelta_dtype,
  33. is_dtype_equal,
  34. is_extension_array_dtype,
  35. is_float_dtype,
  36. is_integer_dtype,
  37. is_list_like,
  38. is_named_tuple,
  39. is_object_dtype,
  40. )
  41. from pandas.core.dtypes.dtypes import ExtensionDtype
  42. from pandas.core.dtypes.generic import (
  43. ABCDataFrame,
  44. ABCSeries,
  45. )
  46. from pandas.core import (
  47. algorithms,
  48. common as com,
  49. )
  50. from pandas.core.arrays import (
  51. BooleanArray,
  52. ExtensionArray,
  53. FloatingArray,
  54. IntegerArray,
  55. )
  56. from pandas.core.arrays.string_ import StringDtype
  57. from pandas.core.construction import (
  58. ensure_wrapped_if_datetimelike,
  59. extract_array,
  60. range_to_ndarray,
  61. sanitize_array,
  62. )
  63. from pandas.core.indexes.api import (
  64. DatetimeIndex,
  65. Index,
  66. TimedeltaIndex,
  67. default_index,
  68. ensure_index,
  69. get_objs_combined_axis,
  70. union_indexes,
  71. )
  72. from pandas.core.internals.array_manager import (
  73. ArrayManager,
  74. SingleArrayManager,
  75. )
  76. from pandas.core.internals.blocks import (
  77. BlockPlacement,
  78. ensure_block_shape,
  79. new_block_2d,
  80. )
  81. from pandas.core.internals.managers import (
  82. BlockManager,
  83. SingleBlockManager,
  84. create_block_manager_from_blocks,
  85. create_block_manager_from_column_arrays,
  86. )

# ---------------------------------------------------------------------
# BlockManager Interface

def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.
    """
    if verify_integrity:
        # figure out the index, if necessary
        if index is None:
            index = _extract_index(arrays)
        else:
            index = ensure_index(index)

        # don't force copy because getting jammed in an ndarray anyway
        arrays, refs = _homogenize(arrays, index, dtype)
        # _homogenize ensures
        #  - all(len(x) == len(index) for x in arrays)
        #  - all(x.ndim == 1 for x in arrays)
        #  - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
        #  - all(type(x) is not PandasArray for x in arrays)
    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]
        # with _from_arrays, the passed arrays should never be Series objects
        refs = [None] * len(arrays)

        # Reached via DataFrame._from_arrays; we do minimal validation here
        for arr in arrays:
            if (
                not isinstance(arr, (np.ndarray, ExtensionArray))
                or arr.ndim != 1
                or len(arr) != len(index)
            ):
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)"
                )

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    # from BlockManager perspective
    axes = [columns, index]

    if typ == "block":
        return create_block_manager_from_column_arrays(
            arrays, axes, consolidate=consolidate, refs=refs
        )
    elif typ == "array":
        return ArrayManager(arrays, [index, columns])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
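
# Example sketch (illustrative, not part of the original module): building a
# manager from pre-built column arrays, assuming a standard pandas install
# where this module behaves as the code above describes.
#
#   >>> import numpy as np
#   >>> from pandas import Index
#   >>> arrs = [np.array([1, 2, 3]), np.array([4.0, 5.0, 6.0])]
#   >>> mgr = arrays_to_mgr(arrs, Index(["a", "b"]), None, typ="block")
#   >>> mgr.shape  # BlockManager axes are ordered (columns, index)
#   (2, 3)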

def rec_array_to_mgr(
    data: np.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = default_index(len(fdata))
    else:
        index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # create the manager
    arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr

def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Convert to specific type of Manager. Does not copy if the type is already
    correct. Does not guarantee a copy otherwise. `copy` keyword only controls
    whether conversion from Block->ArrayManager copies the 1D arrays.
    """
    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                new_mgr = arrays_to_mgr(
                    mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
                )
            else:
                new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
                if copy:
                    arrays = [arr.copy() for arr in arrays]
                new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
            else:
                array = mgr.internal_values()
                if copy:
                    array = array.copy()
                new_mgr = SingleArrayManager([array], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return new_mgr
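
# Example sketch (illustrative; uses the private DataFrame._mgr attribute and
# assumes the default block backend):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
#   >>> amgr = mgr_to_mgr(df._mgr, typ="array")  # Block -> ArrayManager
#   >>> type(amgr).__name__
#   'ArrayManager'
#   >>> mgr_to_mgr(amgr, typ="block") is amgr    # converts back; new object
#   False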

# ---------------------------------------------------------------------
# DataFrame Constructor Interface

def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray
    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    refs = None
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 an EA dtype passed with a 2D array; split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
        #  are already caught above
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    elif isinstance(values, (ABCSeries, Index)):
        if not copy_on_sanitize and (
            dtype is None or astype_is_view(values.dtype, dtype)
        ):
            refs = values._references

        if copy_on_sanitize:
            values = values._values.copy()
        else:
            values = values._values

        values = _ensure_2d(values)

    elif isinstance(values, (np.ndarray, ExtensionArray)):
        # drop subclass info
        _copy = (
            copy_on_sanitize
            if (dtype is None or astype_is_view(values.dtype, dtype))
            else False
        )
        values = np.array(values, copy=_copy)
        values = _ensure_2d(values)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # GH#40110 see similar check inside sanitize_array
        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelikes
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp, refs=refs)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp, refs=refs)
        block_values = [nb]

    if len(columns) == 0:
        # TODO: check len(values) == 0?
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
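
# Example sketch (illustrative, default block backend): a plain 2D ndarray
# with default axes. The shape convention matches arrays_to_mgr above.
#
#   >>> import numpy as np
#   >>> mgr = ndarray_to_mgr(
#   ...     np.arange(6).reshape(3, 2), None, None, None, False, "block"
#   ... )
#   >>> mgr.shape  # (n_columns, n_rows) from the manager's perspective
#   (2, 3)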

def _check_values_indices_shape_match(
    values: np.ndarray, index: Index, columns: Index
) -> None:
    """
    Check that the shape implied by our axes matches the actual shape of the
    data.
    """
    if values.shape[1] != len(columns) or values.shape[0] != len(index):
        # Could let this raise in Block constructor, but we get a more
        #  helpful exception message this way.
        if values.shape[0] == 0:
            raise ValueError("Empty data passed with indices specified.")

        passed = values.shape
        implied = (len(index), len(columns))
        raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")

def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                #  NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    rhs = [val] * nmissing
                else:
                    # GH#45369
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        keys = list(data.keys())
        columns = Index(keys) if keys else default_index(0)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            #  only EA arrays
            arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
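
# Example sketch (illustrative, default block backend): with an explicit
# `columns`, keys missing from `data` become all-NaN object columns, and the
# index is inferred from the present values.
#
#   >>> import pandas as pd
#   >>> mgr = dict_to_mgr({"a": [1, 2]}, None, pd.Index(["a", "b"]))
#   >>> [list(ax) for ax in mgr.axes]  # [columns, index]
#   [['a', 'b'], [0, 1]]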

def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a single sequence of arrays to multiple arrays.
    """
    # By the time we get here we have already checked treat_as_nested(data)

    if is_named_tuple(data[0]) and columns is None:
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(data[0], ABCSeries):
            index = _get_names_from_index(data)
        else:
            index = default_index(len(data))

    return arrays, columns, index

def treat_as_nested(data) -> bool:
    """
    Check if we should use nested_data_to_arrays.
    """
    return (
        len(data) > 0
        and is_list_like(data[0])
        and getattr(data[0], "ndim", 1) == 1
        and not (isinstance(data, ExtensionArray) and data.ndim == 2)
    )
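
# Example sketch of the dispatch rule above (illustrative):
#
#   >>> treat_as_nested([[1, 2], [3, 4]])  # list of rows -> nested
#   True
#   >>> treat_as_nested([1, 2, 3])         # flat scalars -> not nested
#   False
#   >>> import pandas as pd
#   >>> treat_as_nested([pd.Series([1, 2])])  # any 1-d list-likes count
#   True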

# ---------------------------------------------------------------------

def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
    # values is specifically _not_ ndarray, EA, Index, or Series
    # We only get here with `not treat_as_nested(values)`

    if len(values) == 0:
        # TODO: check for length-zero range, in which case return int64 dtype?
        # TODO: re-use anything in try_cast?
        return np.empty((0, 0), dtype=object)
    elif isinstance(values, range):
        arr = range_to_ndarray(values)
        return arr[..., np.newaxis]

    def convert(v):
        if not is_list_like(v) or isinstance(v, ABCDataFrame):
            return v

        v = extract_array(v, extract_numpy=True)
        res = maybe_convert_platform(v)
        # We don't do maybe_infer_to_datetimelike here bc we will end up doing
        #  it column-by-column in ndarray_to_mgr
        return res

    # we could have a 1-dim or 2-dim list here
    # this is equiv of np.asarray, but does object conversion
    # and platform dtype preservation
    # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like
    #  np.asarray would
    if is_list_like(values[0]):
        values = np.array([convert(v) for v in values])
    elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
        # GH#21861 see test_constructor_list_of_lists
        values = np.array([convert(v) for v in values])
    else:
        values = convert(values)

    return _ensure_2d(values)

def _ensure_2d(values: np.ndarray) -> np.ndarray:
    """
    Reshape 1D values; raise on anything other than 2D.
    """
    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")
    return values
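
# Example sketch (illustrative):
#
#   >>> import numpy as np
#   >>> _ensure_2d(np.array([1, 2, 3])).shape  # 1D input gains a column axis
#   (3, 1)
#   >>> _ensure_2d(np.ones((2, 2))).shape      # 2D passes through unchanged
#   (2, 2)
#   >>> _ensure_2d(np.ones((2, 2, 2)))         # anything else raises
#   Traceback (most recent call last):
#       ...
#   ValueError: Must pass 2-d input. shape=(2, 2, 2)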

def _homogenize(
    data, index: Index, dtype: DtypeObj | None
) -> tuple[list[ArrayLike], list[Any]]:
    oindex = None
    homogenized = []
    # if the original array-like in `data` is a Series, keep track of this
    #  Series' refs
    refs: list[Any] = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
            refs.append(val._references)
            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                #  val = Series(val, index=index)._values
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(val, index, dtype=dtype, copy=False)
            com.require_length_match(val, index)
            refs.append(None)

        homogenized.append(val)

    return homogenized, refs

def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.
    """
    index: Index
    if len(data) == 0:
        return default_index(0)

    raw_lengths = []
    indexes: list[list[Hashable] | Index] = []

    have_raw_arrays = False
    have_series = False
    have_dicts = False

    for val in data:
        if isinstance(val, ABCSeries):
            have_series = True
            indexes.append(val.index)
        elif isinstance(val, dict):
            have_dicts = True
            indexes.append(list(val.keys()))
        elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
            have_raw_arrays = True
            raw_lengths.append(len(val))
        elif isinstance(val, np.ndarray) and val.ndim > 1:
            raise ValueError("Per-column arrays must each be 1-dimensional")

    if not indexes and not raw_lengths:
        raise ValueError("If using all scalar values, you must pass an index")

    if have_series:
        index = union_indexes(indexes)
    elif have_dicts:
        index = union_indexes(indexes, sort=False)

    if have_raw_arrays:
        lengths = list(set(raw_lengths))
        if len(lengths) > 1:
            raise ValueError("All arrays must be of the same length")

        if have_dicts:
            raise ValueError(
                "Mixing dicts with non-Series may lead to ambiguous ordering."
            )

        if have_series:
            if lengths[0] != len(index):
                msg = (
                    f"array length {lengths[0]} does not match index "
                    f"length {len(index)}"
                )
                raise ValueError(msg)
        else:
            index = default_index(lengths[0])

    return ensure_index(index)
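
# Example sketch of the inference rules above (illustrative):
#
#   >>> import numpy as np
#   >>> _extract_index([np.array([1, 2, 3]), np.array([4, 5, 6])])
#   RangeIndex(start=0, stop=3, step=1)
#   >>> _extract_index([{"a": 1}, {"b": 2}])  # dict keys unioned, unsorted
#   Index(['a', 'b'], dtype='object')
#   >>> _extract_index([0, 1])                # scalars alone are rejected
#   Traceback (most recent call last):
#       ...
#   ValueError: If using all scalar values, you must pass an index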

def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Pre-emptively (cheaply) reindex arrays with new columns.
    """
    # reorder according to the columns
    if columns is not None:
        if not columns.equals(arr_columns):
            # if they are equal, there is nothing to do
            new_arrays: list[ArrayLike | None]
            new_arrays = [None] * len(columns)
            indexer = arr_columns.get_indexer(columns)
            for i, k in enumerate(indexer):
                if k == -1:
                    # by convention default is all-NaN object dtype
                    arr = np.empty(length, dtype=object)
                    arr.fill(np.nan)
                else:
                    arr = arrays[k]
                new_arrays[i] = arr

            # Incompatible types in assignment (expression has type
            # "List[Union[ExtensionArray, ndarray[Any, Any], None]]", variable
            # has type "List[Union[ExtensionArray, ndarray[Any, Any]]]")
            arrays = new_arrays  # type: ignore[assignment]
            arr_columns = columns

    return arrays, arr_columns

def _get_names_from_index(data) -> Index:
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return default_index(len(data))

    index: list[Hashable] = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = f"Unnamed {count}"
            count += 1

    return Index(index)
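
# Example sketch (illustrative): Series names become row labels; unnamed
# entries get positional "Unnamed N" placeholders.
#
#   >>> import pandas as pd
#   >>> s1 = pd.Series([1], name="x")
#   >>> s2 = pd.Series([2])
#   >>> list(_get_names_from_index([s1, s2]))
#   ['x', 'Unnamed 0']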

def _get_axes(
    N: int, K: int, index: Index | None, columns: Index | None
) -> tuple[Index, Index]:
    # helper to create the axes as indexes
    # return axes or defaults
    if index is None:
        index = default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns

def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int
    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]
    """
    from dataclasses import asdict

    return list(map(asdict, data))

# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays

def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """

    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1)._values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1)._values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
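
# Example sketch (illustrative): rows in, columns out.
#
#   >>> arrays, columns = to_arrays([(1, "a"), (2, "b")], None)
#   >>> columns
#   RangeIndex(start=0, stop=2, step=1)
#   >>> arrays  # one array per *column*, converted from object dtype
#   [array([1, 2]), array(['a', 'b'], dtype=object)]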

def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
    # Returned np.ndarray has ndim == 2
    # Note: we already check len(data) > 0 before getting here
    if isinstance(data[0], tuple):
        content = lib.to_object_array_tuples(data)
    else:
        # list of lists
        content = lib.to_object_array(data)
    return content

def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache: dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    content = np.vstack(aligned_values)
    return content, columns

def _list_of_dict_to_arrays(
    data: list[dict],
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Convert list of dicts to numpy arrays

    If `columns` is not passed, column names are inferred from the records:
    - for OrderedDict and dicts, the column names match
      the key insertion-order from the first record to the last.
    - for other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns : iterable or None

    Returns
    -------
    content : np.ndarray[object, ndim=2]
    columns : Index
    """
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, dict) for d in data)
        pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
        columns = ensure_index(pre_cols)

    # assure that they are of the base dict class and not of derived
    # classes
    data = [d if type(d) is dict else dict(d) for d in data]

    content = lib.dicts_to_array(data, list(columns))
    return content, columns
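
# Example sketch (illustrative): keys are collected in insertion order for
# plain dicts, and records missing a key get NaN.
#
#   >>> content, cols = _list_of_dict_to_arrays(
#   ...     [{"a": 1, "b": 2}, {"b": 3, "c": 4}], None
#   ... )
#   >>> list(cols)
#   ['a', 'b', 'c']
#   >>> content.shape  # one row per record, object dtype
#   (2, 3)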

def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Ensure we have valid columns, cast object dtypes if possible.
    """
    contents = list(content.T)

    try:
        columns = _validate_or_indexify_columns(contents, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if len(contents) and contents[0].dtype == np.object_:
        contents = convert_object_array(contents, dtype=dtype)

    return contents, columns

def _validate_or_indexify_columns(
    content: list[np.ndarray], columns: Index | None
) -> Index:
    """
    If columns is None, use positional integers as column names; otherwise,
    validate that columns has the correct length.

    Parameters
    ----------
    content : list of np.ndarrays
    columns : Index or None

    Returns
    -------
    Index
        If columns is None, assign positional column index value as columns.

    Raises
    ------
    1. AssertionError when content is not composed of lists of lists, and
        the length of columns does not equal the length of content.
    2. ValueError when content is a list of lists, but the lengths of the
        sub-lists are not all equal.
    3. ValueError when content is a list of lists, but the length of each
        sub-list is not equal to the length of content.
    """
    if columns is None:
        columns = default_index(len(content))
    else:
        # Add mask for data which is composed of list of lists
        is_mi_list = isinstance(columns, list) and all(
            isinstance(col, list) for col in columns
        )

        if not is_mi_list and len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError(
                f"{len(columns)} columns passed, passed data had "
                f"{len(content)} columns"
            )
        if is_mi_list:
            # check if nested list column; length of each sub-list should be equal
            if len({len(col) for col in columns}) > 1:
                raise ValueError(
                    "Length of columns passed for MultiIndex columns is different"
                )

            # if columns is not empty and length of sublist is not equal to content
            if columns and len(columns[0]) != len(content):
                raise ValueError(
                    f"{len(columns[0])} columns passed, passed data had "
                    f"{len(content)} columns"
                )
    return columns
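
# Example sketch (illustrative):
#
#   >>> import numpy as np
#   >>> _validate_or_indexify_columns([np.array([1]), np.array([2])], None)
#   RangeIndex(start=0, stop=2, step=1)
#   >>> _validate_or_indexify_columns([np.array([1])], ["a", "b"])
#   Traceback (most recent call last):
#       ...
#   AssertionError: 2 columns passed, passed data had 1 columns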

def convert_object_array(
    content: list[npt.NDArray[np.object_]],
    dtype: DtypeObj | None,
    dtype_backend: str = "numpy",
    coerce_float: bool = False,
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content: List[np.ndarray]
    dtype: np.dtype or ExtensionDtype
    dtype_backend: Controls if nullable/pyarrow dtypes are returned.
    coerce_float: Cast floats that are integers to int.

    Returns
    -------
    List[ArrayLike]
    """
    # provide soft conversion of object dtypes

    def convert(arr):
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(
                arr,
                try_float=coerce_float,
                convert_to_nullable_dtype=dtype_backend != "numpy",
            )
            # Notes on cases that get here 2023-02-15
            # 1) we DO get here when arr is all Timestamps and dtype=None
            # 2) disabling this doesn't break the world, so this must be
            #    getting caught at a higher level
            # 3) passing convert_datetime to maybe_convert_objects gets this
            #    right
            # 4) convert_timedelta?

            if dtype is None:
                if arr.dtype == np.dtype("O"):
                    # i.e. maybe_convert_objects didn't convert
                    arr = maybe_infer_to_datetimelike(arr)
                    if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                        arr = StringDtype().construct_array_type()._from_sequence(arr)
                elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                    if is_integer_dtype(arr.dtype):
                        arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_bool_dtype(arr.dtype):
                        arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_float_dtype(arr.dtype):
                        arr = FloatingArray(arr, np.isnan(arr))
            elif isinstance(dtype, ExtensionDtype):
                # TODO: test(s) that get here
                # TODO: try to de-duplicate this convert function with
                #  core.construction functions
                cls = dtype.construct_array_type()
                arr = cls._from_sequence(arr, dtype=dtype, copy=False)
            elif dtype.kind in ["m", "M"]:
                # This restriction is harmless bc these are the only cases
                #  where maybe_cast_to_datetime is not a no-op.
                # Here we know:
                #  1) dtype.kind in ["m", "M"] and
                #  2) arr is either object or numeric dtype
                arr = maybe_cast_to_datetime(arr, dtype)

        return arr

    arrays = [convert(arr) for arr in content]

    return arrays
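
# Example sketch (illustrative): soft conversion of object columns under the
# default numpy backend. Exact reprs assume a platform where the default
# integer dtype prints as shown.
#
#   >>> import numpy as np
#   >>> convert_object_array([np.array([1, 2], dtype=object)], dtype=None)
#   [array([1, 2])]
#   >>> convert_object_array([np.array(["x", None], dtype=object)], dtype=None)
#   [array(['x', None], dtype=object)]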