lib.pyx 87 KB


  1. from collections import abc
  2. from decimal import Decimal
  3. from enum import Enum
  4. from typing import (
  5. Literal,
  6. _GenericAlias,
  7. )
  8. cimport cython
  9. from cpython.datetime cimport (
  10. PyDate_Check,
  11. PyDateTime_Check,
  12. PyDelta_Check,
  13. PyTime_Check,
  14. import_datetime,
  15. )
  16. from cpython.iterator cimport PyIter_Check
  17. from cpython.number cimport PyNumber_Check
  18. from cpython.object cimport (
  19. Py_EQ,
  20. PyObject,
  21. PyObject_RichCompareBool,
  22. PyTypeObject,
  23. )
  24. from cpython.ref cimport Py_INCREF
  25. from cpython.sequence cimport PySequence_Check
  26. from cpython.tuple cimport (
  27. PyTuple_New,
  28. PyTuple_SET_ITEM,
  29. )
  30. from cython cimport (
  31. Py_ssize_t,
  32. floating,
  33. )
  34. from pandas._libs.missing import check_na_tuples_nonequal
  35. import_datetime()
  36. import numpy as np
  37. cimport numpy as cnp
  38. from numpy cimport (
  39. NPY_OBJECT,
  40. PyArray_Check,
  41. PyArray_GETITEM,
  42. PyArray_ITER_DATA,
  43. PyArray_ITER_NEXT,
  44. PyArray_IterNew,
  45. complex128_t,
  46. flatiter,
  47. float64_t,
  48. int32_t,
  49. int64_t,
  50. intp_t,
  51. ndarray,
  52. uint8_t,
  53. uint64_t,
  54. )
  55. cnp.import_array()
  56. cdef extern from "Python.h":
  57. # Note: importing extern-style allows us to declare these as nogil
  58. # functions, whereas `from cpython cimport` does not.
  59. bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
  60. cdef extern from "numpy/arrayobject.h":
  61. # cython's numpy.dtype specification is incorrect, which leads to
  62. # errors in issubclass(self.dtype.type, np.bool_), so we directly
  63. # include the correct version
  64. # https://github.com/cython/cython/issues/2022
  65. ctypedef class numpy.dtype [object PyArray_Descr]:
  66. # Use PyDataType_* macros when possible, however there are no macros
  67. # for accessing some of the fields, so some are defined. Please
  68. # ask on cython-dev if you need more.
  69. cdef:
  70. int type_num
  71. int itemsize "elsize"
  72. char byteorder
  73. object fields
  74. tuple names
  75. PyTypeObject PySignedIntegerArrType_Type
  76. PyTypeObject PyUnsignedIntegerArrType_Type
  77. cdef extern from "numpy/ndarrayobject.h":
  78. bint PyArray_CheckScalar(obj) nogil
  79. cdef extern from "src/parse_helper.h":
  80. int floatify(object, float64_t *result, int *maybe_int) except -1
  81. from pandas._libs cimport util
  82. from pandas._libs.util cimport (
  83. INT64_MAX,
  84. INT64_MIN,
  85. UINT64_MAX,
  86. is_nan,
  87. )
  88. from pandas._libs.tslibs import (
  89. OutOfBoundsDatetime,
  90. OutOfBoundsTimedelta,
  91. )
  92. from pandas._libs.tslibs.period import Period
  93. from pandas._libs.missing cimport (
  94. C_NA,
  95. checknull,
  96. is_matching_na,
  97. is_null_datetime64,
  98. is_null_timedelta64,
  99. )
  100. from pandas._libs.tslibs.conversion cimport (
  101. _TSObject,
  102. convert_to_tsobject,
  103. )
  104. from pandas._libs.tslibs.nattype cimport (
  105. NPY_NAT,
  106. c_NaT as NaT,
  107. checknull_with_nat,
  108. )
  109. from pandas._libs.tslibs.np_datetime cimport NPY_FR_ns
  110. from pandas._libs.tslibs.offsets cimport is_offset_object
  111. from pandas._libs.tslibs.period cimport is_period_object
  112. from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
  113. from pandas._libs.tslibs.timezones cimport tz_compare
  # Constants that will be compared to potentially arbitrarily large
  # Python ints, so they are stored as Python objects rather than C integers.
  cdef:
      object oINT64_MAX = <int64_t>INT64_MAX
      object oINT64_MIN = <int64_t>INT64_MIN
      object oUINT64_MAX = <uint64_t>UINT64_MAX

      # ``np.NaN`` was removed in NumPy 2.0 (AttributeError at import time);
      # ``np.nan`` is the spelling that exists on every NumPy version.
      float64_t NaN = <float64_t>np.nan

  # python-visible
  i8max = <int64_t>INT64_MAX
  u8max = <uint64_t>UINT64_MAX
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def memory_usage_of_objects(arr: object[:]) -> int64_t:
      """
      Sum ``__sizeof__()`` over every element of an object buffer.

      The total covers the referenced objects only; the bytes taken by the
      pointers stored in the buffer itself are not included.

      Parameters
      ----------
      arr : object[:]
          One-dimensional buffer of Python objects.

      Returns
      -------
      int64_t
          Accumulated size in bytes.
      """
      cdef:
          Py_ssize_t pos, count
          int64_t total = 0

      count = len(arr)
      for pos in range(count):
          total = total + arr[pos].__sizeof__()
      return total
  139. # ----------------------------------------------------------------------
  def is_scalar(val: object) -> bool:
      """
      Return True if given object is scalar.

      Parameters
      ----------
      val : object
          This includes:

          - numpy array scalar (e.g. np.int64)
          - Python builtin numerics
          - Python builtin byte arrays and strings
          - None
          - datetime.datetime
          - datetime.timedelta
          - Period
          - decimal.Decimal
          - Interval
          - DateOffset
          - Fraction
          - Number.

      Returns
      -------
      bool
          Return True if given object is scalar.

      Examples
      --------
      >>> import datetime
      >>> dt = datetime.datetime(2018, 10, 3)
      >>> pd.api.types.is_scalar(dt)
      True

      >>> pd.api.types.is_scalar([2, 3])
      False

      >>> pd.api.types.is_scalar({0: 1, 2: 3})
      False

      >>> pd.api.types.is_scalar((0, 2))
      False

      pandas supports PEP 3141 numbers:

      >>> from fractions import Fraction
      >>> pd.api.types.is_scalar(Fraction(3, 5))
      True
      """
      # Identity checks.  We differ from numpy, which claims that None is not
      # scalar; see np.isscalar
      if val is None or val is C_NA:
          return True

      # C-optimized positive checks.
      # PyArray_IsAnyScalar is always False for bytearrays on Py3
      if cnp.PyArray_IsAnyScalar(val):
          return True
      if PyDate_Check(val) or PyDelta_Check(val) or PyTime_Check(val):
          return True

      # C-optimized negative check to rule out common non-scalars before the
      # slower fallbacks below.
      if PySequence_Check(val):
          # e.g. list, tuple
          # includes np.ndarray, Series which PyNumber_Check can return True for
          return False

      # Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number
      return (
          PyNumber_Check(val)
          or is_period_object(val)
          or is_interval(val)
          or is_offset_object(val)
      )
  cdef int64_t get_itemsize(object val):
      """
      Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.

      Parameters
      ----------
      val : object

      Returns
      -------
      int64_t
          ``itemsize`` of the descriptor obtained from ``val`` when
          ``PyArray_CheckScalar(val)`` is true, otherwise -1.
      """
      if PyArray_CheckScalar(val):
          return cnp.PyArray_DescrFromScalar(val).itemsize
      else:
          return -1
  def is_iterator(obj: object) -> bool:
      """
      Check if the object is an iterator.

      This is intended for generators, not list-like objects.  The answer
      comes straight from the C-level ``PyIter_Check``.

      Parameters
      ----------
      obj : The object to check

      Returns
      -------
      is_iter : bool
          Whether `obj` is an iterator.

      Examples
      --------
      >>> import datetime
      >>> from pandas.api.types import is_iterator
      >>> is_iterator((x for x in []))
      True
      >>> is_iterator([1, 2, 3])
      False
      >>> is_iterator(datetime.datetime(2017, 1, 1))
      False
      >>> is_iterator("foo")
      False
      >>> is_iterator(1)
      False
      """
      return PyIter_Check(obj)
  def item_from_zerodim(val: object) -> object:
      """
      Unwrap a zero-dimensional array; pass anything else through unchanged.

      Parameters
      ----------
      val : object

      Returns
      -------
      object
          The scalar held by ``val`` when it is a 0-dim array, else ``val``.

      Examples
      --------
      >>> item_from_zerodim(1)
      1
      >>> item_from_zerodim('foobar')
      'foobar'
      >>> item_from_zerodim(np.array(1))
      1
      >>> item_from_zerodim(np.array([1]))
      array([1])
      """
      if not cnp.PyArray_IsZeroDim(val):
          # not a 0-dim array: nothing to unwrap
          return val
      return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val)
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list:
      """
      Collect the distinct values found across several lists.

      Parameters
      ----------
      lists : list of list
          Lists whose (hashable) elements are merged.
      sort : bool or None, default True
          When truthy, try to sort the result in place.

      Returns
      -------
      list
          Distinct values in first-seen order, sorted when ``sort`` is truthy
          and the values can be compared with each other.
      """
      cdef:
          list chunk
          Py_ssize_t outer, inner, chunk_len
          Py_ssize_t n_lists = len(lists)
          list ordered = []
          set seen = set()
          object item

      for outer in range(n_lists):
          chunk = lists[outer]
          chunk_len = len(chunk)
          for inner in range(chunk_len):
              item = chunk[inner]
              if item in seen:
                  continue
              seen.add(item)
              ordered.append(item)

      if sort:
          try:
              ordered.sort()
          except TypeError:
              # values are not mutually orderable: return the list as
              # ``sort()`` left it rather than raising
              pass

      return ordered
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
      """
      Build a list of the distinct values yielded by an iterable of lists.

      Parameters
      ----------
      gen : generator object
          Yields the lists whose elements are merged.
      sort : bool
          Whether to try sorting the merged result.

      Returns
      -------
      list of unique values
          First-seen order, sorted when ``sort`` is set and the values are
          mutually comparable.
      """
      cdef:
          list chunk
          Py_ssize_t inner, chunk_len
          list ordered = []
          set seen = set()
          object item

      for chunk in gen:
          chunk_len = len(chunk)
          for inner in range(chunk_len):
              item = chunk[inner]
              if item in seen:
                  continue
              seen.add(item)
              ordered.append(item)

      if sort:
          try:
              ordered.sort()
          except TypeError:
              # values are not mutually orderable: return the list as
              # ``sort()`` left it rather than raising
              pass

      return ordered
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def dicts_to_array(dicts: list, columns: list):
      """
      Lay out a list of dicts as a 2-D object array.

      Parameters
      ----------
      dicts : list of dict
          One dict per output row.
      columns : list
          Keys to look up, one per output column.

      Returns
      -------
      np.ndarray[object, ndim=2]
          Shape ``(len(dicts), len(columns))``; cells whose key is absent
          from the row's dict hold ``np.nan``.
      """
      cdef:
          Py_ssize_t r, c
          Py_ssize_t n_cols, n_rows
          ndarray[object, ndim=2] out
          dict record
          object key, missing = np.nan

      n_cols = len(columns)
      n_rows = len(dicts)

      out = np.empty((n_rows, n_cols), dtype="O")

      for r in range(n_rows):
          record = dicts[r]
          for c in range(n_cols):
              key = columns[c]
              out[r, c] = record[key] if key in record else missing

      return out
  def fast_zip(list ndarrays) -> ndarray[object]:
      """
      For zipping multiple ndarrays into an ndarray of tuples.

      Parameters
      ----------
      ndarrays : list
          Arrays to combine.  ``ndarrays[0]`` is read unconditionally, so the
          list must not be empty.

      Returns
      -------
      np.ndarray[object]
          1-D array of length ``len(ndarrays[0])``; entry ``i`` is a tuple
          holding the i-th element (in flat-iterator order) of every input.

      Raises
      ------
      ValueError
          If an array after the first has a different length than the first.
      """
      cdef:
          Py_ssize_t i, j, k, n
          ndarray[object, ndim=1] result
          flatiter it
          object val, tup

      k = len(ndarrays)
      n = len(ndarrays[0])

      result = np.empty(n, dtype=object)

      # initialize tuples on first pass
      arr = ndarrays[0]
      it = <flatiter>PyArray_IterNew(arr)
      for i in range(n):
          val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
          tup = PyTuple_New(k)

          # PyTuple_SET_ITEM steals a reference, so it is paired with an
          # explicit INCREF to keep ``val`` alive
          PyTuple_SET_ITEM(tup, 0, val)
          Py_INCREF(val)
          result[i] = tup
          PyArray_ITER_NEXT(it)

      # fill slot j of every tuple from the j-th array
      for j in range(1, k):
          arr = ndarrays[j]
          it = <flatiter>PyArray_IterNew(arr)
          if len(arr) != n:
              raise ValueError("all arrays must be same length")

          for i in range(n):
              val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
              PyTuple_SET_ITEM(result[i], j, val)
              Py_INCREF(val)
              PyArray_ITER_NEXT(it)

      return result
  def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
      """
      Reverse indexing operation.

      Given `indexer`, make `indexer_inv` of it, such that::

          indexer_inv[indexer[x]] = x

      Parameters
      ----------
      indexer : np.ndarray[np.intp]
          Entries equal to -1 are skipped.
      length : int
          Size of the returned array.

      Returns
      -------
      np.ndarray[np.intp]
          Targets that no entry of `indexer` points at hold -1.

      Notes
      -----
      If indexer is not unique, the last occurrence wins: every later
      position overwrites the value stored for the same target.
      """
      cdef:
          Py_ssize_t i, n = len(indexer)
          ndarray[intp_t, ndim=1] rev_indexer
          intp_t idx

      rev_indexer = np.empty(length, dtype=np.intp)
      # -1 marks "no source position maps here"
      rev_indexer[:] = -1
      for i in range(n):
          idx = indexer[i]
          if idx != -1:
              rev_indexer[idx] = i

      return rev_indexer
  @cython.wraparound(False)
  @cython.boundscheck(False)
  # TODO(cython3): Can add const once cython#1772 is resolved
  def has_infs(floating[:] arr) -> bool:
      """
      Report whether a float buffer contains ``+inf`` or ``-inf``.

      Parameters
      ----------
      arr : floating[:]
          One-dimensional buffer to scan.

      Returns
      -------
      bool
          True as soon as an infinite value is found, False otherwise
          (including for an empty buffer).
      """
      cdef:
          Py_ssize_t pos, count = len(arr)
          floating pos_inf, neg_inf, item
          bint found = False

      pos_inf = np.inf
      neg_inf = -pos_inf

      with nogil:
          for pos in range(count):
              item = arr[pos]
              if item == pos_inf or item == neg_inf:
                  found = True
                  break
      return found
  def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
      """
      Turn an evenly spaced run of positions into the equivalent slice.

      Parameters
      ----------
      indices : np.ndarray[np.intp]
      max_len : int
          Exclusive upper bound that the first and last position must respect.

      Returns
      -------
      slice or np.ndarray
          A slice when the positions form an arithmetic progression with a
          non-zero step and both ends lie in ``[0, max_len)``; otherwise
          `indices` itself is handed back unchanged.
      """
      cdef:
          Py_ssize_t pos, count = len(indices)
          intp_t step, first, last, cur

      if count == 0:
          return slice(0, 0)

      first = indices[0]
      if first < 0 or max_len <= first:
          return indices

      if count == 1:
          return slice(first, <intp_t>(first + 1))

      last = indices[count - 1]
      if last < 0 or max_len <= last:
          return indices

      step = indices[1] - indices[0]
      if step == 0:
          return indices

      # every consecutive pair must share the same stride
      for pos in range(2, count):
          cur = indices[pos]
          if cur - indices[pos - 1] != step:
              return indices

      if step > 0:
          return slice(first, <intp_t>(last + 1), step)
      if last == 0:
          # a descending run that reaches 0 needs an open stop; -1 would wrap
          return slice(first, None, step)
      return slice(first, <intp_t>(last - 1), step)
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def maybe_booleans_to_slice(ndarray[uint8_t, ndim=1] mask):
      """
      Turn a mask with a single contiguous run of truthy values into a slice.

      Parameters
      ----------
      mask : np.ndarray[np.uint8]

      Returns
      -------
      slice or np.ndarray[bool]
          ``slice(0, 0)`` when nothing is set, a slice covering the run when
          there is exactly one, and ``mask.view(np.bool_)`` when a second run
          is encountered.
      """
      cdef:
          Py_ssize_t pos, count = len(mask)
          Py_ssize_t run_start = 0, run_stop = 0
          bint seen_true = False, run_closed = False

      for pos in range(count):
          if mask[pos]:
              if run_closed:
                  # second run of truthy values: not expressible as one slice
                  return mask.view(np.bool_)
              if not seen_true:
                  seen_true = True
                  run_start = pos
          elif seen_true and not run_closed:
              # first falsy value after the run marks its (exclusive) end
              run_stop = pos
              run_closed = True

      if not seen_true:
          return slice(0, 0)
      if run_closed:
          return slice(run_start, run_stop)
      return slice(run_start, None)
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def array_equivalent_object(ndarray left, ndarray right) -> bool:
      """
      Perform an element by element comparison on N-d object arrays
      taking into account nan positions.

      Parameters
      ----------
      left, right : np.ndarray
          Object-dtype arrays.  The caller is responsible for checking that
          their shapes match.

      Returns
      -------
      bool
          False at the first pair of elements found to differ, True if the
          whole iteration completes.

      Raises
      ------
      ValueError, TypeError
          Re-raised from an element comparison when none of the fallback
          checks in the ``except`` branch settles the answer.
      """
      # left and right both have object dtype, but we cannot annotate that
      #  without limiting ndim.
      cdef:
          Py_ssize_t i, n = left.size
          object x, y
          cnp.broadcast mi = cnp.PyArray_MultiIterNew2(left, right)

      # Caller is responsible for checking left.shape == right.shape

      for i in range(n):
          # Analogous to: x = left[i]
          x = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 0))[0]
          y = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

          # we are either not equal or both nan
          # I think None == None will be true here
          try:
              if PyArray_Check(x) and PyArray_Check(y):
                  # nested arrays: compare shape first, then contents
                  if x.shape != y.shape:
                      return False
                  if x.dtype == y.dtype == object:
                      # both nested arrays are object dtype: recurse
                      if not array_equivalent_object(x, y):
                          return False
                  else:
                      # Circular import isn't great, but so it goes.
                      # TODO: could use np.array_equal?
                      from pandas.core.dtypes.missing import array_equivalent

                      if not array_equivalent(x, y):
                          return False

              elif (x is C_NA) ^ (y is C_NA):
                  # exactly one side is C_NA
                  return False
              elif not (
                  PyObject_RichCompareBool(x, y, Py_EQ)
                  or is_matching_na(x, y, nan_matches_none=True)
              ):
                  return False
          except (ValueError, TypeError):
              # Avoid raising ValueError when comparing Numpy arrays to other types
              if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y):
                  # Only compare scalars to scalars and non-scalars to non-scalars
                  return False
              elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y))
                    and not (isinstance(x, type(y)) or isinstance(y, type(x)))):
                  # Check if non-scalars have the same type
                  return False
              elif check_na_tuples_nonequal(x, y):
                  # We have tuples where one side has a NA and the other side does not
                  # Only condition we may end up with a TypeError
                  return False
              # none of the fallbacks applied: propagate the original error
              raise

          # advance both operands to the next element
          cnp.PyArray_MultiIter_NEXT(mi)

      return True
  ctypedef fused int6432_t:
      int64_t
      int32_t


  @cython.wraparound(False)
  @cython.boundscheck(False)
  def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool:
      """
      Check whether a 1-d integer indexer is exactly ``0, 1, ..., n - 1``.

      Parameters
      ----------
      left : np.ndarray[int64 or int32]
      n : int
          Expected length.

      Returns
      -------
      bool
          False when the size differs from ``n`` or any position ``i`` holds
          a value other than ``i``; True otherwise.
      """
      cdef:
          Py_ssize_t pos = 0

      if left.size != n:
          return False

      while pos < n:
          if left[pos] != pos:
              return False
          pos += 1
      return True
  ctypedef fused ndarr_object:
      ndarray[object, ndim=1]
      ndarray[object, ndim=2]

  # TODO: stop using this helper in StringArray and route that code through
  #  ensure_string_array instead


  @cython.wraparound(False)
  @cython.boundscheck(False)
  def convert_nans_to_NA(ndarr_object arr) -> ndarray:
      """
      Helper for StringArray that converts null values that
      are not pd.NA (e.g. np.nan, None) to pd.NA. Assumes elements
      have already been validated as null.

      Parameters
      ----------
      arr : np.ndarray[object], 1-D or 2-D
          Every element that is not a ``str`` is treated as a null.

      Returns
      -------
      np.ndarray[object]
          ``np.asarray(arr, dtype="object")`` with each non-``str`` element
          replaced by ``C_NA``.
          NOTE(review): ``np.asarray`` normally returns an object-dtype input
          as-is, in which case `arr` itself is modified - confirm callers
          expect that.
      """
      cdef:
          # ``j`` must be declared too: left untyped it is a Python object,
          # so the inner 2-D loop is not compiled to a plain C loop.
          Py_ssize_t i, j, m, n
          object val
          ndarr_object result

      result = np.asarray(arr, dtype="object")

      if arr.ndim == 2:
          m, n = arr.shape[0], arr.shape[1]
          for i in range(m):
              for j in range(n):
                  val = arr[i, j]
                  if not isinstance(val, str):
                      result[i, j] = <object>C_NA
      else:
          n = len(arr)
          for i in range(n):
              val = arr[i]
              if not isinstance(val, str):
                  result[i] = <object>C_NA

      return result
  @cython.wraparound(False)
  @cython.boundscheck(False)
  cpdef ndarray[object] ensure_string_array(
          arr,
          object na_value=np.nan,
          bint convert_na_value=True,
          bint copy=True,
          bint skipna=True,
  ):
      """
      Returns a new numpy array with object dtype and only strings and na values.

      Parameters
      ----------
      arr : array-like
          The values to be converted to str, if needed.  Its length is taken
          with ``len`` before any conversion happens.
      na_value : Any, default np.nan
          The value to use for na. For example, np.nan or pd.NA.
      convert_na_value : bool, default True
          If False, existing na values will be used unchanged in the new array.
      copy : bool, default True
          Whether to ensure that a new array is returned.
      skipna : bool, default True
          Whether or not to coerce nulls to their stringified form
          (e.g. if False, NaN becomes 'nan').

      Returns
      -------
      np.ndarray[object]
          An array with the input array's elements casted to str or nan-like.
      """
      cdef:
          Py_ssize_t i = 0, n = len(arr)
          # False only while ``result`` is still the caller's own array
          bint already_copied = True

      if hasattr(arr, "to_numpy"):

          if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]:
              # dtype check to exclude DataFrame
              # GH#41409 TODO: not a great place for this
              out = arr.astype(str).astype(object)
              out[arr.isna()] = na_value
              return out
          arr = arr.to_numpy()
      elif not util.is_array(arr):
          arr = np.array(arr, dtype="object")

      result = np.asarray(arr, dtype="object")

      if copy and result is arr:
          result = result.copy()
      elif not copy and result is arr:
          # defer the copy until an element actually has to be rewritten
          already_copied = False

      if issubclass(arr.dtype.type, np.str_):
          # short-circuit, all elements are str
          return result

      for i in range(n):
          val = arr[i]

          if isinstance(val, str):
              continue

          elif not already_copied:
              # first non-str element: detach from the caller's array before
              # writing into ``result``
              result = result.copy()
              already_copied = True

          if not checknull(val):
              if isinstance(val, bytes):
                  # GH#49658 discussion of desired behavior here
                  result[i] = val.decode()
              elif not util.is_float_object(val):
                  # f"{val}" is faster than str(val)
                  result[i] = f"{val}"
              else:
                  # f"{val}" is not always equivalent to str(val) for floats
                  result[i] = str(val)
          else:
              if convert_na_value:
                  val = na_value
              if skipna:
                  result[i] = val
              else:
                  result[i] = f"{val}"

      return result
  def is_all_arraylike(obj: list) -> bool:
      """
      Should we treat these as levels of a MultiIndex, as opposed to Index items?

      Parameters
      ----------
      obj : list

      Returns
      -------
      bool
          True when every element is a list, an ndarray, or has a ``_data``
          attribute (also True for an empty list); False otherwise.
      """
      cdef:
          Py_ssize_t pos, count = len(obj)
          object item

      for pos in range(count):
          item = obj[pos]
          if isinstance(item, list) or util.is_array(item) or hasattr(item, "_data"):
              continue
          # TODO: EA?
          # exclude tuples, frozensets as they may be contained in an Index
          return False

      return True
  674. # ------------------------------------------------------------------------------
  675. # Groupby-related functions
  676. # TODO: could do even better if we know something about the data. eg, index has
  677. # 1-min data, binner has 5-min data, then bins are just strides in index. This
  678. # is a general, O(max(len(values), len(binner))) method.
  @cython.boundscheck(False)
  @cython.wraparound(False)
  def generate_bins_dt64(ndarray[int64_t, ndim=1] values, const int64_t[:] binner,
                         object closed="left", bint hasnans=False):
      """
      Int64 (datetime64) version of generic python version in ``groupby.py``.

      Parameters
      ----------
      values : np.ndarray[np.int64]
          Scanned once from left to right against the bin edges.
      binner : np.ndarray[np.int64]
          Bin edges.
      closed : {"left", "right"}, default "left"
          With "right", a value equal to a bin's right edge stays in that bin.
      hasnans : bool, default False
          When set, entries equal to ``NPY_NAT`` are dropped from `values`
          before binning and accounted for in the result.

      Returns
      -------
      np.ndarray[np.int64]
          For each bin, the number of values consumed up to and including it.
          When NaT entries were removed, every count is shifted by their
          number and that number is inserted at the front.

      Raises
      ------
      ValueError
          If `values` (after NaT removal) or `binner` is empty, or if the
          first/last value lies outside the first/last edge.
      """
      cdef:
          Py_ssize_t n_values, n_edges, edge, cursor
          ndarray[int64_t, ndim=1] bins
          int64_t right_edge, nat_count
          bint right_closed = closed == "right"

      nat_count = 0
      if hasnans:
          mask = values == NPY_NAT
          nat_count = np.sum(mask)
          values = values[~mask]

      n_values = len(values)
      n_edges = len(binner)

      if n_values <= 0 or n_edges <= 0:
          raise ValueError("Invalid length for values or for binner")

      # check binner fits data
      if values[0] < binner[0]:
          raise ValueError("Values falls before first bin")

      if values[n_values - 1] > binner[n_edges - 1]:
          raise ValueError("Values falls after last bin")

      bins = np.empty(n_edges - 1, dtype=np.int64)

      cursor = 0  # index into values

      # linear scan; the two loops differ only in the edge comparison
      if right_closed:
          for edge in range(n_edges - 1):
              right_edge = binner[edge + 1]
              # consume values belonging to this bin, then record the position
              while cursor < n_values and values[cursor] <= right_edge:
                  cursor += 1
              bins[edge] = cursor
      else:
          for edge in range(n_edges - 1):
              right_edge = binner[edge + 1]
              # consume values belonging to this bin, then record the position
              while cursor < n_values and values[cursor] < right_edge:
                  cursor += 1
              bins[edge] = cursor

      if nat_count > 0:
          # shift bins by the number of NaT
          bins = bins + nat_count
          bins = np.insert(bins, 0, nat_count)

      return bins
  @cython.boundscheck(False)
  @cython.wraparound(False)
  def get_level_sorter(
      ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
  ) -> ndarray:
      """
      Argsort for a single level of a multi-index, keeping the order of higher
      levels unchanged. `starts` points to starts of same-key indices w.r.t
      to leading levels; equivalent to:
          np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
              + starts[i] for i in range(len(starts) - 1)])

      Parameters
      ----------
      codes : np.ndarray[int64_t, ndim=1]
      starts : np.ndarray[intp, ndim=1]

      Returns
      -------
      np.ndarray[np.intp, ndim=1]
          Same shape as `codes`.  The array is allocated uninitialized, so
          only positions covered by a ``starts[i]:starts[i+1]`` segment
          carry meaningful values.
      """
      cdef:
          Py_ssize_t seg, lo, hi
          ndarray[intp_t, ndim=1] out = cnp.PyArray_EMPTY(1, codes.shape, cnp.NPY_INTP, 0)

      for seg in range(len(starts) - 1):
          lo = starts[seg]
          hi = starts[seg + 1]
          # stable sort inside the segment, then shift back to absolute positions
          out[lo:hi] = lo + codes[lo:hi].argsort(kind="mergesort")

      return out
  @cython.boundscheck(False)
  @cython.wraparound(False)
  def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                     const intp_t[:] labels,
                     Py_ssize_t max_bin,
                     ):
      """
      Count, per row, how many set mask cells fall under each label.

      Parameters
      ----------
      mask : np.ndarray[uint8 or bool, ndim=2]
      labels : np.ndarray[np.intp]
          ``labels[j]`` selects the output column that column ``j`` of `mask`
          contributes to.
      max_bin : int
          Number of output columns.

      Returns
      -------
      np.ndarray[np.int64, ndim=2]
          Shape ``(mask.shape[0], max_bin)``.
      """
      cdef:
          Py_ssize_t row, col, n_rows, n_cols
          ndarray[int64_t, ndim=2] counts

      n_rows, n_cols = (<object>mask).shape

      counts = np.zeros((n_rows, max_bin), dtype="i8")
      with nogil:
          for row in range(n_rows):
              for col in range(n_cols):
                  if not mask[row, col]:
                      continue
                  counts[row, labels[col]] += 1

      return counts
  @cython.wraparound(False)
  @cython.boundscheck(False)
  def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
      """
      Compute start/end offsets of each run of equal labels.

      Parameters
      ----------
      labels : np.ndarray[np.intp]
          Negative labels are skipped over and belong to no group.
      ngroups : int
          Length of the two returned arrays.

      Returns
      -------
      tuple of np.ndarray[np.int64]
          ``(starts, ends)``; for a label ``g`` that occurs, its latest run
          spans ``starts[g]:ends[g]``.  Labels that never occur keep 0 in
          both arrays.
      """
      cdef:
          Py_ssize_t pos, run_len, count, offset
          intp_t label
          int64_t[::1] starts, ends

      count = len(labels)

      starts = np.zeros(ngroups, dtype=np.int64)
      ends = np.zeros(ngroups, dtype=np.int64)

      offset = 0
      run_len = 0
      with nogil:
          for pos in range(count):
              label = labels[pos]
              if label < 0:
                  offset += 1
                  continue
              run_len += 1
              # a run ends at the last element or where the next label differs
              if pos == count - 1 or label != labels[pos + 1]:
                  starts[label] = offset
                  ends[label] = offset + run_len
                  offset += run_len
                  run_len = 0

      return np.asarray(starts), np.asarray(ends)
  def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys,
                   list sorted_labels) -> dict:
      """
      Map each group key to the slice of `index` covered by its run of labels.

      Parameters
      ----------
      index : ndarray[intp]
      labels : ndarray[int64]
          Scanned for runs of equal values; -1 entries at the front are
          skipped.
      keys : list
      sorted_labels : list[ndarray[int64]]

      Returns
      -------
      dict
          Key -> slice of `index`.  With a single entry in `keys` the dict
          key is the plain value, otherwise a tuple with one value per entry.
          Empty when every label is -1.
      """
      cdef:
          Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
          dict result = {}
          object tup

      k = len(keys)

      # Start at the first non-null entry
      j = 0
      for j in range(0, n):
          if labels[j] != -1:
              break
      else:
          # loop ran to completion: no label other than -1 was found
          return result
      cur = labels[j]
      start = j

      for i in range(j+1, n):
          lab = labels[i]

          if lab != cur:
              if lab != -1:
                  # the run ending at i - 1 is complete: build its key
                  if k == 1:
                      # When k = 1 we do not want to return a tuple as key
                      tup = keys[0][sorted_labels[0][i - 1]]
                  else:
                      tup = PyTuple_New(k)
                      for j in range(k):
                          val = keys[j][sorted_labels[j][i - 1]]
                          # SET_ITEM steals a reference, hence the INCREF
                          PyTuple_SET_ITEM(tup, j, val)
                          Py_INCREF(val)
                  result[tup] = index[start:i]
              start = i
          cur = lab

      # flush the final run, which extends to the end of `index`
      if k == 1:
          # When k = 1 we do not want to return a tuple as key
          tup = keys[0][sorted_labels[0][n - 1]]
      else:
          tup = PyTuple_New(k)
          for j in range(k):
              val = keys[j][sorted_labels[j][n - 1]]
              PyTuple_SET_ITEM(tup, j, val)
              Py_INCREF(val)
      result[tup] = index[start:]

      return result
  849. # core.common import for fast inference checks
  def is_float(obj: object) -> bool:
      """
      Return True if given object is float.

      Python-visible wrapper that delegates to ``util.is_float_object``.

      Parameters
      ----------
      obj : object
          Object to check.

      Returns
      -------
      bool
      """
      return util.is_float_object(obj)
  def is_integer(obj: object) -> bool:
      """
      Return True if given object is integer.

      Python-visible wrapper that delegates to ``util.is_integer_object``.

      Parameters
      ----------
      obj : object
          Object to check.

      Returns
      -------
      bool
      """
      return util.is_integer_object(obj)
  def is_bool(obj: object) -> bool:
      """
      Return True if given object is boolean.

      Python-visible wrapper that delegates to ``util.is_bool_object``.

      Parameters
      ----------
      obj : object
          Object to check.

      Returns
      -------
      bool
      """
      return util.is_bool_object(obj)
  def is_complex(obj: object) -> bool:
      """
      Return True if given object is complex.

      Python-visible wrapper that delegates to ``util.is_complex_object``.

      Parameters
      ----------
      obj : object
          Object to check.

      Returns
      -------
      bool
      """
      return util.is_complex_object(obj)
  cpdef bint is_decimal(object obj):
      """
      Return True if given object is an instance of ``decimal.Decimal``.
      """
      return isinstance(obj, Decimal)
  cpdef bint is_interval(object obj):
      """
      Return True if ``obj._typ`` equals ``"interval"``.

      The check is attribute-based rather than an ``isinstance`` test.  The
      ``getattr`` default is the string ``"_typ"``, which never equals
      ``"interval"``, so objects without the attribute give False.
      """
      return getattr(obj, "_typ", "_typ") == "interval"
  def is_period(val: object) -> bool:
      """
      Return True if given object is Period.

      Python-visible wrapper that delegates to ``is_period_object``.

      Parameters
      ----------
      val : object
          Object to check.

      Returns
      -------
      bool
      """
      return is_period_object(val)
  def is_list_like(obj: object, allow_sets: bool = True) -> bool:
      """
      Check if the object is list-like.

      Objects that are considered list-like are for example Python
      lists, tuples, sets, NumPy arrays, and Pandas Series.

      Strings and datetime objects, however, are not considered list-like.

      Parameters
      ----------
      obj : object
          Object to check.
      allow_sets : bool, default True
          If this parameter is False, sets will not be considered list-like.

      Returns
      -------
      bool
          Whether `obj` has list-like properties.

      Examples
      --------
      >>> import datetime
      >>> from pandas.api.types import is_list_like
      >>> is_list_like([1, 2, 3])
      True
      >>> is_list_like({1, 2, 3})
      True
      >>> is_list_like(datetime.datetime(2017, 1, 1))
      False
      >>> is_list_like("foo")
      False
      >>> is_list_like(1)
      False
      >>> is_list_like(np.array([2]))
      True
      >>> is_list_like(np.array(2))
      False
      """
      # the actual logic lives in the C-level helper
      return c_is_list_like(obj, allow_sets)
  cdef bint c_is_list_like(object obj, bint allow_sets) except -1:
      """
      C-level implementation behind ``is_list_like``.

      ndarrays are list-like unless zero-dimensional and lists always are.
      Anything else must expose ``__iter__`` without being a class, a
      ``str``/``bytes``, a typing generic alias, a zero-dimensional
      duck-array, or (when ``allow_sets`` is False) an ``abc.Set``.
      """
      # first, performance short-cuts for the most common cases
      if util.is_array(obj):
          # exclude zero-dimensional numpy arrays, effectively scalars
          return not cnp.PyArray_IsZeroDim(obj)
      if isinstance(obj, list):
          return True

      # then the generic implementation, tested in the same order as a
      # short-circuiting ``and`` chain
      # equiv: `isinstance(obj, abc.Iterable)`
      if getattr(obj, "__iter__", None) is None or isinstance(obj, type):
          return False
      # we do not count strings/unicode/bytes as list-like
      # exclude Generic types that have __iter__
      if isinstance(obj, (str, bytes, _GenericAlias)):
          return False
      # exclude zero-dimensional duck-arrays, effectively scalars
      if hasattr(obj, "ndim") and obj.ndim == 0:
          return False
      # exclude sets if allow_sets is False
      if allow_sets is False and isinstance(obj, abc.Set):
          return False
      return True
  # Lookup table from a dtype identifier to a coarse type-category string.
  # Keys are full dtype names ("int64"), one-letter codes ("i", "M"), or
  # type objects (str, Period).
  # NOTE(review): presumably consumed by the dtype-inference code later in
  # this file - not visible from here.
  _TYPE_MAP = {
      "categorical": "categorical",
      "category": "categorical",
      "int8": "integer",
      "int16": "integer",
      "int32": "integer",
      "int64": "integer",
      "i": "integer",
      "uint8": "integer",
      "uint16": "integer",
      "uint32": "integer",
      "uint64": "integer",
      "u": "integer",
      "float32": "floating",
      "float64": "floating",
      "f": "floating",
      "complex64": "complex",
      "complex128": "complex",
      "c": "complex",
      "string": "string",
      str: "string",
      "S": "bytes",
      "U": "string",
      "bool": "boolean",
      "b": "boolean",
      "datetime64[ns]": "datetime64",
      "M": "datetime64",
      "timedelta64[ns]": "timedelta64",
      "m": "timedelta64",
      "interval": "interval",
      Period: "period",
  }

  # types only exist on certain platform
  # (each probe touches the NumPy attribute and registers the name only if
  # the attribute lookup succeeds)
  try:
      np.float128
      _TYPE_MAP["float128"] = "floating"
  except AttributeError:
      pass
  try:
      np.complex256
      _TYPE_MAP["complex256"] = "complex"
  except AttributeError:
      pass
  try:
      np.float16
      _TYPE_MAP["float16"] = "floating"
  except AttributeError:
      pass
  997. @cython.internal
  998. cdef class Seen:
  999. """
  1000. Class for keeping track of the types of elements
  1001. encountered when trying to perform type conversions.
  1002. """
  1003. cdef:
  1004. bint int_ # seen_int
  1005. bint nat_ # seen nat
  1006. bint bool_ # seen_bool
  1007. bint null_ # seen_null
  1008. bint nan_ # seen_np.nan
  1009. bint uint_ # seen_uint (unsigned integer)
  1010. bint sint_ # seen_sint (signed integer)
  1011. bint float_ # seen_float
  1012. bint object_ # seen_object
  1013. bint complex_ # seen_complex
  1014. bint datetime_ # seen_datetime
  1015. bint coerce_numeric # coerce data to numeric
  1016. bint timedelta_ # seen_timedelta
  1017. bint datetimetz_ # seen_datetimetz
  1018. bint period_ # seen_period
  1019. bint interval_ # seen_interval
  1020. def __cinit__(self, bint coerce_numeric=False):
  1021. """
  1022. Initialize a Seen instance.
  1023. Parameters
  1024. ----------
  1025. coerce_numeric : bool, default False
  1026. Whether or not to force conversion to a numeric data type if
  1027. initial methods to convert to numeric fail.
  1028. """
  1029. self.int_ = False
  1030. self.nat_ = False
  1031. self.bool_ = False
  1032. self.null_ = False
  1033. self.nan_ = False
  1034. self.uint_ = False
  1035. self.sint_ = False
  1036. self.float_ = False
  1037. self.object_ = False
  1038. self.complex_ = False
  1039. self.datetime_ = False
  1040. self.timedelta_ = False
  1041. self.datetimetz_ = False
  1042. self.period_ = False
  1043. self.interval_ = False
  1044. self.coerce_numeric = coerce_numeric
  1045. cdef bint check_uint64_conflict(self) except -1:
  1046. """
  1047. Check whether we can safely convert a uint64 array to a numeric dtype.
  1048. There are two cases when conversion to numeric dtype with a uint64
  1049. array is not safe (and will therefore not be performed)
  1050. 1) A NaN element is encountered.
  1051. uint64 cannot be safely cast to float64 due to truncation issues
  1052. at the extreme ends of the range.
  1053. 2) A negative number is encountered.
  1054. There is no numerical dtype that can hold both negative numbers
  1055. and numbers greater than INT64_MAX. Hence, at least one number
  1056. will be improperly cast if we convert to a numeric dtype.
  1057. Returns
  1058. -------
  1059. bool
  1060. Whether or not we should return the original input array to avoid
  1061. data truncation.
  1062. Raises
  1063. ------
  1064. ValueError
  1065. uint64 elements were detected, and at least one of the
  1066. two conflict cases was also detected. However, we are
  1067. trying to force conversion to a numeric dtype.
  1068. """
  1069. return (self.uint_ and (self.null_ or self.sint_)
  1070. and not self.coerce_numeric)
  1071. cdef saw_null(self):
  1072. """
  1073. Set flags indicating that a null value was encountered.
  1074. """
  1075. self.null_ = True
  1076. self.float_ = True
  1077. cdef saw_int(self, object val):
  1078. """
  1079. Set flags indicating that an integer value was encountered.
  1080. In addition to setting a flag that an integer was seen, we
  1081. also set two flags depending on the type of integer seen:
  1082. 1) sint_ : a signed numpy integer type or a negative (signed) number in the
  1083. range of [-2**63, 0) was encountered
  1084. 2) uint_ : an unsigned numpy integer type or a positive number in the range of
  1085. [2**63, 2**64) was encountered
  1086. Parameters
  1087. ----------
  1088. val : Python int
  1089. Value with which to set the flags.
  1090. """
  1091. self.int_ = True
  1092. self.sint_ = (
  1093. self.sint_
  1094. or (oINT64_MIN <= val < 0)
  1095. # Cython equivalent of `isinstance(val, np.signedinteger)`
  1096. or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type)
  1097. )
  1098. self.uint_ = (
  1099. self.uint_
  1100. or (oINT64_MAX < val <= oUINT64_MAX)
  1101. # Cython equivalent of `isinstance(val, np.unsignedinteger)`
  1102. or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type)
  1103. )
  1104. @property
  1105. def numeric_(self):
  1106. return self.complex_ or self.float_ or self.int_
  1107. @property
  1108. def is_bool(self):
  1109. # i.e. not (anything but bool)
  1110. return self.is_bool_or_na and not (self.nan_ or self.null_)
  1111. @property
  1112. def is_bool_or_na(self):
  1113. # i.e. not (anything but bool or missing values)
  1114. return self.bool_ and not (
  1115. self.datetime_ or self.datetimetz_ or self.nat_ or self.timedelta_
  1116. or self.period_ or self.interval_ or self.numeric_ or self.object_
  1117. )
  1118. cdef object _try_infer_map(object dtype):
  1119. """
  1120. If its in our map, just return the dtype.
  1121. """
  1122. cdef:
  1123. object val
  1124. str attr
  1125. for attr in ["kind", "name", "base", "type"]:
  1126. val = getattr(dtype, attr, None)
  1127. if val in _TYPE_MAP:
  1128. return _TYPE_MAP[val]
  1129. return None
def infer_dtype(value: object, skipna: bool = True) -> str:
    """
    Return a string label of the type of a scalar or list-like of values.

    Parameters
    ----------
    value : scalar, list, ndarray, or pandas type
    skipna : bool, default True
        Ignore NaN values when inferring the type.

    Returns
    -------
    str
        Describing the common type of the input data.
    Results can include:

    - string
    - bytes
    - floating
    - integer
    - integer-na
    - mixed-integer
    - mixed-integer-float
    - decimal
    - complex
    - categorical
    - boolean
    - datetime64
    - datetime
    - date
    - timedelta64
    - timedelta
    - time
    - period
    - interval
    - mixed
    - unknown-array
    - empty

    Raises
    ------
    TypeError
        If ndarray-like but cannot infer the dtype

    Notes
    -----
    - 'mixed' is the catchall for anything that is not otherwise
      specialized
    - 'mixed-integer-float' are floats and integers
    - 'mixed-integer' are integers mixed with non-integers
    - 'integer-na' are integers mixed with float NaN values
    - 'unknown-array' is the catchall for something that *is* an array (has
      a dtype attribute), but has a dtype unknown to pandas (e.g. external
      extension array)
    - 'empty' is returned for zero-length input, and also when ``skipna``
      is True and every value is None/NaN/NA
    - 'datetime' is also returned when the only non-None/NaN/NA values
      encountered are ``NaT``

    Examples
    --------
    >>> import datetime
    >>> infer_dtype(['foo', 'bar'])
    'string'

    >>> infer_dtype(['a', np.nan, 'b'], skipna=True)
    'string'

    >>> infer_dtype(['a', np.nan, 'b'], skipna=False)
    'mixed'

    >>> infer_dtype([b'foo', b'bar'])
    'bytes'

    >>> infer_dtype([1, 2, 3])
    'integer'

    >>> infer_dtype([1, 2, 3.5])
    'mixed-integer-float'

    >>> infer_dtype([1.0, 2.0, 3.5])
    'floating'

    >>> infer_dtype(['a', 1])
    'mixed-integer'

    >>> infer_dtype([Decimal(1), Decimal(2.0)])
    'decimal'

    >>> infer_dtype([True, False])
    'boolean'

    >>> infer_dtype([True, False, np.nan])
    'boolean'

    >>> infer_dtype([pd.Timestamp('20130101')])
    'datetime'

    >>> infer_dtype([datetime.date(2013, 1, 1)])
    'date'

    >>> infer_dtype([np.datetime64('2013-01-01')])
    'datetime64'

    >>> infer_dtype([datetime.timedelta(0, 1, 1)])
    'timedelta'

    >>> infer_dtype(pd.Series(list('aabc')).astype('category'))
    'categorical'
    """
    cdef:
        Py_ssize_t i, n
        object val
        ndarray values
        bint seen_pdnat = False
        bint seen_val = False
        flatiter it

    # Step 1: normalize the input to an ndarray, returning early whenever
    # the dtype alone is enough to decide.
    if util.is_array(value):
        values = value
    elif hasattr(type(value), "inferred_type") and skipna is False:
        # Index, use the cached attribute if possible, populate the cache otherwise
        return value.inferred_type
    elif hasattr(value, "dtype"):
        inferred = _try_infer_map(value.dtype)
        if inferred is not None:
            return inferred
        elif not cnp.PyArray_DescrCheck(value.dtype):
            # dtype is not a numpy dtype instance and is not in _TYPE_MAP
            return "unknown-array"
        # Unwrap Series/Index
        values = np.asarray(value)
    else:
        if not isinstance(value, list):
            value = list(value)
        if not value:
            return "empty"

        # Imported here rather than at module level
        from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
        values = construct_1d_object_array_from_listlike(value)

    inferred = _try_infer_map(values.dtype)
    if inferred is not None:
        # Anything other than object-dtype should return here.
        return inferred

    if values.descr.type_num != NPY_OBJECT:
        # i.e. values.dtype != np.object_
        # This should not be reached
        values = values.astype(object)

    n = cnp.PyArray_SIZE(values)
    if n == 0:
        return "empty"

    # Step 2: find the first non-null value.
    # Iterate until we find our first valid value. We will use this
    #  value to decide which of the is_foo_array functions to call.
    it = PyArray_IterNew(values)
    for i in range(n):
        # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
        #  equivalents to `val = values[i]`
        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
        PyArray_ITER_NEXT(it)

        # do not use checknull to keep
        # np.datetime64('nat') and np.timedelta64('nat')
        if val is None or util.is_nan(val) or val is C_NA:
            pass
        elif val is NaT:
            seen_pdnat = True
        else:
            seen_val = True
            break

    # if all values are nan/NaT
    if seen_val is False and seen_pdnat is True:
        return "datetime"
        # float/object nan is handled in latter logic
    if seen_val is False and skipna:
        return "empty"

    # Step 3: dispatch on the type of ``val`` (the first valid value, or the
    # last element visited if none was valid) and confirm with the matching
    # whole-array check. A branch whose array check fails falls through to
    # the "mixed" handling at the bottom.
    if util.is_datetime64_object(val):
        if is_datetime64_array(values, skipna=skipna):
            return "datetime64"

    elif is_timedelta(val):
        if is_timedelta_or_timedelta64_array(values, skipna=skipna):
            return "timedelta"

    elif util.is_integer_object(val):
        # ordering matters here; this check must come after the is_timedelta
        #  check otherwise numpy timedelta64 objects would come through here

        if is_integer_array(values, skipna=skipna):
            return "integer"
        elif is_integer_float_array(values, skipna=skipna):
            if is_integer_na_array(values, skipna=skipna):
                return "integer-na"
            else:
                return "mixed-integer-float"
        return "mixed-integer"

    elif PyDateTime_Check(val):
        if is_datetime_array(values, skipna=skipna):
            return "datetime"
        elif is_date_array(values, skipna=skipna):
            return "date"

    elif PyDate_Check(val):
        if is_date_array(values, skipna=skipna):
            return "date"

    elif PyTime_Check(val):
        if is_time_array(values, skipna=skipna):
            return "time"

    elif is_decimal(val):
        if is_decimal_array(values, skipna=skipna):
            return "decimal"

    elif util.is_complex_object(val):
        # NOTE: skipna is not forwarded to the complex check
        if is_complex_array(values):
            return "complex"

    elif util.is_float_object(val):
        # NOTE: skipna is not forwarded to the float check
        if is_float_array(values):
            return "floating"
        elif is_integer_float_array(values, skipna=skipna):
            if is_integer_na_array(values, skipna=skipna):
                return "integer-na"
            else:
                return "mixed-integer-float"

    elif util.is_bool_object(val):
        if is_bool_array(values, skipna=skipna):
            return "boolean"

    elif isinstance(val, str):
        if is_string_array(values, skipna=skipna):
            return "string"

    elif isinstance(val, bytes):
        if is_bytes_array(values, skipna=skipna):
            return "bytes"

    elif is_period_object(val):
        if is_period_array(values, skipna=skipna):
            return "period"

    elif is_interval(val):
        if is_interval_array(values):
            return "interval"

    # Step 4: nothing matched uniformly. Rescan from the start; any integer
    # anywhere makes the result "mixed-integer", otherwise "mixed".
    cnp.PyArray_ITER_RESET(it)
    for i in range(n):
        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
        PyArray_ITER_NEXT(it)

        if util.is_integer_object(val):
            return "mixed-integer"

    return "mixed"
  1336. cdef bint is_timedelta(object o):
  1337. return PyDelta_Check(o) or util.is_timedelta64_object(o)
  1338. @cython.internal
  1339. cdef class Validator:
  1340. cdef:
  1341. Py_ssize_t n
  1342. dtype dtype
  1343. bint skipna
  1344. def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
  1345. bint skipna=False):
  1346. self.n = n
  1347. self.dtype = dtype
  1348. self.skipna = skipna
  1349. cdef bint validate(self, ndarray values) except -1:
  1350. if not self.n:
  1351. return False
  1352. if self.is_array_typed():
  1353. # i.e. this ndarray is already of the desired dtype
  1354. return True
  1355. elif self.dtype.type_num == NPY_OBJECT:
  1356. if self.skipna:
  1357. return self._validate_skipna(values)
  1358. else:
  1359. return self._validate(values)
  1360. else:
  1361. return False
  1362. @cython.wraparound(False)
  1363. @cython.boundscheck(False)
  1364. cdef bint _validate(self, ndarray values) except -1:
  1365. cdef:
  1366. Py_ssize_t i
  1367. Py_ssize_t n = values.size
  1368. flatiter it = PyArray_IterNew(values)
  1369. for i in range(n):
  1370. # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
  1371. # equivalents to `val = values[i]`
  1372. val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
  1373. PyArray_ITER_NEXT(it)
  1374. if not self.is_valid(val):
  1375. return False
  1376. return True
  1377. @cython.wraparound(False)
  1378. @cython.boundscheck(False)
  1379. cdef bint _validate_skipna(self, ndarray values) except -1:
  1380. cdef:
  1381. Py_ssize_t i
  1382. Py_ssize_t n = values.size
  1383. flatiter it = PyArray_IterNew(values)
  1384. for i in range(n):
  1385. # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
  1386. # equivalents to `val = values[i]`
  1387. val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
  1388. PyArray_ITER_NEXT(it)
  1389. if not self.is_valid_skipna(val):
  1390. return False
  1391. return True
  1392. cdef bint is_valid(self, object value) except -1:
  1393. return self.is_value_typed(value)
  1394. cdef bint is_valid_skipna(self, object value) except -1:
  1395. return self.is_valid(value) or self.is_valid_null(value)
  1396. cdef bint is_value_typed(self, object value) except -1:
  1397. raise NotImplementedError(f"{type(self).__name__} child class "
  1398. "must define is_value_typed")
  1399. cdef bint is_valid_null(self, object value) except -1:
  1400. return value is None or value is C_NA or util.is_nan(value)
  1401. # TODO: include decimal NA?
  1402. cdef bint is_array_typed(self) except -1:
  1403. return False
  1404. @cython.internal
  1405. cdef class BoolValidator(Validator):
  1406. cdef bint is_value_typed(self, object value) except -1:
  1407. return util.is_bool_object(value)
  1408. cdef bint is_array_typed(self) except -1:
  1409. return issubclass(self.dtype.type, np.bool_)
  1410. cpdef bint is_bool_array(ndarray values, bint skipna=False):
  1411. cdef:
  1412. BoolValidator validator = BoolValidator(len(values),
  1413. values.dtype,
  1414. skipna=skipna)
  1415. return validator.validate(values)
  1416. @cython.internal
  1417. cdef class IntegerValidator(Validator):
  1418. cdef bint is_value_typed(self, object value) except -1:
  1419. return util.is_integer_object(value)
  1420. cdef bint is_array_typed(self) except -1:
  1421. return issubclass(self.dtype.type, np.integer)
  1422. # Note: only python-exposed for tests
  1423. cpdef bint is_integer_array(ndarray values, bint skipna=True):
  1424. cdef:
  1425. IntegerValidator validator = IntegerValidator(len(values),
  1426. values.dtype,
  1427. skipna=skipna)
  1428. return validator.validate(values)
  1429. @cython.internal
  1430. cdef class IntegerNaValidator(Validator):
  1431. cdef bint is_value_typed(self, object value) except -1:
  1432. return (util.is_integer_object(value)
  1433. or (util.is_nan(value) and util.is_float_object(value)))
  1434. cdef bint is_integer_na_array(ndarray values, bint skipna=True):
  1435. cdef:
  1436. IntegerNaValidator validator = IntegerNaValidator(len(values),
  1437. values.dtype, skipna=skipna)
  1438. return validator.validate(values)
  1439. @cython.internal
  1440. cdef class IntegerFloatValidator(Validator):
  1441. cdef bint is_value_typed(self, object value) except -1:
  1442. return util.is_integer_object(value) or util.is_float_object(value)
  1443. cdef bint is_array_typed(self) except -1:
  1444. return issubclass(self.dtype.type, np.integer)
  1445. cdef bint is_integer_float_array(ndarray values, bint skipna=True):
  1446. cdef:
  1447. IntegerFloatValidator validator = IntegerFloatValidator(len(values),
  1448. values.dtype,
  1449. skipna=skipna)
  1450. return validator.validate(values)
  1451. @cython.internal
  1452. cdef class FloatValidator(Validator):
  1453. cdef bint is_value_typed(self, object value) except -1:
  1454. return util.is_float_object(value)
  1455. cdef bint is_array_typed(self) except -1:
  1456. return issubclass(self.dtype.type, np.floating)
  1457. # Note: only python-exposed for tests
  1458. cpdef bint is_float_array(ndarray values):
  1459. cdef:
  1460. FloatValidator validator = FloatValidator(len(values), values.dtype)
  1461. return validator.validate(values)
  1462. @cython.internal
  1463. cdef class ComplexValidator(Validator):
  1464. cdef bint is_value_typed(self, object value) except -1:
  1465. return (
  1466. util.is_complex_object(value)
  1467. or (util.is_float_object(value) and is_nan(value))
  1468. )
  1469. cdef bint is_array_typed(self) except -1:
  1470. return issubclass(self.dtype.type, np.complexfloating)
  1471. cdef bint is_complex_array(ndarray values):
  1472. cdef:
  1473. ComplexValidator validator = ComplexValidator(len(values), values.dtype)
  1474. return validator.validate(values)
  1475. @cython.internal
  1476. cdef class DecimalValidator(Validator):
  1477. cdef bint is_value_typed(self, object value) except -1:
  1478. return is_decimal(value)
  1479. cdef bint is_decimal_array(ndarray values, bint skipna=False):
  1480. cdef:
  1481. DecimalValidator validator = DecimalValidator(
  1482. len(values), values.dtype, skipna=skipna
  1483. )
  1484. return validator.validate(values)
  1485. @cython.internal
  1486. cdef class StringValidator(Validator):
  1487. cdef bint is_value_typed(self, object value) except -1:
  1488. return isinstance(value, str)
  1489. cdef bint is_array_typed(self) except -1:
  1490. return issubclass(self.dtype.type, np.str_)
  1491. cpdef bint is_string_array(ndarray values, bint skipna=False):
  1492. cdef:
  1493. StringValidator validator = StringValidator(len(values),
  1494. values.dtype,
  1495. skipna=skipna)
  1496. return validator.validate(values)
  1497. @cython.internal
  1498. cdef class BytesValidator(Validator):
  1499. cdef bint is_value_typed(self, object value) except -1:
  1500. return isinstance(value, bytes)
  1501. cdef bint is_array_typed(self) except -1:
  1502. return issubclass(self.dtype.type, np.bytes_)
  1503. cdef bint is_bytes_array(ndarray values, bint skipna=False):
  1504. cdef:
  1505. BytesValidator validator = BytesValidator(len(values), values.dtype,
  1506. skipna=skipna)
  1507. return validator.validate(values)
  1508. @cython.internal
  1509. cdef class TemporalValidator(Validator):
  1510. cdef:
  1511. bint all_generic_na
  1512. def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
  1513. bint skipna=False):
  1514. self.n = n
  1515. self.dtype = dtype
  1516. self.skipna = skipna
  1517. self.all_generic_na = True
  1518. cdef bint is_valid(self, object value) except -1:
  1519. return self.is_value_typed(value) or self.is_valid_null(value)
  1520. cdef bint is_valid_null(self, object value) except -1:
  1521. raise NotImplementedError(f"{type(self).__name__} child class "
  1522. "must define is_valid_null")
  1523. cdef bint is_valid_skipna(self, object value) except -1:
  1524. cdef:
  1525. bint is_typed_null = self.is_valid_null(value)
  1526. bint is_generic_null = value is None or util.is_nan(value)
  1527. if not is_generic_null:
  1528. self.all_generic_na = False
  1529. return self.is_value_typed(value) or is_typed_null or is_generic_null
  1530. cdef bint _validate_skipna(self, ndarray values) except -1:
  1531. """
  1532. If we _only_ saw non-dtype-specific NA values, even if they are valid
  1533. for this dtype, we do not infer this dtype.
  1534. """
  1535. return Validator._validate_skipna(self, values) and not self.all_generic_na
  1536. @cython.internal
  1537. cdef class DatetimeValidator(TemporalValidator):
  1538. cdef bint is_value_typed(self, object value) except -1:
  1539. return PyDateTime_Check(value)
  1540. cdef bint is_valid_null(self, object value) except -1:
  1541. return is_null_datetime64(value)
  1542. cpdef bint is_datetime_array(ndarray values, bint skipna=True):
  1543. cdef:
  1544. DatetimeValidator validator = DatetimeValidator(len(values),
  1545. skipna=skipna)
  1546. return validator.validate(values)
  1547. @cython.internal
  1548. cdef class Datetime64Validator(DatetimeValidator):
  1549. cdef bint is_value_typed(self, object value) except -1:
  1550. return util.is_datetime64_object(value)
  1551. # Note: only python-exposed for tests
  1552. cpdef bint is_datetime64_array(ndarray values, bint skipna=True):
  1553. cdef:
  1554. Datetime64Validator validator = Datetime64Validator(len(values),
  1555. skipna=skipna)
  1556. return validator.validate(values)
  1557. @cython.internal
  1558. cdef class AnyDatetimeValidator(DatetimeValidator):
  1559. cdef bint is_value_typed(self, object value) except -1:
  1560. return util.is_datetime64_object(value) or (
  1561. PyDateTime_Check(value) and value.tzinfo is None
  1562. )
  1563. cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True):
  1564. cdef:
  1565. AnyDatetimeValidator validator = AnyDatetimeValidator(len(values),
  1566. skipna=skipna)
  1567. return validator.validate(values)
  1568. # Note: only python-exposed for tests
  1569. def is_datetime_with_singletz_array(values: ndarray) -> bool:
  1570. """
  1571. Check values have the same tzinfo attribute.
  1572. Doesn't check values are datetime-like types.
  1573. """
  1574. cdef:
  1575. Py_ssize_t i = 0, j, n = len(values)
  1576. object base_val, base_tz, val, tz
  1577. if n == 0:
  1578. return False
  1579. # Get a reference timezone to compare with the rest of the tzs in the array
  1580. for i in range(n):
  1581. base_val = values[i]
  1582. if base_val is not NaT and base_val is not None and not util.is_nan(base_val):
  1583. base_tz = getattr(base_val, "tzinfo", None)
  1584. break
  1585. for j in range(i, n):
  1586. # Compare val's timezone with the reference timezone
  1587. # NaT can coexist with tz-aware datetimes, so skip if encountered
  1588. val = values[j]
  1589. if val is not NaT and val is not None and not util.is_nan(val):
  1590. tz = getattr(val, "tzinfo", None)
  1591. if not tz_compare(base_tz, tz):
  1592. return False
  1593. # Note: we should only be called if a tzaware datetime has been seen,
  1594. # so base_tz should always be set at this point.
  1595. return True
  1596. @cython.internal
  1597. cdef class TimedeltaValidator(TemporalValidator):
  1598. cdef bint is_value_typed(self, object value) except -1:
  1599. return PyDelta_Check(value)
  1600. cdef bint is_valid_null(self, object value) except -1:
  1601. return is_null_timedelta64(value)
  1602. @cython.internal
  1603. cdef class AnyTimedeltaValidator(TimedeltaValidator):
  1604. cdef bint is_value_typed(self, object value) except -1:
  1605. return is_timedelta(value)
  1606. # Note: only python-exposed for tests
  1607. cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True):
  1608. """
  1609. Infer with timedeltas and/or nat/none.
  1610. """
  1611. cdef:
  1612. AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values),
  1613. skipna=skipna)
  1614. return validator.validate(values)
  1615. @cython.internal
  1616. cdef class DateValidator(Validator):
  1617. cdef bint is_value_typed(self, object value) except -1:
  1618. return PyDate_Check(value)
  1619. # Note: only python-exposed for tests
  1620. cpdef bint is_date_array(ndarray values, bint skipna=False):
  1621. cdef:
  1622. DateValidator validator = DateValidator(len(values), skipna=skipna)
  1623. return validator.validate(values)
  1624. @cython.internal
  1625. cdef class TimeValidator(Validator):
  1626. cdef bint is_value_typed(self, object value) except -1:
  1627. return PyTime_Check(value)
  1628. # Note: only python-exposed for tests
  1629. cpdef bint is_time_array(ndarray values, bint skipna=False):
  1630. cdef:
  1631. TimeValidator validator = TimeValidator(len(values), skipna=skipna)
  1632. return validator.validate(values)
  1633. # FIXME: actually use skipna
  1634. cdef bint is_period_array(ndarray values, bint skipna=True):
  1635. """
  1636. Is this an ndarray of Period objects (or NaT) with a single `freq`?
  1637. """
  1638. # values should be object-dtype, but ndarray[object] assumes 1D, while
  1639. # this _may_ be 2D.
  1640. cdef:
  1641. Py_ssize_t i, N = values.size
  1642. int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND
  1643. object val
  1644. flatiter it
  1645. if N == 0:
  1646. return False
  1647. it = PyArray_IterNew(values)
  1648. for i in range(N):
  1649. # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
  1650. # equivalents to `val = values[i]`
  1651. val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
  1652. PyArray_ITER_NEXT(it)
  1653. if is_period_object(val):
  1654. if dtype_code == -10000:
  1655. dtype_code = val._dtype._dtype_code
  1656. elif dtype_code != val._dtype._dtype_code:
  1657. # mismatched freqs
  1658. return False
  1659. elif checknull_with_nat(val):
  1660. pass
  1661. else:
  1662. # Not a Period or NaT-like
  1663. return False
  1664. if dtype_code == -10000:
  1665. # we saw all-NaTs, no actual Periods
  1666. return False
  1667. return True
  1668. # Note: only python-exposed for tests
  1669. cpdef bint is_interval_array(ndarray values):
  1670. """
  1671. Is this an ndarray of Interval (or np.nan) with a single dtype?
  1672. """
  1673. cdef:
  1674. Py_ssize_t i, n = len(values)
  1675. str closed = None
  1676. bint numeric = False
  1677. bint dt64 = False
  1678. bint td64 = False
  1679. object val
  1680. if len(values) == 0:
  1681. return False
  1682. for i in range(n):
  1683. val = values[i]
  1684. if is_interval(val):
  1685. if closed is None:
  1686. closed = val.closed
  1687. numeric = (
  1688. util.is_float_object(val.left)
  1689. or util.is_integer_object(val.left)
  1690. )
  1691. td64 = is_timedelta(val.left)
  1692. dt64 = PyDateTime_Check(val.left)
  1693. elif val.closed != closed:
  1694. # mismatched closedness
  1695. return False
  1696. elif numeric:
  1697. if not (
  1698. util.is_float_object(val.left)
  1699. or util.is_integer_object(val.left)
  1700. ):
  1701. # i.e. datetime64 or timedelta64
  1702. return False
  1703. elif td64:
  1704. if not is_timedelta(val.left):
  1705. return False
  1706. elif dt64:
  1707. if not PyDateTime_Check(val.left):
  1708. return False
  1709. else:
  1710. raise ValueError(val)
  1711. elif util.is_nan(val) or val is None:
  1712. pass
  1713. else:
  1714. return False
  1715. if closed is None:
  1716. # we saw all-NAs, no actual Intervals
  1717. return False
  1718. return True
  1719. @cython.boundscheck(False)
  1720. @cython.wraparound(False)
  1721. def maybe_convert_numeric(
  1722. ndarray[object, ndim=1] values,
  1723. set na_values,
  1724. bint convert_empty=True,
  1725. bint coerce_numeric=False,
  1726. bint convert_to_masked_nullable=False,
  1727. ) -> tuple[np.ndarray, np.ndarray | None]:
  1728. """
  1729. Convert object array to a numeric array if possible.
  1730. Parameters
  1731. ----------
  1732. values : ndarray[object]
  1733. Array of object elements to convert.
  1734. na_values : set
  1735. Set of values that should be interpreted as NaN.
  1736. convert_empty : bool, default True
  1737. If an empty array-like object is encountered, whether to interpret
  1738. that element as NaN or not. If set to False, a ValueError will be
  1739. raised if such an element is encountered and 'coerce_numeric' is False.
  1740. coerce_numeric : bool, default False
  1741. If initial attempts to convert to numeric have failed, whether to
  1742. force conversion to numeric via alternative methods or by setting the
  1743. element to NaN. Otherwise, an Exception will be raised when such an
  1744. element is encountered.
  1745. This boolean also has an impact on how conversion behaves when a
  1746. numeric array has no suitable numerical dtype to return (i.e. uint64,
  1747. int32, uint8). If set to False, the original object array will be
  1748. returned. Otherwise, a ValueError will be raised.
  1749. convert_to_masked_nullable : bool, default False
  1750. Whether to return a mask for the converted values. This also disables
  1751. upcasting for ints with nulls to float64.
  1752. Returns
  1753. -------
  1754. np.ndarray
  1755. Array of converted object values to numerical ones.
  1756. Optional[np.ndarray]
  1757. If convert_to_masked_nullable is True,
  1758. returns a boolean mask for the converted values, otherwise returns None.
  1759. """
  1760. if len(values) == 0:
  1761. return (np.array([], dtype="i8"), None)
  1762. # fastpath for ints - try to convert all based on first value
  1763. cdef:
  1764. object val = values[0]
  1765. if util.is_integer_object(val):
  1766. try:
  1767. maybe_ints = values.astype("i8")
  1768. if (maybe_ints == values).all():
  1769. return (maybe_ints, None)
  1770. except (ValueError, OverflowError, TypeError):
  1771. pass
  1772. # Otherwise, iterate and do full inference.
  1773. cdef:
  1774. int maybe_int
  1775. Py_ssize_t i, n = values.size
  1776. Seen seen = Seen(coerce_numeric)
  1777. ndarray[float64_t, ndim=1] floats = cnp.PyArray_EMPTY(
  1778. 1, values.shape, cnp.NPY_FLOAT64, 0
  1779. )
  1780. ndarray[complex128_t, ndim=1] complexes = cnp.PyArray_EMPTY(
  1781. 1, values.shape, cnp.NPY_COMPLEX128, 0
  1782. )
  1783. ndarray[int64_t, ndim=1] ints = cnp.PyArray_EMPTY(
  1784. 1, values.shape, cnp.NPY_INT64, 0
  1785. )
  1786. ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(
  1787. 1, values.shape, cnp.NPY_UINT64, 0
  1788. )
  1789. ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(
  1790. 1, values.shape, cnp.NPY_UINT8, 0
  1791. )
  1792. ndarray[uint8_t, ndim=1] mask = np.zeros(n, dtype="u1")
  1793. float64_t fval
  1794. bint allow_null_in_int = convert_to_masked_nullable
  1795. for i in range(n):
  1796. val = values[i]
  1797. # We only want to disable NaNs showing as float if
  1798. # a) convert_to_masked_nullable = True
  1799. # b) no floats have been seen ( assuming an int shows up later )
  1800. # However, if no ints present (all null array), we need to return floats
  1801. allow_null_in_int = convert_to_masked_nullable and not seen.float_
  1802. if val.__hash__ is not None and val in na_values:
  1803. if allow_null_in_int:
  1804. seen.null_ = True
  1805. mask[i] = 1
  1806. else:
  1807. if convert_to_masked_nullable:
  1808. mask[i] = 1
  1809. seen.saw_null()
  1810. floats[i] = complexes[i] = NaN
  1811. elif util.is_float_object(val):
  1812. fval = val
  1813. if fval != fval:
  1814. seen.null_ = True
  1815. if allow_null_in_int:
  1816. mask[i] = 1
  1817. else:
  1818. if convert_to_masked_nullable:
  1819. mask[i] = 1
  1820. seen.float_ = True
  1821. else:
  1822. seen.float_ = True
  1823. floats[i] = complexes[i] = fval
  1824. elif util.is_integer_object(val):
  1825. floats[i] = complexes[i] = val
  1826. val = int(val)
  1827. seen.saw_int(val)
  1828. if val >= 0:
  1829. if val <= oUINT64_MAX:
  1830. uints[i] = val
  1831. else:
  1832. seen.float_ = True
  1833. if oINT64_MIN <= val <= oINT64_MAX:
  1834. ints[i] = val
  1835. if val < oINT64_MIN or (seen.sint_ and seen.uint_):
  1836. seen.float_ = True
  1837. elif util.is_bool_object(val):
  1838. floats[i] = uints[i] = ints[i] = bools[i] = val
  1839. seen.bool_ = True
  1840. elif val is None or val is C_NA:
  1841. if allow_null_in_int:
  1842. seen.null_ = True
  1843. mask[i] = 1
  1844. else:
  1845. if convert_to_masked_nullable:
  1846. mask[i] = 1
  1847. seen.saw_null()
  1848. floats[i] = complexes[i] = NaN
  1849. elif hasattr(val, "__len__") and len(val) == 0:
  1850. if convert_empty or seen.coerce_numeric:
  1851. seen.saw_null()
  1852. floats[i] = complexes[i] = NaN
  1853. mask[i] = 1
  1854. else:
  1855. raise ValueError("Empty string encountered")
  1856. elif util.is_complex_object(val):
  1857. complexes[i] = val
  1858. seen.complex_ = True
  1859. elif is_decimal(val):
  1860. floats[i] = complexes[i] = val
  1861. seen.float_ = True
  1862. else:
  1863. try:
  1864. floatify(val, &fval, &maybe_int)
  1865. if fval in na_values:
  1866. seen.saw_null()
  1867. floats[i] = complexes[i] = NaN
  1868. mask[i] = 1
  1869. else:
  1870. if fval != fval:
  1871. seen.null_ = True
  1872. mask[i] = 1
  1873. floats[i] = fval
  1874. if maybe_int:
  1875. as_int = int(val)
  1876. if as_int in na_values:
  1877. mask[i] = 1
  1878. seen.null_ = True
  1879. if not allow_null_in_int:
  1880. seen.float_ = True
  1881. else:
  1882. seen.saw_int(as_int)
  1883. if as_int not in na_values:
  1884. if as_int < oINT64_MIN or as_int > oUINT64_MAX:
  1885. if seen.coerce_numeric:
  1886. seen.float_ = True
  1887. else:
  1888. raise ValueError("Integer out of range.")
  1889. else:
  1890. if as_int >= 0:
  1891. uints[i] = as_int
  1892. if as_int <= oINT64_MAX:
  1893. ints[i] = as_int
  1894. seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
  1895. else:
  1896. seen.float_ = True
  1897. except (TypeError, ValueError) as err:
  1898. if not seen.coerce_numeric:
  1899. raise type(err)(f"{err} at position {i}")
  1900. mask[i] = 1
  1901. if allow_null_in_int:
  1902. seen.null_ = True
  1903. else:
  1904. seen.saw_null()
  1905. floats[i] = NaN
  1906. if seen.check_uint64_conflict():
  1907. return (values, None)
  1908. # This occurs since we disabled float nulls showing as null in anticipation
  1909. # of seeing ints that were never seen. So then, we return float
  1910. if allow_null_in_int and seen.null_ and not seen.int_ and not seen.bool_:
  1911. seen.float_ = True
  1912. if seen.complex_:
  1913. return (complexes, None)
  1914. elif seen.float_:
  1915. if seen.null_ and convert_to_masked_nullable:
  1916. return (floats, mask.view(np.bool_))
  1917. return (floats, None)
  1918. elif seen.int_:
  1919. if seen.null_ and convert_to_masked_nullable:
  1920. if seen.uint_:
  1921. return (uints, mask.view(np.bool_))
  1922. else:
  1923. return (ints, mask.view(np.bool_))
  1924. if seen.uint_:
  1925. return (uints, None)
  1926. else:
  1927. return (ints, None)
  1928. elif seen.bool_:
  1929. if allow_null_in_int:
  1930. return (bools.view(np.bool_), mask.view(np.bool_))
  1931. return (bools.view(np.bool_), None)
  1932. elif seen.uint_:
  1933. return (uints, None)
  1934. return (ints, None)
@cython.boundscheck(False)
@cython.wraparound(False)
def maybe_convert_objects(ndarray[object] objects,
                          *,
                          bint try_float=False,
                          bint safe=False,
                          bint convert_numeric=True,  # NB: different default!
                          bint convert_datetime=False,
                          bint convert_timedelta=False,
                          bint convert_period=False,
                          bint convert_interval=False,
                          bint convert_to_nullable_dtype=False,
                          object dtype_if_all_nat=None) -> "ArrayLike":
    """
    Type inference function -- convert an object array to a more specific dtype.

    The array is scanned once, recording which kinds of values occur. The scan
    stops early as soon as a value is found that either rules out a numeric
    result or requires one of the datetime-like/period/interval code paths,
    which then re-validate the whole array.

    Parameters
    ----------
    objects : ndarray[object]
        Array of object elements to convert.
    try_float : bool, default False
        Whether to attempt ``float(val)`` / ``complex(val)`` on non-string
        values that none of the other checks recognize (e.g. Decimal). If the
        cast fails the array is treated as object.
    safe : bool, default False
        Whether to avoid upcasting numeric types (e.g. int cast to float). If
        True, no upcasting is performed: an array that mixes integers with
        floats, complex values, NaN or None is returned unchanged.
    convert_numeric : bool, default True
        Whether to convert numeric entries (bools count as numeric here).
        If False, the scan stops at the first numeric entry and, unless one of
        the datetime-like/period/interval conversions applies, `objects` is
        returned unchanged.
    convert_datetime : bool, default False
        If an array-like object contains only datetime values or NaT is
        encountered, whether to convert and return an array of M8[ns] dtype.
    convert_timedelta : bool, default False
        If an array-like object contains only timedelta values or NaT is
        encountered, whether to convert and return an array of m8[ns] dtype.
    convert_period : bool, default False
        If an array-like object contains only (homogeneous-freq) Period values
        or NaT, whether to convert and return a PeriodArray.
    convert_interval : bool, default False
        If an array-like object contains only Interval objects (with matching
        dtypes and closedness) or NaN, whether to convert to IntervalArray.
    convert_to_nullable_dtype : bool, default False
        If an array-like object contains only integer or boolean values (and
        NaN/None), whether to convert and return a Boolean/IntegerArray.
    dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None
        Dtype to cast to if we have all-NaT. May only be passed together with
        ``convert_datetime=True`` and ``convert_timedelta=True``.

    Returns
    -------
    np.ndarray or ExtensionArray
        Array of converted object values to more specific dtypes if applicable.
        When no conversion applies, `objects` itself is returned.

    Raises
    ------
    ValueError
        If `dtype_if_all_nat` is given without both ``convert_datetime`` and
        ``convert_timedelta``, or if it is a numpy dtype whose kind is not
        "m" or "M" and the array turns out to be all-NaT.
    NotImplementedError
        If `dtype_if_all_nat` is not None and not a numpy dtype and the array
        turns out to be all-NaT.
    """
    cdef:
        Py_ssize_t i, n, itemsize_max = 0
        ndarray[float64_t] floats
        ndarray[complex128_t] complexes
        ndarray[int64_t] ints
        ndarray[uint64_t] uints
        ndarray[uint8_t] bools
        Seen seen = Seen()
        object val
        _TSObject tsobj
        float64_t fnan = np.nan

    if dtype_if_all_nat is not None:
        # In practice we don't expect to ever pass dtype_if_all_nat
        #  without both convert_datetime and convert_timedelta, so disallow
        #  it to avoid needing to handle it below.
        if not convert_datetime or not convert_timedelta:
            raise ValueError(
                "Cannot specify 'dtype_if_all_nat' without convert_datetime=True "
                "and convert_timedelta=True"
            )

    n = len(objects)

    # One candidate output buffer per numeric kind. They are filled in
    #  parallel during the scan; which one (if any) is returned is decided
    #  afterwards from the flags collected in `seen`.
    floats = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_FLOAT64, 0)
    complexes = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_COMPLEX128, 0)
    ints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_INT64, 0)
    uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0)
    bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0)
    # True at positions holding None/NaN; only used for the masked
    #  Boolean/IntegerArray results.
    mask = np.full(n, False)

    for i in range(n):
        val = objects[i]
        # Track the largest itemsize seen. Once get_itemsize reports -1,
        #  itemsize_max stays -1 and tracking stops.
        #  NOTE(review): presumably -1 means "not a NumPy scalar" -- confirm
        #  against get_itemsize.
        if itemsize_max != -1:
            itemsize = get_itemsize(val)
            if itemsize > itemsize_max or itemsize == -1:
                itemsize_max = itemsize

        if val is None:
            seen.null_ = True
            floats[i] = complexes[i] = fnan
            mask[i] = True
        elif val is NaT:
            seen.nat_ = True
            # NaT is only meaningful if a datetime-like conversion was asked for
            if not (convert_datetime or convert_timedelta or convert_period):
                seen.object_ = True
                break
        elif util.is_nan(val):
            seen.nan_ = True
            mask[i] = True
            floats[i] = complexes[i] = val
        elif util.is_bool_object(val):
            seen.bool_ = True
            bools[i] = val
            if not convert_numeric:
                break
        elif util.is_float_object(val):
            floats[i] = complexes[i] = val
            seen.float_ = True
            if not convert_numeric:
                break
        elif is_timedelta(val):
            if convert_timedelta:
                seen.timedelta_ = True
                # The converted value is discarded; the call is only made to
                #  find out whether this element is out of bounds.
                try:
                    convert_to_timedelta64(val, "ns")
                except OutOfBoundsTimedelta:
                    seen.object_ = True
                    break
                # Stop scanning: the whole array is validated and converted
                #  after the loop.
                break
            else:
                seen.object_ = True
                break
        elif util.is_integer_object(val):
            seen.int_ = True
            floats[i] = <float64_t>val
            complexes[i] = <double complex>val
            # Sign bookkeeping is skipped once a None has been seen, unless a
            #  nullable result was requested.
            if not seen.null_ or convert_to_nullable_dtype:
                seen.saw_int(val)

                # Mixed signed/unsigned, or a value outside the combined
                #  int64/uint64 range, cannot be held in one integer buffer.
                if ((seen.uint_ and seen.sint_) or
                        val > oUINT64_MAX or val < oINT64_MIN):
                    seen.object_ = True
                    break

            if seen.uint_:
                uints[i] = val
            elif seen.sint_:
                ints[i] = val
            else:
                # Sign category not decided yet: keep both buffers in sync
                uints[i] = val
                ints[i] = val
            if not convert_numeric:
                break

        elif util.is_complex_object(val):
            complexes[i] = val
            seen.complex_ = True
            if not convert_numeric:
                break
        elif PyDateTime_Check(val) or util.is_datetime64_object(val):

            # If we have a tz attached, stop scanning here; the tz-aware
            #  path after the loop decides whether conversion is possible.
            if convert_datetime:
                if getattr(val, "tzinfo", None) is not None:
                    seen.datetimetz_ = True
                    break
                else:
                    seen.datetime_ = True
                    # Only used to detect values that do not fit in
                    #  nanosecond resolution; tsobj is not used afterwards.
                    try:
                        tsobj = convert_to_tsobject(val, None, None, 0, 0)
                        tsobj.ensure_reso(NPY_FR_ns)
                    except OutOfBoundsDatetime:
                        seen.object_ = True
                        break
            else:
                seen.object_ = True
                break
        elif is_period_object(val):
            if convert_period:
                seen.period_ = True
                break
            else:
                seen.object_ = True
                break
        elif try_float and not isinstance(val, str):
            # this will convert Decimal objects
            try:
                floats[i] = float(val)
                complexes[i] = complex(val)
                seen.float_ = True
            except (ValueError, TypeError):
                seen.object_ = True
                break
        elif is_interval(val):
            if convert_interval:
                seen.interval_ = True
                break
            else:
                seen.object_ = True
                break
        else:
            seen.object_ = True
            break

    # we try to coerce datetime w/tz but must all have the same tz
    if seen.datetimetz_:
        if is_datetime_with_singletz_array(objects):
            from pandas import DatetimeIndex

            try:
                dti = DatetimeIndex(objects)
            except OutOfBoundsDatetime:
                # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds
                pass
            else:
                # unbox to DatetimeArray
                return dti._data
        seen.object_ = True

    elif seen.datetime_:
        if is_datetime_or_datetime64_array(objects):
            from pandas import DatetimeIndex

            try:
                dti = DatetimeIndex(objects)
            except OutOfBoundsDatetime:
                pass
            else:
                # unbox to ndarray[datetime64[ns]]
                return dti._data._ndarray
        seen.object_ = True

    elif seen.timedelta_:
        if is_timedelta_or_timedelta64_array(objects):
            from pandas import TimedeltaIndex

            try:
                tdi = TimedeltaIndex(objects)
            except OutOfBoundsTimedelta:
                pass
            else:
                # unbox to ndarray[timedelta64[ns]]
                return tdi._data._ndarray
        seen.object_ = True

    if seen.period_:
        if is_period_array(objects):
            from pandas import PeriodIndex
            pi = PeriodIndex(objects)

            # unbox to PeriodArray
            return pi._data
        seen.object_ = True

    if seen.interval_:
        if is_interval_array(objects):
            from pandas import IntervalIndex
            ii = IntervalIndex(objects)

            # unbox to IntervalArray
            return ii._data

        seen.object_ = True

    if seen.nat_:
        if not seen.object_ and not seen.numeric_ and not seen.bool_:
            # all NaT, None, or nan (at least one NaT)
            # see GH#49340 for discussion of desired behavior
            dtype = dtype_if_all_nat
            if cnp.PyArray_DescrCheck(dtype):
                # i.e. isinstance(dtype, np.dtype)
                if dtype.kind not in ["m", "M"]:
                    raise ValueError(dtype)
                else:
                    res = np.empty((<object>objects).shape, dtype=dtype)
                    res[:] = NPY_NAT
                    return res
            elif dtype is not None:
                # EA, we don't expect to get here, but _could_ implement
                raise NotImplementedError(dtype)
            elif convert_datetime and convert_timedelta:
                # Both are possible and no dtype was given: we don't guess
                seen.object_ = True
            elif convert_datetime:
                res = np.empty((<object>objects).shape, dtype="M8[ns]")
                res[:] = NPY_NAT
                return res
            elif convert_timedelta:
                res = np.empty((<object>objects).shape, dtype="m8[ns]")
                res[:] = NPY_NAT
                return res
            else:
                seen.object_ = True
        else:
            seen.object_ = True

    if not convert_numeric:
        # Note: we count "bool" as numeric here. This is because
        #  np.array(list_of_items) will convert bools just like it will numeric
        #  entries.
        return objects

    if seen.bool_:
        if seen.is_bool:
            # is_bool property rules out everything else
            return bools.view(np.bool_)
        elif convert_to_nullable_dtype and seen.is_bool_or_na:
            from pandas.core.arrays import BooleanArray
            return BooleanArray(bools.view(np.bool_), mask)
        # Bools mixed with anything else stay as object
        seen.object_ = True

    if not seen.object_:
        result = None
        if not safe:
            # Upcasting allowed: pick the widest kind that was seen.
            if seen.null_ or seen.nan_:
                if seen.complex_:
                    result = complexes
                elif seen.float_:
                    result = floats
                elif seen.int_ or seen.uint_:
                    if convert_to_nullable_dtype:
                        # Missing values are carried by the mask
                        from pandas.core.arrays import IntegerArray
                        if seen.uint_:
                            result = IntegerArray(uints, mask)
                        else:
                            result = IntegerArray(ints, mask)
                    else:
                        # Integers with missing values fall back to float
                        result = floats
                elif seen.nan_:
                    result = floats
            else:
                if seen.complex_:
                    result = complexes
                elif seen.float_:
                    result = floats
                elif seen.int_:
                    if seen.uint_:
                        result = uints
                    else:
                        result = ints

        else:
            # don't cast int to float, etc.; `result` stays None (and
            #  `objects` is returned) whenever integers would need upcasting
            if seen.null_:
                if seen.complex_:
                    if not seen.int_:
                        result = complexes
                elif seen.float_ or seen.nan_:
                    if not seen.int_:
                        result = floats
            else:
                if seen.complex_:
                    if not seen.int_:
                        result = complexes
                elif seen.float_ or seen.nan_:
                    if not seen.int_:
                        result = floats
                elif seen.int_:
                    if seen.uint_:
                        result = uints
                    else:
                        result = ints

        if result is uints or result is ints or result is floats or result is complexes:
            # cast to the largest itemsize when all values are NumPy scalars;
            #  the target dtype string is built from the buffer's kind code
            #  plus that itemsize
            if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
                result = result.astype(result.dtype.kind + str(itemsize_max))
            return result
        elif result is not None:
            # masked IntegerArray result; returned as-is
            return result

    return objects
class _NoDefault(Enum):
    # Single-member enum whose only member is used as the `no_default`
    # sentinel.
    # We make this an Enum
    # 1) because it round-trips through pickle correctly (see GH#40397)
    # 2) because mypy does not understand singletons
    no_default = "NO_DEFAULT"

    def __repr__(self) -> str:
        # Fixed, compact repr instead of the default Enum repr.
        return "<no_default>"
# Note: no_default is exported to the public API in pandas.api.extensions
no_default = _NoDefault.no_default  # Sentinel indicating the default value.
# Typing alias: the Literal type whose only value is the sentinel above.
NoDefault = Literal[_NoDefault.no_default]
  2280. @cython.boundscheck(False)
  2281. @cython.wraparound(False)
  2282. def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
  2283. object na_value=no_default, cnp.dtype dtype=np.dtype(object)
  2284. ) -> np.ndarray:
  2285. """
  2286. Substitute for np.vectorize with pandas-friendly dtype inference.
  2287. Parameters
  2288. ----------
  2289. arr : ndarray
  2290. f : function
  2291. mask : ndarray
  2292. uint8 dtype ndarray indicating values not to apply `f` to.
  2293. convert : bool, default True
  2294. Whether to call `maybe_convert_objects` on the resulting ndarray
  2295. na_value : Any, optional
  2296. The result value to use for masked values. By default, the
  2297. input value is used
  2298. dtype : numpy.dtype
  2299. The numpy dtype to use for the result ndarray.
  2300. Returns
  2301. -------
  2302. np.ndarray
  2303. """
  2304. cdef:
  2305. Py_ssize_t i, n
  2306. ndarray result
  2307. object val
  2308. n = len(arr)
  2309. result = np.empty(n, dtype=dtype)
  2310. for i in range(n):
  2311. if mask[i]:
  2312. if na_value is no_default:
  2313. val = arr[i]
  2314. else:
  2315. val = na_value
  2316. else:
  2317. val = f(arr[i])
  2318. if cnp.PyArray_IsZeroDim(val):
  2319. # unbox 0-dim arrays, GH#690
  2320. val = val.item()
  2321. result[i] = val
  2322. if convert:
  2323. return maybe_convert_objects(result,
  2324. try_float=False,
  2325. convert_datetime=False,
  2326. convert_timedelta=False)
  2327. return result
  2328. @cython.boundscheck(False)
  2329. @cython.wraparound(False)
  2330. def map_infer(
  2331. ndarray arr, object f, bint convert=True, bint ignore_na=False
  2332. ) -> np.ndarray:
  2333. """
  2334. Substitute for np.vectorize with pandas-friendly dtype inference.
  2335. Parameters
  2336. ----------
  2337. arr : ndarray
  2338. f : function
  2339. convert : bint
  2340. ignore_na : bint
  2341. If True, NA values will not have f applied
  2342. Returns
  2343. -------
  2344. np.ndarray
  2345. """
  2346. cdef:
  2347. Py_ssize_t i, n
  2348. ndarray[object] result
  2349. object val
  2350. n = len(arr)
  2351. result = cnp.PyArray_EMPTY(1, arr.shape, cnp.NPY_OBJECT, 0)
  2352. for i in range(n):
  2353. if ignore_na and checknull(arr[i]):
  2354. result[i] = arr[i]
  2355. continue
  2356. val = f(arr[i])
  2357. if cnp.PyArray_IsZeroDim(val):
  2358. # unbox 0-dim arrays, GH#690
  2359. val = val.item()
  2360. result[i] = val
  2361. if convert:
  2362. return maybe_convert_objects(result,
  2363. try_float=False,
  2364. convert_datetime=False,
  2365. convert_timedelta=False)
  2366. return result
  2367. def to_object_array(rows: object, min_width: int = 0) -> ndarray:
  2368. """
  2369. Convert a list of lists into an object array.
  2370. Parameters
  2371. ----------
  2372. rows : 2-d array (N, K)
  2373. List of lists to be converted into an array.
  2374. min_width : int
  2375. Minimum width of the object array. If a list
  2376. in `rows` contains fewer than `width` elements,
  2377. the remaining elements in the corresponding row
  2378. will all be `NaN`.
  2379. Returns
  2380. -------
  2381. np.ndarray[object, ndim=2]
  2382. """
  2383. cdef:
  2384. Py_ssize_t i, j, n, k, tmp
  2385. ndarray[object, ndim=2] result
  2386. list row
  2387. rows = list(rows)
  2388. n = len(rows)
  2389. k = min_width
  2390. for i in range(n):
  2391. tmp = len(rows[i])
  2392. if tmp > k:
  2393. k = tmp
  2394. result = np.empty((n, k), dtype=object)
  2395. for i in range(n):
  2396. row = list(rows[i])
  2397. for j in range(len(row)):
  2398. result[i, j] = row[j]
  2399. return result
  2400. def tuples_to_object_array(ndarray[object] tuples):
  2401. cdef:
  2402. Py_ssize_t i, j, n, k
  2403. ndarray[object, ndim=2] result
  2404. tuple tup
  2405. n = len(tuples)
  2406. k = len(tuples[0])
  2407. result = np.empty((n, k), dtype=object)
  2408. for i in range(n):
  2409. tup = tuples[i]
  2410. for j in range(k):
  2411. result[i, j] = tup[j]
  2412. return result
  2413. def to_object_array_tuples(rows: object) -> np.ndarray:
  2414. """
  2415. Convert a list of tuples into an object array. Any subclass of
  2416. tuple in `rows` will be casted to tuple.
  2417. Parameters
  2418. ----------
  2419. rows : 2-d array (N, K)
  2420. List of tuples to be converted into an array.
  2421. Returns
  2422. -------
  2423. np.ndarray[object, ndim=2]
  2424. """
  2425. cdef:
  2426. Py_ssize_t i, j, n, k, tmp
  2427. ndarray[object, ndim=2] result
  2428. tuple row
  2429. rows = list(rows)
  2430. n = len(rows)
  2431. k = 0
  2432. for i in range(n):
  2433. tmp = 1 if checknull(rows[i]) else len(rows[i])
  2434. if tmp > k:
  2435. k = tmp
  2436. result = np.empty((n, k), dtype=object)
  2437. try:
  2438. for i in range(n):
  2439. row = rows[i]
  2440. for j in range(len(row)):
  2441. result[i, j] = row[j]
  2442. except TypeError:
  2443. # e.g. "Expected tuple, got list"
  2444. # upcast any subclasses to tuple
  2445. for i in range(n):
  2446. row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
  2447. for j in range(len(row)):
  2448. result[i, j] = row[j]
  2449. return result
  2450. @cython.wraparound(False)
  2451. @cython.boundscheck(False)
  2452. def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
  2453. cdef:
  2454. Py_ssize_t i, n = len(keys)
  2455. object val
  2456. ndarray[object] output = np.empty(n, dtype="O")
  2457. if n == 0:
  2458. # kludge, for Series
  2459. return np.empty(0, dtype="f8")
  2460. for i in range(n):
  2461. val = keys[i]
  2462. if val in mapping:
  2463. output[i] = mapping[val]
  2464. else:
  2465. output[i] = default
  2466. return maybe_convert_objects(output)
  2467. def is_bool_list(obj: list) -> bool:
  2468. """
  2469. Check if this list contains only bool or np.bool_ objects.
  2470. This is appreciably faster than checking `np.array(obj).dtype == bool`
  2471. obj1 = [True, False] * 100
  2472. obj2 = obj1 * 100
  2473. obj3 = obj2 * 100
  2474. obj4 = [True, None] + obj1
  2475. for obj in [obj1, obj2, obj3, obj4]:
  2476. %timeit is_bool_list(obj)
  2477. %timeit np.array(obj).dtype.kind == "b"
  2478. 340 ns ± 8.22 ns
  2479. 8.78 µs ± 253 ns
  2480. 28.8 µs ± 704 ns
  2481. 813 µs ± 17.8 µs
  2482. 3.4 ms ± 168 µs
  2483. 78.4 ms ± 1.05 ms
  2484. 48.1 ns ± 1.26 ns
  2485. 8.1 µs ± 198 ns
  2486. """
  2487. cdef:
  2488. object item
  2489. for item in obj:
  2490. if not util.is_bool_object(item):
  2491. return False
  2492. # Note: we return True for empty list
  2493. return True
  2494. cpdef ndarray eq_NA_compat(ndarray[object] arr, object key):
  2495. """
  2496. Check for `arr == key`, treating all values as not-equal to pd.NA.
  2497. key is assumed to have `not isna(key)`
  2498. """
  2499. cdef:
  2500. ndarray[uint8_t, cast=True] result = cnp.PyArray_EMPTY(
  2501. arr.ndim, arr.shape, cnp.NPY_BOOL, 0
  2502. )
  2503. Py_ssize_t i
  2504. object item
  2505. for i in range(len(arr)):
  2506. item = arr[i]
  2507. if item is C_NA:
  2508. result[i] = False
  2509. else:
  2510. result[i] = item == key
  2511. return result
  2512. def dtypes_all_equal(list types not None) -> bool:
  2513. """
  2514. Faster version for:
  2515. first = types[0]
  2516. all(is_dtype_equal(first, t) for t in types[1:])
  2517. And assuming all elements in the list are np.dtype/ExtensionDtype objects
  2518. See timings at https://github.com/pandas-dev/pandas/pull/44594
  2519. """
  2520. first = types[0]
  2521. for t in types[1:]:
  2522. try:
  2523. if not t == first:
  2524. return False
  2525. except (TypeError, AttributeError):
  2526. return False
  2527. else:
  2528. return True