from __future__ import annotations

from copy import deepcopy
import functools
import operator
import re
import sys
import textwrap
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import unicodedata

import numpy as np

from pandas._libs import lib
from pandas._typing import (
    ArrayLike,
    AxisInt,
    Dtype,
    FillnaOptions,
    Iterator,
    NpDtype,
    PositionalIndexer,
    Scalar,
    SortKind,
    TakeIndexer,
    TimeAmbiguous,
    TimeNonexistent,
    npt,
)
from pandas.compat import (
    pa_version_under7p0,
    pa_version_under8p0,
    pa_version_under9p0,
    pa_version_under11p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import roperator
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionArraySupportsAnyAll,
)
import pandas.core.common as com
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)
from pandas.core.strings.base import BaseStringArrayMethods
from pandas.tseries.frequencies import to_offset

if not pa_version_under7p0:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
    from pandas.core.arrays.arrow.dtype import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and_": pc.and_kleene,
        "rand_": lambda x, y: pc.and_kleene(y, x),
        "or_": pc.or_kleene,
        "ror_": lambda x, y: pc.or_kleene(y, x),
        "xor": pc.xor,
        "rxor": lambda x, y: pc.xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> pa.ChunkedArray:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            return arrow_array.cast(pa.float64())
        return arrow_array

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # Ensure int // int -> int mirroring Python/Numpy behavior
        # as pc.floor(pc.divide_checked(int, int)) -> float
        result = pc.floor(pc.divide(left, right))
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            result = result.cast(left.type)
        return result
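
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available). For positive integers, the result is cast back
    # to the integer type of the left operand rather than staying float:
    #
    # >>> floordiv_compat(pa.scalar(7), pa.scalar(2)).as_py()
    # 3
    # >>> floordiv_compat(pa.scalar(7.0), pa.scalar(2.0)).as_py()
    # 3.0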

    ARROW_ARITHMETIC_FUNCS = {
        "add": pc.add_checked,
        "radd": lambda x, y: pc.add_checked(y, x),
        "sub": pc.subtract_checked,
        "rsub": lambda x, y: pc.subtract_checked(y, x),
        "mul": pc.multiply_checked,
        "rmul": lambda x, y: pc.multiply_checked(y, x),
        "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
        "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
        "floordiv": lambda x, y: floordiv_compat(x, y),
        "rfloordiv": lambda x, y: floordiv_compat(y, x),
        "mod": NotImplemented,
        "rmod": NotImplemented,
        "divmod": NotImplemented,
        "rdivmod": NotImplemented,
        "pow": pc.power_checked,
        "rpow": lambda x, y: pc.power_checked(y, x),
    }


if TYPE_CHECKING:
    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

    from pandas import Series


ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")


def get_unit_from_pa_dtype(pa_dtype):
    # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
    if pa_version_under11p0:
        unit = str(pa_dtype).split("[", 1)[-1][:-1]
        if unit not in ["s", "ms", "us", "ns"]:
            raise ValueError(pa_dtype)
        return unit
    return pa_dtype.unit


def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.
    """
    if isinstance(dtype, ArrowDtype):
        return dtype.pyarrow_dtype
    elif isinstance(dtype, pa.DataType):
        return dtype
    elif isinstance(dtype, DatetimeTZDtype):
        return pa.timestamp(dtype.unit, dtype.tz)
    elif dtype:
        try:
            # Accepts python types too
            # Doesn't handle all numpy types
            return pa.from_numpy_dtype(dtype)
        except pa.ArrowNotImplementedError:
            pass
    return None
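
# Example (illustrative sketch, not part of the original source; assumes
# pyarrow is available):
#
# >>> to_pyarrow_type(ArrowDtype(pa.int64()))
# DataType(int64)
# >>> to_pyarrow_type(np.dtype("float32"))
# DataType(float)
# >>> to_pyarrow_type(None) is None
# True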


class ArrowExtensionArray(
    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__
    Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
    associated compute function is not available based on the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    _data: pa.ChunkedArray
    _dtype: ArrowDtype

    def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
        if pa_version_under7p0:
            msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray."
            raise ImportError(msg)
        if isinstance(values, pa.Array):
            self._data = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._data = values
        else:
            raise ValueError(
                f"Unsupported type '{type(values)}' for ArrowExtensionArray"
            )
        self._dtype = ArrowDtype(self._data.type)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        """
        Construct a new ExtensionArray from a sequence of scalars.
        """
        pa_dtype = to_pyarrow_type(dtype)
        if (
            isinstance(scalars, np.ndarray)
            and isinstance(dtype, ArrowDtype)
            and (
                pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
            )
        ):
            # See https://github.com/apache/arrow/issues/35289
            scalars = scalars.tolist()

        if isinstance(scalars, cls):
            scalars = scalars._data
        elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            if copy and is_array_like(scalars):
                # pa array should not get updated when numpy array is updated
                scalars = deepcopy(scalars)
            try:
                scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
            except pa.ArrowInvalid:
                # GH50430: let pyarrow infer type, then cast
                scalars = pa.array(scalars, from_pandas=True)
            if pa_dtype:
                if pa.types.is_dictionary(pa_dtype):
                    scalars = scalars.dictionary_encode()
                else:
                    scalars = scalars.cast(pa_dtype)
        arr = cls(scalars)
        if pa.types.is_duration(scalars.type) and scalars.null_count > 0:
            # GH52843: upstream bug for duration types when originally
            # constructed with data containing numpy NaT.
            # https://github.com/apache/arrow/issues/35088
            arr = arr.fillna(arr.dtype.na_value)
        return arr

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.
        """
        pa_type = to_pyarrow_type(dtype)
        if (
            pa_type is None
            or pa.types.is_binary(pa_type)
            or pa.types.is_string(pa_type)
        ):
            # pa_type is None: Let pa.array infer
            # pa_type is string/binary: scalars already correct type
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
            if pa_type.unit != "ns":
                # GH51175: test_from_sequence_of_strings_pa_array
                # attempt to parse as int64 reflecting pyarrow's
                # duration to string casting behavior
                mask = isna(scalars)
                if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
                    strings = pa.array(strings, type=pa.string(), from_pandas=True)
                strings = pc.if_else(mask, None, strings)
                try:
                    scalars = strings.cast(pa.int64())
                except pa.ArrowInvalid:
                    pass
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            from pandas.core.arrays import BooleanArray

            scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)

    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.

        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.

        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if item is Ellipsis:
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar

    def __iter__(self) -> Iterator[Any]:
        """
        Iterate over elements of the array.
        """
        na_value = self._dtype.na_value
        for value in self._data:
            val = value.as_py()
            if val is None:
                yield na_value
            else:
                yield val

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow ChunkedArray."""
        return self._data

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        """Correctly construct numpy arrays when passed to `np.asarray()`."""
        return self.to_numpy(dtype=dtype)

    def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.invert(self._data))

    def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.negate_checked(self._data))

    def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(self._data)

    def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.abs_checked(self._data))

    # GH 42600: __getstate__/__setstate__ not necessary once
    # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
    def __getstate__(self):
        state = self.__dict__.copy()
        state["_data"] = self._data.combine_chunks()
        return state

    def __setstate__(self, state) -> None:
        state["_data"] = pa.chunked_array(state["_data"])
        self.__dict__.update(state)

    def _cmp_method(self, other, op):
        from pandas.core.arrays.masked import BaseMaskedArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                result = pa.array(result, type=pa.bool_())
                result = pc.if_else(valid, result, None)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return ArrowExtensionArray(result)

    def _evaluate_op_method(self, other, op, arrow_funcs):
        from pandas.core.arrays.masked import BaseMaskedArray

        pa_type = self._data.type
        if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
            operator.add,
            roperator.radd,
        ]:
            length = self._data.length()

            seps: list[str] | list[bytes]
            if pa.types.is_string(pa_type):
                seps = [""] * length
            else:
                seps = [b""] * length

            if is_scalar(other):
                other = [other] * length
            elif isinstance(other, type(self)):
                other = other._data

            if op is operator.add:
                result = pc.binary_join_element_wise(self._data, other, seps)
            else:
                result = pc.binary_join_element_wise(other, self._data, seps)
            return type(self)(result)

        pc_func = arrow_funcs[op.__name__]
        if pc_func is NotImplemented:
            raise NotImplementedError(f"{op.__name__} not implemented.")
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, pa.array(other, from_pandas=True))
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS:
                # pyarrow kleene ops require null to be typed
                pa_scalar = pa.scalar(None, type=self._data.type)
            else:
                pa_scalar = pa.scalar(other)
            result = pc_func(self._data, pa_scalar)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return type(self)(result)

    def _logical_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)

    def equals(self, other) -> bool:
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._data == other._data

    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    def __contains__(self, key) -> bool:
        # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
        if isna(key) and key is not self.dtype.na_value:
            if self.dtype.kind == "f" and lib.is_float(key) and isna(key):
                return pc.any(pc.is_nan(self._data)).as_py()

            # e.g. date or timestamp types we do not allow None here to match pd.NA
            return False
            # TODO: maybe complex? object?

        return bool(super().__contains__(key))

    @property
    def _hasna(self) -> bool:
        return self._data.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        return self._data.is_null().to_numpy()

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        """
        return self._reduce("any", skipna=skipna, **kwargs)

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        """
        return self._reduce("all", skipna=skipna, **kwargs)

    def argsort(
        self,
        *,
        ascending: bool = True,
        kind: SortKind = "quicksort",
        na_position: str = "last",
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None:
            raise ValueError(f"invalid na_position: {na_position}")

        result = pc.array_sort_indices(
            self._data, order=order, null_placement=null_placement
        )
        np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)
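
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available). NAs sort to the end by default; pass
    # na_position="first" to lead with them:
    #
    # >>> arr = pd.array([3, 1, None, 2], dtype="int64[pyarrow]")
    # >>> arr.argsort()
    # array([1, 3, 0, 2])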

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._data.length() in (0, self._data.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        data = self._data
        if pa.types.is_duration(data.type):
            data = data.cast(pa.int64())

        value = getattr(pc, method)(data, skip_nulls=skipna)
        return pc.index(data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")

    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._data)

    def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        return type(self)(pc.drop_null(self._data))

    @doc(ExtensionArray.fillna)
    def fillna(
        self: ArrowExtensionArrayT,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> ArrowExtensionArrayT:
        value, method = validate_fillna_kwargs(value, method)

        if limit is not None:
            return super().fillna(value=value, method=method, limit=limit)

        if method is not None:
            fallback_performancewarning()
            return super().fillna(value=value, method=method, limit=limit)

        if is_array_like(value):
            value = cast(ArrayLike, value)
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f"expected {len(self)}"
                )

        def convert_fill_value(value, pa_type, dtype):
            if value is None:
                return value
            if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
                return value
            if is_array_like(value):
                pa_box = pa.array
            else:
                pa_box = pa.scalar
            try:
                value = pa_box(value, type=pa_type, from_pandas=True)
            except pa.ArrowTypeError as err:
                msg = f"Invalid value '{str(value)}' for dtype {dtype}"
                raise TypeError(msg) from err
            return value

        fill_value = convert_fill_value(value, self._data.type, self.dtype)

        try:
            if method is None:
                return type(self)(pc.fill_null(self._data, fill_value=fill_value))
            elif method == "pad":
                return type(self)(pc.fill_null_forward(self._data))
            elif method == "backfill":
                return type(self)(pc.fill_null_backward(self._data))
        except pa.ArrowNotImplementedError:
            # ArrowNotImplementedError: Function 'coalesce' has no kernel
            # matching input types (duration[ns], duration[ns])
            # TODO: remove try/except wrapper if/when pyarrow implements
            # a kernel for duration types.
            pass

        return super().fillna(value=value, method=method, limit=limit)

    def isin(self, values) -> npt.NDArray[np.bool_]:
        # short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
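
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available). Nulls never match a value set, so NA slots come
    # back False rather than NA:
    #
    # >>> arr = pd.array([1, 2, None], dtype="int64[pyarrow]")
    # >>> arr.isin([1])
    # array([ True, False, False])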

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        values = self._data.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        null_encoding = "mask" if use_na_sentinel else "encode"

        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        if pa.types.is_dictionary(data.type):
            encoded = data
        else:
            encoded = data.dictionary_encode(null_encoding=null_encoding)
        if encoded.length() == 0:
            indices = np.array([], dtype=np.intp)
            uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
        else:
            pa_indices = encoded.combine_chunks().indices
            if pa_indices.null_count > 0:
                pa_indices = pc.fill_null(pa_indices, -1)
            indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
                np.intp, copy=False
            )
            uniques = type(self)(encoded.chunk(0).dictionary)

        if pa.types.is_duration(pa_type):
            uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
        return indices, uniques
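
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available). With the default use_na_sentinel=True, missing
    # values encode to -1:
    #
    # >>> arr = pd.array([1, 2, 1, None], dtype="int64[pyarrow]")
    # >>> codes, uniques = arr.factorize()
    # >>> codes
    # array([ 0,  1,  0, -1])
    # >>> uniques
    # <ArrowExtensionArray>
    # [1, 2]
    # Length: 2, dtype: int64[pyarrow]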

    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def round(
        self: ArrowExtensionArrayT, decimals: int = 0, *args, **kwargs
    ) -> ArrowExtensionArrayT:
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect.

        Returns
        -------
        ArrowExtensionArray
            Rounded values of the ArrowExtensionArray.

        See Also
        --------
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
        """
        return type(self)(pc.round(self._data, ndigits=decimals))
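
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available):
    #
    # >>> arr = pd.array([1.234, 5.678], dtype="float64[pyarrow]")
    # >>> arr.round(1)
    # <ArrowExtensionArray>
    # [1.2, 5.7]
    # Length: 2, dtype: double[pyarrow]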

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        if isinstance(value, ExtensionArray):
            value = value.astype(object)
        # Base class searchsorted would cast to object, which is *much* slower.
        return self.to_numpy().searchsorted(value, side=side, sorter=sorter)

    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.
            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.
        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))
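
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available). The meaning of negative indices flips with
    # allow_fill:
    #
    # >>> arr = pd.array([10, 20, 30], dtype="int64[pyarrow]")
    # >>> arr.take([0, -1])  # -1 counts from the right
    # <ArrowExtensionArray>
    # [10, 30]
    # Length: 2, dtype: int64[pyarrow]
    # >>> arr.take([0, -1], allow_fill=True)  # -1 marks a missing slot
    # <ArrowExtensionArray>
    # [10, <NA>]
    # Length: 2, dtype: int64[pyarrow]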

    @doc(ExtensionArray.to_numpy)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        if dtype is None and self._hasna:
            dtype = object
        if na_value is lib.no_default:
            na_value = self.dtype.na_value

        pa_type = self._data.type
        if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
            # temporal types with units and/or timezones currently
            # require pandas/python scalars to pass all tests
            # TODO: improve performance (this is slow)
            result = np.array(list(self), dtype=dtype)
        elif is_object_dtype(dtype) and self._hasna:
            result = np.empty(len(self), dtype=object)
            mask = ~self.isna()
            result[mask] = np.asarray(self[mask]._data)
        elif pa.types.is_null(self._data.type):
            result = np.asarray(self._data, dtype=dtype)
            if not isna(na_value):
                result[:] = na_value
            return result
        elif self._hasna:
            data = self.copy()
            data[self.isna()] = na_value
            return np.asarray(data._data, dtype=dtype)
        else:
            result = np.asarray(self._data, dtype=dtype)
            if copy:
                result = result.copy()
        if self._hasna:
            result[self.isna()] = na_value
        return result

    def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
        """
        pa_type = self._data.type

        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        pa_result = pc.unique(data)

        if pa.types.is_duration(pa_type):
            pa_result = pa_result.cast(pa_type)

        return type(self)(pa_result)
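
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available). Order of first appearance is preserved and nulls
    # are kept:
    #
    # >>> pd.array([1, 2, 1, None], dtype="int64[pyarrow]").unique()
    # <ArrowExtensionArray>
    # [1, 2, <NA>]
    # Length: 3, dtype: int64[pyarrow]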

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        from pandas import (
            Index,
            Series,
        )

        vc = data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and data.null_count > 0:
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        if pa.types.is_duration(pa_type):
            values = values.cast(pa_type)

        counts = ArrowExtensionArray(counts)

        index = Index(type(self)(values))

        return Series(counts, index=index, name="count", copy=False)
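
    # Example (illustrative sketch, not part of the original source; assumes
    # pyarrow is available):
    #
    # >>> pd.array([1, 1, 2, None], dtype="int64[pyarrow]").value_counts()
    # 1    2
    # 2    1
    # Name: count, dtype: int64[pyarrow]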

    @classmethod
    def _concat_same_type(
        cls: type[ArrowExtensionArrayT], to_concat
    ) -> ArrowExtensionArrayT:
        """
        Concatenate multiple ArrowExtensionArrays.

        Parameters
        ----------
        to_concat : sequence of ArrowExtensionArrays

        Returns
        -------
        ArrowExtensionArray
        """
        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
        if to_concat[0].dtype == "string":
            # StringDtype has no attribute pyarrow_dtype
            pa_dtype = pa.string()
        else:
            pa_dtype = to_concat[0].dtype.pyarrow_dtype
        arr = pa.chunked_array(chunks, type=pa_dtype)
        return cls(arr)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> ArrowExtensionArray | ExtensionArray:
        """
        Return an ExtensionArray performing an accumulation operation.

        The underlying data type might change.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            - cummin
            - cummax
            - cumsum
            - cumprod
        skipna : bool, default True
            If True, skip NA values.
        **kwargs
            Additional keyword arguments passed to the accumulation function.
            Currently, there is no supported kwarg.

        Returns
        -------
        array

        Raises
        ------
        NotImplementedError : subclass does not define accumulations
        """
        pyarrow_name = {
            "cumsum": "cumulative_sum_checked",
        }.get(name, name)
        pyarrow_meth = getattr(pc, pyarrow_name, None)
        if pyarrow_meth is None:
            return super()._accumulate(name, skipna=skipna, **kwargs)

        data_to_accum = self._data

        pa_dtype = data_to_accum.type
        if pa.types.is_duration(pa_dtype):
            data_to_accum = data_to_accum.cast(pa.int64())

        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)

        if pa.types.is_duration(pa_dtype):
            result = result.cast(pa_dtype)

        return type(self)(result)
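
    # Example (illustrative sketch, not part of the original source; assumes
    # a recent pyarrow that provides cumulative_sum_checked). With
    # skipna=True, a null yields a null slot but accumulation continues:
    #
    # >>> pd.array([1, 2, None, 4], dtype="int64[pyarrow]")._accumulate("cumsum")
    # <ArrowExtensionArray>
    # [1, 3, <NA>, 7]
    # Length: 4, dtype: int64[pyarrow]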

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        pa_type = self._data.type

        data_to_reduce = self._data

        if name in ["any", "all"] and (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_duration(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            # pyarrow only supports any/all for boolean dtype, we allow
            # for other dtypes, matching our non-pyarrow behavior
            if pa.types.is_duration(pa_type):
                data_to_cmp = self._data.cast(pa.int64())
            else:
                data_to_cmp = self._data

            not_eq = pc.not_equal(data_to_cmp, 0)
            data_to_reduce = not_eq

        elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            data_to_reduce = self._data.cast(pa.int64())

        elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data_to_reduce = self._data.cast(pa.int32())
            else:
                data_to_reduce = self._data.cast(pa.int64())

        if name == "sem":

            def pyarrow_meth(data, skip_nulls, **kwargs):
                numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
                denominator = pc.sqrt_checked(pc.count(self._data))
                return pc.divide_checked(numerator, denominator)

        else:
            pyarrow_name = {
                "median": "quantile",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)

        # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
        if name in ["any", "all"] and "min_count" not in kwargs:
            kwargs["min_count"] = 0
        elif name == "median":
            # GH 52679: Use quantile instead of approximate_median
            kwargs["q"] = 0.5

        try:
            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if name == "median":
            # GH 52679: Use quantile instead of approximate_median; returns array
            result = result[0]
        if pc.is_null(result).as_py():
            return self.dtype.na_value

        if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            result = result.cast(pa_type)
        if name in ["median", "mean"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa_type)
        if name in ["std", "sem"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa.int64())
            if pa.types.is_duration(pa_type):
                result = result.cast(pa_type)
            elif pa.types.is_time(pa_type):
                unit = get_unit_from_pa_dtype(pa_type)
                result = result.cast(pa.duration(unit))
            elif pa.types.is_date(pa_type):
                # go with closest available unit, i.e. "s"
                result = result.cast(pa.duration("s"))
            else:
                # i.e. timestamp
                result = result.cast(pa.duration(pa_type.unit))

        return result.as_py()
  1153. def __setitem__(self, key, value) -> None:
  1154. """Set one or more values inplace.
  1155. Parameters
  1156. ----------
  1157. key : int, ndarray, or slice
  1158. When called from, e.g. ``Series.__setitem__``, ``key`` will be
  1159. one of
  1160. * scalar int
  1161. * ndarray of integers.
  1162. * boolean ndarray
  1163. * slice object
  1164. value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
  1165. value or values to be set of ``key``.
  1166. Returns
  1167. -------
  1168. None
  1169. """
  1170. # GH50085: unwrap 1D indexers
  1171. if isinstance(key, tuple) and len(key) == 1:
  1172. key = key[0]
  1173. key = check_array_indexer(self, key)
  1174. value = self._maybe_convert_setitem_value(value)
  1175. if com.is_null_slice(key):
  1176. # fast path (GH50248)
  1177. data = self._if_else(True, value, self._data)
  1178. elif is_integer(key):
  1179. # fast path
  1180. key = cast(int, key)
  1181. n = len(self)
  1182. if key < 0:
  1183. key += n
  1184. if not 0 <= key < n:
  1185. raise IndexError(
  1186. f"index {key} is out of bounds for axis 0 with size {n}"
  1187. )
  1188. if is_list_like(value):
  1189. raise ValueError("Length of indexer and values mismatch")
  1190. elif isinstance(value, pa.Scalar):
  1191. value = value.as_py()
  1192. chunks = [
  1193. *self._data[:key].chunks,
  1194. pa.array([value], type=self._data.type, from_pandas=True),
  1195. *self._data[key + 1 :].chunks,
  1196. ]
  1197. data = pa.chunked_array(chunks).combine_chunks()
  1198. elif is_bool_dtype(key):
  1199. key = np.asarray(key, dtype=np.bool_)
  1200. data = self._replace_with_mask(self._data, key, value)
  1201. elif is_scalar(value) or isinstance(value, pa.Scalar):
  1202. mask = np.zeros(len(self), dtype=np.bool_)
  1203. mask[key] = True
  1204. data = self._if_else(mask, value, self._data)
  1205. else:
  1206. indices = np.arange(len(self))[key]
  1207. if len(indices) != len(value):
  1208. raise ValueError("Length of indexer and values mismatch")
  1209. if len(indices) == 0:
  1210. return
  1211. argsort = np.argsort(indices)
  1212. indices = indices[argsort]
  1213. value = value.take(argsort)
  1214. mask = np.zeros(len(self), dtype=np.bool_)
  1215. mask[indices] = True
  1216. data = self._replace_with_mask(self._data, mask, value)
  1217. if isinstance(data, pa.Array):
  1218. data = pa.chunked_array([data])
  1219. self._data = data

    def _rank(
        self,
        *,
        axis: AxisInt = 0,
        method: str = "average",
        na_option: str = "keep",
        ascending: bool = True,
        pct: bool = False,
    ):
        """
        See Series.rank.__doc__.
        """
        if pa_version_under9p0 or axis != 0:
            ranked = super()._rank(
                axis=axis,
                method=method,
                na_option=na_option,
                ascending=ascending,
                pct=pct,
            )
            # keep dtypes consistent with the implementation below
            if method == "average" or pct:
                pa_type = pa.float64()
            else:
                pa_type = pa.uint64()
            result = pa.array(ranked, type=pa_type, from_pandas=True)
            return type(self)(result)

        data = self._data.combine_chunks()
        sort_keys = "ascending" if ascending else "descending"
        null_placement = "at_start" if na_option == "top" else "at_end"
        tiebreaker = "min" if method == "average" else method

        result = pc.rank(
            data,
            sort_keys=sort_keys,
            null_placement=null_placement,
            tiebreaker=tiebreaker,
        )

        if na_option == "keep":
            mask = pc.is_null(self._data)
            null = pa.scalar(None, type=result.type)
            result = pc.if_else(mask, null, result)

        if method == "average":
            result_max = pc.rank(
                data,
                sort_keys=sort_keys,
                null_placement=null_placement,
                tiebreaker="max",
            )
            result_max = result_max.cast(pa.float64())
            result_min = result.cast(pa.float64())
            result = pc.divide(pc.add(result_min, result_max), 2)

        if pct:
            if not pa.types.is_floating(result.type):
                result = result.cast(pa.float64())
            if method == "dense":
                divisor = pc.max(result)
            else:
                divisor = pc.count(result)
            result = pc.divide(result, divisor)

        return type(self)(result)
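
    # Sketch of the "average" tie-breaking used above: pyarrow's pc.rank has no
    # "average" tiebreaker, so min- and max-ranks are averaged (assumes
    # pyarrow >= 9.0, imported as pa/pc):
    #
    #   data = pa.array([10, 20, 20, 30])
    #   lo = pc.rank(data, sort_keys="ascending", tiebreaker="min")  # [1, 2, 2, 4]
    #   hi = pc.rank(data, sort_keys="ascending", tiebreaker="max")  # [1, 3, 3, 4]
    #   avg = pc.divide(
    #       pc.add(lo.cast(pa.float64()), hi.cast(pa.float64())), 2
    #   )  # [1.0, 2.5, 2.5, 4.0]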

    def _quantile(
        self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
    ) -> ArrowExtensionArrayT:
        """
        Compute the quantiles of self for each quantile in `qs`.

        Parameters
        ----------
        qs : np.ndarray[float64]
        interpolation : str

        Returns
        -------
        same type as self
        """
        pa_dtype = self._data.type

        data = self._data
        if pa.types.is_temporal(pa_dtype):
            # https://github.com/apache/arrow/issues/33769 in these cases
            # we can cast to ints and back
            nbits = pa_dtype.bit_width
            if nbits == 32:
                data = data.cast(pa.int32())
            else:
                data = data.cast(pa.int64())

        result = pc.quantile(data, q=qs, interpolation=interpolation)

        if pa.types.is_temporal(pa_dtype):
            nbits = pa_dtype.bit_width
            if nbits == 32:
                result = result.cast(pa.int32())
            else:
                result = result.cast(pa.int64())
            result = result.cast(pa_dtype)

        return type(self)(result)
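
    # Worked example of the temporal round-trip above (hypothetical values):
    # pc.quantile rejects timestamp input, so the underlying bits are viewed
    # as integers, quantiled as floats, then cast back through int to the
    # original temporal type:
    #
    #   ints = pa.array([1_000, 2_000, 3_000], type=pa.int64())
    #   pc.quantile(ints, q=0.5, interpolation="linear")  # -> [2000.0]
    #   # float result -> cast int64 -> cast timestamp[...] in _quantile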

    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Returns the mode(s) of the ExtensionArray.

        Always returns `ExtensionArray` even if only one value.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.

        Returns
        -------
        same type as self
            Sorted, if possible.
        """
        pa_type = self._data.type
        if pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data = self._data.cast(pa.int32())
            elif nbits == 64:
                data = self._data.cast(pa.int64())
            else:
                raise NotImplementedError(pa_type)
        else:
            data = self._data

        if dropna:
            data = data.drop_null()

        res = pc.value_counts(data)
        most_common = res.field("values").filter(
            pc.equal(res.field("counts"), pc.max(res.field("counts")))
        )

        if pa.types.is_temporal(pa_type):
            most_common = most_common.cast(pa_type)

        return type(self)(most_common)
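
    # Sketch of the value_counts-based mode above (assumes pa/pc imports):
    #
    #   data = pa.chunked_array([[1, 2, 2, 3, 3]])
    #   res = pc.value_counts(data)  # struct array with "values"/"counts"
    #   res.field("values").filter(
    #       pc.equal(res.field("counts"), pc.max(res.field("counts")))
    #   )  # -> [2, 3], i.e. every value tied for the highest count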

    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        if value is None:
            return value
        if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
            return value
        if is_list_like(value):
            pa_box = pa.array
        else:
            pa_box = pa.scalar
        try:
            value = pa_box(value, type=self._data.type, from_pandas=True)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
            raise TypeError(msg) from err
        return value
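
    # Illustrative behaviour of the conversion above (hypothetical `arr`):
    # list-likes are boxed with pa.array, scalars with pa.scalar, and a value
    # pyarrow rejects with ArrowTypeError is re-raised as TypeError:
    #
    #   arr = pd.array([1, 2], dtype="int64[pyarrow]")
    #   arr._maybe_convert_setitem_value(5)       # pa.scalar(5, type=pa.int64())
    #   arr._maybe_convert_setitem_value([5, 6])  # pa.array([5, 6], type=pa.int64())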

    @classmethod
    def _if_else(
        cls,
        cond: npt.NDArray[np.bool_] | bool,
        left: ArrayLike | Scalar,
        right: ArrayLike | Scalar,
    ):
        """
        Choose values based on a condition.

        Analogous to pyarrow.compute.if_else, with logic
        to fall back to numpy for unsupported types.

        Parameters
        ----------
        cond : npt.NDArray[np.bool_] or bool
        left : ArrayLike | Scalar
        right : ArrayLike | Scalar

        Returns
        -------
        pa.Array
        """
        try:
            return pc.if_else(cond, left, right)
        except pa.ArrowNotImplementedError:
            pass

        def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
            if isinstance(value, (pa.Array, pa.ChunkedArray)):
                pa_type = value.type
            elif isinstance(value, pa.Scalar):
                pa_type = value.type
                value = value.as_py()
            else:
                pa_type = None
            return np.array(value, dtype=object), pa_type

        left, left_type = _to_numpy_and_type(left)
        right, right_type = _to_numpy_and_type(right)
        pa_type = left_type or right_type
        result = np.where(cond, left, right)
        return pa.array(result, type=pa_type, from_pandas=True)
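
    # Minimal sketch of the numpy fallback above (assumes np/pa imports): both
    # sides become object ndarrays, np.where combines them, and the result is
    # rebuilt as an Arrow array of whichever Arrow type was seen first:
    #
    #   cond = np.array([True, False, True])
    #   left = np.array([1, 1, 1], dtype=object)
    #   right = np.array([0, 0, 0], dtype=object)
    #   pa.array(np.where(cond, left, right), type=pa.int64(), from_pandas=True)
    #   # -> [1, 0, 1]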

    @classmethod
    def _replace_with_mask(
        cls,
        values: pa.Array | pa.ChunkedArray,
        mask: npt.NDArray[np.bool_] | bool,
        replacements: ArrayLike | Scalar,
    ):
        """
        Replace items selected with a mask.

        Analogous to pyarrow.compute.replace_with_mask, with logic
        to fall back to numpy for unsupported types.

        Parameters
        ----------
        values : pa.Array or pa.ChunkedArray
        mask : npt.NDArray[np.bool_] or bool
        replacements : ArrayLike or Scalar
            Replacement value(s)

        Returns
        -------
        pa.Array or pa.ChunkedArray
        """
        if isinstance(replacements, pa.ChunkedArray):
            # replacements must be array or scalar, not ChunkedArray
            replacements = replacements.combine_chunks()

        if pa_version_under8p0:
            # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
            # version <= 7: segfaults with various types
            # version <= 6: fails to replace nulls
            if isinstance(replacements, pa.Array):
                indices = np.full(len(values), None)
                indices[mask] = np.arange(len(replacements))
                indices = pa.array(indices, type=pa.int64())
                replacements = replacements.take(indices)
            return cls._if_else(mask, replacements, values)

        if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type):
            # GH#52059 replace_with_mask segfaults for chunked array
            # https://github.com/apache/arrow/issues/34634
            values = values.combine_chunks()

        try:
            return pc.replace_with_mask(values, mask, replacements)
        except pa.ArrowNotImplementedError:
            pass

        if isinstance(replacements, pa.Array):
            replacements = np.array(replacements, dtype=object)
        elif isinstance(replacements, pa.Scalar):
            replacements = replacements.as_py()

        result = np.array(values, dtype=object)
        result[mask] = replacements
        return pa.array(result, type=values.type, from_pandas=True)
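
    # Sketch of the pre-8.0 expansion above (assumes np/pa imports): a
    # null-padded index vector spreads the replacements onto the masked
    # positions before routing through the _if_else fallback:
    #
    #   mask = np.array([False, True, True])
    #   replacements = pa.array([10, 11])
    #   indices = np.full(3, None)
    #   indices[mask] = np.arange(2)  # [None, 0, 1]
    #   replacements.take(pa.array(indices, type=pa.int64()))  # [null, 10, 11]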

    def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
        """Apply a callable to each element while maintaining the chunking structure."""
        return [
            [
                None if val is None else func(val)
                for val in chunk.to_numpy(zero_copy_only=False)
            ]
            for chunk in self._data.iterchunks()
        ]
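
    # Illustrative call (hypothetical two-chunk string data): the list-of-lists
    # result mirrors the input chunking, so the callers below can feed it
    # straight back into pa.chunked_array:
    #
    #   self._data = pa.chunked_array([["a", "b"], ["c", None]])
    #   self._apply_elementwise(str.upper)  # [["A", "B"], ["C", None]]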

    def _str_count(self, pat: str, flags: int = 0):
        if flags:
            raise NotImplementedError(f"count not implemented with {flags=}")
        return type(self)(pc.count_substring_regex(self._data, pat))

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        if side == "left":
            pa_pad = pc.utf8_lpad
        elif side == "right":
            pa_pad = pc.utf8_rpad
        elif side == "both":
            pa_pad = pc.utf8_center
        else:
            raise ValueError(
                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
            )
        return type(self)(pa_pad(self._data, width=width, padding=fillchar))

    def _str_contains(
        self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
    ):
        if flags:
            raise NotImplementedError(f"contains not implemented with {flags=}")

        if regex:
            pa_contains = pc.match_substring_regex
        else:
            pa_contains = pc.match_substring
        result = pa_contains(self._data, pat, ignore_case=not case)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_startswith(self, pat: str, na=None):
        result = pc.starts_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_endswith(self, pat: str, na=None):
        result = pc.ends_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
            raise NotImplementedError(
                "replace is not supported with a re.Pattern, callable repl, "
                "case=False, or flags!=0"
            )

        func = pc.replace_substring_regex if regex else pc.replace_substring
        result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
        return type(self)(result)

    def _str_repeat(self, repeats: int | Sequence[int]):
        if not isinstance(repeats, int):
            raise NotImplementedError(
                f"repeat is not implemented when repeats is {type(repeats).__name__}"
            )
        elif pa_version_under7p0:
            raise NotImplementedError("repeat is not implemented for pyarrow < 7")
        else:
            return type(self)(pc.binary_repeat(self._data, repeats))

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        if not pat.startswith("^"):
            pat = f"^{pat}"
        return self._str_contains(pat, case, flags, na, regex=True)

    def _str_fullmatch(
        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        # append "$" unless the pattern already ends with an unescaped "$";
        # an escaped r"\$" still needs the anchor
        if not pat.endswith("$") or pat.endswith("\\$"):
            pat = f"{pat}$"
        return self._str_match(pat, case, flags, na)
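
    # Sketch of the anchoring above (hypothetical `arr`): match prepends "^",
    # fullmatch additionally appends "$", and both reduce to _str_contains:
    #
    #   arr = pd.array(["ab", "abc"], dtype="string[pyarrow]")
    #   arr._str_match("ab")      # regex "^ab"   -> [True, True]
    #   arr._str_fullmatch("ab")  # regex "^ab$"  -> [True, False]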

    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if start != 0 and end is not None:
            slices = pc.utf8_slice_codeunits(self._data, start, stop=end)
            result = pc.find_substring(slices, sub)
            not_found = pc.equal(result, -1)
            # find_substring returns positions relative to the slice, so
            # shift hits back by `start`; misses stay at -1
            offset_result = pc.add(result, start)
            result = pc.if_else(not_found, result, offset_result)
        elif start == 0 and end is None:
            slices = self._data
            result = pc.find_substring(slices, sub)
        else:
            raise NotImplementedError(
                f"find not implemented with {sub=}, {start=}, {end=}"
            )
        return type(self)(result)
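
    # Worked example of the offset arithmetic above: for "xxabc" with start=2
    # and end=5 the slice is "abc", so pc.find_substring reports the hit at 0;
    # adding start recovers position 2 in the original string, while a miss
    # (-1) is left untouched by the if_else.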

    def _str_get(self, i: int):
        lengths = pc.utf8_length(self._data)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            start = i
            stop = i - 1
            step = -1
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._data, start=start, stop=stop, step=step
        )
        result = pa.array([None] * self._data.length(), type=self._data.type)
        result = pc.if_else(not_out_of_bounds, selected, result)
        return type(self)(result)
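
    # Sketch of the bounds handling above (hypothetical `arr`): indices past
    # the end of a string yield null instead of raising, like Series.str.get:
    #
    #   arr = pd.array(["ab", "a"], dtype="string[pyarrow]")
    #   arr._str_get(1)  # ["b", <NA>] since "a" has no index 1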

    def _str_join(self, sep: str):
        return type(self)(pc.binary_join(self._data, sep))

    def _str_partition(self, sep: str, expand: bool):
        predicate = lambda val: val.partition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rpartition(self, sep: str, expand: bool):
        predicate = lambda val: val.rpartition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_slice(
        self, start: int | None = None, stop: int | None = None, step: int | None = None
    ):
        if start is None:
            start = 0
        if step is None:
            step = 1
        return type(self)(
            pc.utf8_slice_codeunits(self._data, start=start, stop=stop, step=step)
        )

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        return type(self)(pc.utf8_replace_slice(self._data, start, stop, repl))

    def _str_isalnum(self):
        return type(self)(pc.utf8_is_alnum(self._data))

    def _str_isalpha(self):
        return type(self)(pc.utf8_is_alpha(self._data))

    def _str_isdecimal(self):
        return type(self)(pc.utf8_is_decimal(self._data))

    def _str_isdigit(self):
        return type(self)(pc.utf8_is_digit(self._data))

    def _str_islower(self):
        return type(self)(pc.utf8_is_lower(self._data))

    def _str_isnumeric(self):
        return type(self)(pc.utf8_is_numeric(self._data))

    def _str_isspace(self):
        return type(self)(pc.utf8_is_space(self._data))

    def _str_istitle(self):
        return type(self)(pc.utf8_is_title(self._data))

    def _str_capitalize(self):
        return type(self)(pc.utf8_capitalize(self._data))

    def _str_title(self):
        return type(self)(pc.utf8_title(self._data))

    def _str_isupper(self):
        return type(self)(pc.utf8_is_upper(self._data))

    def _str_swapcase(self):
        return type(self)(pc.utf8_swapcase(self._data))

    def _str_len(self):
        return type(self)(pc.utf8_length(self._data))

    def _str_lower(self):
        return type(self)(pc.utf8_lower(self._data))

    def _str_upper(self):
        return type(self)(pc.utf8_upper(self._data))

    def _str_strip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_trim_whitespace(self._data)
        else:
            result = pc.utf8_trim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_lstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_ltrim_whitespace(self._data)
        else:
            result = pc.utf8_ltrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_rstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_rtrim_whitespace(self._data)
        else:
            result = pc.utf8_rtrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_removeprefix(self, prefix: str):
        # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
        # starts_with = pc.starts_with(self._data, pattern=prefix)
        # removed = pc.utf8_slice_codeunits(self._data, len(prefix))
        # result = pc.if_else(starts_with, removed, self._data)
        # return type(self)(result)
        if sys.version_info < (3, 9):
            # NOTE pyupgrade will remove this when we run it with --py39-plus
            # so don't remove the unnecessary `else` statement below
            from pandas.util._str_methods import removeprefix

            predicate = functools.partial(removeprefix, prefix=prefix)
        else:
            predicate = lambda val: val.removeprefix(prefix)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))
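
    # Illustrative behaviour (hypothetical `arr`): only a genuine prefix is
    # removed, matching str.removeprefix on both the 3.8 and 3.9+ branches:
    #
    #   arr = pd.array(["abc", "bc"], dtype="string[pyarrow]")
    #   arr._str_removeprefix("ab")  # ["c", "bc"]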

    def _str_removesuffix(self, suffix: str):
        ends_with = pc.ends_with(self._data, pattern=suffix)
        removed = pc.utf8_slice_codeunits(self._data, 0, stop=-len(suffix))
        result = pc.if_else(ends_with, removed, self._data)
        return type(self)(result)

    def _str_casefold(self):
        predicate = lambda val: val.casefold()
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_encode(self, encoding: str, errors: str = "strict"):
        predicate = lambda val: val.encode(encoding, errors)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
        raise NotImplementedError(
            "str.extract not supported with pd.ArrowDtype(pa.string())."
        )

    def _str_findall(self, pat: str, flags: int = 0):
        regex = re.compile(pat, flags=flags)
        predicate = lambda val: regex.findall(val)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_get_dummies(self, sep: str = "|"):
        split = pc.split_pattern(self._data, sep).combine_chunks()
        uniques = split.flatten().unique()
        uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
        result_data = []
        for lst in split.to_pylist():
            if lst is None:
                result_data.append([False] * len(uniques_sorted))
            else:
                res = pc.is_in(uniques_sorted, pa.array(set(lst)))
                result_data.append(res.to_pylist())
        result = type(self)(pa.array(result_data))
        return result, uniques_sorted.to_pylist()
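
    # Worked example of the dummies construction above (hypothetical data):
    #
    #   values   ["a|b", "b", None]  with sep="|"
    #   split    [["a", "b"], ["b"], None]
    #   uniques  ["a", "b"] (sorted)
    #   rows     [[True, True], [False, True], [False, False]]
    #   # returned as (boolean array, ["a", "b"])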

    def _str_index(self, sub: str, start: int = 0, end: int | None = None):
        predicate = lambda val: val.index(sub, start, end)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
        predicate = lambda val: val.rindex(sub, start, end)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_normalize(self, form: str):
        predicate = lambda val: unicodedata.normalize(form, val)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rfind(self, sub: str, start: int = 0, end=None):
        predicate = lambda val: val.rfind(sub, start, end)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_split(
        self,
        pat: str | None = None,
        n: int | None = -1,
        expand: bool = False,
        regex: bool | None = None,
    ):
        if n in {-1, 0}:
            n = None
        if regex:
            split_func = pc.split_pattern_regex
        else:
            split_func = pc.split_pattern
        return type(self)(split_func(self._data, pat, max_splits=n))

    def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
        if n in {-1, 0}:
            n = None
        return type(self)(pc.split_pattern(self._data, pat, max_splits=n, reverse=True))

    def _str_translate(self, table: dict[int, str]):
        predicate = lambda val: val.translate(table)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_wrap(self, width: int, **kwargs):
        kwargs["width"] = width
        tw = textwrap.TextWrapper(**kwargs)
        predicate = lambda val: "\n".join(tw.wrap(val))
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    @property
    def _dt_year(self):
        return type(self)(pc.year(self._data))

    @property
    def _dt_day(self):
        return type(self)(pc.day(self._data))

    @property
    def _dt_day_of_week(self):
        return type(self)(pc.day_of_week(self._data))

    _dt_dayofweek = _dt_day_of_week
    _dt_weekday = _dt_day_of_week

    @property
    def _dt_day_of_year(self):
        return type(self)(pc.day_of_year(self._data))

    _dt_dayofyear = _dt_day_of_year

    @property
    def _dt_hour(self):
        return type(self)(pc.hour(self._data))

    def _dt_isocalendar(self):
        return type(self)(pc.iso_calendar(self._data))

    @property
    def _dt_is_leap_year(self):
        return type(self)(pc.is_leap_year(self._data))

    @property
    def _dt_microsecond(self):
        return type(self)(pc.microsecond(self._data))

    @property
    def _dt_minute(self):
        return type(self)(pc.minute(self._data))

    @property
    def _dt_month(self):
        return type(self)(pc.month(self._data))

    @property
    def _dt_nanosecond(self):
        return type(self)(pc.nanosecond(self._data))

    @property
    def _dt_quarter(self):
        return type(self)(pc.quarter(self._data))

    @property
    def _dt_second(self):
        return type(self)(pc.second(self._data))

    @property
    def _dt_date(self):
        return type(self)(self._data.cast(pa.date32()))

    @property
    def _dt_time(self):
        unit = (
            self.dtype.pyarrow_dtype.unit
            if self.dtype.pyarrow_dtype.unit in {"us", "ns"}
            else "ns"
        )
        return type(self)(self._data.cast(pa.time64(unit)))

    @property
    def _dt_tz(self):
        return self.dtype.pyarrow_dtype.tz

    def _dt_strftime(self, format: str):
        return type(self)(pc.strftime(self._data, format=format))

    def _round_temporally(
        self,
        method: Literal["ceil", "floor", "round"],
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        if ambiguous != "raise":
            raise NotImplementedError("ambiguous is not supported.")
        if nonexistent != "raise":
            raise NotImplementedError("nonexistent is not supported.")
        offset = to_offset(freq)
        if offset is None:
            raise ValueError(f"Must specify a valid frequency: {freq}")
        pa_supported_unit = {
            "A": "year",
            "AS": "year",
            "Q": "quarter",
            "QS": "quarter",
            "M": "month",
            "MS": "month",
            "W": "week",
            "D": "day",
            "H": "hour",
            "T": "minute",
            "S": "second",
            "L": "millisecond",
            "U": "microsecond",
            "N": "nanosecond",
        }
        unit = pa_supported_unit.get(offset._prefix, None)
        if unit is None:
            raise ValueError(f"{freq=} is not supported")
        multiple = offset.n
        rounding_method = getattr(pc, f"{method}_temporal")
        return type(self)(rounding_method(self._data, multiple=multiple, unit=unit))
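
    # Sketch of the frequency translation above: a pandas offset alias is
    # split into a pyarrow unit and a multiple, then dispatched to
    # pc.<method>_temporal, e.g. for freq="15T":
    #
    #   offset = to_offset("15T")
    #   offset._prefix, offset.n  # ("T", 15) -> unit="minute", multiple=15
    #   # pc.round_temporal(self._data, multiple=15, unit="minute")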

    def _dt_ceil(
        self,
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        return self._round_temporally("ceil", freq, ambiguous, nonexistent)

    def _dt_floor(
        self,
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        return self._round_temporally("floor", freq, ambiguous, nonexistent)

    def _dt_round(
        self,
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        return self._round_temporally("round", freq, ambiguous, nonexistent)

    def _dt_to_pydatetime(self):
        if pa.types.is_date(self.dtype.pyarrow_dtype):
            raise ValueError(
                f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
                "Convert to pyarrow timestamp type."
            )
        data = self._data.to_pylist()
        if self._dtype.pyarrow_dtype.unit == "ns":
            data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
        return np.array(data, dtype=object)

    def _dt_tz_localize(
        self,
        tz,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        if ambiguous != "raise":
            raise NotImplementedError(f"{ambiguous=} is not supported")
        nonexistent_pa = {
            "raise": "raise",
            "shift_backward": "earliest",
            "shift_forward": "latest",
        }.get(
            nonexistent, None  # type: ignore[arg-type]
        )
        if nonexistent_pa is None:
            raise NotImplementedError(f"{nonexistent=} is not supported")
        if tz is None:
            result = self._data.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
        else:
            result = pc.assume_timezone(
                self._data, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
            )
        return type(self)(result)
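
    # Illustrative calls (hypothetical tz-naive `arr`): tz=None drops the
    # timezone via a plain timestamp cast, otherwise pc.assume_timezone
    # attaches one, with pandas' nonexistent options renamed for pyarrow:
    #
    #   arr._dt_tz_localize("US/Eastern")  # pc.assume_timezone(..., "US/Eastern")
    #   arr._dt_tz_localize(None)          # cast to timestamp[<unit>]
    #   arr._dt_tz_localize("US/Eastern", nonexistent="shift_forward")
    #   # forwarded to pyarrow as nonexistent="latest"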