from __future__ import annotations

from copy import deepcopy
import functools
import operator
import re
import sys
import textwrap
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import unicodedata

import numpy as np

from pandas._libs import lib
from pandas._typing import (
    ArrayLike,
    AxisInt,
    Dtype,
    FillnaOptions,
    Iterator,
    NpDtype,
    PositionalIndexer,
    Scalar,
    SortKind,
    TakeIndexer,
    TimeAmbiguous,
    TimeNonexistent,
    npt,
)
from pandas.compat import (
    pa_version_under7p0,
    pa_version_under8p0,
    pa_version_under9p0,
    pa_version_under11p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import roperator
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionArraySupportsAnyAll,
)
import pandas.core.common as com
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.tseries.frequencies import to_offset

if not pa_version_under7p0:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
    from pandas.core.arrays.arrow.dtype import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and_": pc.and_kleene,
        "rand_": lambda x, y: pc.and_kleene(y, x),
        "or_": pc.or_kleene,
        "ror_": lambda x, y: pc.or_kleene(y, x),
        "xor": pc.xor,
        "rxor": lambda x, y: pc.xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> pa.ChunkedArray:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            return arrow_array.cast(pa.float64())
        return arrow_array

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # Ensure int // int -> int mirroring Python/Numpy behavior
        # as pc.floor(pc.divide_checked(int, int)) -> float
        result = pc.floor(pc.divide(left, right))
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            result = result.cast(left.type)
        return result

    ARROW_ARITHMETIC_FUNCS = {
        "add": pc.add_checked,
        "radd": lambda x, y: pc.add_checked(y, x),
        "sub": pc.subtract_checked,
        "rsub": lambda x, y: pc.subtract_checked(y, x),
        "mul": pc.multiply_checked,
        "rmul": lambda x, y: pc.multiply_checked(y, x),
        "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
        "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
        "floordiv": lambda x, y: floordiv_compat(x, y),
        "rfloordiv": lambda x, y: floordiv_compat(y, x),
        "mod": NotImplemented,
        "rmod": NotImplemented,
        "divmod": NotImplemented,
        "rdivmod": NotImplemented,
        "pow": pc.power_checked,
        "rpow": lambda x, y: pc.power_checked(y, x),
    }
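
# Illustrative sketch (comments only, nothing executed at import): the tables
# above map dunder-op names to pyarrow.compute kernels, so an op resolves as
#
#   >>> import pyarrow as pa
#   >>> ARROW_CMP_FUNCS["eq"](pa.chunked_array([[1, 2, None]]), pa.scalar(1))
#   # -> [true, false, null]    (nulls propagate, matching pd.NA semantics)
#
# while the Kleene kernels keep three-valued logic:
#
#   >>> ARROW_LOGICAL_FUNCS["and_"](pa.array([True, None]), pa.scalar(True))
#   # -> [true, null]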
if TYPE_CHECKING:
    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

    from pandas import Series

ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")


def get_unit_from_pa_dtype(pa_dtype):
    # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
    if pa_version_under11p0:
        unit = str(pa_dtype).split("[", 1)[-1][:-1]
        if unit not in ["s", "ms", "us", "ns"]:
            raise ValueError(pa_dtype)
        return unit
    return pa_dtype.unit


def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.
    """
    if isinstance(dtype, ArrowDtype):
        return dtype.pyarrow_dtype
    elif isinstance(dtype, pa.DataType):
        return dtype
    elif isinstance(dtype, DatetimeTZDtype):
        return pa.timestamp(dtype.unit, dtype.tz)
    elif dtype:
        try:
            # Accepts python types too
            # Doesn't handle all numpy types
            return pa.from_numpy_dtype(dtype)
        except pa.ArrowNotImplementedError:
            pass
    return None
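
# Illustrative usage (expected values, assuming pyarrow is installed):
#
#   >>> to_pyarrow_type(pa.int64())               # pa.DataType passes through
#   DataType(int64)
#   >>> to_pyarrow_type(ArrowDtype(pa.string()))  # unwraps the pandas dtype
#   DataType(string)
#   >>> to_pyarrow_type(np.dtype("float32"))      # numpy dtypes are converted
#   DataType(float)
#   >>> to_pyarrow_type(None)                     # "no dtype requested" -> None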
class ArrowExtensionArray(
    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__
    Some methods may raise an exception or emit a ``PerformanceWarning`` if an
    associated compute function is not available for the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    _data: pa.ChunkedArray
    _dtype: ArrowDtype

    def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
        if pa_version_under7p0:
            msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray."
            raise ImportError(msg)
        if isinstance(values, pa.Array):
            self._data = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._data = values
        else:
            raise ValueError(
                f"Unsupported type '{type(values)}' for ArrowExtensionArray"
            )
        self._dtype = ArrowDtype(self._data.type)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        """
        Construct a new ExtensionArray from a sequence of scalars.
        """
        pa_dtype = to_pyarrow_type(dtype)
        if (
            isinstance(scalars, np.ndarray)
            and isinstance(dtype, ArrowDtype)
            and (
                pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
            )
        ):
            # See https://github.com/apache/arrow/issues/35289
            scalars = scalars.tolist()

        if isinstance(scalars, cls):
            scalars = scalars._data
        elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            if copy and is_array_like(scalars):
                # pa array should not get updated when numpy array is updated
                scalars = deepcopy(scalars)
            try:
                scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
            except pa.ArrowInvalid:
                # GH50430: let pyarrow infer type, then cast
                scalars = pa.array(scalars, from_pandas=True)
            if pa_dtype:
                if pa.types.is_dictionary(pa_dtype):
                    scalars = scalars.dictionary_encode()
                else:
                    scalars = scalars.cast(pa_dtype)
        arr = cls(scalars)
        if pa.types.is_duration(scalars.type) and scalars.null_count > 0:
            # GH52843: upstream bug for duration types when originally
            # constructed with data containing numpy NaT.
            # https://github.com/apache/arrow/issues/35088
            arr = arr.fillna(arr.dtype.na_value)
        return arr
    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.
        """
        pa_type = to_pyarrow_type(dtype)
        if (
            pa_type is None
            or pa.types.is_binary(pa_type)
            or pa.types.is_string(pa_type)
        ):
            # pa_type is None: Let pa.array infer
            # pa_type is string/binary: scalars already correct type
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
            if pa_type.unit != "ns":
                # GH51175: test_from_sequence_of_strings_pa_array
                # attempt to parse as int64 reflecting pyarrow's
                # duration to string casting behavior
                mask = isna(scalars)
                if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
                    strings = pa.array(strings, type=pa.string(), from_pandas=True)
                strings = pc.if_else(mask, None, strings)
                try:
                    scalars = strings.cast(pa.int64())
                except pa.ArrowInvalid:
                    pass
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            from pandas.core.arrays import BooleanArray

            scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
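
    # Illustrative usage (expected values, assuming pyarrow is installed):
    #
    #   >>> ArrowExtensionArray._from_sequence_of_strings(
    #   ...     ["1", "2", None], dtype=pd.ArrowDtype(pa.int64())
    #   ... )
    #   <ArrowExtensionArray>
    #   [1, 2, <NA>]
    #   Length: 3, dtype: int64[pyarrow]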
    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if item is Ellipsis:
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar
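
    # Illustrative indexing behavior (expected values, per the docstring above):
    #
    #   >>> arr = pd.array([1, None, 3], dtype="int64[pyarrow]")
    #   >>> arr[0]                               # scalar position -> python scalar
    #   1
    #   >>> arr[1]                               # missing value -> NA sentinel
    #   <NA>
    #   >>> arr[1:]                              # slice -> length-2 ArrowExtensionArray
    #   >>> arr[np.array([True, False, True])]   # boolean mask -> filtered array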
    def __iter__(self) -> Iterator[Any]:
        """
        Iterate over elements of the array.
        """
        na_value = self._dtype.na_value
        for value in self._data:
            val = value.as_py()
            if val is None:
                yield na_value
            else:
                yield val

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow ChunkedArray."""
        return self._data

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        """Correctly construct numpy arrays when passed to `np.asarray()`."""
        return self.to_numpy(dtype=dtype)

    def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.invert(self._data))

    def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.negate_checked(self._data))

    def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(self._data)

    def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.abs_checked(self._data))

    # GH 42600: __getstate__/__setstate__ not necessary once
    # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
    def __getstate__(self):
        state = self.__dict__.copy()
        state["_data"] = self._data.combine_chunks()
        return state

    def __setstate__(self, state) -> None:
        state["_data"] = pa.chunked_array(state["_data"])
        self.__dict__.update(state)

    def _cmp_method(self, other, op):
        from pandas.core.arrays.masked import BaseMaskedArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                result = pa.array(result, type=pa.bool_())
                result = pc.if_else(valid, result, None)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return ArrowExtensionArray(result)
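
    # Illustrative comparison semantics (expected output):
    #
    #   >>> pd.array([1.0, 2.0, None], dtype="float64[pyarrow]") == 2.0
    #   <ArrowExtensionArray>
    #   [False, True, <NA>]
    #   Length: 3, dtype: bool[pyarrow]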
    def _evaluate_op_method(self, other, op, arrow_funcs):
        from pandas.core.arrays.masked import BaseMaskedArray

        pa_type = self._data.type
        if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
            operator.add,
            roperator.radd,
        ]:
            length = self._data.length()

            seps: list[str] | list[bytes]
            if pa.types.is_string(pa_type):
                seps = [""] * length
            else:
                seps = [b""] * length

            if is_scalar(other):
                other = [other] * length
            elif isinstance(other, type(self)):
                other = other._data
            if op is operator.add:
                result = pc.binary_join_element_wise(self._data, other, seps)
            else:
                result = pc.binary_join_element_wise(other, self._data, seps)
            return type(self)(result)

        pc_func = arrow_funcs[op.__name__]
        if pc_func is NotImplemented:
            raise NotImplementedError(f"{op.__name__} not implemented.")

        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, pa.array(other, from_pandas=True))
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS:
                # pyarrow kleene ops require null to be typed
                pa_scalar = pa.scalar(None, type=self._data.type)
            else:
                pa_scalar = pa.scalar(other)
            result = pc_func(self._data, pa_scalar)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return type(self)(result)

    def _logical_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)

    def equals(self, other) -> bool:
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._data == other._data
    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    def __contains__(self, key) -> bool:
        # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
        if isna(key) and key is not self.dtype.na_value:
            if self.dtype.kind == "f" and lib.is_float(key) and isna(key):
                return pc.any(pc.is_nan(self._data)).as_py()

            # e.g. date or timestamp types we do not allow None here to match pd.NA
            return False
            # TODO: maybe complex? object?

        return bool(super().__contains__(key))

    @property
    def _hasna(self) -> bool:
        return self._data.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        return self._data.is_null().to_numpy()
    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        """
        return self._reduce("any", skipna=skipna, **kwargs)

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        """
        return self._reduce("all", skipna=skipna, **kwargs)
    def argsort(
        self,
        *,
        ascending: bool = True,
        kind: SortKind = "quicksort",
        na_position: str = "last",
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None:
            raise ValueError(f"invalid na_position: {na_position}")

        result = pc.array_sort_indices(
            self._data, order=order, null_placement=null_placement
        )
        np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._data.length() in (0, self._data.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        data = self._data
        if pa.types.is_duration(data.type):
            data = data.cast(pa.int64())

        value = getattr(pc, method)(data, skip_nulls=skipna)
        return pc.index(data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")
    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._data)

    def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        return type(self)(pc.drop_null(self._data))

    @doc(ExtensionArray.fillna)
    def fillna(
        self: ArrowExtensionArrayT,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> ArrowExtensionArrayT:
        value, method = validate_fillna_kwargs(value, method)

        if limit is not None:
            return super().fillna(value=value, method=method, limit=limit)

        if method is not None:
            fallback_performancewarning()
            return super().fillna(value=value, method=method, limit=limit)

        if is_array_like(value):
            value = cast(ArrayLike, value)
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f" expected {len(self)}"
                )

        def convert_fill_value(value, pa_type, dtype):
            if value is None:
                return value
            if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
                return value
            if is_array_like(value):
                pa_box = pa.array
            else:
                pa_box = pa.scalar
            try:
                value = pa_box(value, type=pa_type, from_pandas=True)
            except pa.ArrowTypeError as err:
                msg = f"Invalid value '{str(value)}' for dtype {dtype}"
                raise TypeError(msg) from err
            return value

        fill_value = convert_fill_value(value, self._data.type, self.dtype)

        try:
            if method is None:
                return type(self)(pc.fill_null(self._data, fill_value=fill_value))
            elif method == "pad":
                return type(self)(pc.fill_null_forward(self._data))
            elif method == "backfill":
                return type(self)(pc.fill_null_backward(self._data))
        except pa.ArrowNotImplementedError:
            # ArrowNotImplementedError: Function 'coalesce' has no kernel
            # matching input types (duration[ns], duration[ns])
            # TODO: remove try/except wrapper if/when pyarrow implements
            # a kernel for duration types.
            pass

        return super().fillna(value=value, method=method, limit=limit)
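
    # Illustrative usage (expected values):
    #
    #   >>> arr = pd.array([1, None, 3], dtype="int64[pyarrow]")
    #   >>> arr.fillna(0)
    #   <ArrowExtensionArray>
    #   [1, 0, 3]
    #   Length: 3, dtype: int64[pyarrow]
    #   >>> arr.fillna(method="pad")   # falls back and emits a PerformanceWarning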
    def isin(self, values) -> npt.NDArray[np.bool_]:
        # short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        values = self._data.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        null_encoding = "mask" if use_na_sentinel else "encode"

        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        if pa.types.is_dictionary(data.type):
            encoded = data
        else:
            encoded = data.dictionary_encode(null_encoding=null_encoding)
        if encoded.length() == 0:
            indices = np.array([], dtype=np.intp)
            uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
        else:
            pa_indices = encoded.combine_chunks().indices
            if pa_indices.null_count > 0:
                pa_indices = pc.fill_null(pa_indices, -1)
            indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
                np.intp, copy=False
            )
            uniques = type(self)(encoded.chunk(0).dictionary)

        if pa.types.is_duration(pa_type):
            uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
        return indices, uniques
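
    # Illustrative usage (expected values):
    #
    #   >>> arr = pd.array(["a", "b", "a", None], dtype=pd.ArrowDtype(pa.string()))
    #   >>> codes, uniques = arr.factorize()
    #   >>> codes                 # -1 is the NA sentinel
    #   array([ 0,  1,  0, -1])
    #   >>> uniques
    #   <ArrowExtensionArray>
    #   ['a', 'b']
    #   Length: 2, dtype: string[pyarrow]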
    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def round(
        self: ArrowExtensionArrayT, decimals: int = 0, *args, **kwargs
    ) -> ArrowExtensionArrayT:
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect.

        Returns
        -------
        ArrowExtensionArray
            Rounded values of the ArrowExtensionArray.

        See Also
        --------
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
        """
        return type(self)(pc.round(self._data, ndigits=decimals))

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        if isinstance(value, ExtensionArray):
            value = value.astype(object)
        # Base class searchsorted would cast to object, which is *much* slower.
        return self.to_numpy().searchsorted(value, side=side, sorter=sorter)
    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))
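
    # Illustrative usage (expected values):
    #
    #   >>> arr = pd.array([10, 20, 30], dtype="int64[pyarrow]")
    #   >>> arr.take([2, 0])                     # plain positional take
    #   # -> [30, 10]
    #   >>> arr.take([0, -1], allow_fill=True)   # -1 marks a missing slot
    #   # -> [10, <NA>]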
    @doc(ExtensionArray.to_numpy)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        if dtype is None and self._hasna:
            dtype = object
        if na_value is lib.no_default:
            na_value = self.dtype.na_value

        pa_type = self._data.type
        if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
            # temporal types with units and/or timezones currently
            # require pandas/python scalars to pass all tests
            # TODO: improve performance (this is slow)
            result = np.array(list(self), dtype=dtype)
        elif is_object_dtype(dtype) and self._hasna:
            result = np.empty(len(self), dtype=object)
            mask = ~self.isna()
            result[mask] = np.asarray(self[mask]._data)
        elif pa.types.is_null(self._data.type):
            result = np.asarray(self._data, dtype=dtype)
            if not isna(na_value):
                result[:] = na_value
            return result
        elif self._hasna:
            data = self.copy()
            data[self.isna()] = na_value
            return np.asarray(data._data, dtype=dtype)
        else:
            result = np.asarray(self._data, dtype=dtype)
            if copy:
                result = result.copy()
        if self._hasna:
            result[self.isna()] = na_value
        return result
    def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
        """
        pa_type = self._data.type

        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        pa_result = pc.unique(data)

        if pa.types.is_duration(pa_type):
            pa_result = pa_result.cast(pa_type)

        return type(self)(pa_result)

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        from pandas import (
            Index,
            Series,
        )

        vc = data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and data.null_count > 0:
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        if pa.types.is_duration(pa_type):
            values = values.cast(pa_type)

        counts = ArrowExtensionArray(counts)
        index = Index(type(self)(values))

        return Series(counts, index=index, name="count", copy=False)
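
    # Illustrative usage (expected values):
    #
    #   >>> arr = pd.array(["a", "a", "b", None], dtype=pd.ArrowDtype(pa.string()))
    #   >>> arr.value_counts()
    #   a    2
    #   b    1
    #   Name: count, dtype: int64[pyarrow]
    #   >>> arr.value_counts(dropna=False)   # adds a <NA> row with count 1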
    @classmethod
    def _concat_same_type(
        cls: type[ArrowExtensionArrayT], to_concat
    ) -> ArrowExtensionArrayT:
        """
        Concatenate multiple ArrowExtensionArrays.

        Parameters
        ----------
        to_concat : sequence of ArrowExtensionArrays

        Returns
        -------
        ArrowExtensionArray
        """
        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
        if to_concat[0].dtype == "string":
            # StringDtype has no attribute pyarrow_dtype
            pa_dtype = pa.string()
        else:
            pa_dtype = to_concat[0].dtype.pyarrow_dtype
        arr = pa.chunked_array(chunks, type=pa_dtype)
        return cls(arr)
    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> ArrowExtensionArray | ExtensionArray:
        """
        Return an ExtensionArray performing an accumulation operation.

        The underlying data type might change.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            - cummin
            - cummax
            - cumsum
            - cumprod
        skipna : bool, default True
            If True, skip NA values.
        **kwargs
            Additional keyword arguments passed to the accumulation function.
            Currently, there is no supported kwarg.

        Returns
        -------
        array

        Raises
        ------
        NotImplementedError : subclass does not define accumulations
        """
        pyarrow_name = {
            "cumsum": "cumulative_sum_checked",
        }.get(name, name)
        pyarrow_meth = getattr(pc, pyarrow_name, None)
        if pyarrow_meth is None:
            return super()._accumulate(name, skipna=skipna, **kwargs)

        data_to_accum = self._data

        pa_dtype = data_to_accum.type
        if pa.types.is_duration(pa_dtype):
            data_to_accum = data_to_accum.cast(pa.int64())

        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)

        if pa.types.is_duration(pa_dtype):
            result = result.cast(pa_dtype)

        return type(self)(result)
    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        pa_type = self._data.type

        data_to_reduce = self._data

        if name in ["any", "all"] and (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_duration(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            # pyarrow only supports any/all for boolean dtype, we allow
            # for other dtypes, matching our non-pyarrow behavior
            if pa.types.is_duration(pa_type):
                data_to_cmp = self._data.cast(pa.int64())
            else:
                data_to_cmp = self._data

            not_eq = pc.not_equal(data_to_cmp, 0)
            data_to_reduce = not_eq

        elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            data_to_reduce = self._data.cast(pa.int64())

        elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data_to_reduce = self._data.cast(pa.int32())
            else:
                data_to_reduce = self._data.cast(pa.int64())

        if name == "sem":

            def pyarrow_meth(data, skip_nulls, **kwargs):
                numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
                denominator = pc.sqrt_checked(pc.count(self._data))
                return pc.divide_checked(numerator, denominator)

        else:
            pyarrow_name = {
                "median": "quantile",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)

        # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
        if name in ["any", "all"] and "min_count" not in kwargs:
            kwargs["min_count"] = 0
        elif name == "median":
            # GH 52679: Use quantile instead of approximate_median
            kwargs["q"] = 0.5

        try:
            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if name == "median":
            # GH 52679: Use quantile instead of approximate_median; returns array
            result = result[0]
        if pc.is_null(result).as_py():
            return self.dtype.na_value

        if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            result = result.cast(pa_type)
        if name in ["median", "mean"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa_type)
        if name in ["std", "sem"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa.int64())
            if pa.types.is_duration(pa_type):
                result = result.cast(pa_type)
            elif pa.types.is_time(pa_type):
                unit = get_unit_from_pa_dtype(pa_type)
                result = result.cast(pa.duration(unit))
            elif pa.types.is_date(pa_type):
                # go with closest available unit, i.e. "s"
                result = result.cast(pa.duration("s"))
            else:
                # i.e. timestamp
                result = result.cast(pa.duration(pa_type.unit))

        return result.as_py()
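
    # Illustrative reductions (expected values):
    #
    #   >>> arr = pd.array([1, 2, None], dtype="int64[pyarrow]")
    #   >>> arr._reduce("sum")             # NAs skipped by default
    #   3
    #   >>> arr._reduce("mean")
    #   1.5
    #   >>> arr._reduce("sum", skipna=False)
    #   <NA>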
    def __setitem__(self, key, value) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.

        Returns
        -------
        None
        """
        # GH50085: unwrap 1D indexers
        if isinstance(key, tuple) and len(key) == 1:
            key = key[0]

        key = check_array_indexer(self, key)
        value = self._maybe_convert_setitem_value(value)

        if com.is_null_slice(key):
            # fast path (GH50248)
            data = self._if_else(True, value, self._data)

        elif is_integer(key):
            # fast path
            key = cast(int, key)
            n = len(self)
            if key < 0:
                key += n
            if not 0 <= key < n:
                raise IndexError(
                    f"index {key} is out of bounds for axis 0 with size {n}"
                )
            if is_list_like(value):
                raise ValueError("Length of indexer and values mismatch")
            elif isinstance(value, pa.Scalar):
                value = value.as_py()
            chunks = [
                *self._data[:key].chunks,
                pa.array([value], type=self._data.type, from_pandas=True),
                *self._data[key + 1 :].chunks,
            ]
            data = pa.chunked_array(chunks).combine_chunks()

        elif is_bool_dtype(key):
            key = np.asarray(key, dtype=np.bool_)
            data = self._replace_with_mask(self._data, key, value)

        elif is_scalar(value) or isinstance(value, pa.Scalar):
            mask = np.zeros(len(self), dtype=np.bool_)
            mask[key] = True
            data = self._if_else(mask, value, self._data)

        else:
            indices = np.arange(len(self))[key]
            if len(indices) != len(value):
                raise ValueError("Length of indexer and values mismatch")
            if len(indices) == 0:
                return
            argsort = np.argsort(indices)
            indices = indices[argsort]
            value = value.take(argsort)
            mask = np.zeros(len(self), dtype=np.bool_)
            mask[indices] = True
            data = self._replace_with_mask(self._data, mask, value)

        if isinstance(data, pa.Array):
            data = pa.chunked_array([data])
        self._data = data
    def _rank(
        self,
        *,
        axis: AxisInt = 0,
        method: str = "average",
        na_option: str = "keep",
        ascending: bool = True,
        pct: bool = False,
    ):
        """
        See Series.rank.__doc__.
        """
        if pa_version_under9p0 or axis != 0:
            ranked = super()._rank(
                axis=axis,
                method=method,
                na_option=na_option,
                ascending=ascending,
                pct=pct,
            )
            # keep dtypes consistent with the implementation below
            if method == "average" or pct:
                pa_type = pa.float64()
            else:
                pa_type = pa.uint64()
            result = pa.array(ranked, type=pa_type, from_pandas=True)
            return type(self)(result)

        data = self._data.combine_chunks()
        sort_keys = "ascending" if ascending else "descending"
        null_placement = "at_start" if na_option == "top" else "at_end"
        tiebreaker = "min" if method == "average" else method

        result = pc.rank(
            data,
            sort_keys=sort_keys,
            null_placement=null_placement,
            tiebreaker=tiebreaker,
        )

        if na_option == "keep":
            mask = pc.is_null(self._data)
            null = pa.scalar(None, type=result.type)
            result = pc.if_else(mask, null, result)

        if method == "average":
            result_max = pc.rank(
                data,
                sort_keys=sort_keys,
                null_placement=null_placement,
                tiebreaker="max",
            )
            result_max = result_max.cast(pa.float64())
            result_min = result.cast(pa.float64())
            result = pc.divide(pc.add(result_min, result_max), 2)

        if pct:
            if not pa.types.is_floating(result.type):
                result = result.cast(pa.float64())
            if method == "dense":
                divisor = pc.max(result)
            else:
                divisor = pc.count(result)
            result = pc.divide(result, divisor)

        return type(self)(result)
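
    # Illustrative ranking (expected values; requires pyarrow >= 9):
    #
    #   >>> pd.Series([3, 1, 1, None], dtype="int64[pyarrow]").rank()
    #   0     3.0
    #   1     1.5
    #   2     1.5
    #   3    <NA>
    #   dtype: double[pyarrow]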
    def _quantile(
        self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
    ) -> ArrowExtensionArrayT:
        """
        Compute the quantiles of self for each quantile in `qs`.

        Parameters
        ----------
        qs : np.ndarray[float64]
        interpolation : str

        Returns
        -------
        same type as self
        """
        pa_dtype = self._data.type

        data = self._data
        if pa.types.is_temporal(pa_dtype):
            # https://github.com/apache/arrow/issues/33769 in these cases
            # we can cast to ints and back
            nbits = pa_dtype.bit_width
            if nbits == 32:
                data = data.cast(pa.int32())
            else:
                data = data.cast(pa.int64())

        result = pc.quantile(data, q=qs, interpolation=interpolation)

        if pa.types.is_temporal(pa_dtype):
            nbits = pa_dtype.bit_width
            if nbits == 32:
                result = result.cast(pa.int32())
            else:
                result = result.cast(pa.int64())
            result = result.cast(pa_dtype)

        return type(self)(result)

    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Returns the mode(s) of the ExtensionArray.

        Always returns `ExtensionArray` even if only one value.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.

        Returns
        -------
        same type as self
            Sorted, if possible.
        """
        pa_type = self._data.type
        if pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data = self._data.cast(pa.int32())
            elif nbits == 64:
                data = self._data.cast(pa.int64())
            else:
                raise NotImplementedError(pa_type)
        else:
            data = self._data

        if dropna:
            data = data.drop_null()

        res = pc.value_counts(data)
        most_common = res.field("values").filter(
            pc.equal(res.field("counts"), pc.max(res.field("counts")))
        )

        if pa.types.is_temporal(pa_type):
            most_common = most_common.cast(pa_type)

        return type(self)(most_common)
    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        if value is None:
            return value
        if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
            return value
        if is_list_like(value):
            pa_box = pa.array
        else:
            pa_box = pa.scalar
        try:
            value = pa_box(value, type=self._data.type, from_pandas=True)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
            raise TypeError(msg) from err
        return value

    @classmethod
    def _if_else(
        cls,
        cond: npt.NDArray[np.bool_] | bool,
        left: ArrayLike | Scalar,
        right: ArrayLike | Scalar,
    ):
        """
        Choose values based on a condition.

        Analogous to pyarrow.compute.if_else, with logic
        to fallback to numpy for unsupported types.

        Parameters
        ----------
        cond : npt.NDArray[np.bool_] or bool
        left : ArrayLike | Scalar
        right : ArrayLike | Scalar

        Returns
        -------
        pa.Array
        """
        try:
            return pc.if_else(cond, left, right)
        except pa.ArrowNotImplementedError:
            pass

        def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
            if isinstance(value, (pa.Array, pa.ChunkedArray)):
                pa_type = value.type
            elif isinstance(value, pa.Scalar):
                pa_type = value.type
                value = value.as_py()
            else:
                pa_type = None
            return np.array(value, dtype=object), pa_type

        left, left_type = _to_numpy_and_type(left)
        right, right_type = _to_numpy_and_type(right)
        pa_type = left_type or right_type
        result = np.where(cond, left, right)
        return pa.array(result, type=pa_type, from_pandas=True)
    @classmethod
    def _replace_with_mask(
        cls,
        values: pa.Array | pa.ChunkedArray,
        mask: npt.NDArray[np.bool_] | bool,
        replacements: ArrayLike | Scalar,
    ):
        """
        Replace items selected with a mask.

        Analogous to pyarrow.compute.replace_with_mask, with logic
        to fallback to numpy for unsupported types.

        Parameters
        ----------
        values : pa.Array or pa.ChunkedArray
        mask : npt.NDArray[np.bool_] or bool
        replacements : ArrayLike or Scalar
            Replacement value(s)

        Returns
        -------
        pa.Array or pa.ChunkedArray
        """
        if isinstance(replacements, pa.ChunkedArray):
            # replacements must be array or scalar, not ChunkedArray
            replacements = replacements.combine_chunks()
        if pa_version_under8p0:
            # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
            # version <= 7: segfaults with various types
            # version <= 6: fails to replace nulls
            if isinstance(replacements, pa.Array):
                indices = np.full(len(values), None)
                indices[mask] = np.arange(len(replacements))
                indices = pa.array(indices, type=pa.int64())
                replacements = replacements.take(indices)
            return cls._if_else(mask, replacements, values)
        if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type):
            # GH#52059 replace_with_mask segfaults for chunked array
            # https://github.com/apache/arrow/issues/34634
            values = values.combine_chunks()
        try:
            return pc.replace_with_mask(values, mask, replacements)
        except pa.ArrowNotImplementedError:
            pass
        if isinstance(replacements, pa.Array):
            replacements = np.array(replacements, dtype=object)
        elif isinstance(replacements, pa.Scalar):
            replacements = replacements.as_py()
        result = np.array(values, dtype=object)
        result[mask] = replacements
        return pa.array(result, type=values.type, from_pandas=True)
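
    # Illustrative sketch (expected values; replacements align with the True
    # positions in the mask):
    #
    #   >>> ArrowExtensionArray._replace_with_mask(
    #   ...     pa.array([1, 2, 3]),
    #   ...     np.array([True, False, True]),
    #   ...     pa.array([9, 9]),
    #   ... )
    #   # -> [9, 2, 9]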
- def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
- """Apply a callable to each element while maintaining the chunking structure."""
- return [
- [
- None if val is None else func(val)
- for val in chunk.to_numpy(zero_copy_only=False)
- ]
- for chunk in self._data.iterchunks()
- ]
- def _str_count(self, pat: str, flags: int = 0):
- if flags:
- raise NotImplementedError(f"count not implemented with {flags=}")
- return type(self)(pc.count_substring_regex(self._data, pat))
- def _str_pad(
- self,
- width: int,
- side: Literal["left", "right", "both"] = "left",
- fillchar: str = " ",
- ):
- if side == "left":
- pa_pad = pc.utf8_lpad
- elif side == "right":
- pa_pad = pc.utf8_rpad
- elif side == "both":
- pa_pad = pc.utf8_center
- else:
- raise ValueError(
- f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
- )
- return type(self)(pa_pad(self._data, width=width, padding=fillchar))
- def _str_contains(
- self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
- ):
- if flags:
- raise NotImplementedError(f"contains not implemented with {flags=}")
- if regex:
- pa_contains = pc.match_substring_regex
- else:
- pa_contains = pc.match_substring
- result = pa_contains(self._data, pat, ignore_case=not case)
- if not isna(na):
- result = result.fill_null(na)
- return type(self)(result)
    def _str_startswith(self, pat: str, na=None):
        result = pc.starts_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_endswith(self, pat: str, na=None):
        result = pc.ends_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
            raise NotImplementedError(
                "replace is not supported with a re.Pattern, callable repl, "
                "case=False, or flags!=0"
            )

        func = pc.replace_substring_regex if regex else pc.replace_substring
        result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
        return type(self)(result)

    def _str_repeat(self, repeats: int | Sequence[int]):
        if not isinstance(repeats, int):
            raise NotImplementedError(
                f"repeat is not implemented when repeats is {type(repeats).__name__}"
            )
        elif pa_version_under7p0:
            raise NotImplementedError("repeat is not implemented for pyarrow < 7")
        else:
            return type(self)(pc.binary_repeat(self._data, repeats))

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        if not pat.startswith("^"):
            pat = f"^{pat}"
        return self._str_contains(pat, case, flags, na, regex=True)

    def _str_fullmatch(
        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        if not pat.endswith("$") or pat.endswith("\\$"):
            # append an anchor unless the pattern already ends with an
            # unescaped "$"; "\\$" is an escaped, literal dollar sign
            # (the check previously read "//$", which tested the wrong slash)
            pat = f"{pat}$"
        return self._str_match(pat, case, flags, na)

    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if start != 0 and end is not None:
            slices = pc.utf8_slice_codeunits(self._data, start, stop=end)
            result = pc.find_substring(slices, sub)
            not_found = pc.equal(result, -1)
            # positions returned by find_substring are relative to the slice,
            # so shift them back by the slice offset `start` (the previous
            # offset of `end - start` over-shifted the result)
            offset_result = pc.add(result, start)
            result = pc.if_else(not_found, result, offset_result)
        elif start == 0 and end is None:
            slices = self._data
            result = pc.find_substring(slices, sub)
        else:
            raise NotImplementedError(
                f"find not implemented with {sub=}, {start=}, {end=}"
            )
        return type(self)(result)

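    # Worked example (editorial addition) for the offset arithmetic above:
    # "abcdef".find("cd", 1, 5) is 2; the match sits at position 1 inside the
    # slice "bcde", and adding the slice offset `start` recovers the index in
    # the original string (1 + 1 == 2).
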
    def _str_get(self, i: int):
        lengths = pc.utf8_length(self._data)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            start = i
            stop = i - 1
            step = -1
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._data, start=start, stop=stop, step=step
        )
        result = pa.array([None] * self._data.length(), type=self._data.type)
        result = pc.if_else(not_out_of_bounds, selected, result)
        return type(self)(result)

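    # Example (editorial addition): for i=1 on ["ab", "c", None] the lengths
    # are [2, 1, null], so "c" is flagged out of bounds and nulls stay null;
    # the if_else then yields ["b", None, None].
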
    def _str_join(self, sep: str):
        return type(self)(pc.binary_join(self._data, sep))

    def _str_partition(self, sep: str, expand: bool):
        predicate = lambda val: val.partition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rpartition(self, sep: str, expand: bool):
        predicate = lambda val: val.rpartition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_slice(
        self, start: int | None = None, stop: int | None = None, step: int | None = None
    ):
        if start is None:
            start = 0
        if step is None:
            step = 1
        return type(self)(
            pc.utf8_slice_codeunits(self._data, start=start, stop=stop, step=step)
        )

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        return type(self)(pc.utf8_replace_slice(self._data, start, stop, repl))

    def _str_isalnum(self):
        return type(self)(pc.utf8_is_alnum(self._data))

    def _str_isalpha(self):
        return type(self)(pc.utf8_is_alpha(self._data))

    def _str_isdecimal(self):
        return type(self)(pc.utf8_is_decimal(self._data))

    def _str_isdigit(self):
        return type(self)(pc.utf8_is_digit(self._data))

    def _str_islower(self):
        return type(self)(pc.utf8_is_lower(self._data))

    def _str_isnumeric(self):
        return type(self)(pc.utf8_is_numeric(self._data))

    def _str_isspace(self):
        return type(self)(pc.utf8_is_space(self._data))

    def _str_istitle(self):
        return type(self)(pc.utf8_is_title(self._data))

    def _str_capitalize(self):
        return type(self)(pc.utf8_capitalize(self._data))

    def _str_title(self):
        return type(self)(pc.utf8_title(self._data))

    def _str_isupper(self):
        return type(self)(pc.utf8_is_upper(self._data))

    def _str_swapcase(self):
        return type(self)(pc.utf8_swapcase(self._data))

    def _str_len(self):
        return type(self)(pc.utf8_length(self._data))

    def _str_lower(self):
        return type(self)(pc.utf8_lower(self._data))

    def _str_upper(self):
        return type(self)(pc.utf8_upper(self._data))

    def _str_strip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_trim_whitespace(self._data)
        else:
            result = pc.utf8_trim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_lstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_ltrim_whitespace(self._data)
        else:
            result = pc.utf8_ltrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_rstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_rtrim_whitespace(self._data)
        else:
            result = pc.utf8_rtrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_removeprefix(self, prefix: str):
        # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
        # starts_with = pc.starts_with(self._data, pattern=prefix)
        # removed = pc.utf8_slice_codeunits(self._data, len(prefix))
        # result = pc.if_else(starts_with, removed, self._data)
        # return type(self)(result)
        if sys.version_info < (3, 9):
            # NOTE: pyupgrade will remove this when we run it with --py39-plus,
            # so don't remove the unnecessary `else` statement below
            from pandas.util._str_methods import removeprefix

            predicate = functools.partial(removeprefix, prefix=prefix)
        else:
            predicate = lambda val: val.removeprefix(prefix)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_removesuffix(self, suffix: str):
        ends_with = pc.ends_with(self._data, pattern=suffix)
        removed = pc.utf8_slice_codeunits(self._data, 0, stop=-len(suffix))
        result = pc.if_else(ends_with, removed, self._data)
        return type(self)(result)

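    # Example (editorial addition): removesuffix gates the slice on ends_with,
    # so non-matching values pass through unchanged:
    #
    #   s = pd.Series(["abcdef", "abc"], dtype=pd.ArrowDtype(pa.string()))
    #   s.str.removesuffix("def")  # -> ["abc", "abc"]
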
    def _str_casefold(self):
        predicate = lambda val: val.casefold()
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_encode(self, encoding: str, errors: str = "strict"):
        predicate = lambda val: val.encode(encoding, errors)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
        raise NotImplementedError(
            "str.extract not supported with pd.ArrowDtype(pa.string())."
        )

    def _str_findall(self, pat: str, flags: int = 0):
        regex = re.compile(pat, flags=flags)
        predicate = lambda val: regex.findall(val)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_get_dummies(self, sep: str = "|"):
        split = pc.split_pattern(self._data, sep).combine_chunks()
        uniques = split.flatten().unique()
        uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
        result_data = []
        for lst in split.to_pylist():
            if lst is None:
                result_data.append([False] * len(uniques_sorted))
            else:
                res = pc.is_in(uniques_sorted, pa.array(set(lst)))
                result_data.append(res.to_pylist())
        result = type(self)(pa.array(result_data))
        return result, uniques_sorted.to_pylist()

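    # Example (editorial addition): the indicator columns come from the sorted
    # unique tokens, with an all-False row for nulls:
    #
    #   s = pd.Series(["a|b", "b", None], dtype=pd.ArrowDtype(pa.string()))
    #   s.str.get_dummies("|")
    #   # columns ["a", "b"]; rows roughly [1, 1], [0, 1], [0, 0]
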
    def _str_index(self, sub: str, start: int = 0, end: int | None = None):
        predicate = lambda val: val.index(sub, start, end)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
        predicate = lambda val: val.rindex(sub, start, end)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_normalize(self, form: str):
        predicate = lambda val: unicodedata.normalize(form, val)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rfind(self, sub: str, start: int = 0, end=None):
        predicate = lambda val: val.rfind(sub, start, end)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_split(
        self,
        pat: str | None = None,
        n: int | None = -1,
        expand: bool = False,
        regex: bool | None = None,
    ):
        if n in {-1, 0}:
            n = None
        if regex:
            split_func = pc.split_pattern_regex
        else:
            split_func = pc.split_pattern
        return type(self)(split_func(self._data, pat, max_splits=n))

    def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
        if n in {-1, 0}:
            n = None
        return type(self)(pc.split_pattern(self._data, pat, max_splits=n, reverse=True))

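    # Example (editorial addition): pandas uses n in {-1, 0} for "no limit",
    # which maps to max_splits=None in pyarrow; a bounded split looks like:
    #
    #   pc.split_pattern(pa.array(["a,b,c"]), ",", max_splits=1)
    #   # -> [["a", "b,c"]]
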
    def _str_translate(self, table: dict[int, str]):
        predicate = lambda val: val.translate(table)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_wrap(self, width: int, **kwargs):
        kwargs["width"] = width
        tw = textwrap.TextWrapper(**kwargs)
        predicate = lambda val: "\n".join(tw.wrap(val))
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    @property
    def _dt_year(self):
        return type(self)(pc.year(self._data))

    @property
    def _dt_day(self):
        return type(self)(pc.day(self._data))

    @property
    def _dt_day_of_week(self):
        return type(self)(pc.day_of_week(self._data))

    _dt_dayofweek = _dt_day_of_week
    _dt_weekday = _dt_day_of_week

    @property
    def _dt_day_of_year(self):
        return type(self)(pc.day_of_year(self._data))

    _dt_dayofyear = _dt_day_of_year

    @property
    def _dt_hour(self):
        return type(self)(pc.hour(self._data))

    def _dt_isocalendar(self):
        return type(self)(pc.iso_calendar(self._data))

    @property
    def _dt_is_leap_year(self):
        return type(self)(pc.is_leap_year(self._data))

    @property
    def _dt_microsecond(self):
        return type(self)(pc.microsecond(self._data))

    @property
    def _dt_minute(self):
        return type(self)(pc.minute(self._data))

    @property
    def _dt_month(self):
        return type(self)(pc.month(self._data))

    @property
    def _dt_nanosecond(self):
        return type(self)(pc.nanosecond(self._data))

    @property
    def _dt_quarter(self):
        return type(self)(pc.quarter(self._data))

    @property
    def _dt_second(self):
        return type(self)(pc.second(self._data))

    @property
    def _dt_date(self):
        return type(self)(self._data.cast(pa.date32()))

    @property
    def _dt_time(self):
        unit = (
            self.dtype.pyarrow_dtype.unit
            if self.dtype.pyarrow_dtype.unit in {"us", "ns"}
            else "ns"
        )
        return type(self)(self._data.cast(pa.time64(unit)))

    @property
    def _dt_tz(self):
        return self.dtype.pyarrow_dtype.tz

    def _dt_strftime(self, format: str):
        return type(self)(pc.strftime(self._data, format=format))

    def _round_temporally(
        self,
        method: Literal["ceil", "floor", "round"],
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        if ambiguous != "raise":
            raise NotImplementedError("ambiguous is not supported.")
        if nonexistent != "raise":
            raise NotImplementedError("nonexistent is not supported.")
        offset = to_offset(freq)
        if offset is None:
            raise ValueError(f"Must specify a valid frequency: {freq}")
        pa_supported_unit = {
            "A": "year",
            "AS": "year",
            "Q": "quarter",
            "QS": "quarter",
            "M": "month",
            "MS": "month",
            "W": "week",
            "D": "day",
            "H": "hour",
            "T": "minute",
            "S": "second",
            "L": "millisecond",
            "U": "microsecond",
            "N": "nanosecond",
        }
        unit = pa_supported_unit.get(offset._prefix, None)
        if unit is None:
            raise ValueError(f"{freq=} is not supported")
        multiple = offset.n
        rounding_method = getattr(pc, f"{method}_temporal")
        return type(self)(rounding_method(self._data, multiple=multiple, unit=unit))

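    # Example (editorial addition): flooring to 15-minute bins resolves
    # to_offset("15T") to prefix "T" and n=15, so the call above becomes
    # pc.floor_temporal(self._data, multiple=15, unit="minute").
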
    def _dt_ceil(
        self,
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        return self._round_temporally("ceil", freq, ambiguous, nonexistent)

    def _dt_floor(
        self,
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        return self._round_temporally("floor", freq, ambiguous, nonexistent)

    def _dt_round(
        self,
        freq,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        return self._round_temporally("round", freq, ambiguous, nonexistent)

    def _dt_to_pydatetime(self):
        if pa.types.is_date(self.dtype.pyarrow_dtype):
            raise ValueError(
                f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
                "Convert to pyarrow timestamp type."
            )
        data = self._data.to_pylist()
        if self._dtype.pyarrow_dtype.unit == "ns":
            data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
        return np.array(data, dtype=object)

    def _dt_tz_localize(
        self,
        tz,
        ambiguous: TimeAmbiguous = "raise",
        nonexistent: TimeNonexistent = "raise",
    ):
        if ambiguous != "raise":
            raise NotImplementedError(f"{ambiguous=} is not supported")
        nonexistent_pa = {
            "raise": "raise",
            "shift_backward": "earliest",
            "shift_forward": "latest",
        }.get(
            nonexistent, None  # type: ignore[arg-type]
        )
        if nonexistent_pa is None:
            raise NotImplementedError(f"{nonexistent=} is not supported")
        if tz is None:
            result = self._data.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
        else:
            result = pc.assume_timezone(
                self._data, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
            )
        return type(self)(result)
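
    # Example (editorial addition), a hedged sketch assuming a tz-naive
    # timestamp ArrowDtype Series: tz_localize("UTC") routes through
    # pc.assume_timezone(self._data, "UTC", ...), while the tz=None branch
    # simply casts back to a tz-naive pa.timestamp of the same unit:
    #
    #   s = pd.Series(pd.to_datetime(["2023-01-01"])).astype("timestamp[us][pyarrow]")
    #   s.dt.tz_localize("UTC")  # -> timestamp[us, tz=UTC][pyarrow]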