12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715 |
- import warnings
- from pandas.util._exceptions import find_stack_level
- cimport cython
- from datetime import timezone
- from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- datetime,
- import_datetime,
- timedelta,
- tzinfo,
- )
- from cpython.object cimport PyObject
- # import datetime C API
- import_datetime()
- cimport numpy as cnp
- from numpy cimport (
- int64_t,
- ndarray,
- )
- import numpy as np
- cnp.import_array()
- from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- check_dts_bounds,
- npy_datetimestruct,
- npy_datetimestruct_to_datetime,
- pandas_datetime_to_datetimestruct,
- pydate_to_dt64,
- string_to_dts,
- )
- from pandas._libs.tslibs.strptime cimport parse_today_now
- from pandas._libs.util cimport (
- is_datetime64_object,
- is_float_object,
- is_integer_object,
- )
- from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
- from pandas._libs.tslibs.conversion cimport (
- _TSObject,
- cast_from_unit,
- convert_str_to_tsobject,
- convert_timezone,
- get_datetime64_nanos,
- parse_pydatetime,
- precision_from_unit,
- )
- from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
- c_nat_strings as nat_strings,
- )
- from pandas._libs.tslibs.timestamps cimport _Timestamp
- from pandas._libs.tslibs import (
- Resolution,
- get_resolution,
- )
- from pandas._libs.tslibs.timestamps import Timestamp
- # Note: this is the only non-tslibs intra-pandas dependency here
- from pandas._libs.missing cimport checknull_with_nat_and_na
- from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
def _test_parse_iso8601(ts: str):
    """
    TESTING ONLY: build a Timestamp from ``ts`` via the ISO 8601 parser.

    Production construction goes through `convert_str_to_tsobject`; this
    entry point exists so tests can exercise `string_to_dts` directly.
    """
    cdef:
        _TSObject parsed
        int has_offset = 0, offset_minutes = 0
        NPY_DATETIMEUNIT best_unit

    parsed = _TSObject()

    string_to_dts(ts, &parsed.dts, &best_unit, &has_offset, &offset_minutes, True)
    parsed.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &parsed.dts)
    check_dts_bounds(&parsed.dts)

    if has_offset != 1:
        # No UTC offset was parsed from the string: return a naive Timestamp.
        return Timestamp(parsed.value)

    # The string carried an offset (in minutes): attach it as a fixed-offset
    # timezone and convert the stored value to UTC.
    parsed.tzinfo = timezone(timedelta(minutes=offset_minutes))
    parsed.value = tz_localize_to_utc_single(parsed.value, parsed.tzinfo)
    return Timestamp(parsed.value, tz=parsed.tzinfo)
@cython.wraparound(False)
@cython.boundscheck(False)
def format_array_from_datetime(
    ndarray values,
    tzinfo tz=None,
    str format=None,
    na_rep: str | float = "NaT",
    NPY_DATETIMEUNIT reso=NPY_FR_ns,
) -> np.ndarray:
    """
    Return an object ndarray holding the string-formatted ``values``.

    When ``tz`` is None and ``format`` is None or one of the three default-like
    patterns ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d"), strings
    are built directly from the datetime struct fields. Otherwise each
    element is turned into a Timestamp and formatted with ``str`` or
    ``strftime``.

    Parameters
    ----------
    values : ndarray[int64_t], arbitrary ndim
    tz : tzinfo or None, default None
    format : str or None, default None
        a strftime capable string
    na_rep : str or float, default "NaT"
        The object stored in the result wherever ``values`` holds NaT.
    reso : NPY_DATETIMEUNIT, default NPY_FR_ns

    Returns
    -------
    np.ndarray[object]
        Same shape as ``values``.
    """
    cdef:
        int64_t val, ns, N = values.size
        bint show_ms = False, show_us = False, show_ns = False
        bint basic_format = False, basic_format_day = False
        _Timestamp ts
        object res
        npy_datetimestruct dts

        # Note that `result` (and thus `res_flat`) is C-order and
        #  `it` iterates C-order as well, so the iteration matches
        #  See discussion at
        #  github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
        ndarray result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
        object[::1] res_flat = result.ravel()     # should NOT be a copy
        cnp.flatiter it = cnp.PyArray_IterNew(values)

    if tz is None:
        # if we don't have a format nor tz, then choose
        # a format based on precision
        basic_format = format is None
        if basic_format:
            reso_obj = get_resolution(values, tz=tz, reso=reso)
            show_ns = reso_obj == Resolution.RESO_NS
            show_us = reso_obj == Resolution.RESO_US
            show_ms = reso_obj == Resolution.RESO_MS

        elif format == "%Y-%m-%d %H:%M:%S":
            # Same format as default, but with hardcoded precision (s)
            basic_format = True
            show_ns = show_us = show_ms = False

        elif format == "%Y-%m-%d %H:%M:%S.%f":
            # Same format as default, but with hardcoded precision (us)
            basic_format = show_us = True
            show_ns = show_ms = False

        elif format == "%Y-%m-%d":
            # Default format for dates
            basic_format_day = True

    # The two fast paths are mutually exclusive by construction above.
    assert not (basic_format_day and basic_format)

    for i in range(N):
        # Analogous to: utc_val = values[i]
        val = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0]

        if val == NPY_NAT:
            res = na_rep
        elif basic_format_day:
            # Fast path: date-only string built from the struct fields.
            pandas_datetime_to_datetimestruct(val, reso, &dts)
            res = f"{dts.year}-{dts.month:02d}-{dts.day:02d}"

        elif basic_format:
            # Fast path: "date time" string, optionally followed by a
            # fractional-second suffix chosen by the show_* flags.
            pandas_datetime_to_datetimestruct(val, reso, &dts)
            res = (f"{dts.year}-{dts.month:02d}-{dts.day:02d} "
                   f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}")

            if show_ns:
                ns = dts.ps // 1000
                res += f".{ns + dts.us * 1000:09d}"
            elif show_us:
                res += f".{dts.us:06d}"
            elif show_ms:
                res += f".{dts.us // 1000:03d}"

        else:
            # General path: go through a Timestamp object.
            ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz)
            if format is None:
                # Use datetime.str, that returns ts.isoformat(sep=' ')
                res = str(ts)
            else:

                # invalid format string
                # requires dates > 1900
                try:
                    # Note: dispatches to pydatetime
                    res = ts.strftime(format)
                except ValueError:
                    # Use datetime.str, that returns ts.isoformat(sep=' ')
                    res = str(ts)

        # Note: we can index result directly instead of using PyArray_MultiIter_DATA
        #  like we do for the other functions because result is known C-contiguous
        #  and is the first argument to PyArray_MultiIterNew2.  The usual pattern
        #  does not seem to work with object dtype.
        #  See discussion at
        #  github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
        # NOTE(review): this function iterates with a flatiter, not a
        #  MultiIter, so the PyArray_MultiIterNew2 remark above looks stale.
        res_flat[i] = res

        cnp.PyArray_ITER_NEXT(it)

    return result
def array_with_unit_to_datetime(
    ndarray[object] values,
    str unit,
    str errors="coerce"
):
    """
    Convert the ndarray to datetime according to the time unit.

    This function converts an array of objects into a numpy array of
    datetime64[ns]. It returns the converted array
    and also returns the timezone offset

    if errors:
      - raise: return converted values or raise OutOfBoundsDatetime
          if out of range on the conversion or
          ValueError for other conversions (e.g. a string)
      - ignore: return non-convertible values as the same unit
      - coerce: NaT for non-convertibles

    Parameters
    ----------
    values : ndarray
        Date-like objects to convert.
    unit : str
        Time unit to use during conversion.
    errors : str, default 'coerce'
        Error behavior when parsing. Must be one of "raise", "ignore"
        or "coerce".

    Returns
    -------
    result : ndarray of m8 values
        With errors="ignore" and at least one non-convertible element, an
        object-dtype ndarray produced by the object fallback instead.
    tz : parsed timezone offset or None
        Only ever non-None when ``unit == "ns"`` and the work is delegated
        to `array_to_datetime`.

    Raises
    ------
    ValueError, OutOfBoundsDatetime, TypeError
        With errors="raise"; the message is suffixed with the position of
        the offending element.
    """
    cdef:
        Py_ssize_t i, n=len(values)
        int64_t mult
        bint is_ignore = errors == "ignore"
        bint is_coerce = errors == "coerce"
        bint is_raise = errors == "raise"
        ndarray[int64_t] iresult
        tzinfo tz = None
        # Must be a C double: a C ``float`` is single precision and would
        #  round numeric strings (e.g. epoch seconds) before conversion.
        double fval

    assert is_ignore or is_coerce or is_raise

    if unit == "ns":
        # Nanosecond input needs no unit scaling; use the general parser.
        result, tz = array_to_datetime(
            values.astype(object, copy=False),
            errors=errors,
        )
        return result, tz

    # NOTE(review): ``mult`` is not used below; the call is kept as in the
    #  original, presumably so an unsupported unit fails early -- confirm.
    mult, _ = precision_from_unit(unit)

    result = np.empty(n, dtype="M8[ns]")
    iresult = result.view("i8")

    for i in range(n):
        val = values[i]

        try:
            if checknull_with_nat_and_na(val):
                iresult[i] = NPY_NAT

            elif is_integer_object(val) or is_float_object(val):

                # NaN (val != val) and the integer NaT sentinel both map to NaT
                if val != val or val == NPY_NAT:
                    iresult[i] = NPY_NAT
                else:
                    iresult[i] = cast_from_unit(val, unit)

            elif isinstance(val, str):
                if len(val) == 0 or val in nat_strings:
                    iresult[i] = NPY_NAT

                else:

                    try:
                        fval = float(val)
                    except ValueError:
                        raise ValueError(
                            f"non convertible value {val} with the unit '{unit}'"
                        )
                    warnings.warn(
                        "The behavior of 'to_datetime' with 'unit' when parsing "
                        "strings is deprecated. In a future version, strings will "
                        "be parsed as datetime strings, matching the behavior "
                        "without a 'unit'. To retain the old behavior, explicitly "
                        "cast ints or floats to numeric type before calling "
                        "to_datetime.",
                        FutureWarning,
                        stacklevel=find_stack_level(),
                    )

                    iresult[i] = cast_from_unit(fval, unit)

            else:
                # TODO: makes more sense as TypeError, but that would be an
                #  API change.
                raise ValueError(
                    f"unit='{unit}' not valid with non-numerical val='{val}'"
                )

        except (ValueError, OutOfBoundsDatetime, TypeError) as err:
            if is_raise:
                err.args = (f"{err}, at position {i}",)
                raise
            elif is_ignore:
                # we have hit an exception
                # and are in ignore mode
                # redo as object
                return _array_with_unit_to_datetime_object_fallback(values, unit)
            else:
                # is_coerce
                iresult[i] = NPY_NAT

    return result, tz
cdef _array_with_unit_to_datetime_object_fallback(ndarray[object] values, str unit):
    """
    Object-dtype fallback used by `array_with_unit_to_datetime` in
    errors="ignore" mode.

    Nulls, NaN/NaT-valued numbers and empty or NaT-like strings become NaT.
    Other numbers become ``Timestamp(val, unit=unit)``, or are kept as-is
    when that raises OutOfBoundsDatetime. Other strings are kept as-is.
    Elements of any other type are not assigned.

    Returns
    -------
    np.ndarray[object]
    None
    """
    cdef:
        Py_ssize_t idx, count = len(values)
        ndarray[object] out
        tzinfo tz = None

    # TODO: fix subtle differences between this and no-unit code
    out = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)

    for idx in range(count):
        item = values[idx]

        if checknull_with_nat_and_na(item):
            out[idx] = <object>NaT
            continue

        if is_integer_object(item) or is_float_object(item):
            if item != item or item == NPY_NAT:
                out[idx] = <object>NaT
                continue
            try:
                out[idx] = Timestamp(item, unit=unit)
            except OutOfBoundsDatetime:
                # Out-of-range numbers pass through unchanged.
                out[idx] = item
            continue

        if isinstance(item, str):
            if len(item) == 0 or item in nat_strings:
                out[idx] = <object>NaT
            else:
                out[idx] = item

    return out, tz
@cython.wraparound(False)
@cython.boundscheck(False)
def first_non_null(values: ndarray) -> int:
    """
    Return the index of the first element that is neither null nor a
    placeholder string; return -1 when every element is skipped.

    Placeholder strings are the empty string, any member of ``nat_strings``,
    and the literals "now" and "today".
    """
    cdef:
        Py_ssize_t pos
        Py_ssize_t count = len(values)

    for pos in range(count):
        item = values[pos]
        if checknull_with_nat_and_na(item):
            continue
        is_placeholder = isinstance(item, str) and (
            len(item) == 0 or item in nat_strings or item in ("now", "today")
        )
        if is_placeholder:
            continue
        return pos

    return -1
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef array_to_datetime(
    ndarray values,  # object dtype, arbitrary ndim
    str errors="raise",
    bint dayfirst=False,
    bint yearfirst=False,
    bint utc=False,
):
    """
    Converts a 1D array of date-like values to a numpy array of either:
        1) datetime64[ns] data
        2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError
           is encountered

    Also returns a fixed-offset tzinfo object if an array of strings with the same
    timezone offset is passed and utc=True is not passed. Otherwise, None
    is returned

    Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric,
    strings

    Parameters
    ----------
    values : ndarray of object
         date-like objects to convert
    errors : str, default 'raise'
         error behavior when parsing; one of "raise", "ignore", "coerce".
         With "coerce" a failing element becomes NaT; with "ignore" the
         input ``values`` is returned unchanged together with None.
    dayfirst : bool, default False
         dayfirst parsing behavior when encountering datetime strings
    yearfirst : bool, default False
         yearfirst parsing behavior when encountering datetime strings
    utc : bool, default False
         indicator whether the dates should be UTC

    Returns
    -------
    np.ndarray
        May be datetime64[ns] or object dtype
    tzinfo or None

    Raises
    ------
    TypeError, OverflowError, ValueError
        With errors="raise"; the message is suffixed with the position of
        the offending element.
    """
    cdef:
        Py_ssize_t i, n = values.size
        object val, tz
        ndarray[int64_t] iresult
        npy_datetimestruct dts
        bint utc_convert = bool(utc)
        bint seen_datetime_offset = False
        bint is_raise = errors == "raise"
        bint is_ignore = errors == "ignore"
        bint is_coerce = errors == "coerce"
        bint is_same_offsets
        _TSObject _ts
        # NOTE(review): a C ``float`` is single precision; offsets come from
        #  ``total_seconds()`` below -- confirm this width is intended.
        float tz_offset
        set out_tzoffset_vals = set()
        tzinfo tz_out = None
        bint found_tz = False, found_naive = False
        cnp.broadcast mi

    # specify error conditions
    assert is_raise or is_ignore or is_coerce

    result = np.empty((<object>values).shape, dtype="M8[ns]")
    mi = cnp.PyArray_MultiIterNew2(result, values)
    # Flat int64 view used to write results by running index
    iresult = result.view("i8").ravel()

    for i in range(n):
        # Analogous to `val = values[i]`
        val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        try:
            if checknull_with_nat_and_na(val):
                iresult[i] = NPY_NAT

            elif PyDateTime_Check(val):
                # Track whether aware and naive datetimes are both present
                if val.tzinfo is not None:
                    found_tz = True
                else:
                    found_naive = True
                tz_out = convert_timezone(
                    val.tzinfo,
                    tz_out,
                    found_naive,
                    found_tz,
                    utc_convert,
                )
                iresult[i] = parse_pydatetime(val, &dts, utc_convert)

            elif PyDate_Check(val):
                iresult[i] = pydate_to_dt64(val, &dts)
                check_dts_bounds(&dts)

            elif is_datetime64_object(val):
                iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)

            elif is_integer_object(val) or is_float_object(val):
                # these must be ns unit by-definition

                if val != val or val == NPY_NAT:
                    iresult[i] = NPY_NAT
                else:
                    # we now need to parse this as if unit='ns'
                    iresult[i] = cast_from_unit(val, "ns")

            elif isinstance(val, str):
                # string
                if type(val) is not str:
                    # GH#32264 np.str_ object
                    val = str(val)

                if parse_today_now(val, &iresult[i], utc):
                    # We can't _quite_ dispatch this to convert_str_to_tsobject
                    #  bc there isn't a nice way to pass "utc"
                    # The iterator must be advanced by hand before `continue`
                    cnp.PyArray_MultiIter_NEXT(mi)
                    continue

                _ts = convert_str_to_tsobject(
                    val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
                )
                _ts.ensure_reso(NPY_FR_ns, val)

                iresult[i] = _ts.value

                tz = _ts.tzinfo
                if tz is not None:
                    # dateutil timezone objects cannot be hashed, so
                    # store the UTC offsets in seconds instead
                    nsecs = tz.utcoffset(None).total_seconds()
                    out_tzoffset_vals.add(nsecs)
                    # need to set seen_datetime_offset *after* the
                    #  potentially-raising timezone(timedelta(...)) call,
                    #  otherwise we can go down the is_same_offsets path
                    #  bc len(out_tzoffset_vals) == 0
                    seen_datetime_offset = True
                else:
                    # Add a marker for naive string, to track if we are
                    # parsing mixed naive and aware strings
                    out_tzoffset_vals.add("naive")

            else:
                raise TypeError(f"{type(val)} is not convertible to datetime")

            cnp.PyArray_MultiIter_NEXT(mi)

        except (TypeError, OverflowError, ValueError) as ex:
            ex.args = (f"{ex}, at position {i}",)
            if is_coerce:
                iresult[i] = NPY_NAT
                cnp.PyArray_MultiIter_NEXT(mi)
                continue
            elif is_raise:
                raise
            # errors="ignore": hand the input back untouched
            return values, None

    if seen_datetime_offset and not utc_convert:
        # GH#17697
        # 1) If all the offsets are equal, return one offset for
        #    the parsed dates to (maybe) pass to DatetimeIndex
        # 2) If the offsets are different, then force the parsing down the
        #    object path where an array of datetimes
        #    (with individual dateutil.tzoffsets) are returned
        is_same_offsets = len(out_tzoffset_vals) == 1
        if not is_same_offsets:
            return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
        else:
            tz_offset = out_tzoffset_vals.pop()
            tz_out = timezone(timedelta(seconds=tz_offset))
    return result, tz_out
@cython.wraparound(False)
@cython.boundscheck(False)
cdef _array_to_datetime_object(
    ndarray[object] values,
    str errors,
    bint dayfirst=False,
    bint yearfirst=False,
):
    """
    Fall back function for array_to_datetime

    Attempts to parse datetime strings with dateutil to return an array
    of datetime objects

    Parameters
    ----------
    values : ndarray[object]
         date-like objects to convert
    errors : str
         error behavior when parsing; one of "raise", "ignore", "coerce"
    dayfirst : bool, default False
         dayfirst parsing behavior when encountering datetime strings
    yearfirst : bool, default False
         yearfirst parsing behavior when encountering datetime strings

    Returns
    -------
    np.ndarray[object]
        The parsed object array, or the input ``values`` unchanged when an
        element cannot be handled and ``errors`` is not "raise".
    Literal[None]

    Raises
    ------
    ValueError, OverflowError
        With errors="raise", when a string cannot be parsed.
    TypeError
        With errors="raise", when an element is not null, not a datetime
        and not a string.
    """
    cdef:
        Py_ssize_t i, n = values.size
        object val
        bint is_ignore = errors == "ignore"
        bint is_coerce = errors == "coerce"
        bint is_raise = errors == "raise"
        ndarray oresult_nd
        ndarray[object] oresult
        npy_datetimestruct dts
        cnp.broadcast mi
        _TSObject tsobj

    assert is_raise or is_ignore or is_coerce

    oresult_nd = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
    mi = cnp.PyArray_MultiIterNew2(oresult_nd, values)
    oresult = oresult_nd.ravel()

    # We return an object array and only attempt to parse:
    # 1) NaT or NaT-like values
    # 2) datetime strings, which we return as datetime.datetime
    # 3) special strings - "now" & "today"
    for i in range(n):
        # Analogous to: val = values[i]
        val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        if checknull_with_nat_and_na(val) or PyDateTime_Check(val):
            # GH 25978. No need to parse NaT-like or datetime-like vals
            oresult[i] = val
        elif isinstance(val, str):
            if type(val) is not str:
                # GH#32264 np.str_ objects
                val = str(val)

            if len(val) == 0 or val in nat_strings:
                oresult[i] = "NaT"
                cnp.PyArray_MultiIter_NEXT(mi)
                continue

            try:
                tsobj = convert_str_to_tsobject(
                    val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
                )
                tsobj.ensure_reso(NPY_FR_ns, val)

                dts = tsobj.dts
                oresult[i] = datetime(
                    dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us,
                    tzinfo=tsobj.tzinfo,
                    fold=tsobj.fold,
                )

            except (ValueError, OverflowError) as ex:
                ex.args = (f"{ex}, at position {i}", )
                if is_coerce:
                    oresult[i] = <object>NaT
                    cnp.PyArray_MultiIter_NEXT(mi)
                    continue
                if is_raise:
                    raise
                return values, None
        else:
            # Not null, not a datetime, not a string: nothing here can
            #  convert it. There is no active exception at this point, so a
            #  bare ``raise`` would only produce "RuntimeError: No active
            #  exception to reraise"; raise an explicit, descriptive error.
            if is_raise:
                raise TypeError(
                    f"{type(val)} is not convertible to datetime, at position {i}"
                )
            return values, None

        cnp.PyArray_MultiIter_NEXT(mi)

    return oresult_nd, None
def array_to_datetime_with_tz(ndarray values, tzinfo tz):
    """
    Element-wise counterpart of ``pd.Timestamp(value, tz=tz)``.

    ``values`` is object-dtype with any number of dimensions; the result is
    an int64 ndarray of the same shape.

    How this differs from array_to_datetime with utc=True:
    - np.datetime64 objects are treated as _wall_ times.
    - tznaive datetimes are treated as _wall_ times.
    """
    cdef:
        ndarray result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_INT64, 0)
        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, values)
        Py_ssize_t i, n = values.size
        object item
        int64_t ival
        datetime ts

    for i in range(n):
        # Analogous to `item = values[i]`
        item = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        # Start from NaT and overwrite only for real timestamps.
        ival = NPY_NAT

        # The null check comes first: it catches pd.NA, which would raise
        #  in the Timestamp constructor.
        if not checknull_with_nat_and_na(item):
            ts = Timestamp(item)
            if ts is not NaT:
                if ts.tz is None:
                    # datetime64, tznaive pydatetime, int, float
                    ts = ts.tz_localize(tz)
                else:
                    ts = ts.tz_convert(tz)
                ts = ts.as_unit("ns")
                ival = ts._value

        # Analogous to: result[i] = ival
        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival

        cnp.PyArray_MultiIter_NEXT(mi)

    return result
|