123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629 |
- cimport cython
- from cpython.datetime cimport (
- PyDateTime_CheckExact,
- PyDateTime_DATE_GET_HOUR,
- PyDateTime_DATE_GET_MICROSECOND,
- PyDateTime_DATE_GET_MINUTE,
- PyDateTime_DATE_GET_SECOND,
- PyDateTime_GET_DAY,
- PyDateTime_GET_MONTH,
- PyDateTime_GET_YEAR,
- import_datetime,
- )
- from cpython.object cimport (
- Py_EQ,
- Py_GE,
- Py_GT,
- Py_LE,
- Py_LT,
- Py_NE,
- )
- import_datetime()
- import numpy as np
- cimport numpy as cnp
- cnp.import_array()
- from numpy cimport (
- int64_t,
- ndarray,
- uint8_t,
- )
- from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
- cdef extern from "src/datetime/np_datetime.h":
- int cmp_npy_datetimestruct(npy_datetimestruct *a,
- npy_datetimestruct *b)
- # AS, FS, PS versions exist but are not imported because they are not used.
- npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
- npy_datetimestruct _US_MIN_DTS, _US_MAX_DTS
- npy_datetimestruct _MS_MIN_DTS, _MS_MAX_DTS
- npy_datetimestruct _S_MIN_DTS, _S_MAX_DTS
- npy_datetimestruct _M_MIN_DTS, _M_MAX_DTS
- PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(cnp.PyArray_Descr *dtype)
- cdef extern from "src/datetime/np_datetime_strings.h":
- int parse_iso_8601_datetime(const char *str, int len, int want_exc,
- npy_datetimestruct *out,
- NPY_DATETIMEUNIT *out_bestunit,
- int *out_local, int *out_tzoffset,
- const char *format, int format_len,
- FormatRequirement exact)
- # ----------------------------------------------------------------------
- # numpy object inspection
cdef npy_datetime get_datetime64_value(object obj) nogil:
    """
    Return the raw integer payload stored in a numpy datetime64 scalar.

    The number is only meaningful together with the scalar's unit, which
    `get_datetime64_unit` retrieves.
    """
    cdef:
        PyDatetimeScalarObject* scalar = <PyDatetimeScalarObject*>obj

    return scalar.obval
cdef npy_timedelta get_timedelta64_value(object obj) nogil:
    """
    Return the raw integer payload stored in a numpy timedelta64 scalar.
    """
    cdef:
        PyTimedeltaScalarObject* scalar = <PyTimedeltaScalarObject*>obj

    return scalar.obval
cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil:
    """
    Return the unit recorded in the metadata of a numpy datetime64 scalar.
    """
    cdef:
        PyDatetimeScalarObject* scalar = <PyDatetimeScalarObject*>obj

    return <NPY_DATETIMEUNIT>scalar.obmeta.base
cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype):
    """
    Return the unit stored in the datetime metadata of ``dtype``.
    """
    # NB: the dtype is not validated here; the caller has to guarantee it is
    #  *some* datetime64 or timedelta64 dtype, otherwise we can segfault
    return get_datetime_metadata_from_dtype(<cnp.PyArray_Descr*>dtype).base
def py_get_unit_from_dtype(dtype):
    """
    Python-callable wrapper around the cdef `get_unit_from_dtype`.
    """
    # for testing get_unit_from_dtype; adds 896 bytes to the .so file.
    cdef:
        NPY_DATETIMEUNIT unit = get_unit_from_dtype(dtype)

    return unit
def is_unitless(dtype: cnp.dtype) -> bool:
    """
    Report whether a datetime64 or timedelta64 dtype carries no unit.

    Raises ValueError when ``dtype`` is neither datetime64 nor timedelta64.
    """
    if (
        dtype.type_num != cnp.NPY_DATETIME
        and dtype.type_num != cnp.NPY_TIMEDELTA
    ):
        raise ValueError("is_unitless dtype must be datetime64 or timedelta64")

    # The generic unit is what a dtype without an attached unit reports.
    return get_unit_from_dtype(dtype) == NPY_DATETIMEUNIT.NPY_FR_GENERIC
- # ----------------------------------------------------------------------
- # Comparison
cdef bint cmp_dtstructs(
    npy_datetimestruct* left, npy_datetimestruct* right, int op
):
    """
    Evaluate the rich-comparison ``op`` (one of the Py_* op codes) on two
    datetimestructs, based on the result of `cmp_npy_datetimestruct`.
    """
    cdef:
        int cmp_res = cmp_npy_datetimestruct(left, right)

    if op == Py_EQ:
        return cmp_res == 0
    elif op == Py_NE:
        return cmp_res != 0
    elif op == Py_GT:
        return cmp_res == 1
    elif op == Py_LT:
        return cmp_res == -1
    elif op == Py_GE:
        return cmp_res == 0 or cmp_res == 1

    # Anything else is treated as Py_LE
    return cmp_res == 0 or cmp_res == -1
cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1:
    """
    Rich comparison specialised to int64_t operands; a faster stand-in
    for PyObject_RichCompare.

    ``op`` is one of the Py_EQ/Py_NE/Py_LT/Py_LE/Py_GT/Py_GE op codes.
    """
    if op == Py_EQ:
        return lhs == rhs
    if op == Py_NE:
        return lhs != rhs
    if op == Py_LT:
        return lhs < rhs
    if op == Py_LE:
        return lhs <= rhs
    if op == Py_GT:
        return lhs > rhs
    if op == Py_GE:
        return lhs >= rhs
class OutOfBoundsDatetime(ValueError):
    """
    Raised when the datetime is outside the range that can be represented.
    """
class OutOfBoundsTimedelta(ValueError):
    """
    Raised when encountering a timedelta value that cannot be represented.

    Representation should be within a timedelta64[ns].
    """
    # Timedelta counterpart of OutOfBoundsDatetime
cdef get_implementation_bounds(
    NPY_DATETIMEUNIT reso,
    npy_datetimestruct *lower,
    npy_datetimestruct *upper,
):
    """
    Write the minimum and maximum datetimestructs for resolution ``reso``
    into ``lower`` and ``upper``.

    Only ns, us, ms, s and m are handled; any other resolution raises
    NotImplementedError.
    """
    if reso == NPY_FR_ns:
        lower[0] = _NS_MIN_DTS
        upper[0] = _NS_MAX_DTS
        return
    if reso == NPY_FR_us:
        lower[0] = _US_MIN_DTS
        upper[0] = _US_MAX_DTS
        return
    if reso == NPY_FR_ms:
        lower[0] = _MS_MIN_DTS
        upper[0] = _MS_MAX_DTS
        return
    if reso == NPY_FR_s:
        lower[0] = _S_MIN_DTS
        upper[0] = _S_MAX_DTS
        return
    if reso == NPY_FR_m:
        lower[0] = _M_MIN_DTS
        upper[0] = _M_MAX_DTS
        return

    raise NotImplementedError(reso)
cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns):
    """
    Raise OutOfBoundsDatetime if ``dts`` lies outside the bounds that
    `get_implementation_bounds` reports for ``unit`` (nanoseconds by default).
    """
    cdef:
        npy_datetimestruct cmp_upper, cmp_lower

    get_implementation_bounds(unit, &cmp_lower, &cmp_upper)

    if (
        cmp_npy_datetimestruct(dts, &cmp_lower) != -1
        and cmp_npy_datetimestruct(dts, &cmp_upper) != 1
    ):
        # Neither below the minimum nor above the maximum: nothing to report.
        return

    fmt = (f"{dts.year}-{dts.month:02d}-{dts.day:02d} "
           f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}")
    # TODO: "nanosecond" in the message assumes NPY_FR_ns
    raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {fmt}")
- # ----------------------------------------------------------------------
- # Conversion
- # just exposed for testing at the moment
def py_td64_to_tdstruct(int64_t td64, NPY_DATETIMEUNIT unit):
    """
    Break the integer ``td64`` (expressed in ``unit``) into timedelta fields.

    The filled struct is handed back to Python as a dict.
    """
    cdef:
        pandas_timedeltastruct fields

    pandas_timedelta_to_timedeltastruct(td64, unit, &fields)
    return fields  # <- struct is converted to a dict on the way out
cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts):
    """
    Copy the fields of the datetime ``dt`` into ``dts``.

    Microsecond is the finest field populated; ``ps`` and ``as`` are zeroed.
    """
    if not PyDateTime_CheckExact(dt):
        # Go through the attribute rather than PyDateTime_GET_YEAR: Timestamp
        # overrides year, so the C-level macro would give a wrong answer.
        dts.year = dt.year
    else:
        dts.year = PyDateTime_GET_YEAR(dt)

    dts.month = PyDateTime_GET_MONTH(dt)
    dts.day = PyDateTime_GET_DAY(dt)
    dts.hour = PyDateTime_DATE_GET_HOUR(dt)
    dts.min = PyDateTime_DATE_GET_MINUTE(dt)
    dts.sec = PyDateTime_DATE_GET_SECOND(dt)
    dts.us = PyDateTime_DATE_GET_MICROSECOND(dt)
    dts.as = 0
    dts.ps = 0
cdef int64_t pydatetime_to_dt64(datetime val,
                                npy_datetimestruct *dts,
                                NPY_DATETIMEUNIT reso=NPY_FR_ns):
    """
    Fill ``dts`` from ``val`` and return the matching integer at ``reso``.

    Note we are assuming that the datetime object is timezone-naive.
    """
    cdef:
        int64_t result

    pydatetime_to_dtstruct(val, dts)
    result = npy_datetimestruct_to_datetime(reso, dts)
    return result
cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts):
    """
    Copy year/month/day from the date ``val`` into ``dts`` and zero every
    time-of-day field.
    """
    dts.year = PyDateTime_GET_YEAR(val)
    dts.month = PyDateTime_GET_MONTH(val)
    dts.day = PyDateTime_GET_DAY(val)

    # A date has no time component.
    dts.hour = 0
    dts.min = 0
    dts.sec = 0
    dts.us = 0
    dts.ps = 0
    dts.as = 0
cdef int64_t pydate_to_dt64(
    date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=NPY_FR_ns
):
    """
    Fill ``dts`` from the date ``val`` and return the matching integer
    at ``reso``.
    """
    cdef:
        int64_t result

    pydate_to_dtstruct(val, dts)
    result = npy_datetimestruct_to_datetime(reso, dts)
    return result
cdef int string_to_dts(
    str val,
    npy_datetimestruct* dts,
    NPY_DATETIMEUNIT* out_bestunit,
    int* out_local,
    int* out_tzoffset,
    bint want_exc,
    format: str | None=None,
    bint exact=True,
) except? -1:
    """
    Parse ``val`` with `parse_iso_8601_datetime`, writing the outcome into
    ``dts``, ``out_bestunit``, ``out_local`` and ``out_tzoffset``.

    When ``format`` is None the parser is asked to infer the format;
    otherwise ``format`` is forwarded and ``exact`` selects the format
    requirement. The parser's integer return code is passed straight through.
    """
    cdef:
        Py_ssize_t length
        const char* buf
        Py_ssize_t format_length
        const char* format_buf
        FormatRequirement format_requirement

    buf = get_c_string_buf_and_size(val, &length)

    # Start from the "no format supplied" settings ...
    format_buf = b""
    format_length = 0
    format_requirement = INFER_FORMAT
    if format is not None:
        # ... and override them when the caller gave an explicit format.
        format_buf = get_c_string_buf_and_size(format, &format_length)
        format_requirement = <FormatRequirement>exact

    return parse_iso_8601_datetime(
        buf, length, want_exc,
        dts, out_bestunit, out_local, out_tzoffset,
        format_buf, format_length,
        format_requirement,
    )
cpdef ndarray astype_overflowsafe(
    ndarray values,
    cnp.dtype dtype,
    bint copy=True,
    bint round_ok=True,
    bint is_coerce=False,
):
    """
    Convert an ndarray with datetime64[X] to datetime64[Y]
    or timedelta64[X] to timedelta64[Y],
    raising on overflow.

    Parameters
    ----------
    values : ndarray
        datetime64 or timedelta64 array; must carry a unit.
    dtype : np.dtype
        Target dtype of the same kind as ``values.dtype``; must carry a unit.
    copy : bool, default True
        Only consulted when source and target units are equal: return a copy
        (True) or ``values`` itself (False).
    round_ok : bool, default True
        Only consulted when from_unit > to_unit (e.g. ns -> us). If False,
        the conversion goes through `astype_round_check`, which raises when
        a value would be truncated.
    is_coerce : bool, default False
        When going the other way (from_unit < to_unit), replace
        out-of-bounds entries with NaT instead of raising.

    Returns
    -------
    ndarray
        Array viewed as ``dtype``.

    Raises
    ------
    TypeError
        If the two dtypes are not both datetime64 or both timedelta64, or if
        ``values`` has no unit.
    ValueError
        If ``dtype`` has no unit.
    OutOfBoundsDatetime
        If a datetime64 value does not fit the target unit and ``is_coerce``
        is False.
    OutOfBoundsTimedelta
        Same condition for timedelta64 input.
    """
    if values.descr.type_num == dtype.type_num == cnp.NPY_DATETIME:
        # i.e. dtype.kind == "M"
        dtype_name = "datetime64"
    elif values.descr.type_num == dtype.type_num == cnp.NPY_TIMEDELTA:
        # i.e. dtype.kind == "m"
        dtype_name = "timedelta64"
    else:
        raise TypeError(
            "astype_overflowsafe values.dtype and dtype must be either "
            "both-datetime64 or both-timedelta64."
        )

    cdef:
        NPY_DATETIMEUNIT from_unit = get_unit_from_dtype(values.dtype)
        NPY_DATETIMEUNIT to_unit = get_unit_from_dtype(dtype)

    if from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
        raise TypeError(f"{dtype_name} values must have a unit specified")

    if to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
        # without raising explicitly here, we end up with a SystemError
        # built-in function [...] returned a result with an error
        raise ValueError(
            f"{dtype_name} dtype must have a unit specified"
        )

    if from_unit == to_unit:
        # Check this before allocating result for perf, might save some memory
        if copy:
            return values.copy()
        return values

    elif from_unit > to_unit:
        if round_ok:
            # e.g. ns -> us, so there is no risk of overflow, so we can use
            # numpy's astype safely. Note there _is_ risk of truncation.
            return values.astype(dtype)
        else:
            # Same direction, but raise instead of silently truncating.
            iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit)
            return iresult2.view(dtype)

    # From here on from_unit < to_unit, so each element needs a bounds check.
    if (<object>values).dtype.byteorder == ">":
        # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap
        values = values.astype(values.dtype.newbyteorder("<"))

    cdef:
        ndarray i8values = values.view("i8")

        # equiv: result = np.empty((<object>values).shape, dtype="i8")
        ndarray iresult = cnp.PyArray_EMPTY(
            values.ndim, values.shape, cnp.NPY_INT64, 0
        )

        # Iterator index 0 walks the output, index 1 walks the input.
        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)
        Py_ssize_t i, N = values.size
        int64_t value, new_value
        npy_datetimestruct dts
        bint is_td = dtype.type_num == cnp.NPY_TIMEDELTA

    for i in range(N):
        # Analogous to: item = values[i]
        value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        if value == NPY_DATETIME_NAT:
            # NaT passes through unchanged.
            new_value = NPY_DATETIME_NAT
        else:
            pandas_datetime_to_datetimestruct(value, from_unit, &dts)

            try:
                check_dts_bounds(&dts, to_unit)
            except OutOfBoundsDatetime as err:
                if is_coerce:
                    new_value = NPY_DATETIME_NAT
                elif is_td:
                    # Re-raise with the timedelta-specific exception type and
                    # a message naming the offending value.
                    from_abbrev = np.datetime_data(values.dtype)[0]
                    np_val = np.timedelta64(value, from_abbrev)
                    msg = (
                        "Cannot convert {np_val} to {dtype} without overflow"
                        .format(np_val=str(np_val), dtype=str(dtype))
                    )
                    raise OutOfBoundsTimedelta(msg) from err
                else:
                    raise
            else:
                new_value = npy_datetimestruct_to_datetime(to_unit, &dts)

        # Analogous to: iresult[i] = new_value
        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value

        cnp.PyArray_MultiIter_NEXT(mi)

    return iresult.view(dtype)
- # TODO: try to upstream this fix to numpy
def compare_mismatched_resolutions(ndarray left, ndarray right, op):
    """
    Overflow-safe comparison of timedelta64/datetime64 with mismatched resolutions.

    Each pair of values is expanded into datetimestructs and compared
    field-wise rather than being cast to a common unit.

    Parameters
    ----------
    left, right : ndarray
        Both timedelta64 or both datetime64.
    op : callable
        A comparison function from the ``operator`` module, translated to an
        op code by `op_to_op_code`.

    Returns
    -------
    ndarray[bool]
        Allocated with the shape of ``left``.

    Raises
    ------
    ValueError
        If the kinds of ``left`` and ``right`` differ or are not "m"/"M".

    >>> left = np.array([500], dtype="M8[Y]")
    >>> right = np.array([0], dtype="M8[ns]")
    >>> left < right  # <- wrong!
    array([ True])
    """

    if left.dtype.kind != right.dtype.kind or left.dtype.kind not in ["m", "M"]:
        raise ValueError("left and right must both be timedelta64 or both datetime64")

    cdef:
        int op_code = op_to_op_code(op)
        NPY_DATETIMEUNIT left_unit = get_unit_from_dtype(left.dtype)
        NPY_DATETIMEUNIT right_unit = get_unit_from_dtype(right.dtype)

        # equiv: result = np.empty((<object>left).shape, dtype="bool")
        ndarray result = cnp.PyArray_EMPTY(
            left.ndim, left.shape, cnp.NPY_BOOL, 0
        )

        ndarray lvalues = left.view("i8")
        ndarray rvalues = right.view("i8")

        # Iterator index 0 is the output, 1 is left, 2 is right.
        cnp.broadcast mi = cnp.PyArray_MultiIterNew3(result, lvalues, rvalues)
        int64_t lval, rval
        bint res_value

        # NOTE(review): the element count comes from left only -- confirm
        #  callers always pass a right operand of matching shape.
        Py_ssize_t i, N = left.size
        npy_datetimestruct ldts, rdts

    for i in range(N):
        # Analogous to: lval = lvalues[i]
        lval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        # Analogous to: rval = rvalues[i]
        rval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 2))[0]

        if lval == NPY_DATETIME_NAT or rval == NPY_DATETIME_NAT:
            # With NaT on either side only "!=" evaluates to True.
            res_value = op_code == Py_NE

        else:
            pandas_datetime_to_datetimestruct(lval, left_unit, &ldts)
            pandas_datetime_to_datetimestruct(rval, right_unit, &rdts)

            res_value = cmp_dtstructs(&ldts, &rdts, op_code)

        # Analogous to: result[i] = res_value
        (<uint8_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_value

        cnp.PyArray_MultiIter_NEXT(mi)

    return result
- import operator
cdef int op_to_op_code(op):
    """
    Translate a comparison function from the ``operator`` module into the
    matching Py_* rich-comparison op code.
    """
    # TODO: should exist somewhere?
    # NOTE(review): there is no branch for an unrecognised ``op``; confirm
    #  callers only ever pass one of the six functions handled below.
    if op is operator.eq:
        return Py_EQ
    elif op is operator.ne:
        return Py_NE
    elif op is operator.le:
        return Py_LE
    elif op is operator.lt:
        return Py_LT
    elif op is operator.ge:
        return Py_GE
    elif op is operator.gt:
        return Py_GT
cdef ndarray astype_round_check(
    ndarray i8values,
    NPY_DATETIMEUNIT from_unit,
    NPY_DATETIMEUNIT to_unit
):
    """
    Convert int64 values from ``from_unit`` to the coarser ``to_unit``,
    refusing any conversion that would drop information.

    Parameters
    ----------
    i8values : ndarray[int64]
        Integer view of the datetime64/timedelta64 data.
    from_unit, to_unit : NPY_DATETIMEUNIT
        Source and target units, with from_unit > to_unit (e.g. ns -> us).

    Returns
    -------
    ndarray[int64]
        Converted values; NaT entries are passed through unchanged.

    Raises
    ------
    ValueError
        If a value is not an exact multiple of the conversion factor,
        e.g. 1500ns -> us.
    """
    # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion
    # involves truncation, e.g. 1500ns->1us
    cdef:
        Py_ssize_t i, N = i8values.size

        # equiv: iresult = np.empty((<object>i8values).shape, dtype="i8")
        ndarray iresult = cnp.PyArray_EMPTY(
            i8values.ndim, i8values.shape, cnp.NPY_INT64, 0
        )
        # Iterator index 0 walks the output, index 1 walks the input.
        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)

        # Note the arguments to_unit, from unit are swapped vs how they
        # are passed when going to a higher-frequency reso.
        int64_t mult = get_conversion_factor(to_unit, from_unit)
        # new_value is declared here so the loop keeps it as a C integer
        # instead of boxing it into a Python object on every iteration.
        int64_t value, new_value, mod

    for i in range(N):
        # Analogous to: item = i8values[i]
        value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

        if value == NPY_DATETIME_NAT:
            new_value = NPY_DATETIME_NAT
        else:
            new_value, mod = divmod(value, mult)
            if mod != 0:
                # TODO: avoid runtime import
                from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev
                from_abbrev = npy_unit_to_abbrev(from_unit)
                to_abbrev = npy_unit_to_abbrev(to_unit)
                raise ValueError(
                    f"Cannot losslessly cast '{value} {from_abbrev}' to {to_abbrev}"
                )

        # Analogous to: iresult[i] = new_value
        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value

        cnp.PyArray_MultiIter_NEXT(mi)

    return iresult
@cython.overflowcheck(True)
cdef int64_t get_conversion_factor(
    NPY_DATETIMEUNIT from_unit,
    NPY_DATETIMEUNIT to_unit
) except? -1:
    """
    Find the factor by which we need to multiply to convert from from_unit to to_unit.

    The factor is built recursively, one unit step at a time
    (W -> D -> h -> m -> s -> ms -> us -> ns -> ps -> fs -> as), with
    Cython's overflow checking enabled on the multiplications.

    Raises
    ------
    ValueError
        If either unit is generic (unit-less), if from_unit > to_unit, or if
        from_unit has no fixed-size step defined here (e.g. month or year).
    """
    if (
        from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
        or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
    ):
        raise ValueError("unit-less resolutions are not supported")
    if from_unit > to_unit:
        raise ValueError("from_unit must not be greater than to_unit")

    if from_unit == to_unit:
        return 1

    if from_unit == NPY_DATETIMEUNIT.NPY_FR_W:
        return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D:
        return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h:
        return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m:
        return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
    elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
        return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
    else:
        # Months and years have no fixed length, so no constant factor
        # exists. Raise rather than fall off the end of the function and
        # hand the caller a meaningless value.
        raise ValueError("Converting from M or Y units is not supported.")
cdef int64_t convert_reso(
    int64_t value,
    NPY_DATETIMEUNIT from_reso,
    NPY_DATETIMEUNIT to_reso,
    bint round_ok,
) except? -1:
    """
    Re-express the integer ``value`` from ``from_reso`` in ``to_reso``.

    Going to a coarser resolution divides and rounds down; with
    ``round_ok=False`` a non-zero remainder raises ValueError instead.
    Going to a finer resolution multiplies with overflow checking enabled.
    """
    cdef:
        int64_t res_value, mult, div, mod

    if from_reso == to_reso:
        return value

    if to_reso < from_reso:
        # e.g. ns -> us, no risk of overflow, but can be lossy rounding
        mult = get_conversion_factor(to_reso, from_reso)
        div, mod = divmod(value, mult)
        if not round_ok and mod > 0:
            raise ValueError("Cannot losslessly convert units")

        # Note that when mod > 0, we follow np.timedelta64 in always
        # rounding down.
        return div

    if (
        from_reso == NPY_FR_Y
        or to_reso == NPY_FR_Y
        or from_reso == NPY_FR_M
        or to_reso == NPY_FR_M
    ):
        # Converting by multiplying isn't _quite_ right bc the number of
        # seconds in a month/year isn't fixed.
        return _convert_reso_with_dtstruct(value, from_reso, to_reso)

    # e.g. us -> ns, risk of overflow, but no risk of lossy rounding
    mult = get_conversion_factor(from_reso, to_reso)
    with cython.overflowcheck(True):
        # Note: caller is responsible for re-raising as OutOfBoundsTimedelta
        res_value = value * mult

    return res_value
cdef int64_t _convert_reso_with_dtstruct(
    int64_t value,
    NPY_DATETIMEUNIT from_unit,
    NPY_DATETIMEUNIT to_unit,
) except? -1:
    """
    Convert ``value`` between units by expanding it into a datetimestruct,
    bounds-checking that struct against ``to_unit`` and packing it again.
    """
    cdef:
        npy_datetimestruct dts
        int64_t result

    pandas_datetime_to_datetimestruct(value, from_unit, &dts)
    check_dts_bounds(&dts, to_unit)

    result = npy_datetimestruct_to_datetime(to_unit, &dts)
    return result
|