parsing.pyx 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189
  1. """
  2. Parsing functions for datetime and datetime-like strings.
  3. """
  4. import re
  5. import time
  6. import warnings
  7. from pandas.util._exceptions import find_stack_level
  8. cimport cython
  9. from cpython.datetime cimport (
  10. datetime,
  11. datetime_new,
  12. import_datetime,
  13. timedelta,
  14. tzinfo,
  15. )
  16. from datetime import timezone
  17. from cpython.object cimport PyObject_Str
  18. from cython cimport Py_ssize_t
  19. from libc.string cimport strchr
  20. import_datetime()
  21. import numpy as np
  22. cimport numpy as cnp
  23. from numpy cimport (
  24. PyArray_GETITEM,
  25. PyArray_ITER_DATA,
  26. PyArray_ITER_NEXT,
  27. PyArray_IterNew,
  28. flatiter,
  29. float64_t,
  30. )
  31. cnp.import_array()
  32. # dateutil compat
  33. from decimal import InvalidOperation
  34. from dateutil.parser import (
  35. DEFAULTPARSER,
  36. parse as du_parse,
  37. )
  38. from dateutil.relativedelta import relativedelta
  39. from dateutil.tz import (
  40. tzlocal as _dateutil_tzlocal,
  41. tzoffset,
  42. tzutc as _dateutil_tzutc,
  43. )
  44. from pandas._config import get_option
  45. from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
  46. from pandas._libs.tslibs.dtypes cimport (
  47. attrname_to_npy_unit,
  48. npy_unit_to_attrname,
  49. )
  50. from pandas._libs.tslibs.nattype cimport (
  51. c_NaT as NaT,
  52. c_nat_strings as nat_strings,
  53. )
  54. from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
  55. from pandas._libs.tslibs.np_datetime cimport (
  56. NPY_DATETIMEUNIT,
  57. npy_datetimestruct,
  58. string_to_dts,
  59. )
  60. from pandas._libs.tslibs.strptime import array_strptime
  61. from pandas._libs.tslibs.util cimport (
  62. get_c_string_buf_and_size,
  63. is_array,
  64. )
  65. cdef extern from "../src/headers/portable.h":
  66. int getdigit_ascii(char c, int default) nogil
  67. cdef extern from "../src/parser/tokenizer.h":
  68. double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
  69. int skip_trailing, int *error, int *maybe_int)
  70. # ----------------------------------------------------------------------
  71. # Constants
  72. class DateParseError(ValueError):
  73. pass
  74. _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
  75. second=0, microsecond=0)
  76. cdef:
  77. set _not_datelike_strings = {"a", "A", "m", "M", "p", "P", "t", "T"}
  78. # _timestamp_units -> units that we round to nanos
  79. set _timestamp_units = {
  80. NPY_DATETIMEUNIT.NPY_FR_ns,
  81. NPY_DATETIMEUNIT.NPY_FR_ps,
  82. NPY_DATETIMEUNIT.NPY_FR_fs,
  83. NPY_DATETIMEUNIT.NPY_FR_as,
  84. }
  85. # ----------------------------------------------------------------------
  86. cdef:
  87. const char* delimiters = " /-."
  88. int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12
  89. cdef bint _is_delimiter(const char ch):
  90. return strchr(delimiters, ch) != NULL
  91. cdef int _parse_1digit(const char* s):
  92. cdef int result = 0
  93. result += getdigit_ascii(s[0], -10) * 1
  94. return result
  95. cdef int _parse_2digit(const char* s):
  96. cdef int result = 0
  97. result += getdigit_ascii(s[0], -10) * 10
  98. result += getdigit_ascii(s[1], -100) * 1
  99. return result
  100. cdef int _parse_4digit(const char* s):
  101. cdef int result = 0
  102. result += getdigit_ascii(s[0], -10) * 1000
  103. result += getdigit_ascii(s[1], -100) * 100
  104. result += getdigit_ascii(s[2], -1000) * 10
  105. result += getdigit_ascii(s[3], -10000) * 1
  106. return result
  107. cdef datetime _parse_delimited_date(
  108. str date_string, bint dayfirst, NPY_DATETIMEUNIT* out_bestunit
  109. ):
  110. """
  111. Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY.
  112. At the beginning function tries to parse date in MM/DD/YYYY format, but
  113. if month > 12 - in DD/MM/YYYY (`dayfirst == False`).
  114. With `dayfirst == True` function makes an attempt to parse date in
  115. DD/MM/YYYY, if an attempt is wrong - in DD/MM/YYYY
  116. For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-.
  117. For MM/YYYY: delimiter can be a space or one of /-
  118. If `date_string` can't be converted to date, then function returns
  119. None, None
  120. Parameters
  121. ----------
  122. date_string : str
  123. dayfirst : bool
  124. out_bestunit : NPY_DATETIMEUNIT*
  125. For specifying identified resolution.
  126. Returns:
  127. --------
  128. datetime or None
  129. """
  130. cdef:
  131. const char* buf
  132. Py_ssize_t length
  133. int day = 1, month = 1, year
  134. bint can_swap = 0
  135. buf = get_c_string_buf_and_size(date_string, &length)
  136. if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
  137. # parsing MM?DD?YYYY and DD?MM?YYYY dates
  138. month = _parse_2digit(buf)
  139. day = _parse_2digit(buf + 3)
  140. year = _parse_4digit(buf + 6)
  141. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
  142. can_swap = 1
  143. elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
  144. # parsing M?DD?YYYY and D?MM?YYYY dates
  145. month = _parse_1digit(buf)
  146. day = _parse_2digit(buf + 2)
  147. year = _parse_4digit(buf + 5)
  148. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
  149. can_swap = 1
  150. elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
  151. # parsing MM?D?YYYY and DD?M?YYYY dates
  152. month = _parse_2digit(buf)
  153. day = _parse_1digit(buf + 3)
  154. year = _parse_4digit(buf + 5)
  155. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
  156. can_swap = 1
  157. elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
  158. # parsing M?D?YYYY and D?M?YYYY dates
  159. month = _parse_1digit(buf)
  160. day = _parse_1digit(buf + 2)
  161. year = _parse_4digit(buf + 4)
  162. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
  163. can_swap = 1
  164. elif length == 7 and _is_delimiter(buf[2]):
  165. # parsing MM?YYYY dates
  166. if buf[2] == b".":
  167. # we cannot reliably tell whether e.g. 10.2010 is a float
  168. # or a date, thus we refuse to parse it here
  169. return None
  170. month = _parse_2digit(buf)
  171. year = _parse_4digit(buf + 3)
  172. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
  173. else:
  174. return None
  175. if month < 0 or day < 0 or year < 1000:
  176. # some part is not an integer, so
  177. # date_string can't be converted to date, above format
  178. return None
  179. if 1 <= month <= MAX_DAYS_IN_MONTH and 1 <= day <= MAX_DAYS_IN_MONTH \
  180. and (month <= MAX_MONTH or day <= MAX_MONTH):
  181. if (month > MAX_MONTH or (day <= MAX_MONTH and dayfirst)) and can_swap:
  182. day, month = month, day
  183. # In Python <= 3.6.0 there is no range checking for invalid dates
  184. # in C api, thus we call faster C version for 3.6.1 or newer
  185. return datetime_new(year, month, day, 0, 0, 0, 0, None)
  186. raise DateParseError(f"Invalid date specified ({month}/{day})")
  187. cdef bint _does_string_look_like_time(str parse_string):
  188. """
  189. Checks whether given string is a time: it has to start either from
  190. H:MM or from HH:MM, and hour and minute values must be valid.
  191. Parameters
  192. ----------
  193. parse_string : str
  194. Returns:
  195. --------
  196. bool
  197. Whether given string is potentially a time.
  198. """
  199. cdef:
  200. const char* buf
  201. Py_ssize_t length
  202. int hour = -1, minute = -1
  203. buf = get_c_string_buf_and_size(parse_string, &length)
  204. if length >= 4:
  205. if buf[1] == b":":
  206. # h:MM format
  207. hour = getdigit_ascii(buf[0], -1)
  208. minute = _parse_2digit(buf + 2)
  209. elif buf[2] == b":":
  210. # HH:MM format
  211. hour = _parse_2digit(buf)
  212. minute = _parse_2digit(buf + 3)
  213. return 0 <= hour <= 23 and 0 <= minute <= 59
  214. def py_parse_datetime_string(
  215. str date_string, bint dayfirst=False, bint yearfirst=False
  216. ):
  217. # Python-accessible version for testing (we can't just make
  218. # parse_datetime_string cpdef bc it has a pointer argument)
  219. cdef:
  220. NPY_DATETIMEUNIT out_bestunit
  221. return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit)
  222. cdef datetime parse_datetime_string(
  223. # NB: This will break with np.str_ (GH#32264) even though
  224. # isinstance(npstrobj, str) evaluates to True, so caller must ensure
  225. # the argument is *exactly* 'str'
  226. str date_string,
  227. bint dayfirst,
  228. bint yearfirst,
  229. NPY_DATETIMEUNIT* out_bestunit
  230. ):
  231. """
  232. Parse datetime string, only returns datetime.
  233. Also cares special handling matching time patterns.
  234. Returns
  235. -------
  236. datetime
  237. Notes
  238. -----
  239. Does not handle "today" or "now", which caller is responsible for handling.
  240. """
  241. cdef:
  242. datetime dt
  243. bint is_quarter = 0
  244. if not _does_string_look_like_datetime(date_string):
  245. raise ValueError(f'Given date string "{date_string}" not likely a datetime')
  246. if _does_string_look_like_time(date_string):
  247. # use current datetime as default, not pass _DEFAULT_DATETIME
  248. dt = du_parse(date_string, dayfirst=dayfirst,
  249. yearfirst=yearfirst)
  250. return dt
  251. dt = _parse_delimited_date(date_string, dayfirst, out_bestunit)
  252. if dt is not None:
  253. return dt
  254. try:
  255. dt = _parse_dateabbr_string(
  256. date_string, _DEFAULT_DATETIME, None, out_bestunit, &is_quarter
  257. )
  258. return dt
  259. except DateParseError:
  260. raise
  261. except ValueError:
  262. pass
  263. dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
  264. dayfirst=dayfirst, yearfirst=yearfirst,
  265. ignoretz=False, out_bestunit=out_bestunit)
  266. return dt
  267. def parse_datetime_string_with_reso(
  268. str date_string, str freq=None, dayfirst=None, yearfirst=None
  269. ):
  270. # NB: This will break with np.str_ (GH#45580) even though
  271. # isinstance(npstrobj, str) evaluates to True, so caller must ensure
  272. # the argument is *exactly* 'str'
  273. """
  274. Try hard to parse datetime string, leveraging dateutil plus some extra
  275. goodies like quarter recognition.
  276. Parameters
  277. ----------
  278. date_string : str
  279. freq : str or None, default None
  280. Helps with interpreting time string if supplied
  281. Corresponds to `offset.rule_code`
  282. dayfirst : bool, default None
  283. If None uses default from print_config
  284. yearfirst : bool, default None
  285. If None uses default from print_config
  286. Returns
  287. -------
  288. datetime
  289. str
  290. Describing resolution of parsed string.
  291. Raises
  292. ------
  293. ValueError : preliminary check suggests string is not datetime
  294. DateParseError : error within dateutil
  295. """
  296. if dayfirst is None:
  297. dayfirst = get_option("display.date_dayfirst")
  298. if yearfirst is None:
  299. yearfirst = get_option("display.date_yearfirst")
  300. cdef:
  301. datetime parsed
  302. str reso
  303. bint string_to_dts_failed
  304. npy_datetimestruct dts
  305. NPY_DATETIMEUNIT out_bestunit
  306. int out_local = 0
  307. int out_tzoffset
  308. tzinfo tz
  309. bint is_quarter = 0
  310. if not _does_string_look_like_datetime(date_string):
  311. raise ValueError(f'Given date string "{date_string}" not likely a datetime')
  312. # Try iso8601 first, as it handles nanoseconds
  313. string_to_dts_failed = string_to_dts(
  314. date_string, &dts, &out_bestunit, &out_local,
  315. &out_tzoffset, False
  316. )
  317. if not string_to_dts_failed:
  318. # Match Timestamp and drop picoseconds, femtoseconds, attoseconds
  319. # The new resolution will just be nano
  320. # GH#50417
  321. if out_bestunit in _timestamp_units:
  322. out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns
  323. if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
  324. # TODO: avoid circular import
  325. from pandas import Timestamp
  326. parsed = Timestamp(date_string)
  327. else:
  328. if out_local:
  329. tz = timezone(timedelta(minutes=out_tzoffset))
  330. else:
  331. tz = None
  332. parsed = datetime_new(
  333. dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
  334. )
  335. reso = npy_unit_to_attrname[out_bestunit]
  336. return parsed, reso
  337. parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
  338. if parsed is not None:
  339. reso = npy_unit_to_attrname[out_bestunit]
  340. return parsed, reso
  341. try:
  342. parsed = _parse_dateabbr_string(
  343. date_string, _DEFAULT_DATETIME, freq, &out_bestunit, &is_quarter
  344. )
  345. except DateParseError:
  346. raise
  347. except ValueError:
  348. pass
  349. else:
  350. if is_quarter:
  351. reso = "quarter"
  352. else:
  353. reso = npy_unit_to_attrname[out_bestunit]
  354. return parsed, reso
  355. parsed = dateutil_parse(date_string, _DEFAULT_DATETIME,
  356. dayfirst=dayfirst, yearfirst=yearfirst,
  357. ignoretz=False, out_bestunit=&out_bestunit)
  358. reso = npy_unit_to_attrname[out_bestunit]
  359. return parsed, reso
  360. cpdef bint _does_string_look_like_datetime(str py_string):
  361. """
  362. Checks whether given string is a datetime: it has to start with '0' or
  363. be greater than 1000.
  364. Parameters
  365. ----------
  366. py_string: str
  367. Returns
  368. -------
  369. bool
  370. Whether given string is potentially a datetime.
  371. """
  372. cdef:
  373. const char *buf
  374. char *endptr = NULL
  375. Py_ssize_t length = -1
  376. double converted_date
  377. char first
  378. int error = 0
  379. buf = get_c_string_buf_and_size(py_string, &length)
  380. if length >= 1:
  381. first = buf[0]
  382. if first == b"0":
  383. # Strings starting with 0 are more consistent with a
  384. # date-like string than a number
  385. return True
  386. elif py_string in _not_datelike_strings:
  387. return False
  388. else:
  389. # xstrtod with such parameters copies behavior of python `float`
  390. # cast; for example, " 35.e-1 " is valid string for this cast so,
  391. # for correctly xstrtod call necessary to pass these params:
  392. # b'.' - a dot is used as separator, b'e' - an exponential form of
  393. # a float number can be used, b'\0' - not to use a thousand
  394. # separator, 1 - skip extra spaces before and after,
  395. converted_date = xstrtod(buf, &endptr,
  396. b".", b"e", b"\0", 1, &error, NULL)
  397. # if there were no errors and the whole line was parsed, then ...
  398. if error == 0 and endptr == buf + length:
  399. return converted_date >= 1000
  400. return True
  401. cdef datetime _parse_dateabbr_string(str date_string, datetime default,
  402. str freq, NPY_DATETIMEUNIT* out_bestunit,
  403. bint* is_quarter):
  404. # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
  405. cdef:
  406. datetime ret
  407. # year initialized to prevent compiler warnings
  408. int year = -1, quarter = -1, month
  409. Py_ssize_t date_len
  410. const char* buf
  411. if date_string in nat_strings:
  412. # default to nanos, could also reasonably do NPY_FR_GENERIC
  413. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_ns
  414. return NaT
  415. date_string = date_string.upper()
  416. date_len = len(date_string)
  417. if date_len == 4:
  418. # parse year only like 2000
  419. try:
  420. ret = default.replace(year=int(date_string))
  421. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_Y
  422. return ret
  423. except ValueError:
  424. pass
  425. if 4 <= date_len <= 7:
  426. buf = get_c_string_buf_and_size(date_string, &date_len)
  427. try:
  428. i = date_string.index("Q", 1, 6)
  429. if i == 1:
  430. quarter = _parse_1digit(buf) # i.e. int(date_string[0])
  431. if date_len == 4 or (date_len == 5
  432. and date_string[i + 1] == "-"):
  433. # r'(\d)Q-?(\d\d)')
  434. year = 2000 + int(date_string[-2:])
  435. elif date_len == 6 or (date_len == 7
  436. and date_string[i + 1] == "-"):
  437. # r'(\d)Q-?(\d\d\d\d)')
  438. year = int(date_string[-4:])
  439. else:
  440. raise ValueError
  441. elif i == 2 or i == 3:
  442. # r'(\d\d)-?Q(\d)'
  443. if date_len == 4 or (date_len == 5
  444. and date_string[i - 1] == "-"):
  445. # i.e. quarter = int(date_string[-1])
  446. quarter = _parse_1digit(buf + date_len - 1)
  447. year = 2000 + int(date_string[:2])
  448. else:
  449. raise ValueError
  450. elif i == 4 or i == 5:
  451. if date_len == 6 or (date_len == 7
  452. and date_string[i - 1] == "-"):
  453. # r'(\d\d\d\d)-?Q(\d)'
  454. # i.e. quarter = int(date_string[-1])
  455. quarter = _parse_1digit(buf + date_len - 1)
  456. year = int(date_string[:4])
  457. else:
  458. raise ValueError
  459. if not (1 <= quarter <= 4):
  460. raise DateParseError(f"Incorrect quarterly string is given, "
  461. f"quarter must be "
  462. f"between 1 and 4: {date_string}")
  463. try:
  464. # GH#1228
  465. year, month = quarter_to_myear(year, quarter, freq)
  466. except KeyError:
  467. raise DateParseError("Unable to retrieve month "
  468. "information from given "
  469. f"freq: {freq}")
  470. ret = default.replace(year=year, month=month)
  471. # Monthly is as close as we can get to a non-existent NPY_FR_Q
  472. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
  473. is_quarter[0] = 1
  474. return ret
  475. except DateParseError:
  476. raise
  477. except ValueError:
  478. # e.g. if "Q" is not in date_string and .index raised
  479. pass
  480. if date_len == 6 and freq == "M":
  481. year = int(date_string[:4])
  482. month = int(date_string[4:6])
  483. try:
  484. ret = default.replace(year=year, month=month)
  485. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
  486. return ret
  487. except ValueError as err:
  488. # We can infer that none of the patterns below will match
  489. raise ValueError(f"Unable to parse {date_string}") from err
  490. for pat in ["%Y-%m", "%b %Y", "%b-%Y"]:
  491. try:
  492. ret = datetime.strptime(date_string, pat)
  493. out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
  494. return ret
  495. except ValueError:
  496. pass
  497. raise ValueError(f"Unable to parse {date_string}")
  498. cpdef quarter_to_myear(int year, int quarter, str freq):
  499. """
  500. A quarterly frequency defines a "year" which may not coincide with
  501. the calendar-year. Find the calendar-year and calendar-month associated
  502. with the given year and quarter under the `freq`-derived calendar.
  503. Parameters
  504. ----------
  505. year : int
  506. quarter : int
  507. freq : str or None
  508. Returns
  509. -------
  510. year : int
  511. month : int
  512. See Also
  513. --------
  514. Period.qyear
  515. """
  516. if quarter <= 0 or quarter > 4:
  517. raise ValueError("Quarter must be 1 <= q <= 4")
  518. if freq is not None:
  519. mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1
  520. month = (mnum + (quarter - 1) * 3) % 12 + 1
  521. if month > mnum:
  522. year -= 1
  523. else:
  524. month = (quarter - 1) * 3 + 1
  525. return year, month
  526. cdef datetime dateutil_parse(
  527. str timestr,
  528. datetime default,
  529. bint ignoretz,
  530. bint dayfirst,
  531. bint yearfirst,
  532. NPY_DATETIMEUNIT* out_bestunit
  533. ):
  534. """ lifted from dateutil to get resolution"""
  535. cdef:
  536. str attr
  537. datetime ret
  538. object res
  539. str reso = None
  540. dict repl = {}
  541. try:
  542. res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst)
  543. except InvalidOperation:
  544. # GH#51157 dateutil can raise decimal.InvalidOperation
  545. res = None
  546. if res is None:
  547. raise DateParseError(
  548. f"Unknown datetime string format, unable to parse: {timestr}"
  549. )
  550. for attr in ["year", "month", "day", "hour",
  551. "minute", "second", "microsecond"]:
  552. value = getattr(res, attr)
  553. if value is not None:
  554. repl[attr] = value
  555. reso = attr
  556. if reso is None:
  557. raise DateParseError(f"Unable to parse datetime string: {timestr}")
  558. if reso == "microsecond":
  559. if repl["microsecond"] == 0:
  560. reso = "second"
  561. elif repl["microsecond"] % 1000 == 0:
  562. reso = "millisecond"
  563. try:
  564. ret = default.replace(**repl)
  565. except ValueError as err:
  566. # e.g. "day is out of range for month"
  567. # we re-raise to match dateutil's exception message
  568. raise DateParseError(str(err) + ": " + timestr) from err
  569. except OverflowError as err:
  570. # with e.g. "08335394550" dateutil raises when trying to pass
  571. # year=8335394550 to datetime.replace
  572. raise OutOfBoundsDatetime(
  573. f'Parsing "{timestr}" to datetime overflows'
  574. ) from err
  575. if res.weekday is not None and not res.day:
  576. ret = ret + relativedelta.relativedelta(weekday=res.weekday)
  577. if not ignoretz:
  578. if res.tzname and res.tzname in time.tzname:
  579. # GH#50791
  580. if res.tzname != "UTC":
  581. # If the system is localized in UTC (as many CI runs are)
  582. # we get tzlocal, once the deprecation is enforced will get
  583. # timezone.utc, not raise.
  584. warnings.warn(
  585. "Parsing '{res.tzname}' as tzlocal (dependent on system timezone) "
  586. "is deprecated and will raise in a future version. Pass the 'tz' "
  587. "keyword or call tz_localize after construction instead",
  588. FutureWarning,
  589. stacklevel=find_stack_level()
  590. )
  591. ret = ret.replace(tzinfo=_dateutil_tzlocal())
  592. elif res.tzoffset == 0:
  593. ret = ret.replace(tzinfo=_dateutil_tzutc())
  594. elif res.tzoffset:
  595. ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))
  596. # dateutil can return a datetime with a tzoffset outside of (-24H, 24H)
  597. # bounds, which is invalid (can be constructed, but raises if we call
  598. # str(ret)). Check that and raise here if necessary.
  599. try:
  600. ret.utcoffset()
  601. except ValueError as err:
  602. # offset must be a timedelta strictly between -timedelta(hours=24)
  603. # and timedelta(hours=24)
  604. raise ValueError(
  605. f'Parsed string "{timestr}" gives an invalid tzoffset, '
  606. "which must be between -timedelta(hours=24) and timedelta(hours=24)"
  607. )
  608. out_bestunit[0] = attrname_to_npy_unit[reso]
  609. return ret
  610. # ----------------------------------------------------------------------
  611. # Parsing for type-inference
  612. def try_parse_dates(object[:] values, parser) -> np.ndarray:
  613. cdef:
  614. Py_ssize_t i, n
  615. object[::1] result
  616. n = len(values)
  617. result = np.empty(n, dtype="O")
  618. for i in range(n):
  619. if values[i] == "":
  620. result[i] = np.nan
  621. else:
  622. result[i] = parser(values[i])
  623. return result.base # .base to access underlying ndarray
  624. def try_parse_year_month_day(
  625. object[:] years, object[:] months, object[:] days
  626. ) -> np.ndarray:
  627. cdef:
  628. Py_ssize_t i, n
  629. object[::1] result
  630. n = len(years)
  631. # TODO(cython3): Use len instead of `shape[0]`
  632. if months.shape[0] != n or days.shape[0] != n:
  633. raise ValueError("Length of years/months/days must all be equal")
  634. result = np.empty(n, dtype="O")
  635. for i in range(n):
  636. result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))
  637. return result.base # .base to access underlying ndarray
  638. # ----------------------------------------------------------------------
  639. # Miscellaneous
  640. # Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
  641. #
  642. # We use this class to parse and tokenize date strings. However, as it is
  643. # a private class in the dateutil library, relying on backwards compatibility
  644. # is not practical. In fact, using this class issues warnings (xref gh-21322).
  645. # Thus, we port the class over so that both issues are resolved.
  646. #
  647. # Copyright (c) 2017 - dateutil contributors
  648. class _timelex:
  649. def __init__(self, instream):
  650. if getattr(instream, "decode", None) is not None:
  651. instream = instream.decode()
  652. if isinstance(instream, str):
  653. self.stream = instream
  654. elif getattr(instream, "read", None) is None:
  655. raise TypeError(
  656. "Parser must be a string or character stream, not "
  657. f"{type(instream).__name__}")
  658. else:
  659. self.stream = instream.read()
  660. def get_tokens(self):
  661. """
  662. This function breaks the time string into lexical units (tokens), which
  663. can be parsed by the parser. Lexical units are demarcated by changes in
  664. the character set, so any continuous string of letters is considered
  665. one unit, any continuous string of numbers is considered one unit.
  666. The main complication arises from the fact that dots ('.') can be used
  667. both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
  668. "4:30:21.447"). As such, it is necessary to read the full context of
  669. any dot-separated strings before breaking it into tokens; as such, this
  670. function maintains a "token stack", for when the ambiguous context
  671. demands that multiple tokens be parsed at once.
  672. """
  673. cdef:
  674. Py_ssize_t n
  675. stream = self.stream.replace("\x00", "")
  676. # TODO: Change \s --> \s+ (this doesn't match existing behavior)
  677. # TODO: change the punctuation block to punc+ (does not match existing)
  678. # TODO: can we merge the two digit patterns?
  679. tokens = re.findall(r"\s|"
  680. r"(?<![\.\d])\d+\.\d+(?![\.\d])"
  681. r"|\d+"
  682. r"|[a-zA-Z]+"
  683. r"|[\./:]+"
  684. r"|[^\da-zA-Z\./:\s]+", stream)
  685. # Re-combine token tuples of the form ["59", ",", "456"] because
  686. # in this context the "," is treated as a decimal
  687. # (e.g. in python's default logging format)
  688. for n, token in enumerate(tokens[:-2]):
  689. # Kludge to match ,-decimal behavior; it'd be better to do this
  690. # later in the process and have a simpler tokenization
  691. if (token is not None and token.isdigit() and
  692. tokens[n + 1] == "," and tokens[n + 2].isdigit()):
  693. # Have to check None b/c it might be replaced during the loop
  694. # TODO: I _really_ don't faking the value here
  695. tokens[n] = token + "." + tokens[n + 2]
  696. tokens[n + 1] = None
  697. tokens[n + 2] = None
  698. tokens = [x for x in tokens if x is not None]
  699. return tokens
  700. @classmethod
  701. def split(cls, s):
  702. return cls(s).get_tokens()
  703. _DATEUTIL_LEXER_SPLIT = _timelex.split
def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
    """
    Guess the datetime format of a given datetime string.

    The string is parsed once with ``du_parse`` and split into tokens; each
    token is then matched against the ``strftime`` rendering of the parsed
    value for a list of candidate directives. The guess is only returned if
    the input parses with it and formatting the parsed value with it
    reproduces the (padded) token string.

    Parameters
    ----------
    dt_str : str
        Datetime string to guess the format of.
    dayfirst : bool, default False
        If True parses dates with the day first, eg 20/01/2005
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug).

    Returns
    -------
    str or None : ret
        datetime format string (for `strftime` or `strptime`),
        or None if it can't be guessed.

    Notes
    -----
    A guess is only accepted when year, month and day were all identified,
    with the exceptions of ``%Y`` alone and ``%Y-%m``. A ``UserWarning`` may
    be emitted when the guessed day/month order disagrees with `dayfirst`.
    """
    day_attribute_and_format = (("day",), "%d", 2)

    # attr name, format, padding (if any)
    # NOTE: order matters. Entries are tried top to bottom and, once an
    # attribute has been matched, later entries sharing that attribute are
    # skipped. The padding is the zero-fill width applied to a token before
    # it is compared with the strftime output.
    datetime_attrs_to_format = [
        (("year", "month", "day", "hour", "minute", "second"), "%Y%m%d%H%M%S", 0),
        (("year", "month", "day", "hour", "minute"), "%Y%m%d%H%M", 0),
        (("year", "month", "day", "hour"), "%Y%m%d%H", 0),
        (("year", "month", "day"), "%Y%m%d", 0),
        (("hour", "minute", "second"), "%H%M%S", 0),
        (("hour", "minute"), "%H%M", 0),
        (("year",), "%Y", 0),
        (("month",), "%B", 0),
        (("month",), "%b", 0),
        (("month",), "%m", 2),
        day_attribute_and_format,
        (("hour",), "%H", 2),
        (("minute",), "%M", 2),
        (("second",), "%S", 2),
        (("second", "microsecond"), "%S.%f", 0),
        (("tzinfo",), "%z", 0),
        (("tzinfo",), "%Z", 0),
        (("day_of_week",), "%a", 0),
        (("day_of_week",), "%A", 0),
        (("meridiem",), "%p", 0),
    ]

    if dayfirst:
        # Move the day entry to the front so "%d" claims the first matching
        # token before any other directive is tried.
        datetime_attrs_to_format.remove(day_attribute_and_format)
        datetime_attrs_to_format.insert(0, day_attribute_and_format)

    try:
        parsed_datetime = du_parse(dt_str, dayfirst=dayfirst)
    except (ValueError, OverflowError, InvalidOperation):
        # In case the datetime can't be parsed, its format cannot be guessed
        return None

    if parsed_datetime is None:
        return None

    # _DATEUTIL_LEXER_SPLIT from dateutil will never raise here
    tokens = _DATEUTIL_LEXER_SPLIT(dt_str)

    # Normalize offset part of tokens.
    # There are multiple formats for the timezone offset.
    # To pass the comparison condition between the output of `strftime` and
    # joined tokens, which is carried out at the final step of the function,
    # the offset part of the tokens must match the '%z' format like '+0900'
    # instead of '+09:00'.
    if parsed_datetime.tzinfo is not None:
        offset_index = None
        if len(tokens) > 0 and tokens[-1] == "Z":
            # the last 'Z' means zero offset
            offset_index = -1
        elif len(tokens) > 1 and tokens[-2] in ("+", "-"):
            # ex. [..., '+', '0900']
            offset_index = -2
        elif len(tokens) > 3 and tokens[-4] in ("+", "-"):
            # ex. [..., '+', '09', ':', '00']
            offset_index = -4

        if offset_index is not None:
            # If the input string has a timezone offset like '+0900',
            # the offset is separated into two tokens, ex. ['+', '0900'].
            # This separation will prevent subsequent processing
            # from correctly parsing the time zone format.
            # So in addition to the format normalization, we rejoin them here.
            try:
                tokens[offset_index] = parsed_datetime.strftime("%z")
            except ValueError:
                # Invalid offset might not have raised in du_parse
                # https://github.com/dateutil/dateutil/issues/188
                return None
            # Drop the tokens after the rejoined offset. When offset_index
            # is -1, `offset_index + 1` is 0, so `or None` turns the slice
            # end into "keep everything".
            tokens = tokens[:offset_index + 1 or None]

    # One slot per token; None means "not matched to a directive (yet)".
    format_guess = [None] * len(tokens)
    found_attrs = set()

    for attrs, attr_format, padding in datetime_attrs_to_format:
        # If a given attribute has been placed in the format string, skip
        # over other formats for that same underlying attribute (IE, month
        # can be represented in multiple different ways)
        if set(attrs) & found_attrs:
            continue

        if parsed_datetime.tzinfo is None and attr_format in ("%Z", "%z"):
            continue

        parsed_formatted = parsed_datetime.strftime(attr_format)
        for i, token_format in enumerate(format_guess):
            token_filled = _fill_token(tokens[i], padding)
            if token_format is None and token_filled == parsed_formatted:
                format_guess[i] = attr_format
                # Store the padded token so the final round-trip comparison
                # is made against the padded form.
                tokens[i] = token_filled
                found_attrs.update(attrs)
                break

    # Only consider it a valid guess if we have a year, month and day.
    # We make exceptions for %Y and %Y-%m (only with the `-` separator)
    # as they conform with ISO8601.
    if (
        len({"year", "month", "day"} & found_attrs) != 3
        and format_guess != ["%Y"]
        and not (
            format_guess == ["%Y", None, "%m"] and tokens[1] == "-"
        )
    ):
        return None

    output_format = []
    for i, guess in enumerate(format_guess):
        if guess is not None:
            # Either fill in the format placeholder (like %Y)
            output_format.append(guess)
        else:
            # Or just the token separate (IE, the dashes in "01-01-2013")
            try:
                # If the token is numeric, then we likely didn't parse it
                # properly, so our guess is wrong
                float(tokens[i])
                return None
            except ValueError:
                pass

            output_format.append(tokens[i])

    # if am/pm token present, replace 24-hour %H, with 12-hour %I
    if "%p" in output_format and "%H" in output_format:
        i = output_format.index("%H")
        output_format[i] = "%I"

    guessed_format = "".join(output_format)

    # First validation: the original string must parse with the guess.
    try:
        array_strptime(np.asarray([dt_str], dtype=object), guessed_format)
    except ValueError:
        # Doesn't parse, so this can't be the correct format.
        return None
    # rebuild string, capturing any inferred padding
    dt_str = "".join(tokens)
    # Second validation: formatting the parsed value with the guess must
    # reproduce the rebuilt string exactly.
    if parsed_datetime.strftime(guessed_format) == dt_str:
        _maybe_warn_about_dayfirst(guessed_format, dayfirst)
        return guessed_format
    else:
        return None
  848. cdef str _fill_token(token: str, padding: int):
  849. cdef str token_filled
  850. if re.search(r"\d+\.\d+", token) is None:
  851. # For example: 98
  852. token_filled = token.zfill(padding)
  853. else:
  854. # For example: 00.123
  855. seconds, nanoseconds = token.split(".")
  856. seconds = f"{int(seconds):02d}"
  857. # right-pad so we get nanoseconds, then only take
  858. # first 6 digits (microseconds) as stdlib datetime
  859. # doesn't support nanoseconds
  860. nanoseconds = nanoseconds.ljust(9, "0")[:6]
  861. token_filled = f"{seconds}.{nanoseconds}"
  862. return token_filled
  863. cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst):
  864. """Warn if guessed datetime format doesn't respect dayfirst argument."""
  865. cdef:
  866. int day_index = format.find("%d")
  867. int month_index = format.find("%m")
  868. if (day_index != -1) and (month_index != -1):
  869. if (day_index > month_index) and dayfirst:
  870. warnings.warn(
  871. f"Parsing dates in {format} format when dayfirst=True was specified. "
  872. "Pass `dayfirst=False` or specify a format to silence this warning.",
  873. UserWarning,
  874. stacklevel=find_stack_level(),
  875. )
  876. if (day_index < month_index) and not dayfirst:
  877. warnings.warn(
  878. f"Parsing dates in {format} format when dayfirst=False (the default) "
  879. "was specified. "
  880. "Pass `dayfirst=True` or specify a format to silence this warning.",
  881. UserWarning,
  882. stacklevel=find_stack_level(),
  883. )
  884. @cython.wraparound(False)
  885. @cython.boundscheck(False)
  886. cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
  887. """
  888. Convert `item` to str.
  889. Parameters
  890. ----------
  891. item : object
  892. keep_trivial_numbers : bool
  893. if True, then conversion (to string from integer/float zero)
  894. is not performed
  895. Returns
  896. -------
  897. str or int or float
  898. """
  899. cdef:
  900. float64_t float_item
  901. if keep_trivial_numbers:
  902. if isinstance(item, int):
  903. if <int>item == 0:
  904. return item
  905. elif isinstance(item, float):
  906. float_item = item
  907. if float_item == 0.0 or float_item != float_item:
  908. return item
  909. if not isinstance(item, str):
  910. item = PyObject_Str(item)
  911. return item
  912. @cython.wraparound(False)
  913. @cython.boundscheck(False)
  914. def concat_date_cols(tuple date_cols) -> np.ndarray:
  915. """
  916. Concatenates elements from numpy arrays in `date_cols` into strings.
  917. Parameters
  918. ----------
  919. date_cols : tuple[ndarray]
  920. Returns
  921. -------
  922. arr_of_rows : ndarray[object]
  923. Examples
  924. --------
  925. >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
  926. >>> times=np.array(['11:20', '10:45'], dtype=object)
  927. >>> result = concat_date_cols((dates, times))
  928. >>> result
  929. array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
  930. """
  931. cdef:
  932. Py_ssize_t rows_count = 0, col_count = len(date_cols)
  933. Py_ssize_t col_idx, row_idx
  934. list list_to_join
  935. cnp.ndarray[object] iters
  936. object[::1] iters_view
  937. flatiter it
  938. cnp.ndarray[object] result
  939. object[::1] result_view
  940. if col_count == 0:
  941. return np.zeros(0, dtype=object)
  942. if not all(is_array(array) for array in date_cols):
  943. raise ValueError("not all elements from date_cols are numpy arrays")
  944. rows_count = min(len(array) for array in date_cols)
  945. result = np.zeros(rows_count, dtype=object)
  946. result_view = result
  947. if col_count == 1:
  948. array = date_cols[0]
  949. it = <flatiter>PyArray_IterNew(array)
  950. for row_idx in range(rows_count):
  951. item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
  952. result_view[row_idx] = convert_to_unicode(item, True)
  953. PyArray_ITER_NEXT(it)
  954. else:
  955. # create fixed size list - more efficient memory allocation
  956. list_to_join = [None] * col_count
  957. iters = np.zeros(col_count, dtype=object)
  958. # create memoryview of iters ndarray, that will contain some
  959. # flatiter's for each array in `date_cols` - more efficient indexing
  960. iters_view = iters
  961. for col_idx, array in enumerate(date_cols):
  962. iters_view[col_idx] = PyArray_IterNew(array)
  963. # array elements that are on the same line are converted to one string
  964. for row_idx in range(rows_count):
  965. for col_idx, array in enumerate(date_cols):
  966. # this cast is needed, because we did not find a way
  967. # to efficiently store `flatiter` type objects in ndarray
  968. it = <flatiter>iters_view[col_idx]
  969. item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
  970. list_to_join[col_idx] = convert_to_unicode(item, False)
  971. PyArray_ITER_NEXT(it)
  972. result_view[row_idx] = " ".join(list_to_join)
  973. return result
  974. cpdef str get_rule_month(str source):
  975. """
  976. Return starting month of given freq, default is December.
  977. Parameters
  978. ----------
  979. source : str
  980. Derived from `freq.rule_code` or `freq.freqstr`.
  981. Returns
  982. -------
  983. rule_month: str
  984. Examples
  985. --------
  986. >>> get_rule_month('D')
  987. 'DEC'
  988. >>> get_rule_month('A-JAN')
  989. 'JAN'
  990. """
  991. source = source.upper()
  992. if "-" not in source:
  993. return "DEC"
  994. else:
  995. return source.split("-")[1]