test_parse_dates.py 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240
  1. """
  2. Tests date parsing functionality for all of the
  3. parsers defined in parsers.py
  4. """
  5. from datetime import (
  6. date,
  7. datetime,
  8. timedelta,
  9. timezone,
  10. )
  11. from io import StringIO
  12. from dateutil.parser import parse as du_parse
  13. from hypothesis import given
  14. import numpy as np
  15. import pytest
  16. import pytz
  17. from pandas._libs.tslibs import parsing
  18. from pandas._libs.tslibs.parsing import py_parse_datetime_string
  19. import pandas as pd
  20. from pandas import (
  21. DataFrame,
  22. DatetimeIndex,
  23. Index,
  24. MultiIndex,
  25. Series,
  26. Timestamp,
  27. )
  28. import pandas._testing as tm
  29. from pandas._testing._hypothesis import DATETIME_NO_TZ
  30. from pandas.core.indexes.datetimes import date_range
  31. from pandas.io.parsers import read_csv
  32. xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
  33. # GH#43650: Some expected failures with the pyarrow engine can occasionally
  34. # cause a deadlock instead, so we skip these instead of xfailing
  35. skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  36. @xfail_pyarrow
  37. def test_read_csv_with_custom_date_parser(all_parsers):
  38. # GH36111
  39. def __custom_date_parser(time):
  40. time = time.astype(np.float_)
  41. time = time.astype(np.int_) # convert float seconds to int type
  42. return pd.to_timedelta(time, unit="s")
  43. testdata = StringIO(
  44. """time e n h
  45. 41047.00 -98573.7297 871458.0640 389.0089
  46. 41048.00 -98573.7299 871458.0640 389.0089
  47. 41049.00 -98573.7300 871458.0642 389.0088
  48. 41050.00 -98573.7299 871458.0643 389.0088
  49. 41051.00 -98573.7302 871458.0640 389.0086
  50. """
  51. )
  52. result = all_parsers.read_csv_check_warnings(
  53. FutureWarning,
  54. "Please use 'date_format' instead",
  55. testdata,
  56. delim_whitespace=True,
  57. parse_dates=True,
  58. date_parser=__custom_date_parser,
  59. index_col="time",
  60. )
  61. time = [41047, 41048, 41049, 41050, 41051]
  62. time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time")
  63. expected = DataFrame(
  64. {
  65. "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302],
  66. "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640],
  67. "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086],
  68. },
  69. index=time,
  70. )
  71. tm.assert_frame_equal(result, expected)
  72. @xfail_pyarrow
  73. def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers):
  74. # GH44366
  75. def __custom_date_parser(time):
  76. time = time.astype(np.float_)
  77. time = time.astype(np.int_) # convert float seconds to int type
  78. return pd.to_timedelta(time, unit="s")
  79. testdata = StringIO(
  80. """time e
  81. 41047.00 -93.77
  82. 41048.00 -95.79
  83. 41049.00 -98.73
  84. 41050.00 -93.99
  85. 41051.00 -97.72
  86. """
  87. )
  88. result = all_parsers.read_csv_check_warnings(
  89. FutureWarning,
  90. "Please use 'date_format' instead",
  91. testdata,
  92. delim_whitespace=True,
  93. parse_dates=False,
  94. date_parser=__custom_date_parser,
  95. index_col="time",
  96. )
  97. time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], name="time")
  98. expected = DataFrame(
  99. {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]},
  100. index=time,
  101. )
  102. tm.assert_frame_equal(result, expected)
  103. @xfail_pyarrow
  104. def test_separator_date_conflict(all_parsers):
  105. # Regression test for gh-4678
  106. #
  107. # Make sure thousands separator and
  108. # date parsing do not conflict.
  109. parser = all_parsers
  110. data = "06-02-2013;13:00;1-000.215"
  111. expected = DataFrame(
  112. [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2]
  113. )
  114. df = parser.read_csv(
  115. StringIO(data),
  116. sep=";",
  117. thousands="-",
  118. parse_dates={"Date": [0, 1]},
  119. header=None,
  120. )
  121. tm.assert_frame_equal(df, expected)
  122. @xfail_pyarrow
  123. @pytest.mark.parametrize("keep_date_col", [True, False])
  124. def test_multiple_date_col_custom(all_parsers, keep_date_col):
  125. data = """\
  126. KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  127. KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  128. KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  129. KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  130. KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  131. KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
  132. """
  133. parser = all_parsers
  134. def date_parser(*date_cols):
  135. """
  136. Test date parser.
  137. Parameters
  138. ----------
  139. date_cols : args
  140. The list of data columns to parse.
  141. Returns
  142. -------
  143. parsed : Series
  144. """
  145. return parsing.try_parse_dates(
  146. parsing.concat_date_cols(date_cols), parser=du_parse
  147. )
  148. kwds = {
  149. "header": None,
  150. "date_parser": date_parser,
  151. "parse_dates": {"actual": [1, 2], "nominal": [1, 3]},
  152. "keep_date_col": keep_date_col,
  153. "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
  154. }
  155. result = parser.read_csv_check_warnings(
  156. FutureWarning,
  157. "use 'date_format' instead",
  158. StringIO(data),
  159. **kwds,
  160. )
  161. expected = DataFrame(
  162. [
  163. [
  164. datetime(1999, 1, 27, 19, 0),
  165. datetime(1999, 1, 27, 18, 56),
  166. "KORD",
  167. "19990127",
  168. " 19:00:00",
  169. " 18:56:00",
  170. 0.81,
  171. 2.81,
  172. 7.2,
  173. 0.0,
  174. 280.0,
  175. ],
  176. [
  177. datetime(1999, 1, 27, 20, 0),
  178. datetime(1999, 1, 27, 19, 56),
  179. "KORD",
  180. "19990127",
  181. " 20:00:00",
  182. " 19:56:00",
  183. 0.01,
  184. 2.21,
  185. 7.2,
  186. 0.0,
  187. 260.0,
  188. ],
  189. [
  190. datetime(1999, 1, 27, 21, 0),
  191. datetime(1999, 1, 27, 20, 56),
  192. "KORD",
  193. "19990127",
  194. " 21:00:00",
  195. " 20:56:00",
  196. -0.59,
  197. 2.21,
  198. 5.7,
  199. 0.0,
  200. 280.0,
  201. ],
  202. [
  203. datetime(1999, 1, 27, 21, 0),
  204. datetime(1999, 1, 27, 21, 18),
  205. "KORD",
  206. "19990127",
  207. " 21:00:00",
  208. " 21:18:00",
  209. -0.99,
  210. 2.01,
  211. 3.6,
  212. 0.0,
  213. 270.0,
  214. ],
  215. [
  216. datetime(1999, 1, 27, 22, 0),
  217. datetime(1999, 1, 27, 21, 56),
  218. "KORD",
  219. "19990127",
  220. " 22:00:00",
  221. " 21:56:00",
  222. -0.59,
  223. 1.71,
  224. 5.1,
  225. 0.0,
  226. 290.0,
  227. ],
  228. [
  229. datetime(1999, 1, 27, 23, 0),
  230. datetime(1999, 1, 27, 22, 56),
  231. "KORD",
  232. "19990127",
  233. " 23:00:00",
  234. " 22:56:00",
  235. -0.59,
  236. 1.71,
  237. 4.6,
  238. 0.0,
  239. 280.0,
  240. ],
  241. ],
  242. columns=[
  243. "actual",
  244. "nominal",
  245. "X0",
  246. "X1",
  247. "X2",
  248. "X3",
  249. "X4",
  250. "X5",
  251. "X6",
  252. "X7",
  253. "X8",
  254. ],
  255. )
  256. if not keep_date_col:
  257. expected = expected.drop(["X1", "X2", "X3"], axis=1)
  258. # Python can sometimes be flaky about how
  259. # the aggregated columns are entered, so
  260. # this standardizes the order.
  261. result = result[expected.columns]
  262. tm.assert_frame_equal(result, expected)
  263. @pytest.mark.parametrize("container", [list, tuple, Index, Series])
  264. @pytest.mark.parametrize("dim", [1, 2])
  265. def test_concat_date_col_fail(container, dim):
  266. msg = "not all elements from date_cols are numpy arrays"
  267. value = "19990127"
  268. date_cols = tuple(container([value]) for _ in range(dim))
  269. with pytest.raises(ValueError, match=msg):
  270. parsing.concat_date_cols(date_cols)
  271. @xfail_pyarrow
  272. @pytest.mark.parametrize("keep_date_col", [True, False])
  273. def test_multiple_date_col(all_parsers, keep_date_col):
  274. data = """\
  275. KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  276. KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  277. KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  278. KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  279. KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  280. KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
  281. """
  282. parser = all_parsers
  283. kwds = {
  284. "header": None,
  285. "parse_dates": [[1, 2], [1, 3]],
  286. "keep_date_col": keep_date_col,
  287. "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
  288. }
  289. result = parser.read_csv(StringIO(data), **kwds)
  290. expected = DataFrame(
  291. [
  292. [
  293. datetime(1999, 1, 27, 19, 0),
  294. datetime(1999, 1, 27, 18, 56),
  295. "KORD",
  296. "19990127",
  297. " 19:00:00",
  298. " 18:56:00",
  299. 0.81,
  300. 2.81,
  301. 7.2,
  302. 0.0,
  303. 280.0,
  304. ],
  305. [
  306. datetime(1999, 1, 27, 20, 0),
  307. datetime(1999, 1, 27, 19, 56),
  308. "KORD",
  309. "19990127",
  310. " 20:00:00",
  311. " 19:56:00",
  312. 0.01,
  313. 2.21,
  314. 7.2,
  315. 0.0,
  316. 260.0,
  317. ],
  318. [
  319. datetime(1999, 1, 27, 21, 0),
  320. datetime(1999, 1, 27, 20, 56),
  321. "KORD",
  322. "19990127",
  323. " 21:00:00",
  324. " 20:56:00",
  325. -0.59,
  326. 2.21,
  327. 5.7,
  328. 0.0,
  329. 280.0,
  330. ],
  331. [
  332. datetime(1999, 1, 27, 21, 0),
  333. datetime(1999, 1, 27, 21, 18),
  334. "KORD",
  335. "19990127",
  336. " 21:00:00",
  337. " 21:18:00",
  338. -0.99,
  339. 2.01,
  340. 3.6,
  341. 0.0,
  342. 270.0,
  343. ],
  344. [
  345. datetime(1999, 1, 27, 22, 0),
  346. datetime(1999, 1, 27, 21, 56),
  347. "KORD",
  348. "19990127",
  349. " 22:00:00",
  350. " 21:56:00",
  351. -0.59,
  352. 1.71,
  353. 5.1,
  354. 0.0,
  355. 290.0,
  356. ],
  357. [
  358. datetime(1999, 1, 27, 23, 0),
  359. datetime(1999, 1, 27, 22, 56),
  360. "KORD",
  361. "19990127",
  362. " 23:00:00",
  363. " 22:56:00",
  364. -0.59,
  365. 1.71,
  366. 4.6,
  367. 0.0,
  368. 280.0,
  369. ],
  370. ],
  371. columns=[
  372. "X1_X2",
  373. "X1_X3",
  374. "X0",
  375. "X1",
  376. "X2",
  377. "X3",
  378. "X4",
  379. "X5",
  380. "X6",
  381. "X7",
  382. "X8",
  383. ],
  384. )
  385. if not keep_date_col:
  386. expected = expected.drop(["X1", "X2", "X3"], axis=1)
  387. tm.assert_frame_equal(result, expected)
  388. def test_date_col_as_index_col(all_parsers):
  389. data = """\
  390. KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  391. KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  392. KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  393. KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  394. KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  395. """
  396. parser = all_parsers
  397. kwds = {
  398. "header": None,
  399. "parse_dates": [1],
  400. "index_col": 1,
  401. "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"],
  402. }
  403. result = parser.read_csv(StringIO(data), **kwds)
  404. index = Index(
  405. [
  406. datetime(1999, 1, 27, 19, 0),
  407. datetime(1999, 1, 27, 20, 0),
  408. datetime(1999, 1, 27, 21, 0),
  409. datetime(1999, 1, 27, 21, 0),
  410. datetime(1999, 1, 27, 22, 0),
  411. ],
  412. name="X1",
  413. )
  414. expected = DataFrame(
  415. [
  416. ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
  417. ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
  418. ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
  419. ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
  420. ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
  421. ],
  422. columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"],
  423. index=index,
  424. )
  425. if parser.engine == "pyarrow":
  426. # https://github.com/pandas-dev/pandas/issues/44231
  427. # pyarrow 6.0 starts to infer time type
  428. expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time
  429. tm.assert_frame_equal(result, expected)
  430. @xfail_pyarrow
  431. def test_multiple_date_cols_int_cast(all_parsers):
  432. data = (
  433. "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
  434. "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
  435. "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
  436. "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
  437. "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
  438. "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
  439. )
  440. parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
  441. parser = all_parsers
  442. kwds = {
  443. "header": None,
  444. "parse_dates": parse_dates,
  445. "date_parser": pd.to_datetime,
  446. }
  447. result = parser.read_csv_check_warnings(
  448. FutureWarning, "use 'date_format' instead", StringIO(data), **kwds
  449. )
  450. expected = DataFrame(
  451. [
  452. [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81],
  453. [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01],
  454. [
  455. datetime(1999, 1, 27, 21, 0),
  456. datetime(1999, 1, 27, 20, 56),
  457. "KORD",
  458. -0.59,
  459. ],
  460. [
  461. datetime(1999, 1, 27, 21, 0),
  462. datetime(1999, 1, 27, 21, 18),
  463. "KORD",
  464. -0.99,
  465. ],
  466. [
  467. datetime(1999, 1, 27, 22, 0),
  468. datetime(1999, 1, 27, 21, 56),
  469. "KORD",
  470. -0.59,
  471. ],
  472. [
  473. datetime(1999, 1, 27, 23, 0),
  474. datetime(1999, 1, 27, 22, 56),
  475. "KORD",
  476. -0.59,
  477. ],
  478. ],
  479. columns=["actual", "nominal", 0, 4],
  480. )
  481. # Python can sometimes be flaky about how
  482. # the aggregated columns are entered, so
  483. # this standardizes the order.
  484. result = result[expected.columns]
  485. tm.assert_frame_equal(result, expected)
  486. @xfail_pyarrow
  487. def test_multiple_date_col_timestamp_parse(all_parsers):
  488. parser = all_parsers
  489. data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
  490. 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
  491. result = parser.read_csv_check_warnings(
  492. FutureWarning,
  493. "use 'date_format' instead",
  494. StringIO(data),
  495. parse_dates=[[0, 1]],
  496. header=None,
  497. date_parser=Timestamp,
  498. )
  499. expected = DataFrame(
  500. [
  501. [
  502. Timestamp("05/31/2012, 15:30:00.029"),
  503. 1306.25,
  504. 1,
  505. "E",
  506. 0,
  507. np.nan,
  508. 1306.25,
  509. ],
  510. [
  511. Timestamp("05/31/2012, 15:30:00.029"),
  512. 1306.25,
  513. 8,
  514. "E",
  515. 0,
  516. np.nan,
  517. 1306.25,
  518. ],
  519. ],
  520. columns=["0_1", 2, 3, 4, 5, 6, 7],
  521. )
  522. tm.assert_frame_equal(result, expected)
  523. @xfail_pyarrow
  524. def test_multiple_date_cols_with_header(all_parsers):
  525. parser = all_parsers
  526. data = """\
  527. ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
  528. KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  529. KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  530. KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  531. KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  532. KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  533. KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
  534. result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
  535. expected = DataFrame(
  536. [
  537. [
  538. datetime(1999, 1, 27, 19, 0),
  539. "KORD",
  540. " 18:56:00",
  541. 0.81,
  542. 2.81,
  543. 7.2,
  544. 0.0,
  545. 280.0,
  546. ],
  547. [
  548. datetime(1999, 1, 27, 20, 0),
  549. "KORD",
  550. " 19:56:00",
  551. 0.01,
  552. 2.21,
  553. 7.2,
  554. 0.0,
  555. 260.0,
  556. ],
  557. [
  558. datetime(1999, 1, 27, 21, 0),
  559. "KORD",
  560. " 20:56:00",
  561. -0.59,
  562. 2.21,
  563. 5.7,
  564. 0.0,
  565. 280.0,
  566. ],
  567. [
  568. datetime(1999, 1, 27, 21, 0),
  569. "KORD",
  570. " 21:18:00",
  571. -0.99,
  572. 2.01,
  573. 3.6,
  574. 0.0,
  575. 270.0,
  576. ],
  577. [
  578. datetime(1999, 1, 27, 22, 0),
  579. "KORD",
  580. " 21:56:00",
  581. -0.59,
  582. 1.71,
  583. 5.1,
  584. 0.0,
  585. 290.0,
  586. ],
  587. [
  588. datetime(1999, 1, 27, 23, 0),
  589. "KORD",
  590. " 22:56:00",
  591. -0.59,
  592. 1.71,
  593. 4.6,
  594. 0.0,
  595. 280.0,
  596. ],
  597. ],
  598. columns=[
  599. "nominal",
  600. "ID",
  601. "ActualTime",
  602. "TDew",
  603. "TAir",
  604. "Windspeed",
  605. "Precip",
  606. "WindDir",
  607. ],
  608. )
  609. tm.assert_frame_equal(result, expected)
  610. @pytest.mark.parametrize(
  611. "data,parse_dates,msg",
  612. [
  613. (
  614. """\
  615. date_NominalTime,date,NominalTime
  616. KORD1,19990127, 19:00:00
  617. KORD2,19990127, 20:00:00""",
  618. [[1, 2]],
  619. ("New date column already in dict date_NominalTime"),
  620. ),
  621. (
  622. """\
  623. ID,date,nominalTime
  624. KORD,19990127, 19:00:00
  625. KORD,19990127, 20:00:00""",
  626. {"ID": [1, 2]},
  627. "Date column ID already in dict",
  628. ),
  629. ],
  630. )
  631. def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
  632. parser = all_parsers
  633. with pytest.raises(ValueError, match=msg):
  634. parser.read_csv(StringIO(data), parse_dates=parse_dates)
  635. def test_date_parser_int_bug(all_parsers):
  636. # see gh-3071
  637. parser = all_parsers
  638. data = (
  639. "posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
  640. "accountid,userid,contactid,level,silo,method\n"
  641. "1343103150,0.062353,0,4,6,0.01690,3,"
  642. "12345,1,-1,3,invoice_InvoiceResource,search\n"
  643. )
  644. result = parser.read_csv_check_warnings(
  645. FutureWarning,
  646. "use 'date_format' instead",
  647. StringIO(data),
  648. index_col=0,
  649. parse_dates=[0],
  650. date_parser=lambda x: datetime.utcfromtimestamp(int(x)),
  651. )
  652. expected = DataFrame(
  653. [
  654. [
  655. 0.062353,
  656. 0,
  657. 4,
  658. 6,
  659. 0.01690,
  660. 3,
  661. 12345,
  662. 1,
  663. -1,
  664. 3,
  665. "invoice_InvoiceResource",
  666. "search",
  667. ]
  668. ],
  669. columns=[
  670. "elapsed",
  671. "sys",
  672. "user",
  673. "queries",
  674. "query_time",
  675. "rows",
  676. "accountid",
  677. "userid",
  678. "contactid",
  679. "level",
  680. "silo",
  681. "method",
  682. ],
  683. index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"),
  684. )
  685. tm.assert_frame_equal(result, expected)
  686. @xfail_pyarrow
  687. def test_nat_parse(all_parsers):
  688. # see gh-3062
  689. parser = all_parsers
  690. df = DataFrame(
  691. dict({"A": np.arange(10, dtype="float64"), "B": Timestamp("20010101")})
  692. )
  693. df.iloc[3:6, :] = np.nan
  694. with tm.ensure_clean("__nat_parse_.csv") as path:
  695. df.to_csv(path)
  696. result = parser.read_csv(path, index_col=0, parse_dates=["B"])
  697. tm.assert_frame_equal(result, df)
  698. @xfail_pyarrow
  699. def test_csv_custom_parser(all_parsers):
  700. data = """A,B,C
  701. 20090101,a,1,2
  702. 20090102,b,3,4
  703. 20090103,c,4,5
  704. """
  705. parser = all_parsers
  706. result = parser.read_csv_check_warnings(
  707. FutureWarning,
  708. "use 'date_format' instead",
  709. StringIO(data),
  710. date_parser=lambda x: datetime.strptime(x, "%Y%m%d"),
  711. )
  712. expected = parser.read_csv(StringIO(data), parse_dates=True)
  713. tm.assert_frame_equal(result, expected)
  714. result = parser.read_csv(StringIO(data), date_format="%Y%m%d")
  715. tm.assert_frame_equal(result, expected)
  716. @xfail_pyarrow
  717. def test_parse_dates_implicit_first_col(all_parsers):
  718. data = """A,B,C
  719. 20090101,a,1,2
  720. 20090102,b,3,4
  721. 20090103,c,4,5
  722. """
  723. parser = all_parsers
  724. result = parser.read_csv(StringIO(data), parse_dates=True)
  725. expected = parser.read_csv(StringIO(data), index_col=0, parse_dates=True)
  726. tm.assert_frame_equal(result, expected)
  727. @xfail_pyarrow
  728. def test_parse_dates_string(all_parsers):
  729. data = """date,A,B,C
  730. 20090101,a,1,2
  731. 20090102,b,3,4
  732. 20090103,c,4,5
  733. """
  734. parser = all_parsers
  735. result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"])
  736. # freq doesn't round-trip
  737. index = DatetimeIndex(
  738. list(date_range("1/1/2009", periods=3)), name="date", freq=None
  739. )
  740. expected = DataFrame(
  741. {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index
  742. )
  743. tm.assert_frame_equal(result, expected)
  744. # Bug in https://github.com/dateutil/dateutil/issues/217
  745. # has been addressed, but we just don't pass in the `yearfirst`
  746. @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
  747. @pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]])
  748. def test_yy_format_with_year_first(all_parsers, parse_dates):
  749. data = """date,time,B,C
  750. 090131,0010,1,2
  751. 090228,1020,3,4
  752. 090331,0830,5,6
  753. """
  754. parser = all_parsers
  755. result = parser.read_csv_check_warnings(
  756. UserWarning,
  757. "Could not infer format",
  758. StringIO(data),
  759. index_col=0,
  760. parse_dates=parse_dates,
  761. )
  762. index = DatetimeIndex(
  763. [
  764. datetime(2009, 1, 31, 0, 10, 0),
  765. datetime(2009, 2, 28, 10, 20, 0),
  766. datetime(2009, 3, 31, 8, 30, 0),
  767. ],
  768. dtype=object,
  769. name="date_time",
  770. )
  771. expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
  772. tm.assert_frame_equal(result, expected)
  773. @xfail_pyarrow
  774. @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
  775. def test_parse_dates_column_list(all_parsers, parse_dates):
  776. data = "a,b,c\n01/01/2010,1,15/02/2010"
  777. parser = all_parsers
  778. expected = DataFrame(
  779. {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]}
  780. )
  781. expected = expected.set_index(["a", "b"])
  782. result = parser.read_csv(
  783. StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True
  784. )
  785. tm.assert_frame_equal(result, expected)
  786. @xfail_pyarrow
  787. @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
  788. def test_multi_index_parse_dates(all_parsers, index_col):
  789. data = """index1,index2,A,B,C
  790. 20090101,one,a,1,2
  791. 20090101,two,b,3,4
  792. 20090101,three,c,4,5
  793. 20090102,one,a,1,2
  794. 20090102,two,b,3,4
  795. 20090102,three,c,4,5
  796. 20090103,one,a,1,2
  797. 20090103,two,b,3,4
  798. 20090103,three,c,4,5
  799. """
  800. parser = all_parsers
  801. index = MultiIndex.from_product(
  802. [
  803. (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)),
  804. ("one", "two", "three"),
  805. ],
  806. names=["index1", "index2"],
  807. )
  808. # Out of order.
  809. if index_col == [1, 0]:
  810. index = index.swaplevel(0, 1)
  811. expected = DataFrame(
  812. [
  813. ["a", 1, 2],
  814. ["b", 3, 4],
  815. ["c", 4, 5],
  816. ["a", 1, 2],
  817. ["b", 3, 4],
  818. ["c", 4, 5],
  819. ["a", 1, 2],
  820. ["b", 3, 4],
  821. ["c", 4, 5],
  822. ],
  823. columns=["A", "B", "C"],
  824. index=index,
  825. )
  826. result = parser.read_csv_check_warnings(
  827. UserWarning,
  828. "Could not infer format",
  829. StringIO(data),
  830. index_col=index_col,
  831. parse_dates=True,
  832. )
  833. tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}])
def test_parse_dates_custom_euro_format(all_parsers, kwargs):
    # A custom date_parser receives keyword arguments: a valid one
    # ("dayfirst") parses European-format dates; an invalid one
    # ("day_first") surfaces as a TypeError from dateutil.
    parser = all_parsers
    data = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    if "dayfirst" in kwargs:
        df = parser.read_csv_check_warnings(
            FutureWarning,
            "use 'date_format' instead",
            StringIO(data),
            names=["time", "Q", "NTU"],
            date_parser=lambda d: du_parse(d, **kwargs),
            header=0,
            index_col=0,
            parse_dates=True,
            na_values=["NA"],
        )
        exp_index = Index(
            [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)],
            name="time",
        )
        expected = DataFrame(
            {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]},
            index=exp_index,
            columns=["Q", "NTU"],
        )
        tm.assert_frame_equal(df, expected)
    else:
        msg = "got an unexpected keyword argument 'day_first'"
        with pytest.raises(TypeError, match=msg):
            parser.read_csv_check_warnings(
                FutureWarning,
                "use 'date_format' instead",
                StringIO(data),
                names=["time", "Q", "NTU"],
                date_parser=lambda d: du_parse(d, **kwargs),
                skiprows=[0],
                index_col=0,
                parse_dates=True,
                na_values=["NA"],
            )
  879. def test_parse_tz_aware(all_parsers, request):
  880. # See gh-1693
  881. parser = all_parsers
  882. data = "Date,x\n2012-06-13T01:39:00Z,0.5"
  883. result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True)
  884. expected = DataFrame(
  885. {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date")
  886. )
  887. tm.assert_frame_equal(result, expected)
  888. if parser.engine == "pyarrow":
  889. expected_tz = pytz.utc
  890. else:
  891. expected_tz = timezone.utc
  892. assert result.index.tz is expected_tz
@xfail_pyarrow
@pytest.mark.parametrize(
    "parse_dates,index_col",
    [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)],
)
def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
    # Columns 1 and 2 (date + NominalTime) are merged into one parsed
    # datetime column which then serves as the index; the combined column
    # may be named via a dict or auto-named via a nested list, and the
    # index may be selected by name or by position.
    parser = all_parsers
    data = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame(
        [
            [
                datetime(1999, 1, 27, 19, 0),
                "KORD1",
                " 18:56:00",
                0.81,
                2.81,
                7.2,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 20, 0),
                "KORD2",
                " 19:56:00",
                0.01,
                2.21,
                7.2,
                0.0,
                260.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD3",
                " 20:56:00",
                -0.59,
                2.21,
                5.7,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD4",
                " 21:18:00",
                -0.99,
                2.01,
                3.6,
                0.0,
                270.0,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                "KORD5",
                " 21:56:00",
                -0.59,
                1.71,
                5.1,
                0.0,
                290.0,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                "KORD6",
                " 22:56:00",
                -0.59,
                1.71,
                4.6,
                0.0,
                280.0,
            ],
        ],
        columns=[
            "nominal",
            "ID",
            "ActualTime",
            "TDew",
            "TAir",
            "Windspeed",
            "Precip",
            "WindDir",
        ],
    )
    expected = expected.set_index("nominal")

    # With a list spec the merged column is auto-named from its sources.
    if not isinstance(parse_dates, dict):
        expected.index.name = "date_NominalTime"

    result = parser.read_csv(
        StringIO(data), parse_dates=parse_dates, index_col=index_col
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_multiple_date_cols_chunked(all_parsers):
    # Combined date columns must also work when reading in chunks: each
    # chunk carries the merged "nominal" datetime index.
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame(
        [
            [
                datetime(1999, 1, 27, 19, 0),
                "KORD",
                " 18:56:00",
                0.81,
                2.81,
                7.2,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 20, 0),
                "KORD",
                " 19:56:00",
                0.01,
                2.21,
                7.2,
                0.0,
                260.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD",
                " 20:56:00",
                -0.59,
                2.21,
                5.7,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD",
                " 21:18:00",
                -0.99,
                2.01,
                3.6,
                0.0,
                270.0,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                "KORD",
                " 21:56:00",
                -0.59,
                1.71,
                5.1,
                0.0,
                290.0,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                "KORD",
                " 22:56:00",
                -0.59,
                1.71,
                4.6,
                0.0,
                280.0,
            ],
        ],
        columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"],
    )
    expected = expected.set_index("nominal")

    # chunksize=2 over 6 rows -> three chunks of 2 rows each.
    with parser.read_csv(
        StringIO(data),
        parse_dates={"nominal": [1, 2]},
        index_col="nominal",
        chunksize=2,
    ) as reader:
        chunks = list(reader)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])
@xfail_pyarrow
def test_multiple_date_col_named_index_compat(all_parsers):
    # Specifying the columns to merge positionally vs. by name must
    # produce identical results.
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    with_indices = parser.read_csv(
        StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal"
    )
    with_names = parser.read_csv(
        StringIO(data),
        index_col="nominal",
        parse_dates={"nominal": ["date", "nominalTime"]},
    )
    tm.assert_frame_equal(with_indices, with_names)
@xfail_pyarrow
def test_multiple_date_col_multiple_index_compat(all_parsers):
    # Using the merged date column as part of a MultiIndex must match
    # reading flat and then calling set_index.
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    result = parser.read_csv(
        StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]}
    )
    expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
    expected = expected.set_index(["nominal", "ID"])
    tm.assert_frame_equal(result, expected)
  1117. @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}])
  1118. def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
  1119. # see gh-5636
  1120. parser = all_parsers
  1121. msg = (
  1122. "Only booleans, lists, and dictionaries "
  1123. "are accepted for the 'parse_dates' parameter"
  1124. )
  1125. data = """A,B,C
  1126. 1,2,2003-11-1"""
  1127. with pytest.raises(TypeError, match=msg):
  1128. parser.read_csv(StringIO(data), parse_dates="C", **kwargs)
  1129. @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}])
  1130. def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
  1131. parser = all_parsers
  1132. msg = (
  1133. "Only booleans, lists, and dictionaries "
  1134. "are accepted for the 'parse_dates' parameter"
  1135. )
  1136. data = """A,B,C
  1137. 1,2,2003-11-1"""
  1138. with pytest.raises(TypeError, match=msg):
  1139. parser.read_csv(StringIO(data), parse_dates=(1,))
@pytest.mark.parametrize("cache_dates", [True, False])
@pytest.mark.parametrize("value", ["nan", ""])
def test_bad_date_parse(all_parsers, cache_dates, value):
    # if we have an invalid date make sure that we handle this with
    # and w/o the cache properly
    parser = all_parsers
    # 50000 identical rows of a single unparseable "date" value.
    s = StringIO((f"{value},\n") * 50000)

    if parser.engine == "pyarrow" and not cache_dates:
        # None in input gets converted to 'None', for which
        # pandas tries to guess the datetime format, triggering
        # the warning. TODO: parse dates directly in pyarrow, see
        # https://github.com/pandas-dev/pandas/issues/48017
        warn = UserWarning
    else:
        # Note: warning is not raised if 'cache_dates', because here there is only a
        # single unique date and hence no risk of inconsistent parsing.
        warn = None
    parser.read_csv_check_warnings(
        warn,
        "Could not infer format",
        s,
        header=None,
        names=["foo", "bar"],
        parse_dates=["foo"],
        cache_dates=cache_dates,
    )
@pytest.mark.parametrize("cache_dates", [True, False])
@pytest.mark.parametrize("value", ["0"])
def test_bad_date_parse_with_warning(all_parsers, cache_dates, value):
    # if we have an invalid date make sure that we handle this with
    # and w/o the cache properly.
    parser = all_parsers
    # 50000 identical rows of "0", which cannot be parsed as a date.
    s = StringIO((f"{value},\n") * 50000)

    if parser.engine == "pyarrow":
        # pyarrow reads "0" as 0 (of type int64), and so
        # pandas doesn't try to guess the datetime format
        # TODO: parse dates directly in pyarrow, see
        # https://github.com/pandas-dev/pandas/issues/48017
        warn = None
    elif cache_dates:
        # Note: warning is not raised if 'cache_dates', because here there is only a
        # single unique date and hence no risk of inconsistent parsing.
        warn = None
    else:
        warn = UserWarning
    parser.read_csv_check_warnings(
        warn,
        "Could not infer format",
        s,
        header=None,
        names=["foo", "bar"],
        parse_dates=["foo"],
        cache_dates=cache_dates,
    )
  1194. @xfail_pyarrow
  1195. def test_parse_dates_empty_string(all_parsers):
  1196. # see gh-2263
  1197. parser = all_parsers
  1198. data = "Date,test\n2012-01-01,1\n,2"
  1199. result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False)
  1200. expected = DataFrame(
  1201. [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"]
  1202. )
  1203. tm.assert_frame_equal(result, expected)
  1204. @pytest.mark.parametrize(
  1205. "reader", ["read_csv_check_warnings", "read_table_check_warnings"]
  1206. )
  1207. def test_parse_dates_infer_datetime_format_warning(all_parsers, reader):
  1208. # GH 49024, 51017
  1209. parser = all_parsers
  1210. data = "Date,test\n2012-01-01,1\n,2"
  1211. getattr(parser, reader)(
  1212. FutureWarning,
  1213. "The argument 'infer_datetime_format' is deprecated",
  1214. StringIO(data),
  1215. parse_dates=["Date"],
  1216. infer_datetime_format=True,
  1217. sep=",",
  1218. )
@pytest.mark.parametrize(
    "reader", ["read_csv_check_warnings", "read_table_check_warnings"]
)
def test_parse_dates_date_parser_and_date_format(all_parsers, reader):
    # GH 50601: date_parser and date_format are mutually exclusive.
    parser = all_parsers
    data = "Date,test\n2012-01-01,1\n,2"
    msg = "Cannot use both 'date_parser' and 'date_format'"
    with pytest.raises(TypeError, match=msg):
        getattr(parser, reader)(
            FutureWarning,
            "use 'date_format' instead",
            StringIO(data),
            parse_dates=["Date"],
            date_parser=pd.to_datetime,
            date_format="ISO8601",
            sep=",",
        )
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            "a\n04.15.2016",
            {"parse_dates": ["a"]},
            DataFrame([datetime(2016, 4, 15)], columns=["a"]),
        ),
        (
            "a\n04.15.2016",
            {"parse_dates": True, "index_col": 0},
            DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]),
        ),
        (
            "a,b\n04.15.2016,09.16.2013",
            {"parse_dates": ["a", "b"]},
            DataFrame(
                [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"]
            ),
        ),
        (
            "a,b\n04.15.2016,09.16.2013",
            {"parse_dates": True, "index_col": [0, 1]},
            DataFrame(
                index=MultiIndex.from_tuples(
                    [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]
                ),
                columns=[],
            ),
        ),
    ],
)
def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
    # see gh-14066: the thousands separator "." must not be stripped from
    # date columns ("04.15.2016" stays a date, not the number 4152016).
    parser = all_parsers

    result = parser.read_csv(StringIO(data), thousands=".", **kwargs)
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_parse_date_time_multi_level_column_name(all_parsers):
    # The two leading columns of a two-level header are merged into a
    # single parsed "date_time" column; the rest keep tuple labels.
    data = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
    parser = all_parsers
    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        header=[0, 1],
        parse_dates={"date_time": [0, 1]},
        date_parser=pd.to_datetime,
    )

    expected_data = [
        [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0],
        [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0],
    ]
    expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")])
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """\
date,time,a,b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
""",
            {"header": 0, "parse_dates": {"date_time": [0, 1]}},
            DataFrame(
                [
                    [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
                    [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0],
                ],
                columns=["date_time", "a", "b"],
            ),
        ),
        (
            (
                "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
                "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
            ),
            {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}},
            DataFrame(
                [
                    [
                        datetime(1999, 1, 27, 19, 0),
                        datetime(1999, 1, 27, 18, 56),
                        "KORD",
                        0.81,
                    ],
                    [
                        datetime(1999, 1, 27, 20, 0),
                        datetime(1999, 1, 27, 19, 56),
                        "KORD",
                        0.01,
                    ],
                    [
                        datetime(1999, 1, 27, 21, 0),
                        datetime(1999, 1, 27, 20, 56),
                        "KORD",
                        -0.59,
                    ],
                    [
                        datetime(1999, 1, 27, 21, 0),
                        datetime(1999, 1, 27, 21, 18),
                        "KORD",
                        -0.99,
                    ],
                    [
                        datetime(1999, 1, 27, 22, 0),
                        datetime(1999, 1, 27, 21, 56),
                        "KORD",
                        -0.59,
                    ],
                    [
                        datetime(1999, 1, 27, 23, 0),
                        datetime(1999, 1, 27, 22, 56),
                        "KORD",
                        -0.59,
                    ],
                ],
                columns=["actual", "nominal", 0, 4],
            ),
        ),
    ],
)
def test_parse_date_time(all_parsers, data, kwargs, expected):
    # Merging date+time column pairs through a custom date_parser; column 1
    # may participate in several merged columns ("actual" and "nominal").
    parser = all_parsers
    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        date_parser=pd.to_datetime,
        **kwargs,
    )

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
  1385. @xfail_pyarrow
  1386. # From date_parser fallback behavior
  1387. @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
  1388. def test_parse_date_fields(all_parsers):
  1389. parser = all_parsers
  1390. data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
  1391. result = parser.read_csv_check_warnings(
  1392. FutureWarning,
  1393. "use 'date_format' instead",
  1394. StringIO(data),
  1395. header=0,
  1396. parse_dates={"ymd": [0, 1, 2]},
  1397. date_parser=pd.to_datetime,
  1398. )
  1399. expected = DataFrame(
  1400. [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]],
  1401. columns=["ymd", "a"],
  1402. )
  1403. tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    ("key", "value", "warn"),
    [
        (
            "date_parser",
            lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"),
            FutureWarning,
        ),
        ("date_format", "%Y %m %d %H %M %S", None),
    ],
)
def test_parse_date_all_fields(all_parsers, key, value, warn):
    # All six date/time component columns merge into "ymdHMS"; the
    # deprecated date_parser and the new date_format must agree (only the
    # former warns).
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0,0.0,10.
2001,01,5,10,0,00,1.,11.
"""
    result = parser.read_csv_check_warnings(
        warn,
        "use 'date_format' instead",
        StringIO(data),
        header=0,
        parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
        **{key: value},
    )
    expected = DataFrame(
        [
            [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
            [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0],
        ],
        columns=["ymdHMS", "a", "b"],
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    ("key", "value", "warn"),
    [
        (
            "date_parser",
            lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"),
            FutureWarning,
        ),
        ("date_format", "%Y %m %d %H %M %S.%f", None),
    ],
)
def test_datetime_fractional_seconds(all_parsers, key, value, warn):
    # Fractional seconds in the "second" component survive the merge as
    # microseconds, via date_parser and date_format alike.
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0.123456,0.0,10.
2001,01,5,10,0,0.500000,1.,11.
"""
    result = parser.read_csv_check_warnings(
        warn,
        "use 'date_format' instead",
        StringIO(data),
        header=0,
        parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
        **{key: value},
    )
    expected = DataFrame(
        [
            [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0],
            [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0],
        ],
        columns=["ymdHMS", "a", "b"],
    )
    tm.assert_frame_equal(result, expected)
  1474. @xfail_pyarrow
  1475. def test_generic(all_parsers):
  1476. parser = all_parsers
  1477. data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
  1478. def parse_function(yy, mm):
  1479. return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)]
  1480. result = parser.read_csv_check_warnings(
  1481. FutureWarning,
  1482. "use 'date_format' instead",
  1483. StringIO(data),
  1484. header=0,
  1485. parse_dates={"ym": [0, 1]},
  1486. date_parser=parse_function,
  1487. )
  1488. expected = DataFrame(
  1489. [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]],
  1490. columns=["ym", "day", "a"],
  1491. )
  1492. expected["ym"] = expected["ym"].astype("datetime64[ns]")
  1493. tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_date_parser_resolution_if_not_ns(all_parsers):
    # see gh-10245
    parser = all_parsers
    data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(dt, time):
        # Returns second (non-nanosecond) resolution datetime64 values.
        try:
            arr = dt + "T" + time
        except TypeError:
            # dt & time are date/time objects
            arr = [datetime.combine(d, t) for d, t in zip(dt, time)]
        return np.array(arr, dtype="datetime64[s]")

    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        date_parser=date_parser,
        parse_dates={"datetime": ["date", "time"]},
        index_col=["datetime", "prn"],
    )

    datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]")
    expected = DataFrame(
        data={"rxstatus": ["00E80000"] * 3},
        index=MultiIndex.from_arrays(
            [datetimes, [126, 23, 13]],
            names=["datetime", "prn"],
        ),
    )
    tm.assert_frame_equal(result, expected)
  1528. def test_parse_date_column_with_empty_string(all_parsers):
  1529. # see gh-6428
  1530. parser = all_parsers
  1531. data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, "
  1532. result = parser.read_csv(StringIO(data), parse_dates=["opdate"])
  1533. expected_data = [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]]
  1534. expected = DataFrame(expected_data, columns=["case", "opdate"])
  1535. tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            "a\n135217135789158401\n1352171357E+5",
            DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"),
        ),
        (
            "a\n99999999999\n123456789012345\n1234E+0",
            DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"),
        ),
    ],
)
@pytest.mark.parametrize("parse_dates", [True, False])
def test_parse_date_float(all_parsers, data, expected, parse_dates):
    # see gh-2697
    #
    # Date parsing should fail, so we leave the data untouched
    # (i.e. float precision should remain unchanged).
    parser = all_parsers

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
  1558. def test_parse_timezone(all_parsers):
  1559. # see gh-22256
  1560. parser = all_parsers
  1561. data = """dt,val
  1562. 2018-01-04 09:01:00+09:00,23350
  1563. 2018-01-04 09:02:00+09:00,23400
  1564. 2018-01-04 09:03:00+09:00,23400
  1565. 2018-01-04 09:04:00+09:00,23400
  1566. 2018-01-04 09:05:00+09:00,23400"""
  1567. result = parser.read_csv(StringIO(data), parse_dates=["dt"])
  1568. dti = DatetimeIndex(
  1569. list(
  1570. date_range(
  1571. start="2018-01-04 09:01:00",
  1572. end="2018-01-04 09:05:00",
  1573. freq="1min",
  1574. tz=timezone(timedelta(minutes=540)),
  1575. )
  1576. ),
  1577. freq=None,
  1578. )
  1579. expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]}
  1580. expected = DataFrame(expected_data)
  1581. tm.assert_frame_equal(result, expected)
  1582. @skip_pyarrow
  1583. @pytest.mark.parametrize(
  1584. "date_string",
  1585. ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
  1586. )
  1587. def test_invalid_parse_delimited_date(all_parsers, date_string):
  1588. parser = all_parsers
  1589. expected = DataFrame({0: [date_string]}, dtype="object")
  1590. result = parser.read_csv(
  1591. StringIO(date_string),
  1592. header=None,
  1593. parse_dates=[0],
  1594. )
  1595. tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "date_string,dayfirst,expected",
    [
        # %d/%m/%Y; month > 12 thus replacement
        ("13/02/2019", True, datetime(2019, 2, 13)),
        # %m/%d/%Y; day > 12 thus there will be no replacement
        ("02/13/2019", False, datetime(2019, 2, 13)),
        # %d/%m/%Y; dayfirst==True thus replacement
        ("04/02/2019", True, datetime(2019, 2, 4)),
    ],
)
def test_parse_delimited_date_swap_no_warning(
    all_parsers, date_string, dayfirst, expected
):
    # dayfirst is consistent with the data: no swap warning is emitted.
    parser = all_parsers
    expected = DataFrame({0: [expected]}, dtype="datetime64[ns]")
    result = parser.read_csv(
        StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0]
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "date_string,dayfirst,expected",
    [
        # %d/%m/%Y; month > 12
        ("13/02/2019", False, datetime(2019, 2, 13)),
        # %m/%d/%Y; day > 12
        ("02/13/2019", True, datetime(2019, 2, 13)),
    ],
)
def test_parse_delimited_date_swap_with_warning(
    all_parsers, date_string, dayfirst, expected
):
    # dayfirst contradicts the data: the value is still parsed correctly
    # but a UserWarning about the format mismatch must be raised.
    parser = all_parsers
    expected = DataFrame({0: [expected]}, dtype="datetime64[ns]")
    warning_msg = (
        "Parsing dates in .* format when dayfirst=.* was specified. "
        "Pass `dayfirst=.*` or specify a format to silence this warning."
    )
    result = parser.read_csv_check_warnings(
        UserWarning,
        warning_msg,
        StringIO(date_string),
        header=None,
        dayfirst=dayfirst,
        parse_dates=[0],
    )
    tm.assert_frame_equal(result, expected)
  1645. def test_parse_multiple_delimited_dates_with_swap_warnings():
  1646. # GH46210
  1647. with pytest.raises(
  1648. ValueError,
  1649. match=(
  1650. r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", '
  1651. r"at position 1. You might want to try:"
  1652. ),
  1653. ):
  1654. pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])
  1655. def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
  1656. msg, result = None, None
  1657. try:
  1658. result = call(date_string, **kwargs)
  1659. except ValueError as er:
  1660. msg = str(er)
  1661. return msg, result
@skip_pyarrow
@given(DATETIME_NO_TZ)
@pytest.mark.parametrize("delimiter", list(" -./"))
@pytest.mark.parametrize("dayfirst", [True, False])
@pytest.mark.parametrize(
    "date_format",
    ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"],
)
def test_hypothesis_delimited_date(
    request, date_format, dayfirst, delimiter, test_datetime
):
    # Property test: pandas' datetime-string parsing must agree with
    # dateutil (both the parsed value and any error) across delimiters,
    # field orderings and dayfirst settings.
    if date_format == "%m %Y" and delimiter == ".":
        request.node.add_marker(
            pytest.mark.xfail(
                reason="parse_datetime_string cannot reliably tell whether "
                "e.g. %m.%Y is a float or a date"
            )
        )
    date_string = test_datetime.strftime(date_format.replace(" ", delimiter))

    except_out_dateutil, result = _helper_hypothesis_delimited_date(
        py_parse_datetime_string, date_string, dayfirst=dayfirst
    )
    except_in_dateutil, expected = _helper_hypothesis_delimited_date(
        du_parse,
        date_string,
        default=datetime(1, 1, 1),
        dayfirst=dayfirst,
        yearfirst=False,
    )

    assert except_out_dateutil == except_in_dateutil
    assert result == expected
@skip_pyarrow
@pytest.mark.parametrize(
    "names, usecols, parse_dates, missing_cols",
    [
        (None, ["val"], ["date", "time"], "date, time"),
        (None, ["val"], [0, "time"], "time"),
        (None, ["val"], [["date", "time"]], "date, time"),
        (None, ["val"], [[0, "time"]], "time"),
        (None, ["val"], {"date": [0, "time"]}, "time"),
        (None, ["val"], {"date": ["date", "time"]}, "date, time"),
        (None, ["val"], [["date", "time"], "date"], "date, time"),
        (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
        (
            ["date1", "time1", "temperature"],
            ["date1", "temperature"],
            ["date1", "time"],
            "time",
        ),
    ],
)
def test_missing_parse_dates_column_raises(
    all_parsers, names, usecols, parse_dates, missing_cols
):
    # gh-31251 column names provided in parse_dates could be missing.
    parser = all_parsers
    content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
    msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates
        )
@skip_pyarrow
def test_date_parser_and_names(all_parsers):
    # GH#33699: with names=["B"], the first CSV column becomes the index
    # and the header row is treated as data.
    parser = all_parsers
    data = StringIO("""x,y\n1,2""")
    result = parser.read_csv_check_warnings(
        UserWarning,
        "Could not infer format",
        data,
        parse_dates=["B"],
        names=["B"],
    )
    expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"])
    tm.assert_frame_equal(result, expected)
  1738. @skip_pyarrow
  1739. def test_date_parser_multiindex_columns(all_parsers):
  1740. parser = all_parsers
  1741. data = """a,b
  1742. 1,2
  1743. 2019-12-31,6"""
  1744. result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1])
  1745. expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]})
  1746. tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "parse_spec, col_name",
    [
        ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")),
        ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")),
    ],
)
def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name):
    # Two MultiIndex-labelled columns combine into one parsed column, named
    # either automatically (list spec) or explicitly (dict spec).
    parser = all_parsers
    data = """a,b,c
1,2,3
2019-12,-31,6"""
    result = parser.read_csv(
        StringIO(data),
        parse_dates=parse_spec,
        header=[0, 1],
    )
    expected = DataFrame({col_name: Timestamp("2019-12-31"), ("c", "3"): [6]})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_date_parser_usecols_thousands(all_parsers):
    # GH#39365: thousands="-" interacts with usecols and a parsed date
    # column; the date column is still parsed (not thousand-stripped).
    data = """A,B,C
1,3,20-09-01-01
2,4,20-09-01-01
"""
    parser = all_parsers
    result = parser.read_csv_check_warnings(
        UserWarning,
        "Could not infer format",
        StringIO(data),
        parse_dates=[1],
        usecols=[1, 2],
        thousands="-",
    )
    expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_parse_dates_and_keep_orgin_column(all_parsers):
    # GH#13378: keep_date_col=True retains the source column "A" alongside
    # the parsed "date" column.
    # NOTE(review): "orgin" is a typo for "origin" in the test name, kept
    # to preserve the test's identity.
    parser = all_parsers
    data = """A
20150908
20150909
"""
    result = parser.read_csv(
        StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True
    )
    expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")]
    expected = DataFrame({"date": expected_data, "A": expected_data})
    tm.assert_frame_equal(result, expected)
  1799. def test_dayfirst_warnings():
  1800. # GH 12585
  1801. # CASE 1: valid input
  1802. input = "date\n31/12/2014\n10/03/2011"
  1803. expected = DatetimeIndex(
  1804. ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date"
  1805. )
  1806. warning_msg = (
  1807. "Parsing dates in .* format when dayfirst=.* was specified. "
  1808. "Pass `dayfirst=.*` or specify a format to silence this warning."
  1809. )
  1810. # A. dayfirst arg correct, no warning
  1811. res1 = read_csv(
  1812. StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date"
  1813. ).index
  1814. tm.assert_index_equal(expected, res1)
  1815. # B. dayfirst arg incorrect, warning
  1816. with tm.assert_produces_warning(UserWarning, match=warning_msg):
  1817. res2 = read_csv(
  1818. StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
  1819. ).index
  1820. tm.assert_index_equal(expected, res2)
  1821. # CASE 2: invalid input
  1822. # cannot consistently process with single format
  1823. # return to user unaltered
  1824. # first in DD/MM/YYYY, second in MM/DD/YYYY
  1825. input = "date\n31/12/2014\n03/30/2011"
  1826. expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
  1827. # A. use dayfirst=True
  1828. res5 = read_csv(
  1829. StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date"
  1830. ).index
  1831. tm.assert_index_equal(expected, res5)
  1832. # B. use dayfirst=False
  1833. with tm.assert_produces_warning(UserWarning, match=warning_msg):
  1834. res6 = read_csv(
  1835. StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
  1836. ).index
  1837. tm.assert_index_equal(expected, res6)
  1838. @pytest.mark.parametrize(
  1839. "date_string, dayfirst",
  1840. [
  1841. pytest.param(
  1842. "31/1/2014",
  1843. False,
  1844. id="second date is single-digit",
  1845. ),
  1846. pytest.param(
  1847. "1/31/2014",
  1848. True,
  1849. id="first date is single-digit",
  1850. ),
  1851. ],
  1852. )
  1853. def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst):
  1854. # GH47880
  1855. initial_value = f"date\n{date_string}"
  1856. expected = DatetimeIndex(
  1857. ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date"
  1858. )
  1859. warning_msg = (
  1860. "Parsing dates in .* format when dayfirst=.* was specified. "
  1861. "Pass `dayfirst=.*` or specify a format to silence this warning."
  1862. )
  1863. with tm.assert_produces_warning(UserWarning, match=warning_msg):
  1864. res = read_csv(
  1865. StringIO(initial_value),
  1866. parse_dates=["date"],
  1867. index_col="date",
  1868. dayfirst=dayfirst,
  1869. ).index
  1870. tm.assert_index_equal(expected, res)
  1871. @skip_pyarrow
  1872. def test_infer_first_column_as_index(all_parsers):
  1873. # GH#11019
  1874. parser = all_parsers
  1875. data = "a,b,c\n1970-01-01,2,3,4"
  1876. result = parser.read_csv(
  1877. StringIO(data),
  1878. parse_dates=["a"],
  1879. )
  1880. expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"])
  1881. tm.assert_frame_equal(result, expected)
  1882. @skip_pyarrow
  1883. @pytest.mark.parametrize(
  1884. ("key", "value", "warn"),
  1885. [
  1886. ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning),
  1887. ("date_format", "%Y-%m-%d", None),
  1888. ],
  1889. )
  1890. def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn):
  1891. # GH#26203
  1892. parser = all_parsers
  1893. data = """Test
  1894. 2012-10-01
  1895. 0
  1896. 2015-05-15
  1897. #
  1898. 2017-09-09
  1899. """
  1900. result = parser.read_csv_check_warnings(
  1901. warn,
  1902. "use 'date_format' instead",
  1903. StringIO(data),
  1904. na_values={"Test": ["#", "0"]},
  1905. parse_dates=["Test"],
  1906. **{key: value},
  1907. )
  1908. expected = DataFrame(
  1909. {
  1910. "Test": [
  1911. Timestamp("2012-10-01"),
  1912. pd.NaT,
  1913. Timestamp("2015-05-15"),
  1914. pd.NaT,
  1915. Timestamp("2017-09-09"),
  1916. ]
  1917. }
  1918. )
  1919. tm.assert_frame_equal(result, expected)
  1920. @skip_pyarrow
  1921. def test_parse_dates_and_string_dtype(all_parsers):
  1922. # GH#34066
  1923. parser = all_parsers
  1924. data = """a,b
  1925. 1,2019-12-31
  1926. """
  1927. result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"])
  1928. expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]})
  1929. expected["a"] = expected["a"].astype("string")
  1930. tm.assert_frame_equal(result, expected)
  1931. def test_parse_dot_separated_dates(all_parsers):
  1932. # https://github.com/pandas-dev/pandas/issues/2586
  1933. parser = all_parsers
  1934. data = """a,b
  1935. 27.03.2003 14:55:00.000,1
  1936. 03.08.2003 15:20:00.000,2"""
  1937. if parser.engine == "pyarrow":
  1938. expected_index = Index(
  1939. ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
  1940. dtype="object",
  1941. name="a",
  1942. )
  1943. warn = None
  1944. else:
  1945. expected_index = DatetimeIndex(
  1946. ["2003-03-27 14:55:00", "2003-08-03 15:20:00"],
  1947. dtype="datetime64[ns]",
  1948. name="a",
  1949. )
  1950. warn = UserWarning
  1951. msg = r"when dayfirst=False \(the default\) was specified"
  1952. result = parser.read_csv_check_warnings(
  1953. warn, msg, StringIO(data), parse_dates=True, index_col=0
  1954. )
  1955. expected = DataFrame({"b": [1, 2]}, index=expected_index)
  1956. tm.assert_frame_equal(result, expected)
  1957. def test_parse_dates_dict_format(all_parsers):
  1958. # GH#51240
  1959. parser = all_parsers
  1960. data = """a,b
  1961. 2019-12-31,31-12-2019
  1962. 2020-12-31,31-12-2020"""
  1963. result = parser.read_csv(
  1964. StringIO(data),
  1965. date_format={"a": "%Y-%m-%d", "b": "%d-%m-%Y"},
  1966. parse_dates=["a", "b"],
  1967. )
  1968. expected = DataFrame(
  1969. {
  1970. "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
  1971. "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
  1972. }
  1973. )
  1974. tm.assert_frame_equal(result, expected)
  1975. @skip_pyarrow
  1976. @pytest.mark.parametrize(
  1977. "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})]
  1978. )
  1979. def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates):
  1980. # GH#51240
  1981. parser = all_parsers
  1982. data = """a,b
  1983. 31-,12-2019
  1984. 31-,12-2020"""
  1985. with tm.assert_produces_warning(None):
  1986. result = parser.read_csv(
  1987. StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates
  1988. )
  1989. expected = DataFrame(
  1990. {
  1991. key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
  1992. }
  1993. )
  1994. tm.assert_frame_equal(result, expected)
  1995. @skip_pyarrow
  1996. def test_parse_dates_dict_format_index(all_parsers):
  1997. # GH#51240
  1998. parser = all_parsers
  1999. data = """a,b
  2000. 2019-12-31,31-12-2019
  2001. 2020-12-31,31-12-2020"""
  2002. result = parser.read_csv(
  2003. StringIO(data), date_format={"a": "%Y-%m-%d"}, parse_dates=True, index_col=0
  2004. )
  2005. expected = DataFrame(
  2006. {
  2007. "b": ["31-12-2019", "31-12-2020"],
  2008. },
  2009. index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"),
  2010. )
  2011. tm.assert_frame_equal(result, expected)
  2012. def test_parse_dates_arrow_engine(all_parsers):
  2013. # GH#53295
  2014. parser = all_parsers
  2015. data = """a,b
  2016. 2000-01-01 00:00:00,1
  2017. 2000-01-01 00:00:01,1"""
  2018. result = parser.read_csv(StringIO(data), parse_dates=["a"])
  2019. expected = DataFrame(
  2020. {
  2021. "a": [
  2022. Timestamp("2000-01-01 00:00:00"),
  2023. Timestamp("2000-01-01 00:00:01"),
  2024. ],
  2025. "b": 1,
  2026. }
  2027. )
  2028. tm.assert_frame_equal(result, expected)