12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240 |
- """
- Tests date parsing functionality for all of the
- parsers defined in parsers.py
- """
- from datetime import (
- date,
- datetime,
- timedelta,
- timezone,
- )
- from io import StringIO
- from dateutil.parser import parse as du_parse
- from hypothesis import given
- import numpy as np
- import pytest
- import pytz
- from pandas._libs.tslibs import parsing
- from pandas._libs.tslibs.parsing import py_parse_datetime_string
- import pandas as pd
- from pandas import (
- DataFrame,
- DatetimeIndex,
- Index,
- MultiIndex,
- Series,
- Timestamp,
- )
- import pandas._testing as tm
- from pandas._testing._hypothesis import DATETIME_NO_TZ
- from pandas.core.indexes.datetimes import date_range
- from pandas.io.parsers import read_csv
- xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
- # GH#43650: Some expected failures with the pyarrow engine can occasionally
- # cause a deadlock instead, so we skip these instead of xfailing
- skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@xfail_pyarrow
def test_read_csv_with_custom_date_parser(all_parsers):
    # GH36111
    def __custom_date_parser(time):
        # NOTE: np.float_ / np.int_ were removed in NumPy 2.0; use the
        # explicit fixed-width aliases instead (same values on 64-bit).
        time = time.astype(np.float64)
        time = time.astype(np.int64)  # convert float seconds to int type
        return pd.to_timedelta(time, unit="s")

    testdata = StringIO(
        """time e n h
41047.00 -98573.7297 871458.0640 389.0089
41048.00 -98573.7299 871458.0640 389.0089
41049.00 -98573.7300 871458.0642 389.0088
41050.00 -98573.7299 871458.0643 389.0088
41051.00 -98573.7302 871458.0640 389.0086
"""
    )
    result = all_parsers.read_csv_check_warnings(
        FutureWarning,
        "Please use 'date_format' instead",
        testdata,
        delim_whitespace=True,
        parse_dates=True,
        date_parser=__custom_date_parser,
        index_col="time",
    )
    time = [41047, 41048, 41049, 41050, 41051]
    time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time")
    expected = DataFrame(
        {
            "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302],
            "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640],
            "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086],
        },
        index=time,
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers):
    # GH44366
    def __custom_date_parser(time):
        # NOTE: np.float_ / np.int_ were removed in NumPy 2.0; use the
        # explicit fixed-width aliases instead (same values on 64-bit).
        time = time.astype(np.float64)
        time = time.astype(np.int64)  # convert float seconds to int type
        return pd.to_timedelta(time, unit="s")

    testdata = StringIO(
        """time e
41047.00 -93.77
41048.00 -95.79
41049.00 -98.73
41050.00 -93.99
41051.00 -97.72
"""
    )
    result = all_parsers.read_csv_check_warnings(
        FutureWarning,
        "Please use 'date_format' instead",
        testdata,
        delim_whitespace=True,
        parse_dates=False,
        date_parser=__custom_date_parser,
        index_col="time",
    )
    # parse_dates=False wins: the index stays float, date_parser unused.
    time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], name="time")
    expected = DataFrame(
        {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]},
        index=time,
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_separator_date_conflict(all_parsers):
    # Regression test for gh-4678: the thousands separator ("-") must
    # not interfere with combining the first two fields into one date.
    raw = "06-02-2013;13:00;1-000.215"
    result = all_parsers.read_csv(
        StringIO(raw),
        sep=";",
        thousands="-",
        parse_dates={"Date": [0, 1]},
        header=None,
    )
    expected = DataFrame(
        [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2]
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col_custom(all_parsers, keep_date_col):
    # Combine raw columns into new date columns ("actual" = cols 1+2,
    # "nominal" = cols 1+3) through a user-supplied date_parser;
    # keep_date_col controls whether the raw source columns survive.
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers

    def date_parser(*date_cols):
        """
        Test date parser.

        Parameters
        ----------
        date_cols : args
            The list of data columns to parse.

        Returns
        -------
        parsed : Series
        """
        return parsing.try_parse_dates(
            parsing.concat_date_cols(date_cols), parser=du_parse
        )

    kwds = {
        "header": None,
        "date_parser": date_parser,
        "parse_dates": {"actual": [1, 2], "nominal": [1, 3]},
        "keep_date_col": keep_date_col,
        "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
    }
    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        **kwds,
    )
    expected = DataFrame(
        [
            [
                datetime(1999, 1, 27, 19, 0),
                datetime(1999, 1, 27, 18, 56),
                "KORD",
                "19990127",
                " 19:00:00",
                " 18:56:00",
                0.81,
                2.81,
                7.2,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 20, 0),
                datetime(1999, 1, 27, 19, 56),
                "KORD",
                "19990127",
                " 20:00:00",
                " 19:56:00",
                0.01,
                2.21,
                7.2,
                0.0,
                260.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                datetime(1999, 1, 27, 20, 56),
                "KORD",
                "19990127",
                " 21:00:00",
                " 20:56:00",
                -0.59,
                2.21,
                5.7,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                datetime(1999, 1, 27, 21, 18),
                "KORD",
                "19990127",
                " 21:00:00",
                " 21:18:00",
                -0.99,
                2.01,
                3.6,
                0.0,
                270.0,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                datetime(1999, 1, 27, 21, 56),
                "KORD",
                "19990127",
                " 22:00:00",
                " 21:56:00",
                -0.59,
                1.71,
                5.1,
                0.0,
                290.0,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                datetime(1999, 1, 27, 22, 56),
                "KORD",
                "19990127",
                " 23:00:00",
                " 22:56:00",
                -0.59,
                1.71,
                4.6,
                0.0,
                280.0,
            ],
        ],
        columns=[
            "actual",
            "nominal",
            "X0",
            "X1",
            "X2",
            "X3",
            "X4",
            "X5",
            "X6",
            "X7",
            "X8",
        ],
    )
    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("container", [list, tuple, Index, Series])
@pytest.mark.parametrize("dim", [1, 2])
def test_concat_date_col_fail(container, dim):
    # concat_date_cols only accepts numpy arrays; any other sequence
    # type (list/tuple/Index/Series, in any arity) must be rejected.
    expected_msg = "not all elements from date_cols are numpy arrays"
    cols = tuple(container(["19990127"]) for _ in range(dim))
    with pytest.raises(ValueError, match=expected_msg):
        parsing.concat_date_cols(cols)
@xfail_pyarrow
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col(all_parsers, keep_date_col):
    # parse_dates as a list of index lists: each inner list is merged
    # into one datetime column named by joining the source names
    # ("X1_X2", "X1_X3"); keep_date_col keeps/drops the raw columns.
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers
    kwds = {
        "header": None,
        "parse_dates": [[1, 2], [1, 3]],
        "keep_date_col": keep_date_col,
        "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
    }
    result = parser.read_csv(StringIO(data), **kwds)
    expected = DataFrame(
        [
            [
                datetime(1999, 1, 27, 19, 0),
                datetime(1999, 1, 27, 18, 56),
                "KORD",
                "19990127",
                " 19:00:00",
                " 18:56:00",
                0.81,
                2.81,
                7.2,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 20, 0),
                datetime(1999, 1, 27, 19, 56),
                "KORD",
                "19990127",
                " 20:00:00",
                " 19:56:00",
                0.01,
                2.21,
                7.2,
                0.0,
                260.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                datetime(1999, 1, 27, 20, 56),
                "KORD",
                "19990127",
                " 21:00:00",
                " 20:56:00",
                -0.59,
                2.21,
                5.7,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                datetime(1999, 1, 27, 21, 18),
                "KORD",
                "19990127",
                " 21:00:00",
                " 21:18:00",
                -0.99,
                2.01,
                3.6,
                0.0,
                270.0,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                datetime(1999, 1, 27, 21, 56),
                "KORD",
                "19990127",
                " 22:00:00",
                " 21:56:00",
                -0.59,
                1.71,
                5.1,
                0.0,
                290.0,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                datetime(1999, 1, 27, 22, 56),
                "KORD",
                "19990127",
                " 23:00:00",
                " 22:56:00",
                -0.59,
                1.71,
                4.6,
                0.0,
                280.0,
            ],
        ],
        columns=[
            "X1_X2",
            "X1_X3",
            "X0",
            "X1",
            "X2",
            "X3",
            "X4",
            "X5",
            "X6",
            "X7",
            "X8",
        ],
    )
    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)

    tm.assert_frame_equal(result, expected)
def test_date_col_as_index_col(all_parsers):
    # A parsed date column that also serves as index_col: the column is
    # consumed by the index and dropped from the frame's columns.
    data = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
"""
    parser = all_parsers
    kwds = {
        "header": None,
        "parse_dates": [1],
        "index_col": 1,
        "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"],
    }
    result = parser.read_csv(StringIO(data), **kwds)
    index = Index(
        [
            datetime(1999, 1, 27, 19, 0),
            datetime(1999, 1, 27, 20, 0),
            datetime(1999, 1, 27, 21, 0),
            datetime(1999, 1, 27, 21, 0),
            datetime(1999, 1, 27, 22, 0),
        ],
        name="X1",
    )
    expected = DataFrame(
        [
            ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
            ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
            ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
            ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
            ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
        ],
        columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"],
        index=index,
    )
    if parser.engine == "pyarrow":
        # https://github.com/pandas-dev/pandas/issues/44231
        # pyarrow 6.0 starts to infer time type
        expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time

    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_multiple_date_cols_int_cast(all_parsers):
    # Integer-looking date parts must be cast to strings before being
    # handed to date_parser (here pd.to_datetime) when combining cols.
    data = (
        "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
        "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
        "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
        "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
        "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
        "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
    )
    parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
    parser = all_parsers

    kwds = {
        "header": None,
        "parse_dates": parse_dates,
        "date_parser": pd.to_datetime,
    }
    result = parser.read_csv_check_warnings(
        FutureWarning, "use 'date_format' instead", StringIO(data), **kwds
    )
    expected = DataFrame(
        [
            [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81],
            [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01],
            [
                datetime(1999, 1, 27, 21, 0),
                datetime(1999, 1, 27, 20, 56),
                "KORD",
                -0.59,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                datetime(1999, 1, 27, 21, 18),
                "KORD",
                -0.99,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                datetime(1999, 1, 27, 21, 56),
                "KORD",
                -0.59,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                datetime(1999, 1, 27, 22, 56),
                "KORD",
                -0.59,
            ],
        ],
        columns=["actual", "nominal", 0, 4],
    )

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_multiple_date_col_timestamp_parse(all_parsers):
    # Passing Timestamp itself as date_parser combines the two leading
    # columns into a single parsed column named "0_1".
    parser = all_parsers
    data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""

    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        parse_dates=[[0, 1]],
        header=None,
        date_parser=Timestamp,
    )
    # Both rows share the same combined timestamp; only column 3 differs.
    stamp = Timestamp("05/31/2012, 15:30:00.029")
    expected = DataFrame(
        [
            [stamp, 1306.25, 1, "E", 0, np.nan, 1306.25],
            [stamp, 1306.25, 8, "E", 0, np.nan, 1306.25],
        ],
        columns=["0_1", 2, 3, 4, 5, 6, 7],
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_multiple_date_cols_with_header(all_parsers):
    # With a header row, a dict parse_dates spec names the combined
    # datetime column ("nominal") and drops the source columns.
    parser = all_parsers
    data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

    result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
    expected = DataFrame(
        [
            [
                datetime(1999, 1, 27, 19, 0),
                "KORD",
                " 18:56:00",
                0.81,
                2.81,
                7.2,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 20, 0),
                "KORD",
                " 19:56:00",
                0.01,
                2.21,
                7.2,
                0.0,
                260.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD",
                " 20:56:00",
                -0.59,
                2.21,
                5.7,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD",
                " 21:18:00",
                -0.99,
                2.01,
                3.6,
                0.0,
                270.0,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                "KORD",
                " 21:56:00",
                -0.59,
                1.71,
                5.1,
                0.0,
                290.0,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                "KORD",
                " 22:56:00",
                -0.59,
                1.71,
                4.6,
                0.0,
                280.0,
            ],
        ],
        columns=[
            "nominal",
            "ID",
            "ActualTime",
            "TDew",
            "TAir",
            "Windspeed",
            "Precip",
            "WindDir",
        ],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data,parse_dates,msg",
    [
        (
            """\
date_NominalTime,date,NominalTime
KORD1,19990127, 19:00:00
KORD2,19990127, 20:00:00""",
            [[1, 2]],
            "New date column already in dict date_NominalTime",
        ),
        (
            """\
ID,date,nominalTime
KORD,19990127, 19:00:00
KORD,19990127, 20:00:00""",
            {"ID": [1, 2]},
            "Date column ID already in dict",
        ),
    ],
)
def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
    # A combined date column whose generated (or given) name clashes
    # with an existing column must raise a clear ValueError.
    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(StringIO(data), parse_dates=parse_dates)
def test_date_parser_int_bug(all_parsers):
    # see gh-3071
    parser = all_parsers
    data = (
        "posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
        "accountid,userid,contactid,level,silo,method\n"
        "1343103150,0.062353,0,4,6,0.01690,3,"
        "12345,1,-1,3,invoice_InvoiceResource,search\n"
    )

    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        index_col=0,
        parse_dates=[0],
        # datetime.utcfromtimestamp is deprecated since Python 3.12;
        # build the same naive-UTC datetime from an aware one instead.
        date_parser=lambda x: datetime.fromtimestamp(
            int(x), tz=timezone.utc
        ).replace(tzinfo=None),
    )
    expected = DataFrame(
        [
            [
                0.062353,
                0,
                4,
                6,
                0.01690,
                3,
                12345,
                1,
                -1,
                3,
                "invoice_InvoiceResource",
                "search",
            ]
        ],
        columns=[
            "elapsed",
            "sys",
            "user",
            "queries",
            "query_time",
            "rows",
            "accountid",
            "userid",
            "contactid",
            "level",
            "silo",
            "method",
        ],
        index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"),
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_nat_parse(all_parsers):
    # see gh-3062
    parser = all_parsers
    # NaN rows must round-trip through CSV as NaN (float col) / NaT
    # (datetime col).  Dropped the redundant dict(...) wrapper around
    # the dict literal.
    df = DataFrame(
        {"A": np.arange(10, dtype="float64"), "B": Timestamp("20010101")}
    )
    df.iloc[3:6, :] = np.nan

    with tm.ensure_clean("__nat_parse_.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, index_col=0, parse_dates=["B"])
        tm.assert_frame_equal(result, df)
@xfail_pyarrow
def test_csv_custom_parser(all_parsers):
    # A strptime-based date_parser and the equivalent date_format string
    # must both match what the parser infers on its own.
    parser = all_parsers
    csv_text = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    expected = parser.read_csv(StringIO(csv_text), parse_dates=True)

    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(csv_text),
        date_parser=lambda x: datetime.strptime(x, "%Y%m%d"),
    )
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(csv_text), date_format="%Y%m%d")
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_parse_dates_implicit_first_col(all_parsers):
    # With an unnamed leading column, parse_dates=True implicitly uses
    # that column as the (parsed) index — same as index_col=0.
    parser = all_parsers
    csv_text = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    expected = parser.read_csv(StringIO(csv_text), index_col=0, parse_dates=True)
    result = parser.read_csv(StringIO(csv_text), parse_dates=True)
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_parse_dates_string(all_parsers):
    # Parsing a named column as dates and using it as the index; the
    # round-trip loses the frequency, so the expected index has none.
    parser = all_parsers
    csv_text = """date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    result = parser.read_csv(StringIO(csv_text), index_col="date", parse_dates=["date"])
    index = DatetimeIndex(
        ["2009-01-01", "2009-01-02", "2009-01-03"], name="date", freq=None
    )
    expected = DataFrame(
        {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index
    )
    tm.assert_frame_equal(result, expected)
# Bug in https://github.com/dateutil/dateutil/issues/217
# has been addressed, but we just don't pass in the `yearfirst`
@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]])
def test_yy_format_with_year_first(all_parsers, parse_dates):
    # Two-digit years ("09...") would need yearfirst=True to land in
    # 2009; read_csv does not expose that knob, hence the xfail.
    data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""
    parser = all_parsers
    result = parser.read_csv_check_warnings(
        UserWarning,
        "Could not infer format",
        StringIO(data),
        index_col=0,
        parse_dates=parse_dates,
    )
    index = DatetimeIndex(
        [
            datetime(2009, 1, 31, 0, 10, 0),
            datetime(2009, 2, 28, 10, 20, 0),
            datetime(2009, 3, 31, 8, 30, 0),
        ],
        dtype=object,
        name="date_time",
    )
    expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
def test_parse_dates_column_list(all_parsers, parse_dates):
    # Columns selected by position or by label parse as dates (with
    # dayfirst) even when they also take part in a MultiIndex.
    parser = all_parsers
    result = parser.read_csv(
        StringIO("a,b,c\n01/01/2010,1,15/02/2010"),
        index_col=[0, 1],
        parse_dates=parse_dates,
        dayfirst=True,
    )
    expected = DataFrame(
        {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]}
    ).set_index(["a", "b"])
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_parse_dates(all_parsers, index_col):
    # parse_dates=True with a two-level index: the date level is parsed
    # regardless of whether it is the outer or inner index level.
    data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    parser = all_parsers
    index = MultiIndex.from_product(
        [
            (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)),
            ("one", "two", "three"),
        ],
        names=["index1", "index2"],
    )

    # Out of order.
    if index_col == [1, 0]:
        index = index.swaplevel(0, 1)

    expected = DataFrame(
        [
            ["a", 1, 2],
            ["b", 3, 4],
            ["c", 4, 5],
            ["a", 1, 2],
            ["b", 3, 4],
            ["c", 4, 5],
            ["a", 1, 2],
            ["b", 3, 4],
            ["c", 4, 5],
        ],
        columns=["A", "B", "C"],
        index=index,
    )
    result = parser.read_csv_check_warnings(
        UserWarning,
        "Could not infer format",
        StringIO(data),
        index_col=index_col,
        parse_dates=True,
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}])
def test_parse_dates_custom_euro_format(all_parsers, kwargs):
    # A dateutil-based date_parser: the valid "dayfirst" kwarg parses
    # European dates; the misspelled "day_first" must raise TypeError.
    parser = all_parsers
    data = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    if "dayfirst" in kwargs:
        df = parser.read_csv_check_warnings(
            FutureWarning,
            "use 'date_format' instead",
            StringIO(data),
            names=["time", "Q", "NTU"],
            date_parser=lambda d: du_parse(d, **kwargs),
            header=0,
            index_col=0,
            parse_dates=True,
            na_values=["NA"],
        )
        exp_index = Index(
            [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)],
            name="time",
        )
        expected = DataFrame(
            {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]},
            index=exp_index,
            columns=["Q", "NTU"],
        )
        tm.assert_frame_equal(df, expected)
    else:
        msg = "got an unexpected keyword argument 'day_first'"
        with pytest.raises(TypeError, match=msg):
            parser.read_csv_check_warnings(
                FutureWarning,
                "use 'date_format' instead",
                StringIO(data),
                names=["time", "Q", "NTU"],
                date_parser=lambda d: du_parse(d, **kwargs),
                skiprows=[0],
                index_col=0,
                parse_dates=True,
                na_values=["NA"],
            )
def test_parse_tz_aware(all_parsers, request):
    # See gh-1693
    parser = all_parsers
    result = parser.read_csv(
        StringIO("Date,x\n2012-06-13T01:39:00Z,0.5"), index_col=0, parse_dates=True
    )

    expected = DataFrame(
        {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date")
    )
    tm.assert_frame_equal(result, expected)

    # pyarrow keeps a pytz UTC; the other engines use stdlib timezone.utc
    expected_tz = pytz.utc if parser.engine == "pyarrow" else timezone.utc
    assert result.index.tz is expected_tz
@xfail_pyarrow
@pytest.mark.parametrize(
    "parse_dates,index_col",
    [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)],
)
def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
    # A combined date column used as the index, addressed by name or by
    # position; the list-spec variant auto-names it "date_NominalTime".
    parser = all_parsers
    data = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame(
        [
            [
                datetime(1999, 1, 27, 19, 0),
                "KORD1",
                " 18:56:00",
                0.81,
                2.81,
                7.2,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 20, 0),
                "KORD2",
                " 19:56:00",
                0.01,
                2.21,
                7.2,
                0.0,
                260.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD3",
                " 20:56:00",
                -0.59,
                2.21,
                5.7,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD4",
                " 21:18:00",
                -0.99,
                2.01,
                3.6,
                0.0,
                270.0,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                "KORD5",
                " 21:56:00",
                -0.59,
                1.71,
                5.1,
                0.0,
                290.0,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                "KORD6",
                " 22:56:00",
                -0.59,
                1.71,
                4.6,
                0.0,
                280.0,
            ],
        ],
        columns=[
            "nominal",
            "ID",
            "ActualTime",
            "TDew",
            "TAir",
            "Windspeed",
            "Precip",
            "WindDir",
        ],
    )
    expected = expected.set_index("nominal")

    if not isinstance(parse_dates, dict):
        expected.index.name = "date_NominalTime"

    result = parser.read_csv(
        StringIO(data), parse_dates=parse_dates, index_col=index_col
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_multiple_date_cols_chunked(all_parsers):
    # Combined date columns must behave the same when the file is read
    # in chunks (chunksize=2) as when read in one shot.
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame(
        [
            [
                datetime(1999, 1, 27, 19, 0),
                "KORD",
                " 18:56:00",
                0.81,
                2.81,
                7.2,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 20, 0),
                "KORD",
                " 19:56:00",
                0.01,
                2.21,
                7.2,
                0.0,
                260.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD",
                " 20:56:00",
                -0.59,
                2.21,
                5.7,
                0.0,
                280.0,
            ],
            [
                datetime(1999, 1, 27, 21, 0),
                "KORD",
                " 21:18:00",
                -0.99,
                2.01,
                3.6,
                0.0,
                270.0,
            ],
            [
                datetime(1999, 1, 27, 22, 0),
                "KORD",
                " 21:56:00",
                -0.59,
                1.71,
                5.1,
                0.0,
                290.0,
            ],
            [
                datetime(1999, 1, 27, 23, 0),
                "KORD",
                " 22:56:00",
                -0.59,
                1.71,
                4.6,
                0.0,
                280.0,
            ],
        ],
        columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"],
    )
    expected = expected.set_index("nominal")

    with parser.read_csv(
        StringIO(data),
        parse_dates={"nominal": [1, 2]},
        index_col="nominal",
        chunksize=2,
    ) as reader:
        chunks = list(reader)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])
@xfail_pyarrow
def test_multiple_date_col_named_index_compat(all_parsers):
    # Combining the date columns by position ([1, 2]) and by column name
    # (["date", "nominalTime"]) must yield identical frames.
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    by_position = parser.read_csv(
        StringIO(data),
        parse_dates={"nominal": [1, 2]},
        index_col="nominal",
    )
    by_name = parser.read_csv(
        StringIO(data),
        parse_dates={"nominal": ["date", "nominalTime"]},
        index_col="nominal",
    )
    tm.assert_frame_equal(by_position, by_name)
@xfail_pyarrow
def test_multiple_date_col_multiple_index_compat(all_parsers):
    # index_col=["nominal", "ID"] combined with a merged date column should
    # match reading without an index and calling set_index afterwards.
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    result = parser.read_csv(
        StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]}
    )
    expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
    expected = expected.set_index(["nominal", "ID"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}])
def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
    # see gh-5636
    # A scalar (non-boolean) parse_dates must raise TypeError, with or
    # without index_col.
    parser = all_parsers
    msg = (
        "Only booleans, lists, and dictionaries "
        "are accepted for the 'parse_dates' parameter"
    )
    data = """A,B,C
1,2,2003-11-1"""

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), parse_dates="C", **kwargs)
@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}])
def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
    # Non-list/dict containers (tuple, ndarray, set) for parse_dates must be
    # rejected with the same TypeError as other invalid values.
    parser = all_parsers
    msg = (
        "Only booleans, lists, and dictionaries "
        "are accepted for the 'parse_dates' parameter"
    )
    data = """A,B,C
1,2,2003-11-1"""

    with pytest.raises(TypeError, match=msg):
        # BUG FIX: previously hard-coded parse_dates=(1,), which silently
        # ignored the parametrized value, so the ndarray and set cases were
        # never actually exercised.
        parser.read_csv(StringIO(data), parse_dates=parse_dates)
@pytest.mark.parametrize("cache_dates", [True, False])
@pytest.mark.parametrize("value", ["nan", ""])
def test_bad_date_parse(all_parsers, cache_dates, value):
    # if we have an invalid date make sure that we handle this with
    # and w/o the cache properly
    parser = all_parsers
    # 50000 rows of the same unparseable value exercises the date cache path.
    s = StringIO((f"{value},\n") * 50000)

    if parser.engine == "pyarrow" and not cache_dates:
        # None in input gets converted to 'None', for which
        # pandas tries to guess the datetime format, triggering
        # the warning. TODO: parse dates directly in pyarrow, see
        # https://github.com/pandas-dev/pandas/issues/48017
        warn = UserWarning
    else:
        # Note: warning is not raised if 'cache_dates', because here there is only a
        # single unique date and hence no risk of inconsistent parsing.
        warn = None
    parser.read_csv_check_warnings(
        warn,
        "Could not infer format",
        s,
        header=None,
        names=["foo", "bar"],
        parse_dates=["foo"],
        cache_dates=cache_dates,
    )
@pytest.mark.parametrize("cache_dates", [True, False])
@pytest.mark.parametrize("value", ["0"])
def test_bad_date_parse_with_warning(all_parsers, cache_dates, value):
    # if we have an invalid date make sure that we handle this with
    # and w/o the cache properly.
    parser = all_parsers
    s = StringIO((f"{value},\n") * 50000)

    if parser.engine == "pyarrow":
        # pyarrow reads "0" as 0 (of type int64), and so
        # pandas doesn't try to guess the datetime format
        # TODO: parse dates directly in pyarrow, see
        # https://github.com/pandas-dev/pandas/issues/48017
        warn = None
    elif cache_dates:
        # Note: warning is not raised if 'cache_dates', because here there is only a
        # single unique date and hence no risk of inconsistent parsing.
        warn = None
    else:
        warn = UserWarning
    parser.read_csv_check_warnings(
        warn,
        "Could not infer format",
        s,
        header=None,
        names=["foo", "bar"],
        parse_dates=["foo"],
        cache_dates=cache_dates,
    )
@xfail_pyarrow
def test_parse_dates_empty_string(all_parsers):
    # see gh-2263
    # An empty field in a parsed date column becomes NaT even when
    # na_filter is disabled.
    parser = all_parsers
    data = "Date,test\n2012-01-01,1\n,2"
    result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False)

    expected = DataFrame(
        [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"]
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "reader", ["read_csv_check_warnings", "read_table_check_warnings"]
)
def test_parse_dates_infer_datetime_format_warning(all_parsers, reader):
    # GH 49024, 51017
    # Passing the deprecated infer_datetime_format argument must emit a
    # FutureWarning via both read_csv and read_table.
    parser = all_parsers
    data = "Date,test\n2012-01-01,1\n,2"

    getattr(parser, reader)(
        FutureWarning,
        "The argument 'infer_datetime_format' is deprecated",
        StringIO(data),
        parse_dates=["Date"],
        infer_datetime_format=True,
        sep=",",
    )
@pytest.mark.parametrize(
    "reader", ["read_csv_check_warnings", "read_table_check_warnings"]
)
def test_parse_dates_date_parser_and_date_format(all_parsers, reader):
    # GH 50601
    # Supplying both 'date_parser' and 'date_format' is ambiguous and must
    # raise TypeError.
    parser = all_parsers
    data = "Date,test\n2012-01-01,1\n,2"
    msg = "Cannot use both 'date_parser' and 'date_format'"
    with pytest.raises(TypeError, match=msg):
        getattr(parser, reader)(
            FutureWarning,
            "use 'date_format' instead",
            StringIO(data),
            parse_dates=["Date"],
            date_parser=pd.to_datetime,
            date_format="ISO8601",
            sep=",",
        )
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            "a\n04.15.2016",
            {"parse_dates": ["a"]},
            DataFrame([datetime(2016, 4, 15)], columns=["a"]),
        ),
        (
            "a\n04.15.2016",
            {"parse_dates": True, "index_col": 0},
            DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]),
        ),
        (
            "a,b\n04.15.2016,09.16.2013",
            {"parse_dates": ["a", "b"]},
            DataFrame(
                [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"]
            ),
        ),
        (
            "a,b\n04.15.2016,09.16.2013",
            {"parse_dates": True, "index_col": [0, 1]},
            DataFrame(
                index=MultiIndex.from_tuples(
                    [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]
                ),
                columns=[],
            ),
        ),
    ],
)
def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
    # see gh-14066
    # thousands="." must not strip the dots out of date fields before they
    # are parsed as datetimes.
    parser = all_parsers

    result = parser.read_csv(StringIO(data), thousands=".", **kwargs)
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_parse_date_time_multi_level_column_name(all_parsers):
    # Combining the two date columns of a two-row (MultiIndex) header must
    # produce a flat "date_time" column alongside the remaining tuple columns.
    data = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
    parser = all_parsers
    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        header=[0, 1],
        parse_dates={"date_time": [0, 1]},
        date_parser=pd.to_datetime,
    )

    expected_data = [
        [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0],
        [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0],
    ]
    expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")])
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # Single combined column from a header row.
        (
            """\
date,time,a,b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
""",
            {"header": 0, "parse_dates": {"date_time": [0, 1]}},
            DataFrame(
                [
                    [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
                    [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0],
                ],
                columns=["date_time", "a", "b"],
            ),
        ),
        # Two combined columns sharing a source column, headerless input.
        (
            (
                "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
                "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
            ),
            {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}},
            DataFrame(
                [
                    [
                        datetime(1999, 1, 27, 19, 0),
                        datetime(1999, 1, 27, 18, 56),
                        "KORD",
                        0.81,
                    ],
                    [
                        datetime(1999, 1, 27, 20, 0),
                        datetime(1999, 1, 27, 19, 56),
                        "KORD",
                        0.01,
                    ],
                    [
                        datetime(1999, 1, 27, 21, 0),
                        datetime(1999, 1, 27, 20, 56),
                        "KORD",
                        -0.59,
                    ],
                    [
                        datetime(1999, 1, 27, 21, 0),
                        datetime(1999, 1, 27, 21, 18),
                        "KORD",
                        -0.99,
                    ],
                    [
                        datetime(1999, 1, 27, 22, 0),
                        datetime(1999, 1, 27, 21, 56),
                        "KORD",
                        -0.59,
                    ],
                    [
                        datetime(1999, 1, 27, 23, 0),
                        datetime(1999, 1, 27, 22, 56),
                        "KORD",
                        -0.59,
                    ],
                ],
                columns=["actual", "nominal", 0, 4],
            ),
        ),
    ],
)
def test_parse_date_time(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        date_parser=pd.to_datetime,
        **kwargs,
    )

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
# From date_parser fallback behavior
@pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
def test_parse_date_fields(all_parsers):
    # year/month/day columns combined through date_parser=pd.to_datetime
    # collapse into a single "ymd" datetime column.
    parser = all_parsers
    data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        header=0,
        parse_dates={"ymd": [0, 1, 2]},
        date_parser=pd.to_datetime,
    )

    expected = DataFrame(
        [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]],
        columns=["ymd", "a"],
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    ("key", "value", "warn"),
    [
        (
            "date_parser",
            lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"),
            FutureWarning,
        ),
        ("date_format", "%Y %m %d %H %M %S", None),
    ],
)
def test_parse_date_all_fields(all_parsers, key, value, warn):
    # Six Y/m/d/H/M/S columns combine into one datetime column via either the
    # deprecated date_parser (warns) or the replacement date_format (silent).
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0,0.0,10.
2001,01,5,10,0,00,1.,11.
"""
    result = parser.read_csv_check_warnings(
        warn,
        "use 'date_format' instead",
        StringIO(data),
        header=0,
        parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
        **{key: value},
    )
    expected = DataFrame(
        [
            [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
            [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0],
        ],
        columns=["ymdHMS", "a", "b"],
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    ("key", "value", "warn"),
    [
        (
            "date_parser",
            lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"),
            FutureWarning,
        ),
        ("date_format", "%Y %m %d %H %M %S.%f", None),
    ],
)
def test_datetime_fractional_seconds(all_parsers, key, value, warn):
    # Fractional seconds in the combined columns survive as microseconds,
    # via either date_parser (deprecated, warns) or date_format.
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0.123456,0.0,10.
2001,01,5,10,0,0.500000,1.,11.
"""
    result = parser.read_csv_check_warnings(
        warn,
        "use 'date_format' instead",
        StringIO(data),
        header=0,
        parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
        **{key: value},
    )
    expected = DataFrame(
        [
            [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0],
            [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0],
        ],
        columns=["ymdHMS", "a", "b"],
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_generic(all_parsers):
    # A user-supplied date_parser callable that returns date objects is
    # accepted; the resulting column is coerced to datetime64[ns].
    parser = all_parsers
    data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."

    def parse_function(yy, mm):
        # Receives the two source columns; day is fixed to 1.
        return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)]

    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        header=0,
        parse_dates={"ym": [0, 1]},
        date_parser=parse_function,
    )
    expected = DataFrame(
        [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]],
        columns=["ym", "day", "a"],
    )
    expected["ym"] = expected["ym"].astype("datetime64[ns]")
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_date_parser_resolution_if_not_ns(all_parsers):
    # see gh-10245
    # A date_parser returning second-resolution datetime64[s] values must be
    # preserved through parsing and index construction.
    parser = all_parsers
    data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(dt, time):
        try:
            arr = dt + "T" + time
        except TypeError:
            # dt & time are date/time objects
            arr = [datetime.combine(d, t) for d, t in zip(dt, time)]
        return np.array(arr, dtype="datetime64[s]")

    result = parser.read_csv_check_warnings(
        FutureWarning,
        "use 'date_format' instead",
        StringIO(data),
        date_parser=date_parser,
        parse_dates={"datetime": ["date", "time"]},
        index_col=["datetime", "prn"],
    )

    datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]")
    expected = DataFrame(
        data={"rxstatus": ["00E80000"] * 3},
        index=MultiIndex.from_arrays(
            [datetimes, [126, 23, 13]],
            names=["datetime", "prn"],
        ),
    )
    tm.assert_frame_equal(result, expected)
def test_parse_date_column_with_empty_string(all_parsers):
    # see gh-6428: a whitespace-only cell in the date column keeps the whole
    # column unparsed (object dtype), leaving all values as strings.
    parser = all_parsers
    data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, "
    result = parser.read_csv(StringIO(data), parse_dates=["opdate"])

    expected = DataFrame(
        [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]],
        columns=["case", "opdate"],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            "a\n135217135789158401\n1352171357E+5",
            DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"),
        ),
        (
            "a\n99999999999\n123456789012345\n1234E+0",
            DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"),
        ),
    ],
)
@pytest.mark.parametrize("parse_dates", [True, False])
def test_parse_date_float(all_parsers, data, expected, parse_dates):
    # see gh-2697
    #
    # Date parsing should fail, so we leave the data untouched
    # (i.e. float precision should remain unchanged).
    parser = all_parsers

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
def test_parse_timezone(all_parsers):
    # see gh-22256
    # A fixed UTC+09:00 offset in the input is preserved as a tz-aware
    # datetime column.
    parser = all_parsers
    data = """dt,val
2018-01-04 09:01:00+09:00,23350
2018-01-04 09:02:00+09:00,23400
2018-01-04 09:03:00+09:00,23400
2018-01-04 09:04:00+09:00,23400
2018-01-04 09:05:00+09:00,23400"""
    result = parser.read_csv(StringIO(data), parse_dates=["dt"])

    dti = DatetimeIndex(
        list(
            date_range(
                start="2018-01-04 09:01:00",
                end="2018-01-04 09:05:00",
                freq="1min",
                tz=timezone(timedelta(minutes=540)),  # +09:00 as fixed offset
            )
        ),
        freq=None,
    )
    expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]}

    expected = DataFrame(expected_data)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "date_string",
    ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
)
def test_invalid_parse_delimited_date(all_parsers, date_string):
    # Impossible or malformed delimited dates are left untouched as strings
    # (object dtype) rather than raising.
    parser = all_parsers
    expected = DataFrame({0: [date_string]}, dtype="object")
    result = parser.read_csv(
        StringIO(date_string),
        header=None,
        parse_dates=[0],
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "date_string,dayfirst,expected",
    [
        # %d/%m/%Y; month > 12 thus replacement
        ("13/02/2019", True, datetime(2019, 2, 13)),
        # %m/%d/%Y; day > 12 thus there will be no replacement
        ("02/13/2019", False, datetime(2019, 2, 13)),
        # %d/%m/%Y; dayfirst==True thus replacement
        ("04/02/2019", True, datetime(2019, 2, 4)),
    ],
)
def test_parse_delimited_date_swap_no_warning(
    all_parsers, date_string, dayfirst, expected
):
    # When the dayfirst flag agrees with the inferred format, no warning is
    # emitted (contrast with test_parse_delimited_date_swap_with_warning).
    parser = all_parsers
    expected = DataFrame({0: [expected]}, dtype="datetime64[ns]")
    result = parser.read_csv(
        StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0]
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "date_string,dayfirst,expected",
    [
        # %d/%m/%Y; month > 12
        ("13/02/2019", False, datetime(2019, 2, 13)),
        # %m/%d/%Y; day > 12
        ("02/13/2019", True, datetime(2019, 2, 13)),
    ],
)
def test_parse_delimited_date_swap_with_warning(
    all_parsers, date_string, dayfirst, expected
):
    # When dayfirst contradicts the only format that fits the value, the
    # date is still parsed correctly but a UserWarning is raised.
    parser = all_parsers
    expected = DataFrame({0: [expected]}, dtype="datetime64[ns]")
    warning_msg = (
        "Parsing dates in .* format when dayfirst=.* was specified. "
        "Pass `dayfirst=.*` or specify a format to silence this warning."
    )
    result = parser.read_csv_check_warnings(
        UserWarning,
        warning_msg,
        StringIO(date_string),
        header=None,
        dayfirst=dayfirst,
        parse_dates=[0],
    )
    tm.assert_frame_equal(result, expected)
def test_parse_multiple_delimited_dates_with_swap_warnings():
    # GH46210
    # Mixed %d/%m and %m/%d values cannot be parsed with one consistent
    # format; to_datetime must raise pointing at the offending position.
    with pytest.raises(
        ValueError,
        match=(
            r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", '
            r"at position 1. You might want to try:"
        ),
    ):
        pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])
- def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
- msg, result = None, None
- try:
- result = call(date_string, **kwargs)
- except ValueError as er:
- msg = str(er)
- return msg, result
@skip_pyarrow
@given(DATETIME_NO_TZ)
@pytest.mark.parametrize("delimiter", list(" -./"))
@pytest.mark.parametrize("dayfirst", [True, False])
@pytest.mark.parametrize(
    "date_format",
    ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"],
)
def test_hypothesis_delimited_date(
    request, date_format, dayfirst, delimiter, test_datetime
):
    # Property-based check: pandas' datetime-string parsing must agree with
    # dateutil (both the parsed value and whether a ValueError is raised).
    if date_format == "%m %Y" and delimiter == ".":
        request.node.add_marker(
            pytest.mark.xfail(
                reason="parse_datetime_string cannot reliably tell whether "
                "e.g. %m.%Y is a float or a date"
            )
        )
    date_string = test_datetime.strftime(date_format.replace(" ", delimiter))

    except_out_dateutil, result = _helper_hypothesis_delimited_date(
        py_parse_datetime_string, date_string, dayfirst=dayfirst
    )
    except_in_dateutil, expected = _helper_hypothesis_delimited_date(
        du_parse,
        date_string,
        default=datetime(1, 1, 1),
        dayfirst=dayfirst,
        yearfirst=False,
    )

    assert except_out_dateutil == except_in_dateutil
    assert result == expected
@skip_pyarrow
@pytest.mark.parametrize(
    "names, usecols, parse_dates, missing_cols",
    [
        (None, ["val"], ["date", "time"], "date, time"),
        (None, ["val"], [0, "time"], "time"),
        (None, ["val"], [["date", "time"]], "date, time"),
        (None, ["val"], [[0, "time"]], "time"),
        (None, ["val"], {"date": [0, "time"]}, "time"),
        (None, ["val"], {"date": ["date", "time"]}, "date, time"),
        (None, ["val"], [["date", "time"], "date"], "date, time"),
        (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
        (
            ["date1", "time1", "temperature"],
            ["date1", "temperature"],
            ["date1", "time"],
            "time",
        ),
    ],
)
def test_missing_parse_dates_column_raises(
    all_parsers, names, usecols, parse_dates, missing_cols
):
    # gh-31251 column names provided in parse_dates could be missing.
    # The error message must list exactly the missing columns.
    parser = all_parsers
    content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
    msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates
        )
@skip_pyarrow
def test_date_parser_and_names(all_parsers):
    # GH#33699
    # names=["B"] turns the first field into the index; parsing the header
    # text as dates cannot infer a format and warns.
    parser = all_parsers
    data = StringIO("""x,y\n1,2""")
    result = parser.read_csv_check_warnings(
        UserWarning,
        "Could not infer format",
        data,
        parse_dates=["B"],
        names=["B"],
    )
    expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"])
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_date_parser_multiindex_columns(all_parsers):
    # A tuple in parse_dates selects a column of a two-row (MultiIndex)
    # header for datetime parsing.
    parser = all_parsers
    data = """a,b
1,2
2019-12-31,6"""
    result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1])
    expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "parse_spec, col_name",
    [
        ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")),
        ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")),
    ],
)
def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name):
    # Combining two MultiIndex-header columns: the list form joins the tuple
    # names element-wise, the dict form uses the given key as the new name.
    parser = all_parsers
    data = """a,b,c
1,2,3
2019-12,-31,6"""
    result = parser.read_csv(
        StringIO(data),
        parse_dates=parse_spec,
        header=[0, 1],
    )
    expected = DataFrame({col_name: Timestamp("2019-12-31"), ("c", "3"): [6]})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_date_parser_usecols_thousands(all_parsers):
    # GH#39365
    # thousands="-" strips the dashes from the selected date column before
    # parsing; format inference then warns.
    data = """A,B,C
1,3,20-09-01-01
2,4,20-09-01-01
"""

    parser = all_parsers
    result = parser.read_csv_check_warnings(
        UserWarning,
        "Could not infer format",
        StringIO(data),
        parse_dates=[1],
        usecols=[1, 2],
        thousands="-",
    )
    expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_parse_dates_and_keep_orgin_column(all_parsers):
    # GH#13378
    # keep_date_col=True retains the source column "A" alongside the new
    # combined "date" column.
    # NOTE(review): "orgin" in the test name is a typo for "origin"; left
    # unchanged so the collected test id stays stable.
    parser = all_parsers
    data = """A
20150908
20150909
"""
    result = parser.read_csv(
        StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True
    )
    expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")]
    expected = DataFrame({"date": expected_data, "A": expected_data})
    tm.assert_frame_equal(result, expected)
def test_dayfirst_warnings():
    # GH 12585
    # Warn when the specified dayfirst contradicts the format that fits the
    # data; never warn when they agree.
    # NOTE(review): the res1/res2/res5/res6 numbering gap presumably stems
    # from cases removed in an earlier revision — not visible from here.

    # CASE 1: valid input
    input = "date\n31/12/2014\n10/03/2011"
    expected = DatetimeIndex(
        ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date"
    )
    warning_msg = (
        "Parsing dates in .* format when dayfirst=.* was specified. "
        "Pass `dayfirst=.*` or specify a format to silence this warning."
    )

    # A. dayfirst arg correct, no warning
    res1 = read_csv(
        StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date"
    ).index
    tm.assert_index_equal(expected, res1)

    # B. dayfirst arg incorrect, warning
    with tm.assert_produces_warning(UserWarning, match=warning_msg):
        res2 = read_csv(
            StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
        ).index
    tm.assert_index_equal(expected, res2)

    # CASE 2: invalid input
    # cannot consistently process with single format
    # return to user unaltered

    # first in DD/MM/YYYY, second in MM/DD/YYYY
    input = "date\n31/12/2014\n03/30/2011"
    expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")

    # A. use dayfirst=True
    res5 = read_csv(
        StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date"
    ).index
    tm.assert_index_equal(expected, res5)

    # B. use dayfirst=False
    with tm.assert_produces_warning(UserWarning, match=warning_msg):
        res6 = read_csv(
            StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
        ).index
    tm.assert_index_equal(expected, res6)
@pytest.mark.parametrize(
    "date_string, dayfirst",
    [
        pytest.param(
            "31/1/2014",
            False,
            id="second date is single-digit",
        ),
        pytest.param(
            "1/31/2014",
            True,
            id="first date is single-digit",
        ),
    ],
)
def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst):
    # GH47880
    # Single-digit day/month fields still trigger the dayfirst-mismatch
    # warning while parsing to the correct date.
    initial_value = f"date\n{date_string}"
    expected = DatetimeIndex(
        ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date"
    )
    warning_msg = (
        "Parsing dates in .* format when dayfirst=.* was specified. "
        "Pass `dayfirst=.*` or specify a format to silence this warning."
    )
    with tm.assert_produces_warning(UserWarning, match=warning_msg):
        res = read_csv(
            StringIO(initial_value),
            parse_dates=["date"],
            index_col="date",
            dayfirst=dayfirst,
        ).index
    tm.assert_index_equal(expected, res)
@skip_pyarrow
def test_infer_first_column_as_index(all_parsers):
    # GH#11019
    # A data row one field wider than the header makes the first field the
    # (string) index; parse_dates=["a"] then applies to the shifted column.
    parser = all_parsers
    data = "a,b,c\n1970-01-01,2,3,4"
    result = parser.read_csv(
        StringIO(data),
        parse_dates=["a"],
    )
    expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"])
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    ("key", "value", "warn"),
    [
        ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning),
        ("date_format", "%Y-%m-%d", None),
    ],
)
def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn):
    # GH#26203
    # Custom na_values ("#", "0") are replaced with NaT before date parsing,
    # so the strict format does not choke on them.
    parser = all_parsers
    data = """Test
2012-10-01
0
2015-05-15
#
2017-09-09
"""
    result = parser.read_csv_check_warnings(
        warn,
        "use 'date_format' instead",
        StringIO(data),
        na_values={"Test": ["#", "0"]},
        parse_dates=["Test"],
        **{key: value},
    )
    expected = DataFrame(
        {
            "Test": [
                Timestamp("2012-10-01"),
                pd.NaT,
                Timestamp("2015-05-15"),
                pd.NaT,
                Timestamp("2017-09-09"),
            ]
        }
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_parse_dates_and_string_dtype(all_parsers):
    # GH#34066
    # dtype="string" applies to the non-date columns only; the parse_dates
    # column still becomes datetime.
    parser = all_parsers
    data = """a,b
1,2019-12-31
"""
    result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"])
    expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]})
    expected["a"] = expected["a"].astype("string")
    tm.assert_frame_equal(result, expected)
def test_parse_dot_separated_dates(all_parsers):
    # https://github.com/pandas-dev/pandas/issues/2586
    # Dot-delimited day-first dates: the non-pyarrow engines parse them (with
    # a dayfirst warning); pyarrow leaves them as strings without warning.
    parser = all_parsers
    data = """a,b
27.03.2003 14:55:00.000,1
03.08.2003 15:20:00.000,2"""
    if parser.engine == "pyarrow":
        expected_index = Index(
            ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
            dtype="object",
            name="a",
        )
        warn = None
    else:
        expected_index = DatetimeIndex(
            ["2003-03-27 14:55:00", "2003-08-03 15:20:00"],
            dtype="datetime64[ns]",
            name="a",
        )
        warn = UserWarning
    msg = r"when dayfirst=False \(the default\) was specified"
    result = parser.read_csv_check_warnings(
        warn, msg, StringIO(data), parse_dates=True, index_col=0
    )
    expected = DataFrame({"b": [1, 2]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
def test_parse_dates_dict_format(all_parsers):
    # GH#51240
    # date_format given as a per-column dict applies a different format to
    # each parsed column.
    parser = all_parsers
    data = """a,b
2019-12-31,31-12-2019
2020-12-31,31-12-2020"""

    result = parser.read_csv(
        StringIO(data),
        date_format={"a": "%Y-%m-%d", "b": "%d-%m-%Y"},
        parse_dates=["a", "b"],
    )
    expected = DataFrame(
        {
            "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
            "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
        }
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})]
)
def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates):
    # GH#51240
    # A dict date_format keyed by the combined-column name applies to the
    # joined values; no warning may be emitted.
    parser = all_parsers
    data = """a,b
31-,12-2019
31-,12-2020"""

    with tm.assert_produces_warning(None):
        result = parser.read_csv(
            StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates
        )
    expected = DataFrame(
        {
            key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
        }
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_parse_dates_dict_format_index(all_parsers):
    # GH#51240
    # A dict date_format also works when the parsed column is the index.
    parser = all_parsers
    data = """a,b
2019-12-31,31-12-2019
2020-12-31,31-12-2020"""

    result = parser.read_csv(
        StringIO(data), date_format={"a": "%Y-%m-%d"}, parse_dates=True, index_col=0
    )
    expected = DataFrame(
        {
            "b": ["31-12-2019", "31-12-2020"],
        },
        index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"),
    )
    tm.assert_frame_equal(result, expected)
def test_parse_dates_arrow_engine(all_parsers):
    # GH#53295
    # ISO datetimes parse to the same result across all engines, including
    # pyarrow (no skip/xfail marker here).
    parser = all_parsers
    data = """a,b
2000-01-01 00:00:00,1
2000-01-01 00:00:01,1"""

    result = parser.read_csv(StringIO(data), parse_dates=["a"])
    expected = DataFrame(
        {
            "a": [
                Timestamp("2000-01-01 00:00:00"),
                Timestamp("2000-01-01 00:00:01"),
            ],
            "b": 1,
        }
    )
    tm.assert_frame_equal(result, expected)
|