# test_to_csv.py — round-trip tests for DataFrame.to_csv / read_csv
import csv
from io import StringIO
import os

import numpy as np
import pytest

from pandas.errors import ParserError

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    NaT,
    Series,
    Timestamp,
    date_range,
    read_csv,
    to_datetime,
)
import pandas._testing as tm
import pandas.core.common as com

from pandas.io.common import get_handle
  22. class TestDataFrameToCSV:
  23. def read_csv(self, path, **kwargs):
  24. params = {"index_col": 0}
  25. params.update(**kwargs)
  26. return read_csv(path, **params)
  27. def test_to_csv_from_csv1(self, float_frame, datetime_frame):
  28. with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path:
  29. float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan
  30. float_frame.to_csv(path)
  31. float_frame.to_csv(path, columns=["A", "B"])
  32. float_frame.to_csv(path, header=False)
  33. float_frame.to_csv(path, index=False)
  34. # test roundtrip
  35. # freq does not roundtrip
  36. datetime_frame.index = datetime_frame.index._with_freq(None)
  37. datetime_frame.to_csv(path)
  38. recons = self.read_csv(path, parse_dates=True)
  39. tm.assert_frame_equal(datetime_frame, recons)
  40. datetime_frame.to_csv(path, index_label="index")
  41. recons = self.read_csv(path, index_col=None, parse_dates=True)
  42. assert len(recons.columns) == len(datetime_frame.columns) + 1
  43. # no index
  44. datetime_frame.to_csv(path, index=False)
  45. recons = self.read_csv(path, index_col=None, parse_dates=True)
  46. tm.assert_almost_equal(datetime_frame.values, recons.values)
  47. # corner case
  48. dm = DataFrame(
  49. {
  50. "s1": Series(range(3), index=np.arange(3, dtype=np.int64)),
  51. "s2": Series(range(2), index=np.arange(2, dtype=np.int64)),
  52. }
  53. )
  54. dm.to_csv(path)
  55. recons = self.read_csv(path)
  56. tm.assert_frame_equal(dm, recons)
  57. def test_to_csv_from_csv2(self, float_frame):
  58. with tm.ensure_clean("__tmp_to_csv_from_csv2__") as path:
  59. # duplicate index
  60. df = DataFrame(
  61. np.random.randn(3, 3), index=["a", "a", "b"], columns=["x", "y", "z"]
  62. )
  63. df.to_csv(path)
  64. result = self.read_csv(path)
  65. tm.assert_frame_equal(result, df)
  66. midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
  67. df = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"])
  68. df.to_csv(path)
  69. result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False)
  70. tm.assert_frame_equal(result, df, check_names=False)
  71. # column aliases
  72. col_aliases = Index(["AA", "X", "Y", "Z"])
  73. float_frame.to_csv(path, header=col_aliases)
  74. rs = self.read_csv(path)
  75. xp = float_frame.copy()
  76. xp.columns = col_aliases
  77. tm.assert_frame_equal(xp, rs)
  78. msg = "Writing 4 cols but got 2 aliases"
  79. with pytest.raises(ValueError, match=msg):
  80. float_frame.to_csv(path, header=["AA", "X"])
  81. def test_to_csv_from_csv3(self):
  82. with tm.ensure_clean("__tmp_to_csv_from_csv3__") as path:
  83. df1 = DataFrame(np.random.randn(3, 1))
  84. df2 = DataFrame(np.random.randn(3, 1))
  85. df1.to_csv(path)
  86. df2.to_csv(path, mode="a", header=False)
  87. xp = pd.concat([df1, df2])
  88. rs = read_csv(path, index_col=0)
  89. rs.columns = [int(label) for label in rs.columns]
  90. xp.columns = [int(label) for label in xp.columns]
  91. tm.assert_frame_equal(xp, rs)
  92. def test_to_csv_from_csv4(self):
  93. with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path:
  94. # GH 10833 (TimedeltaIndex formatting)
  95. dt = pd.Timedelta(seconds=1)
  96. df = DataFrame(
  97. {"dt_data": [i * dt for i in range(3)]},
  98. index=Index([i * dt for i in range(3)], name="dt_index"),
  99. )
  100. df.to_csv(path)
  101. result = read_csv(path, index_col="dt_index")
  102. result.index = pd.to_timedelta(result.index)
  103. result["dt_data"] = pd.to_timedelta(result["dt_data"])
  104. tm.assert_frame_equal(df, result, check_index_type=True)
  105. def test_to_csv_from_csv5(self, timezone_frame):
  106. # tz, 8260
  107. with tm.ensure_clean("__tmp_to_csv_from_csv5__") as path:
  108. timezone_frame.to_csv(path)
  109. result = read_csv(path, index_col=0, parse_dates=["A"])
  110. converter = (
  111. lambda c: to_datetime(result[c])
  112. .dt.tz_convert("UTC")
  113. .dt.tz_convert(timezone_frame[c].dt.tz)
  114. )
  115. result["B"] = converter("B")
  116. result["C"] = converter("C")
  117. tm.assert_frame_equal(result, timezone_frame)
  118. def test_to_csv_cols_reordering(self):
  119. # GH3454
  120. chunksize = 5
  121. N = int(chunksize * 2.5)
  122. df = tm.makeCustomDataframe(N, 3)
  123. cs = df.columns
  124. cols = [cs[2], cs[0]]
  125. with tm.ensure_clean() as path:
  126. df.to_csv(path, columns=cols, chunksize=chunksize)
  127. rs_c = read_csv(path, index_col=0)
  128. tm.assert_frame_equal(df[cols], rs_c, check_names=False)
  129. @pytest.mark.parametrize("cols", [None, ["b", "a"]])
  130. def test_to_csv_new_dupe_cols(self, cols):
  131. chunksize = 5
  132. N = int(chunksize * 2.5)
  133. # dupe cols
  134. df = tm.makeCustomDataframe(N, 3)
  135. df.columns = ["a", "a", "b"]
  136. with tm.ensure_clean() as path:
  137. df.to_csv(path, columns=cols, chunksize=chunksize)
  138. rs_c = read_csv(path, index_col=0)
  139. # we wrote them in a different order
  140. # so compare them in that order
  141. if cols is not None:
  142. if df.columns.is_unique:
  143. rs_c.columns = cols
  144. else:
  145. indexer, missing = df.columns.get_indexer_non_unique(cols)
  146. rs_c.columns = df.columns.take(indexer)
  147. for c in cols:
  148. obj_df = df[c]
  149. obj_rs = rs_c[c]
  150. if isinstance(obj_df, Series):
  151. tm.assert_series_equal(obj_df, obj_rs)
  152. else:
  153. tm.assert_frame_equal(obj_df, obj_rs, check_names=False)
  154. # wrote in the same order
  155. else:
  156. rs_c.columns = df.columns
  157. tm.assert_frame_equal(df, rs_c, check_names=False)
  158. @pytest.mark.slow
  159. def test_to_csv_dtnat(self):
  160. # GH3437
  161. def make_dtnat_arr(n, nnat=None):
  162. if nnat is None:
  163. nnat = int(n * 0.1) # 10%
  164. s = list(date_range("2000", freq="5min", periods=n))
  165. if nnat:
  166. for i in np.random.randint(0, len(s), nnat):
  167. s[i] = NaT
  168. i = np.random.randint(100)
  169. s[-i] = NaT
  170. s[i] = NaT
  171. return s
  172. chunksize = 1000
  173. s1 = make_dtnat_arr(chunksize + 5)
  174. s2 = make_dtnat_arr(chunksize + 5, 0)
  175. with tm.ensure_clean("1.csv") as pth:
  176. df = DataFrame({"a": s1, "b": s2})
  177. df.to_csv(pth, chunksize=chunksize)
  178. recons = self.read_csv(pth).apply(to_datetime)
  179. tm.assert_frame_equal(df, recons, check_names=False)
  180. def _return_result_expected(
  181. self,
  182. df,
  183. chunksize,
  184. r_dtype=None,
  185. c_dtype=None,
  186. rnlvl=None,
  187. cnlvl=None,
  188. dupe_col=False,
  189. ):
  190. kwargs = {"parse_dates": False}
  191. if cnlvl:
  192. if rnlvl is not None:
  193. kwargs["index_col"] = list(range(rnlvl))
  194. kwargs["header"] = list(range(cnlvl))
  195. with tm.ensure_clean("__tmp_to_csv_moar__") as path:
  196. df.to_csv(path, encoding="utf8", chunksize=chunksize)
  197. recons = self.read_csv(path, **kwargs)
  198. else:
  199. kwargs["header"] = 0
  200. with tm.ensure_clean("__tmp_to_csv_moar__") as path:
  201. df.to_csv(path, encoding="utf8", chunksize=chunksize)
  202. recons = self.read_csv(path, **kwargs)
  203. def _to_uni(x):
  204. if not isinstance(x, str):
  205. return x.decode("utf8")
  206. return x
  207. if dupe_col:
  208. # read_Csv disambiguates the columns by
  209. # labeling them dupe.1,dupe.2, etc'. monkey patch columns
  210. recons.columns = df.columns
  211. if rnlvl and not cnlvl:
  212. delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
  213. ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
  214. recons.index = ix
  215. recons = recons.iloc[:, rnlvl - 1 :]
  216. type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"}
  217. if r_dtype:
  218. if r_dtype == "u": # unicode
  219. r_dtype = "O"
  220. recons.index = np.array(
  221. [_to_uni(label) for label in recons.index], dtype=r_dtype
  222. )
  223. df.index = np.array(
  224. [_to_uni(label) for label in df.index], dtype=r_dtype
  225. )
  226. elif r_dtype == "dt": # unicode
  227. r_dtype = "O"
  228. recons.index = np.array(
  229. [Timestamp(label) for label in recons.index], dtype=r_dtype
  230. )
  231. df.index = np.array(
  232. [Timestamp(label) for label in df.index], dtype=r_dtype
  233. )
  234. elif r_dtype == "p":
  235. r_dtype = "O"
  236. idx_list = to_datetime(recons.index)
  237. recons.index = np.array(
  238. [Timestamp(label) for label in idx_list], dtype=r_dtype
  239. )
  240. df.index = np.array(
  241. list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype
  242. )
  243. else:
  244. r_dtype = type_map.get(r_dtype)
  245. recons.index = np.array(recons.index, dtype=r_dtype)
  246. df.index = np.array(df.index, dtype=r_dtype)
  247. if c_dtype:
  248. if c_dtype == "u":
  249. c_dtype = "O"
  250. recons.columns = np.array(
  251. [_to_uni(label) for label in recons.columns], dtype=c_dtype
  252. )
  253. df.columns = np.array(
  254. [_to_uni(label) for label in df.columns], dtype=c_dtype
  255. )
  256. elif c_dtype == "dt":
  257. c_dtype = "O"
  258. recons.columns = np.array(
  259. [Timestamp(label) for label in recons.columns], dtype=c_dtype
  260. )
  261. df.columns = np.array(
  262. [Timestamp(label) for label in df.columns], dtype=c_dtype
  263. )
  264. elif c_dtype == "p":
  265. c_dtype = "O"
  266. col_list = to_datetime(recons.columns)
  267. recons.columns = np.array(
  268. [Timestamp(label) for label in col_list], dtype=c_dtype
  269. )
  270. col_list = df.columns.to_timestamp()
  271. df.columns = np.array(
  272. [Timestamp(label) for label in col_list], dtype=c_dtype
  273. )
  274. else:
  275. c_dtype = type_map.get(c_dtype)
  276. recons.columns = np.array(recons.columns, dtype=c_dtype)
  277. df.columns = np.array(df.columns, dtype=c_dtype)
  278. return df, recons
  279. @pytest.mark.slow
  280. @pytest.mark.parametrize(
  281. "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
  282. )
  283. def test_to_csv_nrows(self, nrows):
  284. df = tm.makeCustomDataframe(nrows, 4, r_idx_type="dt", c_idx_type="s")
  285. result, expected = self._return_result_expected(df, 1000, "dt", "s")
  286. tm.assert_frame_equal(result, expected, check_names=False)
  287. @pytest.mark.slow
  288. @pytest.mark.parametrize(
  289. "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
  290. )
  291. @pytest.mark.parametrize(
  292. "r_idx_type, c_idx_type", [("i", "i"), ("s", "s"), ("s", "dt"), ("p", "p")]
  293. )
  294. @pytest.mark.parametrize("ncols", [1, 2, 3, 4])
  295. def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols):
  296. df = tm.makeCustomDataframe(
  297. nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
  298. )
  299. result, expected = self._return_result_expected(
  300. df,
  301. 1000,
  302. r_idx_type,
  303. c_idx_type,
  304. )
  305. tm.assert_frame_equal(result, expected, check_names=False)
  306. @pytest.mark.slow
  307. @pytest.mark.parametrize(
  308. "nrows", [10, 98, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
  309. )
  310. @pytest.mark.parametrize("ncols", [1, 2, 3, 4])
  311. def test_to_csv_idx_ncols(self, nrows, ncols):
  312. df = tm.makeCustomDataframe(nrows, ncols)
  313. result, expected = self._return_result_expected(df, 1000)
  314. tm.assert_frame_equal(result, expected, check_names=False)
  315. @pytest.mark.slow
  316. @pytest.mark.parametrize("nrows", [10, 98, 99, 100, 101, 102])
  317. def test_to_csv_dup_cols(self, nrows):
  318. df = tm.makeCustomDataframe(nrows, 3)
  319. cols = list(df.columns)
  320. cols[:2] = ["dupe", "dupe"]
  321. cols[-2:] = ["dupe", "dupe"]
  322. ix = list(df.index)
  323. ix[:2] = ["rdupe", "rdupe"]
  324. ix[-2:] = ["rdupe", "rdupe"]
  325. df.index = ix
  326. df.columns = cols
  327. result, expected = self._return_result_expected(df, 1000, dupe_col=True)
  328. tm.assert_frame_equal(result, expected, check_names=False)
  329. @pytest.mark.slow
  330. def test_to_csv_empty(self):
  331. df = DataFrame(index=np.arange(10, dtype=np.int64))
  332. result, expected = self._return_result_expected(df, 1000)
  333. tm.assert_frame_equal(result, expected, check_column_type=False)
  334. @pytest.mark.slow
  335. def test_to_csv_chunksize(self):
  336. chunksize = 1000
  337. df = tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2)
  338. result, expected = self._return_result_expected(df, chunksize, rnlvl=2)
  339. tm.assert_frame_equal(result, expected, check_names=False)
  340. @pytest.mark.slow
  341. @pytest.mark.parametrize(
  342. "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251]
  343. )
  344. @pytest.mark.parametrize("ncols", [2, 3, 4])
  345. @pytest.mark.parametrize(
  346. "df_params, func_params",
  347. [
  348. [{"r_idx_nlevels": 2}, {"rnlvl": 2}],
  349. [{"c_idx_nlevels": 2}, {"cnlvl": 2}],
  350. [{"r_idx_nlevels": 2, "c_idx_nlevels": 2}, {"rnlvl": 2, "cnlvl": 2}],
  351. ],
  352. )
  353. def test_to_csv_params(self, nrows, df_params, func_params, ncols):
  354. df = tm.makeCustomDataframe(nrows, ncols, **df_params)
  355. result, expected = self._return_result_expected(df, 1000, **func_params)
  356. tm.assert_frame_equal(result, expected, check_names=False)
  357. def test_to_csv_from_csv_w_some_infs(self, float_frame):
  358. # test roundtrip with inf, -inf, nan, as full columns and mix
  359. float_frame["G"] = np.nan
  360. f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5]
  361. float_frame["H"] = float_frame.index.map(f)
  362. with tm.ensure_clean() as path:
  363. float_frame.to_csv(path)
  364. recons = self.read_csv(path)
  365. tm.assert_frame_equal(float_frame, recons)
  366. tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons))
  367. def test_to_csv_from_csv_w_all_infs(self, float_frame):
  368. # test roundtrip with inf, -inf, nan, as full columns and mix
  369. float_frame["E"] = np.inf
  370. float_frame["F"] = -np.inf
  371. with tm.ensure_clean() as path:
  372. float_frame.to_csv(path)
  373. recons = self.read_csv(path)
  374. tm.assert_frame_equal(float_frame, recons)
  375. tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons))
  376. def test_to_csv_no_index(self):
  377. # GH 3624, after appending columns, to_csv fails
  378. with tm.ensure_clean("__tmp_to_csv_no_index__") as path:
  379. df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]})
  380. df.to_csv(path, index=False)
  381. result = read_csv(path)
  382. tm.assert_frame_equal(df, result)
  383. df["c3"] = Series([7, 8, 9], dtype="int64")
  384. df.to_csv(path, index=False)
  385. result = read_csv(path)
  386. tm.assert_frame_equal(df, result)
  387. def test_to_csv_with_mix_columns(self):
  388. # gh-11637: incorrect output when a mix of integer and string column
  389. # names passed as columns parameter in to_csv
  390. df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]})
  391. df["test"] = "txt"
  392. assert df.to_csv() == df.to_csv(columns=[0, 1, "test"])
  393. def test_to_csv_headers(self):
  394. # GH6186, the presence or absence of `index` incorrectly
  395. # causes to_csv to have different header semantics.
  396. from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
  397. to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"])
  398. with tm.ensure_clean("__tmp_to_csv_headers__") as path:
  399. from_df.to_csv(path, header=["X", "Y"])
  400. recons = self.read_csv(path)
  401. tm.assert_frame_equal(to_df, recons)
  402. from_df.to_csv(path, index=False, header=["X", "Y"])
  403. recons = self.read_csv(path)
  404. return_value = recons.reset_index(inplace=True)
  405. assert return_value is None
  406. tm.assert_frame_equal(to_df, recons)
  407. def test_to_csv_multiindex(self, float_frame, datetime_frame):
  408. frame = float_frame
  409. old_index = frame.index
  410. arrays = np.arange(len(old_index) * 2, dtype=np.int64).reshape(2, -1)
  411. new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
  412. frame.index = new_index
  413. with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
  414. frame.to_csv(path, header=False)
  415. frame.to_csv(path, columns=["A", "B"])
  416. # round trip
  417. frame.to_csv(path)
  418. df = self.read_csv(path, index_col=[0, 1], parse_dates=False)
  419. # TODO to_csv drops column name
  420. tm.assert_frame_equal(frame, df, check_names=False)
  421. assert frame.index.names == df.index.names
  422. # needed if setUp becomes a class method
  423. float_frame.index = old_index
  424. # try multiindex with dates
  425. tsframe = datetime_frame
  426. old_index = tsframe.index
  427. new_index = [old_index, np.arange(len(old_index), dtype=np.int64)]
  428. tsframe.index = MultiIndex.from_arrays(new_index)
  429. tsframe.to_csv(path, index_label=["time", "foo"])
  430. with tm.assert_produces_warning(
  431. UserWarning, match="Could not infer format"
  432. ):
  433. recons = self.read_csv(path, index_col=[0, 1], parse_dates=True)
  434. # TODO to_csv drops column name
  435. tm.assert_frame_equal(tsframe, recons, check_names=False)
  436. # do not load index
  437. tsframe.to_csv(path)
  438. recons = self.read_csv(path, index_col=None)
  439. assert len(recons.columns) == len(tsframe.columns) + 2
  440. # no index
  441. tsframe.to_csv(path, index=False)
  442. recons = self.read_csv(path, index_col=None)
  443. tm.assert_almost_equal(recons.values, datetime_frame.values)
  444. # needed if setUp becomes class method
  445. datetime_frame.index = old_index
  446. with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
  447. # GH3571, GH1651, GH3141
  448. def _make_frame(names=None):
  449. if names is True:
  450. names = ["first", "second"]
  451. return DataFrame(
  452. np.random.randint(0, 10, size=(3, 3)),
  453. columns=MultiIndex.from_tuples(
  454. [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names
  455. ),
  456. dtype="int64",
  457. )
  458. # column & index are multi-index
  459. df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
  460. df.to_csv(path)
  461. result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
  462. tm.assert_frame_equal(df, result)
  463. # column is mi
  464. df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
  465. df.to_csv(path)
  466. result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
  467. tm.assert_frame_equal(df, result)
  468. # dup column names?
  469. df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
  470. df.to_csv(path)
  471. result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2])
  472. tm.assert_frame_equal(df, result)
  473. # writing with no index
  474. df = _make_frame()
  475. df.to_csv(path, index=False)
  476. result = read_csv(path, header=[0, 1])
  477. tm.assert_frame_equal(df, result)
  478. # we lose the names here
  479. df = _make_frame(True)
  480. df.to_csv(path, index=False)
  481. result = read_csv(path, header=[0, 1])
  482. assert com.all_none(*result.columns.names)
  483. result.columns.names = df.columns.names
  484. tm.assert_frame_equal(df, result)
  485. # whatsnew example
  486. df = _make_frame()
  487. df.to_csv(path)
  488. result = read_csv(path, header=[0, 1], index_col=[0])
  489. tm.assert_frame_equal(df, result)
  490. df = _make_frame(True)
  491. df.to_csv(path)
  492. result = read_csv(path, header=[0, 1], index_col=[0])
  493. tm.assert_frame_equal(df, result)
  494. # invalid options
  495. df = _make_frame(True)
  496. df.to_csv(path)
  497. for i in [6, 7]:
  498. msg = f"len of {i}, but only 5 lines in file"
  499. with pytest.raises(ParserError, match=msg):
  500. read_csv(path, header=list(range(i)), index_col=0)
  501. # write with cols
  502. msg = "cannot specify cols with a MultiIndex"
  503. with pytest.raises(TypeError, match=msg):
  504. df.to_csv(path, columns=["foo", "bar"])
  505. with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
  506. # empty
  507. tsframe[:0].to_csv(path)
  508. recons = self.read_csv(path)
  509. exp = tsframe[:0]
  510. exp.index = []
  511. tm.assert_index_equal(recons.columns, exp.columns)
  512. assert len(recons) == 0
  513. def test_to_csv_interval_index(self):
  514. # GH 28210
  515. df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3))
  516. with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
  517. df.to_csv(path)
  518. result = self.read_csv(path, index_col=0)
  519. # can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
  520. expected = df.copy()
  521. expected.index = expected.index.astype(str)
  522. tm.assert_frame_equal(result, expected)
  523. def test_to_csv_float32_nanrep(self):
  524. df = DataFrame(np.random.randn(1, 4).astype(np.float32))
  525. df[1] = np.nan
  526. with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path:
  527. df.to_csv(path, na_rep=999)
  528. with open(path) as f:
  529. lines = f.readlines()
  530. assert lines[1].split(",")[2] == "999"
  531. def test_to_csv_withcommas(self):
  532. # Commas inside fields should be correctly escaped when saving as CSV.
  533. df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]})
  534. with tm.ensure_clean("__tmp_to_csv_withcommas__.csv") as path:
  535. df.to_csv(path)
  536. df2 = self.read_csv(path)
  537. tm.assert_frame_equal(df2, df)
  538. def test_to_csv_mixed(self):
  539. def create_cols(name):
  540. return [f"{name}{i:03d}" for i in range(5)]
  541. df_float = DataFrame(
  542. np.random.randn(100, 5), dtype="float64", columns=create_cols("float")
  543. )
  544. df_int = DataFrame(
  545. np.random.randn(100, 5).astype("int64"),
  546. dtype="int64",
  547. columns=create_cols("int"),
  548. )
  549. df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool"))
  550. df_object = DataFrame(
  551. "foo", index=df_float.index, columns=create_cols("object")
  552. )
  553. df_dt = DataFrame(
  554. Timestamp("20010101"), index=df_float.index, columns=create_cols("date")
  555. )
  556. # add in some nans
  557. df_float.iloc[30:50, 1:3] = np.nan
  558. # ## this is a bug in read_csv right now ####
  559. # df_dt.loc[30:50,1:3] = np.nan
  560. df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
  561. # dtype
  562. dtypes = {}
  563. for n, dtype in [
  564. ("float", np.float64),
  565. ("int", np.int64),
  566. ("bool", np.bool_),
  567. ("object", object),
  568. ]:
  569. for c in create_cols(n):
  570. dtypes[c] = dtype
  571. with tm.ensure_clean() as filename:
  572. df.to_csv(filename)
  573. rs = read_csv(
  574. filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date")
  575. )
  576. tm.assert_frame_equal(rs, df)
  577. def test_to_csv_dups_cols(self):
  578. df = DataFrame(
  579. np.random.randn(1000, 30),
  580. columns=list(range(15)) + list(range(15)),
  581. dtype="float64",
  582. )
  583. with tm.ensure_clean() as filename:
  584. df.to_csv(filename) # single dtype, fine
  585. result = read_csv(filename, index_col=0)
  586. result.columns = df.columns
  587. tm.assert_frame_equal(result, df)
  588. df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
  589. df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
  590. df_bool = DataFrame(True, index=df_float.index, columns=range(3))
  591. df_object = DataFrame("foo", index=df_float.index, columns=range(3))
  592. df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
  593. df = pd.concat(
  594. [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
  595. )
  596. df.columns = [0, 1, 2] * 5
  597. with tm.ensure_clean() as filename:
  598. df.to_csv(filename)
  599. result = read_csv(filename, index_col=0)
  600. # date cols
  601. for i in ["0.4", "1.4", "2.4"]:
  602. result[i] = to_datetime(result[i])
  603. result.columns = df.columns
  604. tm.assert_frame_equal(result, df)
  605. # GH3457
  606. N = 10
  607. df = tm.makeCustomDataframe(N, 3)
  608. df.columns = ["a", "a", "b"]
  609. with tm.ensure_clean() as filename:
  610. df.to_csv(filename)
  611. # read_csv will rename the dups columns
  612. result = read_csv(filename, index_col=0)
  613. result = result.rename(columns={"a.1": "a"})
  614. tm.assert_frame_equal(result, df)
  615. @pytest.mark.parametrize("chunksize", [10000, 50000, 100000])
  616. def test_to_csv_chunking(self, chunksize):
  617. aa = DataFrame({"A": range(100000)})
  618. aa["B"] = aa.A + 1.0
  619. aa["C"] = aa.A + 2.0
  620. aa["D"] = aa.A + 3.0
  621. with tm.ensure_clean() as filename:
  622. aa.to_csv(filename, chunksize=chunksize)
  623. rs = read_csv(filename, index_col=0)
  624. tm.assert_frame_equal(rs, aa)
  625. @pytest.mark.slow
  626. def test_to_csv_wide_frame_formatting(self):
  627. # Issue #8621
  628. df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
  629. with tm.ensure_clean() as filename:
  630. df.to_csv(filename, header=False, index=False)
  631. rs = read_csv(filename, header=None)
  632. tm.assert_frame_equal(rs, df)
  633. def test_to_csv_bug(self):
  634. f1 = StringIO("a,1.0\nb,2.0")
  635. df = self.read_csv(f1, header=None)
  636. newdf = DataFrame({"t": df[df.columns[0]]})
  637. with tm.ensure_clean() as path:
  638. newdf.to_csv(path)
  639. recons = read_csv(path, index_col=0)
  640. # don't check_names as t != 1
  641. tm.assert_frame_equal(recons, newdf, check_names=False)
  642. def test_to_csv_unicode(self):
  643. df = DataFrame({"c/\u03c3": [1, 2, 3]})
  644. with tm.ensure_clean() as path:
  645. df.to_csv(path, encoding="UTF-8")
  646. df2 = read_csv(path, index_col=0, encoding="UTF-8")
  647. tm.assert_frame_equal(df, df2)
  648. df.to_csv(path, encoding="UTF-8", index=False)
  649. df2 = read_csv(path, index_col=None, encoding="UTF-8")
  650. tm.assert_frame_equal(df, df2)
  651. def test_to_csv_unicode_index_col(self):
  652. buf = StringIO("")
  653. df = DataFrame(
  654. [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]],
  655. columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"],
  656. index=["\u05d0", "\u05d1"],
  657. )
  658. df.to_csv(buf, encoding="UTF-8")
  659. buf.seek(0)
  660. df2 = read_csv(buf, index_col=0, encoding="UTF-8")
  661. tm.assert_frame_equal(df, df2)
  662. def test_to_csv_stringio(self, float_frame):
  663. buf = StringIO()
  664. float_frame.to_csv(buf)
  665. buf.seek(0)
  666. recons = read_csv(buf, index_col=0)
  667. tm.assert_frame_equal(recons, float_frame)
  668. def test_to_csv_float_format(self):
  669. df = DataFrame(
  670. [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  671. index=["A", "B"],
  672. columns=["X", "Y", "Z"],
  673. )
  674. with tm.ensure_clean() as filename:
  675. df.to_csv(filename, float_format="%.2f")
  676. rs = read_csv(filename, index_col=0)
  677. xp = DataFrame(
  678. [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]],
  679. index=["A", "B"],
  680. columns=["X", "Y", "Z"],
  681. )
  682. tm.assert_frame_equal(rs, xp)
  683. def test_to_csv_float_format_over_decimal(self):
  684. # GH#47436
  685. df = DataFrame({"a": [0.5, 1.0]})
  686. result = df.to_csv(
  687. decimal=",",
  688. float_format=lambda x: np.format_float_positional(x, trim="-"),
  689. index=False,
  690. )
  691. expected_rows = ["a", "0.5", "1"]
  692. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  693. assert result == expected
  694. def test_to_csv_unicodewriter_quoting(self):
  695. df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]})
  696. buf = StringIO()
  697. df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8")
  698. result = buf.getvalue()
  699. expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"']
  700. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  701. assert result == expected
  702. @pytest.mark.parametrize("encoding", [None, "utf-8"])
  703. def test_to_csv_quote_none(self, encoding):
  704. # GH4328
  705. df = DataFrame({"A": ["hello", '{"hello"}']})
  706. buf = StringIO()
  707. df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False)
  708. result = buf.getvalue()
  709. expected_rows = ["A", "hello", '{"hello"}']
  710. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  711. assert result == expected
  712. def test_to_csv_index_no_leading_comma(self):
  713. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"])
  714. buf = StringIO()
  715. df.to_csv(buf, index_label=False)
  716. expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"]
  717. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  718. assert buf.getvalue() == expected
  719. def test_to_csv_lineterminators(self):
  720. # see gh-20353
  721. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"])
  722. with tm.ensure_clean() as path:
  723. # case 1: CRLF as line terminator
  724. df.to_csv(path, lineterminator="\r\n")
  725. expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n"
  726. with open(path, mode="rb") as f:
  727. assert f.read() == expected
  728. with tm.ensure_clean() as path:
  729. # case 2: LF as line terminator
  730. df.to_csv(path, lineterminator="\n")
  731. expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n"
  732. with open(path, mode="rb") as f:
  733. assert f.read() == expected
  734. with tm.ensure_clean() as path:
  735. # case 3: The default line terminator(=os.linesep)(gh-21406)
  736. df.to_csv(path)
  737. os_linesep = os.linesep.encode("utf-8")
  738. expected = (
  739. b",A,B"
  740. + os_linesep
  741. + b"one,1,4"
  742. + os_linesep
  743. + b"two,2,5"
  744. + os_linesep
  745. + b"three,3,6"
  746. + os_linesep
  747. )
  748. with open(path, mode="rb") as f:
  749. assert f.read() == expected
  750. def test_to_csv_from_csv_categorical(self):
  751. # CSV with categoricals should result in the same output
  752. # as when one would add a "normal" Series/DataFrame.
  753. s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
  754. s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
  755. res = StringIO()
  756. s.to_csv(res, header=False)
  757. exp = StringIO()
  758. s2.to_csv(exp, header=False)
  759. assert res.getvalue() == exp.getvalue()
  760. df = DataFrame({"s": s})
  761. df2 = DataFrame({"s": s2})
  762. res = StringIO()
  763. df.to_csv(res)
  764. exp = StringIO()
  765. df2.to_csv(exp)
  766. assert res.getvalue() == exp.getvalue()
  767. def test_to_csv_path_is_none(self, float_frame):
  768. # GH 8215
  769. # Make sure we return string for consistency with
  770. # Series.to_csv()
  771. csv_str = float_frame.to_csv(path_or_buf=None)
  772. assert isinstance(csv_str, str)
  773. recons = read_csv(StringIO(csv_str), index_col=0)
  774. tm.assert_frame_equal(float_frame, recons)
@pytest.mark.parametrize(
    "df,encoding",
    [
        (
            DataFrame(
                [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
                index=["A", "B"],
                columns=["X", "Y", "Z"],
            ),
            None,
        ),
        # GH 21241, 21118
        (DataFrame([["abc", "def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"),
        (DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"),
        (
            DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], columns=["X", "Y", "Z"]),
            "cp737",
        ),
    ],
)
def test_to_csv_compression(self, df, encoding, compression):
    """Round-trip ``df`` through a compressed CSV under several encodings.

    ``compression`` comes from a fixture (defined outside this chunk) that
    presumably covers the supported codecs — TODO confirm against conftest.
    """
    with tm.ensure_clean() as filename:
        df.to_csv(filename, compression=compression, encoding=encoding)
        # test the round trip - to_csv -> read_csv
        result = read_csv(
            filename, compression=compression, index_col=0, encoding=encoding
        )
        tm.assert_frame_equal(df, result)
        # test the round trip using file handle - to_csv -> read_csv
        with get_handle(
            filename, "w", compression=compression, encoding=encoding
        ) as handles:
            df.to_csv(handles.handle, encoding=encoding)
            # to_csv must not close a handle it did not open
            assert not handles.handle.closed
        result = read_csv(
            filename,
            compression=compression,
            encoding=encoding,
            index_col=0,
        ).squeeze("columns")
        tm.assert_frame_equal(df, result)
        # explicitly make sure file is compressed
        with tm.decompress_file(filename, compression) as fh:
            text = fh.read().decode(encoding or "utf8")
            for col in df.columns:
                assert col in text
        with tm.decompress_file(filename, compression) as fh:
            tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding))
def test_to_csv_date_format(self, datetime_frame):
    """``date_format`` must be applied to datetime data, index and columns."""
    with tm.ensure_clean("__tmp_to_csv_date_format__") as path:
        dt_index = datetime_frame.index
        datetime_frame = DataFrame(
            {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index
        )
        datetime_frame.to_csv(path, date_format="%Y%m%d")
        # Check that the data was put in the specified format
        test = read_csv(path, index_col=0)
        # read_csv parses pure-digit %Y%m%d cells back as plain ints
        datetime_frame_int = datetime_frame.applymap(
            lambda x: int(x.strftime("%Y%m%d"))
        )
        datetime_frame_int.index = datetime_frame_int.index.map(
            lambda x: int(x.strftime("%Y%m%d"))
        )
        tm.assert_frame_equal(test, datetime_frame_int)
        datetime_frame.to_csv(path, date_format="%Y-%m-%d")
        # Check that the data was put in the specified format
        test = read_csv(path, index_col=0)
        datetime_frame_str = datetime_frame.applymap(
            lambda x: x.strftime("%Y-%m-%d")
        )
        datetime_frame_str.index = datetime_frame_str.index.map(
            lambda x: x.strftime("%Y-%m-%d")
        )
        tm.assert_frame_equal(test, datetime_frame_str)
        # Check that columns get converted
        datetime_frame_columns = datetime_frame.T
        datetime_frame_columns.to_csv(path, date_format="%Y%m%d")
        test = read_csv(path, index_col=0)
        datetime_frame_columns = datetime_frame_columns.applymap(
            lambda x: int(x.strftime("%Y%m%d"))
        )
        # Columns don't get converted to ints by read_csv
        datetime_frame_columns.columns = datetime_frame_columns.columns.map(
            lambda x: x.strftime("%Y%m%d")
        )
        tm.assert_frame_equal(test, datetime_frame_columns)
        # test NaTs
        nat_index = to_datetime(
            ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"]
        )
        nat_frame = DataFrame({"A": nat_index}, index=nat_index)
        nat_frame.to_csv(path, date_format="%Y-%m-%d")
        # NaT is written as an empty field and must parse back to NaT
        test = read_csv(path, parse_dates=[0, 1], index_col=0)
        tm.assert_frame_equal(test, nat_frame)
  869. @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")])
  870. def test_to_csv_with_dst_transitions(self, td):
  871. with tm.ensure_clean("csv_date_format_with_dst") as path:
  872. # make sure we are not failing on transitions
  873. times = date_range(
  874. "2013-10-26 23:00",
  875. "2013-10-27 01:00",
  876. tz="Europe/London",
  877. freq="H",
  878. ambiguous="infer",
  879. )
  880. i = times + td
  881. i = i._with_freq(None) # freq is not preserved by read_csv
  882. time_range = np.array(range(len(i)), dtype="int64")
  883. df = DataFrame({"A": time_range}, index=i)
  884. df.to_csv(path, index=True)
  885. # we have to reconvert the index as we
  886. # don't parse the tz's
  887. result = read_csv(path, index_col=0)
  888. result.index = to_datetime(result.index, utc=True).tz_convert(
  889. "Europe/London"
  890. )
  891. tm.assert_frame_equal(result, df)
def test_to_csv_with_dst_transitions_with_pickle(self):
    # GH11619: tz-aware data spanning DST must survive both a CSV round
    # trip and a pickle round trip
    idx = date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris")
    idx = idx._with_freq(None)  # freq does not round-trip
    idx._data._freq = None  # otherwise there is trouble on unpickle
    df = DataFrame({"values": 1, "idx": idx}, index=idx)
    with tm.ensure_clean("csv_date_format_with_dst") as path:
        df.to_csv(path, index=True)
        result = read_csv(path, index_col=0)
        # CSV drops the timezone, so re-localize the index and the column
        result.index = to_datetime(result.index, utc=True).tz_convert(
            "Europe/Paris"
        )
        result["idx"] = to_datetime(result["idx"], utc=True).astype(
            "datetime64[ns, Europe/Paris]"
        )
        tm.assert_frame_equal(result, df)
    # assert working
    df.astype(str)
    with tm.ensure_clean("csv_date_format_with_dst") as path:
        df.to_pickle(path)
        result = pd.read_pickle(path)
        tm.assert_frame_equal(result, df)
def test_to_csv_quoting(self):
    """Exercise every ``quoting`` mode accepted by ``DataFrame.to_csv``."""
    df = DataFrame(
        {
            "c_bool": [True, False],
            "c_float": [1.0, 3.2],
            "c_int": [42, np.nan],
            "c_string": ["a", "b,c"],
        }
    )
    # default (quoting=None) behaves like QUOTE_MINIMAL
    expected_rows = [
        ",c_bool,c_float,c_int,c_string",
        "0,True,1.0,42.0,a",
        '1,False,3.2,,"b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv()
    assert result == expected
    result = df.to_csv(quoting=None)
    assert result == expected
    expected_rows = [
        ",c_bool,c_float,c_int,c_string",
        "0,True,1.0,42.0,a",
        '1,False,3.2,,"b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
    assert result == expected
    expected_rows = [
        '"","c_bool","c_float","c_int","c_string"',
        '"0","True","1.0","42.0","a"',
        '"1","False","3.2","","b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_ALL)
    assert result == expected
    # see gh-12922, gh-13259: make sure changes to
    # the formatters do not break this behaviour
    expected_rows = [
        '"","c_bool","c_float","c_int","c_string"',
        '0,True,1.0,42.0,"a"',
        '1,False,3.2,"","b,c"',
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
    assert result == expected
    # QUOTE_NONE on data containing the delimiter requires an escapechar
    msg = "need to escape, but no escapechar set"
    with pytest.raises(csv.Error, match=msg):
        df.to_csv(quoting=csv.QUOTE_NONE)
    with pytest.raises(csv.Error, match=msg):
        df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None)
    expected_rows = [
        ",c_bool,c_float,c_int,c_string",
        "0,True,1.0,42.0,a",
        "1,False,3.2,,b!,c",
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!")
    assert result == expected
    # the escapechar escapes itself too: "c_float" -> "c_ffloat"
    expected_rows = [
        ",c_bool,c_ffloat,c_int,c_string",
        "0,True,1.0,42.0,a",
        "1,False,3.2,,bf,c",
    ]
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f")
    assert result == expected
    # see gh-3503: quoting Windows line terminators
    # presents with encoding?
    text_rows = ["a,b,c", '1,"test \r\n",3']
    text = tm.convert_rows_list_to_csv_str(text_rows)
    df = read_csv(StringIO(text))
    buf = StringIO()
    df.to_csv(buf, encoding="utf-8", index=False)
    assert buf.getvalue() == text
    # xref gh-7791: make sure the quoting parameter is passed through
    # with multi-indexes
    df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    df = df.set_index(["a", "b"])
    expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
  995. def test_period_index_date_overflow(self):
  996. # see gh-15982
  997. dates = ["1990-01-01", "2000-01-01", "3005-01-01"]
  998. index = pd.PeriodIndex(dates, freq="D")
  999. df = DataFrame([4, 5, 6], index=index)
  1000. result = df.to_csv()
  1001. expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"]
  1002. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1003. assert result == expected
  1004. date_format = "%m-%d-%Y"
  1005. result = df.to_csv(date_format=date_format)
  1006. expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"]
  1007. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1008. assert result == expected
  1009. # Overflow with pd.NaT
  1010. dates = ["1990-01-01", NaT, "3005-01-01"]
  1011. index = pd.PeriodIndex(dates, freq="D")
  1012. df = DataFrame([4, 5, 6], index=index)
  1013. result = df.to_csv()
  1014. expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"]
  1015. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1016. assert result == expected
  1017. def test_multi_index_header(self):
  1018. # see gh-5539
  1019. columns = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
  1020. df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
  1021. df.columns = columns
  1022. header = ["a", "b", "c", "d"]
  1023. result = df.to_csv(header=header)
  1024. expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"]
  1025. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1026. assert result == expected
  1027. def test_to_csv_single_level_multi_index(self):
  1028. # see gh-26303
  1029. index = Index([(1,), (2,), (3,)])
  1030. df = DataFrame([[1, 2, 3]], columns=index)
  1031. df = df.reindex(columns=[(1,), (3,)])
  1032. expected = ",1,3\n0,1,3\n"
  1033. result = df.to_csv(lineterminator="\n")
  1034. tm.assert_almost_equal(result, expected)
  1035. def test_gz_lineend(self):
  1036. # GH 25311
  1037. df = DataFrame({"a": [1, 2]})
  1038. expected_rows = ["a", "1", "2"]
  1039. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1040. with tm.ensure_clean("__test_gz_lineend.csv.gz") as path:
  1041. df.to_csv(path, index=False)
  1042. with tm.decompress_file(path, compression="gzip") as f:
  1043. result = f.read().decode("utf-8")
  1044. assert result == expected
  1045. def test_to_csv_numpy_16_bug(self):
  1046. frame = DataFrame({"a": date_range("1/1/2000", periods=10)})
  1047. buf = StringIO()
  1048. frame.to_csv(buf)
  1049. result = buf.getvalue()
  1050. assert "2000-01-01" in result
  1051. def test_to_csv_na_quoting(self):
  1052. # GH 15891
  1053. # Normalize carriage return for Windows OS
  1054. result = (
  1055. DataFrame([None, None])
  1056. .to_csv(None, header=False, index=False, na_rep="")
  1057. .replace("\r\n", "\n")
  1058. )
  1059. expected = '""\n""\n'
  1060. assert result == expected
  1061. def test_to_csv_categorical_and_ea(self):
  1062. # GH#46812
  1063. df = DataFrame({"a": "x", "b": [1, pd.NA]})
  1064. df["b"] = df["b"].astype("Int16")
  1065. df["b"] = df["b"].astype("category")
  1066. result = df.to_csv()
  1067. expected_rows = [",a,b", "0,x,1", "1,x,"]
  1068. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1069. assert result == expected
  1070. def test_to_csv_categorical_and_interval(self):
  1071. # GH#46297
  1072. df = DataFrame(
  1073. {
  1074. "a": [
  1075. pd.Interval(
  1076. Timestamp("2020-01-01"),
  1077. Timestamp("2020-01-02"),
  1078. closed="both",
  1079. )
  1080. ]
  1081. }
  1082. )
  1083. df["a"] = df["a"].astype("category")
  1084. result = df.to_csv()
  1085. expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"']
  1086. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  1087. assert result == expected