  1. """ test parquet compat """
  2. import datetime
  3. from io import BytesIO
  4. import os
  5. import pathlib
  6. from warnings import catch_warnings
  7. import numpy as np
  8. import pytest
  9. from pandas._config import get_option
  10. from pandas.compat import is_platform_windows
  11. from pandas.compat.pyarrow import (
  12. pa_version_under7p0,
  13. pa_version_under8p0,
  14. )
  15. import pandas.util._test_decorators as td
  16. import pandas as pd
  17. import pandas._testing as tm
  18. from pandas.util.version import Version
  19. from pandas.io.parquet import (
  20. FastParquetImpl,
  21. PyArrowImpl,
  22. get_engine,
  23. read_parquet,
  24. to_parquet,
  25. )

try:
    import pyarrow

    _HAVE_PYARROW = True
except ImportError:
    _HAVE_PYARROW = False

try:
    import fastparquet

    _HAVE_FASTPARQUET = True
except ImportError:
    _HAVE_FASTPARQUET = False


# TODO(ArrayManager) fastparquet relies on BlockManager internals

# setup engines & skips
@pytest.fixture(
    params=[
        pytest.param(
            "fastparquet",
            marks=pytest.mark.skipif(
                not _HAVE_FASTPARQUET or get_option("mode.data_manager") == "array",
                reason="fastparquet is not installed or ArrayManager is used",
            ),
        ),
        pytest.param(
            "pyarrow",
            marks=pytest.mark.skipif(
                not _HAVE_PYARROW, reason="pyarrow is not installed"
            ),
        ),
    ]
)
def engine(request):
    return request.param


@pytest.fixture
def pa():
    if not _HAVE_PYARROW:
        pytest.skip("pyarrow is not installed")
    return "pyarrow"


@pytest.fixture
def fp():
    if not _HAVE_FASTPARQUET:
        pytest.skip("fastparquet is not installed")
    elif get_option("mode.data_manager") == "array":
        pytest.skip("ArrayManager is not supported with fastparquet")
    return "fastparquet"


@pytest.fixture
def df_compat():
    return pd.DataFrame({"A": [1, 2, 3], "B": "foo"})


@pytest.fixture
def df_cross_compat():
    df = pd.DataFrame(
        {
            "a": list("abc"),
            "b": list(range(1, 4)),
            # 'c': np.arange(3, 6).astype('u1'),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("20130101", periods=3),
            # 'g': pd.date_range('20130101', periods=3,
            #      tz='US/Eastern'),
            # 'h': pd.date_range('20130101', periods=3, freq='ns')
        }
    )
    return df


@pytest.fixture
def df_full():
    return pd.DataFrame(
        {
            "string": list("abc"),
            "string_with_nan": ["a", np.nan, "c"],
            "string_with_none": ["a", None, "c"],
            "bytes": [b"foo", b"bar", b"baz"],
            "unicode": ["foo", "bar", "baz"],
            "int": list(range(1, 4)),
            "uint": np.arange(3, 6).astype("u1"),
            "float": np.arange(4.0, 7.0, dtype="float64"),
            "float_with_nan": [2.0, np.nan, 3.0],
            "bool": [True, False, True],
            "datetime": pd.date_range("20130101", periods=3),
            "datetime_with_nat": [
                pd.Timestamp("20130101"),
                pd.NaT,
                pd.Timestamp("20130103"),
            ],
        }
    )


@pytest.fixture(
    params=[
        datetime.datetime.now(datetime.timezone.utc),
        datetime.datetime.now(datetime.timezone.min),
        datetime.datetime.now(datetime.timezone.max),
        datetime.datetime.strptime("2019-01-04T16:41:24+0200", "%Y-%m-%dT%H:%M:%S%z"),
        datetime.datetime.strptime("2019-01-04T16:41:24+0215", "%Y-%m-%dT%H:%M:%S%z"),
        datetime.datetime.strptime("2019-01-04T16:41:24-0200", "%Y-%m-%dT%H:%M:%S%z"),
        datetime.datetime.strptime("2019-01-04T16:41:24-0215", "%Y-%m-%dT%H:%M:%S%z"),
    ]
)
def timezone_aware_date_list(request):
    return request.param


def check_round_trip(
    df,
    engine=None,
    path=None,
    write_kwargs=None,
    read_kwargs=None,
    expected=None,
    check_names=True,
    check_like=False,
    check_dtype=True,
    repeat=2,
):
    """Verify parquet serializer and deserializer produce the same results.

    Performs a pandas-to-disk and disk-to-pandas round trip,
    then compares the two resulting DataFrames to verify equality.

    Parameters
    ----------
    df: DataFrame
    engine: str, optional
        'pyarrow' or 'fastparquet'
    path: str, optional
    write_kwargs: dict of str:str, optional
    read_kwargs: dict of str:str, optional
    expected: DataFrame, optional
        Expected deserialization result, otherwise will be equal to `df`
    check_names: bool, optional
        Whether to compare index/column names (passed to assert_frame_equal)
    check_like: bool, optional
        If True, ignore the order of index & columns.
    check_dtype: bool, optional
        If False, do not compare column dtypes.
    repeat: int, optional
        How many times to repeat the test
    """
    write_kwargs = write_kwargs or {"compression": None}
    read_kwargs = read_kwargs or {}

    if expected is None:
        expected = df

    if engine:
        write_kwargs["engine"] = engine
        read_kwargs["engine"] = engine

    def compare(repeat):
        for _ in range(repeat):
            df.to_parquet(path, **write_kwargs)
            with catch_warnings(record=True):
                actual = read_parquet(path, **read_kwargs)

                tm.assert_frame_equal(
                    expected,
                    actual,
                    check_names=check_names,
                    check_like=check_like,
                    check_dtype=check_dtype,
                )

    if path is None:
        with tm.ensure_clean() as path:
            compare(repeat)
    else:
        compare(repeat)
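

# A minimal usage sketch of the helper above (illustrative only; the frame and
# engine choice here are assumptions, not fixtures from this suite):
#
#     df = pd.DataFrame({"x": [1, 2, 3]})
#     check_round_trip(df, engine="pyarrow", write_kwargs={"compression": None})
#
# This writes ``df`` to a temporary file twice (repeat=2 by default) and
# asserts each re-read frame equals the original.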


def check_partition_names(path, expected):
    """Check partitions of a parquet file are as expected.

    Parameters
    ----------
    path: str
        Path of the dataset.
    expected: iterable of str
        Expected partition names.
    """
    if pa_version_under7p0:
        import pyarrow.parquet as pq

        dataset = pq.ParquetDataset(path, validate_schema=False)
        assert len(dataset.partitions.partition_names) == len(expected)
        assert dataset.partitions.partition_names == set(expected)
    else:
        import pyarrow.dataset as ds

        dataset = ds.dataset(path, partitioning="hive")
        assert dataset.partitioning.schema.names == expected
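

# Sketch of the on-disk layout this helper inspects (an assumed example, not
# part of the suite): a frame written with ``partition_cols=["bool", "int"]``
# ends up as a hive-partitioned directory tree such as
#
#     dataset/
#         bool=True/int=1/<part>.parquet
#         bool=False/int=2/<part>.parquet
#
# so ``check_partition_names(path, ["bool", "int"])`` asserts that the
# partitioning schema recovered by pyarrow matches those column names in order.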


def test_invalid_engine(df_compat):
    msg = "engine must be one of 'pyarrow', 'fastparquet'"
    with pytest.raises(ValueError, match=msg):
        check_round_trip(df_compat, "foo", "bar")


def test_options_py(df_compat, pa):
    # use the set option
    with pd.option_context("io.parquet.engine", "pyarrow"):
        check_round_trip(df_compat)


def test_options_fp(df_compat, fp):
    # use the set option
    with pd.option_context("io.parquet.engine", "fastparquet"):
        check_round_trip(df_compat)


def test_options_auto(df_compat, fp, pa):
    # use the set option
    with pd.option_context("io.parquet.engine", "auto"):
        check_round_trip(df_compat)


def test_options_get_engine(fp, pa):
    assert isinstance(get_engine("pyarrow"), PyArrowImpl)
    assert isinstance(get_engine("fastparquet"), FastParquetImpl)

    with pd.option_context("io.parquet.engine", "pyarrow"):
        assert isinstance(get_engine("auto"), PyArrowImpl)
        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
        assert isinstance(get_engine("fastparquet"), FastParquetImpl)

    with pd.option_context("io.parquet.engine", "fastparquet"):
        assert isinstance(get_engine("auto"), FastParquetImpl)
        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
        assert isinstance(get_engine("fastparquet"), FastParquetImpl)

    with pd.option_context("io.parquet.engine", "auto"):
        assert isinstance(get_engine("auto"), PyArrowImpl)
        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
        assert isinstance(get_engine("fastparquet"), FastParquetImpl)


def test_get_engine_auto_error_message():
    # Expect different error messages from get_engine(engine="auto")
    # if engines aren't installed vs. are installed but bad version
    from pandas.compat._optional import VERSIONS

    # Do we have engines installed, but a bad version of them?
    pa_min_ver = VERSIONS.get("pyarrow")
    fp_min_ver = VERSIONS.get("fastparquet")
    have_pa_bad_version = (
        False
        if not _HAVE_PYARROW
        else Version(pyarrow.__version__) < Version(pa_min_ver)
    )
    have_fp_bad_version = (
        False
        if not _HAVE_FASTPARQUET
        else Version(fastparquet.__version__) < Version(fp_min_ver)
    )
    # Do we have usable engines installed?
    have_usable_pa = _HAVE_PYARROW and not have_pa_bad_version
    have_usable_fp = _HAVE_FASTPARQUET and not have_fp_bad_version

    if not have_usable_pa and not have_usable_fp:
        # No usable engines found.
        if have_pa_bad_version:
            match = f"Pandas requires version .{pa_min_ver}. or newer of .pyarrow."
            with pytest.raises(ImportError, match=match):
                get_engine("auto")
        else:
            match = "Missing optional dependency .pyarrow."
            with pytest.raises(ImportError, match=match):
                get_engine("auto")

        if have_fp_bad_version:
            match = f"Pandas requires version .{fp_min_ver}. or newer of .fastparquet."
            with pytest.raises(ImportError, match=match):
                get_engine("auto")
        else:
            match = "Missing optional dependency .fastparquet."
            with pytest.raises(ImportError, match=match):
                get_engine("auto")


def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=pa, compression=None)

        result = read_parquet(path, engine=fp)
        tm.assert_frame_equal(result, df)

        result = read_parquet(path, engine=fp, columns=["a", "d"])
        tm.assert_frame_equal(result, df[["a", "d"]])


def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines
    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        with catch_warnings(record=True):
            result = read_parquet(path, engine=pa)
            tm.assert_frame_equal(result, df)

            result = read_parquet(path, engine=pa, columns=["a", "d"])
            tm.assert_frame_equal(result, df[["a", "d"]])


class Base:
    def check_error_on_write(self, df, engine, exc, err_msg):
        # check that we are raising the exception on writing
        with tm.ensure_clean() as path:
            with pytest.raises(exc, match=err_msg):
                to_parquet(df, path, engine, compression=None)

    def check_external_error_on_write(self, df, engine, exc):
        # check that an external library is raising the exception on writing
        with tm.ensure_clean() as path:
            with tm.external_error_raised(exc):
                to_parquet(df, path, engine, compression=None)

    @pytest.mark.network
    @tm.network(
        url=(
            "https://raw.githubusercontent.com/pandas-dev/pandas/"
            "main/pandas/tests/io/data/parquet/simple.parquet"
        ),
        check_before_test=True,
    )
    def test_parquet_read_from_url(self, df_compat, engine):
        if engine != "auto":
            pytest.importorskip(engine)
        url = (
            "https://raw.githubusercontent.com/pandas-dev/pandas/"
            "main/pandas/tests/io/data/parquet/simple.parquet"
        )
        df = read_parquet(url)
        tm.assert_frame_equal(df, df_compat)


class TestBasic(Base):
    def test_error(self, engine):
        for obj in [
            pd.Series([1, 2, 3]),
            1,
            "foo",
            pd.Timestamp("20130101"),
            np.array([1, 2, 3]),
        ]:
            msg = "to_parquet only supports IO with DataFrames"
            self.check_error_on_write(obj, engine, ValueError, msg)

    def test_columns_dtypes(self, engine):
        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        # unicode
        df.columns = ["foo", "bar"]
        check_round_trip(df, engine)

    @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"])
    def test_compression(self, engine, compression):
        if compression == "snappy":
            pytest.importorskip("snappy")
        elif compression == "brotli":
            pytest.importorskip("brotli")

        df = pd.DataFrame({"A": [1, 2, 3]})
        check_round_trip(df, engine, write_kwargs={"compression": compression})

    def test_read_columns(self, engine):
        # GH18154
        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        expected = pd.DataFrame({"string": list("abc")})
        check_round_trip(
            df, engine, expected=expected, read_kwargs={"columns": ["string"]}
        )

    def test_write_index(self, engine):
        check_names = engine != "fastparquet"

        df = pd.DataFrame({"A": [1, 2, 3]})
        check_round_trip(df, engine)

        indexes = [
            [2, 3, 4],
            pd.date_range("20130101", periods=3),
            list("abc"),
            [1, 3, 4],
        ]
        # non-default index
        for index in indexes:
            df.index = index
            if isinstance(index, pd.DatetimeIndex):
                df.index = df.index._with_freq(None)  # freq doesn't round-trip
            check_round_trip(df, engine, check_names=check_names)

        # index with meta-data
        df.index = [0, 1, 2]
        df.index.name = "foo"
        check_round_trip(df, engine)

    def test_write_multiindex(self, pa):
        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
        engine = pa

        df = pd.DataFrame({"A": [1, 2, 3]})
        index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
        df.index = index
        check_round_trip(df, engine)

    def test_multiindex_with_columns(self, pa):
        engine = pa
        dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS")
        df = pd.DataFrame(np.random.randn(2 * len(dates), 3), columns=list("ABC"))
        index1 = pd.MultiIndex.from_product(
            [["Level1", "Level2"], dates], names=["level", "date"]
        )
        index2 = index1.copy(names=None)
        for index in [index1, index2]:
            df.index = index

            check_round_trip(df, engine)
            check_round_trip(
                df, engine, read_kwargs={"columns": ["A", "B"]}, expected=df[["A", "B"]]
            )

    def test_write_ignoring_index(self, engine):
        # ENH 20768
        # Ensure index=False omits the index from the written Parquet file.
        df = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]})

        write_kwargs = {"compression": None, "index": False}

        # Because we're dropping the index, we expect the loaded dataframe to
        # have the default integer index.
        expected = df.reset_index(drop=True)

        check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)

        # Ignore custom index
        df = pd.DataFrame(
            {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"]
        )

        check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)

        # Ignore multi-indexes as well.
        arrays = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        df = pd.DataFrame(
            {"one": list(range(8)), "two": [-i for i in range(8)]}, index=arrays
        )

        expected = df.reset_index(drop=True)
        check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)

    def test_write_column_multiindex(self, engine):
        # Not able to write column multi-indexes with non-string column names.
        mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)

        if engine == "fastparquet":
            self.check_error_on_write(
                df, engine, TypeError, "Column name must be a string"
            )
        elif engine == "pyarrow":
            check_round_trip(df, engine)

    def test_write_column_multiindex_nonstring(self, engine):
        # GH #34777
        # Not able to write column multi-indexes with non-string column names
        arrays = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            [1, 2, 1, 2, 1, 2, 1, 2],
        ]
        df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
        df.columns.names = ["Level1", "Level2"]
        if engine == "fastparquet":
            if Version(fastparquet.__version__) < Version("0.7.0"):
                err = TypeError
            else:
                err = ValueError
            self.check_error_on_write(df, engine, err, "Column name")
        elif engine == "pyarrow":
            check_round_trip(df, engine)

    def test_write_column_multiindex_string(self, pa):
        # GH #34777
        # Not supported in fastparquet as of 0.1.3
        engine = pa

        # Write column multi-indexes with string column names
        arrays = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
        df.columns.names = ["ColLevel1", "ColLevel2"]

        check_round_trip(df, engine)

    def test_write_column_index_string(self, pa):
        # GH #34777
        # Not supported in fastparquet as of 0.1.3
        engine = pa

        # Write column indexes with string column names
        arrays = ["bar", "baz", "foo", "qux"]
        df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
        df.columns.name = "StringCol"

        check_round_trip(df, engine)

    def test_write_column_index_nonstring(self, engine):
        # GH #34777
        # Write column indexes with non-string column names
        arrays = [1, 2, 3, 4]
        df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
        df.columns.name = "NonStringCol"
        if engine == "fastparquet":
            self.check_error_on_write(
                df, engine, TypeError, "Column name must be a string"
            )
        else:
            check_round_trip(df, engine)

    @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed")
    def test_dtype_backend(self, engine, request):
        import pyarrow.parquet as pq

        if engine == "fastparquet":
            # We are manually disabling fastparquet's
            # nullable dtype support pending discussion
            mark = pytest.mark.xfail(
                reason="Fastparquet nullable dtype support is disabled"
            )
            request.node.add_marker(mark)

        table = pyarrow.table(
            {
                "a": pyarrow.array([1, 2, 3, None], "int64"),
                "b": pyarrow.array([1, 2, 3, None], "uint8"),
                "c": pyarrow.array(["a", "b", "c", None]),
                "d": pyarrow.array([True, False, True, None]),
                # Test that nullable dtypes used even in absence of nulls
                "e": pyarrow.array([1, 2, 3, 4], "int64"),
                # GH 45694
                "f": pyarrow.array([1.0, 2.0, 3.0, None], "float32"),
                "g": pyarrow.array([1.0, 2.0, 3.0, None], "float64"),
            }
        )
        with tm.ensure_clean() as path:
            # write manually with pyarrow to write integers
            pq.write_table(table, path)
            result1 = read_parquet(path, engine=engine)
            result2 = read_parquet(path, engine=engine, dtype_backend="numpy_nullable")

        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
                "e": pd.array([1, 2, 3, 4], dtype="Int64"),
                "f": pd.array([1.0, 2.0, 3.0, None], dtype="Float32"),
                "g": pd.array([1.0, 2.0, 3.0, None], dtype="Float64"),
            }
        )
        if engine == "fastparquet":
            # Fastparquet doesn't support string columns yet
            # Only int and boolean
            result2 = result2.drop("c", axis=1)
            expected = expected.drop("c", axis=1)
        tm.assert_frame_equal(result2, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "Int64",
            "UInt8",
            "boolean",
            "object",
            "datetime64[ns, UTC]",
            "float",
            "period[D]",
            "Float64",
            "string",
        ],
    )
    def test_read_empty_array(self, pa, dtype):
        # GH #41241
        df = pd.DataFrame(
            {
                "value": pd.array([], dtype=dtype),
            }
        )
        # GH 45694
        expected = None
        if dtype == "float":
            expected = pd.DataFrame(
                {
                    "value": pd.array([], dtype="Float64"),
                }
            )
        check_round_trip(
            df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected
        )


class TestParquetPyArrow(Base):
    def test_basic(self, pa, df_full):
        df = df_full

        # additional supported types for pyarrow
        dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
        dti = dti._with_freq(None)  # freq doesn't round-trip
        df["datetime_tz"] = dti
        df["bool_with_none"] = [True, None, True]

        check_round_trip(df, pa)

    def test_basic_subset_columns(self, pa, df_full):
        # GH18628
        df = df_full
        # additional supported types for pyarrow
        df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")

        check_round_trip(
            df,
            pa,
            expected=df[["string", "int"]],
            read_kwargs={"columns": ["string", "int"]},
        )

    def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
        # GH 37105
        buf_bytes = df_full.to_parquet(engine=pa)
        assert isinstance(buf_bytes, bytes)

        buf_stream = BytesIO(buf_bytes)
        res = read_parquet(buf_stream)

        tm.assert_frame_equal(df_full, res)

    def test_duplicate_columns(self, pa):
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
        self.check_error_on_write(df, pa, ValueError, "Duplicate column names found")

    def test_timedelta(self, pa):
        df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)})
        if pa_version_under8p0:
            self.check_external_error_on_write(df, pa, NotImplementedError)
        else:
            check_round_trip(df, pa)

    def test_unsupported(self, pa):
        # mixed python objects
        df = pd.DataFrame({"a": ["a", 1, 2.0]})
        # pyarrow 0.11 raises ArrowTypeError
        # older pyarrows raise ArrowInvalid
        self.check_external_error_on_write(df, pa, pyarrow.ArrowException)

    def test_unsupported_float16(self, pa):
        # #44847, #44914
        # Not able to write float 16 column using pyarrow.
        data = np.arange(2, 10, dtype=np.float16)
        df = pd.DataFrame(data=data, columns=["fp16"])
        self.check_external_error_on_write(df, pa, pyarrow.ArrowException)

    @pytest.mark.xfail(
        is_platform_windows(),
        reason=(
            "PyArrow does not clean up partial file dumps when unsupported "
            "dtypes are passed to to_parquet on Windows"
        ),
    )
    @pytest.mark.parametrize("path_type", [str, pathlib.Path])
    def test_unsupported_float16_cleanup(self, pa, path_type):
        # #44847, #44914
        # Not able to write float 16 column using pyarrow.
        # Tests cleanup by pyarrow in case of an error
        data = np.arange(2, 10, dtype=np.float16)
        df = pd.DataFrame(data=data, columns=["fp16"])

        with tm.ensure_clean() as path_str:
            path = path_type(path_str)
            with tm.external_error_raised(pyarrow.ArrowException):
                df.to_parquet(path=path, engine=pa)
            assert not os.path.isfile(path)

    def test_categorical(self, pa):
        # supported in >= 0.7.0
        df = pd.DataFrame()
        df["a"] = pd.Categorical(list("abcdef"))

        # test for null, out-of-order values, and unobserved category
        df["b"] = pd.Categorical(
            ["bar", "foo", "foo", "bar", None, "bar"],
            dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
        )

        # test for ordered flag
        df["c"] = pd.Categorical(
            ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
        )

        check_round_trip(df, pa)

    @pytest.mark.single_cpu
    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so):
        s3fs = pytest.importorskip("s3fs")
        s3 = s3fs.S3FileSystem(**s3so)
        kw = {"filesystem": s3}
        check_round_trip(
            df_compat,
            pa,
            path="pandas-test/pyarrow.parquet",
            read_kwargs=kw,
            write_kwargs=kw,
        )

    @pytest.mark.single_cpu
    def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so):
        # GH #19134
        s3so = {"storage_options": s3so}
        check_round_trip(
            df_compat,
            pa,
            path="s3://pandas-test/pyarrow.parquet",
            read_kwargs=s3so,
            write_kwargs=s3so,
        )

    @pytest.mark.single_cpu
    @td.skip_if_no("s3fs")  # also requires flask
    @pytest.mark.parametrize(
        "partition_col",
        [
            ["A"],
            [],
        ],
    )
    def test_s3_roundtrip_for_dir(
        self, df_compat, s3_resource, pa, partition_col, s3so
    ):
        # GH #26388
        expected_df = df_compat.copy()

        # GH #35791
        if partition_col:
            expected_df = expected_df.astype(dict.fromkeys(partition_col, np.int32))
            partition_col_type = "category"

            expected_df[partition_col] = expected_df[partition_col].astype(
                partition_col_type
            )

        check_round_trip(
            df_compat,
            pa,
            expected=expected_df,
            path="s3://pandas-test/parquet_dir",
            read_kwargs={"storage_options": s3so},
            write_kwargs={
                "partition_cols": partition_col,
                "compression": None,
                "storage_options": s3so,
            },
            check_like=True,
            repeat=1,
        )

    @td.skip_if_no("pyarrow")
    def test_read_file_like_obj_support(self, df_compat):
        buffer = BytesIO()
        df_compat.to_parquet(buffer)
        df_from_buf = read_parquet(buffer)
        tm.assert_frame_equal(df_compat, df_from_buf)

    @td.skip_if_no("pyarrow")
    def test_expand_user(self, df_compat, monkeypatch):
        monkeypatch.setenv("HOME", "TestingUser")
        monkeypatch.setenv("USERPROFILE", "TestingUser")
        with pytest.raises(OSError, match=r".*TestingUser.*"):
            read_parquet("~/file.parquet")
        with pytest.raises(OSError, match=r".*TestingUser.*"):
            df_compat.to_parquet("~/file.parquet")

    def test_partition_cols_supported(self, tmp_path, pa, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        df.to_parquet(tmp_path, partition_cols=partition_cols, compression=None)
        check_partition_names(tmp_path, partition_cols)
        assert read_parquet(tmp_path).shape == df.shape

    def test_partition_cols_string(self, tmp_path, pa, df_full):
        # GH #27117
        partition_cols = "bool"
        partition_cols_list = [partition_cols]
        df = df_full
        df.to_parquet(tmp_path, partition_cols=partition_cols, compression=None)
        check_partition_names(tmp_path, partition_cols_list)
        assert read_parquet(tmp_path).shape == df.shape

    @pytest.mark.parametrize(
        "path_type", [str, lambda x: x], ids=["string", "pathlib.Path"]
    )
    def test_partition_cols_pathlib(self, tmp_path, pa, df_compat, path_type):
        # GH 35902
        partition_cols = "B"
        partition_cols_list = [partition_cols]
        df = df_compat

        path = path_type(tmp_path)

        df.to_parquet(path, partition_cols=partition_cols_list)
        assert read_parquet(path).shape == df.shape

    def test_empty_dataframe(self, pa):
        # GH #27339
        df = pd.DataFrame(index=[], columns=[])
        check_round_trip(df, pa)

    def test_write_with_schema(self, pa):
        import pyarrow

        df = pd.DataFrame({"x": [0, 1]})
        schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())])
        out_df = df.astype(bool)
        check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df)

    @td.skip_if_no("pyarrow")
    def test_additional_extension_arrays(self, pa):
        # test additional ExtensionArrays that are supported through the
        # __arrow_array__ protocol
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype="Int64"),
                "b": pd.Series([1, 2, 3], dtype="UInt32"),
                "c": pd.Series(["a", None, "c"], dtype="string"),
            }
        )
        check_round_trip(df, pa)

        df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
        check_round_trip(df, pa)

    @td.skip_if_no("pyarrow")
    def test_pyarrow_backed_string_array(self, pa, string_storage):
        # test ArrowStringArray supported through the __arrow_array__ protocol
        df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
        with pd.option_context("string_storage", string_storage):
            check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))

    @td.skip_if_no("pyarrow")
    def test_additional_extension_types(self, pa):
        # test additional ExtensionArrays that are supported through the
        # __arrow_array__ protocol + by defining a custom ExtensionType
        df = pd.DataFrame(
            {
                "c": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]),
                "d": pd.period_range("2012-01-01", periods=3, freq="D"),
                # GH-45881 issue with interval with datetime64[ns] subtype
                "e": pd.IntervalIndex.from_breaks(
                    pd.date_range("2012-01-01", periods=4, freq="D")
                ),
            }
        )
        check_round_trip(df, pa)

    def test_timestamp_nanoseconds(self, pa):
        # with version 2.6, pyarrow defaults to writing the nanoseconds, so
        # this should work without error
        # Note in previous pyarrows(<7.0.0), only the pseudo-version 2.0 was available
        if not pa_version_under7p0:
            ver = "2.6"
        else:
            ver = "2.0"
        df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)})
        check_round_trip(df, pa, write_kwargs={"version": ver})

    def test_timezone_aware_index(self, request, pa, timezone_aware_date_list):
        if (
            not pa_version_under7p0
            and timezone_aware_date_list.tzinfo != datetime.timezone.utc
        ):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="temporary skip this test until it is properly resolved: "
                    "https://github.com/pandas-dev/pandas/issues/37286"
                )
            )
        idx = 5 * [timezone_aware_date_list]
        df = pd.DataFrame(index=idx, data={"index_as_col": idx})

        # see gh-36004
        # compare time(zone) values only, skip their class:
        # pyarrow always creates fixed offset timezones using pytz.FixedOffset()
        # even if it was datetime.timezone() originally
        #
        # technically they are the same:
        # they both implement datetime.tzinfo
        # they both wrap datetime.timedelta()
        # this use-case sets the resolution to 1 minute
        check_round_trip(df, pa, check_dtype=False)

    @td.skip_if_no("pyarrow")
    def test_filter_row_groups(self, pa):
        # https://github.com/pandas-dev/pandas/issues/26551
        df = pd.DataFrame({"a": list(range(0, 3))})
        with tm.ensure_clean() as path:
            df.to_parquet(path, pa)
            result = read_parquet(
                path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False
            )
        assert len(result) == 1

    def test_read_parquet_manager(self, pa, using_array_manager):
        # ensure that read_parquet honors the pandas.options.mode.data_manager option
        df = pd.DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])

        with tm.ensure_clean() as path:
            df.to_parquet(path, pa)
            result = read_parquet(path, pa)

        if using_array_manager:
            assert isinstance(result._mgr, pd.core.internals.ArrayManager)
        else:
            assert isinstance(result._mgr, pd.core.internals.BlockManager)

    def test_read_dtype_backend_pyarrow_config(self, pa, df_full):
        import pyarrow

        df = df_full

        # additional supported types for pyarrow
        dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
        dti = dti._with_freq(None)  # freq doesn't round-trip
        df["datetime_tz"] = dti
        df["bool_with_none"] = [True, None, True]

        pa_table = pyarrow.Table.from_pandas(df)
        expected = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
        # pyarrow infers datetimes as us instead of ns
        expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]")
        expected["datetime_with_nat"] = expected["datetime_with_nat"].astype(
            "timestamp[us][pyarrow]"
        )
        expected["datetime_tz"] = expected["datetime_tz"].astype(
            pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels"))
        )

        check_round_trip(
            df,
            engine=pa,
            read_kwargs={"dtype_backend": "pyarrow"},
            expected=expected,
        )

    def test_read_dtype_backend_pyarrow_config_index(self, pa):
        df = pd.DataFrame(
            {"a": [1, 2]}, index=pd.Index([3, 4], name="test"), dtype="int64[pyarrow]"
        )
        expected = df.copy()
        import pyarrow

        if Version(pyarrow.__version__) > Version("11.0.0"):
            expected.index = expected.index.astype("int64[pyarrow]")
        check_round_trip(
            df,
            engine=pa,
            read_kwargs={"dtype_backend": "pyarrow"},
            expected=expected,
        )

    def test_columns_dtypes_not_invalid(self, pa):
        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        # numeric
        df.columns = [0, 1]
        check_round_trip(df, pa)

        # bytes
        df.columns = [b"foo", b"bar"]
        with pytest.raises(NotImplementedError, match="|S3"):
            # Bytes fails on read_parquet
            check_round_trip(df, pa)

        # python object
        df.columns = [
            datetime.datetime(2011, 1, 1, 0, 0),
            datetime.datetime(2011, 1, 1, 1, 1),
        ]
        check_round_trip(df, pa)

    def test_empty_columns(self, pa):
        # GH 52034
        df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
        check_round_trip(df, pa)


class TestParquetFastParquet(Base):
    def test_basic(self, fp, df_full):
        df = df_full

        dti = pd.date_range("20130101", periods=3, tz="US/Eastern")
        dti = dti._with_freq(None)  # freq doesn't round-trip
        df["datetime_tz"] = dti
        df["timedelta"] = pd.timedelta_range("1 day", periods=3)
        check_round_trip(df, fp)

    def test_columns_dtypes_invalid(self, fp):
        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

        err = TypeError
        msg = "Column name must be a string"

        # numeric
        df.columns = [0, 1]
        self.check_error_on_write(df, fp, err, msg)

        # bytes
        df.columns = [b"foo", b"bar"]
        self.check_error_on_write(df, fp, err, msg)

        # python object
        df.columns = [
            datetime.datetime(2011, 1, 1, 0, 0),
            datetime.datetime(2011, 1, 1, 1, 1),
        ]
        self.check_error_on_write(df, fp, err, msg)

    def test_duplicate_columns(self, fp):
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
        msg = "Cannot create parquet dataset with duplicate column names"
        self.check_error_on_write(df, fp, ValueError, msg)

    def test_bool_with_none(self, fp):
        df = pd.DataFrame({"a": [True, None, False]})
        expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
        # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
        # float64
        check_round_trip(df, fp, expected=expected, check_dtype=False)

    def test_unsupported(self, fp):
        # period
        df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)})
        # error from fastparquet -> don't check exact error message
        self.check_error_on_write(df, fp, ValueError, None)

        # mixed
        df = pd.DataFrame({"a": ["a", 1, 2.0]})
        msg = "Can't infer object conversion type"
        self.check_error_on_write(df, fp, ValueError, msg)

    def test_categorical(self, fp):
        df = pd.DataFrame({"a": pd.Categorical(list("abc"))})
        check_round_trip(df, fp)

    def test_filter_row_groups(self, fp):
        d = {"a": list(range(0, 3))}
        df = pd.DataFrame(d)
        with tm.ensure_clean() as path:
            df.to_parquet(path, fp, compression=None, row_group_offsets=1)
            result = read_parquet(path, fp, filters=[("a", "==", 0)])
        assert len(result) == 1

    @pytest.mark.single_cpu
    def test_s3_roundtrip(self, df_compat, s3_resource, fp, s3so):
        # GH #19134
        check_round_trip(
            df_compat,
            fp,
            path="s3://pandas-test/fastparquet.parquet",
            read_kwargs={"storage_options": s3so},
            write_kwargs={"compression": None, "storage_options": s3so},
        )

    def test_partition_cols_supported(self, tmp_path, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        df.to_parquet(
            tmp_path,
            engine="fastparquet",
            partition_cols=partition_cols,
            compression=None,
        )
        assert os.path.exists(tmp_path)
        import fastparquet

        actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
        assert len(actual_partition_cols) == 2

    def test_partition_cols_string(self, tmp_path, fp, df_full):
        # GH #27117
        partition_cols = "bool"
        df = df_full
        df.to_parquet(
            tmp_path,
            engine="fastparquet",
            partition_cols=partition_cols,
            compression=None,
        )
        assert os.path.exists(tmp_path)
        import fastparquet

        actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
        assert len(actual_partition_cols) == 1

    def test_partition_on_supported(self, tmp_path, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        df.to_parquet(
            tmp_path,
            engine="fastparquet",
            compression=None,
            partition_on=partition_cols,
        )
        assert os.path.exists(tmp_path)
        import fastparquet

        actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
        assert len(actual_partition_cols) == 2

    def test_error_on_using_partition_cols_and_partition_on(
        self, tmp_path, fp, df_full
    ):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        msg = (
            "Cannot use both partition_on and partition_cols. Use partition_cols for "
            "partitioning data"
        )
        with pytest.raises(ValueError, match=msg):
            df.to_parquet(
                tmp_path,
                engine="fastparquet",
                compression=None,
                partition_on=partition_cols,
                partition_cols=partition_cols,
            )

    def test_empty_dataframe(self, fp):
        # GH #27339
        df = pd.DataFrame()
        expected = df.copy()
        check_round_trip(df, fp, expected=expected)

    def test_timezone_aware_index(self, fp, timezone_aware_date_list):
        idx = 5 * [timezone_aware_date_list]

        df = pd.DataFrame(index=idx, data={"index_as_col": idx})

        expected = df.copy()
        expected.index.name = "index"
        check_round_trip(df, fp, expected=expected)

    def test_use_nullable_dtypes_not_supported(self, fp):
        df = pd.DataFrame({"a": [1, 2]})

        with tm.ensure_clean() as path:
            df.to_parquet(path)
            with pytest.raises(ValueError, match="not supported for the fastparquet"):
                with tm.assert_produces_warning(FutureWarning):
                    read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)
            with pytest.raises(ValueError, match="not supported for the fastparquet"):
                read_parquet(path, engine="fastparquet", dtype_backend="pyarrow")

    def test_close_file_handle_on_read_error(self):
        with tm.ensure_clean("test.parquet") as path:
            pathlib.Path(path).write_bytes(b"breakit")
            with pytest.raises(Exception, match=""):  # Not important which exception
                read_parquet(path, engine="fastparquet")
            # The next line raises an error on Windows if the file is still open
            pathlib.Path(path).unlink(missing_ok=False)

    def test_bytes_file_name(self, engine):
        # GH#48944
        df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]})
        with tm.ensure_clean("test.parquet") as path:
            with open(path.encode(), "wb") as f:
                df.to_parquet(f)

            result = read_parquet(path, engine=engine)
        tm.assert_frame_equal(result, df)

    def test_invalid_dtype_backend(self, engine):
        msg = (
            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
            "'pyarrow' are allowed."
        )
        df = pd.DataFrame({"int": list(range(1, 4))})
        with tm.ensure_clean("tmp.parquet") as path:
            df.to_parquet(path)
            with pytest.raises(ValueError, match=msg):
                read_parquet(path, dtype_backend="numpy")

    def test_empty_columns(self, fp):
        # GH 52034
        df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
        expected = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
        check_round_trip(df, fp, expected=expected)