parquet.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. """ parquet compat """
  2. from __future__ import annotations
  3. import io
  4. import os
  5. from typing import (
  6. Any,
  7. Literal,
  8. )
  9. import warnings
  10. from warnings import catch_warnings
  11. from pandas._libs import lib
  12. from pandas._typing import (
  13. DtypeBackend,
  14. FilePath,
  15. ReadBuffer,
  16. StorageOptions,
  17. WriteBuffer,
  18. )
  19. from pandas.compat._optional import import_optional_dependency
  20. from pandas.errors import AbstractMethodError
  21. from pandas.util._decorators import doc
  22. from pandas.util._exceptions import find_stack_level
  23. from pandas.util._validators import check_dtype_backend
  24. import pandas as pd
  25. from pandas import (
  26. DataFrame,
  27. get_option,
  28. )
  29. from pandas.core.shared_docs import _shared_docs
  30. from pandas.util.version import Version
  31. from pandas.io.common import (
  32. IOHandles,
  33. get_handle,
  34. is_fsspec_url,
  35. is_url,
  36. stringify_path,
  37. )
  38. def get_engine(engine: str) -> BaseImpl:
  39. """return our implementation"""
  40. if engine == "auto":
  41. engine = get_option("io.parquet.engine")
  42. if engine == "auto":
  43. # try engines in this order
  44. engine_classes = [PyArrowImpl, FastParquetImpl]
  45. error_msgs = ""
  46. for engine_class in engine_classes:
  47. try:
  48. return engine_class()
  49. except ImportError as err:
  50. error_msgs += "\n - " + str(err)
  51. raise ImportError(
  52. "Unable to find a usable engine; "
  53. "tried using: 'pyarrow', 'fastparquet'.\n"
  54. "A suitable version of "
  55. "pyarrow or fastparquet is required for parquet "
  56. "support.\n"
  57. "Trying to import the above resulted in these errors:"
  58. f"{error_msgs}"
  59. )
  60. if engine == "pyarrow":
  61. return PyArrowImpl()
  62. elif engine == "fastparquet":
  63. return FastParquetImpl()
  64. raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
  65. def _get_path_or_handle(
  66. path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
  67. fs: Any,
  68. storage_options: StorageOptions = None,
  69. mode: str = "rb",
  70. is_dir: bool = False,
  71. ) -> tuple[
  72. FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
  73. ]:
  74. """File handling for PyArrow."""
  75. path_or_handle = stringify_path(path)
  76. if is_fsspec_url(path_or_handle) and fs is None:
  77. fsspec = import_optional_dependency("fsspec")
  78. fs, path_or_handle = fsspec.core.url_to_fs(
  79. path_or_handle, **(storage_options or {})
  80. )
  81. elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
  82. # can't write to a remote url
  83. # without making use of fsspec at the moment
  84. raise ValueError("storage_options passed with buffer, or non-supported URL")
  85. handles = None
  86. if (
  87. not fs
  88. and not is_dir
  89. and isinstance(path_or_handle, str)
  90. and not os.path.isdir(path_or_handle)
  91. ):
  92. # use get_handle only when we are very certain that it is not a directory
  93. # fsspec resources can also point to directories
  94. # this branch is used for example when reading from non-fsspec URLs
  95. handles = get_handle(
  96. path_or_handle, mode, is_text=False, storage_options=storage_options
  97. )
  98. fs = None
  99. path_or_handle = handles.handle
  100. return path_or_handle, handles, fs
  101. class BaseImpl:
  102. @staticmethod
  103. def validate_dataframe(df: DataFrame) -> None:
  104. if not isinstance(df, DataFrame):
  105. raise ValueError("to_parquet only supports IO with DataFrames")
  106. def write(self, df: DataFrame, path, compression, **kwargs):
  107. raise AbstractMethodError(self)
  108. def read(self, path, columns=None, **kwargs) -> DataFrame:
  109. raise AbstractMethodError(self)
  110. class PyArrowImpl(BaseImpl):
  111. def __init__(self) -> None:
  112. import_optional_dependency(
  113. "pyarrow", extra="pyarrow is required for parquet support."
  114. )
  115. import pyarrow.parquet
  116. # import utils to register the pyarrow extension types
  117. import pandas.core.arrays.arrow.extension_types # pyright: ignore # noqa:F401
  118. self.api = pyarrow
  119. def write(
  120. self,
  121. df: DataFrame,
  122. path: FilePath | WriteBuffer[bytes],
  123. compression: str | None = "snappy",
  124. index: bool | None = None,
  125. storage_options: StorageOptions = None,
  126. partition_cols: list[str] | None = None,
  127. **kwargs,
  128. ) -> None:
  129. self.validate_dataframe(df)
  130. from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
  131. if index is not None:
  132. from_pandas_kwargs["preserve_index"] = index
  133. table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
  134. path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
  135. path,
  136. kwargs.pop("filesystem", None),
  137. storage_options=storage_options,
  138. mode="wb",
  139. is_dir=partition_cols is not None,
  140. )
  141. if (
  142. isinstance(path_or_handle, io.BufferedWriter)
  143. and hasattr(path_or_handle, "name")
  144. and isinstance(path_or_handle.name, (str, bytes))
  145. ):
  146. path_or_handle = path_or_handle.name
  147. if isinstance(path_or_handle, bytes):
  148. path_or_handle = path_or_handle.decode()
  149. try:
  150. if partition_cols is not None:
  151. # writes to multiple files under the given path
  152. self.api.parquet.write_to_dataset(
  153. table,
  154. path_or_handle,
  155. compression=compression,
  156. partition_cols=partition_cols,
  157. **kwargs,
  158. )
  159. else:
  160. # write to single output file
  161. self.api.parquet.write_table(
  162. table, path_or_handle, compression=compression, **kwargs
  163. )
  164. finally:
  165. if handles is not None:
  166. handles.close()
  167. def read(
  168. self,
  169. path,
  170. columns=None,
  171. use_nullable_dtypes: bool = False,
  172. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  173. storage_options: StorageOptions = None,
  174. **kwargs,
  175. ) -> DataFrame:
  176. kwargs["use_pandas_metadata"] = True
  177. to_pandas_kwargs = {}
  178. if dtype_backend == "numpy_nullable":
  179. from pandas.io._util import _arrow_dtype_mapping
  180. mapping = _arrow_dtype_mapping()
  181. to_pandas_kwargs["types_mapper"] = mapping.get
  182. elif dtype_backend == "pyarrow":
  183. to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa
  184. manager = get_option("mode.data_manager")
  185. if manager == "array":
  186. to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment]
  187. path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
  188. path,
  189. kwargs.pop("filesystem", None),
  190. storage_options=storage_options,
  191. mode="rb",
  192. )
  193. try:
  194. pa_table = self.api.parquet.read_table(
  195. path_or_handle, columns=columns, **kwargs
  196. )
  197. result = pa_table.to_pandas(**to_pandas_kwargs)
  198. if manager == "array":
  199. result = result._as_manager("array", copy=False)
  200. return result
  201. finally:
  202. if handles is not None:
  203. handles.close()
  204. class FastParquetImpl(BaseImpl):
  205. def __init__(self) -> None:
  206. # since pandas is a dependency of fastparquet
  207. # we need to import on first use
  208. fastparquet = import_optional_dependency(
  209. "fastparquet", extra="fastparquet is required for parquet support."
  210. )
  211. self.api = fastparquet
  212. def write(
  213. self,
  214. df: DataFrame,
  215. path,
  216. compression: Literal["snappy", "gzip", "brotli"] | None = "snappy",
  217. index=None,
  218. partition_cols=None,
  219. storage_options: StorageOptions = None,
  220. **kwargs,
  221. ) -> None:
  222. self.validate_dataframe(df)
  223. if "partition_on" in kwargs and partition_cols is not None:
  224. raise ValueError(
  225. "Cannot use both partition_on and "
  226. "partition_cols. Use partition_cols for partitioning data"
  227. )
  228. if "partition_on" in kwargs:
  229. partition_cols = kwargs.pop("partition_on")
  230. if partition_cols is not None:
  231. kwargs["file_scheme"] = "hive"
  232. # cannot use get_handle as write() does not accept file buffers
  233. path = stringify_path(path)
  234. if is_fsspec_url(path):
  235. fsspec = import_optional_dependency("fsspec")
  236. # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
  237. kwargs["open_with"] = lambda path, _: fsspec.open(
  238. path, "wb", **(storage_options or {})
  239. ).open()
  240. elif storage_options:
  241. raise ValueError(
  242. "storage_options passed with file object or non-fsspec file path"
  243. )
  244. with catch_warnings(record=True):
  245. self.api.write(
  246. path,
  247. df,
  248. compression=compression,
  249. write_index=index,
  250. partition_on=partition_cols,
  251. **kwargs,
  252. )
  253. def read(
  254. self, path, columns=None, storage_options: StorageOptions = None, **kwargs
  255. ) -> DataFrame:
  256. parquet_kwargs: dict[str, Any] = {}
  257. use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
  258. dtype_backend = kwargs.pop("dtype_backend", lib.no_default)
  259. if Version(self.api.__version__) >= Version("0.7.1"):
  260. # We are disabling nullable dtypes for fastparquet pending discussion
  261. parquet_kwargs["pandas_nulls"] = False
  262. if use_nullable_dtypes:
  263. raise ValueError(
  264. "The 'use_nullable_dtypes' argument is not supported for the "
  265. "fastparquet engine"
  266. )
  267. if dtype_backend is not lib.no_default:
  268. raise ValueError(
  269. "The 'dtype_backend' argument is not supported for the "
  270. "fastparquet engine"
  271. )
  272. path = stringify_path(path)
  273. handles = None
  274. if is_fsspec_url(path):
  275. fsspec = import_optional_dependency("fsspec")
  276. if Version(self.api.__version__) > Version("0.6.1"):
  277. parquet_kwargs["fs"] = fsspec.open(
  278. path, "rb", **(storage_options or {})
  279. ).fs
  280. else:
  281. parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
  282. path, "rb", **(storage_options or {})
  283. ).open()
  284. elif isinstance(path, str) and not os.path.isdir(path):
  285. # use get_handle only when we are very certain that it is not a directory
  286. # fsspec resources can also point to directories
  287. # this branch is used for example when reading from non-fsspec URLs
  288. handles = get_handle(
  289. path, "rb", is_text=False, storage_options=storage_options
  290. )
  291. path = handles.handle
  292. try:
  293. parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
  294. return parquet_file.to_pandas(columns=columns, **kwargs)
  295. finally:
  296. if handles is not None:
  297. handles.close()
  298. @doc(storage_options=_shared_docs["storage_options"])
  299. def to_parquet(
  300. df: DataFrame,
  301. path: FilePath | WriteBuffer[bytes] | None = None,
  302. engine: str = "auto",
  303. compression: str | None = "snappy",
  304. index: bool | None = None,
  305. storage_options: StorageOptions = None,
  306. partition_cols: list[str] | None = None,
  307. **kwargs,
  308. ) -> bytes | None:
  309. """
  310. Write a DataFrame to the parquet format.
  311. Parameters
  312. ----------
  313. df : DataFrame
  314. path : str, path object, file-like object, or None, default None
  315. String, path object (implementing ``os.PathLike[str]``), or file-like
  316. object implementing a binary ``write()`` function. If None, the result is
  317. returned as bytes. If a string, it will be used as Root Directory path
  318. when writing a partitioned dataset. The engine fastparquet does not
  319. accept file-like objects.
  320. .. versionchanged:: 1.2.0
  321. engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
  322. Parquet library to use. If 'auto', then the option
  323. ``io.parquet.engine`` is used. The default ``io.parquet.engine``
  324. behavior is to try 'pyarrow', falling back to 'fastparquet' if
  325. 'pyarrow' is unavailable.
  326. compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}},
  327. default 'snappy'. Name of the compression to use. Use ``None``
  328. for no compression. The supported compression methods actually
  329. depend on which engine is used. For 'pyarrow', 'snappy', 'gzip',
  330. 'brotli', 'lz4', 'zstd' are all supported. For 'fastparquet',
  331. only 'gzip' and 'snappy' are supported.
  332. index : bool, default None
  333. If ``True``, include the dataframe's index(es) in the file output. If
  334. ``False``, they will not be written to the file.
  335. If ``None``, similar to ``True`` the dataframe's index(es)
  336. will be saved. However, instead of being saved as values,
  337. the RangeIndex will be stored as a range in the metadata so it
  338. doesn't require much space and is faster. Other indexes will
  339. be included as columns in the file output.
  340. partition_cols : str or list, optional, default None
  341. Column names by which to partition the dataset.
  342. Columns are partitioned in the order they are given.
  343. Must be None if path is not a string.
  344. {storage_options}
  345. .. versionadded:: 1.2.0
  346. kwargs
  347. Additional keyword arguments passed to the engine
  348. Returns
  349. -------
  350. bytes if no path argument is provided else None
  351. """
  352. if isinstance(partition_cols, str):
  353. partition_cols = [partition_cols]
  354. impl = get_engine(engine)
  355. path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
  356. impl.write(
  357. df,
  358. path_or_buf,
  359. compression=compression,
  360. index=index,
  361. partition_cols=partition_cols,
  362. storage_options=storage_options,
  363. **kwargs,
  364. )
  365. if path is None:
  366. assert isinstance(path_or_buf, io.BytesIO)
  367. return path_or_buf.getvalue()
  368. else:
  369. return None
  370. @doc(storage_options=_shared_docs["storage_options"])
  371. def read_parquet(
  372. path: FilePath | ReadBuffer[bytes],
  373. engine: str = "auto",
  374. columns: list[str] | None = None,
  375. storage_options: StorageOptions = None,
  376. use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
  377. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  378. **kwargs,
  379. ) -> DataFrame:
  380. """
  381. Load a parquet object from the file path, returning a DataFrame.
  382. Parameters
  383. ----------
  384. path : str, path object or file-like object
  385. String, path object (implementing ``os.PathLike[str]``), or file-like
  386. object implementing a binary ``read()`` function.
  387. The string could be a URL. Valid URL schemes include http, ftp, s3,
  388. gs, and file. For file URLs, a host is expected. A local file could be:
  389. ``file://localhost/path/to/table.parquet``.
  390. A file URL can also be a path to a directory that contains multiple
  391. partitioned parquet files. Both pyarrow and fastparquet support
  392. paths to directories as well as file URLs. A directory path could be:
  393. ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
  394. engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
  395. Parquet library to use. If 'auto', then the option
  396. ``io.parquet.engine`` is used. The default ``io.parquet.engine``
  397. behavior is to try 'pyarrow', falling back to 'fastparquet' if
  398. 'pyarrow' is unavailable.
  399. columns : list, default=None
  400. If not None, only these columns will be read from the file.
  401. {storage_options}
  402. .. versionadded:: 1.3.0
  403. use_nullable_dtypes : bool, default False
  404. If True, use dtypes that use ``pd.NA`` as missing value indicator
  405. for the resulting DataFrame. (only applicable for the ``pyarrow``
  406. engine)
  407. As new dtypes are added that support ``pd.NA`` in the future, the
  408. output with this option will change to use those dtypes.
  409. Note: this is an experimental option, and behaviour (e.g. additional
  410. support dtypes) may change without notice.
  411. .. deprecated:: 2.0
  412. dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
  413. Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
  414. arrays, nullable dtypes are used for all dtypes that have a nullable
  415. implementation when "numpy_nullable" is set, pyarrow is used for all
  416. dtypes if "pyarrow" is set.
  417. The dtype_backends are still experimential.
  418. .. versionadded:: 2.0
  419. **kwargs
  420. Any additional kwargs are passed to the engine.
  421. Returns
  422. -------
  423. DataFrame
  424. """
  425. impl = get_engine(engine)
  426. if use_nullable_dtypes is not lib.no_default:
  427. msg = (
  428. "The argument 'use_nullable_dtypes' is deprecated and will be removed "
  429. "in a future version."
  430. )
  431. if use_nullable_dtypes is True:
  432. msg += (
  433. "Use dtype_backend='numpy_nullable' instead of use_nullable_dtype=True."
  434. )
  435. warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
  436. else:
  437. use_nullable_dtypes = False
  438. check_dtype_backend(dtype_backend)
  439. return impl.read(
  440. path,
  441. columns=columns,
  442. storage_options=storage_options,
  443. use_nullable_dtypes=use_nullable_dtypes,
  444. dtype_backend=dtype_backend,
  445. **kwargs,
  446. )