orc.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. """ orc compat """
  2. from __future__ import annotations
  3. import io
  4. from types import ModuleType
  5. from typing import (
  6. Any,
  7. Literal,
  8. )
  9. from pandas._libs import lib
  10. from pandas._typing import (
  11. DtypeBackend,
  12. FilePath,
  13. ReadBuffer,
  14. WriteBuffer,
  15. )
  16. from pandas.compat._optional import import_optional_dependency
  17. from pandas.util._validators import check_dtype_backend
  18. from pandas.core.dtypes.common import (
  19. is_categorical_dtype,
  20. is_interval_dtype,
  21. is_period_dtype,
  22. is_unsigned_integer_dtype,
  23. )
  24. import pandas as pd
  25. from pandas.core.frame import DataFrame
  26. from pandas.io.common import get_handle
  27. def read_orc(
  28. path: FilePath | ReadBuffer[bytes],
  29. columns: list[str] | None = None,
  30. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  31. **kwargs,
  32. ) -> DataFrame:
  33. """
  34. Load an ORC object from the file path, returning a DataFrame.
  35. Parameters
  36. ----------
  37. path : str, path object, or file-like object
  38. String, path object (implementing ``os.PathLike[str]``), or file-like
  39. object implementing a binary ``read()`` function. The string could be a URL.
  40. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
  41. expected. A local file could be:
  42. ``file://localhost/path/to/table.orc``.
  43. columns : list, default None
  44. If not None, only these columns will be read from the file.
  45. Output always follows the ordering of the file and not the columns list.
  46. This mirrors the original behaviour of
  47. :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
  48. dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
  49. Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
  50. arrays, nullable dtypes are used for all dtypes that have a nullable
  51. implementation when "numpy_nullable" is set, pyarrow is used for all
  52. dtypes if "pyarrow" is set.
  53. The dtype_backends are still experimential.
  54. .. versionadded:: 2.0
  55. **kwargs
  56. Any additional kwargs are passed to pyarrow.
  57. Returns
  58. -------
  59. DataFrame
  60. Notes
  61. -----
  62. Before using this function you should read the :ref:`user guide about ORC <io.orc>`
  63. and :ref:`install optional dependencies <install.warn_orc>`.
  64. """
  65. # we require a newer version of pyarrow than we support for parquet
  66. orc = import_optional_dependency("pyarrow.orc")
  67. check_dtype_backend(dtype_backend)
  68. with get_handle(path, "rb", is_text=False) as handles:
  69. orc_file = orc.ORCFile(handles.handle)
  70. pa_table = orc_file.read(columns=columns, **kwargs)
  71. if dtype_backend is not lib.no_default:
  72. if dtype_backend == "pyarrow":
  73. df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
  74. else:
  75. from pandas.io._util import _arrow_dtype_mapping
  76. mapping = _arrow_dtype_mapping()
  77. df = pa_table.to_pandas(types_mapper=mapping.get)
  78. return df
  79. else:
  80. return pa_table.to_pandas()
  81. def to_orc(
  82. df: DataFrame,
  83. path: FilePath | WriteBuffer[bytes] | None = None,
  84. *,
  85. engine: Literal["pyarrow"] = "pyarrow",
  86. index: bool | None = None,
  87. engine_kwargs: dict[str, Any] | None = None,
  88. ) -> bytes | None:
  89. """
  90. Write a DataFrame to the ORC format.
  91. .. versionadded:: 1.5.0
  92. Parameters
  93. ----------
  94. df : DataFrame
  95. The dataframe to be written to ORC. Raises NotImplementedError
  96. if dtype of one or more columns is category, unsigned integers,
  97. intervals, periods or sparse.
  98. path : str, file-like object or None, default None
  99. If a string, it will be used as Root Directory path
  100. when writing a partitioned dataset. By file-like object,
  101. we refer to objects with a write() method, such as a file handle
  102. (e.g. via builtin open function). If path is None,
  103. a bytes object is returned.
  104. engine : str, default 'pyarrow'
  105. ORC library to use. Pyarrow must be >= 7.0.0.
  106. index : bool, optional
  107. If ``True``, include the dataframe's index(es) in the file output. If
  108. ``False``, they will not be written to the file.
  109. If ``None``, similar to ``infer`` the dataframe's index(es)
  110. will be saved. However, instead of being saved as values,
  111. the RangeIndex will be stored as a range in the metadata so it
  112. doesn't require much space and is faster. Other indexes will
  113. be included as columns in the file output.
  114. engine_kwargs : dict[str, Any] or None, default None
  115. Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
  116. Returns
  117. -------
  118. bytes if no path argument is provided else None
  119. Raises
  120. ------
  121. NotImplementedError
  122. Dtype of one or more columns is category, unsigned integers, interval,
  123. period or sparse.
  124. ValueError
  125. engine is not pyarrow.
  126. Notes
  127. -----
  128. * Before using this function you should read the
  129. :ref:`user guide about ORC <io.orc>` and
  130. :ref:`install optional dependencies <install.warn_orc>`.
  131. * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
  132. library.
  133. * For supported dtypes please refer to `supported ORC features in Arrow
  134. <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
  135. * Currently timezones in datetime columns are not preserved when a
  136. dataframe is converted into ORC files.
  137. """
  138. if index is None:
  139. index = df.index.names[0] is not None
  140. if engine_kwargs is None:
  141. engine_kwargs = {}
  142. # If unsupported dtypes are found raise NotImplementedError
  143. # In Pyarrow 9.0.0 this check will no longer be needed
  144. for dtype in df.dtypes:
  145. if (
  146. is_categorical_dtype(dtype)
  147. or is_interval_dtype(dtype)
  148. or is_period_dtype(dtype)
  149. or is_unsigned_integer_dtype(dtype)
  150. ):
  151. raise NotImplementedError(
  152. "The dtype of one or more columns is not supported yet."
  153. )
  154. if engine != "pyarrow":
  155. raise ValueError("engine must be 'pyarrow'")
  156. engine = import_optional_dependency(engine, min_version="7.0.0")
  157. orc = import_optional_dependency("pyarrow.orc")
  158. was_none = path is None
  159. if was_none:
  160. path = io.BytesIO()
  161. assert path is not None # For mypy
  162. with get_handle(path, "wb", is_text=False) as handles:
  163. assert isinstance(engine, ModuleType) # For mypy
  164. try:
  165. orc.write_table(
  166. engine.Table.from_pandas(df, preserve_index=index),
  167. handles.handle,
  168. **engine_kwargs,
  169. )
  170. except TypeError as e:
  171. raise NotImplementedError(
  172. "The dtype of one or more columns is not supported yet."
  173. ) from e
  174. if was_none:
  175. assert isinstance(path, io.BytesIO) # For mypy
  176. return path.getvalue()
  177. return None