  1. """ feather-format compat """
  2. from __future__ import annotations
  3. from typing import (
  4. Hashable,
  5. Sequence,
  6. )
  7. from pandas._libs import lib
  8. from pandas._typing import (
  9. DtypeBackend,
  10. FilePath,
  11. ReadBuffer,
  12. StorageOptions,
  13. WriteBuffer,
  14. )
  15. from pandas.compat._optional import import_optional_dependency
  16. from pandas.util._decorators import doc
  17. from pandas.util._validators import check_dtype_backend
  18. import pandas as pd
  19. from pandas.core.api import (
  20. DataFrame,
  21. RangeIndex,
  22. )
  23. from pandas.core.shared_docs import _shared_docs
  24. from pandas.io.common import get_handle


@doc(storage_options=_shared_docs["storage_options"])
def to_feather(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes],
    storage_options: StorageOptions = None,
    **kwargs,
) -> None:
    """
    Write a DataFrame to the binary Feather format.

    Parameters
    ----------
    df : DataFrame
    path : str, path object, or file-like object
    {storage_options}

        .. versionadded:: 1.2.0

    **kwargs :
        Additional keywords passed to `pyarrow.feather.write_feather`.

        .. versionadded:: 1.1.0
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    if not isinstance(df, DataFrame):
        raise ValueError("feather only supports IO with DataFrames")

    valid_types = {"string", "unicode"}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not df.index.dtype == "int64":
        typ = type(df.index)
        raise ValueError(
            f"feather does not support serializing {typ} "
            "for the index; you can .reset_index() to make the index into column(s)"
        )

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError(
            "feather does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )

    if df.index.name is not None:
        raise ValueError(
            "feather does not serialize index meta-data on a default index"
        )

    # validate columns
    # ----------------

    # must have valid column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    with get_handle(
        path, "wb", storage_options=storage_options, is_text=False
    ) as handles:
        feather.write_feather(df, handles.handle, **kwargs)
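

# --- Hypothetical usage sketch, not part of the pandas source --------------
# to_feather only accepts a DataFrame with a default, unnamed RangeIndex and
# string column names; the validation above raises ValueError for anything
# else before a file is written. Requires pyarrow; the file names below are
# illustrative only.
def _to_feather_example() -> None:
    import os
    import tempfile

    df = DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Default RangeIndex and string column names: accepted.
        to_feather(df, os.path.join(tmp_dir, "example.feather"))

    # Non-default index: rejected by the RangeIndex check above.
    try:
        to_feather(df.set_index("a"), "rejected.feather")
    except ValueError:
        pass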


@doc(storage_options=_shared_docs["storage_options"])
def read_feather(
    path: FilePath | ReadBuffer[bytes],
    columns: Sequence[Hashable] | None = None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
):
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: ``file://localhost/path/to/table.feather``.
    columns : sequence, default None
        If not provided, all columns are read.
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.
    {storage_options}

        .. versionadded:: 1.2.0

    dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays; nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, and pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    Returns
    -------
    type of object stored in file
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    check_dtype_backend(dtype_backend)

    with get_handle(
        path, "rb", storage_options=storage_options, is_text=False
    ) as handles:
        if dtype_backend is lib.no_default:
            return feather.read_feather(
                handles.handle, columns=columns, use_threads=bool(use_threads)
            )

        pa_table = feather.read_table(
            handles.handle, columns=columns, use_threads=bool(use_threads)
        )

        if dtype_backend == "numpy_nullable":
            from pandas.io._util import _arrow_dtype_mapping

            return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)

        elif dtype_backend == "pyarrow":
            return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
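

# --- Hypothetical usage sketch, not part of the pandas source --------------
# read_feather returns a NumPy-backed DataFrame by default; passing
# dtype_backend="numpy_nullable" or "pyarrow" goes through pyarrow's
# read_table and the corresponding types_mapper, as implemented above.
# Requires pyarrow; the dtype assertions show the expected mapping for an
# int64 column and are illustrative only.
def _read_feather_example() -> None:
    import os
    import tempfile

    df = DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "example.feather")
        to_feather(df, path)

        default = read_feather(path)  # NumPy-backed dtypes
        nullable = read_feather(path, dtype_backend="numpy_nullable")
        arrow = read_feather(path, dtype_backend="pyarrow")

        assert str(default["a"].dtype) == "int64"
        assert str(nullable["a"].dtype) == "Int64"
        assert str(arrow["a"].dtype) == "int64[pyarrow]"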