sasreader.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. """
  2. Read SAS sas7bdat or xport files.
  3. """
  4. from __future__ import annotations
  5. from abc import (
  6. ABCMeta,
  7. abstractmethod,
  8. )
  9. from types import TracebackType
  10. from typing import (
  11. TYPE_CHECKING,
  12. Hashable,
  13. overload,
  14. )
  15. from pandas._typing import (
  16. CompressionOptions,
  17. FilePath,
  18. ReadBuffer,
  19. )
  20. from pandas.util._decorators import doc
  21. from pandas.core.shared_docs import _shared_docs
  22. from pandas.io.common import stringify_path
  23. if TYPE_CHECKING:
  24. from pandas import DataFrame
  25. # TODO(PY38): replace with Protocol in Python 3.8
  26. class ReaderBase(metaclass=ABCMeta):
  27. """
  28. Protocol for XportReader and SAS7BDATReader classes.
  29. """
  30. @abstractmethod
  31. def read(self, nrows: int | None = None) -> DataFrame:
  32. pass
  33. @abstractmethod
  34. def close(self) -> None:
  35. pass
  36. def __enter__(self) -> ReaderBase:
  37. return self
  38. def __exit__(
  39. self,
  40. exc_type: type[BaseException] | None,
  41. exc_value: BaseException | None,
  42. traceback: TracebackType | None,
  43. ) -> None:
  44. self.close()
  45. @overload
  46. def read_sas(
  47. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  48. *,
  49. format: str | None = ...,
  50. index: Hashable | None = ...,
  51. encoding: str | None = ...,
  52. chunksize: int = ...,
  53. iterator: bool = ...,
  54. compression: CompressionOptions = ...,
  55. ) -> ReaderBase:
  56. ...
  57. @overload
  58. def read_sas(
  59. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  60. *,
  61. format: str | None = ...,
  62. index: Hashable | None = ...,
  63. encoding: str | None = ...,
  64. chunksize: None = ...,
  65. iterator: bool = ...,
  66. compression: CompressionOptions = ...,
  67. ) -> DataFrame | ReaderBase:
  68. ...
  69. @doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer")
  70. def read_sas(
  71. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  72. *,
  73. format: str | None = None,
  74. index: Hashable | None = None,
  75. encoding: str | None = None,
  76. chunksize: int | None = None,
  77. iterator: bool = False,
  78. compression: CompressionOptions = "infer",
  79. ) -> DataFrame | ReaderBase:
  80. """
  81. Read SAS files stored as either XPORT or SAS7BDAT format files.
  82. Parameters
  83. ----------
  84. filepath_or_buffer : str, path object, or file-like object
  85. String, path object (implementing ``os.PathLike[str]``), or file-like
  86. object implementing a binary ``read()`` function. The string could be a URL.
  87. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
  88. expected. A local file could be:
  89. ``file://localhost/path/to/table.sas7bdat``.
  90. format : str {{'xport', 'sas7bdat'}} or None
  91. If None, file format is inferred from file extension. If 'xport' or
  92. 'sas7bdat', uses the corresponding format.
  93. index : identifier of index column, defaults to None
  94. Identifier of column that should be used as index of the DataFrame.
  95. encoding : str, default is None
  96. Encoding for text data. If None, text data are stored as raw bytes.
  97. chunksize : int
  98. Read file `chunksize` lines at a time, returns iterator.
  99. .. versionchanged:: 1.2
  100. ``TextFileReader`` is a context manager.
  101. iterator : bool, defaults to False
  102. If True, returns an iterator for reading the file incrementally.
  103. .. versionchanged:: 1.2
  104. ``TextFileReader`` is a context manager.
  105. {decompression_options}
  106. Returns
  107. -------
  108. DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
  109. or XportReader
  110. """
  111. if format is None:
  112. buffer_error_msg = (
  113. "If this is a buffer object rather "
  114. "than a string name, you must specify a format string"
  115. )
  116. filepath_or_buffer = stringify_path(filepath_or_buffer)
  117. if not isinstance(filepath_or_buffer, str):
  118. raise ValueError(buffer_error_msg)
  119. fname = filepath_or_buffer.lower()
  120. if ".xpt" in fname:
  121. format = "xport"
  122. elif ".sas7bdat" in fname:
  123. format = "sas7bdat"
  124. else:
  125. raise ValueError(
  126. f"unable to infer format of SAS file from filename: {repr(fname)}"
  127. )
  128. reader: ReaderBase
  129. if format.lower() == "xport":
  130. from pandas.io.sas.sas_xport import XportReader
  131. reader = XportReader(
  132. filepath_or_buffer,
  133. index=index,
  134. encoding=encoding,
  135. chunksize=chunksize,
  136. compression=compression,
  137. )
  138. elif format.lower() == "sas7bdat":
  139. from pandas.io.sas.sas7bdat import SAS7BDATReader
  140. reader = SAS7BDATReader(
  141. filepath_or_buffer,
  142. index=index,
  143. encoding=encoding,
  144. chunksize=chunksize,
  145. compression=compression,
  146. )
  147. else:
  148. raise ValueError("unknown SAS format")
  149. if iterator or chunksize:
  150. return reader
  151. with reader:
  152. return reader.read()