_odfreader.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. from __future__ import annotations
  2. from typing import (
  3. TYPE_CHECKING,
  4. cast,
  5. )
  6. import numpy as np
  7. from pandas._typing import (
  8. FilePath,
  9. ReadBuffer,
  10. Scalar,
  11. StorageOptions,
  12. )
  13. from pandas.compat._optional import import_optional_dependency
  14. from pandas.util._decorators import doc
  15. import pandas as pd
  16. from pandas.core.shared_docs import _shared_docs
  17. from pandas.io.excel._base import BaseExcelReader
  18. if TYPE_CHECKING:
  19. from pandas._libs.tslibs.nattype import NaTType
  20. @doc(storage_options=_shared_docs["storage_options"])
  21. class ODFReader(BaseExcelReader):
  22. def __init__(
  23. self,
  24. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  25. storage_options: StorageOptions = None,
  26. ) -> None:
  27. """
  28. Read tables out of OpenDocument formatted files.
  29. Parameters
  30. ----------
  31. filepath_or_buffer : str, path to be parsed or
  32. an open readable stream.
  33. {storage_options}
  34. """
  35. import_optional_dependency("odf")
  36. super().__init__(filepath_or_buffer, storage_options=storage_options)
  37. @property
  38. def _workbook_class(self):
  39. from odf.opendocument import OpenDocument
  40. return OpenDocument
  41. def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
  42. from odf.opendocument import load
  43. return load(filepath_or_buffer)
  44. @property
  45. def empty_value(self) -> str:
  46. """Property for compat with other readers."""
  47. return ""
  48. @property
  49. def sheet_names(self) -> list[str]:
  50. """Return a list of sheet names present in the document"""
  51. from odf.table import Table
  52. tables = self.book.getElementsByType(Table)
  53. return [t.getAttribute("name") for t in tables]
  54. def get_sheet_by_index(self, index: int):
  55. from odf.table import Table
  56. self.raise_if_bad_sheet_by_index(index)
  57. tables = self.book.getElementsByType(Table)
  58. return tables[index]
  59. def get_sheet_by_name(self, name: str):
  60. from odf.table import Table
  61. self.raise_if_bad_sheet_by_name(name)
  62. tables = self.book.getElementsByType(Table)
  63. for table in tables:
  64. if table.getAttribute("name") == name:
  65. return table
  66. self.close()
  67. raise ValueError(f"sheet {name} not found")
  68. def get_sheet_data(
  69. self, sheet, file_rows_needed: int | None = None
  70. ) -> list[list[Scalar | NaTType]]:
  71. """
  72. Parse an ODF Table into a list of lists
  73. """
  74. from odf.table import (
  75. CoveredTableCell,
  76. TableCell,
  77. TableRow,
  78. )
  79. covered_cell_name = CoveredTableCell().qname
  80. table_cell_name = TableCell().qname
  81. cell_names = {covered_cell_name, table_cell_name}
  82. sheet_rows = sheet.getElementsByType(TableRow)
  83. empty_rows = 0
  84. max_row_len = 0
  85. table: list[list[Scalar | NaTType]] = []
  86. for sheet_row in sheet_rows:
  87. sheet_cells = [
  88. x
  89. for x in sheet_row.childNodes
  90. if hasattr(x, "qname") and x.qname in cell_names
  91. ]
  92. empty_cells = 0
  93. table_row: list[Scalar | NaTType] = []
  94. for sheet_cell in sheet_cells:
  95. if sheet_cell.qname == table_cell_name:
  96. value = self._get_cell_value(sheet_cell)
  97. else:
  98. value = self.empty_value
  99. column_repeat = self._get_column_repeat(sheet_cell)
  100. # Queue up empty values, writing only if content succeeds them
  101. if value == self.empty_value:
  102. empty_cells += column_repeat
  103. else:
  104. table_row.extend([self.empty_value] * empty_cells)
  105. empty_cells = 0
  106. table_row.extend([value] * column_repeat)
  107. if max_row_len < len(table_row):
  108. max_row_len = len(table_row)
  109. row_repeat = self._get_row_repeat(sheet_row)
  110. if self._is_empty_row(sheet_row):
  111. empty_rows += row_repeat
  112. else:
  113. # add blank rows to our table
  114. table.extend([[self.empty_value]] * empty_rows)
  115. empty_rows = 0
  116. for _ in range(row_repeat):
  117. table.append(table_row)
  118. if file_rows_needed is not None and len(table) >= file_rows_needed:
  119. break
  120. # Make our table square
  121. for row in table:
  122. if len(row) < max_row_len:
  123. row.extend([self.empty_value] * (max_row_len - len(row)))
  124. return table
  125. def _get_row_repeat(self, row) -> int:
  126. """
  127. Return number of times this row was repeated
  128. Repeating an empty row appeared to be a common way
  129. of representing sparse rows in the table.
  130. """
  131. from odf.namespaces import TABLENS
  132. return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
  133. def _get_column_repeat(self, cell) -> int:
  134. from odf.namespaces import TABLENS
  135. return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
  136. def _is_empty_row(self, row) -> bool:
  137. """
  138. Helper function to find empty rows
  139. """
  140. for column in row.childNodes:
  141. if len(column.childNodes) > 0:
  142. return False
  143. return True
  144. def _get_cell_value(self, cell) -> Scalar | NaTType:
  145. from odf.namespaces import OFFICENS
  146. if str(cell) == "#N/A":
  147. return np.nan
  148. cell_type = cell.attributes.get((OFFICENS, "value-type"))
  149. if cell_type == "boolean":
  150. if str(cell) == "TRUE":
  151. return True
  152. return False
  153. if cell_type is None:
  154. return self.empty_value
  155. elif cell_type == "float":
  156. # GH5394
  157. cell_value = float(cell.attributes.get((OFFICENS, "value")))
  158. val = int(cell_value)
  159. if val == cell_value:
  160. return val
  161. return cell_value
  162. elif cell_type == "percentage":
  163. cell_value = cell.attributes.get((OFFICENS, "value"))
  164. return float(cell_value)
  165. elif cell_type == "string":
  166. return self._get_cell_string_value(cell)
  167. elif cell_type == "currency":
  168. cell_value = cell.attributes.get((OFFICENS, "value"))
  169. return float(cell_value)
  170. elif cell_type == "date":
  171. cell_value = cell.attributes.get((OFFICENS, "date-value"))
  172. return pd.Timestamp(cell_value)
  173. elif cell_type == "time":
  174. stamp = pd.Timestamp(str(cell))
  175. # cast needed here because Scalar doesn't include datetime.time
  176. return cast(Scalar, stamp.time())
  177. else:
  178. self.close()
  179. raise ValueError(f"Unrecognized type {cell_type}")
  180. def _get_cell_string_value(self, cell) -> str:
  181. """
  182. Find and decode OpenDocument text:s tags that represent
  183. a run length encoded sequence of space characters.
  184. """
  185. from odf.element import Element
  186. from odf.namespaces import TEXTNS
  187. from odf.text import S
  188. text_s = S().qname
  189. value = []
  190. for fragment in cell.childNodes:
  191. if isinstance(fragment, Element):
  192. if fragment.qname == text_s:
  193. spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
  194. value.append(" " * spaces)
  195. else:
  196. # recursive impl needed in case of nested fragments
  197. # with multiple spaces
  198. # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
  199. value.append(self._get_cell_string_value(fragment))
  200. else:
  201. value.append(str(fragment).strip("\n"))
  202. return "".join(value)