_pyxlsb.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. # pyright: reportMissingImports=false
  2. from __future__ import annotations
  3. from pandas._typing import (
  4. FilePath,
  5. ReadBuffer,
  6. Scalar,
  7. StorageOptions,
  8. )
  9. from pandas.compat._optional import import_optional_dependency
  10. from pandas.util._decorators import doc
  11. from pandas.core.shared_docs import _shared_docs
  12. from pandas.io.excel._base import BaseExcelReader
  13. class PyxlsbReader(BaseExcelReader):
  14. @doc(storage_options=_shared_docs["storage_options"])
  15. def __init__(
  16. self,
  17. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  18. storage_options: StorageOptions = None,
  19. ) -> None:
  20. """
  21. Reader using pyxlsb engine.
  22. Parameters
  23. ----------
  24. filepath_or_buffer : str, path object, or Workbook
  25. Object to be parsed.
  26. {storage_options}
  27. """
  28. import_optional_dependency("pyxlsb")
  29. # This will call load_workbook on the filepath or buffer
  30. # And set the result to the book-attribute
  31. super().__init__(filepath_or_buffer, storage_options=storage_options)
  32. @property
  33. def _workbook_class(self):
  34. from pyxlsb import Workbook
  35. return Workbook
  36. def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
  37. from pyxlsb import open_workbook
  38. # TODO: hack in buffer capability
  39. # This might need some modifications to the Pyxlsb library
  40. # Actual work for opening it is in xlsbpackage.py, line 20-ish
  41. return open_workbook(filepath_or_buffer)
  42. @property
  43. def sheet_names(self) -> list[str]:
  44. return self.book.sheets
  45. def get_sheet_by_name(self, name: str):
  46. self.raise_if_bad_sheet_by_name(name)
  47. return self.book.get_sheet(name)
  48. def get_sheet_by_index(self, index: int):
  49. self.raise_if_bad_sheet_by_index(index)
  50. # pyxlsb sheets are indexed from 1 onwards
  51. # There's a fix for this in the source, but the pypi package doesn't have it
  52. return self.book.get_sheet(index + 1)
  53. def _convert_cell(self, cell) -> Scalar:
  54. # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
  55. # This means that there is no way to read datetime types from an xlsb file yet
  56. if cell.v is None:
  57. return "" # Prevents non-named columns from not showing up as Unnamed: i
  58. if isinstance(cell.v, float):
  59. val = int(cell.v)
  60. if val == cell.v:
  61. return val
  62. else:
  63. return float(cell.v)
  64. return cell.v
  65. def get_sheet_data(
  66. self,
  67. sheet,
  68. file_rows_needed: int | None = None,
  69. ) -> list[list[Scalar]]:
  70. data: list[list[Scalar]] = []
  71. prevous_row_number = -1
  72. # When sparse=True the rows can have different lengths and empty rows are
  73. # not returned. The cells are namedtuples of row, col, value (r, c, v).
  74. for row in sheet.rows(sparse=True):
  75. row_number = row[0].r
  76. converted_row = [self._convert_cell(cell) for cell in row]
  77. while converted_row and converted_row[-1] == "":
  78. # trim trailing empty elements
  79. converted_row.pop()
  80. if converted_row:
  81. data.extend([[]] * (row_number - prevous_row_number - 1))
  82. data.append(converted_row)
  83. prevous_row_number = row_number
  84. if file_rows_needed is not None and len(data) >= file_rows_needed:
  85. break
  86. if data:
  87. # extend rows to max_width
  88. max_width = max(len(data_row) for data_row in data)
  89. if min(len(data_row) for data_row in data) < max_width:
  90. empty_cell: list[Scalar] = [""]
  91. data = [
  92. data_row + (max_width - len(data_row)) * empty_cell
  93. for data_row in data
  94. ]
  95. return data