123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112 |
- # pyright: reportMissingImports=false
- from __future__ import annotations
- from pandas._typing import (
- FilePath,
- ReadBuffer,
- Scalar,
- StorageOptions,
- )
- from pandas.compat._optional import import_optional_dependency
- from pandas.util._decorators import doc
- from pandas.core.shared_docs import _shared_docs
- from pandas.io.excel._base import BaseExcelReader
- class PyxlsbReader(BaseExcelReader):
- @doc(storage_options=_shared_docs["storage_options"])
- def __init__(
- self,
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- storage_options: StorageOptions = None,
- ) -> None:
- """
- Reader using pyxlsb engine.
- Parameters
- ----------
- filepath_or_buffer : str, path object, or Workbook
- Object to be parsed.
- {storage_options}
- """
- import_optional_dependency("pyxlsb")
- # This will call load_workbook on the filepath or buffer
- # And set the result to the book-attribute
- super().__init__(filepath_or_buffer, storage_options=storage_options)
- @property
- def _workbook_class(self):
- from pyxlsb import Workbook
- return Workbook
- def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
- from pyxlsb import open_workbook
- # TODO: hack in buffer capability
- # This might need some modifications to the Pyxlsb library
- # Actual work for opening it is in xlsbpackage.py, line 20-ish
- return open_workbook(filepath_or_buffer)
- @property
- def sheet_names(self) -> list[str]:
- return self.book.sheets
- def get_sheet_by_name(self, name: str):
- self.raise_if_bad_sheet_by_name(name)
- return self.book.get_sheet(name)
- def get_sheet_by_index(self, index: int):
- self.raise_if_bad_sheet_by_index(index)
- # pyxlsb sheets are indexed from 1 onwards
- # There's a fix for this in the source, but the pypi package doesn't have it
- return self.book.get_sheet(index + 1)
- def _convert_cell(self, cell) -> Scalar:
- # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
- # This means that there is no way to read datetime types from an xlsb file yet
- if cell.v is None:
- return "" # Prevents non-named columns from not showing up as Unnamed: i
- if isinstance(cell.v, float):
- val = int(cell.v)
- if val == cell.v:
- return val
- else:
- return float(cell.v)
- return cell.v
- def get_sheet_data(
- self,
- sheet,
- file_rows_needed: int | None = None,
- ) -> list[list[Scalar]]:
- data: list[list[Scalar]] = []
- prevous_row_number = -1
- # When sparse=True the rows can have different lengths and empty rows are
- # not returned. The cells are namedtuples of row, col, value (r, c, v).
- for row in sheet.rows(sparse=True):
- row_number = row[0].r
- converted_row = [self._convert_cell(cell) for cell in row]
- while converted_row and converted_row[-1] == "":
- # trim trailing empty elements
- converted_row.pop()
- if converted_row:
- data.extend([[]] * (row_number - prevous_row_number - 1))
- data.append(converted_row)
- prevous_row_number = row_number
- if file_rows_needed is not None and len(data) >= file_rows_needed:
- break
- if data:
- # extend rows to max_width
- max_width = max(len(data_row) for data_row in data)
- if min(len(data_row) for data_row in data) < max_width:
- empty_cell: list[Scalar] = [""]
- data = [
- data_row + (max_width - len(data_row)) * empty_cell
- for data_row in data
- ]
- return data
|