_openpyxl.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. from __future__ import annotations
  2. import mmap
  3. from typing import (
  4. TYPE_CHECKING,
  5. Any,
  6. Tuple,
  7. cast,
  8. )
  9. import numpy as np
  10. from pandas._typing import (
  11. FilePath,
  12. ReadBuffer,
  13. Scalar,
  14. StorageOptions,
  15. WriteExcelBuffer,
  16. )
  17. from pandas.compat._optional import import_optional_dependency
  18. from pandas.util._decorators import doc
  19. from pandas.core.shared_docs import _shared_docs
  20. from pandas.io.excel._base import (
  21. BaseExcelReader,
  22. ExcelWriter,
  23. )
  24. from pandas.io.excel._util import (
  25. combine_kwargs,
  26. validate_freeze_panes,
  27. )
  28. if TYPE_CHECKING:
  29. from openpyxl.descriptors.serialisable import Serialisable
  30. from openpyxl.workbook import Workbook
  31. class OpenpyxlWriter(ExcelWriter):
  32. _engine = "openpyxl"
  33. _supported_extensions = (".xlsx", ".xlsm")
  34. def __init__(
  35. self,
  36. path: FilePath | WriteExcelBuffer | ExcelWriter,
  37. engine: str | None = None,
  38. date_format: str | None = None,
  39. datetime_format: str | None = None,
  40. mode: str = "w",
  41. storage_options: StorageOptions = None,
  42. if_sheet_exists: str | None = None,
  43. engine_kwargs: dict[str, Any] | None = None,
  44. **kwargs,
  45. ) -> None:
  46. # Use the openpyxl module as the Excel writer.
  47. from openpyxl.workbook import Workbook
  48. engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
  49. super().__init__(
  50. path,
  51. mode=mode,
  52. storage_options=storage_options,
  53. if_sheet_exists=if_sheet_exists,
  54. engine_kwargs=engine_kwargs,
  55. )
  56. # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from
  57. # the file and later write to it
  58. if "r+" in self._mode: # Load from existing workbook
  59. from openpyxl import load_workbook
  60. try:
  61. self._book = load_workbook(self._handles.handle, **engine_kwargs)
  62. except TypeError:
  63. self._handles.handle.close()
  64. raise
  65. self._handles.handle.seek(0)
  66. else:
  67. # Create workbook object with default optimized_write=True.
  68. try:
  69. self._book = Workbook(**engine_kwargs)
  70. except TypeError:
  71. self._handles.handle.close()
  72. raise
  73. if self.book.worksheets:
  74. self.book.remove(self.book.worksheets[0])
  75. @property
  76. def book(self) -> Workbook:
  77. """
  78. Book instance of class openpyxl.workbook.Workbook.
  79. This attribute can be used to access engine-specific features.
  80. """
  81. return self._book
  82. @property
  83. def sheets(self) -> dict[str, Any]:
  84. """Mapping of sheet names to sheet objects."""
  85. result = {name: self.book[name] for name in self.book.sheetnames}
  86. return result
  87. def _save(self) -> None:
  88. """
  89. Save workbook to disk.
  90. """
  91. self.book.save(self._handles.handle)
  92. if "r+" in self._mode and not isinstance(self._handles.handle, mmap.mmap):
  93. # truncate file to the written content
  94. self._handles.handle.truncate()
  95. @classmethod
  96. def _convert_to_style_kwargs(cls, style_dict: dict) -> dict[str, Serialisable]:
  97. """
  98. Convert a style_dict to a set of kwargs suitable for initializing
  99. or updating-on-copy an openpyxl v2 style object.
  100. Parameters
  101. ----------
  102. style_dict : dict
  103. A dict with zero or more of the following keys (or their synonyms).
  104. 'font'
  105. 'fill'
  106. 'border' ('borders')
  107. 'alignment'
  108. 'number_format'
  109. 'protection'
  110. Returns
  111. -------
  112. style_kwargs : dict
  113. A dict with the same, normalized keys as ``style_dict`` but each
  114. value has been replaced with a native openpyxl style object of the
  115. appropriate class.
  116. """
  117. _style_key_map = {"borders": "border"}
  118. style_kwargs: dict[str, Serialisable] = {}
  119. for k, v in style_dict.items():
  120. k = _style_key_map.get(k, k)
  121. _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None)
  122. new_v = _conv_to_x(v)
  123. if new_v:
  124. style_kwargs[k] = new_v
  125. return style_kwargs
  126. @classmethod
  127. def _convert_to_color(cls, color_spec):
  128. """
  129. Convert ``color_spec`` to an openpyxl v2 Color object.
  130. Parameters
  131. ----------
  132. color_spec : str, dict
  133. A 32-bit ARGB hex string, or a dict with zero or more of the
  134. following keys.
  135. 'rgb'
  136. 'indexed'
  137. 'auto'
  138. 'theme'
  139. 'tint'
  140. 'index'
  141. 'type'
  142. Returns
  143. -------
  144. color : openpyxl.styles.Color
  145. """
  146. from openpyxl.styles import Color
  147. if isinstance(color_spec, str):
  148. return Color(color_spec)
  149. else:
  150. return Color(**color_spec)
  151. @classmethod
  152. def _convert_to_font(cls, font_dict):
  153. """
  154. Convert ``font_dict`` to an openpyxl v2 Font object.
  155. Parameters
  156. ----------
  157. font_dict : dict
  158. A dict with zero or more of the following keys (or their synonyms).
  159. 'name'
  160. 'size' ('sz')
  161. 'bold' ('b')
  162. 'italic' ('i')
  163. 'underline' ('u')
  164. 'strikethrough' ('strike')
  165. 'color'
  166. 'vertAlign' ('vertalign')
  167. 'charset'
  168. 'scheme'
  169. 'family'
  170. 'outline'
  171. 'shadow'
  172. 'condense'
  173. Returns
  174. -------
  175. font : openpyxl.styles.Font
  176. """
  177. from openpyxl.styles import Font
  178. _font_key_map = {
  179. "sz": "size",
  180. "b": "bold",
  181. "i": "italic",
  182. "u": "underline",
  183. "strike": "strikethrough",
  184. "vertalign": "vertAlign",
  185. }
  186. font_kwargs = {}
  187. for k, v in font_dict.items():
  188. k = _font_key_map.get(k, k)
  189. if k == "color":
  190. v = cls._convert_to_color(v)
  191. font_kwargs[k] = v
  192. return Font(**font_kwargs)
  193. @classmethod
  194. def _convert_to_stop(cls, stop_seq):
  195. """
  196. Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
  197. suitable for initializing the ``GradientFill`` ``stop`` parameter.
  198. Parameters
  199. ----------
  200. stop_seq : iterable
  201. An iterable that yields objects suitable for consumption by
  202. ``_convert_to_color``.
  203. Returns
  204. -------
  205. stop : list of openpyxl.styles.Color
  206. """
  207. return map(cls._convert_to_color, stop_seq)
  208. @classmethod
  209. def _convert_to_fill(cls, fill_dict: dict[str, Any]):
  210. """
  211. Convert ``fill_dict`` to an openpyxl v2 Fill object.
  212. Parameters
  213. ----------
  214. fill_dict : dict
  215. A dict with one or more of the following keys (or their synonyms),
  216. 'fill_type' ('patternType', 'patterntype')
  217. 'start_color' ('fgColor', 'fgcolor')
  218. 'end_color' ('bgColor', 'bgcolor')
  219. or one or more of the following keys (or their synonyms).
  220. 'type' ('fill_type')
  221. 'degree'
  222. 'left'
  223. 'right'
  224. 'top'
  225. 'bottom'
  226. 'stop'
  227. Returns
  228. -------
  229. fill : openpyxl.styles.Fill
  230. """
  231. from openpyxl.styles import (
  232. GradientFill,
  233. PatternFill,
  234. )
  235. _pattern_fill_key_map = {
  236. "patternType": "fill_type",
  237. "patterntype": "fill_type",
  238. "fgColor": "start_color",
  239. "fgcolor": "start_color",
  240. "bgColor": "end_color",
  241. "bgcolor": "end_color",
  242. }
  243. _gradient_fill_key_map = {"fill_type": "type"}
  244. pfill_kwargs = {}
  245. gfill_kwargs = {}
  246. for k, v in fill_dict.items():
  247. pk = _pattern_fill_key_map.get(k)
  248. gk = _gradient_fill_key_map.get(k)
  249. if pk in ["start_color", "end_color"]:
  250. v = cls._convert_to_color(v)
  251. if gk == "stop":
  252. v = cls._convert_to_stop(v)
  253. if pk:
  254. pfill_kwargs[pk] = v
  255. elif gk:
  256. gfill_kwargs[gk] = v
  257. else:
  258. pfill_kwargs[k] = v
  259. gfill_kwargs[k] = v
  260. try:
  261. return PatternFill(**pfill_kwargs)
  262. except TypeError:
  263. return GradientFill(**gfill_kwargs)
  264. @classmethod
  265. def _convert_to_side(cls, side_spec):
  266. """
  267. Convert ``side_spec`` to an openpyxl v2 Side object.
  268. Parameters
  269. ----------
  270. side_spec : str, dict
  271. A string specifying the border style, or a dict with zero or more
  272. of the following keys (or their synonyms).
  273. 'style' ('border_style')
  274. 'color'
  275. Returns
  276. -------
  277. side : openpyxl.styles.Side
  278. """
  279. from openpyxl.styles import Side
  280. _side_key_map = {"border_style": "style"}
  281. if isinstance(side_spec, str):
  282. return Side(style=side_spec)
  283. side_kwargs = {}
  284. for k, v in side_spec.items():
  285. k = _side_key_map.get(k, k)
  286. if k == "color":
  287. v = cls._convert_to_color(v)
  288. side_kwargs[k] = v
  289. return Side(**side_kwargs)
  290. @classmethod
  291. def _convert_to_border(cls, border_dict):
  292. """
  293. Convert ``border_dict`` to an openpyxl v2 Border object.
  294. Parameters
  295. ----------
  296. border_dict : dict
  297. A dict with zero or more of the following keys (or their synonyms).
  298. 'left'
  299. 'right'
  300. 'top'
  301. 'bottom'
  302. 'diagonal'
  303. 'diagonal_direction'
  304. 'vertical'
  305. 'horizontal'
  306. 'diagonalUp' ('diagonalup')
  307. 'diagonalDown' ('diagonaldown')
  308. 'outline'
  309. Returns
  310. -------
  311. border : openpyxl.styles.Border
  312. """
  313. from openpyxl.styles import Border
  314. _border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}
  315. border_kwargs = {}
  316. for k, v in border_dict.items():
  317. k = _border_key_map.get(k, k)
  318. if k == "color":
  319. v = cls._convert_to_color(v)
  320. if k in ["left", "right", "top", "bottom", "diagonal"]:
  321. v = cls._convert_to_side(v)
  322. border_kwargs[k] = v
  323. return Border(**border_kwargs)
  324. @classmethod
  325. def _convert_to_alignment(cls, alignment_dict):
  326. """
  327. Convert ``alignment_dict`` to an openpyxl v2 Alignment object.
  328. Parameters
  329. ----------
  330. alignment_dict : dict
  331. A dict with zero or more of the following keys (or their synonyms).
  332. 'horizontal'
  333. 'vertical'
  334. 'text_rotation'
  335. 'wrap_text'
  336. 'shrink_to_fit'
  337. 'indent'
  338. Returns
  339. -------
  340. alignment : openpyxl.styles.Alignment
  341. """
  342. from openpyxl.styles import Alignment
  343. return Alignment(**alignment_dict)
  344. @classmethod
  345. def _convert_to_number_format(cls, number_format_dict):
  346. """
  347. Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
  348. initializer.
  349. Parameters
  350. ----------
  351. number_format_dict : dict
  352. A dict with zero or more of the following keys.
  353. 'format_code' : str
  354. Returns
  355. -------
  356. number_format : str
  357. """
  358. return number_format_dict["format_code"]
  359. @classmethod
  360. def _convert_to_protection(cls, protection_dict):
  361. """
  362. Convert ``protection_dict`` to an openpyxl v2 Protection object.
  363. Parameters
  364. ----------
  365. protection_dict : dict
  366. A dict with zero or more of the following keys.
  367. 'locked'
  368. 'hidden'
  369. Returns
  370. -------
  371. """
  372. from openpyxl.styles import Protection
  373. return Protection(**protection_dict)
  374. def _write_cells(
  375. self,
  376. cells,
  377. sheet_name: str | None = None,
  378. startrow: int = 0,
  379. startcol: int = 0,
  380. freeze_panes: tuple[int, int] | None = None,
  381. ) -> None:
  382. # Write the frame cells using openpyxl.
  383. sheet_name = self._get_sheet_name(sheet_name)
  384. _style_cache: dict[str, dict[str, Serialisable]] = {}
  385. if sheet_name in self.sheets and self._if_sheet_exists != "new":
  386. if "r+" in self._mode:
  387. if self._if_sheet_exists == "replace":
  388. old_wks = self.sheets[sheet_name]
  389. target_index = self.book.index(old_wks)
  390. del self.book[sheet_name]
  391. wks = self.book.create_sheet(sheet_name, target_index)
  392. elif self._if_sheet_exists == "error":
  393. raise ValueError(
  394. f"Sheet '{sheet_name}' already exists and "
  395. f"if_sheet_exists is set to 'error'."
  396. )
  397. elif self._if_sheet_exists == "overlay":
  398. wks = self.sheets[sheet_name]
  399. else:
  400. raise ValueError(
  401. f"'{self._if_sheet_exists}' is not valid for if_sheet_exists. "
  402. "Valid options are 'error', 'new', 'replace' and 'overlay'."
  403. )
  404. else:
  405. wks = self.sheets[sheet_name]
  406. else:
  407. wks = self.book.create_sheet()
  408. wks.title = sheet_name
  409. if validate_freeze_panes(freeze_panes):
  410. freeze_panes = cast(Tuple[int, int], freeze_panes)
  411. wks.freeze_panes = wks.cell(
  412. row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
  413. )
  414. for cell in cells:
  415. xcell = wks.cell(
  416. row=startrow + cell.row + 1, column=startcol + cell.col + 1
  417. )
  418. xcell.value, fmt = self._value_with_fmt(cell.val)
  419. if fmt:
  420. xcell.number_format = fmt
  421. style_kwargs: dict[str, Serialisable] | None = {}
  422. if cell.style:
  423. key = str(cell.style)
  424. style_kwargs = _style_cache.get(key)
  425. if style_kwargs is None:
  426. style_kwargs = self._convert_to_style_kwargs(cell.style)
  427. _style_cache[key] = style_kwargs
  428. if style_kwargs:
  429. for k, v in style_kwargs.items():
  430. setattr(xcell, k, v)
  431. if cell.mergestart is not None and cell.mergeend is not None:
  432. wks.merge_cells(
  433. start_row=startrow + cell.row + 1,
  434. start_column=startcol + cell.col + 1,
  435. end_column=startcol + cell.mergeend + 1,
  436. end_row=startrow + cell.mergestart + 1,
  437. )
  438. # When cells are merged only the top-left cell is preserved
  439. # The behaviour of the other cells in a merged range is
  440. # undefined
  441. if style_kwargs:
  442. first_row = startrow + cell.row + 1
  443. last_row = startrow + cell.mergestart + 1
  444. first_col = startcol + cell.col + 1
  445. last_col = startcol + cell.mergeend + 1
  446. for row in range(first_row, last_row + 1):
  447. for col in range(first_col, last_col + 1):
  448. if row == first_row and col == first_col:
  449. # Ignore first cell. It is already handled.
  450. continue
  451. xcell = wks.cell(column=col, row=row)
  452. for k, v in style_kwargs.items():
  453. setattr(xcell, k, v)
  454. class OpenpyxlReader(BaseExcelReader):
  455. @doc(storage_options=_shared_docs["storage_options"])
  456. def __init__(
  457. self,
  458. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  459. storage_options: StorageOptions = None,
  460. ) -> None:
  461. """
  462. Reader using openpyxl engine.
  463. Parameters
  464. ----------
  465. filepath_or_buffer : str, path object or Workbook
  466. Object to be parsed.
  467. {storage_options}
  468. """
  469. import_optional_dependency("openpyxl")
  470. super().__init__(filepath_or_buffer, storage_options=storage_options)
  471. @property
  472. def _workbook_class(self):
  473. from openpyxl import Workbook
  474. return Workbook
  475. def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
  476. from openpyxl import load_workbook
  477. return load_workbook(
  478. filepath_or_buffer, read_only=True, data_only=True, keep_links=False
  479. )
  480. @property
  481. def sheet_names(self) -> list[str]:
  482. return [sheet.title for sheet in self.book.worksheets]
  483. def get_sheet_by_name(self, name: str):
  484. self.raise_if_bad_sheet_by_name(name)
  485. return self.book[name]
  486. def get_sheet_by_index(self, index: int):
  487. self.raise_if_bad_sheet_by_index(index)
  488. return self.book.worksheets[index]
  489. def _convert_cell(self, cell) -> Scalar:
  490. from openpyxl.cell.cell import (
  491. TYPE_ERROR,
  492. TYPE_NUMERIC,
  493. )
  494. if cell.value is None:
  495. return "" # compat with xlrd
  496. elif cell.data_type == TYPE_ERROR:
  497. return np.nan
  498. elif cell.data_type == TYPE_NUMERIC:
  499. val = int(cell.value)
  500. if val == cell.value:
  501. return val
  502. return float(cell.value)
  503. return cell.value
  504. def get_sheet_data(
  505. self, sheet, file_rows_needed: int | None = None
  506. ) -> list[list[Scalar]]:
  507. if self.book.read_only:
  508. sheet.reset_dimensions()
  509. data: list[list[Scalar]] = []
  510. last_row_with_data = -1
  511. for row_number, row in enumerate(sheet.rows):
  512. converted_row = [self._convert_cell(cell) for cell in row]
  513. while converted_row and converted_row[-1] == "":
  514. # trim trailing empty elements
  515. converted_row.pop()
  516. if converted_row:
  517. last_row_with_data = row_number
  518. data.append(converted_row)
  519. if file_rows_needed is not None and len(data) >= file_rows_needed:
  520. break
  521. # Trim trailing empty rows
  522. data = data[: last_row_with_data + 1]
  523. if len(data) > 0:
  524. # extend rows to max width
  525. max_width = max(len(data_row) for data_row in data)
  526. if min(len(data_row) for data_row in data) < max_width:
  527. empty_cell: list[Scalar] = [""]
  528. data = [
  529. data_row + (max_width - len(data_row)) * empty_cell
  530. for data_row in data
  531. ]
  532. return data