format.py 70 KB


  1. """
  2. Internal module for formatting output data in csv, html, xml,
  3. and latex files. This module also applies to display formatting.
  4. """
  5. from __future__ import annotations
  6. from contextlib import contextmanager
  7. from csv import (
  8. QUOTE_NONE,
  9. QUOTE_NONNUMERIC,
  10. )
  11. from decimal import Decimal
  12. from functools import partial
  13. from io import StringIO
  14. import math
  15. import re
  16. from shutil import get_terminal_size
  17. from typing import (
  18. IO,
  19. TYPE_CHECKING,
  20. Any,
  21. Callable,
  22. Final,
  23. Generator,
  24. Hashable,
  25. Iterable,
  26. List,
  27. Mapping,
  28. Sequence,
  29. cast,
  30. )
  31. from unicodedata import east_asian_width
  32. import numpy as np
  33. from pandas._config.config import (
  34. get_option,
  35. set_option,
  36. )
  37. from pandas._libs import lib
  38. from pandas._libs.missing import NA
  39. from pandas._libs.tslibs import (
  40. NaT,
  41. Timedelta,
  42. Timestamp,
  43. get_unit_from_dtype,
  44. iNaT,
  45. periods_per_day,
  46. )
  47. from pandas._libs.tslibs.nattype import NaTType
  48. from pandas._typing import (
  49. ArrayLike,
  50. Axes,
  51. ColspaceArgType,
  52. ColspaceType,
  53. CompressionOptions,
  54. FilePath,
  55. FloatFormatType,
  56. FormattersType,
  57. IndexLabel,
  58. StorageOptions,
  59. WriteBuffer,
  60. )
  61. from pandas.core.dtypes.common import (
  62. is_categorical_dtype,
  63. is_complex_dtype,
  64. is_datetime64_dtype,
  65. is_extension_array_dtype,
  66. is_float,
  67. is_float_dtype,
  68. is_integer,
  69. is_integer_dtype,
  70. is_list_like,
  71. is_numeric_dtype,
  72. is_scalar,
  73. is_timedelta64_dtype,
  74. )
  75. from pandas.core.dtypes.dtypes import DatetimeTZDtype
  76. from pandas.core.dtypes.missing import (
  77. isna,
  78. notna,
  79. )
  80. from pandas.core.arrays import (
  81. Categorical,
  82. DatetimeArray,
  83. TimedeltaArray,
  84. )
  85. from pandas.core.arrays.string_ import StringDtype
  86. from pandas.core.base import PandasObject
  87. import pandas.core.common as com
  88. from pandas.core.construction import extract_array
  89. from pandas.core.indexes.api import (
  90. Index,
  91. MultiIndex,
  92. PeriodIndex,
  93. ensure_index,
  94. )
  95. from pandas.core.indexes.datetimes import DatetimeIndex
  96. from pandas.core.indexes.timedeltas import TimedeltaIndex
  97. from pandas.core.reshape.concat import concat
  98. from pandas.io.common import (
  99. check_parent_directory,
  100. stringify_path,
  101. )
  102. from pandas.io.formats import printing
  103. if TYPE_CHECKING:
  104. from pandas import (
  105. DataFrame,
  106. Series,
  107. )
# Shared "Parameters" section. DataFrameFormatter appends it (followed by
# ``return_docstring``) to its own ``__doc__``. The ``%(col_space_type)s``,
# ``%(col_space)s``, ``%(header_type)s`` and ``%(header)s`` placeholders are
# left unfilled here -- presumably substituted by whoever reuses this text;
# verify against the callers.
common_docstring: Final = """
Parameters
----------
buf : str, Path or StringIO-like, optional, default None
Buffer to write to. If None, the output is returned as a string.
columns : sequence, optional, default None
The subset of columns to write. Writes all columns by default.
col_space : %(col_space_type)s, optional
%(col_space)s.
header : %(header_type)s, optional
%(header)s.
index : bool, optional, default True
Whether to print index (row) labels.
na_rep : str, optional, default 'NaN'
String representation of ``NaN`` to use.
formatters : list, tuple or dict of one-param. functions, optional
Formatter functions to apply to columns' elements by position or
name.
The result of each function must be a unicode string.
List/tuple must be of length equal to the number of columns.
float_format : one-parameter function, optional, default None
Formatter function to apply to columns' elements if they are
floats. This function must return a unicode string and will be
applied only to the non-``NaN`` elements, with ``NaN`` being
handled by ``na_rep``.
.. versionchanged:: 1.2.0
sparsify : bool, optional, default True
Set to False for a DataFrame with a hierarchical index to print
every multiindex key at each row.
index_names : bool, optional, default True
Prints the names of the indexes.
justify : str, default None
How to justify the column labels. If None uses the option from
the print configuration (controlled by set_option), 'right' out
of the box. Valid values are
* left
* right
* center
* justify
* justify-all
* start
* end
* inherit
* match-parent
* initial
* unset.
max_rows : int, optional
Maximum number of rows to display in the console.
max_cols : int, optional
Maximum number of columns to display in the console.
show_dimensions : bool, default False
Display DataFrame dimensions (number of rows by number of columns).
decimal : str, default '.'
Character recognized as decimal separator, e.g. ',' in Europe.
"""

# The justify keywords listed in ``common_docstring`` above, in the same order.
_VALID_JUSTIFY_PARAMETERS = (
    "left",
    "right",
    "center",
    "justify",
    "justify-all",
    "start",
    "end",
    "inherit",
    "match-parent",
    "initial",
    "unset",
)

# Shared "Returns" section; appended after ``common_docstring`` by
# DataFrameFormatter.
return_docstring: Final = """
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns
None.
"""
  183. class CategoricalFormatter:
  184. def __init__(
  185. self,
  186. categorical: Categorical,
  187. buf: IO[str] | None = None,
  188. length: bool = True,
  189. na_rep: str = "NaN",
  190. footer: bool = True,
  191. ) -> None:
  192. self.categorical = categorical
  193. self.buf = buf if buf is not None else StringIO("")
  194. self.na_rep = na_rep
  195. self.length = length
  196. self.footer = footer
  197. self.quoting = QUOTE_NONNUMERIC
  198. def _get_footer(self) -> str:
  199. footer = ""
  200. if self.length:
  201. if footer:
  202. footer += ", "
  203. footer += f"Length: {len(self.categorical)}"
  204. level_info = self.categorical._repr_categories_info()
  205. # Levels are added in a newline
  206. if footer:
  207. footer += "\n"
  208. footer += level_info
  209. return str(footer)
  210. def _get_formatted_values(self) -> list[str]:
  211. return format_array(
  212. self.categorical._internal_get_values(),
  213. None,
  214. float_format=None,
  215. na_rep=self.na_rep,
  216. quoting=self.quoting,
  217. )
  218. def to_string(self) -> str:
  219. categorical = self.categorical
  220. if len(categorical) == 0:
  221. if self.footer:
  222. return self._get_footer()
  223. else:
  224. return ""
  225. fmt_values = self._get_formatted_values()
  226. fmt_values = [i.strip() for i in fmt_values]
  227. values = ", ".join(fmt_values)
  228. result = ["[" + values + "]"]
  229. if self.footer:
  230. footer = self._get_footer()
  231. if footer:
  232. result.append(footer)
  233. return str("\n".join(result))
  234. class SeriesFormatter:
  235. def __init__(
  236. self,
  237. series: Series,
  238. buf: IO[str] | None = None,
  239. length: bool | str = True,
  240. header: bool = True,
  241. index: bool = True,
  242. na_rep: str = "NaN",
  243. name: bool = False,
  244. float_format: str | None = None,
  245. dtype: bool = True,
  246. max_rows: int | None = None,
  247. min_rows: int | None = None,
  248. ) -> None:
  249. self.series = series
  250. self.buf = buf if buf is not None else StringIO()
  251. self.name = name
  252. self.na_rep = na_rep
  253. self.header = header
  254. self.length = length
  255. self.index = index
  256. self.max_rows = max_rows
  257. self.min_rows = min_rows
  258. if float_format is None:
  259. float_format = get_option("display.float_format")
  260. self.float_format = float_format
  261. self.dtype = dtype
  262. self.adj = get_adjustment()
  263. self._chk_truncate()
  264. def _chk_truncate(self) -> None:
  265. self.tr_row_num: int | None
  266. min_rows = self.min_rows
  267. max_rows = self.max_rows
  268. # truncation determined by max_rows, actual truncated number of rows
  269. # used below by min_rows
  270. is_truncated_vertically = max_rows and (len(self.series) > max_rows)
  271. series = self.series
  272. if is_truncated_vertically:
  273. max_rows = cast(int, max_rows)
  274. if min_rows:
  275. # if min_rows is set (not None or 0), set max_rows to minimum
  276. # of both
  277. max_rows = min(min_rows, max_rows)
  278. if max_rows == 1:
  279. row_num = max_rows
  280. series = series.iloc[:max_rows]
  281. else:
  282. row_num = max_rows // 2
  283. series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
  284. self.tr_row_num = row_num
  285. else:
  286. self.tr_row_num = None
  287. self.tr_series = series
  288. self.is_truncated_vertically = is_truncated_vertically
  289. def _get_footer(self) -> str:
  290. name = self.series.name
  291. footer = ""
  292. if getattr(self.series.index, "freq", None) is not None:
  293. assert isinstance(
  294. self.series.index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)
  295. )
  296. footer += f"Freq: {self.series.index.freqstr}"
  297. if self.name is not False and name is not None:
  298. if footer:
  299. footer += ", "
  300. series_name = printing.pprint_thing(name, escape_chars=("\t", "\r", "\n"))
  301. footer += f"Name: {series_name}"
  302. if self.length is True or (
  303. self.length == "truncate" and self.is_truncated_vertically
  304. ):
  305. if footer:
  306. footer += ", "
  307. footer += f"Length: {len(self.series)}"
  308. if self.dtype is not False and self.dtype is not None:
  309. dtype_name = getattr(self.tr_series.dtype, "name", None)
  310. if dtype_name:
  311. if footer:
  312. footer += ", "
  313. footer += f"dtype: {printing.pprint_thing(dtype_name)}"
  314. # level infos are added to the end and in a new line, like it is done
  315. # for Categoricals
  316. if is_categorical_dtype(self.tr_series.dtype):
  317. level_info = self.tr_series._values._repr_categories_info()
  318. if footer:
  319. footer += "\n"
  320. footer += level_info
  321. return str(footer)
  322. def _get_formatted_index(self) -> tuple[list[str], bool]:
  323. index = self.tr_series.index
  324. if isinstance(index, MultiIndex):
  325. have_header = any(name for name in index.names)
  326. fmt_index = index.format(names=True)
  327. else:
  328. have_header = index.name is not None
  329. fmt_index = index.format(name=True)
  330. return fmt_index, have_header
  331. def _get_formatted_values(self) -> list[str]:
  332. return format_array(
  333. self.tr_series._values,
  334. None,
  335. float_format=self.float_format,
  336. na_rep=self.na_rep,
  337. leading_space=self.index,
  338. )
  339. def to_string(self) -> str:
  340. series = self.tr_series
  341. footer = self._get_footer()
  342. if len(series) == 0:
  343. return f"{type(self.series).__name__}([], {footer})"
  344. fmt_index, have_header = self._get_formatted_index()
  345. fmt_values = self._get_formatted_values()
  346. if self.is_truncated_vertically:
  347. n_header_rows = 0
  348. row_num = self.tr_row_num
  349. row_num = cast(int, row_num)
  350. width = self.adj.len(fmt_values[row_num - 1])
  351. if width > 3:
  352. dot_str = "..."
  353. else:
  354. dot_str = ".."
  355. # Series uses mode=center because it has single value columns
  356. # DataFrame uses mode=left
  357. dot_str = self.adj.justify([dot_str], width, mode="center")[0]
  358. fmt_values.insert(row_num + n_header_rows, dot_str)
  359. fmt_index.insert(row_num + 1, "")
  360. if self.index:
  361. result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values])
  362. else:
  363. result = self.adj.adjoin(3, fmt_values)
  364. if self.header and have_header:
  365. result = fmt_index[0] + "\n" + result
  366. if footer:
  367. result += "\n" + footer
  368. return str("".join(result))
  369. class TextAdjustment:
  370. def __init__(self) -> None:
  371. self.encoding = get_option("display.encoding")
  372. def len(self, text: str) -> int:
  373. return len(text)
  374. def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]:
  375. return printing.justify(texts, max_len, mode=mode)
  376. def adjoin(self, space: int, *lists, **kwargs) -> str:
  377. return printing.adjoin(
  378. space, *lists, strlen=self.len, justfunc=self.justify, **kwargs
  379. )
  380. class EastAsianTextAdjustment(TextAdjustment):
  381. def __init__(self) -> None:
  382. super().__init__()
  383. if get_option("display.unicode.ambiguous_as_wide"):
  384. self.ambiguous_width = 2
  385. else:
  386. self.ambiguous_width = 1
  387. # Definition of East Asian Width
  388. # https://unicode.org/reports/tr11/
  389. # Ambiguous width can be changed by option
  390. self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
  391. def len(self, text: str) -> int:
  392. """
  393. Calculate display width considering unicode East Asian Width
  394. """
  395. if not isinstance(text, str):
  396. return len(text)
  397. return sum(
  398. self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text
  399. )
  400. def justify(
  401. self, texts: Iterable[str], max_len: int, mode: str = "right"
  402. ) -> list[str]:
  403. # re-calculate padding space per str considering East Asian Width
  404. def _get_pad(t):
  405. return max_len - self.len(t) + len(t)
  406. if mode == "left":
  407. return [x.ljust(_get_pad(x)) for x in texts]
  408. elif mode == "center":
  409. return [x.center(_get_pad(x)) for x in texts]
  410. else:
  411. return [x.rjust(_get_pad(x)) for x in texts]
  412. def get_adjustment() -> TextAdjustment:
  413. use_east_asian_width = get_option("display.unicode.east_asian_width")
  414. if use_east_asian_width:
  415. return EastAsianTextAdjustment()
  416. else:
  417. return TextAdjustment()
  418. def get_dataframe_repr_params() -> dict[str, Any]:
  419. """Get the parameters used to repr(dataFrame) calls using DataFrame.to_string.
  420. Supplying these parameters to DataFrame.to_string is equivalent to calling
  421. ``repr(DataFrame)``. This is useful if you want to adjust the repr output.
  422. .. versionadded:: 1.4.0
  423. Example
  424. -------
  425. >>> import pandas as pd
  426. >>>
  427. >>> df = pd.DataFrame([[1, 2], [3, 4]])
  428. >>> repr_params = pd.io.formats.format.get_dataframe_repr_params()
  429. >>> repr(df) == df.to_string(**repr_params)
  430. True
  431. """
  432. from pandas.io.formats import console
  433. if get_option("display.expand_frame_repr"):
  434. line_width, _ = console.get_console_size()
  435. else:
  436. line_width = None
  437. return {
  438. "max_rows": get_option("display.max_rows"),
  439. "min_rows": get_option("display.min_rows"),
  440. "max_cols": get_option("display.max_columns"),
  441. "max_colwidth": get_option("display.max_colwidth"),
  442. "show_dimensions": get_option("display.show_dimensions"),
  443. "line_width": line_width,
  444. }
  445. def get_series_repr_params() -> dict[str, Any]:
  446. """Get the parameters used to repr(Series) calls using Series.to_string.
  447. Supplying these parameters to Series.to_string is equivalent to calling
  448. ``repr(series)``. This is useful if you want to adjust the series repr output.
  449. .. versionadded:: 1.4.0
  450. Example
  451. -------
  452. >>> import pandas as pd
  453. >>>
  454. >>> ser = pd.Series([1, 2, 3, 4])
  455. >>> repr_params = pd.io.formats.format.get_series_repr_params()
  456. >>> repr(ser) == ser.to_string(**repr_params)
  457. True
  458. """
  459. width, height = get_terminal_size()
  460. max_rows = (
  461. height
  462. if get_option("display.max_rows") == 0
  463. else get_option("display.max_rows")
  464. )
  465. min_rows = (
  466. height
  467. if get_option("display.max_rows") == 0
  468. else get_option("display.min_rows")
  469. )
  470. return {
  471. "name": True,
  472. "dtype": True,
  473. "min_rows": min_rows,
  474. "max_rows": max_rows,
  475. "length": get_option("display.show_dimensions"),
  476. }
  477. class DataFrameFormatter:
  478. """Class for processing dataframe formatting options and data."""
  479. __doc__ = __doc__ if __doc__ else ""
  480. __doc__ += common_docstring + return_docstring
  481. def __init__(
  482. self,
  483. frame: DataFrame,
  484. columns: Sequence[Hashable] | None = None,
  485. col_space: ColspaceArgType | None = None,
  486. header: bool | Sequence[str] = True,
  487. index: bool = True,
  488. na_rep: str = "NaN",
  489. formatters: FormattersType | None = None,
  490. justify: str | None = None,
  491. float_format: FloatFormatType | None = None,
  492. sparsify: bool | None = None,
  493. index_names: bool = True,
  494. max_rows: int | None = None,
  495. min_rows: int | None = None,
  496. max_cols: int | None = None,
  497. show_dimensions: bool | str = False,
  498. decimal: str = ".",
  499. bold_rows: bool = False,
  500. escape: bool = True,
  501. ) -> None:
  502. self.frame = frame
  503. self.columns = self._initialize_columns(columns)
  504. self.col_space = self._initialize_colspace(col_space)
  505. self.header = header
  506. self.index = index
  507. self.na_rep = na_rep
  508. self.formatters = self._initialize_formatters(formatters)
  509. self.justify = self._initialize_justify(justify)
  510. self.float_format = float_format
  511. self.sparsify = self._initialize_sparsify(sparsify)
  512. self.show_index_names = index_names
  513. self.decimal = decimal
  514. self.bold_rows = bold_rows
  515. self.escape = escape
  516. self.max_rows = max_rows
  517. self.min_rows = min_rows
  518. self.max_cols = max_cols
  519. self.show_dimensions = show_dimensions
  520. self.max_cols_fitted = self._calc_max_cols_fitted()
  521. self.max_rows_fitted = self._calc_max_rows_fitted()
  522. self.tr_frame = self.frame
  523. self.truncate()
  524. self.adj = get_adjustment()
  525. def get_strcols(self) -> list[list[str]]:
  526. """
  527. Render a DataFrame to a list of columns (as lists of strings).
  528. """
  529. strcols = self._get_strcols_without_index()
  530. if self.index:
  531. str_index = self._get_formatted_index(self.tr_frame)
  532. strcols.insert(0, str_index)
  533. return strcols
  534. @property
  535. def should_show_dimensions(self) -> bool:
  536. return self.show_dimensions is True or (
  537. self.show_dimensions == "truncate" and self.is_truncated
  538. )
  539. @property
  540. def is_truncated(self) -> bool:
  541. return bool(self.is_truncated_horizontally or self.is_truncated_vertically)
  542. @property
  543. def is_truncated_horizontally(self) -> bool:
  544. return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted))
  545. @property
  546. def is_truncated_vertically(self) -> bool:
  547. return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted))
  548. @property
  549. def dimensions_info(self) -> str:
  550. return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]"
  551. @property
  552. def has_index_names(self) -> bool:
  553. return _has_names(self.frame.index)
  554. @property
  555. def has_column_names(self) -> bool:
  556. return _has_names(self.frame.columns)
  557. @property
  558. def show_row_idx_names(self) -> bool:
  559. return all((self.has_index_names, self.index, self.show_index_names))
  560. @property
  561. def show_col_idx_names(self) -> bool:
  562. return all((self.has_column_names, self.show_index_names, self.header))
  563. @property
  564. def max_rows_displayed(self) -> int:
  565. return min(self.max_rows or len(self.frame), len(self.frame))
  566. def _initialize_sparsify(self, sparsify: bool | None) -> bool:
  567. if sparsify is None:
  568. return get_option("display.multi_sparse")
  569. return sparsify
  570. def _initialize_formatters(
  571. self, formatters: FormattersType | None
  572. ) -> FormattersType:
  573. if formatters is None:
  574. return {}
  575. elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict):
  576. return formatters
  577. else:
  578. raise ValueError(
  579. f"Formatters length({len(formatters)}) should match "
  580. f"DataFrame number of columns({len(self.frame.columns)})"
  581. )
  582. def _initialize_justify(self, justify: str | None) -> str:
  583. if justify is None:
  584. return get_option("display.colheader_justify")
  585. else:
  586. return justify
  587. def _initialize_columns(self, columns: Sequence[Hashable] | None) -> Index:
  588. if columns is not None:
  589. # GH 47231 - columns doesn't have to be `Sequence[str]`
  590. # Will fix in later PR
  591. cols = ensure_index(cast(Axes, columns))
  592. self.frame = self.frame[cols]
  593. return cols
  594. else:
  595. return self.frame.columns
  596. def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceType:
  597. result: ColspaceType
  598. if col_space is None:
  599. result = {}
  600. elif isinstance(col_space, (int, str)):
  601. result = {"": col_space}
  602. result.update({column: col_space for column in self.frame.columns})
  603. elif isinstance(col_space, Mapping):
  604. for column in col_space.keys():
  605. if column not in self.frame.columns and column != "":
  606. raise ValueError(
  607. f"Col_space is defined for an unknown column: {column}"
  608. )
  609. result = col_space
  610. else:
  611. if len(self.frame.columns) != len(col_space):
  612. raise ValueError(
  613. f"Col_space length({len(col_space)}) should match "
  614. f"DataFrame number of columns({len(self.frame.columns)})"
  615. )
  616. result = dict(zip(self.frame.columns, col_space))
  617. return result
  618. def _calc_max_cols_fitted(self) -> int | None:
  619. """Number of columns fitting the screen."""
  620. if not self._is_in_terminal():
  621. return self.max_cols
  622. width, _ = get_terminal_size()
  623. if self._is_screen_narrow(width):
  624. return width
  625. else:
  626. return self.max_cols
  627. def _calc_max_rows_fitted(self) -> int | None:
  628. """Number of rows with data fitting the screen."""
  629. max_rows: int | None
  630. if self._is_in_terminal():
  631. _, height = get_terminal_size()
  632. if self.max_rows == 0:
  633. # rows available to fill with actual data
  634. return height - self._get_number_of_auxillary_rows()
  635. if self._is_screen_short(height):
  636. max_rows = height
  637. else:
  638. max_rows = self.max_rows
  639. else:
  640. max_rows = self.max_rows
  641. return self._adjust_max_rows(max_rows)
  642. def _adjust_max_rows(self, max_rows: int | None) -> int | None:
  643. """Adjust max_rows using display logic.
  644. See description here:
  645. https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options
  646. GH #37359
  647. """
  648. if max_rows:
  649. if (len(self.frame) > max_rows) and self.min_rows:
  650. # if truncated, set max_rows showed to min_rows
  651. max_rows = min(self.min_rows, max_rows)
  652. return max_rows
  653. def _is_in_terminal(self) -> bool:
  654. """Check if the output is to be shown in terminal."""
  655. return bool(self.max_cols == 0 or self.max_rows == 0)
  656. def _is_screen_narrow(self, max_width) -> bool:
  657. return bool(self.max_cols == 0 and len(self.frame.columns) > max_width)
  658. def _is_screen_short(self, max_height) -> bool:
  659. return bool(self.max_rows == 0 and len(self.frame) > max_height)
  660. def _get_number_of_auxillary_rows(self) -> int:
  661. """Get number of rows occupied by prompt, dots and dimension info."""
  662. dot_row = 1
  663. prompt_row = 1
  664. num_rows = dot_row + prompt_row
  665. if self.show_dimensions:
  666. num_rows += len(self.dimensions_info.splitlines())
  667. if self.header:
  668. num_rows += 1
  669. return num_rows
  670. def truncate(self) -> None:
  671. """
  672. Check whether the frame should be truncated. If so, slice the frame up.
  673. """
  674. if self.is_truncated_horizontally:
  675. self._truncate_horizontally()
  676. if self.is_truncated_vertically:
  677. self._truncate_vertically()
  678. def _truncate_horizontally(self) -> None:
  679. """Remove columns, which are not to be displayed and adjust formatters.
  680. Attributes affected:
  681. - tr_frame
  682. - formatters
  683. - tr_col_num
  684. """
  685. assert self.max_cols_fitted is not None
  686. col_num = self.max_cols_fitted // 2
  687. if col_num >= 1:
  688. left = self.tr_frame.iloc[:, :col_num]
  689. right = self.tr_frame.iloc[:, -col_num:]
  690. self.tr_frame = concat((left, right), axis=1)
  691. # truncate formatter
  692. if isinstance(self.formatters, (list, tuple)):
  693. self.formatters = [
  694. *self.formatters[:col_num],
  695. *self.formatters[-col_num:],
  696. ]
  697. else:
  698. col_num = cast(int, self.max_cols)
  699. self.tr_frame = self.tr_frame.iloc[:, :col_num]
  700. self.tr_col_num = col_num
  701. def _truncate_vertically(self) -> None:
  702. """Remove rows, which are not to be displayed.
  703. Attributes affected:
  704. - tr_frame
  705. - tr_row_num
  706. """
  707. assert self.max_rows_fitted is not None
  708. row_num = self.max_rows_fitted // 2
  709. if row_num >= 1:
  710. head = self.tr_frame.iloc[:row_num, :]
  711. tail = self.tr_frame.iloc[-row_num:, :]
  712. self.tr_frame = concat((head, tail))
  713. else:
  714. row_num = cast(int, self.max_rows)
  715. self.tr_frame = self.tr_frame.iloc[:row_num, :]
  716. self.tr_row_num = row_num
  717. def _get_strcols_without_index(self) -> list[list[str]]:
  718. strcols: list[list[str]] = []
  719. if not is_list_like(self.header) and not self.header:
  720. for i, c in enumerate(self.tr_frame):
  721. fmt_values = self.format_col(i)
  722. fmt_values = _make_fixed_width(
  723. strings=fmt_values,
  724. justify=self.justify,
  725. minimum=int(self.col_space.get(c, 0)),
  726. adj=self.adj,
  727. )
  728. strcols.append(fmt_values)
  729. return strcols
  730. if is_list_like(self.header):
  731. # cast here since can't be bool if is_list_like
  732. self.header = cast(List[str], self.header)
  733. if len(self.header) != len(self.columns):
  734. raise ValueError(
  735. f"Writing {len(self.columns)} cols "
  736. f"but got {len(self.header)} aliases"
  737. )
  738. str_columns = [[label] for label in self.header]
  739. else:
  740. str_columns = self._get_formatted_column_labels(self.tr_frame)
  741. if self.show_row_idx_names:
  742. for x in str_columns:
  743. x.append("")
  744. for i, c in enumerate(self.tr_frame):
  745. cheader = str_columns[i]
  746. header_colwidth = max(
  747. int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
  748. )
  749. fmt_values = self.format_col(i)
  750. fmt_values = _make_fixed_width(
  751. fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
  752. )
  753. max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)
  754. cheader = self.adj.justify(cheader, max_len, mode=self.justify)
  755. strcols.append(cheader + fmt_values)
  756. return strcols
  757. def format_col(self, i: int) -> list[str]:
  758. frame = self.tr_frame
  759. formatter = self._get_formatter(i)
  760. return format_array(
  761. frame.iloc[:, i]._values,
  762. formatter,
  763. float_format=self.float_format,
  764. na_rep=self.na_rep,
  765. space=self.col_space.get(frame.columns[i]),
  766. decimal=self.decimal,
  767. leading_space=self.index,
  768. )
  769. def _get_formatter(self, i: str | int) -> Callable | None:
  770. if isinstance(self.formatters, (list, tuple)):
  771. if is_integer(i):
  772. i = cast(int, i)
  773. return self.formatters[i]
  774. else:
  775. return None
  776. else:
  777. if is_integer(i) and i not in self.columns:
  778. i = self.columns[i]
  779. return self.formatters.get(i, None)
  780. def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]:
  781. from pandas.core.indexes.multi import sparsify_labels
  782. columns = frame.columns
  783. if isinstance(columns, MultiIndex):
  784. fmt_columns = columns.format(sparsify=False, adjoin=False)
  785. fmt_columns = list(zip(*fmt_columns))
  786. dtypes = self.frame.dtypes._values
  787. # if we have a Float level, they don't use leading space at all
  788. restrict_formatting = any(level.is_floating for level in columns.levels)
  789. need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
  790. def space_format(x, y):
  791. if (
  792. y not in self.formatters
  793. and need_leadsp[x]
  794. and not restrict_formatting
  795. ):
  796. return " " + y
  797. return y
  798. str_columns = list(
  799. zip(*([space_format(x, y) for y in x] for x in fmt_columns))
  800. )
  801. if self.sparsify and len(str_columns):
  802. str_columns = sparsify_labels(str_columns)
  803. str_columns = [list(x) for x in zip(*str_columns)]
  804. else:
  805. fmt_columns = columns.format()
  806. dtypes = self.frame.dtypes
  807. need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
  808. str_columns = [
  809. [" " + x if not self._get_formatter(i) and need_leadsp[x] else x]
  810. for i, x in enumerate(fmt_columns)
  811. ]
  812. # self.str_columns = str_columns
  813. return str_columns
  814. def _get_formatted_index(self, frame: DataFrame) -> list[str]:
  815. # Note: this is only used by to_string() and to_latex(), not by
  816. # to_html(). so safe to cast col_space here.
  817. col_space = {k: cast(int, v) for k, v in self.col_space.items()}
  818. index = frame.index
  819. columns = frame.columns
  820. fmt = self._get_formatter("__index__")
  821. if isinstance(index, MultiIndex):
  822. fmt_index = index.format(
  823. sparsify=self.sparsify,
  824. adjoin=False,
  825. names=self.show_row_idx_names,
  826. formatter=fmt,
  827. )
  828. else:
  829. fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
  830. fmt_index = [
  831. tuple(
  832. _make_fixed_width(
  833. list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj
  834. )
  835. )
  836. for x in fmt_index
  837. ]
  838. adjoined = self.adj.adjoin(1, *fmt_index).split("\n")
  839. # empty space for columns
  840. if self.show_col_idx_names:
  841. col_header = [str(x) for x in self._get_column_name_list()]
  842. else:
  843. col_header = [""] * columns.nlevels
  844. if self.header:
  845. return col_header + adjoined
  846. else:
  847. return adjoined
  848. def _get_column_name_list(self) -> list[Hashable]:
  849. names: list[Hashable] = []
  850. columns = self.frame.columns
  851. if isinstance(columns, MultiIndex):
  852. names.extend("" if name is None else name for name in columns.names)
  853. else:
  854. names.append("" if columns.name is None else columns.name)
  855. return names
  856. class DataFrameRenderer:
  857. """Class for creating dataframe output in multiple formats.
  858. Called in pandas.core.generic.NDFrame:
  859. - to_csv
  860. - to_latex
  861. Called in pandas.core.frame.DataFrame:
  862. - to_html
  863. - to_string
  864. Parameters
  865. ----------
  866. fmt : DataFrameFormatter
  867. Formatter with the formatting options.
  868. """
    def __init__(self, fmt: DataFrameFormatter) -> None:
        # Keep the formatter; the render methods pass it on to the concrete
        # formatter classes they instantiate.
        self.fmt = fmt
  871. def to_latex(
  872. self,
  873. buf: FilePath | WriteBuffer[str] | None = None,
  874. column_format: str | None = None,
  875. longtable: bool = False,
  876. encoding: str | None = None,
  877. multicolumn: bool = False,
  878. multicolumn_format: str | None = None,
  879. multirow: bool = False,
  880. caption: str | tuple[str, str] | None = None,
  881. label: str | None = None,
  882. position: str | None = None,
  883. ) -> str | None:
  884. """
  885. Render a DataFrame to a LaTeX tabular/longtable environment output.
  886. """
  887. from pandas.io.formats.latex import LatexFormatter
  888. latex_formatter = LatexFormatter(
  889. self.fmt,
  890. longtable=longtable,
  891. column_format=column_format,
  892. multicolumn=multicolumn,
  893. multicolumn_format=multicolumn_format,
  894. multirow=multirow,
  895. caption=caption,
  896. label=label,
  897. position=position,
  898. )
  899. string = latex_formatter.to_string()
  900. return save_to_buffer(string, buf=buf, encoding=encoding)
  901. def to_html(
  902. self,
  903. buf: FilePath | WriteBuffer[str] | None = None,
  904. encoding: str | None = None,
  905. classes: str | list | tuple | None = None,
  906. notebook: bool = False,
  907. border: int | bool | None = None,
  908. table_id: str | None = None,
  909. render_links: bool = False,
  910. ) -> str | None:
  911. """
  912. Render a DataFrame to a html table.
  913. Parameters
  914. ----------
  915. buf : str, path object, file-like object, or None, default None
  916. String, path object (implementing ``os.PathLike[str]``), or file-like
  917. object implementing a string ``write()`` function. If None, the result is
  918. returned as a string.
  919. encoding : str, default “utf-8”
  920. Set character encoding.
  921. classes : str or list-like
  922. classes to include in the `class` attribute of the opening
  923. ``<table>`` tag, in addition to the default "dataframe".
  924. notebook : {True, False}, optional, default False
  925. Whether the generated HTML is for IPython Notebook.
  926. border : int
  927. A ``border=border`` attribute is included in the opening
  928. ``<table>`` tag. Default ``pd.options.display.html.border``.
  929. table_id : str, optional
  930. A css id is included in the opening `<table>` tag if specified.
  931. render_links : bool, default False
  932. Convert URLs to HTML links.
  933. """
  934. from pandas.io.formats.html import (
  935. HTMLFormatter,
  936. NotebookFormatter,
  937. )
  938. Klass = NotebookFormatter if notebook else HTMLFormatter
  939. html_formatter = Klass(
  940. self.fmt,
  941. classes=classes,
  942. border=border,
  943. table_id=table_id,
  944. render_links=render_links,
  945. )
  946. string = html_formatter.to_string()
  947. return save_to_buffer(string, buf=buf, encoding=encoding)
  948. def to_string(
  949. self,
  950. buf: FilePath | WriteBuffer[str] | None = None,
  951. encoding: str | None = None,
  952. line_width: int | None = None,
  953. ) -> str | None:
  954. """
  955. Render a DataFrame to a console-friendly tabular output.
  956. Parameters
  957. ----------
  958. buf : str, path object, file-like object, or None, default None
  959. String, path object (implementing ``os.PathLike[str]``), or file-like
  960. object implementing a string ``write()`` function. If None, the result is
  961. returned as a string.
  962. encoding: str, default “utf-8”
  963. Set character encoding.
  964. line_width : int, optional
  965. Width to wrap a line in characters.
  966. """
  967. from pandas.io.formats.string import StringFormatter
  968. string_formatter = StringFormatter(self.fmt, line_width=line_width)
  969. string = string_formatter.to_string()
  970. return save_to_buffer(string, buf=buf, encoding=encoding)
  971. def to_csv(
  972. self,
  973. path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
  974. encoding: str | None = None,
  975. sep: str = ",",
  976. columns: Sequence[Hashable] | None = None,
  977. index_label: IndexLabel | None = None,
  978. mode: str = "w",
  979. compression: CompressionOptions = "infer",
  980. quoting: int | None = None,
  981. quotechar: str = '"',
  982. lineterminator: str | None = None,
  983. chunksize: int | None = None,
  984. date_format: str | None = None,
  985. doublequote: bool = True,
  986. escapechar: str | None = None,
  987. errors: str = "strict",
  988. storage_options: StorageOptions = None,
  989. ) -> str | None:
  990. """
  991. Render dataframe as comma-separated file.
  992. """
  993. from pandas.io.formats.csvs import CSVFormatter
  994. if path_or_buf is None:
  995. created_buffer = True
  996. path_or_buf = StringIO()
  997. else:
  998. created_buffer = False
  999. csv_formatter = CSVFormatter(
  1000. path_or_buf=path_or_buf,
  1001. lineterminator=lineterminator,
  1002. sep=sep,
  1003. encoding=encoding,
  1004. errors=errors,
  1005. compression=compression,
  1006. quoting=quoting,
  1007. cols=columns,
  1008. index_label=index_label,
  1009. mode=mode,
  1010. chunksize=chunksize,
  1011. quotechar=quotechar,
  1012. date_format=date_format,
  1013. doublequote=doublequote,
  1014. escapechar=escapechar,
  1015. storage_options=storage_options,
  1016. formatter=self.fmt,
  1017. )
  1018. csv_formatter.save()
  1019. if created_buffer:
  1020. assert isinstance(path_or_buf, StringIO)
  1021. content = path_or_buf.getvalue()
  1022. path_or_buf.close()
  1023. return content
  1024. return None
  1025. def save_to_buffer(
  1026. string: str,
  1027. buf: FilePath | WriteBuffer[str] | None = None,
  1028. encoding: str | None = None,
  1029. ) -> str | None:
  1030. """
  1031. Perform serialization. Write to buf or return as string if buf is None.
  1032. """
  1033. with get_buffer(buf, encoding=encoding) as f:
  1034. f.write(string)
  1035. if buf is None:
  1036. # error: "WriteBuffer[str]" has no attribute "getvalue"
  1037. return f.getvalue() # type: ignore[attr-defined]
  1038. return None
  1039. @contextmanager
  1040. def get_buffer(
  1041. buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None
  1042. ) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]:
  1043. """
  1044. Context manager to open, yield and close buffer for filenames or Path-like
  1045. objects, otherwise yield buf unchanged.
  1046. """
  1047. if buf is not None:
  1048. buf = stringify_path(buf)
  1049. else:
  1050. buf = StringIO()
  1051. if encoding is None:
  1052. encoding = "utf-8"
  1053. elif not isinstance(buf, str):
  1054. raise ValueError("buf is not a file name and encoding is specified.")
  1055. if hasattr(buf, "write"):
  1056. # Incompatible types in "yield" (actual type "Union[str, WriteBuffer[str],
  1057. # StringIO]", expected type "Union[WriteBuffer[str], StringIO]")
  1058. yield buf # type: ignore[misc]
  1059. elif isinstance(buf, str):
  1060. check_parent_directory(str(buf))
  1061. with open(buf, "w", encoding=encoding, newline="") as f:
  1062. # GH#30034 open instead of codecs.open prevents a file leak
  1063. # if we have an invalid encoding argument.
  1064. # newline="" is needed to roundtrip correctly on
  1065. # windows test_to_latex_filename
  1066. yield f
  1067. else:
  1068. raise TypeError("buf is not a file name and it has no write method")
  1069. # ----------------------------------------------------------------------
  1070. # Array formatters
  1071. def format_array(
  1072. values: Any,
  1073. formatter: Callable | None,
  1074. float_format: FloatFormatType | None = None,
  1075. na_rep: str = "NaN",
  1076. digits: int | None = None,
  1077. space: str | int | None = None,
  1078. justify: str = "right",
  1079. decimal: str = ".",
  1080. leading_space: bool | None = True,
  1081. quoting: int | None = None,
  1082. fallback_formatter: Callable | None = None,
  1083. ) -> list[str]:
  1084. """
  1085. Format an array for printing.
  1086. Parameters
  1087. ----------
  1088. values
  1089. formatter
  1090. float_format
  1091. na_rep
  1092. digits
  1093. space
  1094. justify
  1095. decimal
  1096. leading_space : bool, optional, default True
  1097. Whether the array should be formatted with a leading space.
  1098. When an array as a column of a Series or DataFrame, we do want
  1099. the leading space to pad between columns.
  1100. When formatting an Index subclass
  1101. (e.g. IntervalIndex._format_native_types), we don't want the
  1102. leading space since it should be left-aligned.
  1103. fallback_formatter
  1104. Returns
  1105. -------
  1106. List[str]
  1107. """
  1108. fmt_klass: type[GenericArrayFormatter]
  1109. if is_datetime64_dtype(values.dtype):
  1110. fmt_klass = Datetime64Formatter
  1111. elif isinstance(values.dtype, DatetimeTZDtype):
  1112. fmt_klass = Datetime64TZFormatter
  1113. elif is_timedelta64_dtype(values.dtype):
  1114. fmt_klass = Timedelta64Formatter
  1115. elif is_extension_array_dtype(values.dtype):
  1116. fmt_klass = ExtensionArrayFormatter
  1117. elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype):
  1118. fmt_klass = FloatArrayFormatter
  1119. elif is_integer_dtype(values.dtype):
  1120. fmt_klass = IntArrayFormatter
  1121. else:
  1122. fmt_klass = GenericArrayFormatter
  1123. if space is None:
  1124. space = 12
  1125. if float_format is None:
  1126. float_format = get_option("display.float_format")
  1127. if digits is None:
  1128. digits = get_option("display.precision")
  1129. fmt_obj = fmt_klass(
  1130. values,
  1131. digits=digits,
  1132. na_rep=na_rep,
  1133. float_format=float_format,
  1134. formatter=formatter,
  1135. space=space,
  1136. justify=justify,
  1137. decimal=decimal,
  1138. leading_space=leading_space,
  1139. quoting=quoting,
  1140. fallback_formatter=fallback_formatter,
  1141. )
  1142. return fmt_obj.get_result()
  1143. class GenericArrayFormatter:
  1144. def __init__(
  1145. self,
  1146. values: Any,
  1147. digits: int = 7,
  1148. formatter: Callable | None = None,
  1149. na_rep: str = "NaN",
  1150. space: str | int = 12,
  1151. float_format: FloatFormatType | None = None,
  1152. justify: str = "right",
  1153. decimal: str = ".",
  1154. quoting: int | None = None,
  1155. fixed_width: bool = True,
  1156. leading_space: bool | None = True,
  1157. fallback_formatter: Callable | None = None,
  1158. ) -> None:
  1159. self.values = values
  1160. self.digits = digits
  1161. self.na_rep = na_rep
  1162. self.space = space
  1163. self.formatter = formatter
  1164. self.float_format = float_format
  1165. self.justify = justify
  1166. self.decimal = decimal
  1167. self.quoting = quoting
  1168. self.fixed_width = fixed_width
  1169. self.leading_space = leading_space
  1170. self.fallback_formatter = fallback_formatter
  1171. def get_result(self) -> list[str]:
  1172. fmt_values = self._format_strings()
  1173. return _make_fixed_width(fmt_values, self.justify)
  1174. def _format_strings(self) -> list[str]:
  1175. if self.float_format is None:
  1176. float_format = get_option("display.float_format")
  1177. if float_format is None:
  1178. precision = get_option("display.precision")
  1179. float_format = lambda x: _trim_zeros_single_float(
  1180. f"{x: .{precision:d}f}"
  1181. )
  1182. else:
  1183. float_format = self.float_format
  1184. if self.formatter is not None:
  1185. formatter = self.formatter
  1186. elif self.fallback_formatter is not None:
  1187. formatter = self.fallback_formatter
  1188. else:
  1189. quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE
  1190. formatter = partial(
  1191. printing.pprint_thing,
  1192. escape_chars=("\t", "\r", "\n"),
  1193. quote_strings=quote_strings,
  1194. )
  1195. def _format(x):
  1196. if self.na_rep is not None and is_scalar(x) and isna(x):
  1197. try:
  1198. # try block for np.isnat specifically
  1199. # determine na_rep if x is None or NaT-like
  1200. if x is None:
  1201. return "None"
  1202. elif x is NA:
  1203. return str(NA)
  1204. elif x is NaT or np.isnat(x):
  1205. return "NaT"
  1206. except (TypeError, ValueError):
  1207. # np.isnat only handles datetime or timedelta objects
  1208. pass
  1209. return self.na_rep
  1210. elif isinstance(x, PandasObject):
  1211. return str(x)
  1212. elif isinstance(x, StringDtype):
  1213. return repr(x)
  1214. else:
  1215. # object dtype
  1216. return str(formatter(x))
  1217. vals = extract_array(self.values, extract_numpy=True)
  1218. if not isinstance(vals, np.ndarray):
  1219. raise TypeError(
  1220. "ExtensionArray formatting should use ExtensionArrayFormatter"
  1221. )
  1222. inferred = lib.map_infer(vals, is_float)
  1223. is_float_type = (
  1224. inferred
  1225. # vals may have 2 or more dimensions
  1226. & np.all(notna(vals), axis=tuple(range(1, len(vals.shape))))
  1227. )
  1228. leading_space = self.leading_space
  1229. if leading_space is None:
  1230. leading_space = is_float_type.any()
  1231. fmt_values = []
  1232. for i, v in enumerate(vals):
  1233. if (not is_float_type[i] or self.formatter is not None) and leading_space:
  1234. fmt_values.append(f" {_format(v)}")
  1235. elif is_float_type[i]:
  1236. fmt_values.append(float_format(v))
  1237. else:
  1238. if leading_space is False:
  1239. # False specifically, so that the default is
  1240. # to include a space if we get here.
  1241. tpl = "{v}"
  1242. else:
  1243. tpl = " {v}"
  1244. fmt_values.append(tpl.format(v=_format(v)))
  1245. return fmt_values
  1246. class FloatArrayFormatter(GenericArrayFormatter):
  1247. def __init__(self, *args, **kwargs) -> None:
  1248. super().__init__(*args, **kwargs)
  1249. # float_format is expected to be a string
  1250. # formatter should be used to pass a function
  1251. if self.float_format is not None and self.formatter is None:
  1252. # GH21625, GH22270
  1253. self.fixed_width = False
  1254. if callable(self.float_format):
  1255. self.formatter = self.float_format
  1256. self.float_format = None
  1257. def _value_formatter(
  1258. self,
  1259. float_format: FloatFormatType | None = None,
  1260. threshold: float | None = None,
  1261. ) -> Callable:
  1262. """Returns a function to be applied on each value to format it"""
  1263. # the float_format parameter supersedes self.float_format
  1264. if float_format is None:
  1265. float_format = self.float_format
  1266. # we are going to compose different functions, to first convert to
  1267. # a string, then replace the decimal symbol, and finally chop according
  1268. # to the threshold
  1269. # when there is no float_format, we use str instead of '%g'
  1270. # because str(0.0) = '0.0' while '%g' % 0.0 = '0'
  1271. if float_format:
  1272. def base_formatter(v):
  1273. assert float_format is not None # for mypy
  1274. # error: "str" not callable
  1275. # error: Unexpected keyword argument "value" for "__call__" of
  1276. # "EngFormatter"
  1277. return (
  1278. float_format(value=v) # type: ignore[operator,call-arg]
  1279. if notna(v)
  1280. else self.na_rep
  1281. )
  1282. else:
  1283. def base_formatter(v):
  1284. return str(v) if notna(v) else self.na_rep
  1285. if self.decimal != ".":
  1286. def decimal_formatter(v):
  1287. return base_formatter(v).replace(".", self.decimal, 1)
  1288. else:
  1289. decimal_formatter = base_formatter
  1290. if threshold is None:
  1291. return decimal_formatter
  1292. def formatter(value):
  1293. if notna(value):
  1294. if abs(value) > threshold:
  1295. return decimal_formatter(value)
  1296. else:
  1297. return decimal_formatter(0.0)
  1298. else:
  1299. return self.na_rep
  1300. return formatter
  1301. def get_result_as_array(self) -> np.ndarray:
  1302. """
  1303. Returns the float values converted into strings using
  1304. the parameters given at initialisation, as a numpy array
  1305. """
  1306. def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str):
  1307. mask = isna(values)
  1308. formatted = np.array(
  1309. [
  1310. formatter(val) if not m else na_rep
  1311. for val, m in zip(values.ravel(), mask.ravel())
  1312. ]
  1313. ).reshape(values.shape)
  1314. return formatted
  1315. if self.formatter is not None:
  1316. return format_with_na_rep(self.values, self.formatter, self.na_rep)
  1317. if self.fixed_width:
  1318. threshold = get_option("display.chop_threshold")
  1319. else:
  1320. threshold = None
  1321. # if we have a fixed_width, we'll need to try different float_format
  1322. def format_values_with(float_format):
  1323. formatter = self._value_formatter(float_format, threshold)
  1324. # default formatter leaves a space to the left when formatting
  1325. # floats, must be consistent for left-justifying NaNs (GH #25061)
  1326. if self.justify == "left":
  1327. na_rep = " " + self.na_rep
  1328. else:
  1329. na_rep = self.na_rep
  1330. # separate the wheat from the chaff
  1331. values = self.values
  1332. is_complex = is_complex_dtype(values)
  1333. values = format_with_na_rep(values, formatter, na_rep)
  1334. if self.fixed_width:
  1335. if is_complex:
  1336. result = _trim_zeros_complex(values, self.decimal)
  1337. else:
  1338. result = _trim_zeros_float(values, self.decimal)
  1339. return np.asarray(result, dtype="object")
  1340. return values
  1341. # There is a special default string when we are fixed-width
  1342. # The default is otherwise to use str instead of a formatting string
  1343. float_format: FloatFormatType | None
  1344. if self.float_format is None:
  1345. if self.fixed_width:
  1346. if self.leading_space is True:
  1347. fmt_str = "{value: .{digits:d}f}"
  1348. else:
  1349. fmt_str = "{value:.{digits:d}f}"
  1350. float_format = partial(fmt_str.format, digits=self.digits)
  1351. else:
  1352. float_format = self.float_format
  1353. else:
  1354. float_format = lambda value: self.float_format % value
  1355. formatted_values = format_values_with(float_format)
  1356. if not self.fixed_width:
  1357. return formatted_values
  1358. # we need do convert to engineering format if some values are too small
  1359. # and would appear as 0, or if some values are too big and take too
  1360. # much space
  1361. if len(formatted_values) > 0:
  1362. maxlen = max(len(x) for x in formatted_values)
  1363. too_long = maxlen > self.digits + 6
  1364. else:
  1365. too_long = False
  1366. with np.errstate(invalid="ignore"):
  1367. abs_vals = np.abs(self.values)
  1368. # this is pretty arbitrary for now
  1369. # large values: more that 8 characters including decimal symbol
  1370. # and first digit, hence > 1e6
  1371. has_large_values = (abs_vals > 1e6).any()
  1372. has_small_values = (
  1373. (abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)
  1374. ).any()
  1375. if has_small_values or (too_long and has_large_values):
  1376. if self.leading_space is True:
  1377. fmt_str = "{value: .{digits:d}e}"
  1378. else:
  1379. fmt_str = "{value:.{digits:d}e}"
  1380. float_format = partial(fmt_str.format, digits=self.digits)
  1381. formatted_values = format_values_with(float_format)
  1382. return formatted_values
  1383. def _format_strings(self) -> list[str]:
  1384. return list(self.get_result_as_array())
  1385. class IntArrayFormatter(GenericArrayFormatter):
  1386. def _format_strings(self) -> list[str]:
  1387. if self.leading_space is False:
  1388. formatter_str = lambda x: f"{x:d}".format(x=x)
  1389. else:
  1390. formatter_str = lambda x: f"{x: d}".format(x=x)
  1391. formatter = self.formatter or formatter_str
  1392. fmt_values = [formatter(x) for x in self.values]
  1393. return fmt_values
  1394. class Datetime64Formatter(GenericArrayFormatter):
  1395. def __init__(
  1396. self,
  1397. values: np.ndarray | Series | DatetimeIndex | DatetimeArray,
  1398. nat_rep: str = "NaT",
  1399. date_format: None = None,
  1400. **kwargs,
  1401. ) -> None:
  1402. super().__init__(values, **kwargs)
  1403. self.nat_rep = nat_rep
  1404. self.date_format = date_format
  1405. def _format_strings(self) -> list[str]:
  1406. """we by definition have DO NOT have a TZ"""
  1407. values = self.values
  1408. if not isinstance(values, DatetimeIndex):
  1409. values = DatetimeIndex(values)
  1410. if self.formatter is not None and callable(self.formatter):
  1411. return [self.formatter(x) for x in values]
  1412. fmt_values = values._data._format_native_types(
  1413. na_rep=self.nat_rep, date_format=self.date_format
  1414. )
  1415. return fmt_values.tolist()
  1416. class ExtensionArrayFormatter(GenericArrayFormatter):
  1417. def _format_strings(self) -> list[str]:
  1418. values = extract_array(self.values, extract_numpy=True)
  1419. formatter = self.formatter
  1420. fallback_formatter = None
  1421. if formatter is None:
  1422. fallback_formatter = values._formatter(boxed=True)
  1423. if isinstance(values, Categorical):
  1424. # Categorical is special for now, so that we can preserve tzinfo
  1425. array = values._internal_get_values()
  1426. else:
  1427. array = np.asarray(values)
  1428. fmt_values = format_array(
  1429. array,
  1430. formatter,
  1431. float_format=self.float_format,
  1432. na_rep=self.na_rep,
  1433. digits=self.digits,
  1434. space=self.space,
  1435. justify=self.justify,
  1436. decimal=self.decimal,
  1437. leading_space=self.leading_space,
  1438. quoting=self.quoting,
  1439. fallback_formatter=fallback_formatter,
  1440. )
  1441. return fmt_values
  1442. def format_percentiles(
  1443. percentiles: (np.ndarray | Sequence[float]),
  1444. ) -> list[str]:
  1445. """
  1446. Outputs rounded and formatted percentiles.
  1447. Parameters
  1448. ----------
  1449. percentiles : list-like, containing floats from interval [0,1]
  1450. Returns
  1451. -------
  1452. formatted : list of strings
  1453. Notes
  1454. -----
  1455. Rounding precision is chosen so that: (1) if any two elements of
  1456. ``percentiles`` differ, they remain different after rounding
  1457. (2) no entry is *rounded* to 0% or 100%.
  1458. Any non-integer is always rounded to at least 1 decimal place.
  1459. Examples
  1460. --------
  1461. Keeps all entries different after rounding:
  1462. >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
  1463. ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
  1464. No element is rounded to 0% or 100% (unless already equal to it).
  1465. Duplicates are allowed:
  1466. >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
  1467. ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
  1468. """
  1469. percentiles = np.asarray(percentiles)
  1470. # It checks for np.NaN as well
  1471. with np.errstate(invalid="ignore"):
  1472. if (
  1473. not is_numeric_dtype(percentiles)
  1474. or not np.all(percentiles >= 0)
  1475. or not np.all(percentiles <= 1)
  1476. ):
  1477. raise ValueError("percentiles should all be in the interval [0,1]")
  1478. percentiles = 100 * percentiles
  1479. percentiles_round_type = percentiles.round().astype(int)
  1480. int_idx = np.isclose(percentiles_round_type, percentiles)
  1481. if np.all(int_idx):
  1482. out = percentiles_round_type.astype(str)
  1483. return [i + "%" for i in out]
  1484. unique_pcts = np.unique(percentiles)
  1485. to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
  1486. to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
  1487. # Least precision that keeps percentiles unique after rounding
  1488. prec = -np.floor(
  1489. np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)))
  1490. ).astype(int)
  1491. prec = max(1, prec)
  1492. out = np.empty_like(percentiles, dtype=object)
  1493. out[int_idx] = percentiles[int_idx].round().astype(int).astype(str)
  1494. out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
  1495. return [i + "%" for i in out]
  1496. def is_dates_only(values: np.ndarray | DatetimeArray | Index | DatetimeIndex) -> bool:
  1497. # return a boolean if we are only dates (and don't have a timezone)
  1498. if not isinstance(values, Index):
  1499. values = values.ravel()
  1500. if not isinstance(values, (DatetimeArray, DatetimeIndex)):
  1501. values = DatetimeIndex(values)
  1502. if values.tz is not None:
  1503. return False
  1504. values_int = values.asi8
  1505. consider_values = values_int != iNaT
  1506. # error: Argument 1 to "py_get_unit_from_dtype" has incompatible type
  1507. # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
  1508. reso = get_unit_from_dtype(values.dtype) # type: ignore[arg-type]
  1509. ppd = periods_per_day(reso)
  1510. # TODO: can we reuse is_date_array_normalized? would need a skipna kwd
  1511. even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0
  1512. if even_days:
  1513. return True
  1514. return False
  1515. def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str:
  1516. if x is NaT:
  1517. return nat_rep
  1518. # Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ')
  1519. # so it already uses string formatting rather than strftime (faster).
  1520. return str(x)
  1521. def _format_datetime64_dateonly(
  1522. x: NaTType | Timestamp,
  1523. nat_rep: str = "NaT",
  1524. date_format: str | None = None,
  1525. ) -> str:
  1526. if isinstance(x, NaTType):
  1527. return nat_rep
  1528. if date_format:
  1529. return x.strftime(date_format)
  1530. else:
  1531. # Timestamp._date_repr relies on string formatting (faster than strftime)
  1532. return x._date_repr
  1533. def get_format_datetime64(
  1534. is_dates_only_: bool, nat_rep: str = "NaT", date_format: str | None = None
  1535. ) -> Callable:
  1536. """Return a formatter callable taking a datetime64 as input and providing
  1537. a string as output"""
  1538. if is_dates_only_:
  1539. return lambda x: _format_datetime64_dateonly(
  1540. x, nat_rep=nat_rep, date_format=date_format
  1541. )
  1542. else:
  1543. return lambda x: _format_datetime64(x, nat_rep=nat_rep)
  1544. def get_format_datetime64_from_values(
  1545. values: np.ndarray | DatetimeArray | DatetimeIndex, date_format: str | None
  1546. ) -> str | None:
  1547. """given values and a date_format, return a string format"""
  1548. if isinstance(values, np.ndarray) and values.ndim > 1:
  1549. # We don't actually care about the order of values, and DatetimeIndex
  1550. # only accepts 1D values
  1551. values = values.ravel()
  1552. ido = is_dates_only(values)
  1553. if ido:
  1554. # Only dates and no timezone: provide a default format
  1555. return date_format or "%Y-%m-%d"
  1556. return date_format
  1557. class Datetime64TZFormatter(Datetime64Formatter):
  1558. def _format_strings(self) -> list[str]:
  1559. """we by definition have a TZ"""
  1560. values = self.values.astype(object)
  1561. ido = is_dates_only(values)
  1562. formatter = self.formatter or get_format_datetime64(
  1563. ido, date_format=self.date_format
  1564. )
  1565. fmt_values = [formatter(x) for x in values]
  1566. return fmt_values
  1567. class Timedelta64Formatter(GenericArrayFormatter):
  1568. def __init__(
  1569. self,
  1570. values: np.ndarray | TimedeltaIndex,
  1571. nat_rep: str = "NaT",
  1572. box: bool = False,
  1573. **kwargs,
  1574. ) -> None:
  1575. super().__init__(values, **kwargs)
  1576. self.nat_rep = nat_rep
  1577. self.box = box
  1578. def _format_strings(self) -> list[str]:
  1579. formatter = self.formatter or get_format_timedelta64(
  1580. self.values, nat_rep=self.nat_rep, box=self.box
  1581. )
  1582. return [formatter(x) for x in self.values]
  1583. def get_format_timedelta64(
  1584. values: np.ndarray | TimedeltaIndex | TimedeltaArray,
  1585. nat_rep: str | float = "NaT",
  1586. box: bool = False,
  1587. ) -> Callable:
  1588. """
  1589. Return a formatter function for a range of timedeltas.
  1590. These will all have the same format argument
  1591. If box, then show the return in quotes
  1592. """
  1593. values_int = values.view(np.int64)
  1594. consider_values = values_int != iNaT
  1595. one_day_nanos = 86400 * 10**9
  1596. # error: Unsupported operand types for % ("ExtensionArray" and "int")
  1597. not_midnight = values_int % one_day_nanos != 0 # type: ignore[operator]
  1598. # error: Argument 1 to "__call__" of "ufunc" has incompatible type
  1599. # "Union[Any, ExtensionArray, ndarray]"; expected
  1600. # "Union[Union[int, float, complex, str, bytes, generic],
  1601. # Sequence[Union[int, float, complex, str, bytes, generic]],
  1602. # Sequence[Sequence[Any]], _SupportsArray]"
  1603. both = np.logical_and(consider_values, not_midnight) # type: ignore[arg-type]
  1604. even_days = both.sum() == 0
  1605. if even_days:
  1606. format = None
  1607. else:
  1608. format = "long"
  1609. def _formatter(x):
  1610. if x is None or (is_scalar(x) and isna(x)):
  1611. return nat_rep
  1612. if not isinstance(x, Timedelta):
  1613. x = Timedelta(x)
  1614. # Timedelta._repr_base uses string formatting (faster than strftime)
  1615. result = x._repr_base(format=format)
  1616. if box:
  1617. result = f"'{result}'"
  1618. return result
  1619. return _formatter
  1620. def _make_fixed_width(
  1621. strings: list[str],
  1622. justify: str = "right",
  1623. minimum: int | None = None,
  1624. adj: TextAdjustment | None = None,
  1625. ) -> list[str]:
  1626. if len(strings) == 0 or justify == "all":
  1627. return strings
  1628. if adj is None:
  1629. adjustment = get_adjustment()
  1630. else:
  1631. adjustment = adj
  1632. max_len = max(adjustment.len(x) for x in strings)
  1633. if minimum is not None:
  1634. max_len = max(minimum, max_len)
  1635. conf_max = get_option("display.max_colwidth")
  1636. if conf_max is not None and max_len > conf_max:
  1637. max_len = conf_max
  1638. def just(x: str) -> str:
  1639. if conf_max is not None:
  1640. if (conf_max > 3) & (adjustment.len(x) > max_len):
  1641. x = x[: max_len - 3] + "..."
  1642. return x
  1643. strings = [just(x) for x in strings]
  1644. result = adjustment.justify(strings, max_len, mode=justify)
  1645. return result
  1646. def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> list[str]:
  1647. """
  1648. Separates the real and imaginary parts from the complex number, and
  1649. executes the _trim_zeros_float method on each of those.
  1650. """
  1651. trimmed = [
  1652. "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal))
  1653. for x in str_complexes
  1654. ]
  1655. # pad strings to the length of the longest trimmed string for alignment
  1656. lengths = [len(s) for s in trimmed]
  1657. max_length = max(lengths)
  1658. padded = [
  1659. s[: -((k - 1) // 2 + 1)] # real part
  1660. + (max_length - k) // 2 * "0"
  1661. + s[-((k - 1) // 2 + 1) : -((k - 1) // 2)] # + / -
  1662. + s[-((k - 1) // 2) : -1] # imaginary part
  1663. + (max_length - k) // 2 * "0"
  1664. + s[-1]
  1665. for s, k in zip(trimmed, lengths)
  1666. ]
  1667. return padded
  1668. def _trim_zeros_single_float(str_float: str) -> str:
  1669. """
  1670. Trims trailing zeros after a decimal point,
  1671. leaving just one if necessary.
  1672. """
  1673. str_float = str_float.rstrip("0")
  1674. if str_float.endswith("."):
  1675. str_float += "0"
  1676. return str_float
  1677. def _trim_zeros_float(
  1678. str_floats: np.ndarray | list[str], decimal: str = "."
  1679. ) -> list[str]:
  1680. """
  1681. Trims the maximum number of trailing zeros equally from
  1682. all numbers containing decimals, leaving just one if
  1683. necessary.
  1684. """
  1685. trimmed = str_floats
  1686. number_regex = re.compile(rf"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$")
  1687. def is_number_with_decimal(x) -> bool:
  1688. return re.match(number_regex, x) is not None
  1689. def should_trim(values: np.ndarray | list[str]) -> bool:
  1690. """
  1691. Determine if an array of strings should be trimmed.
  1692. Returns True if all numbers containing decimals (defined by the
  1693. above regular expression) within the array end in a zero, otherwise
  1694. returns False.
  1695. """
  1696. numbers = [x for x in values if is_number_with_decimal(x)]
  1697. return len(numbers) > 0 and all(x.endswith("0") for x in numbers)
  1698. while should_trim(trimmed):
  1699. trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed]
  1700. # leave one 0 after the decimal points if need be.
  1701. result = [
  1702. x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x
  1703. for x in trimmed
  1704. ]
  1705. return result
  1706. def _has_names(index: Index) -> bool:
  1707. if isinstance(index, MultiIndex):
  1708. return com.any_not_none(*index.names)
  1709. else:
  1710. return index.name is not None
  1711. class EngFormatter:
  1712. """
  1713. Formats float values according to engineering format.
  1714. Based on matplotlib.ticker.EngFormatter
  1715. """
  1716. # The SI engineering prefixes
  1717. ENG_PREFIXES = {
  1718. -24: "y",
  1719. -21: "z",
  1720. -18: "a",
  1721. -15: "f",
  1722. -12: "p",
  1723. -9: "n",
  1724. -6: "u",
  1725. -3: "m",
  1726. 0: "",
  1727. 3: "k",
  1728. 6: "M",
  1729. 9: "G",
  1730. 12: "T",
  1731. 15: "P",
  1732. 18: "E",
  1733. 21: "Z",
  1734. 24: "Y",
  1735. }
  1736. def __init__(
  1737. self, accuracy: int | None = None, use_eng_prefix: bool = False
  1738. ) -> None:
  1739. self.accuracy = accuracy
  1740. self.use_eng_prefix = use_eng_prefix
  1741. def __call__(self, num: float) -> str:
  1742. """
  1743. Formats a number in engineering notation, appending a letter
  1744. representing the power of 1000 of the original number. Some examples:
  1745. >>> format_eng = EngFormatter(accuracy=0, use_eng_prefix=True)
  1746. >>> format_eng(0)
  1747. ' 0'
  1748. >>> format_eng = EngFormatter(accuracy=1, use_eng_prefix=True)
  1749. >>> format_eng(1_000_000)
  1750. ' 1.0M'
  1751. >>> format_eng = EngFormatter(accuracy=2, use_eng_prefix=False)
  1752. >>> format_eng("-1e-6")
  1753. '-1.00E-06'
  1754. @param num: the value to represent
  1755. @type num: either a numeric value or a string that can be converted to
  1756. a numeric value (as per decimal.Decimal constructor)
  1757. @return: engineering formatted string
  1758. """
  1759. dnum = Decimal(str(num))
  1760. if Decimal.is_nan(dnum):
  1761. return "NaN"
  1762. if Decimal.is_infinite(dnum):
  1763. return "inf"
  1764. sign = 1
  1765. if dnum < 0: # pragma: no cover
  1766. sign = -1
  1767. dnum = -dnum
  1768. if dnum != 0:
  1769. pow10 = Decimal(int(math.floor(dnum.log10() / 3) * 3))
  1770. else:
  1771. pow10 = Decimal(0)
  1772. pow10 = pow10.min(max(self.ENG_PREFIXES.keys()))
  1773. pow10 = pow10.max(min(self.ENG_PREFIXES.keys()))
  1774. int_pow10 = int(pow10)
  1775. if self.use_eng_prefix:
  1776. prefix = self.ENG_PREFIXES[int_pow10]
  1777. else:
  1778. if int_pow10 < 0:
  1779. prefix = f"E-{-int_pow10:02d}"
  1780. else:
  1781. prefix = f"E+{int_pow10:02d}"
  1782. mant = sign * dnum / (10**pow10)
  1783. if self.accuracy is None: # pragma: no cover
  1784. format_str = "{mant: g}{prefix}"
  1785. else:
  1786. format_str = f"{{mant: .{self.accuracy:d}f}}{{prefix}}"
  1787. formatted = format_str.format(mant=mant, prefix=prefix)
  1788. return formatted
  1789. def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None:
  1790. """
  1791. Format float representation in DataFrame with SI notation.
  1792. Parameters
  1793. ----------
  1794. accuracy : int, default 3
  1795. Number of decimal digits after the floating point.
  1796. use_eng_prefix : bool, default False
  1797. Whether to represent a value with SI prefixes.
  1798. Returns
  1799. -------
  1800. None
  1801. Examples
  1802. --------
  1803. >>> df = pd.DataFrame([1e-9, 1e-3, 1, 1e3, 1e6])
  1804. >>> df
  1805. 0
  1806. 0 1.000000e-09
  1807. 1 1.000000e-03
  1808. 2 1.000000e+00
  1809. 3 1.000000e+03
  1810. 4 1.000000e+06
  1811. >>> pd.set_eng_float_format(accuracy=1)
  1812. >>> df
  1813. 0
  1814. 0 1.0E-09
  1815. 1 1.0E-03
  1816. 2 1.0E+00
  1817. 3 1.0E+03
  1818. 4 1.0E+06
  1819. >>> pd.set_eng_float_format(use_eng_prefix=True)
  1820. >>> df
  1821. 0
  1822. 0 1.000n
  1823. 1 1.000m
  1824. 2 1.000
  1825. 3 1.000k
  1826. 4 1.000M
  1827. >>> pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
  1828. >>> df
  1829. 0
  1830. 0 1.0n
  1831. 1 1.0m
  1832. 2 1.0
  1833. 3 1.0k
  1834. 4 1.0M
  1835. >>> pd.set_option("display.float_format", None) # unset option
  1836. """
  1837. set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix))
  1838. def get_level_lengths(
  1839. levels: Any, sentinel: bool | object | str = ""
  1840. ) -> list[dict[int, int]]:
  1841. """
  1842. For each index in each level the function returns lengths of indexes.
  1843. Parameters
  1844. ----------
  1845. levels : list of lists
  1846. List of values on for level.
  1847. sentinel : string, optional
  1848. Value which states that no new index starts on there.
  1849. Returns
  1850. -------
  1851. Returns list of maps. For each level returns map of indexes (key is index
  1852. in row and value is length of index).
  1853. """
  1854. if len(levels) == 0:
  1855. return []
  1856. control = [True] * len(levels[0])
  1857. result = []
  1858. for level in levels:
  1859. last_index = 0
  1860. lengths = {}
  1861. for i, key in enumerate(level):
  1862. if control[i] and key == sentinel:
  1863. pass
  1864. else:
  1865. control[i] = False
  1866. lengths[last_index] = i - last_index
  1867. last_index = i
  1868. lengths[last_index] = len(level) - last_index
  1869. result.append(lengths)
  1870. return result
  1871. def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None:
  1872. """
  1873. Appends lines to a buffer.
  1874. Parameters
  1875. ----------
  1876. buf
  1877. The buffer to write to
  1878. lines
  1879. The lines to append.
  1880. """
  1881. if any(isinstance(x, str) for x in lines):
  1882. lines = [str(x) for x in lines]
  1883. buf.write("\n".join(lines))