info.py 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101
  1. from __future__ import annotations
  2. from abc import (
  3. ABC,
  4. abstractmethod,
  5. )
  6. import sys
  7. from textwrap import dedent
  8. from typing import (
  9. TYPE_CHECKING,
  10. Iterable,
  11. Iterator,
  12. Mapping,
  13. Sequence,
  14. )
  15. from pandas._config import get_option
  16. from pandas._typing import (
  17. Dtype,
  18. WriteBuffer,
  19. )
  20. from pandas.io.formats import format as fmt
  21. from pandas.io.formats.printing import pprint_thing
  22. if TYPE_CHECKING:
  23. from pandas import (
  24. DataFrame,
  25. Index,
  26. Series,
  27. )
  # numpydoc entry for the ``max_cols`` parameter; substituted into the
  # ``{max_cols_sub}`` placeholder of INFO_DOCSTRING for DataFrame.info.
  # The description lines must stay indented under the parameter name so that
  # they are still recognised as its body after ``dedent``.
  frame_max_cols_sub = dedent(
      """\
      max_cols : int, optional
          When to switch from the verbose to the truncated output. If the
          DataFrame has more than `max_cols` columns, the truncated output
          is used. By default, the setting in
          ``pandas.options.display.max_info_columns`` is used."""
  )
  # numpydoc entry for the ``show_counts`` parameter; substituted into the
  # ``{show_counts_sub}`` placeholder of INFO_DOCSTRING for both DataFrame.info
  # and Series.info. The description is indented under the parameter name so
  # it survives ``dedent`` as the parameter body.
  show_counts_sub = dedent(
      """\
      show_counts : bool, optional
          Whether to show the non-null counts. By default, this is shown
          only if the DataFrame is smaller than
          ``pandas.options.display.max_info_rows`` and
          ``pandas.options.display.max_info_columns``. A value of True always
          shows the counts, and False never shows the counts."""
  )
  # "Examples" section for DataFrame.info, substituted into the
  # ``{examples_sub}`` placeholder of INFO_DOCSTRING.
  # Blank lines separate prose from doctest blocks: without them doctest treats
  # the prose as part of the preceding example's expected output. Table columns
  # are aligned the way the verbose table builder lays them out (two-space
  # separator, each column padded to its widest cell).
  frame_examples_sub = dedent(
      """\
      >>> int_values = [1, 2, 3, 4, 5]
      >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
      >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
      >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
      ...                   "float_col": float_values})
      >>> df
         int_col text_col  float_col
      0        1    alpha       0.00
      1        2     beta       0.25
      2        3    gamma       0.50
      3        4    delta       0.75
      4        5  epsilon       1.00

      Prints information of all columns:

      >>> df.info(verbose=True)
      <class 'pandas.core.frame.DataFrame'>
      RangeIndex: 5 entries, 0 to 4
      Data columns (total 3 columns):
       #   Column     Non-Null Count  Dtype
      ---  ------     --------------  -----
       0   int_col    5 non-null      int64
       1   text_col   5 non-null      object
       2   float_col  5 non-null      float64
      dtypes: float64(1), int64(1), object(1)
      memory usage: 248.0+ bytes

      Prints a summary of columns count and its dtypes but not per column
      information:

      >>> df.info(verbose=False)
      <class 'pandas.core.frame.DataFrame'>
      RangeIndex: 5 entries, 0 to 4
      Columns: 3 entries, int_col to float_col
      dtypes: float64(1), int64(1), object(1)
      memory usage: 248.0+ bytes

      Pipe output of DataFrame.info to buffer instead of sys.stdout, get
      buffer content and writes to a text file:

      >>> import io
      >>> buffer = io.StringIO()
      >>> df.info(buf=buffer)
      >>> s = buffer.getvalue()
      >>> with open("df_info.txt", "w",
      ...           encoding="utf-8") as f:  # doctest: +SKIP
      ...     f.write(s)
      260

      The `memory_usage` parameter allows deep introspection mode, specially
      useful for big DataFrames and fine-tune memory optimization:

      >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
      >>> df = pd.DataFrame({
      ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
      ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
      ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
      ... })
      >>> df.info()
      <class 'pandas.core.frame.DataFrame'>
      RangeIndex: 1000000 entries, 0 to 999999
      Data columns (total 3 columns):
       #   Column    Non-Null Count    Dtype
      ---  ------    --------------    -----
       0   column_1  1000000 non-null  object
       1   column_2  1000000 non-null  object
       2   column_3  1000000 non-null  object
      dtypes: object(3)
      memory usage: 22.9+ MB

      >>> df.info(memory_usage='deep')
      <class 'pandas.core.frame.DataFrame'>
      RangeIndex: 1000000 entries, 0 to 999999
      Data columns (total 3 columns):
       #   Column    Non-Null Count    Dtype
      ---  ------    --------------    -----
       0   column_1  1000000 non-null  object
       1   column_2  1000000 non-null  object
       2   column_3  1000000 non-null  object
      dtypes: object(3)
      memory usage: 165.9 MB"""
  )
  # "See Also" entries for DataFrame.info, substituted into the
  # ``{see_also_sub}`` placeholder of INFO_DOCSTRING. The wrapped word
  # "columns." is indented so it reads as a continuation of the first entry
  # rather than as a new entry.
  frame_see_also_sub = dedent(
      """\
      DataFrame.describe: Generate descriptive statistics of DataFrame
          columns.
      DataFrame.memory_usage: Memory usage of DataFrame columns."""
  )
  # Values for every placeholder of INFO_DOCSTRING when it documents
  # DataFrame.info. ``version_added_sub`` is empty for the DataFrame variant.
  frame_sub_kwargs = {
      "klass": "DataFrame",
      "type_sub": " and columns",
      "max_cols_sub": frame_max_cols_sub,
      "show_counts_sub": show_counts_sub,
      "examples_sub": frame_examples_sub,
      "see_also_sub": frame_see_also_sub,
      "version_added_sub": "",
  }
  # "Examples" section for Series.info, substituted into the
  # ``{examples_sub}`` placeholder of INFO_DOCSTRING.
  # Blank lines separate prose from doctest blocks: without them doctest treats
  # the prose as part of the preceding example's expected output. Table columns
  # are aligned the way the verbose table builder lays them out (two-space
  # separator, each column padded to its widest cell).
  series_examples_sub = dedent(
      """\
      >>> int_values = [1, 2, 3, 4, 5]
      >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
      >>> s = pd.Series(text_values, index=int_values)
      >>> s.info()
      <class 'pandas.core.series.Series'>
      Index: 5 entries, 1 to 5
      Series name: None
      Non-Null Count  Dtype
      --------------  -----
      5 non-null      object
      dtypes: object(1)
      memory usage: 80.0+ bytes

      Prints a summary excluding information about its values:

      >>> s.info(verbose=False)
      <class 'pandas.core.series.Series'>
      Index: 5 entries, 1 to 5
      dtypes: object(1)
      memory usage: 80.0+ bytes

      Pipe output of Series.info to buffer instead of sys.stdout, get
      buffer content and writes to a text file:

      >>> import io
      >>> buffer = io.StringIO()
      >>> s.info(buf=buffer)
      >>> s = buffer.getvalue()
      >>> with open("df_info.txt", "w",
      ...           encoding="utf-8") as f:  # doctest: +SKIP
      ...     f.write(s)
      260

      The `memory_usage` parameter allows deep introspection mode, specially
      useful for big Series and fine-tune memory optimization:

      >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
      >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
      >>> s.info()
      <class 'pandas.core.series.Series'>
      RangeIndex: 1000000 entries, 0 to 999999
      Series name: None
      Non-Null Count    Dtype
      --------------    -----
      1000000 non-null  object
      dtypes: object(1)
      memory usage: 7.6+ MB

      >>> s.info(memory_usage='deep')
      <class 'pandas.core.series.Series'>
      RangeIndex: 1000000 entries, 0 to 999999
      Series name: None
      Non-Null Count    Dtype
      --------------    -----
      1000000 non-null  object
      dtypes: object(1)
      memory usage: 55.3 MB"""
  )
  # "See Also" entries for Series.info, substituted into the
  # ``{see_also_sub}`` placeholder of INFO_DOCSTRING.
  series_see_also_sub = dedent(
      """\
      Series.describe: Generate descriptive statistics of Series.
      Series.memory_usage: Memory usage of Series."""
  )
  # Values for every placeholder of INFO_DOCSTRING when it documents
  # Series.info. ``type_sub`` and ``max_cols_sub`` are empty for the Series
  # variant, and a ``versionadded`` directive is inserted.
  series_sub_kwargs = {
      "klass": "Series",
      "type_sub": "",
      "max_cols_sub": "",
      "show_counts_sub": show_counts_sub,
      "examples_sub": series_examples_sub,
      "see_also_sub": series_see_also_sub,
      "version_added_sub": "\n.. versionadded:: 1.4.0\n",
  }
  # Docstring template shared by DataFrame.info and Series.info. Placeholders
  # are filled from ``frame_sub_kwargs`` / ``series_sub_kwargs``.
  # The backslash after ``{version_added_sub}`` joins it with the following
  # empty line, so an empty substitution leaves exactly one blank line before
  # the "Parameters" section.
  INFO_DOCSTRING = dedent(
      """
      Print a concise summary of a {klass}.

      This method prints information about a {klass} including
      the index dtype{type_sub}, non-null values and memory usage.
      {version_added_sub}\

      Parameters
      ----------
      verbose : bool, optional
          Whether to print the full summary. By default, the setting in
          ``pandas.options.display.max_info_columns`` is followed.
      buf : writable buffer, defaults to sys.stdout
          Where to send the output. By default, the output is printed to
          sys.stdout. Pass a writable buffer if you need to further process
          the output.
      {max_cols_sub}
      memory_usage : bool, str, optional
          Specifies whether total memory usage of the {klass}
          elements (including the index) should be displayed. By default,
          this follows the ``pandas.options.display.memory_usage`` setting.

          True always show memory usage. False never shows memory usage.
          A value of 'deep' is equivalent to "True with deep introspection".
          Memory usage is shown in human-readable units (base-2
          representation). Without deep introspection a memory estimation is
          made based in column dtype and number of rows assuming values
          consume the same memory amount for corresponding dtypes. With deep
          memory introspection, a real memory usage calculation is performed
          at the cost of computational resources. See the
          :ref:`Frequently Asked Questions <df-memory-usage>` for more
          details.
      {show_counts_sub}

      Returns
      -------
      None
          This method prints a summary of a {klass} and returns None.

      See Also
      --------
      {see_also_sub}

      Examples
      --------
      {examples_sub}
      """
  )
  245. def _put_str(s: str | Dtype, space: int) -> str:
  246. """
  247. Make string of specified length, padding to the right if necessary.
  248. Parameters
  249. ----------
  250. s : Union[str, Dtype]
  251. String to be formatted.
  252. space : int
  253. Length to force string to be of.
  254. Returns
  255. -------
  256. str
  257. String coerced to given length.
  258. Examples
  259. --------
  260. >>> pd.io.formats.info._put_str("panda", 6)
  261. 'panda '
  262. >>> pd.io.formats.info._put_str("panda", 4)
  263. 'pand'
  264. """
  265. return str(s)[:space].ljust(space)
  266. def _sizeof_fmt(num: float, size_qualifier: str) -> str:
  267. """
  268. Return size in human readable format.
  269. Parameters
  270. ----------
  271. num : int
  272. Size in bytes.
  273. size_qualifier : str
  274. Either empty, or '+' (if lower bound).
  275. Returns
  276. -------
  277. str
  278. Size in human readable format.
  279. Examples
  280. --------
  281. >>> _sizeof_fmt(23028, '')
  282. '22.5 KB'
  283. >>> _sizeof_fmt(23028, '+')
  284. '22.5+ KB'
  285. """
  286. for x in ["bytes", "KB", "MB", "GB", "TB"]:
  287. if num < 1024.0:
  288. return f"{num:3.1f}{size_qualifier} {x}"
  289. num /= 1024.0
  290. return f"{num:3.1f}{size_qualifier} PB"
  291. def _initialize_memory_usage(
  292. memory_usage: bool | str | None = None,
  293. ) -> bool | str:
  294. """Get memory usage based on inputs and display options."""
  295. if memory_usage is None:
  296. memory_usage = get_option("display.memory_usage")
  297. return memory_usage
  class BaseInfo(ABC):
      """
      Common interface of DataFrameInfo and SeriesInfo.

      Parameters
      ----------
      data : DataFrame or Series
          Either dataframe or series.
      memory_usage : bool or str, optional
          If "deep", introspect the data deeply by interrogating object dtypes
          for system-level memory consumption, and include it in the returned
          values.
      """

      data: DataFrame | Series
      memory_usage: bool | str

      @property
      @abstractmethod
      def dtypes(self) -> Iterable[Dtype]:
          """
          Dtypes.

          Returns
          -------
          dtypes : sequence
              Dtype of each of the DataFrame's columns (or one series column).
          """

      @property
      @abstractmethod
      def dtype_counts(self) -> Mapping[str, int]:
          """Mapping from dtype name to the number of occurrences."""

      @property
      @abstractmethod
      def non_null_counts(self) -> Sequence[int]:
          """Non-null counts for all columns, or for the one column of a series."""

      @property
      @abstractmethod
      def memory_usage_bytes(self) -> int:
          """
          Memory usage in bytes.

          Returns
          -------
          memory_usage_bytes : int
              Object's total memory usage in bytes.
          """

      @property
      def memory_usage_string(self) -> str:
          """Human readable memory usage, terminated by a newline."""
          n_bytes = self.memory_usage_bytes
          qualifier = self.size_qualifier
          return f"{_sizeof_fmt(n_bytes, qualifier)}\n"

      @property
      def size_qualifier(self) -> str:
          """
          Marker appended to the memory figure: "+" for a lower bound, else "".

          The marker is only used when memory usage is enabled and is not
          "deep".
          """
          if not self.memory_usage or self.memory_usage == "deep":
              return ""
          # size_qualifier is just a best effort; not guaranteed to catch
          # all cases (e.g., it misses categorical data even with object
          # categories)
          if "object" in self.dtype_counts:
              return "+"
          if self.data.index._is_memory_usage_qualified():
              return "+"
          return ""

      @abstractmethod
      def render(
          self,
          *,
          buf: WriteBuffer[str] | None,
          max_cols: int | None,
          verbose: bool | None,
          show_counts: bool | None,
      ) -> None:
          """Write the summary to `buf`; implemented by subclasses."""
  class DataFrameInfo(BaseInfo):
      """
      Info provider for a DataFrame.

      Parameters
      ----------
      data : DataFrame
          Frame to summarize.
      memory_usage : bool or str, optional
          Memory usage setting; None falls back to the display option.
      """

      def __init__(
          self,
          data: DataFrame,
          memory_usage: bool | str | None = None,
      ) -> None:
          self.data: DataFrame = data
          self.memory_usage = _initialize_memory_usage(memory_usage)

      @property
      def dtype_counts(self) -> Mapping[str, int]:
          """Mapping from dtype name to the number of columns of that dtype."""
          return _get_dataframe_dtype_counts(self.data)

      @property
      def dtypes(self) -> Iterable[Dtype]:
          """
          Dtypes.

          Returns
          -------
          dtypes
              Dtype of each of the DataFrame's columns.
          """
          return self.data.dtypes

      @property
      def ids(self) -> Index:
          """
          Column names.

          Returns
          -------
          ids : Index
              DataFrame's column names.
          """
          return self.data.columns

      @property
      def col_count(self) -> int:
          """Number of columns to be summarized."""
          return len(self.ids)

      @property
      def non_null_counts(self) -> Sequence[int]:
          """Non-null count of every column, as returned by ``DataFrame.count``."""
          return self.data.count()

      @property
      def memory_usage_bytes(self) -> int:
          """Sum of the per-column and index memory usage, in bytes."""
          per_item = self.data.memory_usage(
              index=True, deep=self.memory_usage == "deep"
          )
          return per_item.sum()

      def render(
          self,
          *,
          buf: WriteBuffer[str] | None,
          max_cols: int | None,
          verbose: bool | None,
          show_counts: bool | None,
      ) -> None:
          """Write the DataFrame summary to `buf` via DataFrameInfoPrinter."""
          DataFrameInfoPrinter(
              info=self,
              max_cols=max_cols,
              verbose=verbose,
              show_counts=show_counts,
          ).to_buffer(buf)
  class SeriesInfo(BaseInfo):
      """
      Info provider for a Series.

      Parameters
      ----------
      data : Series
          Series to summarize.
      memory_usage : bool or str, optional
          Memory usage setting; None falls back to the display option.
      """

      def __init__(
          self,
          data: Series,
          memory_usage: bool | str | None = None,
      ) -> None:
          self.data: Series = data
          self.memory_usage = _initialize_memory_usage(memory_usage)

      @property
      def non_null_counts(self) -> Sequence[int]:
          """One-element list holding the series' non-null count."""
          return [self.data.count()]

      @property
      def dtypes(self) -> Iterable[Dtype]:
          """One-element list holding the series' dtype."""
          return [self.data.dtypes]

      @property
      def dtype_counts(self) -> Mapping[str, int]:
          """Dtype counts, computed by viewing the series as a one-column frame."""
          # Imported here rather than at module level, where DataFrame is only
          # available for type checking.
          from pandas.core.frame import DataFrame

          as_frame = DataFrame(self.data)
          return _get_dataframe_dtype_counts(as_frame)

      @property
      def memory_usage_bytes(self) -> int:
          """Memory usage in bytes.

          Returns
          -------
          memory_usage_bytes : int
              Object's total memory usage in bytes.
          """
          return self.data.memory_usage(
              index=True, deep=self.memory_usage == "deep"
          )

      def render(
          self,
          *,
          buf: WriteBuffer[str] | None = None,
          max_cols: int | None = None,
          verbose: bool | None = None,
          show_counts: bool | None = None,
      ) -> None:
          """
          Write the Series summary to `buf` via SeriesInfoPrinter.

          Raises
          ------
          ValueError
              If `max_cols` is given; it only applies to DataFrame.info.
          """
          if max_cols is not None:
              raise ValueError(
                  "Argument `max_cols` can only be passed "
                  "in DataFrame.info, not Series.info"
              )
          SeriesInfoPrinter(
              info=self,
              verbose=verbose,
              show_counts=show_counts,
          ).to_buffer(buf)
  class InfoPrinterAbstract(ABC):
      """
      Class for printing dataframe or series info.

      Derives from ``ABC`` so that ``_create_table_builder`` is enforced as
      abstract: the base class itself cannot be instantiated, and subclasses
      must provide the builder factory.
      """

      def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None:
          """
          Build the info lines and write them to a buffer.

          Parameters
          ----------
          buf : writable buffer, optional
              Destination of the output. ``sys.stdout`` is used when None.
          """
          table_builder = self._create_table_builder()
          lines = table_builder.get_lines()
          if buf is None:  # pragma: no cover
              buf = sys.stdout
          fmt.buffer_put_lines(buf, lines)

      @abstractmethod
      def _create_table_builder(self) -> TableBuilderAbstract:
          """Create instance of table builder."""
  class DataFrameInfoPrinter(InfoPrinterAbstract):
      """
      Class for printing dataframe info.

      Parameters
      ----------
      info : DataFrameInfo
          Instance of DataFrameInfo.
      max_cols : int, optional
          When to switch from the verbose to the truncated output.
      verbose : bool, optional
          Whether to print the full summary.
      show_counts : bool, optional
          Whether to show the non-null counts.
      """

      def __init__(
          self,
          info: DataFrameInfo,
          max_cols: int | None = None,
          verbose: bool | None = None,
          show_counts: bool | None = None,
      ) -> None:
          self.info = info
          self.data = info.data
          self.verbose = verbose
          # max_cols must be resolved first: the default for show_counts reads
          # it through ``exceeds_info_cols``.
          self.max_cols = self._initialize_max_cols(max_cols)
          self.show_counts = self._initialize_show_counts(show_counts)

      @property
      def max_rows(self) -> int:
          """
          Maximum number of rows for which counts are shown by default.

          Taken from the ``display.max_info_rows`` option. When the option
          value is None, one more than the number of rows is returned so that
          the frame never exceeds the limit.
          """
          # The second positional parameter of get_option is ``silent``, not a
          # fallback value, so the fallback is applied explicitly here.
          option_value = get_option("display.max_info_rows")
          if option_value is None:
              return len(self.data) + 1
          return option_value

      @property
      def exceeds_info_cols(self) -> bool:
          """Check if number of columns to be summarized exceeds the maximum."""
          return bool(self.col_count > self.max_cols)

      @property
      def exceeds_info_rows(self) -> bool:
          """Check if number of rows to be summarized exceeds the maximum."""
          return bool(len(self.data) > self.max_rows)

      @property
      def col_count(self) -> int:
          """Number of columns to be summarized."""
          return self.info.col_count

      def _initialize_max_cols(self, max_cols: int | None) -> int:
          """
          Resolve the column limit.

          An explicit `max_cols` wins; otherwise the
          ``display.max_info_columns`` option is used. When the option value
          is None, one more than the number of columns is returned so that
          the frame never exceeds the limit.
          """
          if max_cols is not None:
              return max_cols
          # See ``max_rows``: get_option has no fallback parameter.
          option_value = get_option("display.max_info_columns")
          if option_value is None:
              return self.col_count + 1
          return option_value

      def _initialize_show_counts(self, show_counts: bool | None) -> bool:
          """
          Resolve whether non-null counts are shown.

          An explicit `show_counts` wins; otherwise counts are shown only if
          the frame exceeds neither the column nor the row limit.
          """
          if show_counts is not None:
              return show_counts
          return bool(not self.exceeds_info_cols and not self.exceeds_info_rows)

      def _create_table_builder(self) -> DataFrameTableBuilder:
          """
          Create instance of table builder based on verbosity and display settings.
          """
          if self.verbose:
              return DataFrameTableBuilderVerbose(
                  info=self.info,
                  with_counts=self.show_counts,
              )
          # ``verbose is False`` means the caller asked for the short form;
          # otherwise the column limit decides.
          if self.verbose is False or self.exceeds_info_cols:
              return DataFrameTableBuilderNonVerbose(info=self.info)
          return DataFrameTableBuilderVerbose(
              info=self.info,
              with_counts=self.show_counts,
          )
  class SeriesInfoPrinter(InfoPrinterAbstract):
      """Class for printing series info.

      Parameters
      ----------
      info : SeriesInfo
          Instance of SeriesInfo.
      verbose : bool, optional
          Whether to print the full summary.
      show_counts : bool, optional
          Whether to show the non-null counts.
      """

      def __init__(
          self,
          info: SeriesInfo,
          verbose: bool | None = None,
          show_counts: bool | None = None,
      ) -> None:
          self.info = info
          self.data = info.data
          self.verbose = verbose
          self.show_counts = self._initialize_show_counts(show_counts)

      def _create_table_builder(self) -> SeriesTableBuilder:
          """
          Pick the table builder: verbose unless `verbose` is falsy and not None.
          """
          if not (self.verbose or self.verbose is None):
              return SeriesTableBuilderNonVerbose(info=self.info)
          return SeriesTableBuilderVerbose(
              info=self.info,
              with_counts=self.show_counts,
          )

      def _initialize_show_counts(self, show_counts: bool | None) -> bool:
          """Return `show_counts`, treating None as True."""
          return True if show_counts is None else show_counts
  class TableBuilderAbstract(ABC):
      """
      Abstract builder for info table.

      Subclasses collect output in ``_lines`` and read everything they need
      from ``info``.
      """

      _lines: list[str]
      info: BaseInfo

      @abstractmethod
      def get_lines(self) -> list[str]:
          """Product in a form of list of lines (strings)."""

      @property
      def data(self) -> DataFrame | Series:
          """Object being summarized."""
          return self.info.data

      @property
      def dtypes(self) -> Iterable[Dtype]:
          """Dtypes of each of the DataFrame's columns."""
          return self.info.dtypes

      @property
      def dtype_counts(self) -> Mapping[str, int]:
          """Mapping dtype - number of counts."""
          return self.info.dtype_counts

      @property
      def display_memory_usage(self) -> bool:
          """Whether to display memory usage."""
          return bool(self.info.memory_usage)

      @property
      def memory_usage_string(self) -> str:
          """Memory usage string with proper size qualifier."""
          return self.info.memory_usage_string

      @property
      def non_null_counts(self) -> Sequence[int]:
          """Non-null counts, as provided by ``info``."""
          return self.info.non_null_counts

      def add_object_type_line(self) -> None:
          """Add line with string representation of the data's type to the table."""
          type_repr = str(type(self.data))
          self._lines.append(type_repr)

      def add_index_range_line(self) -> None:
          """Add line with range of indices to the table."""
          index_summary = self.data.index._summary()
          self._lines.append(index_summary)

      def add_dtypes_line(self) -> None:
          """Add summary line with dtypes present in dataframe."""
          # Entries are sorted so the line does not depend on mapping order.
          summary = ", ".join(
              f"{name}({count:d})"
              for name, count in sorted(self.dtype_counts.items())
          )
          self._lines.append(f"dtypes: {summary}")
  class DataFrameTableBuilder(TableBuilderAbstract):
      """
      Abstract builder for dataframe info table.

      Parameters
      ----------
      info : DataFrameInfo.
          Instance of DataFrameInfo.
      """

      def __init__(self, *, info: DataFrameInfo) -> None:
          self.info: DataFrameInfo = info

      def get_lines(self) -> list[str]:
          """Build and return the info lines, starting from an empty list."""
          self._lines = []
          fill = (
              self._fill_empty_info
              if self.col_count == 0
              else self._fill_non_empty_info
          )
          fill()
          return self._lines

      def _fill_empty_info(self) -> None:
          """Add lines to the info table, pertaining to empty dataframe."""
          self.add_object_type_line()
          self.add_index_range_line()
          class_name = type(self.data).__name__
          self._lines.append(f"Empty {class_name}\n")

      @abstractmethod
      def _fill_non_empty_info(self) -> None:
          """Add lines to the info table, pertaining to non-empty dataframe."""

      @property
      def data(self) -> DataFrame:
          """DataFrame."""
          return self.info.data

      @property
      def ids(self) -> Index:
          """Dataframe columns."""
          return self.info.ids

      @property
      def col_count(self) -> int:
          """Number of dataframe columns to be summarized."""
          return self.info.col_count

      def add_memory_usage_line(self) -> None:
          """Add line containing memory usage."""
          usage = self.memory_usage_string
          self._lines.append(f"memory usage: {usage}")
  class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder):
      """
      Dataframe info table builder for non-verbose output.
      """

      def _fill_non_empty_info(self) -> None:
          """Add lines to the info table, pertaining to non-empty dataframe."""
          for add_line in (
              self.add_object_type_line,
              self.add_index_range_line,
              self.add_columns_summary_line,
              self.add_dtypes_line,
          ):
              add_line()
          if self.display_memory_usage:
              self.add_memory_usage_line()

      def add_columns_summary_line(self) -> None:
          """Add the one-line summary of the columns index."""
          columns_summary = self.ids._summary(name="Columns")
          self._lines.append(columns_summary)
  class TableBuilderVerboseMixin(TableBuilderAbstract):
      """
      Mixin for verbose info output.

      Provides the header, separator and body lines of the per-column table.
      Users of the mixin set ``strrows``, ``gross_column_widths`` and
      ``with_counts``.
      """

      SPACING: str = " " * 2
      strrows: Sequence[Sequence[str]]
      gross_column_widths: Sequence[int]
      with_counts: bool

      @property
      @abstractmethod
      def headers(self) -> Sequence[str]:
          """Headers names of the columns in verbose table."""

      @property
      def header_column_widths(self) -> Sequence[int]:
          """Widths of header columns (only titles)."""
          return [len(title) for title in self.headers]

      def _get_gross_column_widths(self) -> Sequence[int]:
          """Get widths of columns containing both headers and actual content."""
          body_widths = self._get_body_column_widths()
          return [
              max(header_width, body_width)
              for header_width, body_width in zip(
                  self.header_column_widths, body_widths
              )
          ]

      def _get_body_column_widths(self) -> Sequence[int]:
          """Get widths of table content columns."""
          # Transpose the rows so that each tuple holds one column's cells.
          return [max(map(len, column)) for column in zip(*self.strrows)]

      def _gen_rows(self) -> Iterator[Sequence[str]]:
          """
          Return an iterator over the table rows.

          Each element represents a row comprising a sequence of strings.
          """
          return (
              self._gen_rows_with_counts()
              if self.with_counts
              else self._gen_rows_without_counts()
          )

      @abstractmethod
      def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
          """Iterator with string representation of body data with counts."""

      @abstractmethod
      def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
          """Iterator with string representation of body data without counts."""

      def add_header_line(self) -> None:
          """Add the line holding the column titles."""
          cells = [
              _put_str(title, width)
              for title, width in zip(self.headers, self.gross_column_widths)
          ]
          self._lines.append(self.SPACING.join(cells))

      def add_separator_line(self) -> None:
          """Add the dashed line; each run of dashes is as long as its title."""
          cells = [
              _put_str("-" * title_width, width)
              for title_width, width in zip(
                  self.header_column_widths, self.gross_column_widths
              )
          ]
          self._lines.append(self.SPACING.join(cells))

      def add_body_lines(self) -> None:
          """Add one line per row of ``strrows``."""
          for row in self.strrows:
              cells = [
                  _put_str(cell, width)
                  for cell, width in zip(row, self.gross_column_widths)
              ]
              self._lines.append(self.SPACING.join(cells))

      def _gen_non_null_counts(self) -> Iterator[str]:
          """Iterator with string representation of non-null counts."""
          yield from (f"{count} non-null" for count in self.non_null_counts)

      def _gen_dtypes(self) -> Iterator[str]:
          """Iterator with string representation of column dtypes."""
          yield from map(pprint_thing, self.dtypes)
  class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin):
      """
      Dataframe info table builder for verbose output.

      Parameters
      ----------
      info : DataFrameInfo
          Instance of DataFrameInfo.
      with_counts : bool
          Whether the table has a non-null count column.
      """

      def __init__(
          self,
          *,
          info: DataFrameInfo,
          with_counts: bool,
      ) -> None:
          self.info = info
          self.with_counts = with_counts
          # Rows are materialised once; the column widths are derived from them.
          self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
          self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

      def _fill_non_empty_info(self) -> None:
          """Add lines to the info table, pertaining to non-empty dataframe."""
          for add_line in (
              self.add_object_type_line,
              self.add_index_range_line,
              self.add_columns_summary_line,
              self.add_header_line,
              self.add_separator_line,
              self.add_body_lines,
              self.add_dtypes_line,
          ):
              add_line()
          if self.display_memory_usage:
              self.add_memory_usage_line()

      @property
      def headers(self) -> Sequence[str]:
          """Headers names of the columns in verbose table."""
          titles = [" # ", "Column", "Dtype"]
          if self.with_counts:
              # The count column sits between the name and the dtype.
              titles.insert(2, "Non-Null Count")
          return titles

      def add_columns_summary_line(self) -> None:
          """Add the line announcing the total number of columns."""
          total = self.col_count
          self._lines.append(f"Data columns (total {total} columns):")

      def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
          """Iterator with string representation of body data without counts."""
          rows = zip(
              self._gen_line_numbers(),
              self._gen_columns(),
              self._gen_dtypes(),
          )
          yield from rows

      def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
          """Iterator with string representation of body data with counts."""
          rows = zip(
              self._gen_line_numbers(),
              self._gen_columns(),
              self._gen_non_null_counts(),
              self._gen_dtypes(),
          )
          yield from rows

      def _gen_line_numbers(self) -> Iterator[str]:
          """Iterator with string representation of column numbers."""
          yield from (f" {position}" for position, _ in enumerate(self.ids))

      def _gen_columns(self) -> Iterator[str]:
          """Iterator with string representation of column names."""
          yield from map(pprint_thing, self.ids)
  830. class SeriesTableBuilder(TableBuilderAbstract):
  831. """
  832. Abstract builder for series info table.
  833. Parameters
  834. ----------
  835. info : SeriesInfo.
  836. Instance of SeriesInfo.
  837. """
  838. def __init__(self, *, info: SeriesInfo) -> None:
  839. self.info: SeriesInfo = info
  840. def get_lines(self) -> list[str]:
  841. self._lines = []
  842. self._fill_non_empty_info()
  843. return self._lines
  844. @property
  845. def data(self) -> Series:
  846. """Series."""
  847. return self.info.data
  848. def add_memory_usage_line(self) -> None:
  849. """Add line containing memory usage."""
  850. self._lines.append(f"memory usage: {self.memory_usage_string}")
  851. @abstractmethod
  852. def _fill_non_empty_info(self) -> None:
  853. """Add lines to the info table, pertaining to non-empty series."""
  854. class SeriesTableBuilderNonVerbose(SeriesTableBuilder):
  855. """
  856. Series info table builder for non-verbose output.
  857. """
  858. def _fill_non_empty_info(self) -> None:
  859. """Add lines to the info table, pertaining to non-empty series."""
  860. self.add_object_type_line()
  861. self.add_index_range_line()
  862. self.add_dtypes_line()
  863. if self.display_memory_usage:
  864. self.add_memory_usage_line()
  865. class SeriesTableBuilderVerbose(SeriesTableBuilder, TableBuilderVerboseMixin):
  866. """
  867. Series info table builder for verbose output.
  868. """
  869. def __init__(
  870. self,
  871. *,
  872. info: SeriesInfo,
  873. with_counts: bool,
  874. ) -> None:
  875. self.info = info
  876. self.with_counts = with_counts
  877. self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
  878. self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()
  879. def _fill_non_empty_info(self) -> None:
  880. """Add lines to the info table, pertaining to non-empty series."""
  881. self.add_object_type_line()
  882. self.add_index_range_line()
  883. self.add_series_name_line()
  884. self.add_header_line()
  885. self.add_separator_line()
  886. self.add_body_lines()
  887. self.add_dtypes_line()
  888. if self.display_memory_usage:
  889. self.add_memory_usage_line()
  890. def add_series_name_line(self) -> None:
  891. self._lines.append(f"Series name: {self.data.name}")
  892. @property
  893. def headers(self) -> Sequence[str]:
  894. """Headers names of the columns in verbose table."""
  895. if self.with_counts:
  896. return ["Non-Null Count", "Dtype"]
  897. return ["Dtype"]
  898. def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
  899. """Iterator with string representation of body data without counts."""
  900. yield from self._gen_dtypes()
  901. def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
  902. """Iterator with string representation of body data with counts."""
  903. yield from zip(
  904. self._gen_non_null_counts(),
  905. self._gen_dtypes(),
  906. )
  907. def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]:
  908. """
  909. Create mapping between datatypes and their number of occurrences.
  910. """
  911. # groupby dtype.name to collect e.g. Categorical columns
  912. return df.dtypes.value_counts().groupby(lambda x: x.name).sum()