test_info.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. from io import StringIO
  2. import re
  3. from string import ascii_uppercase as uppercase
  4. import sys
  5. import textwrap
  6. import numpy as np
  7. import pytest
  8. from pandas.compat import (
  9. IS64,
  10. PYPY,
  11. )
  12. from pandas import (
  13. CategoricalIndex,
  14. DataFrame,
  15. MultiIndex,
  16. Series,
  17. date_range,
  18. option_context,
  19. )
  20. import pandas._testing as tm
  21. @pytest.fixture
  22. def duplicate_columns_frame():
  23. """Dataframe with duplicate column names."""
  24. return DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
  25. def test_info_empty():
  26. # GH #45494
  27. df = DataFrame()
  28. buf = StringIO()
  29. df.info(buf=buf)
  30. result = buf.getvalue()
  31. expected = textwrap.dedent(
  32. """\
  33. <class 'pandas.core.frame.DataFrame'>
  34. RangeIndex: 0 entries
  35. Empty DataFrame\n"""
  36. )
  37. assert result == expected
  38. def test_info_categorical_column_smoke_test():
  39. n = 2500
  40. df = DataFrame({"int64": np.random.randint(100, size=n)})
  41. df["category"] = Series(
  42. np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
  43. ).astype("category")
  44. df.isna()
  45. buf = StringIO()
  46. df.info(buf=buf)
  47. df2 = df[df["category"] == "d"]
  48. buf = StringIO()
  49. df2.info(buf=buf)
  50. @pytest.mark.parametrize(
  51. "fixture_func_name",
  52. [
  53. "int_frame",
  54. "float_frame",
  55. "datetime_frame",
  56. "duplicate_columns_frame",
  57. ],
  58. )
  59. def test_info_smoke_test(fixture_func_name, request):
  60. frame = request.getfixturevalue(fixture_func_name)
  61. buf = StringIO()
  62. frame.info(buf=buf)
  63. result = buf.getvalue().splitlines()
  64. assert len(result) > 10
  65. @pytest.mark.parametrize(
  66. "num_columns, max_info_columns, verbose",
  67. [
  68. (10, 100, True),
  69. (10, 11, True),
  70. (10, 10, True),
  71. (10, 9, False),
  72. (10, 1, False),
  73. ],
  74. )
  75. def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
  76. frame = DataFrame(np.random.randn(5, num_columns))
  77. with option_context("display.max_info_columns", max_info_columns):
  78. io_default = StringIO()
  79. frame.info(buf=io_default)
  80. result = io_default.getvalue()
  81. io_explicit = StringIO()
  82. frame.info(buf=io_explicit, verbose=verbose)
  83. expected = io_explicit.getvalue()
  84. assert result == expected
  85. def test_info_verbose_check_header_separator_body():
  86. buf = StringIO()
  87. size = 1001
  88. start = 5
  89. frame = DataFrame(np.random.randn(3, size))
  90. frame.info(verbose=True, buf=buf)
  91. res = buf.getvalue()
  92. header = " # Column Dtype \n--- ------ ----- "
  93. assert header in res
  94. frame.info(verbose=True, buf=buf)
  95. buf.seek(0)
  96. lines = buf.readlines()
  97. assert len(lines) > 0
  98. for i, line in enumerate(lines):
  99. if start <= i < start + size:
  100. line_nr = f" {i - start} "
  101. assert line.startswith(line_nr)
  102. @pytest.mark.parametrize(
  103. "size, header_exp, separator_exp, first_line_exp, last_line_exp",
  104. [
  105. (
  106. 4,
  107. " # Column Non-Null Count Dtype ",
  108. "--- ------ -------------- ----- ",
  109. " 0 0 3 non-null float64",
  110. " 3 3 3 non-null float64",
  111. ),
  112. (
  113. 11,
  114. " # Column Non-Null Count Dtype ",
  115. "--- ------ -------------- ----- ",
  116. " 0 0 3 non-null float64",
  117. " 10 10 3 non-null float64",
  118. ),
  119. (
  120. 101,
  121. " # Column Non-Null Count Dtype ",
  122. "--- ------ -------------- ----- ",
  123. " 0 0 3 non-null float64",
  124. " 100 100 3 non-null float64",
  125. ),
  126. (
  127. 1001,
  128. " # Column Non-Null Count Dtype ",
  129. "--- ------ -------------- ----- ",
  130. " 0 0 3 non-null float64",
  131. " 1000 1000 3 non-null float64",
  132. ),
  133. (
  134. 10001,
  135. " # Column Non-Null Count Dtype ",
  136. "--- ------ -------------- ----- ",
  137. " 0 0 3 non-null float64",
  138. " 10000 10000 3 non-null float64",
  139. ),
  140. ],
  141. )
  142. def test_info_verbose_with_counts_spacing(
  143. size, header_exp, separator_exp, first_line_exp, last_line_exp
  144. ):
  145. """Test header column, spacer, first line and last line in verbose mode."""
  146. frame = DataFrame(np.random.randn(3, size))
  147. with StringIO() as buf:
  148. frame.info(verbose=True, show_counts=True, buf=buf)
  149. all_lines = buf.getvalue().splitlines()
  150. # Here table would contain only header, separator and table lines
  151. # dframe repr, index summary, memory usage and dtypes are excluded
  152. table = all_lines[3:-2]
  153. header, separator, first_line, *rest, last_line = table
  154. assert header == header_exp
  155. assert separator == separator_exp
  156. assert first_line == first_line_exp
  157. assert last_line == last_line_exp
  158. def test_info_memory():
  159. # https://github.com/pandas-dev/pandas/issues/21056
  160. df = DataFrame({"a": Series([1, 2], dtype="i8")})
  161. buf = StringIO()
  162. df.info(buf=buf)
  163. result = buf.getvalue()
  164. bytes = float(df.memory_usage().sum())
  165. expected = textwrap.dedent(
  166. f"""\
  167. <class 'pandas.core.frame.DataFrame'>
  168. RangeIndex: 2 entries, 0 to 1
  169. Data columns (total 1 columns):
  170. # Column Non-Null Count Dtype
  171. --- ------ -------------- -----
  172. 0 a 2 non-null int64
  173. dtypes: int64(1)
  174. memory usage: {bytes} bytes
  175. """
  176. )
  177. assert result == expected
  178. def test_info_wide():
  179. io = StringIO()
  180. df = DataFrame(np.random.randn(5, 101))
  181. df.info(buf=io)
  182. io = StringIO()
  183. df.info(buf=io, max_cols=101)
  184. result = io.getvalue()
  185. assert len(result.splitlines()) > 100
  186. expected = result
  187. with option_context("display.max_info_columns", 101):
  188. io = StringIO()
  189. df.info(buf=io)
  190. result = io.getvalue()
  191. assert result == expected
  192. def test_info_duplicate_columns_shows_correct_dtypes():
  193. # GH11761
  194. io = StringIO()
  195. frame = DataFrame([[1, 2.0]], columns=["a", "a"])
  196. frame.info(buf=io)
  197. lines = io.getvalue().splitlines(True)
  198. assert " 0 a 1 non-null int64 \n" == lines[5]
  199. assert " 1 a 1 non-null float64\n" == lines[6]
  200. def test_info_shows_column_dtypes():
  201. dtypes = [
  202. "int64",
  203. "float64",
  204. "datetime64[ns]",
  205. "timedelta64[ns]",
  206. "complex128",
  207. "object",
  208. "bool",
  209. ]
  210. data = {}
  211. n = 10
  212. for i, dtype in enumerate(dtypes):
  213. data[i] = np.random.randint(2, size=n).astype(dtype)
  214. df = DataFrame(data)
  215. buf = StringIO()
  216. df.info(buf=buf)
  217. res = buf.getvalue()
  218. header = (
  219. " # Column Non-Null Count Dtype \n"
  220. "--- ------ -------------- ----- "
  221. )
  222. assert header in res
  223. for i, dtype in enumerate(dtypes):
  224. name = f" {i:d} {i:d} {n:d} non-null {dtype}"
  225. assert name in res
  226. def test_info_max_cols():
  227. df = DataFrame(np.random.randn(10, 5))
  228. for len_, verbose in [(5, None), (5, False), (12, True)]:
  229. # For verbose always ^ setting ^ summarize ^ full output
  230. with option_context("max_info_columns", 4):
  231. buf = StringIO()
  232. df.info(buf=buf, verbose=verbose)
  233. res = buf.getvalue()
  234. assert len(res.strip().split("\n")) == len_
  235. for len_, verbose in [(12, None), (5, False), (12, True)]:
  236. # max_cols not exceeded
  237. with option_context("max_info_columns", 5):
  238. buf = StringIO()
  239. df.info(buf=buf, verbose=verbose)
  240. res = buf.getvalue()
  241. assert len(res.strip().split("\n")) == len_
  242. for len_, max_cols in [(12, 5), (5, 4)]:
  243. # setting truncates
  244. with option_context("max_info_columns", 4):
  245. buf = StringIO()
  246. df.info(buf=buf, max_cols=max_cols)
  247. res = buf.getvalue()
  248. assert len(res.strip().split("\n")) == len_
  249. # setting wouldn't truncate
  250. with option_context("max_info_columns", 5):
  251. buf = StringIO()
  252. df.info(buf=buf, max_cols=max_cols)
  253. res = buf.getvalue()
  254. assert len(res.strip().split("\n")) == len_
  255. def test_info_memory_usage():
  256. # Ensure memory usage is displayed, when asserted, on the last line
  257. dtypes = [
  258. "int64",
  259. "float64",
  260. "datetime64[ns]",
  261. "timedelta64[ns]",
  262. "complex128",
  263. "object",
  264. "bool",
  265. ]
  266. data = {}
  267. n = 10
  268. for i, dtype in enumerate(dtypes):
  269. data[i] = np.random.randint(2, size=n).astype(dtype)
  270. df = DataFrame(data)
  271. buf = StringIO()
  272. # display memory usage case
  273. df.info(buf=buf, memory_usage=True)
  274. res = buf.getvalue().splitlines()
  275. assert "memory usage: " in res[-1]
  276. # do not display memory usage case
  277. df.info(buf=buf, memory_usage=False)
  278. res = buf.getvalue().splitlines()
  279. assert "memory usage: " not in res[-1]
  280. df.info(buf=buf, memory_usage=True)
  281. res = buf.getvalue().splitlines()
  282. # memory usage is a lower bound, so print it as XYZ+ MB
  283. assert re.match(r"memory usage: [^+]+\+", res[-1])
  284. df.iloc[:, :5].info(buf=buf, memory_usage=True)
  285. res = buf.getvalue().splitlines()
  286. # excluded column with object dtype, so estimate is accurate
  287. assert not re.match(r"memory usage: [^+]+\+", res[-1])
  288. # Test a DataFrame with duplicate columns
  289. dtypes = ["int64", "int64", "int64", "float64"]
  290. data = {}
  291. n = 100
  292. for i, dtype in enumerate(dtypes):
  293. data[i] = np.random.randint(2, size=n).astype(dtype)
  294. df = DataFrame(data)
  295. df.columns = dtypes
  296. df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
  297. df_with_object_index.info(buf=buf, memory_usage=True)
  298. res = buf.getvalue().splitlines()
  299. assert re.match(r"memory usage: [^+]+\+", res[-1])
  300. df_with_object_index.info(buf=buf, memory_usage="deep")
  301. res = buf.getvalue().splitlines()
  302. assert re.match(r"memory usage: [^+]+$", res[-1])
  303. # Ensure df size is as expected
  304. # (cols * rows * bytes) + index size
  305. df_size = df.memory_usage().sum()
  306. exp_size = len(dtypes) * n * 8 + df.index.nbytes
  307. assert df_size == exp_size
  308. # Ensure number of cols in memory_usage is the same as df
  309. size_df = np.size(df.columns.values) + 1 # index=True; default
  310. assert size_df == np.size(df.memory_usage())
  311. # assert deep works only on object
  312. assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
  313. # test for validity
  314. DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
  315. DataFrame(1, index=["a"], columns=["A"]).index.nbytes
  316. df = DataFrame(
  317. data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
  318. )
  319. df.index.nbytes
  320. df.memory_usage(index=True)
  321. df.index.values.nbytes
  322. mem = df.memory_usage(deep=True).sum()
  323. assert mem > 0
  324. @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
  325. def test_info_memory_usage_deep_not_pypy():
  326. df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
  327. assert (
  328. df_with_object_index.memory_usage(index=True, deep=True).sum()
  329. > df_with_object_index.memory_usage(index=True).sum()
  330. )
  331. df_object = DataFrame({"a": ["a"]})
  332. assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
  333. @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
  334. def test_info_memory_usage_deep_pypy():
  335. df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
  336. assert (
  337. df_with_object_index.memory_usage(index=True, deep=True).sum()
  338. == df_with_object_index.memory_usage(index=True).sum()
  339. )
  340. df_object = DataFrame({"a": ["a"]})
  341. assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
  342. @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
  343. def test_usage_via_getsizeof():
  344. df = DataFrame(
  345. data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
  346. )
  347. mem = df.memory_usage(deep=True).sum()
  348. # sys.getsizeof will call the .memory_usage with
  349. # deep=True, and add on some GC overhead
  350. diff = mem - sys.getsizeof(df)
  351. assert abs(diff) < 100
  352. def test_info_memory_usage_qualified():
  353. buf = StringIO()
  354. df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
  355. df.info(buf=buf)
  356. assert "+" not in buf.getvalue()
  357. buf = StringIO()
  358. df = DataFrame(1, columns=list("ab"), index=list("ABC"))
  359. df.info(buf=buf)
  360. assert "+" in buf.getvalue()
  361. buf = StringIO()
  362. df = DataFrame(
  363. 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)])
  364. )
  365. df.info(buf=buf)
  366. assert "+" not in buf.getvalue()
  367. buf = StringIO()
  368. df = DataFrame(
  369. 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]])
  370. )
  371. df.info(buf=buf)
  372. assert "+" in buf.getvalue()
  373. def test_info_memory_usage_bug_on_multiindex():
  374. # GH 14308
  375. # memory usage introspection should not materialize .values
  376. def memory_usage(f):
  377. return f.memory_usage(deep=True).sum()
  378. N = 100
  379. M = len(uppercase)
  380. index = MultiIndex.from_product(
  381. [list(uppercase), date_range("20160101", periods=N)],
  382. names=["id", "date"],
  383. )
  384. df = DataFrame({"value": np.random.randn(N * M)}, index=index)
  385. unstacked = df.unstack("id")
  386. assert df.values.nbytes == unstacked.values.nbytes
  387. assert memory_usage(df) > memory_usage(unstacked)
  388. # high upper bound
  389. assert memory_usage(unstacked) - memory_usage(df) < 2000
  390. def test_info_categorical():
  391. # GH14298
  392. idx = CategoricalIndex(["a", "b"])
  393. df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
  394. buf = StringIO()
  395. df.info(buf=buf)
  396. @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
  397. def test_info_int_columns():
  398. # GH#37245
  399. df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
  400. buf = StringIO()
  401. df.info(show_counts=True, buf=buf)
  402. result = buf.getvalue()
  403. expected = textwrap.dedent(
  404. """\
  405. <class 'pandas.core.frame.DataFrame'>
  406. Index: 2 entries, A to B
  407. Data columns (total 2 columns):
  408. # Column Non-Null Count Dtype
  409. --- ------ -------------- -----
  410. 0 1 2 non-null int64
  411. 1 2 2 non-null int64
  412. dtypes: int64(2)
  413. memory usage: 48.0+ bytes
  414. """
  415. )
  416. assert result == expected
  417. def test_memory_usage_empty_no_warning():
  418. # GH#50066
  419. df = DataFrame(index=["a", "b"])
  420. with tm.assert_produces_warning(None):
  421. result = df.memory_usage()
  422. expected = Series(16 if IS64 else 8, index=["Index"])
  423. tm.assert_series_equal(result, expected)