123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503 |
- from io import StringIO
- import re
- from string import ascii_uppercase as uppercase
- import sys
- import textwrap
- import numpy as np
- import pytest
- from pandas.compat import (
- IS64,
- PYPY,
- )
- from pandas import (
- CategoricalIndex,
- DataFrame,
- MultiIndex,
- Series,
- date_range,
- option_context,
- )
- import pandas._testing as tm
@pytest.fixture
def duplicate_columns_frame():
    """Provide a 1500 x 4 frame of random normals whose column labels repeat."""
    labels = ["a", "a", "b", "b"]
    values = np.random.randn(1500, 4)
    return DataFrame(values, columns=labels)
def test_info_empty():
    """Check the exact text ``DataFrame.info`` writes for a frame with no data."""
    # GH #45494
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 0 entries
        Empty DataFrame\n"""
    )

    sink = StringIO()
    DataFrame().info(buf=sink)

    assert sink.getvalue() == expected
def test_info_categorical_column_smoke_test():
    """Smoke test: ``info`` runs on a frame holding a categorical column.

    Nothing about the rendered text is asserted; the test only requires that
    ``isna`` and ``info`` complete, both on the full frame and on a
    boolean-filtered subset of it.
    """
    row_count = 2500
    frame = DataFrame({"int64": np.random.randint(100, size=row_count)})

    letters = np.array(list("abcdefghij"))
    picks = np.random.randint(0, 10, size=row_count)
    frame["category"] = Series(letters.take(picks)).astype("category")

    frame.isna()
    frame.info(buf=StringIO())

    # Same call on the rows whose category equals "d".
    only_d = frame[frame["category"] == "d"]
    only_d.info(buf=StringIO())
@pytest.mark.parametrize(
    "fixture_func_name",
    [
        "int_frame",
        "float_frame",
        "datetime_frame",
        "duplicate_columns_frame",
    ],
)
def test_info_smoke_test(fixture_func_name, request):
    """Smoke test: ``info`` on each named fixture frame prints more than 10 lines.

    The frame is looked up by fixture name through ``request`` so a single
    test body can be parametrized over several fixtures.
    """
    frame = request.getfixturevalue(fixture_func_name)
    sink = StringIO()
    frame.info(buf=sink)
    printed_lines = sink.getvalue().splitlines()
    assert len(printed_lines) > 10
@pytest.mark.parametrize(
    "num_columns, max_info_columns, verbose",
    [
        (10, 100, True),
        (10, 11, True),
        (10, 10, True),
        (10, 9, False),
        (10, 1, False),
    ],
)
def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
    """``info()`` without ``verbose`` must match ``info(verbose=<expected>)``.

    ``verbose`` is the mode the default call is expected to pick for the given
    ``display.max_info_columns`` setting; both calls are rendered under that
    setting and their text compared.
    """
    frame = DataFrame(np.random.randn(5, num_columns))

    def render(**info_kwargs):
        out = StringIO()
        frame.info(buf=out, **info_kwargs)
        return out.getvalue()

    with option_context("display.max_info_columns", max_info_columns):
        default_text = render()
        explicit_text = render(verbose=verbose)
        assert default_text == explicit_text
def test_info_verbose_check_header_separator_body():
    """Verbose ``info`` on a 1001-column frame prints header, separator and numbered rows.

    The expected header/separator literal keeps the column padding that
    ``info`` emits: columns are separated by a two-space gutter and each
    column is left-justified to the wider of its header and its widest body
    cell, so the ``#`` column is five characters wide here (" 1000").
    """
    size = 1001
    # Body rows start after the class line, the index summary, the
    # "Data columns" line, the header and the separator.
    start = 5
    frame = DataFrame(np.random.randn(3, size))

    buf = StringIO()
    frame.info(verbose=True, buf=buf)

    res = buf.getvalue()
    header = " #     Column  Dtype  \n---    ------  -----  "
    assert header in res

    # A second report is appended to the same buffer; only the first
    # report's body rows fall inside the index window checked below.
    frame.info(verbose=True, buf=buf)
    buf.seek(0)
    lines = buf.readlines()
    assert len(lines) > 0

    for i, line in enumerate(lines):
        if start <= i < start + size:
            line_nr = f" {i - start} "
            assert line.startswith(line_nr)
@pytest.mark.parametrize(
    "size, header_exp, separator_exp, first_line_exp, last_line_exp",
    [
        (
            4,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 3   3       3 non-null      float64",
        ),
        (
            11,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 10  10      3 non-null      float64",
        ),
        (
            101,
            " #    Column  Non-Null Count  Dtype  ",
            "---   ------  --------------  -----  ",
            " 0    0       3 non-null      float64",
            " 100  100     3 non-null      float64",
        ),
        (
            1001,
            " #     Column  Non-Null Count  Dtype  ",
            "---    ------  --------------  -----  ",
            " 0     0       3 non-null      float64",
            " 1000  1000    3 non-null      float64",
        ),
        (
            10001,
            " #      Column  Non-Null Count  Dtype  ",
            "---     ------  --------------  -----  ",
            " 0      0       3 non-null      float64",
            " 10000  10000   3 non-null      float64",
        ),
    ],
)
def test_info_verbose_with_counts_spacing(
    size, header_exp, separator_exp, first_line_exp, last_line_exp
):
    """Test header column, spacer, first line and last line in verbose mode.

    Each case gives the number of columns and the exact, padding-sensitive
    text expected for the table header, the dashed separator, and the first
    and last body rows. The ``#`` column grows with the number of digits in
    the largest row number, which is what the cases step through.
    """
    frame = DataFrame(np.random.randn(3, size))
    with StringIO() as buf:
        frame.info(verbose=True, show_counts=True, buf=buf)
        all_lines = buf.getvalue().splitlines()
    # Here table would contain only header, separator and table lines
    # dframe repr, index summary, memory usage and dtypes are excluded
    table = all_lines[3:-2]
    header, separator, first_line, *_, last_line = table
    assert header == header_exp
    assert separator == separator_exp
    assert first_line == first_line_exp
    assert last_line == last_line_exp
def test_info_memory():
    """Check the full ``info`` report, including the memory line, for one int64 column.

    The expected memory figure is taken from ``DataFrame.memory_usage`` and
    interpolated as a float, so the report must print it with one decimal and
    a ``bytes`` unit. The table rows keep ``info``'s column padding (two-space
    gutter, left-justified cells).
    """
    # https://github.com/pandas-dev/pandas/issues/21056
    df = DataFrame({"a": Series([1, 2], dtype="i8")})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    # Named ``mem_bytes`` so the builtin ``bytes`` is not shadowed.
    mem_bytes = float(df.memory_usage().sum())
    expected = textwrap.dedent(
        f"""\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   a       2 non-null      int64
        dtypes: int64(1)
        memory usage: {mem_bytes} bytes
        """
    )
    assert result == expected
def test_info_wide():
    """``max_cols=101`` and ``display.max_info_columns=101`` give the same report.

    A 101-column frame is rendered once with default settings (only to check
    the call succeeds), once with the ``max_cols`` argument, and once with the
    option set; the last two outputs must be equal and longer than 100 lines.
    """
    wide = DataFrame(np.random.randn(5, 101))

    # Default settings: the output is not inspected.
    wide.info(buf=StringIO())

    via_argument = StringIO()
    wide.info(buf=via_argument, max_cols=101)
    expected = via_argument.getvalue()
    assert len(expected.splitlines()) > 100

    with option_context("display.max_info_columns", 101):
        via_option = StringIO()
        wide.info(buf=via_option)
        assert via_option.getvalue() == expected
def test_info_duplicate_columns_shows_correct_dtypes():
    """Two columns sharing the label "a" are each reported with their own dtype.

    The expected rows keep ``info``'s padding: the Dtype column is as wide as
    "float64", so the "int64" row ends with two trailing spaces.
    """
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    # keepends=True so each line can be compared including its newline
    lines = io.getvalue().splitlines(True)
    assert " 0   a       1 non-null      int64  \n" == lines[5]
    assert " 1   a       1 non-null      float64\n" == lines[6]
def test_info_shows_column_dtypes():
    """Every column's dtype name appears in its ``info`` row.

    One column is built per dtype in ``dtypes`` (column label = position).
    The expected header and rows keep ``info``'s padding: two-space gutter,
    cells left-justified, and the Dtype column as wide as the longest dtype
    name ("timedelta64[ns]", 15 characters).
    """
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    data = {
        i: np.random.randint(2, size=n).astype(dtype) for i, dtype in enumerate(dtypes)
    }
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    header = (
        " #   Column  Non-Null Count  Dtype          \n"
        "---  ------  --------------  -----          "
    )
    assert header in res
    for i, dtype in enumerate(dtypes):
        # "<n> non-null" is 11 characters here, padded to the 14-wide header
        name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
        assert name in res
def test_info_max_cols():
    """Line counts of ``info`` for a 5-column frame under different column limits.

    A report of 5 lines is the summarized form and 12 lines the full
    per-column form; the cases pin which form is produced for combinations of
    the ``max_info_columns`` option, ``verbose`` and ``max_cols``.
    """
    frame = DataFrame(np.random.randn(10, 5))

    def report_length(**info_kwargs):
        out = StringIO()
        frame.info(buf=out, **info_kwargs)
        return len(out.getvalue().strip().split("\n"))

    # Option (4) below the column count: the default and verbose=False give
    # the short report, verbose=True gives the full one.
    exceeded_cases = [(5, None), (5, False), (12, True)]
    for len_, verbose in exceeded_cases:
        with option_context("max_info_columns", 4):
            assert report_length(verbose=verbose) == len_

    # max_cols not exceeded
    within_cases = [(12, None), (5, False), (12, True)]
    for len_, verbose in within_cases:
        with option_context("max_info_columns", 5):
            assert report_length(verbose=verbose) == len_

    # The same length is expected for an explicit max_cols under either option value.
    for len_, max_cols in [(12, 5), (5, 4)]:
        # setting truncates
        with option_context("max_info_columns", 4):
            assert report_length(max_cols=max_cols) == len_

        # setting wouldn't truncate
        with option_context("max_info_columns", 5):
            assert report_length(max_cols=max_cols) == len_
def test_info_memory_usage():
    """Exercise the memory-usage line of ``info`` and ``DataFrame.memory_usage``.

    All ``info`` calls append to one shared buffer, so each check looks at the
    last line currently in that buffer.
    """
    # Ensure memory usage is displayed, when asserted, on the last line
    dtype_names = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    df = DataFrame(
        {
            pos: np.random.randint(2, size=n).astype(name)
            for pos, name in enumerate(dtype_names)
        }
    )
    buf = StringIO()

    def last_line():
        return buf.getvalue().splitlines()[-1]

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    assert "memory usage: " in last_line()

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    assert "memory usage: " not in last_line()

    # memory usage is a lower bound, so print it as XYZ+ MB
    df.info(buf=buf, memory_usage=True)
    assert re.match(r"memory usage: [^+]+\+", last_line())

    # excluded column with object dtype, so estimate is accurate
    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    assert not re.match(r"memory usage: [^+]+\+", last_line())

    # Test a DataFrame with duplicate columns
    dup_labels = ["int64", "int64", "int64", "float64"]
    n = 100
    df = DataFrame(
        {
            pos: np.random.randint(2, size=n).astype(name)
            for pos, name in enumerate(dup_labels)
        }
    )
    df.columns = dup_labels

    string_indexed = DataFrame({"a": [1]}, index=["foo"])
    string_indexed.info(buf=buf, memory_usage=True)
    assert re.match(r"memory usage: [^+]+\+", last_line())

    string_indexed.info(buf=buf, memory_usage="deep")
    assert re.match(r"memory usage: [^+]+$", last_line())

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    measured = df.memory_usage().sum()
    assert measured == len(dup_labels) * n * 8 + df.index.nbytes

    # Ensure number of cols in memory_usage is the same as df
    entries_with_index = np.size(df.columns.values) + 1  # index=True; default
    assert entries_with_index == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity
    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    assert df.memory_usage(deep=True).sum() > 0
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
    """Outside PyPy, ``deep=True`` reports more memory than the shallow figure.

    Checked for a frame with a string index and for a frame with a string column.
    """
    string_indexed = DataFrame({"a": [1]}, index=["foo"])
    deep_total = string_indexed.memory_usage(index=True, deep=True).sum()
    shallow_total = string_indexed.memory_usage(index=True).sum()
    assert deep_total > shallow_total

    string_column = DataFrame({"a": ["a"]})
    deep_total = string_column.memory_usage(deep=True).sum()
    shallow_total = string_column.memory_usage().sum()
    assert deep_total > shallow_total
@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
    """On PyPy, ``deep=True`` reports the same memory as the shallow figure.

    Marked as an expected failure on every other interpreter.
    """
    string_indexed = DataFrame({"a": [1]}, index=["foo"])
    deep_total = string_indexed.memory_usage(index=True, deep=True).sum()
    shallow_total = string_indexed.memory_usage(index=True).sum()
    assert deep_total == shallow_total

    string_column = DataFrame({"a": ["a"]})
    deep_total = string_column.memory_usage(deep=True).sum()
    shallow_total = string_column.memory_usage().sum()
    assert deep_total == shallow_total
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
    """``sys.getsizeof`` of a frame stays within 100 bytes of its deep memory usage."""
    index = MultiIndex.from_product([["a"], range(1000)])
    frame = DataFrame(data=1, index=index, columns=["A"])
    deep_total = frame.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    gap = deep_total - sys.getsizeof(frame)
    assert abs(gap) < 100
def test_info_memory_usage_qualified():
    """The ``+`` qualifier in the ``info`` report depends on the index contents.

    Integer-only indexes (flat or MultiIndex) must not produce a ``+``;
    indexes holding strings must.
    """

    def report_for(index):
        out = StringIO()
        frame = DataFrame(1, columns=list("ab"), index=index)
        frame.info(buf=out)
        return out.getvalue()

    assert "+" not in report_for([1, 2, 3])
    assert "+" in report_for(list("ABC"))

    int_levels = MultiIndex.from_product([range(3), range(3)])
    assert "+" not in report_for(int_levels)

    mixed_levels = MultiIndex.from_product([range(3), ["foo", "bar"]])
    assert "+" in report_for(mixed_levels)
def test_info_memory_usage_bug_on_multiindex():
    """Compare deep memory usage of a MultiIndexed frame with its unstacked form.

    The two frames hold the same number of value bytes; the stacked frame
    must report more deep memory than the unstacked one.
    """
    # GH 14308
    # memory usage introspection should not materialize .values

    def deep_total(obj):
        return obj.memory_usage(deep=True).sum()

    n_dates = 100
    n_ids = len(uppercase)
    index = MultiIndex.from_product(
        [list(uppercase), date_range("20160101", periods=n_dates)],
        names=["id", "date"],
    )
    stacked = DataFrame({"value": np.random.randn(n_dates * n_ids)}, index=index)
    unstacked = stacked.unstack("id")

    assert stacked.values.nbytes == unstacked.values.nbytes
    assert deep_total(stacked) > deep_total(unstacked)

    # high upper bound
    assert deep_total(unstacked) - deep_total(stacked) < 2000
def test_info_categorical():
    """Smoke test: ``info`` runs when both axes are the same ``CategoricalIndex``."""
    # GH14298
    labels = CategoricalIndex(["a", "b"])
    frame = DataFrame(np.zeros((2, 2)), index=labels, columns=labels)

    # Only completion of the call is checked; the text is discarded.
    frame.info(buf=StringIO())
@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
    """Check the full ``info`` report for a frame whose column labels are integers.

    The expected table rows keep ``info``'s column padding (two-space gutter,
    left-justified cells). The memory figure is a fixed literal, which is why
    the test is marked as an expected failure when ``IS64`` is false.
    """
    # GH#37245
    df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
    buf = StringIO()
    df.info(show_counts=True, buf=buf)
    result = buf.getvalue()
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 2 entries, A to B
        Data columns (total 2 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   1       2 non-null      int64
         1   2       2 non-null      int64
        dtypes: int64(2)
        memory usage: 48.0+ bytes
        """
    )
    assert result == expected
def test_memory_usage_empty_no_warning():
    """``memory_usage`` on a column-less frame emits no warning.

    The result must be a one-entry Series labelled "Index" holding 16 when
    ``IS64`` is true and 8 otherwise.
    """
    # GH#50066
    frame = DataFrame(index=["a", "b"])
    with tm.assert_produces_warning(None):
        usage = frame.memory_usage()

    index_bytes = 16 if IS64 else 8
    tm.assert_series_equal(usage, Series(index_bytes, index=["Index"]))
|