123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417 |
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- Categorical,
- DataFrame,
- Series,
- Timestamp,
- date_range,
- )
- import pandas._testing as tm
- class TestDataFrameDescribe:
- def test_describe_bool_in_mixed_frame(self):
- df = DataFrame(
- {
- "string_data": ["a", "b", "c", "d", "e"],
- "bool_data": [True, True, False, False, False],
- "int_data": [10, 20, 30, 40, 50],
- }
- )
- # Integer data are included in .describe() output,
- # Boolean and string data are not.
- result = df.describe()
- expected = DataFrame(
- {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
- index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
- )
- tm.assert_frame_equal(result, expected)
- # Top value is a boolean value that is False
- result = df.describe(include=["bool"])
- expected = DataFrame(
- {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
- )
- tm.assert_frame_equal(result, expected)
- def test_describe_empty_object(self):
- # GH#27183
- df = DataFrame({"A": [None, None]}, dtype=object)
- result = df.describe()
- expected = DataFrame(
- {"A": [0, 0, np.nan, np.nan]},
- dtype=object,
- index=["count", "unique", "top", "freq"],
- )
- tm.assert_frame_equal(result, expected)
- result = df.iloc[:0].describe()
- tm.assert_frame_equal(result, expected)
- def test_describe_bool_frame(self):
- # GH#13891
- df = DataFrame(
- {
- "bool_data_1": [False, False, True, True],
- "bool_data_2": [False, True, True, True],
- }
- )
- result = df.describe()
- expected = DataFrame(
- {"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
- index=["count", "unique", "top", "freq"],
- )
- tm.assert_frame_equal(result, expected)
- df = DataFrame(
- {
- "bool_data": [False, False, True, True, False],
- "int_data": [0, 1, 2, 3, 4],
- }
- )
- result = df.describe()
- expected = DataFrame(
- {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
- index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
- )
- tm.assert_frame_equal(result, expected)
- df = DataFrame(
- {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
- )
- result = df.describe()
- expected = DataFrame(
- {"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
- index=["count", "unique", "top", "freq"],
- )
- tm.assert_frame_equal(result, expected)
- def test_describe_categorical(self):
- df = DataFrame({"value": np.random.randint(0, 10000, 100)})
- labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
- cat_labels = Categorical(labels, labels)
- df = df.sort_values(by=["value"], ascending=True)
- df["value_group"] = pd.cut(
- df.value, range(0, 10500, 500), right=False, labels=cat_labels
- )
- cat = df
- # Categoricals should not show up together with numerical columns
- result = cat.describe()
- assert len(result.columns) == 1
- # In a frame, describe() for the cat should be the same as for string
- # arrays (count, unique, top, freq)
- cat = Categorical(
- ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
- )
- s = Series(cat)
- result = s.describe()
- expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
- tm.assert_series_equal(result, expected)
- cat = Series(Categorical(["a", "b", "c", "c"]))
- df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
- result = df3.describe()
- tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
- def test_describe_empty_categorical_column(self):
- # GH#26397
- # Ensure the index of an empty categorical DataFrame column
- # also contains (count, unique, top, freq)
- df = DataFrame({"empty_col": Categorical([])})
- result = df.describe()
- expected = DataFrame(
- {"empty_col": [0, 0, np.nan, np.nan]},
- index=["count", "unique", "top", "freq"],
- dtype="object",
- )
- tm.assert_frame_equal(result, expected)
- # ensure NaN, not None
- assert np.isnan(result.iloc[2, 0])
- assert np.isnan(result.iloc[3, 0])
- def test_describe_categorical_columns(self):
- # GH#11558
- columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
- df = DataFrame(
- {
- "int1": [10, 20, 30, 40, 50],
- "int2": [10, 20, 30, 40, 50],
- "obj": ["A", 0, None, "X", 1],
- },
- columns=columns,
- )
- result = df.describe()
- exp_columns = pd.CategoricalIndex(
- ["int1", "int2"],
- categories=["int1", "int2", "obj"],
- ordered=True,
- name="XXX",
- )
- expected = DataFrame(
- {
- "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
- "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
- },
- index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
- columns=exp_columns,
- )
- tm.assert_frame_equal(result, expected)
- tm.assert_categorical_equal(result.columns.values, expected.columns.values)
- def test_describe_datetime_columns(self):
- columns = pd.DatetimeIndex(
- ["2011-01-01", "2011-02-01", "2011-03-01"],
- freq="MS",
- tz="US/Eastern",
- name="XXX",
- )
- df = DataFrame(
- {
- 0: [10, 20, 30, 40, 50],
- 1: [10, 20, 30, 40, 50],
- 2: ["A", 0, None, "X", 1],
- }
- )
- df.columns = columns
- result = df.describe()
- exp_columns = pd.DatetimeIndex(
- ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
- )
- expected = DataFrame(
- {
- 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
- 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
- },
- index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
- )
- expected.columns = exp_columns
- tm.assert_frame_equal(result, expected)
- assert result.columns.freq == "MS"
- assert result.columns.tz == expected.columns.tz
- def test_describe_timedelta_values(self):
- # GH#6145
- t1 = pd.timedelta_range("1 days", freq="D", periods=5)
- t2 = pd.timedelta_range("1 hours", freq="H", periods=5)
- df = DataFrame({"t1": t1, "t2": t2})
- expected = DataFrame(
- {
- "t1": [
- 5,
- pd.Timedelta("3 days"),
- df.iloc[:, 0].std(),
- pd.Timedelta("1 days"),
- pd.Timedelta("2 days"),
- pd.Timedelta("3 days"),
- pd.Timedelta("4 days"),
- pd.Timedelta("5 days"),
- ],
- "t2": [
- 5,
- pd.Timedelta("3 hours"),
- df.iloc[:, 1].std(),
- pd.Timedelta("1 hours"),
- pd.Timedelta("2 hours"),
- pd.Timedelta("3 hours"),
- pd.Timedelta("4 hours"),
- pd.Timedelta("5 hours"),
- ],
- },
- index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
- )
- result = df.describe()
- tm.assert_frame_equal(result, expected)
- exp_repr = (
- " t1 t2\n"
- "count 5 5\n"
- "mean 3 days 00:00:00 0 days 03:00:00\n"
- "std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n"
- "min 1 days 00:00:00 0 days 01:00:00\n"
- "25% 2 days 00:00:00 0 days 02:00:00\n"
- "50% 3 days 00:00:00 0 days 03:00:00\n"
- "75% 4 days 00:00:00 0 days 04:00:00\n"
- "max 5 days 00:00:00 0 days 05:00:00"
- )
- assert repr(result) == exp_repr
- def test_describe_tz_values(self, tz_naive_fixture):
- # GH#21332
- tz = tz_naive_fixture
- s1 = Series(range(5))
- start = Timestamp(2018, 1, 1)
- end = Timestamp(2018, 1, 5)
- s2 = Series(date_range(start, end, tz=tz))
- df = DataFrame({"s1": s1, "s2": s2})
- expected = DataFrame(
- {
- "s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
- "s2": [
- 5,
- Timestamp(2018, 1, 3).tz_localize(tz),
- start.tz_localize(tz),
- s2[1],
- s2[2],
- s2[3],
- end.tz_localize(tz),
- np.nan,
- ],
- },
- index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
- )
- result = df.describe(include="all")
- tm.assert_frame_equal(result, expected)
- def test_datetime_is_numeric_includes_datetime(self):
- df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
- result = df.describe()
- expected = DataFrame(
- {
- "a": [
- 3,
- Timestamp("2012-01-02"),
- Timestamp("2012-01-01"),
- Timestamp("2012-01-01T12:00:00"),
- Timestamp("2012-01-02"),
- Timestamp("2012-01-02T12:00:00"),
- Timestamp("2012-01-03"),
- np.nan,
- ],
- "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
- },
- index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
- )
- tm.assert_frame_equal(result, expected)
- def test_describe_tz_values2(self):
- tz = "CET"
- s1 = Series(range(5))
- start = Timestamp(2018, 1, 1)
- end = Timestamp(2018, 1, 5)
- s2 = Series(date_range(start, end, tz=tz))
- df = DataFrame({"s1": s1, "s2": s2})
- s1_ = s1.describe()
- s2_ = s2.describe()
- idx = [
- "count",
- "mean",
- "min",
- "25%",
- "50%",
- "75%",
- "max",
- "std",
- ]
- expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
- idx, copy=False
- )
- result = df.describe(include="all")
- tm.assert_frame_equal(result, expected)
- def test_describe_percentiles_integer_idx(self):
- # GH#26660
- df = DataFrame({"x": [1]})
- pct = np.linspace(0, 1, 10 + 1)
- result = df.describe(percentiles=pct)
- expected = DataFrame(
- {"x": [1.0, 1.0, np.NaN, 1.0, *(1.0 for _ in pct), 1.0]},
- index=[
- "count",
- "mean",
- "std",
- "min",
- "0%",
- "10%",
- "20%",
- "30%",
- "40%",
- "50%",
- "60%",
- "70%",
- "80%",
- "90%",
- "100%",
- "max",
- ],
- )
- tm.assert_frame_equal(result, expected)
- def test_describe_does_not_raise_error_for_dictlike_elements(self):
- # GH#32409
- df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
- expected = DataFrame(
- {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
- )
- result = df.describe()
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
- def test_describe_when_include_all_exclude_not_allowed(self, exclude):
- """
- When include is 'all', then setting exclude != None is not allowed.
- """
- df = DataFrame({"x": [1], "y": [2], "z": [3]})
- msg = "exclude must be None when include is 'all'"
- with pytest.raises(ValueError, match=msg):
- df.describe(include="all", exclude=exclude)
- def test_describe_with_duplicate_columns(self):
- df = DataFrame(
- [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
- columns=["bar", "a", "a"],
- dtype="float64",
- )
- result = df.describe()
- ser = df.iloc[:, 0].describe()
- expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
- tm.assert_frame_equal(result, expected)
- def test_ea_with_na(self, any_numeric_ea_dtype):
- # GH#48778
- df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype)
- result = df.describe()
- expected = DataFrame(
- {"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7},
- index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
- dtype="Float64",
- )
- tm.assert_frame_equal(result, expected)
- def test_describe_exclude_pa_dtype(self):
- # GH#52570
- pa = pytest.importorskip("pyarrow")
- df = DataFrame(
- {
- "a": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())),
- "b": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())),
- "c": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())),
- }
- )
- result = df.describe(
- include=pd.ArrowDtype(pa.int8()), exclude=pd.ArrowDtype(pa.int32())
- )
- expected = DataFrame(
- {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]},
- index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
- dtype=pd.ArrowDtype(pa.float64()),
- )
- tm.assert_frame_equal(result, expected)
|