# test_describe.py
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. Categorical,
  6. DataFrame,
  7. Series,
  8. Timestamp,
  9. date_range,
  10. )
  11. import pandas._testing as tm
  12. class TestDataFrameDescribe:
  13. def test_describe_bool_in_mixed_frame(self):
  14. df = DataFrame(
  15. {
  16. "string_data": ["a", "b", "c", "d", "e"],
  17. "bool_data": [True, True, False, False, False],
  18. "int_data": [10, 20, 30, 40, 50],
  19. }
  20. )
  21. # Integer data are included in .describe() output,
  22. # Boolean and string data are not.
  23. result = df.describe()
  24. expected = DataFrame(
  25. {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
  26. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  27. )
  28. tm.assert_frame_equal(result, expected)
  29. # Top value is a boolean value that is False
  30. result = df.describe(include=["bool"])
  31. expected = DataFrame(
  32. {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
  33. )
  34. tm.assert_frame_equal(result, expected)
  35. def test_describe_empty_object(self):
  36. # GH#27183
  37. df = DataFrame({"A": [None, None]}, dtype=object)
  38. result = df.describe()
  39. expected = DataFrame(
  40. {"A": [0, 0, np.nan, np.nan]},
  41. dtype=object,
  42. index=["count", "unique", "top", "freq"],
  43. )
  44. tm.assert_frame_equal(result, expected)
  45. result = df.iloc[:0].describe()
  46. tm.assert_frame_equal(result, expected)
  47. def test_describe_bool_frame(self):
  48. # GH#13891
  49. df = DataFrame(
  50. {
  51. "bool_data_1": [False, False, True, True],
  52. "bool_data_2": [False, True, True, True],
  53. }
  54. )
  55. result = df.describe()
  56. expected = DataFrame(
  57. {"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
  58. index=["count", "unique", "top", "freq"],
  59. )
  60. tm.assert_frame_equal(result, expected)
  61. df = DataFrame(
  62. {
  63. "bool_data": [False, False, True, True, False],
  64. "int_data": [0, 1, 2, 3, 4],
  65. }
  66. )
  67. result = df.describe()
  68. expected = DataFrame(
  69. {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
  70. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  71. )
  72. tm.assert_frame_equal(result, expected)
  73. df = DataFrame(
  74. {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
  75. )
  76. result = df.describe()
  77. expected = DataFrame(
  78. {"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
  79. index=["count", "unique", "top", "freq"],
  80. )
  81. tm.assert_frame_equal(result, expected)
  82. def test_describe_categorical(self):
  83. df = DataFrame({"value": np.random.randint(0, 10000, 100)})
  84. labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
  85. cat_labels = Categorical(labels, labels)
  86. df = df.sort_values(by=["value"], ascending=True)
  87. df["value_group"] = pd.cut(
  88. df.value, range(0, 10500, 500), right=False, labels=cat_labels
  89. )
  90. cat = df
  91. # Categoricals should not show up together with numerical columns
  92. result = cat.describe()
  93. assert len(result.columns) == 1
  94. # In a frame, describe() for the cat should be the same as for string
  95. # arrays (count, unique, top, freq)
  96. cat = Categorical(
  97. ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
  98. )
  99. s = Series(cat)
  100. result = s.describe()
  101. expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
  102. tm.assert_series_equal(result, expected)
  103. cat = Series(Categorical(["a", "b", "c", "c"]))
  104. df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
  105. result = df3.describe()
  106. tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
  107. def test_describe_empty_categorical_column(self):
  108. # GH#26397
  109. # Ensure the index of an empty categorical DataFrame column
  110. # also contains (count, unique, top, freq)
  111. df = DataFrame({"empty_col": Categorical([])})
  112. result = df.describe()
  113. expected = DataFrame(
  114. {"empty_col": [0, 0, np.nan, np.nan]},
  115. index=["count", "unique", "top", "freq"],
  116. dtype="object",
  117. )
  118. tm.assert_frame_equal(result, expected)
  119. # ensure NaN, not None
  120. assert np.isnan(result.iloc[2, 0])
  121. assert np.isnan(result.iloc[3, 0])
  122. def test_describe_categorical_columns(self):
  123. # GH#11558
  124. columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
  125. df = DataFrame(
  126. {
  127. "int1": [10, 20, 30, 40, 50],
  128. "int2": [10, 20, 30, 40, 50],
  129. "obj": ["A", 0, None, "X", 1],
  130. },
  131. columns=columns,
  132. )
  133. result = df.describe()
  134. exp_columns = pd.CategoricalIndex(
  135. ["int1", "int2"],
  136. categories=["int1", "int2", "obj"],
  137. ordered=True,
  138. name="XXX",
  139. )
  140. expected = DataFrame(
  141. {
  142. "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
  143. "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
  144. },
  145. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  146. columns=exp_columns,
  147. )
  148. tm.assert_frame_equal(result, expected)
  149. tm.assert_categorical_equal(result.columns.values, expected.columns.values)
  150. def test_describe_datetime_columns(self):
  151. columns = pd.DatetimeIndex(
  152. ["2011-01-01", "2011-02-01", "2011-03-01"],
  153. freq="MS",
  154. tz="US/Eastern",
  155. name="XXX",
  156. )
  157. df = DataFrame(
  158. {
  159. 0: [10, 20, 30, 40, 50],
  160. 1: [10, 20, 30, 40, 50],
  161. 2: ["A", 0, None, "X", 1],
  162. }
  163. )
  164. df.columns = columns
  165. result = df.describe()
  166. exp_columns = pd.DatetimeIndex(
  167. ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
  168. )
  169. expected = DataFrame(
  170. {
  171. 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
  172. 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
  173. },
  174. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  175. )
  176. expected.columns = exp_columns
  177. tm.assert_frame_equal(result, expected)
  178. assert result.columns.freq == "MS"
  179. assert result.columns.tz == expected.columns.tz
  180. def test_describe_timedelta_values(self):
  181. # GH#6145
  182. t1 = pd.timedelta_range("1 days", freq="D", periods=5)
  183. t2 = pd.timedelta_range("1 hours", freq="H", periods=5)
  184. df = DataFrame({"t1": t1, "t2": t2})
  185. expected = DataFrame(
  186. {
  187. "t1": [
  188. 5,
  189. pd.Timedelta("3 days"),
  190. df.iloc[:, 0].std(),
  191. pd.Timedelta("1 days"),
  192. pd.Timedelta("2 days"),
  193. pd.Timedelta("3 days"),
  194. pd.Timedelta("4 days"),
  195. pd.Timedelta("5 days"),
  196. ],
  197. "t2": [
  198. 5,
  199. pd.Timedelta("3 hours"),
  200. df.iloc[:, 1].std(),
  201. pd.Timedelta("1 hours"),
  202. pd.Timedelta("2 hours"),
  203. pd.Timedelta("3 hours"),
  204. pd.Timedelta("4 hours"),
  205. pd.Timedelta("5 hours"),
  206. ],
  207. },
  208. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  209. )
  210. result = df.describe()
  211. tm.assert_frame_equal(result, expected)
  212. exp_repr = (
  213. " t1 t2\n"
  214. "count 5 5\n"
  215. "mean 3 days 00:00:00 0 days 03:00:00\n"
  216. "std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n"
  217. "min 1 days 00:00:00 0 days 01:00:00\n"
  218. "25% 2 days 00:00:00 0 days 02:00:00\n"
  219. "50% 3 days 00:00:00 0 days 03:00:00\n"
  220. "75% 4 days 00:00:00 0 days 04:00:00\n"
  221. "max 5 days 00:00:00 0 days 05:00:00"
  222. )
  223. assert repr(result) == exp_repr
  224. def test_describe_tz_values(self, tz_naive_fixture):
  225. # GH#21332
  226. tz = tz_naive_fixture
  227. s1 = Series(range(5))
  228. start = Timestamp(2018, 1, 1)
  229. end = Timestamp(2018, 1, 5)
  230. s2 = Series(date_range(start, end, tz=tz))
  231. df = DataFrame({"s1": s1, "s2": s2})
  232. expected = DataFrame(
  233. {
  234. "s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
  235. "s2": [
  236. 5,
  237. Timestamp(2018, 1, 3).tz_localize(tz),
  238. start.tz_localize(tz),
  239. s2[1],
  240. s2[2],
  241. s2[3],
  242. end.tz_localize(tz),
  243. np.nan,
  244. ],
  245. },
  246. index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
  247. )
  248. result = df.describe(include="all")
  249. tm.assert_frame_equal(result, expected)
  250. def test_datetime_is_numeric_includes_datetime(self):
  251. df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
  252. result = df.describe()
  253. expected = DataFrame(
  254. {
  255. "a": [
  256. 3,
  257. Timestamp("2012-01-02"),
  258. Timestamp("2012-01-01"),
  259. Timestamp("2012-01-01T12:00:00"),
  260. Timestamp("2012-01-02"),
  261. Timestamp("2012-01-02T12:00:00"),
  262. Timestamp("2012-01-03"),
  263. np.nan,
  264. ],
  265. "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
  266. },
  267. index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
  268. )
  269. tm.assert_frame_equal(result, expected)
  270. def test_describe_tz_values2(self):
  271. tz = "CET"
  272. s1 = Series(range(5))
  273. start = Timestamp(2018, 1, 1)
  274. end = Timestamp(2018, 1, 5)
  275. s2 = Series(date_range(start, end, tz=tz))
  276. df = DataFrame({"s1": s1, "s2": s2})
  277. s1_ = s1.describe()
  278. s2_ = s2.describe()
  279. idx = [
  280. "count",
  281. "mean",
  282. "min",
  283. "25%",
  284. "50%",
  285. "75%",
  286. "max",
  287. "std",
  288. ]
  289. expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
  290. idx, copy=False
  291. )
  292. result = df.describe(include="all")
  293. tm.assert_frame_equal(result, expected)
  294. def test_describe_percentiles_integer_idx(self):
  295. # GH#26660
  296. df = DataFrame({"x": [1]})
  297. pct = np.linspace(0, 1, 10 + 1)
  298. result = df.describe(percentiles=pct)
  299. expected = DataFrame(
  300. {"x": [1.0, 1.0, np.NaN, 1.0, *(1.0 for _ in pct), 1.0]},
  301. index=[
  302. "count",
  303. "mean",
  304. "std",
  305. "min",
  306. "0%",
  307. "10%",
  308. "20%",
  309. "30%",
  310. "40%",
  311. "50%",
  312. "60%",
  313. "70%",
  314. "80%",
  315. "90%",
  316. "100%",
  317. "max",
  318. ],
  319. )
  320. tm.assert_frame_equal(result, expected)
  321. def test_describe_does_not_raise_error_for_dictlike_elements(self):
  322. # GH#32409
  323. df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
  324. expected = DataFrame(
  325. {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
  326. )
  327. result = df.describe()
  328. tm.assert_frame_equal(result, expected)
  329. @pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
  330. def test_describe_when_include_all_exclude_not_allowed(self, exclude):
  331. """
  332. When include is 'all', then setting exclude != None is not allowed.
  333. """
  334. df = DataFrame({"x": [1], "y": [2], "z": [3]})
  335. msg = "exclude must be None when include is 'all'"
  336. with pytest.raises(ValueError, match=msg):
  337. df.describe(include="all", exclude=exclude)
  338. def test_describe_with_duplicate_columns(self):
  339. df = DataFrame(
  340. [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
  341. columns=["bar", "a", "a"],
  342. dtype="float64",
  343. )
  344. result = df.describe()
  345. ser = df.iloc[:, 0].describe()
  346. expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
  347. tm.assert_frame_equal(result, expected)
  348. def test_ea_with_na(self, any_numeric_ea_dtype):
  349. # GH#48778
  350. df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype)
  351. result = df.describe()
  352. expected = DataFrame(
  353. {"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7},
  354. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  355. dtype="Float64",
  356. )
  357. tm.assert_frame_equal(result, expected)
  358. def test_describe_exclude_pa_dtype(self):
  359. # GH#52570
  360. pa = pytest.importorskip("pyarrow")
  361. df = DataFrame(
  362. {
  363. "a": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())),
  364. "b": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())),
  365. "c": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())),
  366. }
  367. )
  368. result = df.describe(
  369. include=pd.ArrowDtype(pa.int8()), exclude=pd.ArrowDtype(pa.int32())
  370. )
  371. expected = DataFrame(
  372. {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]},
  373. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  374. dtype=pd.ArrowDtype(pa.float64()),
  375. )
  376. tm.assert_frame_equal(result, expected)