test_describe.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. import numpy as np
  2. import pytest
  3. from pandas.compat.numpy import np_version_gte1p25
  4. from pandas.core.dtypes.common import (
  5. is_complex_dtype,
  6. is_extension_array_dtype,
  7. )
  8. from pandas import (
  9. NA,
  10. Period,
  11. Series,
  12. Timedelta,
  13. Timestamp,
  14. date_range,
  15. )
  16. import pandas._testing as tm
  17. class TestSeriesDescribe:
  18. def test_describe_ints(self):
  19. ser = Series([0, 1, 2, 3, 4], name="int_data")
  20. result = ser.describe()
  21. expected = Series(
  22. [5, 2, ser.std(), 0, 1, 2, 3, 4],
  23. name="int_data",
  24. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  25. )
  26. tm.assert_series_equal(result, expected)
  27. def test_describe_bools(self):
  28. ser = Series([True, True, False, False, False], name="bool_data")
  29. result = ser.describe()
  30. expected = Series(
  31. [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"]
  32. )
  33. tm.assert_series_equal(result, expected)
  34. def test_describe_strs(self):
  35. ser = Series(["a", "a", "b", "c", "d"], name="str_data")
  36. result = ser.describe()
  37. expected = Series(
  38. [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"]
  39. )
  40. tm.assert_series_equal(result, expected)
  41. def test_describe_timedelta64(self):
  42. ser = Series(
  43. [
  44. Timedelta("1 days"),
  45. Timedelta("2 days"),
  46. Timedelta("3 days"),
  47. Timedelta("4 days"),
  48. Timedelta("5 days"),
  49. ],
  50. name="timedelta_data",
  51. )
  52. result = ser.describe()
  53. expected = Series(
  54. [5, ser[2], ser.std(), ser[0], ser[1], ser[2], ser[3], ser[4]],
  55. name="timedelta_data",
  56. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  57. )
  58. tm.assert_series_equal(result, expected)
  59. def test_describe_period(self):
  60. ser = Series(
  61. [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")],
  62. name="period_data",
  63. )
  64. result = ser.describe()
  65. expected = Series(
  66. [3, 2, ser[0], 2],
  67. name="period_data",
  68. index=["count", "unique", "top", "freq"],
  69. )
  70. tm.assert_series_equal(result, expected)
  71. def test_describe_empty_object(self):
  72. # https://github.com/pandas-dev/pandas/issues/27183
  73. s = Series([None, None], dtype=object)
  74. result = s.describe()
  75. expected = Series(
  76. [0, 0, np.nan, np.nan],
  77. dtype=object,
  78. index=["count", "unique", "top", "freq"],
  79. )
  80. tm.assert_series_equal(result, expected)
  81. result = s[:0].describe()
  82. tm.assert_series_equal(result, expected)
  83. # ensure NaN, not None
  84. assert np.isnan(result.iloc[2])
  85. assert np.isnan(result.iloc[3])
  86. def test_describe_with_tz(self, tz_naive_fixture):
  87. # GH 21332
  88. tz = tz_naive_fixture
  89. name = str(tz_naive_fixture)
  90. start = Timestamp(2018, 1, 1)
  91. end = Timestamp(2018, 1, 5)
  92. s = Series(date_range(start, end, tz=tz), name=name)
  93. result = s.describe()
  94. expected = Series(
  95. [
  96. 5,
  97. Timestamp(2018, 1, 3).tz_localize(tz),
  98. start.tz_localize(tz),
  99. s[1],
  100. s[2],
  101. s[3],
  102. end.tz_localize(tz),
  103. ],
  104. name=name,
  105. index=["count", "mean", "min", "25%", "50%", "75%", "max"],
  106. )
  107. tm.assert_series_equal(result, expected)
  108. def test_describe_with_tz_numeric(self):
  109. name = tz = "CET"
  110. start = Timestamp(2018, 1, 1)
  111. end = Timestamp(2018, 1, 5)
  112. s = Series(date_range(start, end, tz=tz), name=name)
  113. result = s.describe()
  114. expected = Series(
  115. [
  116. 5,
  117. Timestamp("2018-01-03 00:00:00", tz=tz),
  118. Timestamp("2018-01-01 00:00:00", tz=tz),
  119. Timestamp("2018-01-02 00:00:00", tz=tz),
  120. Timestamp("2018-01-03 00:00:00", tz=tz),
  121. Timestamp("2018-01-04 00:00:00", tz=tz),
  122. Timestamp("2018-01-05 00:00:00", tz=tz),
  123. ],
  124. name=name,
  125. index=["count", "mean", "min", "25%", "50%", "75%", "max"],
  126. )
  127. tm.assert_series_equal(result, expected)
  128. def test_datetime_is_numeric_includes_datetime(self):
  129. s = Series(date_range("2012", periods=3))
  130. result = s.describe()
  131. expected = Series(
  132. [
  133. 3,
  134. Timestamp("2012-01-02"),
  135. Timestamp("2012-01-01"),
  136. Timestamp("2012-01-01T12:00:00"),
  137. Timestamp("2012-01-02"),
  138. Timestamp("2012-01-02T12:00:00"),
  139. Timestamp("2012-01-03"),
  140. ],
  141. index=["count", "mean", "min", "25%", "50%", "75%", "max"],
  142. )
  143. tm.assert_series_equal(result, expected)
  144. def test_numeric_result_dtype(self, any_numeric_dtype):
  145. # GH#48340 - describe should always return float on non-complex numeric input
  146. if is_extension_array_dtype(any_numeric_dtype):
  147. dtype = "Float64"
  148. else:
  149. dtype = "complex128" if is_complex_dtype(any_numeric_dtype) else None
  150. ser = Series([0, 1], dtype=any_numeric_dtype)
  151. if dtype == "complex128" and np_version_gte1p25:
  152. with pytest.raises(
  153. TypeError, match=r"^a must be an array of real numbers$"
  154. ):
  155. ser.describe()
  156. return
  157. result = ser.describe()
  158. expected = Series(
  159. [
  160. 2.0,
  161. 0.5,
  162. ser.std(),
  163. 0,
  164. 0.25,
  165. 0.5,
  166. 0.75,
  167. 1.0,
  168. ],
  169. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  170. dtype=dtype,
  171. )
  172. tm.assert_series_equal(result, expected)
  173. def test_describe_one_element_ea(self):
  174. # GH#52515
  175. ser = Series([0.0], dtype="Float64")
  176. with tm.assert_produces_warning(None):
  177. result = ser.describe()
  178. expected = Series(
  179. [1, 0, NA, 0, 0, 0, 0, 0],
  180. dtype="Float64",
  181. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  182. )
  183. tm.assert_series_equal(result, expected)