test_libgroupby.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. import numpy as np
  2. import pytest
  3. from pandas._libs import groupby as libgroupby
  4. from pandas._libs.groupby import (
  5. group_cumprod,
  6. group_cumsum,
  7. group_mean,
  8. group_var,
  9. )
  10. from pandas.core.dtypes.common import ensure_platform_int
  11. from pandas import isna
  12. import pandas._testing as tm
  13. class GroupVarTestMixin:
  14. def test_group_var_generic_1d(self):
  15. prng = np.random.RandomState(1234)
  16. out = (np.nan * np.ones((5, 1))).astype(self.dtype)
  17. counts = np.zeros(5, dtype="int64")
  18. values = 10 * prng.rand(15, 1).astype(self.dtype)
  19. labels = np.tile(np.arange(5), (3,)).astype("intp")
  20. expected_out = (
  21. np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
  22. )[:, np.newaxis]
  23. expected_counts = counts + 3
  24. self.algo(out, counts, values, labels)
  25. assert np.allclose(out, expected_out, self.rtol)
  26. tm.assert_numpy_array_equal(counts, expected_counts)
  27. def test_group_var_generic_1d_flat_labels(self):
  28. prng = np.random.RandomState(1234)
  29. out = (np.nan * np.ones((1, 1))).astype(self.dtype)
  30. counts = np.zeros(1, dtype="int64")
  31. values = 10 * prng.rand(5, 1).astype(self.dtype)
  32. labels = np.zeros(5, dtype="intp")
  33. expected_out = np.array([[values.std(ddof=1) ** 2]])
  34. expected_counts = counts + 5
  35. self.algo(out, counts, values, labels)
  36. assert np.allclose(out, expected_out, self.rtol)
  37. tm.assert_numpy_array_equal(counts, expected_counts)
  38. def test_group_var_generic_2d_all_finite(self):
  39. prng = np.random.RandomState(1234)
  40. out = (np.nan * np.ones((5, 2))).astype(self.dtype)
  41. counts = np.zeros(5, dtype="int64")
  42. values = 10 * prng.rand(10, 2).astype(self.dtype)
  43. labels = np.tile(np.arange(5), (2,)).astype("intp")
  44. expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
  45. expected_counts = counts + 2
  46. self.algo(out, counts, values, labels)
  47. assert np.allclose(out, expected_out, self.rtol)
  48. tm.assert_numpy_array_equal(counts, expected_counts)
  49. def test_group_var_generic_2d_some_nan(self):
  50. prng = np.random.RandomState(1234)
  51. out = (np.nan * np.ones((5, 2))).astype(self.dtype)
  52. counts = np.zeros(5, dtype="int64")
  53. values = 10 * prng.rand(10, 2).astype(self.dtype)
  54. values[:, 1] = np.nan
  55. labels = np.tile(np.arange(5), (2,)).astype("intp")
  56. expected_out = np.vstack(
  57. [
  58. values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
  59. np.nan * np.ones(5),
  60. ]
  61. ).T.astype(self.dtype)
  62. expected_counts = counts + 2
  63. self.algo(out, counts, values, labels)
  64. tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
  65. tm.assert_numpy_array_equal(counts, expected_counts)
  66. def test_group_var_constant(self):
  67. # Regression test from GH 10448.
  68. out = np.array([[np.nan]], dtype=self.dtype)
  69. counts = np.array([0], dtype="int64")
  70. values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
  71. labels = np.zeros(3, dtype="intp")
  72. self.algo(out, counts, values, labels)
  73. assert counts[0] == 3
  74. assert out[0, 0] >= 0
  75. tm.assert_almost_equal(out[0, 0], 0.0)
  76. class TestGroupVarFloat64(GroupVarTestMixin):
  77. __test__ = True
  78. algo = staticmethod(group_var)
  79. dtype = np.float64
  80. rtol = 1e-5
  81. def test_group_var_large_inputs(self):
  82. prng = np.random.RandomState(1234)
  83. out = np.array([[np.nan]], dtype=self.dtype)
  84. counts = np.array([0], dtype="int64")
  85. values = (prng.rand(10**6) + 10**12).astype(self.dtype)
  86. values.shape = (10**6, 1)
  87. labels = np.zeros(10**6, dtype="intp")
  88. self.algo(out, counts, values, labels)
  89. assert counts[0] == 10**6
  90. tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
  91. class TestGroupVarFloat32(GroupVarTestMixin):
  92. __test__ = True
  93. algo = staticmethod(group_var)
  94. dtype = np.float32
  95. rtol = 1e-2
  96. @pytest.mark.parametrize("dtype", ["float32", "float64"])
  97. def test_group_ohlc(dtype):
  98. obj = np.array(np.random.randn(20), dtype=dtype)
  99. bins = np.array([6, 12, 20])
  100. out = np.zeros((3, 4), dtype)
  101. counts = np.zeros(len(out), dtype=np.int64)
  102. labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))
  103. func = libgroupby.group_ohlc
  104. func(out, counts, obj[:, None], labels)
  105. def _ohlc(group):
  106. if isna(group).all():
  107. return np.repeat(np.nan, 4)
  108. return [group[0], group.max(), group.min(), group[-1]]
  109. expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])
  110. tm.assert_almost_equal(out, expected)
  111. tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))
  112. obj[:6] = np.nan
  113. func(out, counts, obj[:, None], labels)
  114. expected[0] = np.nan
  115. tm.assert_almost_equal(out, expected)
  116. def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
  117. """
  118. Check a group transform that executes a cumulative function.
  119. Parameters
  120. ----------
  121. pd_op : callable
  122. The pandas cumulative function.
  123. np_op : callable
  124. The analogous one in NumPy.
  125. dtype : type
  126. The specified dtype of the data.
  127. """
  128. is_datetimelike = False
  129. data = np.array([[1], [2], [3], [4]], dtype=dtype)
  130. answer = np.zeros_like(data)
  131. labels = np.array([0, 0, 0, 0], dtype=np.intp)
  132. ngroups = 1
  133. pd_op(answer, data, labels, ngroups, is_datetimelike)
  134. tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
  135. @pytest.mark.parametrize("np_dtype", ["int64", "uint64", "float32", "float64"])
  136. def test_cython_group_transform_cumsum(np_dtype):
  137. # see gh-4095
  138. dtype = np.dtype(np_dtype).type
  139. pd_op, np_op = group_cumsum, np.cumsum
  140. _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
  141. def test_cython_group_transform_cumprod():
  142. # see gh-4095
  143. dtype = np.float64
  144. pd_op, np_op = group_cumprod, np.cumprod
  145. _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
  146. def test_cython_group_transform_algos():
  147. # see gh-4095
  148. is_datetimelike = False
  149. # with nans
  150. labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
  151. ngroups = 1
  152. data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
  153. actual = np.zeros_like(data)
  154. actual.fill(np.nan)
  155. group_cumprod(actual, data, labels, ngroups, is_datetimelike)
  156. expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
  157. tm.assert_numpy_array_equal(actual[:, 0], expected)
  158. actual = np.zeros_like(data)
  159. actual.fill(np.nan)
  160. group_cumsum(actual, data, labels, ngroups, is_datetimelike)
  161. expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
  162. tm.assert_numpy_array_equal(actual[:, 0], expected)
  163. # timedelta
  164. is_datetimelike = True
  165. data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
  166. actual = np.zeros_like(data, dtype="int64")
  167. group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
  168. expected = np.array(
  169. [
  170. np.timedelta64(1, "ns"),
  171. np.timedelta64(2, "ns"),
  172. np.timedelta64(3, "ns"),
  173. np.timedelta64(4, "ns"),
  174. np.timedelta64(5, "ns"),
  175. ]
  176. )
  177. tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
  178. def test_cython_group_mean_datetimelike():
  179. actual = np.zeros(shape=(1, 1), dtype="float64")
  180. counts = np.array([0], dtype="int64")
  181. data = (
  182. np.array(
  183. [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
  184. dtype="m8[ns]",
  185. )[:, None]
  186. .view("int64")
  187. .astype("float64")
  188. )
  189. labels = np.zeros(len(data), dtype=np.intp)
  190. group_mean(actual, counts, data, labels, is_datetimelike=True)
  191. tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
  192. def test_cython_group_mean_wrong_min_count():
  193. actual = np.zeros(shape=(1, 1), dtype="float64")
  194. counts = np.zeros(1, dtype="int64")
  195. data = np.zeros(1, dtype="float64")[:, None]
  196. labels = np.zeros(1, dtype=np.intp)
  197. with pytest.raises(AssertionError, match="min_count"):
  198. group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
  199. def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
  200. actual = np.zeros(shape=(1, 1), dtype="float64")
  201. counts = np.array([0], dtype="int64")
  202. data = (
  203. np.array(
  204. [np.timedelta64("NaT"), np.timedelta64("NaT")],
  205. dtype="m8[ns]",
  206. )[:, None]
  207. .view("int64")
  208. .astype("float64")
  209. )
  210. labels = np.zeros(len(data), dtype=np.intp)
  211. group_mean(actual, counts, data, labels, is_datetimelike=False)
  212. tm.assert_numpy_array_equal(
  213. actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
  214. )