test_groupby_shift_diff.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. NaT,
  6. Series,
  7. Timedelta,
  8. Timestamp,
  9. )
  10. import pandas._testing as tm
  11. def test_group_shift_with_null_key():
  12. # This test is designed to replicate the segfault in issue #13813.
  13. n_rows = 1200
  14. # Generate a moderately large dataframe with occasional missing
  15. # values in column `B`, and then group by [`A`, `B`]. This should
  16. # force `-1` in `labels` array of `g.grouper.group_info` exactly
  17. # at those places, where the group-by key is partially missing.
  18. df = DataFrame(
  19. [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
  20. dtype=float,
  21. columns=["A", "B", "Z"],
  22. index=None,
  23. )
  24. g = df.groupby(["A", "B"])
  25. expected = DataFrame(
  26. [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
  27. dtype=float,
  28. columns=["Z"],
  29. index=None,
  30. )
  31. result = g.shift(-1)
  32. tm.assert_frame_equal(result, expected)
  33. def test_group_shift_with_fill_value():
  34. # GH #24128
  35. n_rows = 24
  36. df = DataFrame(
  37. [(i % 12, i % 3, i) for i in range(n_rows)],
  38. dtype=float,
  39. columns=["A", "B", "Z"],
  40. index=None,
  41. )
  42. g = df.groupby(["A", "B"])
  43. expected = DataFrame(
  44. [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
  45. dtype=float,
  46. columns=["Z"],
  47. index=None,
  48. )
  49. result = g.shift(-1, fill_value=0)
  50. tm.assert_frame_equal(result, expected)
  51. def test_group_shift_lose_timezone():
  52. # GH 30134
  53. now_dt = Timestamp.utcnow()
  54. df = DataFrame({"a": [1, 1], "date": now_dt})
  55. result = df.groupby("a").shift(0).iloc[0]
  56. expected = Series({"date": now_dt}, name=result.name)
  57. tm.assert_series_equal(result, expected)
  58. def test_group_diff_real_series(any_real_numpy_dtype):
  59. df = DataFrame(
  60. {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
  61. dtype=any_real_numpy_dtype,
  62. )
  63. result = df.groupby("a")["b"].diff()
  64. exp_dtype = "float"
  65. if any_real_numpy_dtype in ["int8", "int16", "float32"]:
  66. exp_dtype = "float32"
  67. expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
  68. tm.assert_series_equal(result, expected)
  69. def test_group_diff_real_frame(any_real_numpy_dtype):
  70. df = DataFrame(
  71. {
  72. "a": [1, 2, 3, 3, 2],
  73. "b": [1, 2, 3, 4, 5],
  74. "c": [1, 2, 3, 4, 6],
  75. },
  76. dtype=any_real_numpy_dtype,
  77. )
  78. result = df.groupby("a").diff()
  79. exp_dtype = "float"
  80. if any_real_numpy_dtype in ["int8", "int16", "float32"]:
  81. exp_dtype = "float32"
  82. expected = DataFrame(
  83. {
  84. "b": [np.nan, np.nan, np.nan, 1.0, 3.0],
  85. "c": [np.nan, np.nan, np.nan, 1.0, 4.0],
  86. },
  87. dtype=exp_dtype,
  88. )
  89. tm.assert_frame_equal(result, expected)
  90. @pytest.mark.parametrize(
  91. "data",
  92. [
  93. [
  94. Timestamp("2013-01-01"),
  95. Timestamp("2013-01-02"),
  96. Timestamp("2013-01-03"),
  97. ],
  98. [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
  99. ],
  100. )
  101. def test_group_diff_datetimelike(data):
  102. df = DataFrame({"a": [1, 2, 2], "b": data})
  103. result = df.groupby("a")["b"].diff()
  104. expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
  105. tm.assert_series_equal(result, expected)
  106. def test_group_diff_bool():
  107. df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
  108. result = df.groupby("a")["b"].diff()
  109. expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
  110. tm.assert_series_equal(result, expected)
  111. def test_group_diff_object_raises(object_dtype):
  112. df = DataFrame(
  113. {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
  114. )
  115. with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
  116. df.groupby("a")["b"].diff()
  117. def test_empty_shift_with_fill():
  118. # GH 41264, single-index check
  119. df = DataFrame(columns=["a", "b", "c"])
  120. shifted = df.groupby(["a"]).shift(1)
  121. shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
  122. tm.assert_frame_equal(shifted, shifted_with_fill)
  123. tm.assert_index_equal(shifted.index, shifted_with_fill.index)
  124. def test_multindex_empty_shift_with_fill():
  125. # GH 41264, multi-index check
  126. df = DataFrame(columns=["a", "b", "c"])
  127. shifted = df.groupby(["a", "b"]).shift(1)
  128. shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
  129. tm.assert_frame_equal(shifted, shifted_with_fill)
  130. tm.assert_index_equal(shifted.index, shifted_with_fill.index)