test_diff.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Series,
  7. Timestamp,
  8. date_range,
  9. )
  10. import pandas._testing as tm
  11. class TestDataFrameDiff:
  12. def test_diff_requires_integer(self):
  13. df = DataFrame(np.random.randn(2, 2))
  14. with pytest.raises(ValueError, match="periods must be an integer"):
  15. df.diff(1.5)
  16. # GH#44572 np.int64 is accepted
  17. @pytest.mark.parametrize("num", [1, np.int64(1)])
  18. def test_diff(self, datetime_frame, num):
  19. df = datetime_frame
  20. the_diff = df.diff(num)
  21. expected = df["A"] - df["A"].shift(num)
  22. tm.assert_series_equal(the_diff["A"], expected)
  23. def test_diff_int_dtype(self):
  24. # int dtype
  25. a = 10_000_000_000_000_000
  26. b = a + 1
  27. ser = Series([a, b])
  28. rs = DataFrame({"s": ser}).diff()
  29. assert rs.s[1] == 1
  30. def test_diff_mixed_numeric(self, datetime_frame):
  31. # mixed numeric
  32. tf = datetime_frame.astype("float32")
  33. the_diff = tf.diff(1)
  34. tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1))
  35. def test_diff_axis1_nonconsolidated(self):
  36. # GH#10907
  37. df = DataFrame({"y": Series([2]), "z": Series([3])})
  38. df.insert(0, "x", 1)
  39. result = df.diff(axis=1)
  40. expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)})
  41. tm.assert_frame_equal(result, expected)
  42. def test_diff_timedelta64_with_nat(self):
  43. # GH#32441
  44. arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
  45. arr[:, 0] = np.timedelta64("NaT", "ns")
  46. df = DataFrame(arr)
  47. result = df.diff(1, axis=0)
  48. expected = DataFrame({0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]})
  49. tm.assert_equal(result, expected)
  50. result = df.diff(0)
  51. expected = df - df
  52. assert expected[0].isna().all()
  53. tm.assert_equal(result, expected)
  54. result = df.diff(-1, axis=1)
  55. expected = df * np.nan
  56. tm.assert_equal(result, expected)
  57. @pytest.mark.parametrize("tz", [None, "UTC"])
  58. def test_diff_datetime_axis0_with_nat(self, tz):
  59. # GH#32441
  60. dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz)
  61. ser = Series(dti)
  62. df = ser.to_frame()
  63. result = df.diff()
  64. ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)])
  65. expected = Series(ex_index).to_frame()
  66. tm.assert_frame_equal(result, expected)
  67. @pytest.mark.parametrize("tz", [None, "UTC"])
  68. def test_diff_datetime_with_nat_zero_periods(self, tz):
  69. # diff on NaT values should give NaT, not timedelta64(0)
  70. dti = date_range("2016-01-01", periods=4, tz=tz)
  71. ser = Series(dti)
  72. df = ser.to_frame()
  73. df[1] = ser.copy()
  74. df.iloc[:, 0] = pd.NaT
  75. expected = df - df
  76. assert expected[0].isna().all()
  77. result = df.diff(0, axis=0)
  78. tm.assert_frame_equal(result, expected)
  79. result = df.diff(0, axis=1)
  80. tm.assert_frame_equal(result, expected)
  81. @pytest.mark.parametrize("tz", [None, "UTC"])
  82. def test_diff_datetime_axis0(self, tz):
  83. # GH#18578
  84. df = DataFrame(
  85. {
  86. 0: date_range("2010", freq="D", periods=2, tz=tz),
  87. 1: date_range("2010", freq="D", periods=2, tz=tz),
  88. }
  89. )
  90. result = df.diff(axis=0)
  91. expected = DataFrame(
  92. {
  93. 0: pd.TimedeltaIndex(["NaT", "1 days"]),
  94. 1: pd.TimedeltaIndex(["NaT", "1 days"]),
  95. }
  96. )
  97. tm.assert_frame_equal(result, expected)
  98. @pytest.mark.parametrize("tz", [None, "UTC"])
  99. def test_diff_datetime_axis1(self, tz):
  100. # GH#18578
  101. df = DataFrame(
  102. {
  103. 0: date_range("2010", freq="D", periods=2, tz=tz),
  104. 1: date_range("2010", freq="D", periods=2, tz=tz),
  105. }
  106. )
  107. result = df.diff(axis=1)
  108. expected = DataFrame(
  109. {
  110. 0: pd.TimedeltaIndex(["NaT", "NaT"]),
  111. 1: pd.TimedeltaIndex(["0 days", "0 days"]),
  112. }
  113. )
  114. tm.assert_frame_equal(result, expected)
  115. def test_diff_timedelta(self):
  116. # GH#4533
  117. df = DataFrame(
  118. {
  119. "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
  120. "value": [1.0, 2.0],
  121. }
  122. )
  123. res = df.diff()
  124. exp = DataFrame(
  125. [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]
  126. )
  127. tm.assert_frame_equal(res, exp)
  128. def test_diff_mixed_dtype(self):
  129. df = DataFrame(np.random.randn(5, 3))
  130. df["A"] = np.array([1, 2, 3, 4, 5], dtype=object)
  131. result = df.diff()
  132. assert result[0].dtype == np.float64
  133. def test_diff_neg_n(self, datetime_frame):
  134. rs = datetime_frame.diff(-1)
  135. xp = datetime_frame - datetime_frame.shift(-1)
  136. tm.assert_frame_equal(rs, xp)
  137. def test_diff_float_n(self, datetime_frame):
  138. rs = datetime_frame.diff(1.0)
  139. xp = datetime_frame.diff(1)
  140. tm.assert_frame_equal(rs, xp)
  141. def test_diff_axis(self):
  142. # GH#9727
  143. df = DataFrame([[1.0, 2.0], [3.0, 4.0]])
  144. tm.assert_frame_equal(
  145. df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]])
  146. )
  147. tm.assert_frame_equal(
  148. df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])
  149. )
  150. def test_diff_period(self):
  151. # GH#32995 Don't pass an incorrect axis
  152. pi = date_range("2016-01-01", periods=3).to_period("D")
  153. df = DataFrame({"A": pi})
  154. result = df.diff(1, axis=1)
  155. expected = (df - pd.NaT).astype(object)
  156. tm.assert_frame_equal(result, expected)
  157. def test_diff_axis1_mixed_dtypes(self):
  158. # GH#32995 operate column-wise when we have mixed dtypes and axis=1
  159. df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
  160. expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2})
  161. result = df.diff(axis=1)
  162. tm.assert_frame_equal(result, expected)
  163. # GH#21437 mixed-float-dtypes
  164. df = DataFrame(
  165. {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
  166. )
  167. result = df.diff(axis=1)
  168. expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
  169. tm.assert_frame_equal(result, expected)
  170. def test_diff_axis1_mixed_dtypes_large_periods(self):
  171. # GH#32995 operate column-wise when we have mixed dtypes and axis=1
  172. df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
  173. expected = df * np.nan
  174. result = df.diff(axis=1, periods=3)
  175. tm.assert_frame_equal(result, expected)
  176. def test_diff_axis1_mixed_dtypes_negative_periods(self):
  177. # GH#32995 operate column-wise when we have mixed dtypes and axis=1
  178. df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
  179. expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan})
  180. result = df.diff(axis=1, periods=-1)
  181. tm.assert_frame_equal(result, expected)
  182. def test_diff_sparse(self):
  183. # GH#28813 .diff() should work for sparse dataframes as well
  184. sparse_df = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]")
  185. result = sparse_df.diff()
  186. expected = DataFrame(
  187. [[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0)
  188. )
  189. tm.assert_frame_equal(result, expected)
  190. @pytest.mark.parametrize(
  191. "axis,expected",
  192. [
  193. (
  194. 0,
  195. DataFrame(
  196. {
  197. "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0],
  198. "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan],
  199. "c": np.repeat(np.nan, 8),
  200. "d": [np.nan, 3, 5, 7, 9, 11, 13, 15],
  201. },
  202. dtype="Int64",
  203. ),
  204. ),
  205. (
  206. 1,
  207. DataFrame(
  208. {
  209. "a": np.repeat(np.nan, 8),
  210. "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0],
  211. "c": np.repeat(np.nan, 8),
  212. "d": np.repeat(np.nan, 8),
  213. },
  214. dtype="Int64",
  215. ),
  216. ),
  217. ],
  218. )
  219. def test_diff_integer_na(self, axis, expected):
  220. # GH#24171 IntegerNA Support for DataFrame.diff()
  221. df = DataFrame(
  222. {
  223. "a": np.repeat([0, 1, np.nan, 2], 2),
  224. "b": np.tile([0, 1, np.nan, 2], 2),
  225. "c": np.repeat(np.nan, 8),
  226. "d": np.arange(1, 9) ** 2,
  227. },
  228. dtype="Int64",
  229. )
  230. # Test case for default behaviour of diff
  231. result = df.diff(axis=axis)
  232. tm.assert_frame_equal(result, expected)
  233. def test_diff_readonly(self):
  234. # https://github.com/pandas-dev/pandas/issues/35559
  235. arr = np.random.randn(5, 2)
  236. arr.flags.writeable = False
  237. df = DataFrame(arr)
  238. result = df.diff()
  239. expected = DataFrame(np.array(df)).diff()
  240. tm.assert_frame_equal(result, expected)
  241. def test_diff_all_int_dtype(self, any_int_numpy_dtype):
  242. # GH 14773
  243. df = DataFrame(range(5))
  244. df = df.astype(any_int_numpy_dtype)
  245. result = df.diff()
  246. expected_dtype = (
  247. "float32" if any_int_numpy_dtype in ("int8", "int16") else "float64"
  248. )
  249. expected = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=expected_dtype)
  250. tm.assert_frame_equal(result, expected)