test_dropna.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. import datetime
  2. import dateutil
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. Series,
  9. )
  10. import pandas._testing as tm
  11. class TestDataFrameMissingData:
  12. def test_dropEmptyRows(self, float_frame):
  13. N = len(float_frame.index)
  14. mat = np.random.randn(N)
  15. mat[:5] = np.nan
  16. frame = DataFrame({"foo": mat}, index=float_frame.index)
  17. original = Series(mat, index=float_frame.index, name="foo")
  18. expected = original.dropna()
  19. inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
  20. smaller_frame = frame.dropna(how="all")
  21. # check that original was preserved
  22. tm.assert_series_equal(frame["foo"], original)
  23. return_value = inplace_frame1.dropna(how="all", inplace=True)
  24. tm.assert_series_equal(smaller_frame["foo"], expected)
  25. tm.assert_series_equal(inplace_frame1["foo"], expected)
  26. assert return_value is None
  27. smaller_frame = frame.dropna(how="all", subset=["foo"])
  28. return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
  29. tm.assert_series_equal(smaller_frame["foo"], expected)
  30. tm.assert_series_equal(inplace_frame2["foo"], expected)
  31. assert return_value is None
  32. def test_dropIncompleteRows(self, float_frame):
  33. N = len(float_frame.index)
  34. mat = np.random.randn(N)
  35. mat[:5] = np.nan
  36. frame = DataFrame({"foo": mat}, index=float_frame.index)
  37. frame["bar"] = 5
  38. original = Series(mat, index=float_frame.index, name="foo")
  39. inp_frame1, inp_frame2 = frame.copy(), frame.copy()
  40. smaller_frame = frame.dropna()
  41. tm.assert_series_equal(frame["foo"], original)
  42. return_value = inp_frame1.dropna(inplace=True)
  43. exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
  44. tm.assert_series_equal(smaller_frame["foo"], exp)
  45. tm.assert_series_equal(inp_frame1["foo"], exp)
  46. assert return_value is None
  47. samesize_frame = frame.dropna(subset=["bar"])
  48. tm.assert_series_equal(frame["foo"], original)
  49. assert (frame["bar"] == 5).all()
  50. return_value = inp_frame2.dropna(subset=["bar"], inplace=True)
  51. tm.assert_index_equal(samesize_frame.index, float_frame.index)
  52. tm.assert_index_equal(inp_frame2.index, float_frame.index)
  53. assert return_value is None
  54. def test_dropna(self):
  55. df = DataFrame(np.random.randn(6, 4))
  56. df.iloc[:2, 2] = np.nan
  57. dropped = df.dropna(axis=1)
  58. expected = df.loc[:, [0, 1, 3]]
  59. inp = df.copy()
  60. return_value = inp.dropna(axis=1, inplace=True)
  61. tm.assert_frame_equal(dropped, expected)
  62. tm.assert_frame_equal(inp, expected)
  63. assert return_value is None
  64. dropped = df.dropna(axis=0)
  65. expected = df.loc[list(range(2, 6))]
  66. inp = df.copy()
  67. return_value = inp.dropna(axis=0, inplace=True)
  68. tm.assert_frame_equal(dropped, expected)
  69. tm.assert_frame_equal(inp, expected)
  70. assert return_value is None
  71. # threshold
  72. dropped = df.dropna(axis=1, thresh=5)
  73. expected = df.loc[:, [0, 1, 3]]
  74. inp = df.copy()
  75. return_value = inp.dropna(axis=1, thresh=5, inplace=True)
  76. tm.assert_frame_equal(dropped, expected)
  77. tm.assert_frame_equal(inp, expected)
  78. assert return_value is None
  79. dropped = df.dropna(axis=0, thresh=4)
  80. expected = df.loc[range(2, 6)]
  81. inp = df.copy()
  82. return_value = inp.dropna(axis=0, thresh=4, inplace=True)
  83. tm.assert_frame_equal(dropped, expected)
  84. tm.assert_frame_equal(inp, expected)
  85. assert return_value is None
  86. dropped = df.dropna(axis=1, thresh=4)
  87. tm.assert_frame_equal(dropped, df)
  88. dropped = df.dropna(axis=1, thresh=3)
  89. tm.assert_frame_equal(dropped, df)
  90. # subset
  91. dropped = df.dropna(axis=0, subset=[0, 1, 3])
  92. inp = df.copy()
  93. return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
  94. tm.assert_frame_equal(dropped, df)
  95. tm.assert_frame_equal(inp, df)
  96. assert return_value is None
  97. # all
  98. dropped = df.dropna(axis=1, how="all")
  99. tm.assert_frame_equal(dropped, df)
  100. df[2] = np.nan
  101. dropped = df.dropna(axis=1, how="all")
  102. expected = df.loc[:, [0, 1, 3]]
  103. tm.assert_frame_equal(dropped, expected)
  104. # bad input
  105. msg = "No axis named 3 for object type DataFrame"
  106. with pytest.raises(ValueError, match=msg):
  107. df.dropna(axis=3)
  108. def test_drop_and_dropna_caching(self):
  109. # tst that cacher updates
  110. original = Series([1, 2, np.nan], name="A")
  111. expected = Series([1, 2], dtype=original.dtype, name="A")
  112. df = DataFrame({"A": original.values.copy()})
  113. df2 = df.copy()
  114. df["A"].dropna()
  115. tm.assert_series_equal(df["A"], original)
  116. ser = df["A"]
  117. return_value = ser.dropna(inplace=True)
  118. tm.assert_series_equal(ser, expected)
  119. tm.assert_series_equal(df["A"], original)
  120. assert return_value is None
  121. df2["A"].drop([1])
  122. tm.assert_series_equal(df2["A"], original)
  123. ser = df2["A"]
  124. return_value = ser.drop([1], inplace=True)
  125. tm.assert_series_equal(ser, original.drop([1]))
  126. tm.assert_series_equal(df2["A"], original)
  127. assert return_value is None
  128. def test_dropna_corner(self, float_frame):
  129. # bad input
  130. msg = "invalid how option: foo"
  131. with pytest.raises(ValueError, match=msg):
  132. float_frame.dropna(how="foo")
  133. # non-existent column - 8303
  134. with pytest.raises(KeyError, match=r"^\['X'\]$"):
  135. float_frame.dropna(subset=["A", "X"])
  136. def test_dropna_multiple_axes(self):
  137. df = DataFrame(
  138. [
  139. [1, np.nan, 2, 3],
  140. [4, np.nan, 5, 6],
  141. [np.nan, np.nan, np.nan, np.nan],
  142. [7, np.nan, 8, 9],
  143. ]
  144. )
  145. # GH20987
  146. with pytest.raises(TypeError, match="supplying multiple axes"):
  147. df.dropna(how="all", axis=[0, 1])
  148. with pytest.raises(TypeError, match="supplying multiple axes"):
  149. df.dropna(how="all", axis=(0, 1))
  150. inp = df.copy()
  151. with pytest.raises(TypeError, match="supplying multiple axes"):
  152. inp.dropna(how="all", axis=(0, 1), inplace=True)
  153. def test_dropna_tz_aware_datetime(self):
  154. # GH13407
  155. df = DataFrame()
  156. dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
  157. dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
  158. df["Time"] = [dt1]
  159. result = df.dropna(axis=0)
  160. expected = DataFrame({"Time": [dt1]})
  161. tm.assert_frame_equal(result, expected)
  162. # Ex2
  163. df = DataFrame({"Time": [dt1, None, np.nan, dt2]})
  164. result = df.dropna(axis=0)
  165. expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3])
  166. tm.assert_frame_equal(result, expected)
  167. def test_dropna_categorical_interval_index(self):
  168. # GH 25087
  169. ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28])
  170. ci = pd.CategoricalIndex(ii)
  171. df = DataFrame({"A": list("abc")}, index=ci)
  172. expected = df
  173. result = df.dropna()
  174. tm.assert_frame_equal(result, expected)
  175. def test_dropna_with_duplicate_columns(self):
  176. df = DataFrame(
  177. {
  178. "A": np.random.randn(5),
  179. "B": np.random.randn(5),
  180. "C": np.random.randn(5),
  181. "D": ["a", "b", "c", "d", "e"],
  182. }
  183. )
  184. df.iloc[2, [0, 1, 2]] = np.nan
  185. df.iloc[0, 0] = np.nan
  186. df.iloc[1, 1] = np.nan
  187. df.iloc[:, 3] = np.nan
  188. expected = df.dropna(subset=["A", "B", "C"], how="all")
  189. expected.columns = ["A", "A", "B", "C"]
  190. df.columns = ["A", "A", "B", "C"]
  191. result = df.dropna(subset=["A", "C"], how="all")
  192. tm.assert_frame_equal(result, expected)
  193. def test_set_single_column_subset(self):
  194. # GH 41021
  195. df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.NaN, 5]})
  196. expected = DataFrame(
  197. {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2]
  198. )
  199. result = df.dropna(subset="C")
  200. tm.assert_frame_equal(result, expected)
  201. def test_single_column_not_present_in_axis(self):
  202. # GH 41021
  203. df = DataFrame({"A": [1, 2, 3]})
  204. # Column not present
  205. with pytest.raises(KeyError, match="['D']"):
  206. df.dropna(subset="D", axis=0)
  207. def test_subset_is_nparray(self):
  208. # GH 41021
  209. df = DataFrame({"A": [1, 2, np.NaN], "B": list("abc"), "C": [4, np.NaN, 5]})
  210. expected = DataFrame({"A": [1.0], "B": ["a"], "C": [4.0]})
  211. result = df.dropna(subset=np.array(["A", "C"]))
  212. tm.assert_frame_equal(result, expected)
  213. def test_no_nans_in_frame(self, axis):
  214. # GH#41965
  215. df = DataFrame([[1, 2], [3, 4]], columns=pd.RangeIndex(0, 2))
  216. expected = df.copy()
  217. result = df.dropna(axis=axis)
  218. tm.assert_frame_equal(result, expected, check_index_type=True)
  219. def test_how_thresh_param_incompatible(self):
  220. # GH46575
  221. df = DataFrame([1, 2, pd.NA])
  222. msg = "You cannot set both the how and thresh arguments at the same time"
  223. with pytest.raises(TypeError, match=msg):
  224. df.dropna(how="all", thresh=2)
  225. with pytest.raises(TypeError, match=msg):
  226. df.dropna(how="any", thresh=2)
  227. with pytest.raises(TypeError, match=msg):
  228. df.dropna(how=None, thresh=None)
  229. @pytest.mark.parametrize("val", [1, 1.5])
  230. def test_dropna_ignore_index(self, val):
  231. # GH#31725
  232. df = DataFrame({"a": [1, 2, val]}, index=[3, 2, 1])
  233. result = df.dropna(ignore_index=True)
  234. expected = DataFrame({"a": [1, 2, val]})
  235. tm.assert_frame_equal(result, expected)
  236. df.dropna(ignore_index=True, inplace=True)
  237. tm.assert_frame_equal(df, expected)