test_isin.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. MultiIndex,
  7. Series,
  8. )
  9. import pandas._testing as tm
  10. class TestDataFrameIsIn:
  11. def test_isin(self):
  12. # GH#4211
  13. df = DataFrame(
  14. {
  15. "vals": [1, 2, 3, 4],
  16. "ids": ["a", "b", "f", "n"],
  17. "ids2": ["a", "n", "c", "n"],
  18. },
  19. index=["foo", "bar", "baz", "qux"],
  20. )
  21. other = ["a", "b", "c"]
  22. result = df.isin(other)
  23. expected = DataFrame([df.loc[s].isin(other) for s in df.index])
  24. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
  def test_isin_empty(self, empty):
      """Check that ``isin`` with each parametrized empty container gives an all-False frame."""
      # GH#16991
      frame = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
      all_false = DataFrame(False, index=frame.index, columns=frame.columns)

      tm.assert_frame_equal(frame.isin(empty), all_false)
  32. def test_isin_dict(self):
  33. df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
  34. d = {"A": ["a"]}
  35. expected = DataFrame(False, df.index, df.columns)
  36. expected.loc[0, "A"] = True
  37. result = df.isin(d)
  38. tm.assert_frame_equal(result, expected)
  39. # non unique columns
  40. df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
  41. df.columns = ["A", "A"]
  42. expected = DataFrame(False, df.index, df.columns)
  43. expected.loc[0, "A"] = True
  44. result = df.isin(d)
  45. tm.assert_frame_equal(result, expected)
  def test_isin_with_string_scalar(self):
      """Check that passing a bare string to ``DataFrame.isin`` raises ``TypeError``."""
      # GH#4763
      frame = DataFrame(
          {
              "vals": [1, 2, 3, 4],
              "ids": ["a", "b", "f", "n"],
              "ids2": ["a", "n", "c", "n"],
          },
          index=["foo", "bar", "baz", "qux"],
      )
      msg = (
          r"only list-like or dict-like objects are allowed "
          r"to be passed to DataFrame.isin\(\), you passed a 'str'"
      )

      # Both a one-character and a multi-character string are checked.
      for scalar in ("a", "aaa"):
          with pytest.raises(TypeError, match=msg):
              frame.isin(scalar)
  64. def test_isin_df(self):
  65. df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
  66. df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]})
  67. expected = DataFrame(False, df1.index, df1.columns)
  68. result = df1.isin(df2)
  69. expected.loc[[1, 3], "A"] = True
  70. expected.loc[[0, 2], "B"] = True
  71. tm.assert_frame_equal(result, expected)
  72. # partial overlapping columns
  73. df2.columns = ["A", "C"]
  74. result = df1.isin(df2)
  75. expected["B"] = False
  76. tm.assert_frame_equal(result, expected)
  77. def test_isin_tuples(self):
  78. # GH#16394
  79. df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]})
  80. df["C"] = list(zip(df["A"], df["B"]))
  81. result = df["C"].isin([(1, "a")])
  82. tm.assert_series_equal(result, Series([True, False, False], name="C"))
  def test_isin_df_dupe_values(self):
      """Check that ``isin`` raises ``ValueError`` when the other frame has a duplicated axis."""
      left = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
      msg = r"cannot compute isin with a duplicate axis\."
      rows = [[0, 2], [12, 4], [2, np.nan], [4, 5]]

      # just cols duped
      right = DataFrame(rows, columns=["B", "B"])
      with pytest.raises(ValueError, match=msg):
          left.isin(right)

      # just index duped
      right = DataFrame(rows, columns=["A", "B"], index=[0, 0, 1, 1])
      with pytest.raises(ValueError, match=msg):
          left.isin(right)

      # cols and index:
      right.columns = ["B", "B"]
      with pytest.raises(ValueError, match=msg):
          left.isin(right)
  102. def test_isin_dupe_self(self):
  103. other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]})
  104. df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"])
  105. result = df.isin(other)
  106. expected = DataFrame(False, index=df.index, columns=df.columns)
  107. expected.loc[0] = True
  108. expected.iloc[1, 1] = True
  109. tm.assert_frame_equal(result, expected)
  110. def test_isin_against_series(self):
  111. df = DataFrame(
  112. {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"]
  113. )
  114. s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"])
  115. expected = DataFrame(False, index=df.index, columns=df.columns)
  116. expected.loc["a", "A"] = True
  117. expected.loc["d"] = True
  118. result = df.isin(s)
  119. tm.assert_frame_equal(result, expected)
  120. def test_isin_multiIndex(self):
  121. idx = MultiIndex.from_tuples(
  122. [
  123. (0, "a", "foo"),
  124. (0, "a", "bar"),
  125. (0, "b", "bar"),
  126. (0, "b", "baz"),
  127. (2, "a", "foo"),
  128. (2, "a", "bar"),
  129. (2, "c", "bar"),
  130. (2, "c", "baz"),
  131. (1, "b", "foo"),
  132. (1, "b", "bar"),
  133. (1, "c", "bar"),
  134. (1, "c", "baz"),
  135. ]
  136. )
  137. df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx)
  138. df2 = DataFrame(
  139. {
  140. "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
  141. "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1],
  142. }
  143. )
  144. # against regular index
  145. expected = DataFrame(False, index=df1.index, columns=df1.columns)
  146. result = df1.isin(df2)
  147. tm.assert_frame_equal(result, expected)
  148. df2.index = idx
  149. expected = df2.values.astype(bool)
  150. expected[:, 1] = ~expected[:, 1]
  151. expected = DataFrame(expected, columns=["A", "B"], index=idx)
  152. result = df1.isin(df2)
  153. tm.assert_frame_equal(result, expected)
  154. def test_isin_empty_datetimelike(self):
  155. # GH#15473
  156. df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])})
  157. df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]})
  158. df2 = DataFrame({"date": []})
  159. df3 = DataFrame()
  160. expected = DataFrame({"date": [False, False]})
  161. result = df1_ts.isin(df2)
  162. tm.assert_frame_equal(result, expected)
  163. result = df1_ts.isin(df3)
  164. tm.assert_frame_equal(result, expected)
  165. result = df1_td.isin(df2)
  166. tm.assert_frame_equal(result, expected)
  167. result = df1_td.isin(df3)
  168. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "values",
      [
          DataFrame({"a": [1, 2, 3]}, dtype="category"),
          Series([1, 2, 3], dtype="category"),
      ],
  )
  def test_isin_category_frame(self, values):
      """Check ``isin`` on an integer frame against the parametrized categorical ``values``."""
      # GH#34256
      frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
      expected = DataFrame({"a": [True, True, True], "b": [False, False, False]})

      tm.assert_frame_equal(frame.isin(values), expected)
  182. def test_isin_read_only(self):
  183. # https://github.com/pandas-dev/pandas/issues/37174
  184. arr = np.array([1, 2, 3])
  185. arr.setflags(write=False)
  186. df = DataFrame([1, 2, 3])
  187. result = df.isin(arr)
  188. expected = DataFrame([True, True, True])
  189. tm.assert_frame_equal(result, expected)