# test_missing.py (7.3 KB)
  1. import collections
  2. import numpy as np
  3. import pytest
  4. from pandas.core.dtypes.dtypes import CategoricalDtype
  5. import pandas as pd
  6. from pandas import (
  7. Categorical,
  8. DataFrame,
  9. Index,
  10. Series,
  11. isna,
  12. )
  13. import pandas._testing as tm
  class TestCategoricalMissing:
      """Checks of how ``Categorical`` stores, detects and fills missing values."""

      def test_isna(self):
          """``Categorical.isna`` marks only the NaN position as missing."""
          cat = Categorical(["a", "b", np.nan])
          expected = np.array([False, False, True])

          result = cat.isna()

          tm.assert_numpy_array_equal(result, expected)

      def test_na_flags_int_categories(self):
          """Positions whose code is -1 are reported as missing by ``isna``."""
          # #1457
          categories = list(range(10))
          # A seeded generator keeps the drawn labels identical on every run.
          labels = np.random.default_rng(2).integers(0, 10, 20)
          labels[::5] = -1

          # ``labels`` holds codes rather than values, so go through the public
          # codes-based constructor instead of the deprecated ``fastpath``
          # keyword of the ``Categorical`` constructor.
          cat = Categorical.from_codes(labels, categories)

          # Smoke check: building the repr must not raise with -1 codes present.
          repr(cat)

          tm.assert_numpy_array_equal(isna(cat), labels == -1)

      def test_nan_handling(self):
          """NaN is encoded as code -1 and is never added to the categories."""
          expected_categories = Index(["a", "b"])

          # Nans are represented as -1 in codes
          c = Categorical(["a", "b", np.nan, "a"])
          tm.assert_index_equal(c.categories, expected_categories)
          tm.assert_numpy_array_equal(
              c._codes, np.array([0, 1, -1, 0], dtype=np.int8)
          )

          # Assigning NaN in place only flips the code at that position to -1;
          # the categories stay as they were.
          c[1] = np.nan
          tm.assert_index_equal(c.categories, expected_categories)
          tm.assert_numpy_array_equal(
              c._codes, np.array([0, -1, -1, 0], dtype=np.int8)
          )

          # A newly constructed Categorical shows the initial encoding again.
          c = Categorical(["a", "b", np.nan, "a"])
          tm.assert_index_equal(c.categories, expected_categories)
          tm.assert_numpy_array_equal(
              c._codes, np.array([0, 1, -1, 0], dtype=np.int8)
          )

      def test_set_dtype_nans(self):
          """``_set_dtype`` keeps NaN at -1 and maps a dropped category to -1."""
          c = Categorical(["a", "b", np.nan])

          result = c._set_dtype(CategoricalDtype(["a", "c"]))

          # "b" is absent from the new categories, so its code becomes -1 too.
          expected_codes = np.array([0, -1, -1], dtype="int8")
          tm.assert_numpy_array_equal(result.codes, expected_codes)

      def test_set_item_nan(self):
          """Setting an element to NaN leaves the categories unchanged."""
          cat = Categorical([1, 2, 3])
          cat[1] = np.nan

          expected = Categorical([1, np.nan, 3], categories=[1, 2, 3])
          tm.assert_categorical_equal(cat, expected)

      @pytest.mark.parametrize(
          "fillna_kwargs, msg",
          [
              (
                  {"value": 1, "method": "ffill"},
                  "Cannot specify both 'value' and 'method'.",
              ),
              ({}, "Must specify a fill 'value' or 'method'."),
              ({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
              (
                  {"value": Series([1, 2, 3, 4, "a"])},
                  "Cannot setitem on a Categorical with a new category",
              ),
          ],
      )
      def test_fillna_raises(self, fillna_kwargs, msg):
          """Invalid ``fillna`` arguments raise with the expected message.

          Parameters
          ----------
          fillna_kwargs : dict
              Keyword arguments forwarded to ``Categorical.fillna``.
          msg : str
              Regular expression the raised error message must match.
          """
          # https://github.com/pandas-dev/pandas/issues/19682
          # https://github.com/pandas-dev/pandas/issues/13628
          cat = Categorical([1, 2, 3, None, None])

          # Only the case that passes nothing but ``value`` is expected to
          # raise TypeError; every other case is expected to raise ValueError.
          only_value_given = set(fillna_kwargs) == {"value"}
          err = TypeError if only_value_given else ValueError

          with pytest.raises(err, match=msg):
              cat.fillna(**fillna_kwargs)

      @pytest.mark.parametrize("named", [True, False])
      def test_fillna_iterable_category(self, named):
          """``fillna`` accepts a tuple-like fill value that is a category.

          Parameters
          ----------
          named : bool
              If True the points are namedtuples, otherwise plain tuples.
          """
          # https://github.com/pandas-dev/pandas/issues/21097
          if named:
              Point = collections.namedtuple("Point", "x y")
          else:

              def Point(*args):
                  # Plain-tuple factory with the same call shape as the namedtuple.
                  return args

          cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object))
          result = cat.fillna(Point(0, 0))
          expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])

          tm.assert_categorical_equal(result, expected)

          # Case where the Point is not among our categories; we want TypeError,
          # not NotImplementedError GH#41914
          cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
          msg = "Cannot setitem on a Categorical with a new category"
          with pytest.raises(TypeError, match=msg):
              cat.fillna(Point(0, 0))

      def test_fillna_array(self):
          """``fillna`` accepts a Categorical or ndarray of valid fill values."""
          # accept Categorical or ndarray value if it holds appropriate values
          cat = Categorical(["A", "B", "C", None, None])

          other = cat.fillna("C")
          result = cat.fillna(other)
          tm.assert_categorical_equal(result, other)
          assert isna(cat[-1])  # didn't modify original inplace

          other = np.array(["A", "B", "C", "B", "A"])
          result = cat.fillna(other)
          expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
          tm.assert_categorical_equal(result, expected)
          assert isna(cat[-1])  # didn't modify original inplace

      @pytest.mark.parametrize(
          "values, expected",
          [
              ([1, 2, 3], np.array([False, False, False])),
              ([1, 2, np.nan], np.array([False, False, True])),
              ([1, 2, np.inf], np.array([False, False, True])),
              ([1, 2, pd.NA], np.array([False, False, True])),
          ],
      )
      def test_use_inf_as_na(self, values, expected):
          """With ``mode.use_inf_as_na`` on, ``.isna()`` also flags ``inf``.

          Parameters
          ----------
          values : list
              Values the Categorical is built from, inside the option context.
          expected : numpy.ndarray
              Boolean mask expected from ``isna``.
          """
          # https://github.com/pandas-dev/pandas/issues/33594
          # NOTE(review): ``mode.use_inf_as_na`` appears to be deprecated in
          # newer pandas releases -- confirm against the targeted version.
          with pd.option_context("mode.use_inf_as_na", True):
              cat = Categorical(values)
              tm.assert_numpy_array_equal(cat.isna(), expected)

              expected_ser = Series(expected)
              tm.assert_series_equal(Series(cat).isna(), expected_ser)

              # The expected frame is built from the expected Series.
              expected_df = DataFrame(expected_ser)
              tm.assert_frame_equal(DataFrame(cat).isna(), expected_df)

      @pytest.mark.parametrize(
          "values, expected",
          [
              ([1, 2, 3], np.array([False, False, False])),
              ([1, 2, np.nan], np.array([False, False, True])),
              ([1, 2, np.inf], np.array([False, False, True])),
              ([1, 2, pd.NA], np.array([False, False, True])),
          ],
      )
      def test_use_inf_as_na_outside_context(self, values, expected):
          """Top-level ``isna`` flags ``inf`` for a Categorical built beforehand.

          Parameters
          ----------
          values : list
              Values the Categorical is built from, before the option is set.
          expected : numpy.ndarray
              Boolean mask expected from ``isna``.
          """
          # https://github.com/pandas-dev/pandas/issues/33594
          # Using isna directly for Categorical will fail in general here
          # NOTE(review): ``mode.use_inf_as_na`` appears to be deprecated in
          # newer pandas releases -- confirm against the targeted version.
          cat = Categorical(values)

          with pd.option_context("mode.use_inf_as_na", True):
              tm.assert_numpy_array_equal(isna(cat), expected)

              expected_ser = Series(expected)
              tm.assert_series_equal(isna(Series(cat)), expected_ser)

              # The expected frame is built from the expected Series.
              expected_df = DataFrame(expected_ser)
              tm.assert_frame_equal(isna(DataFrame(cat)), expected_df)

      @pytest.mark.parametrize(
          "a1, a2, categories",
          [
              (["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
              ([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
          ],
      )
      def test_compare_categorical_with_missing(self, a1, a2, categories):
          """``!=`` and ``==`` on categorical Series match the plain Series result.

          Parameters
          ----------
          a1, a2 : list
              Values of the left and right operand; ``a2`` contains a NaN.
          categories : list
              Categories of the dtype shared by both operands.
          """
          # GH 28384
          cat_type = CategoricalDtype(categories)
          left = Series(a1, dtype=cat_type)
          right = Series(a2, dtype=cat_type)

          # !=
          result = left != right
          expected = Series(a1) != Series(a2)
          tm.assert_series_equal(result, expected)

          # ==
          result = left == right
          expected = Series(a1) == Series(a2)
          tm.assert_series_equal(result, expected)

      @pytest.mark.parametrize(
          "na_value, dtype",
          [
              (pd.NaT, "datetime64[ns]"),
              (None, "float64"),
              (np.nan, "float64"),
              (pd.NA, "float64"),
          ],
      )
      def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
          """An all-missing Categorical has empty categories of the given dtype.

          Parameters
          ----------
          na_value : object
              Missing-value marker used for both elements.
          dtype : str
              Dtype expected for the empty ``categories`` index.
          """
          # GH#44900
          result = Categorical([na_value, na_value])

          tm.assert_index_equal(result.categories, Index([], dtype=dtype))