test_union_categoricals.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. import numpy as np
  2. import pytest
  3. from pandas.core.dtypes.concat import union_categoricals
  4. import pandas as pd
  5. from pandas import (
  6. Categorical,
  7. CategoricalIndex,
  8. Series,
  9. )
  10. import pandas._testing as tm
  11. class TestUnionCategoricals:
  12. @pytest.mark.parametrize(
  13. "a, b, combined",
  14. [
  15. (list("abc"), list("abd"), list("abcabd")),
  16. ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
  17. ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
  18. (
  19. ["b", "b", np.nan, "a"],
  20. ["a", np.nan, "c"],
  21. ["b", "b", np.nan, "a", "a", np.nan, "c"],
  22. ),
  23. (
  24. pd.date_range("2014-01-01", "2014-01-05"),
  25. pd.date_range("2014-01-06", "2014-01-07"),
  26. pd.date_range("2014-01-01", "2014-01-07"),
  27. ),
  28. (
  29. pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
  30. pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
  31. pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
  32. ),
  33. (
  34. pd.period_range("2014-01-01", "2014-01-05"),
  35. pd.period_range("2014-01-06", "2014-01-07"),
  36. pd.period_range("2014-01-01", "2014-01-07"),
  37. ),
  38. ],
  39. )
  40. @pytest.mark.parametrize("box", [Categorical, CategoricalIndex, Series])
  41. def test_union_categorical(self, a, b, combined, box):
  42. # GH 13361
  43. result = union_categoricals([box(Categorical(a)), box(Categorical(b))])
  44. expected = Categorical(combined)
  45. tm.assert_categorical_equal(result, expected)
  46. def test_union_categorical_ordered_appearance(self):
  47. # new categories ordered by appearance
  48. s = Categorical(["x", "y", "z"])
  49. s2 = Categorical(["a", "b", "c"])
  50. result = union_categoricals([s, s2])
  51. expected = Categorical(
  52. ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
  53. )
  54. tm.assert_categorical_equal(result, expected)
  55. def test_union_categorical_ordered_true(self):
  56. s = Categorical([0, 1.2, 2], ordered=True)
  57. s2 = Categorical([0, 1.2, 2], ordered=True)
  58. result = union_categoricals([s, s2])
  59. expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
  60. tm.assert_categorical_equal(result, expected)
  61. def test_union_categorical_match_types(self):
  62. # must exactly match types
  63. s = Categorical([0, 1.2, 2])
  64. s2 = Categorical([2, 3, 4])
  65. msg = "dtype of categories must be the same"
  66. with pytest.raises(TypeError, match=msg):
  67. union_categoricals([s, s2])
  68. def test_union_categorical_empty(self):
  69. msg = "No Categoricals to union"
  70. with pytest.raises(ValueError, match=msg):
  71. union_categoricals([])
  72. def test_union_categoricals_nan(self):
  73. # GH 13759
  74. res = union_categoricals(
  75. [Categorical([1, 2, np.nan]), Categorical([3, 2, np.nan])]
  76. )
  77. exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
  78. tm.assert_categorical_equal(res, exp)
  79. res = union_categoricals(
  80. [Categorical(["A", "B"]), Categorical(["B", "B", np.nan])]
  81. )
  82. exp = Categorical(["A", "B", "B", "B", np.nan])
  83. tm.assert_categorical_equal(res, exp)
  84. val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT]
  85. val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")]
  86. res = union_categoricals([Categorical(val1), Categorical(val2)])
  87. exp = Categorical(
  88. val1 + val2,
  89. categories=[
  90. pd.Timestamp("2011-01-01"),
  91. pd.Timestamp("2011-03-01"),
  92. pd.Timestamp("2011-02-01"),
  93. ],
  94. )
  95. tm.assert_categorical_equal(res, exp)
  96. # all NaN
  97. res = union_categoricals(
  98. [
  99. Categorical(np.array([np.nan, np.nan], dtype=object)),
  100. Categorical(["X"]),
  101. ]
  102. )
  103. exp = Categorical([np.nan, np.nan, "X"])
  104. tm.assert_categorical_equal(res, exp)
  105. res = union_categoricals(
  106. [Categorical([np.nan, np.nan]), Categorical([np.nan, np.nan])]
  107. )
  108. exp = Categorical([np.nan, np.nan, np.nan, np.nan])
  109. tm.assert_categorical_equal(res, exp)
  110. @pytest.mark.parametrize("val", [[], ["1"]])
  111. def test_union_categoricals_empty(self, val):
  112. # GH 13759
  113. res = union_categoricals([Categorical([]), Categorical(val)])
  114. exp = Categorical(val)
  115. tm.assert_categorical_equal(res, exp)
  116. def test_union_categorical_same_category(self):
  117. # check fastpath
  118. c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
  119. c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
  120. res = union_categoricals([c1, c2])
  121. exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
  122. tm.assert_categorical_equal(res, exp)
  123. def test_union_categorical_same_category_str(self):
  124. c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
  125. c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
  126. res = union_categoricals([c1, c2])
  127. exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"])
  128. tm.assert_categorical_equal(res, exp)
  129. def test_union_categorical_same_categories_different_order(self):
  130. # https://github.com/pandas-dev/pandas/issues/19096
  131. c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"])
  132. c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"])
  133. result = union_categoricals([c1, c2])
  134. expected = Categorical(
  135. ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]
  136. )
  137. tm.assert_categorical_equal(result, expected)
  138. def test_union_categoricals_ordered(self):
  139. c1 = Categorical([1, 2, 3], ordered=True)
  140. c2 = Categorical([1, 2, 3], ordered=False)
  141. msg = "Categorical.ordered must be the same"
  142. with pytest.raises(TypeError, match=msg):
  143. union_categoricals([c1, c2])
  144. res = union_categoricals([c1, c1])
  145. exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
  146. tm.assert_categorical_equal(res, exp)
  147. c1 = Categorical([1, 2, 3, np.nan], ordered=True)
  148. c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
  149. res = union_categoricals([c1, c2])
  150. exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
  151. tm.assert_categorical_equal(res, exp)
  152. c1 = Categorical([1, 2, 3], ordered=True)
  153. c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
  154. msg = "to union ordered Categoricals, all categories must be the same"
  155. with pytest.raises(TypeError, match=msg):
  156. union_categoricals([c1, c2])
  157. def test_union_categoricals_ignore_order(self):
  158. # GH 15219
  159. c1 = Categorical([1, 2, 3], ordered=True)
  160. c2 = Categorical([1, 2, 3], ordered=False)
  161. res = union_categoricals([c1, c2], ignore_order=True)
  162. exp = Categorical([1, 2, 3, 1, 2, 3])
  163. tm.assert_categorical_equal(res, exp)
  164. msg = "Categorical.ordered must be the same"
  165. with pytest.raises(TypeError, match=msg):
  166. union_categoricals([c1, c2], ignore_order=False)
  167. res = union_categoricals([c1, c1], ignore_order=True)
  168. exp = Categorical([1, 2, 3, 1, 2, 3])
  169. tm.assert_categorical_equal(res, exp)
  170. res = union_categoricals([c1, c1], ignore_order=False)
  171. exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
  172. tm.assert_categorical_equal(res, exp)
  173. c1 = Categorical([1, 2, 3, np.nan], ordered=True)
  174. c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
  175. res = union_categoricals([c1, c2], ignore_order=True)
  176. exp = Categorical([1, 2, 3, np.nan, 3, 2])
  177. tm.assert_categorical_equal(res, exp)
  178. c1 = Categorical([1, 2, 3], ordered=True)
  179. c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
  180. res = union_categoricals([c1, c2], ignore_order=True)
  181. exp = Categorical([1, 2, 3, 1, 2, 3])
  182. tm.assert_categorical_equal(res, exp)
  183. res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True)
  184. exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
  185. tm.assert_categorical_equal(res, exp)
  186. c1 = Categorical([1, 2, 3], ordered=True)
  187. c2 = Categorical([4, 5, 6], ordered=True)
  188. result = union_categoricals([c1, c2], ignore_order=True)
  189. expected = Categorical([1, 2, 3, 4, 5, 6])
  190. tm.assert_categorical_equal(result, expected)
  191. msg = "to union ordered Categoricals, all categories must be the same"
  192. with pytest.raises(TypeError, match=msg):
  193. union_categoricals([c1, c2], ignore_order=False)
  194. with pytest.raises(TypeError, match=msg):
  195. union_categoricals([c1, c2])
  196. def test_union_categoricals_sort(self):
  197. # GH 13846
  198. c1 = Categorical(["x", "y", "z"])
  199. c2 = Categorical(["a", "b", "c"])
  200. result = union_categoricals([c1, c2], sort_categories=True)
  201. expected = Categorical(
  202. ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
  203. )
  204. tm.assert_categorical_equal(result, expected)
  205. # fastpath
  206. c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
  207. c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
  208. result = union_categoricals([c1, c2], sort_categories=True)
  209. expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
  210. tm.assert_categorical_equal(result, expected)
  211. c1 = Categorical(["a", "b"], categories=["c", "a", "b"])
  212. c2 = Categorical(["b", "c"], categories=["c", "a", "b"])
  213. result = union_categoricals([c1, c2], sort_categories=True)
  214. expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
  215. tm.assert_categorical_equal(result, expected)
  216. # fastpath - skip resort
  217. c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
  218. c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
  219. result = union_categoricals([c1, c2], sort_categories=True)
  220. expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
  221. tm.assert_categorical_equal(result, expected)
  222. c1 = Categorical(["x", np.nan])
  223. c2 = Categorical([np.nan, "b"])
  224. result = union_categoricals([c1, c2], sort_categories=True)
  225. expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
  226. tm.assert_categorical_equal(result, expected)
  227. c1 = Categorical([np.nan])
  228. c2 = Categorical([np.nan])
  229. result = union_categoricals([c1, c2], sort_categories=True)
  230. expected = Categorical([np.nan, np.nan])
  231. tm.assert_categorical_equal(result, expected)
  232. c1 = Categorical([])
  233. c2 = Categorical([])
  234. result = union_categoricals([c1, c2], sort_categories=True)
  235. expected = Categorical([])
  236. tm.assert_categorical_equal(result, expected)
  237. c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
  238. c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
  239. msg = "Cannot use sort_categories=True with ordered Categoricals"
  240. with pytest.raises(TypeError, match=msg):
  241. union_categoricals([c1, c2], sort_categories=True)
  242. def test_union_categoricals_sort_false(self):
  243. # GH 13846
  244. c1 = Categorical(["x", "y", "z"])
  245. c2 = Categorical(["a", "b", "c"])
  246. result = union_categoricals([c1, c2], sort_categories=False)
  247. expected = Categorical(
  248. ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
  249. )
  250. tm.assert_categorical_equal(result, expected)
  251. def test_union_categoricals_sort_false_fastpath(self):
  252. # fastpath
  253. c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
  254. c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
  255. result = union_categoricals([c1, c2], sort_categories=False)
  256. expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"])
  257. tm.assert_categorical_equal(result, expected)
  258. def test_union_categoricals_sort_false_skipresort(self):
  259. # fastpath - skip resort
  260. c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
  261. c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
  262. result = union_categoricals([c1, c2], sort_categories=False)
  263. expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
  264. tm.assert_categorical_equal(result, expected)
  265. def test_union_categoricals_sort_false_one_nan(self):
  266. c1 = Categorical(["x", np.nan])
  267. c2 = Categorical([np.nan, "b"])
  268. result = union_categoricals([c1, c2], sort_categories=False)
  269. expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
  270. tm.assert_categorical_equal(result, expected)
  271. def test_union_categoricals_sort_false_only_nan(self):
  272. c1 = Categorical([np.nan])
  273. c2 = Categorical([np.nan])
  274. result = union_categoricals([c1, c2], sort_categories=False)
  275. expected = Categorical([np.nan, np.nan])
  276. tm.assert_categorical_equal(result, expected)
  277. def test_union_categoricals_sort_false_empty(self):
  278. c1 = Categorical([])
  279. c2 = Categorical([])
  280. result = union_categoricals([c1, c2], sort_categories=False)
  281. expected = Categorical([])
  282. tm.assert_categorical_equal(result, expected)
  283. def test_union_categoricals_sort_false_ordered_true(self):
  284. c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
  285. c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
  286. result = union_categoricals([c1, c2], sort_categories=False)
  287. expected = Categorical(
  288. ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True
  289. )
  290. tm.assert_categorical_equal(result, expected)
  291. def test_union_categorical_unwrap(self):
  292. # GH 14173
  293. c1 = Categorical(["a", "b"])
  294. c2 = Series(["b", "c"], dtype="category")
  295. result = union_categoricals([c1, c2])
  296. expected = Categorical(["a", "b", "b", "c"])
  297. tm.assert_categorical_equal(result, expected)
  298. c2 = CategoricalIndex(c2)
  299. result = union_categoricals([c1, c2])
  300. tm.assert_categorical_equal(result, expected)
  301. c1 = Series(c1)
  302. result = union_categoricals([c1, c2])
  303. tm.assert_categorical_equal(result, expected)
  304. msg = "all components to combine must be Categorical"
  305. with pytest.raises(TypeError, match=msg):
  306. union_categoricals([c1, ["a", "b", "c"]])