123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363 |
- import numpy as np
- import pytest
- from pandas.core.dtypes.concat import union_categoricals
- import pandas as pd
- from pandas import (
- Categorical,
- CategoricalIndex,
- Series,
- )
- import pandas._testing as tm
class TestUnionCategoricals:
    """Tests for ``union_categoricals``.

    Covers plain, boxed (``Series`` / ``CategoricalIndex``), ordered, NaN-bearing
    and empty inputs, plus the ``ignore_order`` and ``sort_categories`` options.
    """

    @pytest.mark.parametrize(
        "a, b, combined",
        [
            (["a", "b", "c"], ["a", "b", "d"], ["a", "b", "c", "a", "b", "d"]),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
            (
                ["b", "b", np.nan, "a"],
                ["a", np.nan, "c"],
                ["b", "b", np.nan, "a", "a", np.nan, "c"],
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05"),
                pd.date_range("2014-01-06", "2014-01-07"),
                pd.date_range("2014-01-01", "2014-01-07"),
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
                pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
                pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
            ),
            (
                pd.period_range("2014-01-01", "2014-01-05"),
                pd.period_range("2014-01-06", "2014-01-07"),
                pd.period_range("2014-01-01", "2014-01-07"),
            ),
        ],
    )
    @pytest.mark.parametrize("box", [Categorical, CategoricalIndex, Series])
    def test_union_categorical(self, a, b, combined, box):
        """Union of two boxed categoricals equals ``Categorical(combined)``."""
        # GH 13361
        boxed = [box(Categorical(values)) for values in (a, b)]
        tm.assert_categorical_equal(union_categoricals(boxed), Categorical(combined))

    def test_union_categorical_ordered_appearance(self):
        """Result categories follow order of appearance across the inputs."""
        # new categories ordered by appearance
        left = Categorical(["x", "y", "z"])
        right = Categorical(["a", "b", "c"])

        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
        )
        tm.assert_categorical_equal(union_categoricals([left, right]), expected)

    def test_union_categorical_ordered_true(self):
        """Two ordered categoricals with equal categories union to an ordered one."""
        left = Categorical([0, 1.2, 2], ordered=True)
        right = Categorical([0, 1.2, 2], ordered=True)

        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(union_categoricals([left, right]), expected)

    def test_union_categorical_match_types(self):
        """Float-valued and integer-valued categories cannot be unioned."""
        # must exactly match types
        floats = Categorical([0, 1.2, 2])
        ints = Categorical([2, 3, 4])

        with pytest.raises(TypeError, match="dtype of categories must be the same"):
            union_categoricals([floats, ints])

    def test_union_categorical_empty(self):
        """An empty list of inputs raises ``ValueError``."""
        with pytest.raises(ValueError, match="No Categoricals to union"):
            union_categoricals([])

    def test_union_categoricals_nan(self):
        """Missing values are carried through the union for several value types."""
        # GH 13759
        # numeric values with NaN
        numeric = [Categorical([1, 2, np.nan]), Categorical([3, 2, np.nan])]
        tm.assert_categorical_equal(
            union_categoricals(numeric), Categorical([1, 2, np.nan, 3, 2, np.nan])
        )

        # string values with NaN in only one input
        strings = [Categorical(["A", "B"]), Categorical(["B", "B", np.nan])]
        tm.assert_categorical_equal(
            union_categoricals(strings), Categorical(["A", "B", "B", "B", np.nan])
        )

        # timestamps with NaT
        first_stamps = [
            pd.Timestamp("2011-01-01"),
            pd.Timestamp("2011-03-01"),
            pd.NaT,
        ]
        second_stamps = [
            pd.NaT,
            pd.Timestamp("2011-01-01"),
            pd.Timestamp("2011-02-01"),
        ]
        stamp_categories = [
            pd.Timestamp("2011-01-01"),
            pd.Timestamp("2011-03-01"),
            pd.Timestamp("2011-02-01"),
        ]
        actual = union_categoricals(
            [Categorical(first_stamps), Categorical(second_stamps)]
        )
        wanted = Categorical(first_stamps + second_stamps, categories=stamp_categories)
        tm.assert_categorical_equal(actual, wanted)

        # all NaN
        only_nan_object = Categorical(np.array([np.nan, np.nan], dtype=object))
        actual = union_categoricals([only_nan_object, Categorical(["X"])])
        tm.assert_categorical_equal(actual, Categorical([np.nan, np.nan, "X"]))

        actual = union_categoricals(
            [Categorical([np.nan, np.nan]), Categorical([np.nan, np.nan])]
        )
        wanted = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(actual, wanted)

    @pytest.mark.parametrize("val", [[], ["1"]])
    def test_union_categoricals_empty(self, val):
        """Unioning an empty categorical with ``val`` gives ``Categorical(val)``."""
        # GH 13759
        actual = union_categoricals([Categorical([]), Categorical(val)])
        tm.assert_categorical_equal(actual, Categorical(val))

    def test_union_categorical_same_category(self):
        """Inputs sharing identical integer categories keep those categories."""
        # check fastpath
        first = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        second = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])

        wanted = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(union_categoricals([first, second]), wanted)

    def test_union_categorical_same_category_str(self):
        """Inputs sharing identical string categories keep those categories."""
        first = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
        second = Categorical(["x", "x", "x"], categories=["x", "y", "z"])

        wanted = Categorical(
            ["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"]
        )
        tm.assert_categorical_equal(union_categoricals([first, second]), wanted)

    def test_union_categorical_same_categories_different_order(self):
        """Same categories in a different order: the first input's order is used."""
        # https://github.com/pandas-dev/pandas/issues/19096
        first = Categorical(["a", "b", "c"], categories=["a", "b", "c"])
        second = Categorical(["a", "b", "c"], categories=["b", "a", "c"])

        wanted = Categorical(
            ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]
        )
        tm.assert_categorical_equal(union_categoricals([first, second]), wanted)

    def test_union_categoricals_ordered(self):
        """Ordered inputs must agree on orderedness and on their categories."""
        ordered = Categorical([1, 2, 3], ordered=True)
        unordered = Categorical([1, 2, 3], ordered=False)

        # mixing ordered and unordered is rejected
        with pytest.raises(TypeError, match="Categorical.ordered must be the same"):
            union_categoricals([ordered, unordered])

        # an ordered categorical unioned with itself stays ordered
        actual = union_categoricals([ordered, ordered])
        wanted = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(actual, wanted)

        # NaN in one input, explicit categories in the other
        with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
        partial = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        actual = union_categoricals([with_nan, partial])
        wanted = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(actual, wanted)

        # same category values but a different category order is rejected
        ascending = Categorical([1, 2, 3], ordered=True)
        descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([ascending, descending])

    def test_union_categoricals_ignore_order(self):
        """``ignore_order=True`` yields an unordered result and relaxes the checks."""
        # GH 15219
        ordered = Categorical([1, 2, 3], ordered=True)
        unordered = Categorical([1, 2, 3], ordered=False)

        actual = union_categoricals([ordered, unordered], ignore_order=True)
        tm.assert_categorical_equal(actual, Categorical([1, 2, 3, 1, 2, 3]))

        with pytest.raises(TypeError, match="Categorical.ordered must be the same"):
            union_categoricals([ordered, unordered], ignore_order=False)

        actual = union_categoricals([ordered, ordered], ignore_order=True)
        tm.assert_categorical_equal(actual, Categorical([1, 2, 3, 1, 2, 3]))

        actual = union_categoricals([ordered, ordered], ignore_order=False)
        wanted = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(actual, wanted)

        # NaN in one input, explicit categories in the other
        with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
        partial = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        actual = union_categoricals([with_nan, partial], ignore_order=True)
        tm.assert_categorical_equal(actual, Categorical([1, 2, 3, np.nan, 3, 2]))

        # same category values, different category order
        ascending = Categorical([1, 2, 3], ordered=True)
        descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        actual = union_categoricals([ascending, descending], ignore_order=True)
        tm.assert_categorical_equal(actual, Categorical([1, 2, 3, 1, 2, 3]))

        actual = union_categoricals(
            [descending, ascending], ignore_order=True, sort_categories=True
        )
        wanted = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(actual, wanted)

        # disjoint categories
        low = Categorical([1, 2, 3], ordered=True)
        high = Categorical([4, 5, 6], ordered=True)

        actual = union_categoricals([low, high], ignore_order=True)
        tm.assert_categorical_equal(actual, Categorical([1, 2, 3, 4, 5, 6]))

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([low, high], ignore_order=False)

        with pytest.raises(TypeError, match=msg):
            union_categoricals([low, high])

    def test_union_categoricals_sort(self):
        """``sort_categories=True`` sorts result categories; ordered inputs raise."""
        # GH 13846
        first = Categorical(["x", "y", "z"])
        second = Categorical(["a", "b", "c"])
        actual = union_categoricals([first, second], sort_categories=True)
        wanted = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
        )
        tm.assert_categorical_equal(actual, wanted)

        # fastpath: shared categories in two unsorted orders, then already
        # sorted (skip resort); the expected result is the same each time
        for shared in (["b", "a", "c"], ["c", "a", "b"], ["a", "b", "c"]):
            first = Categorical(["a", "b"], categories=shared)
            second = Categorical(["b", "c"], categories=shared)
            actual = union_categoricals([first, second], sort_categories=True)
            wanted = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
            tm.assert_categorical_equal(actual, wanted)

        # NaN alongside real values
        first = Categorical(["x", np.nan])
        second = Categorical([np.nan, "b"])
        actual = union_categoricals([first, second], sort_categories=True)
        wanted = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
        tm.assert_categorical_equal(actual, wanted)

        # only NaN
        actual = union_categoricals(
            [Categorical([np.nan]), Categorical([np.nan])], sort_categories=True
        )
        tm.assert_categorical_equal(actual, Categorical([np.nan, np.nan]))

        # both empty
        actual = union_categoricals(
            [Categorical([]), Categorical([])], sort_categories=True
        )
        tm.assert_categorical_equal(actual, Categorical([]))

        # ordered inputs cannot be re-sorted
        first = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        second = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        msg = "Cannot use sort_categories=True with ordered Categoricals"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([first, second], sort_categories=True)

    def test_union_categoricals_sort_false(self):
        """``sort_categories=False`` keeps categories in order of appearance."""
        # GH 13846
        first = Categorical(["x", "y", "z"])
        second = Categorical(["a", "b", "c"])

        actual = union_categoricals([first, second], sort_categories=False)
        wanted = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
        )
        tm.assert_categorical_equal(actual, wanted)

    def test_union_categoricals_sort_false_fastpath(self):
        """Shared unsorted categories are left unsorted when not sorting."""
        # fastpath
        first = Categorical(["a", "b"], categories=["b", "a", "c"])
        second = Categorical(["b", "c"], categories=["b", "a", "c"])

        actual = union_categoricals([first, second], sort_categories=False)
        wanted = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"])
        tm.assert_categorical_equal(actual, wanted)

    def test_union_categoricals_sort_false_skipresort(self):
        """Shared, already sorted categories are unchanged when not sorting."""
        # fastpath - skip resort
        first = Categorical(["a", "b"], categories=["a", "b", "c"])
        second = Categorical(["b", "c"], categories=["a", "b", "c"])

        actual = union_categoricals([first, second], sort_categories=False)
        wanted = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(actual, wanted)

    def test_union_categoricals_sort_false_one_nan(self):
        """NaN values are kept and never become a category when not sorting."""
        first = Categorical(["x", np.nan])
        second = Categorical([np.nan, "b"])

        actual = union_categoricals([first, second], sort_categories=False)
        wanted = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
        tm.assert_categorical_equal(actual, wanted)

    def test_union_categoricals_sort_false_only_nan(self):
        """All-NaN inputs union to an all-NaN categorical when not sorting."""
        first = Categorical([np.nan])
        second = Categorical([np.nan])

        actual = union_categoricals([first, second], sort_categories=False)
        tm.assert_categorical_equal(actual, Categorical([np.nan, np.nan]))

    def test_union_categoricals_sort_false_empty(self):
        """Two empty categoricals union to an empty one when not sorting."""
        first = Categorical([])
        second = Categorical([])

        actual = union_categoricals([first, second], sort_categories=False)
        tm.assert_categorical_equal(actual, Categorical([]))

    def test_union_categoricals_sort_false_ordered_true(self):
        """Ordered inputs are accepted with ``sort_categories=False``."""
        first = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        second = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)

        actual = union_categoricals([first, second], sort_categories=False)
        wanted = Categorical(
            ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True
        )
        tm.assert_categorical_equal(actual, wanted)

    def test_union_categorical_unwrap(self):
        """Series and CategoricalIndex inputs are accepted; plain lists are not."""
        # GH 14173
        expected = Categorical(["a", "b", "b", "c"])

        plain = Categorical(["a", "b"])
        as_series = Series(["b", "c"], dtype="category")
        tm.assert_categorical_equal(
            union_categoricals([plain, as_series]), expected
        )

        as_index = CategoricalIndex(as_series)
        tm.assert_categorical_equal(union_categoricals([plain, as_index]), expected)

        plain_series = Series(plain)
        tm.assert_categorical_equal(
            union_categoricals([plain_series, as_index]), expected
        )

        msg = "all components to combine must be Categorical"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([plain_series, ["a", "b", "c"]])
|