test_categorical.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. import numpy as np
  2. from pandas.core.dtypes.dtypes import CategoricalDtype
  3. import pandas as pd
  4. from pandas import (
  5. Categorical,
  6. DataFrame,
  7. Series,
  8. )
  9. import pandas._testing as tm
  10. class TestCategoricalConcat:
  11. def test_categorical_concat(self, sort):
  12. # See GH 10177
  13. df1 = DataFrame(
  14. np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"]
  15. )
  16. df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"])
  17. cat_values = ["one", "one", "two", "one", "two", "two", "one"]
  18. df2["h"] = Series(Categorical(cat_values))
  19. res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
  20. exp = DataFrame(
  21. {
  22. "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
  23. "b": [
  24. 1,
  25. 4,
  26. 7,
  27. 10,
  28. 13,
  29. 16,
  30. np.nan,
  31. np.nan,
  32. np.nan,
  33. np.nan,
  34. np.nan,
  35. np.nan,
  36. np.nan,
  37. ],
  38. "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
  39. "h": [None] * 6 + cat_values,
  40. }
  41. )
  42. exp["h"] = exp["h"].astype(df2["h"].dtype)
  43. tm.assert_frame_equal(res, exp)
  44. def test_categorical_concat_dtypes(self):
  45. # GH8143
  46. index = ["cat", "obj", "num"]
  47. cat = Categorical(["a", "b", "c"])
  48. obj = Series(["a", "b", "c"])
  49. num = Series([1, 2, 3])
  50. df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
  51. result = df.dtypes == "object"
  52. expected = Series([False, True, False], index=index)
  53. tm.assert_series_equal(result, expected)
  54. result = df.dtypes == "int64"
  55. expected = Series([False, False, True], index=index)
  56. tm.assert_series_equal(result, expected)
  57. result = df.dtypes == "category"
  58. expected = Series([True, False, False], index=index)
  59. tm.assert_series_equal(result, expected)
  60. def test_concat_categoricalindex(self):
  61. # GH 16111, categories that aren't lexsorted
  62. categories = [9, 0, 1, 2, 3]
  63. a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories))
  64. b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories))
  65. c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories))
  66. result = pd.concat([a, b, c], axis=1)
  67. exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
  68. exp = DataFrame(
  69. {
  70. 0: [1, 1, np.nan, np.nan],
  71. 1: [np.nan, 2, 2, np.nan],
  72. 2: [np.nan, np.nan, 3, 3],
  73. },
  74. columns=[0, 1, 2],
  75. index=exp_idx,
  76. )
  77. tm.assert_frame_equal(result, exp)
  78. def test_categorical_concat_preserve(self):
  79. # GH 8641 series concat not preserving category dtype
  80. # GH 13524 can concat different categories
  81. s = Series(list("abc"), dtype="category")
  82. s2 = Series(list("abd"), dtype="category")
  83. exp = Series(list("abcabd"))
  84. res = pd.concat([s, s2], ignore_index=True)
  85. tm.assert_series_equal(res, exp)
  86. exp = Series(list("abcabc"), dtype="category")
  87. res = pd.concat([s, s], ignore_index=True)
  88. tm.assert_series_equal(res, exp)
  89. exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category")
  90. res = pd.concat([s, s])
  91. tm.assert_series_equal(res, exp)
  92. a = Series(np.arange(6, dtype="int64"))
  93. b = Series(list("aabbca"))
  94. df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))})
  95. res = pd.concat([df2, df2])
  96. exp = DataFrame(
  97. {
  98. "A": pd.concat([a, a]),
  99. "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
  100. }
  101. )
  102. tm.assert_frame_equal(res, exp)
  103. def test_categorical_index_preserver(self):
  104. a = Series(np.arange(6, dtype="int64"))
  105. b = Series(list("aabbca"))
  106. df2 = DataFrame(
  107. {"A": a, "B": b.astype(CategoricalDtype(list("cab")))}
  108. ).set_index("B")
  109. result = pd.concat([df2, df2])
  110. expected = DataFrame(
  111. {
  112. "A": pd.concat([a, a]),
  113. "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
  114. }
  115. ).set_index("B")
  116. tm.assert_frame_equal(result, expected)
  117. # wrong categories -> uses concat_compat, which casts to object
  118. df3 = DataFrame(
  119. {"A": a, "B": Categorical(b, categories=list("abe"))}
  120. ).set_index("B")
  121. result = pd.concat([df2, df3])
  122. expected = pd.concat(
  123. [
  124. df2.set_axis(df2.index.astype(object), axis=0),
  125. df3.set_axis(df3.index.astype(object), axis=0),
  126. ]
  127. )
  128. tm.assert_frame_equal(result, expected)
  129. def test_concat_categorical_tz(self):
  130. # GH-23816
  131. a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific"))
  132. b = Series(["a", "b"], dtype="category")
  133. result = pd.concat([a, b], ignore_index=True)
  134. expected = Series(
  135. [
  136. pd.Timestamp("2017-01-01", tz="US/Pacific"),
  137. pd.Timestamp("2017-01-02", tz="US/Pacific"),
  138. "a",
  139. "b",
  140. ]
  141. )
  142. tm.assert_series_equal(result, expected)
  143. def test_concat_categorical_unchanged(self):
  144. # GH-12007
  145. # test fix for when concat on categorical and float
  146. # coerces dtype categorical -> float
  147. df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A"))
  148. ser = Series([0, 1, 2], index=[0, 1, 3], name="B")
  149. result = pd.concat([df, ser], axis=1)
  150. expected = DataFrame(
  151. {
  152. "A": Series(["a", "b", "c", np.nan], dtype="category"),
  153. "B": Series([0, 1, np.nan, 2], dtype="float"),
  154. }
  155. )
  156. tm.assert_equal(result, expected)
  157. def test_categorical_concat_gh7864(self):
  158. # GH 7864
  159. # make sure ordering is preserved
  160. df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")})
  161. df["grade"] = Categorical(df["raw_grade"])
  162. df["grade"].cat.set_categories(["e", "a", "b"])
  163. df1 = df[0:3]
  164. df2 = df[3:]
  165. tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories)
  166. tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories)
  167. dfx = pd.concat([df1, df2])
  168. tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories)
  169. dfa = df1._append(df2)
  170. tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories)
  171. def test_categorical_index_upcast(self):
  172. # GH 17629
  173. # test upcasting to object when concatinating on categorical indexes
  174. # with non-identical categories
  175. a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"]))
  176. b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"]))
  177. res = pd.concat([a, b])
  178. exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"])
  179. tm.assert_equal(res, exp)
  180. a = Series([1, 2], index=Categorical(["foo", "bar"]))
  181. b = Series([4, 3], index=Categorical(["baz", "bar"]))
  182. res = pd.concat([a, b])
  183. exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"])
  184. tm.assert_equal(res, exp)
  185. def test_categorical_missing_from_one_frame(self):
  186. # GH 25412
  187. df1 = DataFrame({"f1": [1, 2, 3]})
  188. df2 = DataFrame({"f1": [2, 3, 1], "f2": Series([4, 4, 4]).astype("category")})
  189. result = pd.concat([df1, df2], sort=True)
  190. dtype = CategoricalDtype([4])
  191. expected = DataFrame(
  192. {
  193. "f1": [1, 2, 3, 2, 3, 1],
  194. "f2": Categorical.from_codes([-1, -1, -1, 0, 0, 0], dtype=dtype),
  195. },
  196. index=[0, 1, 2, 0, 1, 2],
  197. )
  198. tm.assert_frame_equal(result, expected)
  199. def test_concat_categorical_same_categories_different_order(self):
  200. # https://github.com/pandas-dev/pandas/issues/24845
  201. c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
  202. c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
  203. c3 = pd.CategoricalIndex(
  204. ["a", "a", "b", "b"], categories=["a", "b"], ordered=False
  205. )
  206. df1 = DataFrame({"A": [1, 2]}, index=c1)
  207. df2 = DataFrame({"A": [3, 4]}, index=c2)
  208. result = pd.concat((df1, df2))
  209. expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3)
  210. tm.assert_frame_equal(result, expected)