# reshaping.py (13 KB)
import itertools

import numpy as np
import pytest

import pandas as pd
from pandas.api.extensions import ExtensionArray
from pandas.core.internals.blocks import EABackedBlock
from pandas.tests.extension.base.base import BaseExtensionTests
  8. class BaseReshapingTests(BaseExtensionTests):
  9. """Tests for reshaping and concatenation."""
  10. @pytest.mark.parametrize("in_frame", [True, False])
  11. def test_concat(self, data, in_frame):
  12. wrapped = pd.Series(data)
  13. if in_frame:
  14. wrapped = pd.DataFrame(wrapped)
  15. result = pd.concat([wrapped, wrapped], ignore_index=True)
  16. assert len(result) == len(data) * 2
  17. if in_frame:
  18. dtype = result.dtypes[0]
  19. else:
  20. dtype = result.dtype
  21. assert dtype == data.dtype
  22. if hasattr(result._mgr, "blocks"):
  23. assert isinstance(result._mgr.blocks[0], EABackedBlock)
  24. assert isinstance(result._mgr.arrays[0], ExtensionArray)
  25. @pytest.mark.parametrize("in_frame", [True, False])
  26. def test_concat_all_na_block(self, data_missing, in_frame):
  27. valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
  28. na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
  29. if in_frame:
  30. valid_block = pd.DataFrame({"a": valid_block})
  31. na_block = pd.DataFrame({"a": na_block})
  32. result = pd.concat([valid_block, na_block])
  33. if in_frame:
  34. expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
  35. self.assert_frame_equal(result, expected)
  36. else:
  37. expected = pd.Series(data_missing.take([1, 1, 0, 0]))
  38. self.assert_series_equal(result, expected)
  39. def test_concat_mixed_dtypes(self, data):
  40. # https://github.com/pandas-dev/pandas/issues/20762
  41. df1 = pd.DataFrame({"A": data[:3]})
  42. df2 = pd.DataFrame({"A": [1, 2, 3]})
  43. df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
  44. dfs = [df1, df2, df3]
  45. # dataframes
  46. result = pd.concat(dfs)
  47. expected = pd.concat([x.astype(object) for x in dfs])
  48. self.assert_frame_equal(result, expected)
  49. # series
  50. result = pd.concat([x["A"] for x in dfs])
  51. expected = pd.concat([x["A"].astype(object) for x in dfs])
  52. self.assert_series_equal(result, expected)
  53. # simple test for just EA and one other
  54. result = pd.concat([df1, df2.astype(object)])
  55. expected = pd.concat([df1.astype("object"), df2.astype("object")])
  56. self.assert_frame_equal(result, expected)
  57. result = pd.concat([df1["A"], df2["A"].astype(object)])
  58. expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
  59. self.assert_series_equal(result, expected)
  60. def test_concat_columns(self, data, na_value):
  61. df1 = pd.DataFrame({"A": data[:3]})
  62. df2 = pd.DataFrame({"B": [1, 2, 3]})
  63. expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
  64. result = pd.concat([df1, df2], axis=1)
  65. self.assert_frame_equal(result, expected)
  66. result = pd.concat([df1["A"], df2["B"]], axis=1)
  67. self.assert_frame_equal(result, expected)
  68. # non-aligned
  69. df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
  70. expected = pd.DataFrame(
  71. {
  72. "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
  73. "B": [np.nan, 1, 2, 3],
  74. }
  75. )
  76. result = pd.concat([df1, df2], axis=1)
  77. self.assert_frame_equal(result, expected)
  78. result = pd.concat([df1["A"], df2["B"]], axis=1)
  79. self.assert_frame_equal(result, expected)
  80. def test_concat_extension_arrays_copy_false(self, data, na_value):
  81. # GH 20756
  82. df1 = pd.DataFrame({"A": data[:3]})
  83. df2 = pd.DataFrame({"B": data[3:7]})
  84. expected = pd.DataFrame(
  85. {
  86. "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
  87. "B": data[3:7],
  88. }
  89. )
  90. result = pd.concat([df1, df2], axis=1, copy=False)
  91. self.assert_frame_equal(result, expected)
  92. def test_concat_with_reindex(self, data):
  93. # GH-33027
  94. a = pd.DataFrame({"a": data[:5]})
  95. b = pd.DataFrame({"b": data[:5]})
  96. result = pd.concat([a, b], ignore_index=True)
  97. expected = pd.DataFrame(
  98. {
  99. "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
  100. "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
  101. }
  102. )
  103. self.assert_frame_equal(result, expected)
  104. def test_align(self, data, na_value):
  105. a = data[:3]
  106. b = data[2:5]
  107. r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
  108. # Assumes that the ctor can take a list of scalars of the type
  109. e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype))
  110. e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype))
  111. self.assert_series_equal(r1, e1)
  112. self.assert_series_equal(r2, e2)
  113. def test_align_frame(self, data, na_value):
  114. a = data[:3]
  115. b = data[2:5]
  116. r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3]))
  117. # Assumes that the ctor can take a list of scalars of the type
  118. e1 = pd.DataFrame(
  119. {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)}
  120. )
  121. e2 = pd.DataFrame(
  122. {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)}
  123. )
  124. self.assert_frame_equal(r1, e1)
  125. self.assert_frame_equal(r2, e2)
  126. def test_align_series_frame(self, data, na_value):
  127. # https://github.com/pandas-dev/pandas/issues/20576
  128. ser = pd.Series(data, name="a")
  129. df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
  130. r1, r2 = ser.align(df)
  131. e1 = pd.Series(
  132. data._from_sequence(list(data) + [na_value], dtype=data.dtype),
  133. name=ser.name,
  134. )
  135. self.assert_series_equal(r1, e1)
  136. self.assert_frame_equal(r2, df)
  137. def test_set_frame_expand_regular_with_extension(self, data):
  138. df = pd.DataFrame({"A": [1] * len(data)})
  139. df["B"] = data
  140. expected = pd.DataFrame({"A": [1] * len(data), "B": data})
  141. self.assert_frame_equal(df, expected)
  142. def test_set_frame_expand_extension_with_regular(self, data):
  143. df = pd.DataFrame({"A": data})
  144. df["B"] = [1] * len(data)
  145. expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
  146. self.assert_frame_equal(df, expected)
  147. def test_set_frame_overwrite_object(self, data):
  148. # https://github.com/pandas-dev/pandas/issues/20555
  149. df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
  150. df["A"] = data
  151. assert df.dtypes["A"] == data.dtype
  152. def test_merge(self, data, na_value):
  153. # GH-20743
  154. df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
  155. df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})
  156. res = pd.merge(df1, df2)
  157. exp = pd.DataFrame(
  158. {
  159. "int1": [1, 1, 2],
  160. "int2": [1, 2, 3],
  161. "key": [0, 0, 1],
  162. "ext": data._from_sequence(
  163. [data[0], data[0], data[1]], dtype=data.dtype
  164. ),
  165. }
  166. )
  167. self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
  168. res = pd.merge(df1, df2, how="outer")
  169. exp = pd.DataFrame(
  170. {
  171. "int1": [1, 1, 2, 3, np.nan],
  172. "int2": [1, 2, 3, np.nan, 4],
  173. "key": [0, 0, 1, 2, 3],
  174. "ext": data._from_sequence(
  175. [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
  176. ),
  177. }
  178. )
  179. self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
  180. def test_merge_on_extension_array(self, data):
  181. # GH 23020
  182. a, b = data[:2]
  183. key = type(data)._from_sequence([a, b], dtype=data.dtype)
  184. df = pd.DataFrame({"key": key, "val": [1, 2]})
  185. result = pd.merge(df, df, on="key")
  186. expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
  187. self.assert_frame_equal(result, expected)
  188. # order
  189. result = pd.merge(df.iloc[[1, 0]], df, on="key")
  190. expected = expected.iloc[[1, 0]].reset_index(drop=True)
  191. self.assert_frame_equal(result, expected)
  192. def test_merge_on_extension_array_duplicates(self, data):
  193. # GH 23020
  194. a, b = data[:2]
  195. key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
  196. df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
  197. df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
  198. result = pd.merge(df1, df2, on="key")
  199. expected = pd.DataFrame(
  200. {
  201. "key": key.take([0, 0, 0, 0, 1]),
  202. "val_x": [1, 1, 3, 3, 2],
  203. "val_y": [1, 3, 1, 3, 2],
  204. }
  205. )
  206. self.assert_frame_equal(result, expected)
  207. @pytest.mark.parametrize(
  208. "columns",
  209. [
  210. ["A", "B"],
  211. pd.MultiIndex.from_tuples(
  212. [("A", "a"), ("A", "b")], names=["outer", "inner"]
  213. ),
  214. ],
  215. )
  216. def test_stack(self, data, columns):
  217. df = pd.DataFrame({"A": data[:5], "B": data[:5]})
  218. df.columns = columns
  219. result = df.stack()
  220. expected = df.astype(object).stack()
  221. # we need a second astype(object), in case the constructor inferred
  222. # object -> specialized, as is done for period.
  223. expected = expected.astype(object)
  224. if isinstance(expected, pd.Series):
  225. assert result.dtype == df.iloc[:, 0].dtype
  226. else:
  227. assert all(result.dtypes == df.iloc[:, 0].dtype)
  228. result = result.astype(object)
  229. self.assert_equal(result, expected)
  230. @pytest.mark.parametrize(
  231. "index",
  232. [
  233. # Two levels, uniform.
  234. pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
  235. # non-uniform
  236. pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
  237. # three levels, non-uniform
  238. pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
  239. pd.MultiIndex.from_tuples(
  240. [
  241. ("A", "a", 1),
  242. ("A", "b", 0),
  243. ("A", "a", 0),
  244. ("B", "a", 0),
  245. ("B", "c", 1),
  246. ]
  247. ),
  248. ],
  249. )
  250. @pytest.mark.parametrize("obj", ["series", "frame"])
  251. def test_unstack(self, data, index, obj):
  252. data = data[: len(index)]
  253. if obj == "series":
  254. ser = pd.Series(data, index=index)
  255. else:
  256. ser = pd.DataFrame({"A": data, "B": data}, index=index)
  257. n = index.nlevels
  258. levels = list(range(n))
  259. # [0, 1, 2]
  260. # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
  261. combinations = itertools.chain.from_iterable(
  262. itertools.permutations(levels, i) for i in range(1, n)
  263. )
  264. for level in combinations:
  265. result = ser.unstack(level=level)
  266. assert all(
  267. isinstance(result[col].array, type(data)) for col in result.columns
  268. )
  269. if obj == "series":
  270. # We should get the same result with to_frame+unstack+droplevel
  271. df = ser.to_frame()
  272. alt = df.unstack(level=level).droplevel(0, axis=1)
  273. self.assert_frame_equal(result, alt)
  274. obj_ser = ser.astype(object)
  275. expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
  276. if obj == "series":
  277. assert (expected.dtypes == object).all()
  278. result = result.astype(object)
  279. self.assert_frame_equal(result, expected)
  280. def test_ravel(self, data):
  281. # as long as EA is 1D-only, ravel is a no-op
  282. result = data.ravel()
  283. assert type(result) == type(data)
  284. # Check that we have a view, not a copy
  285. result[0] = result[1]
  286. assert data[0] == data[1]
  287. def test_transpose(self, data):
  288. result = data.transpose()
  289. assert type(result) == type(data)
  290. # check we get a new object
  291. assert result is not data
  292. # If we ever _did_ support 2D, shape should be reversed
  293. assert result.shape == data.shape[::-1]
  294. # Check that we have a view, not a copy
  295. result[0] = result[1]
  296. assert data[0] == data[1]
  297. def test_transpose_frame(self, data):
  298. df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"])
  299. result = df.T
  300. expected = pd.DataFrame(
  301. {
  302. "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
  303. "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
  304. "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
  305. "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
  306. },
  307. index=["A", "B"],
  308. )
  309. self.assert_frame_equal(result, expected)
  310. self.assert_frame_equal(np.transpose(np.transpose(df)), df)
  311. self.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]])