test_dataframe.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Index,
  7. Series,
  8. concat,
  9. )
  10. import pandas._testing as tm
  11. class TestDataFrameConcat:
  12. def test_concat_multiple_frames_dtypes(self):
  13. # GH#2759
  14. df1 = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64)
  15. df2 = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
  16. results = concat((df1, df2), axis=1).dtypes
  17. expected = Series(
  18. [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
  19. index=["foo", "bar", 0, 1],
  20. )
  21. tm.assert_series_equal(results, expected)
  22. def test_concat_tuple_keys(self):
  23. # GH#14438
  24. df1 = DataFrame(np.ones((2, 2)), columns=list("AB"))
  25. df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
  26. results = concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
  27. expected = DataFrame(
  28. {
  29. "A": {
  30. ("bee", "bah", 0): 1.0,
  31. ("bee", "bah", 1): 1.0,
  32. ("bee", "boo", 0): 2.0,
  33. ("bee", "boo", 1): 2.0,
  34. ("bee", "boo", 2): 2.0,
  35. },
  36. "B": {
  37. ("bee", "bah", 0): 1.0,
  38. ("bee", "bah", 1): 1.0,
  39. ("bee", "boo", 0): 2.0,
  40. ("bee", "boo", 1): 2.0,
  41. ("bee", "boo", 2): 2.0,
  42. },
  43. }
  44. )
  45. tm.assert_frame_equal(results, expected)
  46. def test_concat_named_keys(self):
  47. # GH#14252
  48. df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
  49. index = Index(["a", "b"], name="baz")
  50. concatted_named_from_keys = concat([df, df], keys=index)
  51. expected_named = DataFrame(
  52. {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
  53. index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
  54. )
  55. tm.assert_frame_equal(concatted_named_from_keys, expected_named)
  56. index_no_name = Index(["a", "b"], name=None)
  57. concatted_named_from_names = concat([df, df], keys=index_no_name, names=["baz"])
  58. tm.assert_frame_equal(concatted_named_from_names, expected_named)
  59. concatted_unnamed = concat([df, df], keys=index_no_name)
  60. expected_unnamed = DataFrame(
  61. {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
  62. index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
  63. )
  64. tm.assert_frame_equal(concatted_unnamed, expected_unnamed)
  65. def test_concat_axis_parameter(self):
  66. # GH#14369
  67. df1 = DataFrame({"A": [0.1, 0.2]}, index=range(2))
  68. df2 = DataFrame({"A": [0.3, 0.4]}, index=range(2))
  69. # Index/row/0 DataFrame
  70. expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
  71. concatted_index = concat([df1, df2], axis="index")
  72. tm.assert_frame_equal(concatted_index, expected_index)
  73. concatted_row = concat([df1, df2], axis="rows")
  74. tm.assert_frame_equal(concatted_row, expected_index)
  75. concatted_0 = concat([df1, df2], axis=0)
  76. tm.assert_frame_equal(concatted_0, expected_index)
  77. # Columns/1 DataFrame
  78. expected_columns = DataFrame(
  79. [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
  80. )
  81. concatted_columns = concat([df1, df2], axis="columns")
  82. tm.assert_frame_equal(concatted_columns, expected_columns)
  83. concatted_1 = concat([df1, df2], axis=1)
  84. tm.assert_frame_equal(concatted_1, expected_columns)
  85. series1 = Series([0.1, 0.2])
  86. series2 = Series([0.3, 0.4])
  87. # Index/row/0 Series
  88. expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
  89. concatted_index_series = concat([series1, series2], axis="index")
  90. tm.assert_series_equal(concatted_index_series, expected_index_series)
  91. concatted_row_series = concat([series1, series2], axis="rows")
  92. tm.assert_series_equal(concatted_row_series, expected_index_series)
  93. concatted_0_series = concat([series1, series2], axis=0)
  94. tm.assert_series_equal(concatted_0_series, expected_index_series)
  95. # Columns/1 Series
  96. expected_columns_series = DataFrame(
  97. [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
  98. )
  99. concatted_columns_series = concat([series1, series2], axis="columns")
  100. tm.assert_frame_equal(concatted_columns_series, expected_columns_series)
  101. concatted_1_series = concat([series1, series2], axis=1)
  102. tm.assert_frame_equal(concatted_1_series, expected_columns_series)
  103. # Testing ValueError
  104. with pytest.raises(ValueError, match="No axis named"):
  105. concat([series1, series2], axis="something")
  106. def test_concat_numerical_names(self):
  107. # GH#15262, GH#12223
  108. df = DataFrame(
  109. {"col": range(9)},
  110. dtype="int32",
  111. index=(
  112. pd.MultiIndex.from_product(
  113. [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
  114. )
  115. ),
  116. )
  117. result = concat((df.iloc[:2, :], df.iloc[-2:, :]))
  118. expected = DataFrame(
  119. {"col": [0, 1, 7, 8]},
  120. dtype="int32",
  121. index=pd.MultiIndex.from_tuples(
  122. [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2]
  123. ),
  124. )
  125. tm.assert_frame_equal(result, expected)
  126. def test_concat_astype_dup_col(self):
  127. # GH#23049
  128. df = DataFrame([{"a": "b"}])
  129. df = concat([df, df], axis=1)
  130. result = df.astype("category")
  131. expected = DataFrame(
  132. np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"]
  133. ).astype("category")
  134. tm.assert_frame_equal(result, expected)
  135. def test_concat_dataframe_keys_bug(self, sort):
  136. t1 = DataFrame(
  137. {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))}
  138. )
  139. t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))})
  140. # it works
  141. result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
  142. assert list(result.columns) == [("t1", "value"), ("t2", "value")]
  143. def test_concat_bool_with_int(self):
  144. # GH#42092 we may want to change this to return object, but that
  145. # would need a deprecation
  146. df1 = DataFrame(Series([True, False, True, True], dtype="bool"))
  147. df2 = DataFrame(Series([1, 0, 1], dtype="int64"))
  148. result = concat([df1, df2])
  149. expected = concat([df1.astype("int64"), df2])
  150. tm.assert_frame_equal(result, expected)
  151. def test_concat_duplicates_in_index_with_keys(self):
  152. # GH#42651
  153. index = [1, 1, 3]
  154. data = [1, 2, 3]
  155. df = DataFrame(data=data, index=index)
  156. result = concat([df], keys=["A"], names=["ID", "date"])
  157. mi = pd.MultiIndex.from_product([["A"], index], names=["ID", "date"])
  158. expected = DataFrame(data=data, index=mi)
  159. tm.assert_frame_equal(result, expected)
  160. tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))
  161. @pytest.mark.parametrize("ignore_index", [True, False])
  162. @pytest.mark.parametrize("order", ["C", "F"])
  163. @pytest.mark.parametrize("axis", [0, 1])
  164. def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write):
  165. # based on asv ConcatDataFrames
  166. df = DataFrame(np.zeros((10000, 200), dtype=np.float32, order=order))
  167. res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True)
  168. if not using_copy_on_write:
  169. for arr in res._iter_column_arrays():
  170. for arr2 in df._iter_column_arrays():
  171. assert not np.shares_memory(arr, arr2)
  172. def test_outer_sort_columns(self):
  173. # GH#47127
  174. df1 = DataFrame({"A": [0], "B": [1], 0: 1})
  175. df2 = DataFrame({"A": [100]})
  176. result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
  177. expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]})
  178. tm.assert_frame_equal(result, expected)
  179. def test_inner_sort_columns(self):
  180. # GH#47127
  181. df1 = DataFrame({"A": [0], "B": [1], 0: 1})
  182. df2 = DataFrame({"A": [100], 0: 2})
  183. result = concat([df1, df2], ignore_index=True, join="inner", sort=True)
  184. expected = DataFrame({0: [1, 2], "A": [0, 100]})
  185. tm.assert_frame_equal(result, expected)
  186. def test_sort_columns_one_df(self):
  187. # GH#47127
  188. df1 = DataFrame({"A": [100], 0: 2})
  189. result = concat([df1], ignore_index=True, join="inner", sort=True)
  190. expected = DataFrame({0: [2], "A": [100]})
  191. tm.assert_frame_equal(result, expected)