test_index.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. from copy import deepcopy
  2. import numpy as np
  3. import pytest
  4. from pandas.errors import PerformanceWarning
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. Index,
  9. MultiIndex,
  10. Series,
  11. concat,
  12. )
  13. import pandas._testing as tm
  14. class TestIndexConcat:
  15. def test_concat_ignore_index(self, sort):
  16. frame1 = DataFrame(
  17. {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
  18. )
  19. frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
  20. frame1.index = Index(["x", "y", "z"])
  21. frame2.index = Index(["x", "y", "q"])
  22. v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort)
  23. nan = np.nan
  24. expected = DataFrame(
  25. [
  26. [nan, nan, nan, 4.3],
  27. ["a", 1, 4.5, 5.2],
  28. ["b", 2, 3.2, 2.2],
  29. ["c", 3, 1.2, nan],
  30. ],
  31. index=Index(["q", "x", "y", "z"]),
  32. )
  33. if not sort:
  34. expected = expected.loc[["x", "y", "z", "q"]]
  35. tm.assert_frame_equal(v1, expected)
  36. @pytest.mark.parametrize(
  37. "name_in1,name_in2,name_in3,name_out",
  38. [
  39. ("idx", "idx", "idx", "idx"),
  40. ("idx", "idx", None, None),
  41. ("idx", None, None, None),
  42. ("idx1", "idx2", None, None),
  43. ("idx1", "idx1", "idx2", None),
  44. ("idx1", "idx2", "idx3", None),
  45. (None, None, None, None),
  46. ],
  47. )
  48. def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
  49. # GH13475
  50. indices = [
  51. Index(["a", "b", "c"], name=name_in1),
  52. Index(["b", "c", "d"], name=name_in2),
  53. Index(["c", "d", "e"], name=name_in3),
  54. ]
  55. frames = [
  56. DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
  57. ]
  58. result = concat(frames, axis=1)
  59. exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
  60. expected = DataFrame(
  61. {
  62. "x": [0, 1, 2, np.nan, np.nan],
  63. "y": [np.nan, 0, 1, 2, np.nan],
  64. "z": [np.nan, np.nan, 0, 1, 2],
  65. },
  66. index=exp_ind,
  67. )
  68. tm.assert_frame_equal(result, expected)
  69. def test_concat_rename_index(self):
  70. a = DataFrame(
  71. np.random.rand(3, 3),
  72. columns=list("ABC"),
  73. index=Index(list("abc"), name="index_a"),
  74. )
  75. b = DataFrame(
  76. np.random.rand(3, 3),
  77. columns=list("ABC"),
  78. index=Index(list("abc"), name="index_b"),
  79. )
  80. result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])
  81. exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
  82. names = list(exp.index.names)
  83. names[1] = "lvl1"
  84. exp.index.set_names(names, inplace=True)
  85. tm.assert_frame_equal(result, exp)
  86. assert result.index.names == exp.index.names
  87. def test_concat_copy_index_series(self, axis, using_copy_on_write):
  88. # GH 29879
  89. ser = Series([1, 2])
  90. comb = concat([ser, ser], axis=axis, copy=True)
  91. if not using_copy_on_write or axis in [0, "index"]:
  92. assert comb.index is not ser.index
  93. else:
  94. assert comb.index is ser.index
  95. def test_concat_copy_index_frame(self, axis, using_copy_on_write):
  96. # GH 29879
  97. df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
  98. comb = concat([df, df], axis=axis, copy=True)
  99. if not using_copy_on_write:
  100. assert comb.index is not df.index
  101. assert comb.columns is not df.columns
  102. elif axis in [0, "index"]:
  103. assert comb.index is not df.index
  104. assert comb.columns is df.columns
  105. elif axis in [1, "columns"]:
  106. assert comb.index is df.index
  107. assert comb.columns is not df.columns
  108. def test_default_index(self):
  109. # is_series and ignore_index
  110. s1 = Series([1, 2, 3], name="x")
  111. s2 = Series([4, 5, 6], name="y")
  112. res = concat([s1, s2], axis=1, ignore_index=True)
  113. assert isinstance(res.columns, pd.RangeIndex)
  114. exp = DataFrame([[1, 4], [2, 5], [3, 6]])
  115. # use check_index_type=True to check the result have
  116. # RangeIndex (default index)
  117. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  118. # is_series and all inputs have no names
  119. s1 = Series([1, 2, 3])
  120. s2 = Series([4, 5, 6])
  121. res = concat([s1, s2], axis=1, ignore_index=False)
  122. assert isinstance(res.columns, pd.RangeIndex)
  123. exp = DataFrame([[1, 4], [2, 5], [3, 6]])
  124. exp.columns = pd.RangeIndex(2)
  125. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  126. # is_dataframe and ignore_index
  127. df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
  128. df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
  129. res = concat([df1, df2], axis=0, ignore_index=True)
  130. exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
  131. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  132. res = concat([df1, df2], axis=1, ignore_index=True)
  133. exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
  134. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  135. def test_dups_index(self):
  136. # GH 4771
  137. # single dtypes
  138. df = DataFrame(
  139. np.random.randint(0, 10, size=40).reshape(10, 4),
  140. columns=["A", "A", "C", "C"],
  141. )
  142. result = concat([df, df], axis=1)
  143. tm.assert_frame_equal(result.iloc[:, :4], df)
  144. tm.assert_frame_equal(result.iloc[:, 4:], df)
  145. result = concat([df, df], axis=0)
  146. tm.assert_frame_equal(result.iloc[:10], df)
  147. tm.assert_frame_equal(result.iloc[10:], df)
  148. # multi dtypes
  149. df = concat(
  150. [
  151. DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
  152. DataFrame(
  153. np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
  154. ),
  155. ],
  156. axis=1,
  157. )
  158. result = concat([df, df], axis=1)
  159. tm.assert_frame_equal(result.iloc[:, :6], df)
  160. tm.assert_frame_equal(result.iloc[:, 6:], df)
  161. result = concat([df, df], axis=0)
  162. tm.assert_frame_equal(result.iloc[:10], df)
  163. tm.assert_frame_equal(result.iloc[10:], df)
  164. # append
  165. result = df.iloc[0:8, :]._append(df.iloc[8:])
  166. tm.assert_frame_equal(result, df)
  167. result = df.iloc[0:8, :]._append(df.iloc[8:9])._append(df.iloc[9:10])
  168. tm.assert_frame_equal(result, df)
  169. expected = concat([df, df], axis=0)
  170. result = df._append(df)
  171. tm.assert_frame_equal(result, expected)
  172. class TestMultiIndexConcat:
  173. def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data):
  174. frame = multiindex_dataframe_random_data
  175. index = frame.index
  176. result = concat([frame, frame], keys=[0, 1], names=["iteration"])
  177. assert result.index.names == ("iteration",) + index.names
  178. tm.assert_frame_equal(result.loc[0], frame)
  179. tm.assert_frame_equal(result.loc[1], frame)
  180. assert result.index.nlevels == 3
  181. def test_concat_multiindex_with_none_in_index_names(self):
  182. # GH 15787
  183. index = MultiIndex.from_product([[1], range(5)], names=["level1", None])
  184. df = DataFrame({"col": range(5)}, index=index, dtype=np.int32)
  185. result = concat([df, df], keys=[1, 2], names=["level2"])
  186. index = MultiIndex.from_product(
  187. [[1, 2], [1], range(5)], names=["level2", "level1", None]
  188. )
  189. expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32)
  190. tm.assert_frame_equal(result, expected)
  191. result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
  192. level2 = [1] * 5 + [2] * 2
  193. level1 = [1] * 7
  194. no_name = list(range(5)) + list(range(2))
  195. tuples = list(zip(level2, level1, no_name))
  196. index = MultiIndex.from_tuples(tuples, names=["level2", "level1", None])
  197. expected = DataFrame({"col": no_name}, index=index, dtype=np.int32)
  198. tm.assert_frame_equal(result, expected)
  199. def test_concat_multiindex_rangeindex(self):
  200. # GH13542
  201. # when multi-index levels are RangeIndex objects
  202. # there is a bug in concat with objects of len 1
  203. df = DataFrame(np.random.randn(9, 2))
  204. df.index = MultiIndex(
  205. levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
  206. codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)],
  207. )
  208. res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
  209. exp = df.iloc[[2, 3, 4, 5], :]
  210. tm.assert_frame_equal(res, exp)
  211. def test_concat_multiindex_dfs_with_deepcopy(self):
  212. # GH 9967
  213. example_multiindex1 = MultiIndex.from_product([["a"], ["b"]])
  214. example_dataframe1 = DataFrame([0], index=example_multiindex1)
  215. example_multiindex2 = MultiIndex.from_product([["a"], ["c"]])
  216. example_dataframe2 = DataFrame([1], index=example_multiindex2)
  217. example_dict = {"s1": example_dataframe1, "s2": example_dataframe2}
  218. expected_index = MultiIndex(
  219. levels=[["s1", "s2"], ["a"], ["b", "c"]],
  220. codes=[[0, 1], [0, 0], [0, 1]],
  221. names=["testname", None, None],
  222. )
  223. expected = DataFrame([[0], [1]], index=expected_index)
  224. result_copy = concat(deepcopy(example_dict), names=["testname"])
  225. tm.assert_frame_equal(result_copy, expected)
  226. result_no_copy = concat(example_dict, names=["testname"])
  227. tm.assert_frame_equal(result_no_copy, expected)
  228. @pytest.mark.parametrize(
  229. "mi1_list",
  230. [
  231. [["a"], range(2)],
  232. [["b"], np.arange(2.0, 4.0)],
  233. [["c"], ["A", "B"]],
  234. [["d"], pd.date_range(start="2017", end="2018", periods=2)],
  235. ],
  236. )
  237. @pytest.mark.parametrize(
  238. "mi2_list",
  239. [
  240. [["a"], range(2)],
  241. [["b"], np.arange(2.0, 4.0)],
  242. [["c"], ["A", "B"]],
  243. [["d"], pd.date_range(start="2017", end="2018", periods=2)],
  244. ],
  245. )
  246. def test_concat_with_various_multiindex_dtypes(
  247. self, mi1_list: list, mi2_list: list
  248. ):
  249. # GitHub #23478
  250. mi1 = MultiIndex.from_product(mi1_list)
  251. mi2 = MultiIndex.from_product(mi2_list)
  252. df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1)
  253. df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2)
  254. if mi1_list[0] == mi2_list[0]:
  255. expected_mi = MultiIndex(
  256. levels=[mi1_list[0], list(mi1_list[1])],
  257. codes=[[0, 0, 0, 0], [0, 1, 0, 1]],
  258. )
  259. else:
  260. expected_mi = MultiIndex(
  261. levels=[
  262. mi1_list[0] + mi2_list[0],
  263. list(mi1_list[1]) + list(mi2_list[1]),
  264. ],
  265. codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
  266. )
  267. expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi)
  268. with tm.assert_produces_warning(None):
  269. result_df = concat((df1, df2), axis=1)
  270. tm.assert_frame_equal(expected_df, result_df)
  271. def test_concat_multiindex_(self):
  272. # GitHub #44786
  273. df = DataFrame({"col": ["a", "b", "c"]}, index=["1", "2", "2"])
  274. df = concat([df], keys=["X"])
  275. iterables = [["X"], ["1", "2", "2"]]
  276. result_index = df.index
  277. expected_index = MultiIndex.from_product(iterables)
  278. tm.assert_index_equal(result_index, expected_index)
  279. result_df = df
  280. expected_df = DataFrame(
  281. {"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables)
  282. )
  283. tm.assert_frame_equal(result_df, expected_df)
  284. def test_concat_with_key_not_unique(self):
  285. # GitHub #46519
  286. df1 = DataFrame({"name": [1]})
  287. df2 = DataFrame({"name": [2]})
  288. df3 = DataFrame({"name": [3]})
  289. df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
  290. # the warning is caused by indexing unsorted multi-index
  291. with tm.assert_produces_warning(
  292. PerformanceWarning, match="indexing past lexsort depth"
  293. ):
  294. out_a = df_a.loc[("x", 0), :]
  295. df_b = DataFrame(
  296. {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)])
  297. )
  298. with tm.assert_produces_warning(
  299. PerformanceWarning, match="indexing past lexsort depth"
  300. ):
  301. out_b = df_b.loc[("x", 0)]
  302. tm.assert_frame_equal(out_a, out_b)
  303. df1 = DataFrame({"name": ["a", "a", "b"]})
  304. df2 = DataFrame({"name": ["a", "b"]})
  305. df3 = DataFrame({"name": ["c", "d"]})
  306. df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
  307. with tm.assert_produces_warning(
  308. PerformanceWarning, match="indexing past lexsort depth"
  309. ):
  310. out_a = df_a.loc[("x", 0), :]
  311. df_b = DataFrame(
  312. {
  313. "a": ["x", "x", "x", "y", "y", "x", "x"],
  314. "b": [0, 1, 2, 0, 1, 0, 1],
  315. "name": list("aababcd"),
  316. }
  317. ).set_index(["a", "b"])
  318. df_b.index.names = [None, None]
  319. with tm.assert_produces_warning(
  320. PerformanceWarning, match="indexing past lexsort depth"
  321. ):
  322. out_b = df_b.loc[("x", 0), :]
  323. tm.assert_frame_equal(out_a, out_b)
  324. def test_concat_with_duplicated_levels(self):
  325. # keyword levels should be unique
  326. df1 = DataFrame({"A": [1]}, index=["x"])
  327. df2 = DataFrame({"A": [1]}, index=["y"])
  328. msg = r"Level values not unique: \['x', 'y', 'y'\]"
  329. with pytest.raises(ValueError, match=msg):
  330. concat([df1, df2], keys=["x", "y"], levels=[["x", "y", "y"]])
  331. @pytest.mark.parametrize("levels", [[["x", "y"]], [["x", "y", "y"]]])
  332. def test_concat_with_levels_with_none_keys(self, levels):
  333. df1 = DataFrame({"A": [1]}, index=["x"])
  334. df2 = DataFrame({"A": [1]}, index=["y"])
  335. msg = "levels supported only when keys is not None"
  336. with pytest.raises(ValueError, match=msg):
  337. concat([df1, df2], levels=levels)
  338. def test_concat_range_index_result(self):
  339. # GH#47501
  340. df1 = DataFrame({"a": [1, 2]})
  341. df2 = DataFrame({"b": [1, 2]})
  342. result = concat([df1, df2], sort=True, axis=1)
  343. expected = DataFrame({"a": [1, 2], "b": [1, 2]})
  344. tm.assert_frame_equal(result, expected)
  345. expected_index = pd.RangeIndex(0, 2)
  346. tm.assert_index_equal(result.index, expected_index, exact=True)
  347. def test_concat_index_keep_dtype(self):
  348. # GH#47329
  349. df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype="object"))
  350. df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="object"))
  351. result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
  352. expected = DataFrame(
  353. [[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="object")
  354. )
  355. tm.assert_frame_equal(result, expected)
  356. def test_concat_index_keep_dtype_ea_numeric(self, any_numeric_ea_dtype):
  357. # GH#47329
  358. df1 = DataFrame(
  359. [[0, 1, 1]], columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype)
  360. )
  361. df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=any_numeric_ea_dtype))
  362. result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
  363. expected = DataFrame(
  364. [[0, 1, 1.0], [0, 1, np.nan]],
  365. columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype),
  366. )
  367. tm.assert_frame_equal(result, expected)
  368. @pytest.mark.parametrize("dtype", ["Int8", "Int16", "Int32"])
  369. def test_concat_index_find_common(self, dtype):
  370. # GH#47329
  371. df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
  372. df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="Int32"))
  373. result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
  374. expected = DataFrame(
  375. [[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="Int32")
  376. )
  377. tm.assert_frame_equal(result, expected)
  378. def test_concat_axis_1_sort_false_rangeindex(self):
  379. # GH 46675
  380. s1 = Series(["a", "b", "c"])
  381. s2 = Series(["a", "b"])
  382. s3 = Series(["a", "b", "c", "d"])
  383. s4 = Series([], dtype=object)
  384. result = concat(
  385. [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1
  386. )
  387. expected = DataFrame(
  388. [
  389. ["a"] * 3 + [np.nan],
  390. ["b"] * 3 + [np.nan],
  391. ["c", np.nan] * 2,
  392. [np.nan] * 2 + ["d"] + [np.nan],
  393. ],
  394. dtype=object,
  395. )
  396. tm.assert_frame_equal(
  397. result, expected, check_index_type=True, check_column_type=True
  398. )