import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    concat,
    merge,
)
import pandas._testing as tm


def get_test_data(ngroups=8, n=50):
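    """
    Return a length-``n`` integer array of group labels drawn from
    ``range(ngroups)``: the labels are tiled to length ``n`` (padded with the
    leading labels if the tiling comes up short) and then shuffled in place.
    """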
    unique_groups = list(range(ngroups))
    arr = np.asarray(np.tile(unique_groups, n // ngroups))

    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[: n - len(arr)])

    np.random.shuffle(arr)
    return arr


class TestJoin:
    # aggregate multiple columns
    @pytest.fixture
    def df(self):
        df = DataFrame(
            {
                "key1": get_test_data(),
                "key2": get_test_data(),
                "data1": np.random.randn(50),
                "data2": np.random.randn(50),
            }
        )
        # exclude a couple keys for fun
        df = df[df["key2"] > 1]
        return df

    @pytest.fixture
    def df2(self):
        return DataFrame(
            {
                "key1": get_test_data(n=10),
                "key2": get_test_data(ngroups=4, n=10),
                "value": np.random.randn(10),
            }
        )

    @pytest.fixture
    def target_source(self):
        index, data = tm.getMixedTypeDict()
        target = DataFrame(data, index=index)

        # Join on string value
        source = DataFrame(
            {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
        )
        return target, source

    def test_left_outer_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2")
        _check_join(df, df2, joined_key2, ["key2"], how="left")

        joined_both = merge(df, df2)
        _check_join(df, df2, joined_both, ["key1", "key2"], how="left")

    def test_right_outer_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2", how="right")
        _check_join(df, df2, joined_key2, ["key2"], how="right")

        joined_both = merge(df, df2, how="right")
        _check_join(df, df2, joined_both, ["key1", "key2"], how="right")

    def test_full_outer_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2", how="outer")
        _check_join(df, df2, joined_key2, ["key2"], how="outer")

        joined_both = merge(df, df2, how="outer")
        _check_join(df, df2, joined_both, ["key1", "key2"], how="outer")

    def test_inner_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2", how="inner")
        _check_join(df, df2, joined_key2, ["key2"], how="inner")

        joined_both = merge(df, df2, how="inner")
        _check_join(df, df2, joined_both, ["key1", "key2"], how="inner")

    def test_handle_overlap(self, df, df2):
        joined = merge(df, df2, on="key2", suffixes=(".foo", ".bar"))

        assert "key1.foo" in joined
        assert "key1.bar" in joined

    def test_handle_overlap_arbitrary_key(self, df, df2):
        joined = merge(
            df,
            df2,
            left_on="key2",
            right_on="key1",
            suffixes=(".foo", ".bar"),
        )
        assert "key1.foo" in joined
        assert "key2.bar" in joined

    def test_join_on(self, target_source):
        target, source = target_source

        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
        tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame(
            {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
        )
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = (
            "You are trying to merge on float64 and object columns. If "
            "you wish to proceed you should use pd.concat"
        )
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)},
            index=tm.makeCustomIndex(3, 2),
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}
        )
        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue
        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters
        df = DataFrame({"a": [1, 1]})

        msg = (
            "Can only merge Series or DataFrame objects, "
            f"a {type(wrong_type)} was passed"
        )
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")

        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self, target_source):
        target, source = target_source
        expected = target.join(source, on="C")
        del expected["C"]

        join_col = target.pop("C")
        result = target.join(source, on=join_col)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self, target_source):
        # nothing to merge
        target, source = target_source
        merged = target.join(source.reindex([]), on="C")
        for col in source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = target.join(source.reindex([]), on="C", how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        tm.assert_frame_equal(joined, expected)

    def test_join_on_series(self, target_source):
        target, source = target_source
        result = target.join(source["MergedA"], on="C")
        expected = target.join(source[["MergedA"]], on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({"a": [1, 1]})
        ds = Series([2], index=[1], name="b")
        result = df.join(ds, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        tm.assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one",
            "B_one",
            "C_one",
            "D_one",
            "A_two",
            "B_two",
            "C_two",
            "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
        c = Series(np.random.randn(30))
        a["c"] = c
        d = DataFrame(np.random.randn(30, 1), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")

        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):
        key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = lexsorted_two_level_string_multiindex
        to_join = DataFrame(
            np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
        )

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic_increasing
        tm.assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed_raises(self):
        # GH 2024
        # GH 40993: For raising, enforced in 2.0
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        with pytest.raises(
            pd.errors.MergeError, match="Not allowed to merge between different levels"
        ):
            merge(new_df, other_df, left_index=True, right_index=True)

    def test_join_float64_float32(self):
        a = DataFrame(np.random.randn(10, 2), columns=["a", "b"], dtype=np.float64)
        b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})

        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
            }
        )
        s = Series(
            np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
        )
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        tm.assert_frame_equal(inner, outer)
        tm.assert_frame_equal(inner, left)
        tm.assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
        right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])

        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {
                "key": ["bar", "baz", "foo", "foo"],
                "value": [2, 3, 1, 4],
                "value2": ["a", "b", "c", "c"],
            },
            index=[1, 2, 0, 3],
        )
        tm.assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        tm.assert_index_equal(joined.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame(
            {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
        )
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        with pytest.raises(TypeError, match="Could not convert"):
            grouped.mean()
        mn = grouped.mean(numeric_only=True)
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]

            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4
        df1 = df.loc[:, ["A", "B"]]
        df2 = df.loc[:, ["C", "D"]]
        df3 = df.loc[:, ["key"]]

        result = df1.join([df2, df3])
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):
        # joining dups
        df = concat(
            [
                DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
                DataFrame(
                    np.random.randint(0, 10, size=20).reshape(10, 2),
                    columns=["A", "C"],
                ),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer"
        )
        # GH 40991: As of 2.0 causes duplicate columns
        with pytest.raises(
            pd.errors.MergeError,
            match="Passing 'suffixes' which cause duplicate columns",
        ):
            dta.merge(w, left_index=True, right_index=True)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
        )
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"]
        )
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)

        result = left.join(right, on=["abc", "xy"], how=join_type)
        expected = (
            left.reset_index()
            .merge(right.reset_index(), on=["abc", "xy"], how=join_type)
            .set_index(["abc", "xy", "num"])
        )
        tm.assert_frame_equal(expected, result)

        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=["abc", "xy"], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931, 26335
        df1 = DataFrame(
            {
                "date": pd.date_range(
                    start="2018-01-01", periods=5, tz="America/Chicago"
                ),
                "vals": list("abcde"),
            }
        )

        df2 = DataFrame(
            {
                "date": pd.date_range(
                    start="2018-01-03", periods=5, tz="America/Chicago"
                ),
                "vals_2": list("tuvwx"),
            }
        )
        result = df1.join(df2.set_index("date"), on="date")

        expected = df1.copy()
        expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
        tm.assert_frame_equal(result, expected)

    def test_join_datetime_string(self):
        # GH 5647
        dfa = DataFrame(
            [
                ["2012-08-02", "L", 10],
                ["2012-08-02", "J", 15],
                ["2013-04-06", "L", 20],
                ["2013-04-06", "J", 25],
            ],
            columns=["x", "y", "a"],
        )
        dfa["x"] = pd.to_datetime(dfa["x"])
        dfb = DataFrame(
            [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
            columns=["x", "y", "z"],
            index=[2, 4],
        )
        dfb["x"] = pd.to_datetime(dfb["x"])
        result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
        expected = DataFrame(
            [
                [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
                [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
            ],
            index=[2, 4],
            columns=["x", "y", "z", "a"],
        )
        tm.assert_frame_equal(result, expected)

    def test_join_with_categorical_index(self):
        # GH47812
        ix = ["a", "b"]
        id1 = pd.CategoricalIndex(ix, categories=ix)
        id2 = pd.CategoricalIndex(reversed(ix), categories=reversed(ix))

        df1 = DataFrame({"c1": ix}, index=id1)
        df2 = DataFrame({"c2": reversed(ix)}, index=id2)
        result = df1.join(df2)
        expected = DataFrame(
            {"c1": ["a", "b"], "c2": ["a", "b"]},
            index=pd.CategoricalIndex(["a", "b"], categories=["a", "b"]),
        )
        tm.assert_frame_equal(result, expected)


def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"):
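    """
    Smoke-check a merge result: the join key columns must contain no NaN, and
    every group of ``result`` rows (grouped by ``join_col``) must reproduce the
    matching rows of ``left`` and ``right``.  A key missing from one side must
    show up as all-NaN in that side's columns, and that may only happen when
    ``how`` does not require the key to exist on that side.
    """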
    # some smoke tests
    for c in join_col:
        assert result[c].notna().all()

    left_grouped = left.groupby(join_col)
    right_grouped = right.groupby(join_col)

    for group_key, group in result.groupby(
        join_col if len(join_col) > 1 else join_col[0]
    ):
        l_joined = _restrict_to_columns(group, left.columns, lsuffix)
        r_joined = _restrict_to_columns(group, right.columns, rsuffix)

        try:
            lgroup = left_grouped.get_group(group_key)
        except KeyError as err:
            if how in ("left", "inner"):
                raise AssertionError(
                    f"key {group_key} should not have been in the join"
                ) from err

            _assert_all_na(l_joined, left.columns, join_col)
        else:
            _assert_same_contents(l_joined, lgroup)

        try:
            rgroup = right_grouped.get_group(group_key)
        except KeyError as err:
            if how in ("right", "inner"):
                raise AssertionError(
                    f"key {group_key} should not have been in the join"
                ) from err

            _assert_all_na(r_joined, right.columns, join_col)
        else:
            _assert_same_contents(r_joined, rgroup)


def _restrict_to_columns(group, columns, suffix):
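    """
    Select from ``group`` the columns that belong to one side of the join:
    keep columns whose name is in ``columns`` either directly or after
    ``suffix`` is stripped, remove the suffix from the names, and return them
    in the order given by ``columns``.
    """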
    found = [
        c for c in group.columns if c in columns or c.replace(suffix, "") in columns
    ]

    # filter
    group = group.loc[:, found]

    # get rid of suffixes, if any
    group = group.rename(columns=lambda x: x.replace(suffix, ""))

    # put in the right order...
    group = group.loc[:, columns]

    return group


def _assert_same_contents(join_chunk, source):
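    """
    Assert that the deduplicated rows of ``join_chunk`` match ``source``: the
    number of distinct joined rows equals ``len(source)`` and every
    deduplicated ``source`` row appears among them.  NaNs are replaced with a
    sentinel first so that ``drop_duplicates`` and the tuple comparison treat
    them as equal.
    """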
    NA_SENTINEL = -1234567  # drop_duplicates not so NA-friendly...

    jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
    svalues = source.fillna(NA_SENTINEL).drop_duplicates().values

    rows = {tuple(row) for row in jvalues}
    assert len(rows) == len(source)
    assert all(tuple(row) in rows for row in svalues)


def _assert_all_na(join_chunk, source_columns, join_col):
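    """Assert that every non-key column of ``join_chunk`` is entirely NaN."""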
    for c in source_columns:
        if c in join_col:
            continue
        assert join_chunk[c].isna().all()


def _join_by_hand(a, b, how="left"):
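    """
    Build the expected result of an index-aligned join by hand: join the two
    indexes with the requested ``how``, reindex both frames to that index, and
    copy ``b``'s columns into ``a``.
    """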
    join_index = a.index.join(b.index, how=how)

    a_re = a.reindex(join_index)
    b_re = b.reindex(join_index)

    result_columns = a.columns.append(b.columns)
    for col, s in b_re.items():
        a_re[col] = s

    return a_re.reindex(columns=result_columns)


def test_join_inner_multiindex_deterministic_order():
    # GH: 36910
    left = DataFrame(
        data={"e": 5},
        index=MultiIndex.from_tuples([(1, 2, 4)], names=("a", "b", "d")),
    )
    right = DataFrame(
        data={"f": 6}, index=MultiIndex.from_tuples([(2, 3)], names=("b", "c"))
    )
    result = left.join(right, how="inner")
    expected = DataFrame(
        {"e": [5], "f": [6]},
        index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_join_cross(input_col, output_cols):
    # GH#5401
    left = DataFrame({"a": [1, 3]})
    right = DataFrame({input_col: [3, 4]})
    result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y")
    expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
    tm.assert_frame_equal(result, expected)


def test_join_multiindex_one_level(join_type):
    # GH#36909
    left = DataFrame(
        data={"c": 3}, index=MultiIndex.from_tuples([(1, 2)], names=("a", "b"))
    )
    right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",)))
    result = left.join(right, how=join_type)
    expected = DataFrame(
        {"c": [3], "d": [4]},
        index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "categories, values",
    [
        (["Y", "X"], ["Y", "X", "X"]),
        ([2, 1], [2, 1, 1]),
        ([2.5, 1.5], [2.5, 1.5, 1.5]),
        (
            [Timestamp("2020-12-31"), Timestamp("2019-12-31")],
            [Timestamp("2020-12-31"), Timestamp("2019-12-31"), Timestamp("2019-12-31")],
        ),
    ],
)
def test_join_multiindex_not_alphabetical_categorical(categories, values):
    # GH#38502
    left = DataFrame(
        {
            "first": ["A", "A"],
            "second": Categorical(categories, categories=categories),
            "value": [1, 2],
        }
    ).set_index(["first", "second"])
    right = DataFrame(
        {
            "first": ["A", "A", "B"],
            "second": Categorical(values, categories=categories),
            "value": [3, 4, 5],
        }
    ).set_index(["first", "second"])
    result = left.join(right, lsuffix="_left", rsuffix="_right")

    expected = DataFrame(
        {
            "first": ["A", "A"],
            "second": Categorical(categories, categories=categories),
            "value_left": [1, 2],
            "value_right": [3, 4],
        }
    ).set_index(["first", "second"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "left_empty, how, exp",
    [
        (False, "left", "left"),
        (False, "right", "empty"),
        (False, "inner", "empty"),
        (False, "outer", "left"),
        (False, "cross", "empty"),
        (True, "left", "empty"),
        (True, "right", "right"),
        (True, "inner", "empty"),
        (True, "outer", "right"),
        (True, "cross", "empty"),
    ],
)
def test_join_empty(left_empty, how, exp):
    left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A")
    right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A")

    if left_empty:
        left = left.head(0)
    else:
        right = right.head(0)

    result = left.join(right, how=how)

    if exp == "left":
        expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
        expected = expected.set_index("A")
    elif exp == "right":
        expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
        expected = expected.set_index("A")
    elif exp == "empty":
        expected = DataFrame(columns=["B", "C"], dtype="int64")
        if how != "cross":
            expected = expected.rename_axis("A")

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "how, values",
    [
        ("inner", [0, 1, 2]),
        ("outer", [0, 1, 2]),
        ("left", [0, 1, 2]),
        ("right", [0, 2, 1]),
    ],
)
def test_join_multiindex_categorical_output_index_dtype(how, values):
    # GH#50906
    df1 = DataFrame(
        {
            "a": Categorical([0, 1, 2]),
            "b": Categorical([0, 1, 2]),
            "c": [0, 1, 2],
        }
    ).set_index(["a", "b"])

    df2 = DataFrame(
        {
            "a": Categorical([0, 2, 1]),
            "b": Categorical([0, 2, 1]),
            "d": [0, 2, 1],
        }
    ).set_index(["a", "b"])

    expected = DataFrame(
        {
            "a": Categorical(values),
            "b": Categorical(values),
            "c": values,
            "d": values,
        }
    ).set_index(["a", "b"])

    result = df1.join(df2, how=how)
    tm.assert_frame_equal(result, expected)