test_multi.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Index,
  7. MultiIndex,
  8. RangeIndex,
  9. Series,
  10. Timestamp,
  11. )
  12. import pandas._testing as tm
  13. from pandas.core.reshape.concat import concat
  14. from pandas.core.reshape.merge import merge
  15. @pytest.fixture
  16. def left():
  17. """left dataframe (not multi-indexed) for multi-index join tests"""
  18. # a little relevant example with NAs
  19. key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
  20. key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]
  21. data = np.random.randn(len(key1))
  22. return DataFrame({"key1": key1, "key2": key2, "data": data})
  23. @pytest.fixture
  24. def right(multiindex_dataframe_random_data):
  25. """right dataframe (multi-indexed) for multi-index join tests"""
  26. df = multiindex_dataframe_random_data
  27. df.index.names = ["key1", "key2"]
  28. df.columns = ["j_one", "j_two", "j_three"]
  29. return df
  30. @pytest.fixture
  31. def left_multi():
  32. return DataFrame(
  33. {
  34. "Origin": ["A", "A", "B", "B", "C"],
  35. "Destination": ["A", "B", "A", "C", "A"],
  36. "Period": ["AM", "AM", "IP", "AM", "OP"],
  37. "TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
  38. "Trips": [1987, 3647, 2470, 4296, 4444],
  39. },
  40. columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
  41. ).set_index(["Origin", "Destination", "Period", "TripPurp"])
  42. @pytest.fixture
  43. def right_multi():
  44. return DataFrame(
  45. {
  46. "Origin": ["A", "A", "B", "B", "C", "C", "E"],
  47. "Destination": ["A", "B", "A", "B", "A", "B", "F"],
  48. "Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
  49. "LinkType": ["a", "b", "c", "b", "a", "b", "a"],
  50. "Distance": [100, 80, 90, 80, 75, 35, 55],
  51. },
  52. columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
  53. ).set_index(["Origin", "Destination", "Period", "LinkType"])
  54. @pytest.fixture
  55. def on_cols_multi():
  56. return ["Origin", "Destination", "Period"]
  57. @pytest.fixture
  58. def idx_cols_multi():
  59. return ["Origin", "Destination", "Period", "TripPurp", "LinkType"]
  60. class TestMergeMulti:
  61. def test_merge_on_multikey(self, left, right, join_type):
  62. on_cols = ["key1", "key2"]
  63. result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
  64. expected = merge(left, right.reset_index(), on=on_cols, how=join_type)
  65. tm.assert_frame_equal(result, expected)
  66. result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
  67. drop=True
  68. )
  69. expected = merge(
  70. left, right.reset_index(), on=on_cols, how=join_type, sort=True
  71. )
  72. tm.assert_frame_equal(result, expected)
  73. @pytest.mark.parametrize("sort", [False, True])
  74. def test_left_join_multi_index(self, sort):
  75. icols = ["1st", "2nd", "3rd"]
  76. def bind_cols(df):
  77. iord = lambda a: 0 if a != a else ord(a)
  78. f = lambda ts: ts.map(iord) - ord("a")
  79. return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4
  80. def run_asserts(left, right, sort):
  81. res = left.join(right, on=icols, how="left", sort=sort)
  82. assert len(left) < len(res) + 1
  83. assert not res["4th"].isna().any()
  84. assert not res["5th"].isna().any()
  85. tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
  86. result = bind_cols(res.iloc[:, :-2])
  87. tm.assert_series_equal(res["4th"], result, check_names=False)
  88. assert result.name is None
  89. if sort:
  90. tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
  91. out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
  92. res.index = RangeIndex(len(res))
  93. tm.assert_frame_equal(out, res)
  94. lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
  95. left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"])
  96. # Explicit cast to float to avoid implicit cast when setting nan
  97. left.insert(1, "2nd", np.random.randint(0, 1000, len(left)).astype("float"))
  98. i = np.random.permutation(len(left))
  99. right = left.iloc[i].copy()
  100. left["4th"] = bind_cols(left)
  101. right["5th"] = -bind_cols(right)
  102. right.set_index(icols, inplace=True)
  103. run_asserts(left, right, sort)
  104. # inject some nulls
  105. left.loc[1::23, "1st"] = np.nan
  106. left.loc[2::37, "2nd"] = np.nan
  107. left.loc[3::43, "3rd"] = np.nan
  108. left["4th"] = bind_cols(left)
  109. i = np.random.permutation(len(left))
  110. right = left.iloc[i, :-1]
  111. right["5th"] = -bind_cols(right)
  112. right.set_index(icols, inplace=True)
  113. run_asserts(left, right, sort)
  114. @pytest.mark.parametrize("sort", [False, True])
  115. def test_merge_right_vs_left(self, left, right, sort):
  116. # compare left vs right merge with multikey
  117. on_cols = ["key1", "key2"]
  118. merged_left_right = left.merge(
  119. right, left_on=on_cols, right_index=True, how="left", sort=sort
  120. )
  121. merge_right_left = right.merge(
  122. left, right_on=on_cols, left_index=True, how="right", sort=sort
  123. )
  124. # Reorder columns
  125. merge_right_left = merge_right_left[merged_left_right.columns]
  126. tm.assert_frame_equal(merged_left_right, merge_right_left)
  127. def test_merge_multiple_cols_with_mixed_cols_index(self):
  128. # GH29522
  129. s = Series(
  130. range(6),
  131. MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
  132. name="Amount",
  133. )
  134. df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
  135. result = merge(df, s.reset_index(), on=["lev1", "lev2"])
  136. expected = DataFrame(
  137. {
  138. "lev1": list("AAABBB"),
  139. "lev2": [1, 2, 3, 1, 2, 3],
  140. "col": [0] * 6,
  141. "Amount": range(6),
  142. }
  143. )
  144. tm.assert_frame_equal(result, expected)
  145. def test_compress_group_combinations(self):
  146. # ~ 40000000 possible unique groups
  147. key1 = tm.rands_array(10, 10000)
  148. key1 = np.tile(key1, 2)
  149. key2 = key1[::-1]
  150. df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)})
  151. df2 = DataFrame(
  152. {"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)}
  153. )
  154. # just to hit the label compression code path
  155. merge(df, df2, how="outer")
  156. def test_left_join_index_preserve_order(self):
  157. on_cols = ["k1", "k2"]
  158. left = DataFrame(
  159. {
  160. "k1": [0, 1, 2] * 8,
  161. "k2": ["foo", "bar"] * 12,
  162. "v": np.array(np.arange(24), dtype=np.int64),
  163. }
  164. )
  165. index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
  166. right = DataFrame({"v2": [5, 7]}, index=index)
  167. result = left.join(right, on=on_cols)
  168. expected = left.copy()
  169. expected["v2"] = np.nan
  170. expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
  171. expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
  172. tm.assert_frame_equal(result, expected)
  173. result.sort_values(on_cols, kind="mergesort", inplace=True)
  174. expected = left.join(right, on=on_cols, sort=True)
  175. tm.assert_frame_equal(result, expected)
  176. # test join with multi dtypes blocks
  177. left = DataFrame(
  178. {
  179. "k1": [0, 1, 2] * 8,
  180. "k2": ["foo", "bar"] * 12,
  181. "k3": np.array([0, 1, 2] * 8, dtype=np.float32),
  182. "v": np.array(np.arange(24), dtype=np.int32),
  183. }
  184. )
  185. index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
  186. right = DataFrame({"v2": [5, 7]}, index=index)
  187. result = left.join(right, on=on_cols)
  188. expected = left.copy()
  189. expected["v2"] = np.nan
  190. expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
  191. expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
  192. tm.assert_frame_equal(result, expected)
  193. result = result.sort_values(on_cols, kind="mergesort")
  194. expected = left.join(right, on=on_cols, sort=True)
  195. tm.assert_frame_equal(result, expected)
  196. def test_left_join_index_multi_match_multiindex(self):
  197. left = DataFrame(
  198. [
  199. ["X", "Y", "C", "a"],
  200. ["W", "Y", "C", "e"],
  201. ["V", "Q", "A", "h"],
  202. ["V", "R", "D", "i"],
  203. ["X", "Y", "D", "b"],
  204. ["X", "Y", "A", "c"],
  205. ["W", "Q", "B", "f"],
  206. ["W", "R", "C", "g"],
  207. ["V", "Y", "C", "j"],
  208. ["X", "Y", "B", "d"],
  209. ],
  210. columns=["cola", "colb", "colc", "tag"],
  211. index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
  212. )
  213. right = DataFrame(
  214. [
  215. ["W", "R", "C", 0],
  216. ["W", "Q", "B", 3],
  217. ["W", "Q", "B", 8],
  218. ["X", "Y", "A", 1],
  219. ["X", "Y", "A", 4],
  220. ["X", "Y", "B", 5],
  221. ["X", "Y", "C", 6],
  222. ["X", "Y", "C", 9],
  223. ["X", "Q", "C", -6],
  224. ["X", "R", "C", -9],
  225. ["V", "Y", "C", 7],
  226. ["V", "R", "D", 2],
  227. ["V", "R", "D", -1],
  228. ["V", "Q", "A", -3],
  229. ],
  230. columns=["col1", "col2", "col3", "val"],
  231. ).set_index(["col1", "col2", "col3"])
  232. result = left.join(right, on=["cola", "colb", "colc"], how="left")
  233. expected = DataFrame(
  234. [
  235. ["X", "Y", "C", "a", 6],
  236. ["X", "Y", "C", "a", 9],
  237. ["W", "Y", "C", "e", np.nan],
  238. ["V", "Q", "A", "h", -3],
  239. ["V", "R", "D", "i", 2],
  240. ["V", "R", "D", "i", -1],
  241. ["X", "Y", "D", "b", np.nan],
  242. ["X", "Y", "A", "c", 1],
  243. ["X", "Y", "A", "c", 4],
  244. ["W", "Q", "B", "f", 3],
  245. ["W", "Q", "B", "f", 8],
  246. ["W", "R", "C", "g", 0],
  247. ["V", "Y", "C", "j", 7],
  248. ["X", "Y", "B", "d", 5],
  249. ],
  250. columns=["cola", "colb", "colc", "tag", "val"],
  251. index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
  252. )
  253. tm.assert_frame_equal(result, expected)
  254. result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
  255. expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
  256. tm.assert_frame_equal(result, expected)
  257. def test_left_join_index_multi_match(self):
  258. left = DataFrame(
  259. [["c", 0], ["b", 1], ["a", 2], ["b", 3]],
  260. columns=["tag", "val"],
  261. index=[2, 0, 1, 3],
  262. )
  263. right = DataFrame(
  264. [
  265. ["a", "v"],
  266. ["c", "w"],
  267. ["c", "x"],
  268. ["d", "y"],
  269. ["a", "z"],
  270. ["c", "r"],
  271. ["e", "q"],
  272. ["c", "s"],
  273. ],
  274. columns=["tag", "char"],
  275. ).set_index("tag")
  276. result = left.join(right, on="tag", how="left")
  277. expected = DataFrame(
  278. [
  279. ["c", 0, "w"],
  280. ["c", 0, "x"],
  281. ["c", 0, "r"],
  282. ["c", 0, "s"],
  283. ["b", 1, np.nan],
  284. ["a", 2, "v"],
  285. ["a", 2, "z"],
  286. ["b", 3, np.nan],
  287. ],
  288. columns=["tag", "val", "char"],
  289. index=[2, 2, 2, 2, 0, 1, 1, 3],
  290. )
  291. tm.assert_frame_equal(result, expected)
  292. result = left.join(right, on="tag", how="left", sort=True)
  293. expected2 = expected.sort_values("tag", kind="mergesort")
  294. tm.assert_frame_equal(result, expected2)
  295. # GH7331 - maintain left frame order in left merge
  296. result = merge(left, right.reset_index(), how="left", on="tag")
  297. expected.index = RangeIndex(len(expected))
  298. tm.assert_frame_equal(result, expected)
  299. def test_left_merge_na_buglet(self):
  300. left = DataFrame(
  301. {
  302. "id": list("abcde"),
  303. "v1": np.random.randn(5),
  304. "v2": np.random.randn(5),
  305. "dummy": list("abcde"),
  306. "v3": np.random.randn(5),
  307. },
  308. columns=["id", "v1", "v2", "dummy", "v3"],
  309. )
  310. right = DataFrame(
  311. {
  312. "id": ["a", "b", np.nan, np.nan, np.nan],
  313. "sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
  314. }
  315. )
  316. result = merge(left, right, on="id", how="left")
  317. rdf = right.drop(["id"], axis=1)
  318. expected = left.join(rdf)
  319. tm.assert_frame_equal(result, expected)
  320. def test_merge_na_keys(self):
  321. data = [
  322. [1950, "A", 1.5],
  323. [1950, "B", 1.5],
  324. [1955, "B", 1.5],
  325. [1960, "B", np.nan],
  326. [1970, "B", 4.0],
  327. [1950, "C", 4.0],
  328. [1960, "C", np.nan],
  329. [1965, "C", 3.0],
  330. [1970, "C", 4.0],
  331. ]
  332. frame = DataFrame(data, columns=["year", "panel", "data"])
  333. other_data = [
  334. [1960, "A", np.nan],
  335. [1970, "A", np.nan],
  336. [1955, "A", np.nan],
  337. [1965, "A", np.nan],
  338. [1965, "B", np.nan],
  339. [1955, "C", np.nan],
  340. ]
  341. other = DataFrame(other_data, columns=["year", "panel", "data"])
  342. result = frame.merge(other, how="outer")
  343. expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
  344. expected = expected.replace(-999, np.nan)
  345. tm.assert_frame_equal(result, expected)
  346. @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
  347. def test_merge_datetime_index(self, klass):
  348. # see gh-19038
  349. df = DataFrame(
  350. [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
  351. )
  352. df.index = pd.to_datetime(df.index)
  353. on_vector = df.index.year
  354. if klass is not None:
  355. on_vector = klass(on_vector)
  356. exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
  357. expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
  358. result = df.merge(df, on=["a", on_vector], how="inner")
  359. tm.assert_frame_equal(result, expected)
  360. expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
  361. result = df.merge(df, on=[df.index.year], how="inner")
  362. tm.assert_frame_equal(result, expected)
  363. @pytest.mark.parametrize("merge_type", ["left", "right"])
  364. def test_merge_datetime_multi_index_empty_df(self, merge_type):
  365. # see gh-36895
  366. left = DataFrame(
  367. data={
  368. "data": [1.5, 1.5],
  369. },
  370. index=MultiIndex.from_tuples(
  371. [[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
  372. names=["date", "panel"],
  373. ),
  374. )
  375. right = DataFrame(
  376. index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"]
  377. )
  378. expected_index = MultiIndex.from_tuples(
  379. [[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]],
  380. names=["date", "panel"],
  381. )
  382. if merge_type == "left":
  383. expected = DataFrame(
  384. data={
  385. "data": [1.5, 1.5],
  386. "state": [None, None],
  387. },
  388. index=expected_index,
  389. )
  390. results_merge = left.merge(right, how="left", on=["date", "panel"])
  391. results_join = left.join(right, how="left")
  392. else:
  393. expected = DataFrame(
  394. data={
  395. "state": [None, None],
  396. "data": [1.5, 1.5],
  397. },
  398. index=expected_index,
  399. )
  400. results_merge = right.merge(left, how="right", on=["date", "panel"])
  401. results_join = right.join(left, how="right")
  402. tm.assert_frame_equal(results_merge, expected)
  403. tm.assert_frame_equal(results_join, expected)
  404. @pytest.fixture
  405. def household(self):
  406. household = DataFrame(
  407. {
  408. "household_id": [1, 2, 3],
  409. "male": [0, 1, 0],
  410. "wealth": [196087.3, 316478.7, 294750],
  411. },
  412. columns=["household_id", "male", "wealth"],
  413. ).set_index("household_id")
  414. return household
  415. @pytest.fixture
  416. def portfolio(self):
  417. portfolio = DataFrame(
  418. {
  419. "household_id": [1, 2, 2, 3, 3, 3, 4],
  420. "asset_id": [
  421. "nl0000301109",
  422. "nl0000289783",
  423. "gb00b03mlx29",
  424. "gb00b03mlx29",
  425. "lu0197800237",
  426. "nl0000289965",
  427. np.nan,
  428. ],
  429. "name": [
  430. "ABN Amro",
  431. "Robeco",
  432. "Royal Dutch Shell",
  433. "Royal Dutch Shell",
  434. "AAB Eastern Europe Equity Fund",
  435. "Postbank BioTech Fonds",
  436. np.nan,
  437. ],
  438. "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
  439. },
  440. columns=["household_id", "asset_id", "name", "share"],
  441. ).set_index(["household_id", "asset_id"])
  442. return portfolio
  443. @pytest.fixture
  444. def expected(self):
  445. expected = (
  446. DataFrame(
  447. {
  448. "male": [0, 1, 1, 0, 0, 0],
  449. "wealth": [
  450. 196087.3,
  451. 316478.7,
  452. 316478.7,
  453. 294750.0,
  454. 294750.0,
  455. 294750.0,
  456. ],
  457. "name": [
  458. "ABN Amro",
  459. "Robeco",
  460. "Royal Dutch Shell",
  461. "Royal Dutch Shell",
  462. "AAB Eastern Europe Equity Fund",
  463. "Postbank BioTech Fonds",
  464. ],
  465. "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
  466. "household_id": [1, 2, 2, 3, 3, 3],
  467. "asset_id": [
  468. "nl0000301109",
  469. "nl0000289783",
  470. "gb00b03mlx29",
  471. "gb00b03mlx29",
  472. "lu0197800237",
  473. "nl0000289965",
  474. ],
  475. }
  476. )
  477. .set_index(["household_id", "asset_id"])
  478. .reindex(columns=["male", "wealth", "name", "share"])
  479. )
  480. return expected
  481. def test_join_multi_levels(self, portfolio, household, expected):
  482. portfolio = portfolio.copy()
  483. household = household.copy()
  484. # GH 3662
  485. # merge multi-levels
  486. result = household.join(portfolio, how="inner")
  487. tm.assert_frame_equal(result, expected)
  488. def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
  489. portfolio = portfolio.copy()
  490. household = household.copy()
  491. # equivalency
  492. result = merge(
  493. household.reset_index(),
  494. portfolio.reset_index(),
  495. on=["household_id"],
  496. how="inner",
  497. ).set_index(["household_id", "asset_id"])
  498. tm.assert_frame_equal(result, expected)
  499. def test_join_multi_levels_outer(self, portfolio, household, expected):
  500. portfolio = portfolio.copy()
  501. household = household.copy()
  502. result = household.join(portfolio, how="outer")
  503. expected = concat(
  504. [
  505. expected,
  506. (
  507. DataFrame(
  508. {"share": [1.00]},
  509. index=MultiIndex.from_tuples(
  510. [(4, np.nan)], names=["household_id", "asset_id"]
  511. ),
  512. )
  513. ),
  514. ],
  515. axis=0,
  516. sort=True,
  517. ).reindex(columns=expected.columns)
  518. tm.assert_frame_equal(result, expected)
  519. def test_join_multi_levels_invalid(self, portfolio, household):
  520. portfolio = portfolio.copy()
  521. household = household.copy()
  522. # invalid cases
  523. household.index.name = "foo"
  524. with pytest.raises(
  525. ValueError, match="cannot join with no overlapping index names"
  526. ):
  527. household.join(portfolio, how="inner")
  528. portfolio2 = portfolio.copy()
  529. portfolio2.index.set_names(["household_id", "foo"])
  530. with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
  531. portfolio2.join(portfolio, how="inner")
  532. def test_join_multi_levels2(self):
  533. # some more advanced merges
  534. # GH6360
  535. household = DataFrame(
  536. {
  537. "household_id": [1, 2, 2, 3, 3, 3, 4],
  538. "asset_id": [
  539. "nl0000301109",
  540. "nl0000301109",
  541. "gb00b03mlx29",
  542. "gb00b03mlx29",
  543. "lu0197800237",
  544. "nl0000289965",
  545. np.nan,
  546. ],
  547. "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
  548. },
  549. columns=["household_id", "asset_id", "share"],
  550. ).set_index(["household_id", "asset_id"])
  551. log_return = DataFrame(
  552. {
  553. "asset_id": [
  554. "gb00b03mlx29",
  555. "gb00b03mlx29",
  556. "gb00b03mlx29",
  557. "lu0197800237",
  558. "lu0197800237",
  559. ],
  560. "t": [233, 234, 235, 180, 181],
  561. "log_return": [
  562. 0.09604978,
  563. -0.06524096,
  564. 0.03532373,
  565. 0.03025441,
  566. 0.036997,
  567. ],
  568. }
  569. ).set_index(["asset_id", "t"])
  570. expected = (
  571. DataFrame(
  572. {
  573. "household_id": [2, 2, 2, 3, 3, 3, 3, 3],
  574. "asset_id": [
  575. "gb00b03mlx29",
  576. "gb00b03mlx29",
  577. "gb00b03mlx29",
  578. "gb00b03mlx29",
  579. "gb00b03mlx29",
  580. "gb00b03mlx29",
  581. "lu0197800237",
  582. "lu0197800237",
  583. ],
  584. "t": [233, 234, 235, 233, 234, 235, 180, 181],
  585. "share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
  586. "log_return": [
  587. 0.09604978,
  588. -0.06524096,
  589. 0.03532373,
  590. 0.09604978,
  591. -0.06524096,
  592. 0.03532373,
  593. 0.03025441,
  594. 0.036997,
  595. ],
  596. }
  597. )
  598. .set_index(["household_id", "asset_id", "t"])
  599. .reindex(columns=["share", "log_return"])
  600. )
  601. # this is the equivalency
  602. result = merge(
  603. household.reset_index(),
  604. log_return.reset_index(),
  605. on=["asset_id"],
  606. how="inner",
  607. ).set_index(["household_id", "asset_id", "t"])
  608. tm.assert_frame_equal(result, expected)
  609. expected = (
  610. DataFrame(
  611. {
  612. "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
  613. "asset_id": [
  614. "nl0000301109",
  615. "nl0000301109",
  616. "gb00b03mlx29",
  617. "gb00b03mlx29",
  618. "gb00b03mlx29",
  619. "gb00b03mlx29",
  620. "gb00b03mlx29",
  621. "gb00b03mlx29",
  622. "lu0197800237",
  623. "lu0197800237",
  624. "nl0000289965",
  625. None,
  626. ],
  627. "t": [
  628. None,
  629. None,
  630. 233,
  631. 234,
  632. 235,
  633. 233,
  634. 234,
  635. 235,
  636. 180,
  637. 181,
  638. None,
  639. None,
  640. ],
  641. "share": [
  642. 1.0,
  643. 0.4,
  644. 0.6,
  645. 0.6,
  646. 0.6,
  647. 0.15,
  648. 0.15,
  649. 0.15,
  650. 0.6,
  651. 0.6,
  652. 0.25,
  653. 1.0,
  654. ],
  655. "log_return": [
  656. None,
  657. None,
  658. 0.09604978,
  659. -0.06524096,
  660. 0.03532373,
  661. 0.09604978,
  662. -0.06524096,
  663. 0.03532373,
  664. 0.03025441,
  665. 0.036997,
  666. None,
  667. None,
  668. ],
  669. }
  670. )
  671. .set_index(["household_id", "asset_id", "t"])
  672. .reindex(columns=["share", "log_return"])
  673. )
  674. result = merge(
  675. household.reset_index(),
  676. log_return.reset_index(),
  677. on=["asset_id"],
  678. how="outer",
  679. ).set_index(["household_id", "asset_id", "t"])
  680. tm.assert_frame_equal(result, expected)
  681. class TestJoinMultiMulti:
  682. def test_join_multi_multi(
  683. self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
  684. ):
  685. # Multi-index join tests
  686. expected = (
  687. merge(
  688. left_multi.reset_index(),
  689. right_multi.reset_index(),
  690. how=join_type,
  691. on=on_cols_multi,
  692. )
  693. .set_index(idx_cols_multi)
  694. .sort_index()
  695. )
  696. result = left_multi.join(right_multi, how=join_type).sort_index()
  697. tm.assert_frame_equal(result, expected)
  698. def test_join_multi_empty_frames(
  699. self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
  700. ):
  701. left_multi = left_multi.drop(columns=left_multi.columns)
  702. right_multi = right_multi.drop(columns=right_multi.columns)
  703. expected = (
  704. merge(
  705. left_multi.reset_index(),
  706. right_multi.reset_index(),
  707. how=join_type,
  708. on=on_cols_multi,
  709. )
  710. .set_index(idx_cols_multi)
  711. .sort_index()
  712. )
  713. result = left_multi.join(right_multi, how=join_type).sort_index()
  714. tm.assert_frame_equal(result, expected)
  715. @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
  716. def test_merge_datetime_index(self, box):
  717. # see gh-19038
  718. df = DataFrame(
  719. [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
  720. )
  721. df.index = pd.to_datetime(df.index)
  722. on_vector = df.index.year
  723. if box is not None:
  724. on_vector = box(on_vector)
  725. exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
  726. expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
  727. result = df.merge(df, on=["a", on_vector], how="inner")
  728. tm.assert_frame_equal(result, expected)
  729. expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
  730. result = df.merge(df, on=[df.index.year], how="inner")
  731. tm.assert_frame_equal(result, expected)
  732. def test_single_common_level(self):
  733. index_left = MultiIndex.from_tuples(
  734. [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
  735. )
  736. left = DataFrame(
  737. {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
  738. )
  739. index_right = MultiIndex.from_tuples(
  740. [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
  741. )
  742. right = DataFrame(
  743. {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
  744. index=index_right,
  745. )
  746. result = left.join(right)
  747. expected = merge(
  748. left.reset_index(), right.reset_index(), on=["key"], how="inner"
  749. ).set_index(["key", "X", "Y"])
  750. tm.assert_frame_equal(result, expected)
  751. def test_join_multi_wrong_order(self):
  752. # GH 25760
  753. # GH 28956
  754. midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
  755. midx3 = MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])
  756. left = DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
  757. right = DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})
  758. result = left.join(right)
  759. expected = DataFrame(
  760. index=midx1,
  761. data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
  762. )
  763. tm.assert_frame_equal(result, expected)