# test_merge_asof.py — tests for pandas.merge_asof
  1. import datetime
  2. import numpy as np
  3. import pytest
  4. import pytz
  5. import pandas as pd
  6. from pandas import (
  7. Index,
  8. Timedelta,
  9. merge_asof,
  10. read_csv,
  11. to_datetime,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.reshape.merge import MergeError
  15. @pytest.fixture(params=["s", "ms", "us", "ns"])
  16. def unit(request):
  17. """
  18. Resolution for datetimelike dtypes.
  19. """
  20. return request.param
  21. class TestAsOfMerge:
  22. def read_data(self, datapath, name, dedupe=False):
  23. path = datapath("reshape", "merge", "data", name)
  24. x = read_csv(path)
  25. if dedupe:
  26. x = x.drop_duplicates(["time", "ticker"], keep="last").reset_index(
  27. drop=True
  28. )
  29. x.time = to_datetime(x.time)
  30. return x
  31. @pytest.fixture
  32. def trades(self, datapath):
  33. return self.read_data(datapath, "trades.csv")
  34. @pytest.fixture
  35. def quotes(self, datapath):
  36. return self.read_data(datapath, "quotes.csv", dedupe=True)
  37. @pytest.fixture
  38. def asof(self, datapath):
  39. return self.read_data(datapath, "asof.csv")
  40. @pytest.fixture
  41. def tolerance(self, datapath):
  42. return self.read_data(datapath, "tolerance.csv")
  43. @pytest.fixture
  44. def allow_exact_matches(self, datapath):
  45. return self.read_data(datapath, "allow_exact_matches.csv")
  46. @pytest.fixture
  47. def allow_exact_matches_and_tolerance(self, datapath):
  48. return self.read_data(datapath, "allow_exact_matches_and_tolerance.csv")
  49. def test_examples1(self):
  50. """doc-string examples"""
  51. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  52. right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
  53. expected = pd.DataFrame(
  54. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 3, 7]}
  55. )
  56. result = merge_asof(left, right, on="a")
  57. tm.assert_frame_equal(result, expected)
  58. def test_examples2(self, unit):
  59. """doc-string examples"""
  60. if unit == "s":
  61. pytest.skip(
  62. "This test is invalid for unit='s' because that would "
  63. "round the trades['time']]"
  64. )
  65. trades = pd.DataFrame(
  66. {
  67. "time": to_datetime(
  68. [
  69. "20160525 13:30:00.023",
  70. "20160525 13:30:00.038",
  71. "20160525 13:30:00.048",
  72. "20160525 13:30:00.048",
  73. "20160525 13:30:00.048",
  74. ]
  75. ).astype(f"M8[{unit}]"),
  76. "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
  77. "price": [51.95, 51.95, 720.77, 720.92, 98.00],
  78. "quantity": [75, 155, 100, 100, 100],
  79. },
  80. columns=["time", "ticker", "price", "quantity"],
  81. )
  82. quotes = pd.DataFrame(
  83. {
  84. "time": to_datetime(
  85. [
  86. "20160525 13:30:00.023",
  87. "20160525 13:30:00.023",
  88. "20160525 13:30:00.030",
  89. "20160525 13:30:00.041",
  90. "20160525 13:30:00.048",
  91. "20160525 13:30:00.049",
  92. "20160525 13:30:00.072",
  93. "20160525 13:30:00.075",
  94. ]
  95. ).astype(f"M8[{unit}]"),
  96. "ticker": [
  97. "GOOG",
  98. "MSFT",
  99. "MSFT",
  100. "MSFT",
  101. "GOOG",
  102. "AAPL",
  103. "GOOG",
  104. "MSFT",
  105. ],
  106. "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
  107. "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
  108. },
  109. columns=["time", "ticker", "bid", "ask"],
  110. )
  111. merge_asof(trades, quotes, on="time", by="ticker")
  112. merge_asof(trades, quotes, on="time", by="ticker", tolerance=Timedelta("2ms"))
  113. expected = pd.DataFrame(
  114. {
  115. "time": to_datetime(
  116. [
  117. "20160525 13:30:00.023",
  118. "20160525 13:30:00.038",
  119. "20160525 13:30:00.048",
  120. "20160525 13:30:00.048",
  121. "20160525 13:30:00.048",
  122. ]
  123. ).astype(f"M8[{unit}]"),
  124. "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
  125. "price": [51.95, 51.95, 720.77, 720.92, 98.00],
  126. "quantity": [75, 155, 100, 100, 100],
  127. "bid": [np.nan, 51.97, np.nan, np.nan, np.nan],
  128. "ask": [np.nan, 51.98, np.nan, np.nan, np.nan],
  129. },
  130. columns=["time", "ticker", "price", "quantity", "bid", "ask"],
  131. )
  132. result = merge_asof(
  133. trades,
  134. quotes,
  135. on="time",
  136. by="ticker",
  137. tolerance=Timedelta("10ms"),
  138. allow_exact_matches=False,
  139. )
  140. tm.assert_frame_equal(result, expected)
  141. def test_examples3(self):
  142. """doc-string examples"""
  143. # GH14887
  144. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  145. right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
  146. expected = pd.DataFrame(
  147. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, np.nan]}
  148. )
  149. result = merge_asof(left, right, on="a", direction="forward")
  150. tm.assert_frame_equal(result, expected)
  151. def test_examples4(self):
  152. """doc-string examples"""
  153. # GH14887
  154. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  155. right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
  156. expected = pd.DataFrame(
  157. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, 7]}
  158. )
  159. result = merge_asof(left, right, on="a", direction="nearest")
  160. tm.assert_frame_equal(result, expected)
  161. def test_basic(self, trades, asof, quotes):
  162. expected = asof
  163. result = merge_asof(trades, quotes, on="time", by="ticker")
  164. tm.assert_frame_equal(result, expected)
  165. def test_basic_categorical(self, trades, asof, quotes):
  166. expected = asof
  167. trades.ticker = trades.ticker.astype("category")
  168. quotes.ticker = quotes.ticker.astype("category")
  169. expected.ticker = expected.ticker.astype("category")
  170. result = merge_asof(trades, quotes, on="time", by="ticker")
  171. tm.assert_frame_equal(result, expected)
  172. def test_basic_left_index(self, trades, asof, quotes):
  173. # GH14253
  174. expected = asof
  175. trades = trades.set_index("time")
  176. result = merge_asof(
  177. trades, quotes, left_index=True, right_on="time", by="ticker"
  178. )
  179. # left-only index uses right"s index, oddly
  180. expected.index = result.index
  181. # time column appears after left"s columns
  182. expected = expected[result.columns]
  183. tm.assert_frame_equal(result, expected)
  184. def test_basic_right_index(self, trades, asof, quotes):
  185. expected = asof
  186. quotes = quotes.set_index("time")
  187. result = merge_asof(
  188. trades, quotes, left_on="time", right_index=True, by="ticker"
  189. )
  190. tm.assert_frame_equal(result, expected)
  191. def test_basic_left_index_right_index(self, trades, asof, quotes):
  192. expected = asof.set_index("time")
  193. trades = trades.set_index("time")
  194. quotes = quotes.set_index("time")
  195. result = merge_asof(
  196. trades, quotes, left_index=True, right_index=True, by="ticker"
  197. )
  198. tm.assert_frame_equal(result, expected)
  199. def test_multi_index_left(self, trades, quotes):
  200. # MultiIndex is prohibited
  201. trades = trades.set_index(["time", "price"])
  202. quotes = quotes.set_index("time")
  203. with pytest.raises(MergeError, match="left can only have one index"):
  204. merge_asof(trades, quotes, left_index=True, right_index=True)
  205. def test_multi_index_right(self, trades, quotes):
  206. # MultiIndex is prohibited
  207. trades = trades.set_index("time")
  208. quotes = quotes.set_index(["time", "bid"])
  209. with pytest.raises(MergeError, match="right can only have one index"):
  210. merge_asof(trades, quotes, left_index=True, right_index=True)
  211. def test_on_and_index_left_on(self, trades, quotes):
  212. # "on" parameter and index together is prohibited
  213. trades = trades.set_index("time")
  214. quotes = quotes.set_index("time")
  215. msg = 'Can only pass argument "left_on" OR "left_index" not both.'
  216. with pytest.raises(MergeError, match=msg):
  217. merge_asof(
  218. trades, quotes, left_on="price", left_index=True, right_index=True
  219. )
  220. def test_on_and_index_right_on(self, trades, quotes):
  221. trades = trades.set_index("time")
  222. quotes = quotes.set_index("time")
  223. msg = 'Can only pass argument "right_on" OR "right_index" not both.'
  224. with pytest.raises(MergeError, match=msg):
  225. merge_asof(
  226. trades, quotes, right_on="bid", left_index=True, right_index=True
  227. )
  228. def test_basic_left_by_right_by(self, trades, asof, quotes):
  229. # GH14253
  230. expected = asof
  231. result = merge_asof(
  232. trades, quotes, on="time", left_by="ticker", right_by="ticker"
  233. )
  234. tm.assert_frame_equal(result, expected)
  235. def test_missing_right_by(self, trades, asof, quotes):
  236. expected = asof
  237. q = quotes[quotes.ticker != "MSFT"]
  238. result = merge_asof(trades, q, on="time", by="ticker")
  239. expected.loc[expected.ticker == "MSFT", ["bid", "ask"]] = np.nan
  240. tm.assert_frame_equal(result, expected)
  241. def test_multiby(self):
  242. # GH13936
  243. trades = pd.DataFrame(
  244. {
  245. "time": to_datetime(
  246. [
  247. "20160525 13:30:00.023",
  248. "20160525 13:30:00.023",
  249. "20160525 13:30:00.046",
  250. "20160525 13:30:00.048",
  251. "20160525 13:30:00.050",
  252. ]
  253. ),
  254. "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
  255. "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"],
  256. "price": [51.95, 51.95, 720.77, 720.92, 98.00],
  257. "quantity": [75, 155, 100, 100, 100],
  258. },
  259. columns=["time", "ticker", "exch", "price", "quantity"],
  260. )
  261. quotes = pd.DataFrame(
  262. {
  263. "time": to_datetime(
  264. [
  265. "20160525 13:30:00.023",
  266. "20160525 13:30:00.023",
  267. "20160525 13:30:00.030",
  268. "20160525 13:30:00.041",
  269. "20160525 13:30:00.045",
  270. "20160525 13:30:00.049",
  271. ]
  272. ),
  273. "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL"],
  274. "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"],
  275. "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99],
  276. "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01],
  277. },
  278. columns=["time", "ticker", "exch", "bid", "ask"],
  279. )
  280. expected = pd.DataFrame(
  281. {
  282. "time": to_datetime(
  283. [
  284. "20160525 13:30:00.023",
  285. "20160525 13:30:00.023",
  286. "20160525 13:30:00.046",
  287. "20160525 13:30:00.048",
  288. "20160525 13:30:00.050",
  289. ]
  290. ),
  291. "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
  292. "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"],
  293. "price": [51.95, 51.95, 720.77, 720.92, 98.00],
  294. "quantity": [75, 155, 100, 100, 100],
  295. "bid": [np.nan, 51.95, 720.50, 720.51, np.nan],
  296. "ask": [np.nan, 51.96, 720.93, 720.92, np.nan],
  297. },
  298. columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"],
  299. )
  300. result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
  301. tm.assert_frame_equal(result, expected)
  302. def test_multiby_heterogeneous_types(self):
  303. # GH13936
  304. trades = pd.DataFrame(
  305. {
  306. "time": to_datetime(
  307. [
  308. "20160525 13:30:00.023",
  309. "20160525 13:30:00.023",
  310. "20160525 13:30:00.046",
  311. "20160525 13:30:00.048",
  312. "20160525 13:30:00.050",
  313. ]
  314. ),
  315. "ticker": [0, 0, 1, 1, 2],
  316. "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"],
  317. "price": [51.95, 51.95, 720.77, 720.92, 98.00],
  318. "quantity": [75, 155, 100, 100, 100],
  319. },
  320. columns=["time", "ticker", "exch", "price", "quantity"],
  321. )
  322. quotes = pd.DataFrame(
  323. {
  324. "time": to_datetime(
  325. [
  326. "20160525 13:30:00.023",
  327. "20160525 13:30:00.023",
  328. "20160525 13:30:00.030",
  329. "20160525 13:30:00.041",
  330. "20160525 13:30:00.045",
  331. "20160525 13:30:00.049",
  332. ]
  333. ),
  334. "ticker": [1, 0, 0, 0, 1, 2],
  335. "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"],
  336. "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99],
  337. "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01],
  338. },
  339. columns=["time", "ticker", "exch", "bid", "ask"],
  340. )
  341. expected = pd.DataFrame(
  342. {
  343. "time": to_datetime(
  344. [
  345. "20160525 13:30:00.023",
  346. "20160525 13:30:00.023",
  347. "20160525 13:30:00.046",
  348. "20160525 13:30:00.048",
  349. "20160525 13:30:00.050",
  350. ]
  351. ),
  352. "ticker": [0, 0, 1, 1, 2],
  353. "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"],
  354. "price": [51.95, 51.95, 720.77, 720.92, 98.00],
  355. "quantity": [75, 155, 100, 100, 100],
  356. "bid": [np.nan, 51.95, 720.50, 720.51, np.nan],
  357. "ask": [np.nan, 51.96, 720.93, 720.92, np.nan],
  358. },
  359. columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"],
  360. )
  361. result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
  362. tm.assert_frame_equal(result, expected)
  363. def test_multiby_indexed(self):
  364. # GH15676
  365. left = pd.DataFrame(
  366. [
  367. [to_datetime("20160602"), 1, "a"],
  368. [to_datetime("20160602"), 2, "a"],
  369. [to_datetime("20160603"), 1, "b"],
  370. [to_datetime("20160603"), 2, "b"],
  371. ],
  372. columns=["time", "k1", "k2"],
  373. ).set_index("time")
  374. right = pd.DataFrame(
  375. [
  376. [to_datetime("20160502"), 1, "a", 1.0],
  377. [to_datetime("20160502"), 2, "a", 2.0],
  378. [to_datetime("20160503"), 1, "b", 3.0],
  379. [to_datetime("20160503"), 2, "b", 4.0],
  380. ],
  381. columns=["time", "k1", "k2", "value"],
  382. ).set_index("time")
  383. expected = pd.DataFrame(
  384. [
  385. [to_datetime("20160602"), 1, "a", 1.0],
  386. [to_datetime("20160602"), 2, "a", 2.0],
  387. [to_datetime("20160603"), 1, "b", 3.0],
  388. [to_datetime("20160603"), 2, "b", 4.0],
  389. ],
  390. columns=["time", "k1", "k2", "value"],
  391. ).set_index("time")
  392. result = merge_asof(
  393. left, right, left_index=True, right_index=True, by=["k1", "k2"]
  394. )
  395. tm.assert_frame_equal(expected, result)
  396. with pytest.raises(
  397. MergeError, match="left_by and right_by must be same length"
  398. ):
  399. merge_asof(
  400. left,
  401. right,
  402. left_index=True,
  403. right_index=True,
  404. left_by=["k1", "k2"],
  405. right_by=["k1"],
  406. )
  407. def test_basic2(self, datapath):
  408. expected = self.read_data(datapath, "asof2.csv")
  409. trades = self.read_data(datapath, "trades2.csv")
  410. quotes = self.read_data(datapath, "quotes2.csv", dedupe=True)
  411. result = merge_asof(trades, quotes, on="time", by="ticker")
  412. tm.assert_frame_equal(result, expected)
  413. def test_basic_no_by(self, trades, asof, quotes):
  414. f = (
  415. lambda x: x[x.ticker == "MSFT"]
  416. .drop("ticker", axis=1)
  417. .reset_index(drop=True)
  418. )
  419. # just use a single ticker
  420. expected = f(asof)
  421. trades = f(trades)
  422. quotes = f(quotes)
  423. result = merge_asof(trades, quotes, on="time")
  424. tm.assert_frame_equal(result, expected)
  425. def test_valid_join_keys(self, trades, quotes):
  426. msg = r"incompatible merge keys \[1\] .* must be the same type"
  427. with pytest.raises(MergeError, match=msg):
  428. merge_asof(trades, quotes, left_on="time", right_on="bid", by="ticker")
  429. with pytest.raises(MergeError, match="can only asof on a key for left"):
  430. merge_asof(trades, quotes, on=["time", "ticker"], by="ticker")
  431. with pytest.raises(MergeError, match="can only asof on a key for left"):
  432. merge_asof(trades, quotes, by="ticker")
  433. def test_with_duplicates(self, datapath, trades, quotes):
  434. q = (
  435. pd.concat([quotes, quotes])
  436. .sort_values(["time", "ticker"])
  437. .reset_index(drop=True)
  438. )
  439. result = merge_asof(trades, q, on="time", by="ticker")
  440. expected = self.read_data(datapath, "asof.csv")
  441. tm.assert_frame_equal(result, expected)
  442. def test_with_duplicates_no_on(self):
  443. df1 = pd.DataFrame({"key": [1, 1, 3], "left_val": [1, 2, 3]})
  444. df2 = pd.DataFrame({"key": [1, 2, 2], "right_val": [1, 2, 3]})
  445. result = merge_asof(df1, df2, on="key")
  446. expected = pd.DataFrame(
  447. {"key": [1, 1, 3], "left_val": [1, 2, 3], "right_val": [1, 1, 3]}
  448. )
  449. tm.assert_frame_equal(result, expected)
  450. def test_valid_allow_exact_matches(self, trades, quotes):
  451. msg = "allow_exact_matches must be boolean, passed foo"
  452. with pytest.raises(MergeError, match=msg):
  453. merge_asof(
  454. trades, quotes, on="time", by="ticker", allow_exact_matches="foo"
  455. )
  456. def test_valid_tolerance(self, trades, quotes):
  457. # dti
  458. merge_asof(trades, quotes, on="time", by="ticker", tolerance=Timedelta("1s"))
  459. # integer
  460. merge_asof(
  461. trades.reset_index(),
  462. quotes.reset_index(),
  463. on="index",
  464. by="ticker",
  465. tolerance=1,
  466. )
  467. msg = r"incompatible tolerance .*, must be compat with type .*"
  468. # incompat
  469. with pytest.raises(MergeError, match=msg):
  470. merge_asof(trades, quotes, on="time", by="ticker", tolerance=1)
  471. # invalid
  472. with pytest.raises(MergeError, match=msg):
  473. merge_asof(
  474. trades.reset_index(),
  475. quotes.reset_index(),
  476. on="index",
  477. by="ticker",
  478. tolerance=1.0,
  479. )
  480. msg = "tolerance must be positive"
  481. # invalid negative
  482. with pytest.raises(MergeError, match=msg):
  483. merge_asof(
  484. trades, quotes, on="time", by="ticker", tolerance=-Timedelta("1s")
  485. )
  486. with pytest.raises(MergeError, match=msg):
  487. merge_asof(
  488. trades.reset_index(),
  489. quotes.reset_index(),
  490. on="index",
  491. by="ticker",
  492. tolerance=-1,
  493. )
  494. def test_non_sorted(self, trades, quotes):
  495. trades = trades.sort_values("time", ascending=False)
  496. quotes = quotes.sort_values("time", ascending=False)
  497. # we require that we are already sorted on time & quotes
  498. assert not trades.time.is_monotonic_increasing
  499. assert not quotes.time.is_monotonic_increasing
  500. with pytest.raises(ValueError, match="left keys must be sorted"):
  501. merge_asof(trades, quotes, on="time", by="ticker")
  502. trades = trades.sort_values("time")
  503. assert trades.time.is_monotonic_increasing
  504. assert not quotes.time.is_monotonic_increasing
  505. with pytest.raises(ValueError, match="right keys must be sorted"):
  506. merge_asof(trades, quotes, on="time", by="ticker")
  507. quotes = quotes.sort_values("time")
  508. assert trades.time.is_monotonic_increasing
  509. assert quotes.time.is_monotonic_increasing
  510. # ok, though has dupes
  511. merge_asof(trades, quotes, on="time", by="ticker")
  512. @pytest.mark.parametrize(
  513. "tolerance_ts",
  514. [Timedelta("1day"), datetime.timedelta(days=1)],
  515. ids=["Timedelta", "datetime.timedelta"],
  516. )
  517. def test_tolerance(self, tolerance_ts, trades, quotes, tolerance):
  518. result = merge_asof(
  519. trades, quotes, on="time", by="ticker", tolerance=tolerance_ts
  520. )
  521. expected = tolerance
  522. tm.assert_frame_equal(result, expected)
  523. def test_tolerance_forward(self):
  524. # GH14887
  525. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  526. right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]})
  527. expected = pd.DataFrame(
  528. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]}
  529. )
  530. result = merge_asof(left, right, on="a", direction="forward", tolerance=1)
  531. tm.assert_frame_equal(result, expected)
  532. def test_tolerance_nearest(self):
  533. # GH14887
  534. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  535. right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]})
  536. expected = pd.DataFrame(
  537. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]}
  538. )
  539. result = merge_asof(left, right, on="a", direction="nearest", tolerance=1)
  540. tm.assert_frame_equal(result, expected)
  541. def test_tolerance_tz(self, unit):
  542. # GH 14844
  543. left = pd.DataFrame(
  544. {
  545. "date": pd.date_range(
  546. start=to_datetime("2016-01-02"),
  547. freq="D",
  548. periods=5,
  549. tz=pytz.timezone("UTC"),
  550. unit=unit,
  551. ),
  552. "value1": np.arange(5),
  553. }
  554. )
  555. right = pd.DataFrame(
  556. {
  557. "date": pd.date_range(
  558. start=to_datetime("2016-01-01"),
  559. freq="D",
  560. periods=5,
  561. tz=pytz.timezone("UTC"),
  562. unit=unit,
  563. ),
  564. "value2": list("ABCDE"),
  565. }
  566. )
  567. result = merge_asof(left, right, on="date", tolerance=Timedelta("1 day"))
  568. expected = pd.DataFrame(
  569. {
  570. "date": pd.date_range(
  571. start=to_datetime("2016-01-02"),
  572. freq="D",
  573. periods=5,
  574. tz=pytz.timezone("UTC"),
  575. unit=unit,
  576. ),
  577. "value1": np.arange(5),
  578. "value2": list("BCDEE"),
  579. }
  580. )
  581. tm.assert_frame_equal(result, expected)
  582. def test_tolerance_float(self):
  583. # GH22981
  584. left = pd.DataFrame({"a": [1.1, 3.5, 10.9], "left_val": ["a", "b", "c"]})
  585. right = pd.DataFrame(
  586. {"a": [1.0, 2.5, 3.3, 7.5, 11.5], "right_val": [1.0, 2.5, 3.3, 7.5, 11.5]}
  587. )
  588. expected = pd.DataFrame(
  589. {
  590. "a": [1.1, 3.5, 10.9],
  591. "left_val": ["a", "b", "c"],
  592. "right_val": [1, 3.3, np.nan],
  593. }
  594. )
  595. result = merge_asof(left, right, on="a", direction="nearest", tolerance=0.5)
  596. tm.assert_frame_equal(result, expected)
  597. def test_index_tolerance(self, trades, quotes, tolerance):
  598. # GH 15135
  599. expected = tolerance.set_index("time")
  600. trades = trades.set_index("time")
  601. quotes = quotes.set_index("time")
  602. result = merge_asof(
  603. trades,
  604. quotes,
  605. left_index=True,
  606. right_index=True,
  607. by="ticker",
  608. tolerance=Timedelta("1day"),
  609. )
  610. tm.assert_frame_equal(result, expected)
  611. def test_allow_exact_matches(self, trades, quotes, allow_exact_matches):
  612. result = merge_asof(
  613. trades, quotes, on="time", by="ticker", allow_exact_matches=False
  614. )
  615. expected = allow_exact_matches
  616. tm.assert_frame_equal(result, expected)
  617. def test_allow_exact_matches_forward(self):
  618. # GH14887
  619. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  620. right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]})
  621. expected = pd.DataFrame(
  622. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 7, 11]}
  623. )
  624. result = merge_asof(
  625. left, right, on="a", direction="forward", allow_exact_matches=False
  626. )
  627. tm.assert_frame_equal(result, expected)
  628. def test_allow_exact_matches_nearest(self):
  629. # GH14887
  630. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  631. right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]})
  632. expected = pd.DataFrame(
  633. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 3, 11]}
  634. )
  635. result = merge_asof(
  636. left, right, on="a", direction="nearest", allow_exact_matches=False
  637. )
  638. tm.assert_frame_equal(result, expected)
  639. def test_allow_exact_matches_and_tolerance(
  640. self, trades, quotes, allow_exact_matches_and_tolerance
  641. ):
  642. result = merge_asof(
  643. trades,
  644. quotes,
  645. on="time",
  646. by="ticker",
  647. tolerance=Timedelta("100ms"),
  648. allow_exact_matches=False,
  649. )
  650. expected = allow_exact_matches_and_tolerance
  651. tm.assert_frame_equal(result, expected)
  652. def test_allow_exact_matches_and_tolerance2(self):
  653. # GH 13695
  654. df1 = pd.DataFrame(
  655. {"time": to_datetime(["2016-07-15 13:30:00.030"]), "username": ["bob"]}
  656. )
  657. df2 = pd.DataFrame(
  658. {
  659. "time": to_datetime(
  660. ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"]
  661. ),
  662. "version": [1, 2],
  663. }
  664. )
  665. result = merge_asof(df1, df2, on="time")
  666. expected = pd.DataFrame(
  667. {
  668. "time": to_datetime(["2016-07-15 13:30:00.030"]),
  669. "username": ["bob"],
  670. "version": [2],
  671. }
  672. )
  673. tm.assert_frame_equal(result, expected)
  674. result = merge_asof(df1, df2, on="time", allow_exact_matches=False)
  675. expected = pd.DataFrame(
  676. {
  677. "time": to_datetime(["2016-07-15 13:30:00.030"]),
  678. "username": ["bob"],
  679. "version": [1],
  680. }
  681. )
  682. tm.assert_frame_equal(result, expected)
  683. result = merge_asof(
  684. df1,
  685. df2,
  686. on="time",
  687. allow_exact_matches=False,
  688. tolerance=Timedelta("10ms"),
  689. )
  690. expected = pd.DataFrame(
  691. {
  692. "time": to_datetime(["2016-07-15 13:30:00.030"]),
  693. "username": ["bob"],
  694. "version": [np.nan],
  695. }
  696. )
  697. tm.assert_frame_equal(result, expected)
  698. def test_allow_exact_matches_and_tolerance3(self):
  699. # GH 13709
  700. df1 = pd.DataFrame(
  701. {
  702. "time": to_datetime(
  703. ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"]
  704. ),
  705. "username": ["bob", "charlie"],
  706. }
  707. )
  708. df2 = pd.DataFrame(
  709. {
  710. "time": to_datetime(
  711. ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"]
  712. ),
  713. "version": [1, 2],
  714. }
  715. )
  716. result = merge_asof(
  717. df1,
  718. df2,
  719. on="time",
  720. allow_exact_matches=False,
  721. tolerance=Timedelta("10ms"),
  722. )
  723. expected = pd.DataFrame(
  724. {
  725. "time": to_datetime(
  726. ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"]
  727. ),
  728. "username": ["bob", "charlie"],
  729. "version": [np.nan, np.nan],
  730. }
  731. )
  732. tm.assert_frame_equal(result, expected)
  733. def test_allow_exact_matches_and_tolerance_forward(self):
  734. # GH14887
  735. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  736. right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 6, 11]})
  737. expected = pd.DataFrame(
  738. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 6, 11]}
  739. )
  740. result = merge_asof(
  741. left,
  742. right,
  743. on="a",
  744. direction="forward",
  745. allow_exact_matches=False,
  746. tolerance=1,
  747. )
  748. tm.assert_frame_equal(result, expected)
  749. def test_allow_exact_matches_and_tolerance_nearest(self):
  750. # GH14887
  751. left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
  752. right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 7, 11]})
  753. expected = pd.DataFrame(
  754. {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 4, 11]}
  755. )
  756. result = merge_asof(
  757. left,
  758. right,
  759. on="a",
  760. direction="nearest",
  761. allow_exact_matches=False,
  762. tolerance=1,
  763. )
  764. tm.assert_frame_equal(result, expected)
  765. def test_forward_by(self):
  766. # GH14887
  767. left = pd.DataFrame(
  768. {
  769. "a": [1, 5, 10, 12, 15],
  770. "b": ["X", "X", "Y", "Z", "Y"],
  771. "left_val": ["a", "b", "c", "d", "e"],
  772. }
  773. )
  774. right = pd.DataFrame(
  775. {
  776. "a": [1, 6, 11, 15, 16],
  777. "b": ["X", "Z", "Y", "Z", "Y"],
  778. "right_val": [1, 6, 11, 15, 16],
  779. }
  780. )
  781. expected = pd.DataFrame(
  782. {
  783. "a": [1, 5, 10, 12, 15],
  784. "b": ["X", "X", "Y", "Z", "Y"],
  785. "left_val": ["a", "b", "c", "d", "e"],
  786. "right_val": [1, np.nan, 11, 15, 16],
  787. }
  788. )
  789. result = merge_asof(left, right, on="a", by="b", direction="forward")
  790. tm.assert_frame_equal(result, expected)
  791. def test_nearest_by(self):
  792. # GH14887
  793. left = pd.DataFrame(
  794. {
  795. "a": [1, 5, 10, 12, 15],
  796. "b": ["X", "X", "Z", "Z", "Y"],
  797. "left_val": ["a", "b", "c", "d", "e"],
  798. }
  799. )
  800. right = pd.DataFrame(
  801. {
  802. "a": [1, 6, 11, 15, 16],
  803. "b": ["X", "Z", "Z", "Z", "Y"],
  804. "right_val": [1, 6, 11, 15, 16],
  805. }
  806. )
  807. expected = pd.DataFrame(
  808. {
  809. "a": [1, 5, 10, 12, 15],
  810. "b": ["X", "X", "Z", "Z", "Y"],
  811. "left_val": ["a", "b", "c", "d", "e"],
  812. "right_val": [1, 1, 11, 11, 16],
  813. }
  814. )
  815. result = merge_asof(left, right, on="a", by="b", direction="nearest")
  816. tm.assert_frame_equal(result, expected)
  817. def test_by_int(self):
  818. # we specialize by type, so test that this is correct
  819. df1 = pd.DataFrame(
  820. {
  821. "time": to_datetime(
  822. [
  823. "20160525 13:30:00.020",
  824. "20160525 13:30:00.030",
  825. "20160525 13:30:00.040",
  826. "20160525 13:30:00.050",
  827. "20160525 13:30:00.060",
  828. ]
  829. ),
  830. "key": [1, 2, 1, 3, 2],
  831. "value1": [1.1, 1.2, 1.3, 1.4, 1.5],
  832. },
  833. columns=["time", "key", "value1"],
  834. )
  835. df2 = pd.DataFrame(
  836. {
  837. "time": to_datetime(
  838. [
  839. "20160525 13:30:00.015",
  840. "20160525 13:30:00.020",
  841. "20160525 13:30:00.025",
  842. "20160525 13:30:00.035",
  843. "20160525 13:30:00.040",
  844. "20160525 13:30:00.055",
  845. "20160525 13:30:00.060",
  846. "20160525 13:30:00.065",
  847. ]
  848. ),
  849. "key": [2, 1, 1, 3, 2, 1, 2, 3],
  850. "value2": [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8],
  851. },
  852. columns=["time", "key", "value2"],
  853. )
  854. result = merge_asof(df1, df2, on="time", by="key")
  855. expected = pd.DataFrame(
  856. {
  857. "time": to_datetime(
  858. [
  859. "20160525 13:30:00.020",
  860. "20160525 13:30:00.030",
  861. "20160525 13:30:00.040",
  862. "20160525 13:30:00.050",
  863. "20160525 13:30:00.060",
  864. ]
  865. ),
  866. "key": [1, 2, 1, 3, 2],
  867. "value1": [1.1, 1.2, 1.3, 1.4, 1.5],
  868. "value2": [2.2, 2.1, 2.3, 2.4, 2.7],
  869. },
  870. columns=["time", "key", "value1", "value2"],
  871. )
  872. tm.assert_frame_equal(result, expected)
  873. def test_on_float(self):
  874. # mimics how to determine the minimum-price variation
  875. df1 = pd.DataFrame(
  876. {
  877. "price": [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078],
  878. "symbol": list("ABCDEFG"),
  879. },
  880. columns=["symbol", "price"],
  881. )
  882. df2 = pd.DataFrame(
  883. {"price": [0.0, 1.0, 100.0], "mpv": [0.0001, 0.01, 0.05]},
  884. columns=["price", "mpv"],
  885. )
  886. df1 = df1.sort_values("price").reset_index(drop=True)
  887. result = merge_asof(df1, df2, on="price")
  888. expected = pd.DataFrame(
  889. {
  890. "symbol": list("BGACEDF"),
  891. "price": [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90],
  892. "mpv": [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05],
  893. },
  894. columns=["symbol", "price", "mpv"],
  895. )
  896. tm.assert_frame_equal(result, expected)
  897. def test_on_specialized_type(self, any_real_numpy_dtype):
  898. # see gh-13936
  899. dtype = np.dtype(any_real_numpy_dtype).type
  900. df1 = pd.DataFrame(
  901. {"value": [5, 2, 25, 100, 78, 120, 79], "symbol": list("ABCDEFG")},
  902. columns=["symbol", "value"],
  903. )
  904. df1.value = dtype(df1.value)
  905. df2 = pd.DataFrame(
  906. {"value": [0, 80, 120, 125], "result": list("xyzw")},
  907. columns=["value", "result"],
  908. )
  909. df2.value = dtype(df2.value)
  910. df1 = df1.sort_values("value").reset_index(drop=True)
  911. result = merge_asof(df1, df2, on="value")
  912. expected = pd.DataFrame(
  913. {
  914. "symbol": list("BACEGDF"),
  915. "value": [2, 5, 25, 78, 79, 100, 120],
  916. "result": list("xxxxxyz"),
  917. },
  918. columns=["symbol", "value", "result"],
  919. )
  920. expected.value = dtype(expected.value)
  921. tm.assert_frame_equal(result, expected)
  922. def test_on_specialized_type_by_int(self, any_real_numpy_dtype):
  923. # see gh-13936
  924. dtype = np.dtype(any_real_numpy_dtype).type
  925. df1 = pd.DataFrame(
  926. {
  927. "value": [5, 2, 25, 100, 78, 120, 79],
  928. "key": [1, 2, 3, 2, 3, 1, 2],
  929. "symbol": list("ABCDEFG"),
  930. },
  931. columns=["symbol", "key", "value"],
  932. )
  933. df1.value = dtype(df1.value)
  934. df2 = pd.DataFrame(
  935. {"value": [0, 80, 120, 125], "key": [1, 2, 2, 3], "result": list("xyzw")},
  936. columns=["value", "key", "result"],
  937. )
  938. df2.value = dtype(df2.value)
  939. df1 = df1.sort_values("value").reset_index(drop=True)
  940. result = merge_asof(df1, df2, on="value", by="key")
  941. expected = pd.DataFrame(
  942. {
  943. "symbol": list("BACEGDF"),
  944. "key": [2, 1, 3, 3, 2, 2, 1],
  945. "value": [2, 5, 25, 78, 79, 100, 120],
  946. "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"],
  947. },
  948. columns=["symbol", "key", "value", "result"],
  949. )
  950. expected.value = dtype(expected.value)
  951. tm.assert_frame_equal(result, expected)
  952. def test_on_float_by_int(self):
  953. # type specialize both "by" and "on" parameters
  954. df1 = pd.DataFrame(
  955. {
  956. "symbol": list("AAABBBCCC"),
  957. "exch": [1, 2, 3, 1, 2, 3, 1, 2, 3],
  958. "price": [
  959. 3.26,
  960. 3.2599,
  961. 3.2598,
  962. 12.58,
  963. 12.59,
  964. 12.5,
  965. 378.15,
  966. 378.2,
  967. 378.25,
  968. ],
  969. },
  970. columns=["symbol", "exch", "price"],
  971. )
  972. df2 = pd.DataFrame(
  973. {
  974. "exch": [1, 1, 1, 2, 2, 2, 3, 3, 3],
  975. "price": [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0],
  976. "mpv": [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0],
  977. },
  978. columns=["exch", "price", "mpv"],
  979. )
  980. df1 = df1.sort_values("price").reset_index(drop=True)
  981. df2 = df2.sort_values("price").reset_index(drop=True)
  982. result = merge_asof(df1, df2, on="price", by="exch")
  983. expected = pd.DataFrame(
  984. {
  985. "symbol": list("AAABBBCCC"),
  986. "exch": [3, 2, 1, 3, 1, 2, 1, 2, 3],
  987. "price": [
  988. 3.2598,
  989. 3.2599,
  990. 3.26,
  991. 12.5,
  992. 12.58,
  993. 12.59,
  994. 378.15,
  995. 378.2,
  996. 378.25,
  997. ],
  998. "mpv": [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25],
  999. },
  1000. columns=["symbol", "exch", "price", "mpv"],
  1001. )
  1002. tm.assert_frame_equal(result, expected)
  1003. def test_merge_datatype_error_raises(self):
  1004. msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
  1005. left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]})
  1006. right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]})
  1007. with pytest.raises(MergeError, match=msg):
  1008. merge_asof(left, right, on="a")
  1009. def test_merge_datatype_categorical_error_raises(self):
  1010. msg = (
  1011. r"incompatible merge keys \[0\] .* both sides category, "
  1012. "but not equal ones"
  1013. )
  1014. left = pd.DataFrame(
  1015. {"left_val": [1, 5, 10], "a": pd.Categorical(["a", "b", "c"])}
  1016. )
  1017. right = pd.DataFrame(
  1018. {
  1019. "right_val": [1, 2, 3, 6, 7],
  1020. "a": pd.Categorical(["a", "X", "c", "X", "b"]),
  1021. }
  1022. )
  1023. with pytest.raises(MergeError, match=msg):
  1024. merge_asof(left, right, on="a")
  1025. def test_merge_groupby_multiple_column_with_categorical_column(self):
  1026. # GH 16454
  1027. df = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])})
  1028. result = merge_asof(df, df, on="x", by=["y", "z"])
  1029. expected = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])})
  1030. tm.assert_frame_equal(result, expected)
  1031. @pytest.mark.parametrize(
  1032. "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"]
  1033. )
  1034. @pytest.mark.parametrize("side", ["left", "right"])
  1035. def test_merge_on_nans(self, func, side):
  1036. # GH 23189
  1037. msg = f"Merge keys contain null values on {side} side"
  1038. nulls = func([1.0, 5.0, np.nan])
  1039. non_nulls = func([1.0, 5.0, 10.0])
  1040. df_null = pd.DataFrame({"a": nulls, "left_val": ["a", "b", "c"]})
  1041. df = pd.DataFrame({"a": non_nulls, "right_val": [1, 6, 11]})
  1042. with pytest.raises(ValueError, match=msg):
  1043. if side == "left":
  1044. merge_asof(df_null, df, on="a")
  1045. else:
  1046. merge_asof(df, df_null, on="a")
  1047. def test_by_nullable(self, any_numeric_ea_dtype):
  1048. # Note: this test passes if instead of using pd.array we use
  1049. # np.array([np.nan, 1]). Other than that, I (@jbrockmendel)
  1050. # have NO IDEA what the expected behavior is.
  1051. # TODO(GH#32306): may be relevant to the expected behavior here.
  1052. arr = pd.array([pd.NA, 0, 1], dtype=any_numeric_ea_dtype)
  1053. if arr.dtype.kind in ["i", "u"]:
  1054. max_val = np.iinfo(arr.dtype.numpy_dtype).max
  1055. else:
  1056. max_val = np.finfo(arr.dtype.numpy_dtype).max
  1057. # set value s.t. (at least for integer dtypes) arr._values_for_argsort
  1058. # is not an injection
  1059. arr[2] = max_val
  1060. left = pd.DataFrame(
  1061. {
  1062. "by_col1": arr,
  1063. "by_col2": ["HELLO", "To", "You"],
  1064. "on_col": [2, 4, 6],
  1065. "value": ["a", "c", "e"],
  1066. }
  1067. )
  1068. right = pd.DataFrame(
  1069. {
  1070. "by_col1": arr,
  1071. "by_col2": ["WORLD", "Wide", "Web"],
  1072. "on_col": [1, 2, 6],
  1073. "value": ["b", "d", "f"],
  1074. }
  1075. )
  1076. result = merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col")
  1077. expected = pd.DataFrame(
  1078. {
  1079. "by_col1": arr,
  1080. "by_col2": ["HELLO", "To", "You"],
  1081. "on_col": [2, 4, 6],
  1082. "value_x": ["a", "c", "e"],
  1083. }
  1084. )
  1085. expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object)
  1086. tm.assert_frame_equal(result, expected)
  1087. def test_merge_by_col_tz_aware(self):
  1088. # GH 21184
  1089. left = pd.DataFrame(
  1090. {
  1091. "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"),
  1092. "on_col": [2],
  1093. "values": ["a"],
  1094. }
  1095. )
  1096. right = pd.DataFrame(
  1097. {
  1098. "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"),
  1099. "on_col": [1],
  1100. "values": ["b"],
  1101. }
  1102. )
  1103. result = merge_asof(left, right, by="by_col", on="on_col")
  1104. expected = pd.DataFrame(
  1105. [[pd.Timestamp("2018-01-01", tz="UTC"), 2, "a", "b"]],
  1106. columns=["by_col", "on_col", "values_x", "values_y"],
  1107. )
  1108. tm.assert_frame_equal(result, expected)
  1109. def test_by_mixed_tz_aware(self):
  1110. # GH 26649
  1111. left = pd.DataFrame(
  1112. {
  1113. "by_col1": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"),
  1114. "by_col2": ["HELLO"],
  1115. "on_col": [2],
  1116. "value": ["a"],
  1117. }
  1118. )
  1119. right = pd.DataFrame(
  1120. {
  1121. "by_col1": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"),
  1122. "by_col2": ["WORLD"],
  1123. "on_col": [1],
  1124. "value": ["b"],
  1125. }
  1126. )
  1127. result = merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col")
  1128. expected = pd.DataFrame(
  1129. [[pd.Timestamp("2018-01-01", tz="UTC"), "HELLO", 2, "a"]],
  1130. columns=["by_col1", "by_col2", "on_col", "value_x"],
  1131. )
  1132. expected["value_y"] = np.array([np.nan], dtype=object)
  1133. tm.assert_frame_equal(result, expected)
  1134. def test_timedelta_tolerance_nearest(self, unit):
  1135. # GH 27642
  1136. if unit == "s":
  1137. pytest.skip(
  1138. "This test is invalid with unit='s' because that would "
  1139. "round left['time']"
  1140. )
  1141. left = pd.DataFrame(
  1142. list(zip([0, 5, 10, 15, 20, 25], [0, 1, 2, 3, 4, 5])),
  1143. columns=["time", "left"],
  1144. )
  1145. left["time"] = pd.to_timedelta(left["time"], "ms").astype(f"m8[{unit}]")
  1146. right = pd.DataFrame(
  1147. list(zip([0, 3, 9, 12, 15, 18], [0, 1, 2, 3, 4, 5])),
  1148. columns=["time", "right"],
  1149. )
  1150. right["time"] = pd.to_timedelta(right["time"], "ms").astype(f"m8[{unit}]")
  1151. expected = pd.DataFrame(
  1152. list(
  1153. zip(
  1154. [0, 5, 10, 15, 20, 25],
  1155. [0, 1, 2, 3, 4, 5],
  1156. [0, np.nan, 2, 4, np.nan, np.nan],
  1157. )
  1158. ),
  1159. columns=["time", "left", "right"],
  1160. )
  1161. expected["time"] = pd.to_timedelta(expected["time"], "ms").astype(f"m8[{unit}]")
  1162. result = merge_asof(
  1163. left, right, on="time", tolerance=Timedelta("1ms"), direction="nearest"
  1164. )
  1165. tm.assert_frame_equal(result, expected)
  1166. # TODO: any_int_dtype; causes failures in _get_join_indexers
  1167. def test_int_type_tolerance(self, any_int_numpy_dtype):
  1168. # GH #28870
  1169. left = pd.DataFrame({"a": [0, 10, 20], "left_val": [1, 2, 3]})
  1170. right = pd.DataFrame({"a": [5, 15, 25], "right_val": [1, 2, 3]})
  1171. left["a"] = left["a"].astype(any_int_numpy_dtype)
  1172. right["a"] = right["a"].astype(any_int_numpy_dtype)
  1173. expected = pd.DataFrame(
  1174. {"a": [0, 10, 20], "left_val": [1, 2, 3], "right_val": [np.nan, 1.0, 2.0]}
  1175. )
  1176. expected["a"] = expected["a"].astype(any_int_numpy_dtype)
  1177. result = merge_asof(left, right, on="a", tolerance=10)
  1178. tm.assert_frame_equal(result, expected)
  1179. def test_merge_index_column_tz(self):
  1180. # GH 29864
  1181. index = pd.date_range("2019-10-01", freq="30min", periods=5, tz="UTC")
  1182. left = pd.DataFrame([0.9, 0.8, 0.7, 0.6], columns=["xyz"], index=index[1:])
  1183. right = pd.DataFrame({"from_date": index, "abc": [2.46] * 4 + [2.19]})
  1184. result = merge_asof(
  1185. left=left, right=right, left_index=True, right_on=["from_date"]
  1186. )
  1187. expected = pd.DataFrame(
  1188. {
  1189. "xyz": [0.9, 0.8, 0.7, 0.6],
  1190. "from_date": index[1:],
  1191. "abc": [2.46] * 3 + [2.19],
  1192. },
  1193. index=pd.date_range(
  1194. "2019-10-01 00:30:00", freq="30min", periods=4, tz="UTC"
  1195. ),
  1196. )
  1197. tm.assert_frame_equal(result, expected)
  1198. result = merge_asof(
  1199. left=right, right=left, right_index=True, left_on=["from_date"]
  1200. )
  1201. expected = pd.DataFrame(
  1202. {
  1203. "from_date": index,
  1204. "abc": [2.46] * 4 + [2.19],
  1205. "xyz": [np.nan, 0.9, 0.8, 0.7, 0.6],
  1206. },
  1207. index=Index([0, 1, 2, 3, 4]),
  1208. )
  1209. tm.assert_frame_equal(result, expected)
  1210. def test_left_index_right_index_tolerance(self, unit):
  1211. # https://github.com/pandas-dev/pandas/issues/35558
  1212. if unit == "s":
  1213. pytest.skip(
  1214. "This test is invalid with unit='s' because that would round dr1"
  1215. )
  1216. dr1 = pd.date_range(
  1217. start="1/1/2020", end="1/20/2020", freq="2D", unit=unit
  1218. ) + Timedelta(seconds=0.4).as_unit(unit)
  1219. dr2 = pd.date_range(start="1/1/2020", end="2/1/2020", unit=unit)
  1220. df1 = pd.DataFrame({"val1": "foo"}, index=pd.DatetimeIndex(dr1))
  1221. df2 = pd.DataFrame({"val2": "bar"}, index=pd.DatetimeIndex(dr2))
  1222. expected = pd.DataFrame(
  1223. {"val1": "foo", "val2": "bar"}, index=pd.DatetimeIndex(dr1)
  1224. )
  1225. result = merge_asof(
  1226. df1,
  1227. df2,
  1228. left_index=True,
  1229. right_index=True,
  1230. tolerance=Timedelta(seconds=0.5),
  1231. )
  1232. tm.assert_frame_equal(result, expected)
  1233. @pytest.mark.parametrize(
  1234. "kwargs", [{"on": "x"}, {"left_index": True, "right_index": True}]
  1235. )
  1236. @pytest.mark.parametrize(
  1237. "data",
  1238. [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]],
  1239. )
  1240. def test_merge_asof_non_numerical_dtype(kwargs, data):
  1241. # GH#29130
  1242. left = pd.DataFrame({"x": data}, index=data)
  1243. right = pd.DataFrame({"x": data}, index=data)
  1244. with pytest.raises(
  1245. MergeError,
  1246. match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
  1247. ):
  1248. merge_asof(left, right, **kwargs)
  1249. def test_merge_asof_non_numerical_dtype_object():
  1250. # GH#29130
  1251. left = pd.DataFrame({"a": ["12", "13", "15"], "left_val1": ["a", "b", "c"]})
  1252. right = pd.DataFrame({"a": ["a", "b", "c"], "left_val": ["d", "e", "f"]})
  1253. with pytest.raises(
  1254. MergeError,
  1255. match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
  1256. ):
  1257. merge_asof(
  1258. left,
  1259. right,
  1260. left_on="left_val1",
  1261. right_on="a",
  1262. left_by="a",
  1263. right_by="left_val",
  1264. )
  1265. @pytest.mark.parametrize(
  1266. "kwargs",
  1267. [
  1268. {"right_index": True, "left_index": True},
  1269. {"left_on": "left_time", "right_index": True},
  1270. {"left_index": True, "right_on": "right"},
  1271. ],
  1272. )
  1273. def test_merge_asof_index_behavior(kwargs):
  1274. # GH 33463
  1275. index = Index([1, 5, 10], name="test")
  1276. left = pd.DataFrame({"left": ["a", "b", "c"], "left_time": [1, 4, 10]}, index=index)
  1277. right = pd.DataFrame({"right": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
  1278. result = merge_asof(left, right, **kwargs)
  1279. expected = pd.DataFrame(
  1280. {"left": ["a", "b", "c"], "left_time": [1, 4, 10], "right": [1, 3, 7]},
  1281. index=index,
  1282. )
  1283. tm.assert_frame_equal(result, expected)
  1284. def test_merge_asof_numeri_column_in_index():
  1285. # GH#34488
  1286. left = pd.DataFrame({"b": [10, 11, 12]}, index=Index([1, 2, 3], name="a"))
  1287. right = pd.DataFrame({"c": [20, 21, 22]}, index=Index([0, 2, 3], name="a"))
  1288. result = merge_asof(left, right, left_on="a", right_on="a")
  1289. expected = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12], "c": [20, 21, 22]})
  1290. tm.assert_frame_equal(result, expected)
  1291. def test_merge_asof_numeri_column_in_multiindex():
  1292. # GH#34488
  1293. left = pd.DataFrame(
  1294. {"b": [10, 11, 12]},
  1295. index=pd.MultiIndex.from_arrays([[1, 2, 3], ["a", "b", "c"]], names=["a", "z"]),
  1296. )
  1297. right = pd.DataFrame(
  1298. {"c": [20, 21, 22]},
  1299. index=pd.MultiIndex.from_arrays([[1, 2, 3], ["x", "y", "z"]], names=["a", "y"]),
  1300. )
  1301. result = merge_asof(left, right, left_on="a", right_on="a")
  1302. expected = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12], "c": [20, 21, 22]})
  1303. tm.assert_frame_equal(result, expected)
  1304. def test_merge_asof_numeri_column_in_index_object_dtype():
  1305. # GH#34488
  1306. left = pd.DataFrame({"b": [10, 11, 12]}, index=Index(["1", "2", "3"], name="a"))
  1307. right = pd.DataFrame({"c": [20, 21, 22]}, index=Index(["m", "n", "o"], name="a"))
  1308. with pytest.raises(
  1309. MergeError,
  1310. match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
  1311. ):
  1312. merge_asof(left, right, left_on="a", right_on="a")
  1313. left = left.reset_index().set_index(["a", "b"])
  1314. right = right.reset_index().set_index(["a", "c"])
  1315. with pytest.raises(
  1316. MergeError,
  1317. match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
  1318. ):
  1319. merge_asof(left, right, left_on="a", right_on="a")
  1320. def test_merge_asof_array_as_on():
  1321. # GH#42844
  1322. right = pd.DataFrame(
  1323. {
  1324. "a": [2, 6],
  1325. "ts": [pd.Timestamp("2021/01/01 00:37"), pd.Timestamp("2021/01/01 01:40")],
  1326. }
  1327. )
  1328. ts_merge = pd.date_range(
  1329. start=pd.Timestamp("2021/01/01 00:00"), periods=3, freq="1h"
  1330. )
  1331. left = pd.DataFrame({"b": [4, 8, 7]})
  1332. result = merge_asof(
  1333. left,
  1334. right,
  1335. left_on=ts_merge,
  1336. right_on="ts",
  1337. allow_exact_matches=False,
  1338. direction="backward",
  1339. )
  1340. expected = pd.DataFrame({"b": [4, 8, 7], "a": [np.nan, 2, 6], "ts": ts_merge})
  1341. tm.assert_frame_equal(result, expected)
  1342. result = merge_asof(
  1343. right,
  1344. left,
  1345. left_on="ts",
  1346. right_on=ts_merge,
  1347. allow_exact_matches=False,
  1348. direction="backward",
  1349. )
  1350. expected = pd.DataFrame(
  1351. {
  1352. "a": [2, 6],
  1353. "ts": [pd.Timestamp("2021/01/01 00:37"), pd.Timestamp("2021/01/01 01:40")],
  1354. "b": [4, 8],
  1355. }
  1356. )
  1357. tm.assert_frame_equal(result, expected)
  1358. def test_merge_asof_raise_for_duplicate_columns():
  1359. # GH#50102
  1360. left = pd.DataFrame([[1, 2, "a"]], columns=["a", "a", "left_val"])
  1361. right = pd.DataFrame([[1, 1, 1]], columns=["a", "a", "right_val"])
  1362. with pytest.raises(ValueError, match="column label 'a'"):
  1363. merge_asof(left, right, on="a")
  1364. with pytest.raises(ValueError, match="column label 'a'"):
  1365. merge_asof(left, right, left_on="a", right_on="right_val")
  1366. with pytest.raises(ValueError, match="column label 'a'"):
  1367. merge_asof(left, right, left_on="left_val", right_on="a")