test_indexing.py 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973
  1. from datetime import timedelta
  2. import re
  3. import numpy as np
  4. import pytest
  5. from pandas.errors import (
  6. InvalidIndexError,
  7. PerformanceWarning,
  8. )
  9. import pandas as pd
  10. from pandas import (
  11. Categorical,
  12. Index,
  13. MultiIndex,
  14. date_range,
  15. )
  16. import pandas._testing as tm
  17. class TestSliceLocs:
  18. def test_slice_locs_partial(self, idx):
  19. sorted_idx, _ = idx.sortlevel(0)
  20. result = sorted_idx.slice_locs(("foo", "two"), ("qux", "one"))
  21. assert result == (1, 5)
  22. result = sorted_idx.slice_locs(None, ("qux", "one"))
  23. assert result == (0, 5)
  24. result = sorted_idx.slice_locs(("foo", "two"), None)
  25. assert result == (1, len(sorted_idx))
  26. result = sorted_idx.slice_locs("bar", "baz")
  27. assert result == (2, 4)
  28. def test_slice_locs(self):
  29. df = tm.makeTimeDataFrame()
  30. stacked = df.stack()
  31. idx = stacked.index
  32. slob = slice(*idx.slice_locs(df.index[5], df.index[15]))
  33. sliced = stacked[slob]
  34. expected = df[5:16].stack()
  35. tm.assert_almost_equal(sliced.values, expected.values)
  36. slob = slice(
  37. *idx.slice_locs(
  38. df.index[5] + timedelta(seconds=30),
  39. df.index[15] - timedelta(seconds=30),
  40. )
  41. )
  42. sliced = stacked[slob]
  43. expected = df[6:15].stack()
  44. tm.assert_almost_equal(sliced.values, expected.values)
  45. def test_slice_locs_with_type_mismatch(self):
  46. df = tm.makeTimeDataFrame()
  47. stacked = df.stack()
  48. idx = stacked.index
  49. with pytest.raises(TypeError, match="^Level type mismatch"):
  50. idx.slice_locs((1, 3))
  51. with pytest.raises(TypeError, match="^Level type mismatch"):
  52. idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2))
  53. df = tm.makeCustomDataframe(5, 5)
  54. stacked = df.stack()
  55. idx = stacked.index
  56. with pytest.raises(TypeError, match="^Level type mismatch"):
  57. idx.slice_locs(timedelta(seconds=30))
  58. # TODO: Try creating a UnicodeDecodeError in exception message
  59. with pytest.raises(TypeError, match="^Level type mismatch"):
  60. idx.slice_locs(df.index[1], (16, "a"))
  61. def test_slice_locs_not_sorted(self):
  62. index = MultiIndex(
  63. levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))],
  64. codes=[
  65. np.array([0, 0, 1, 2, 2, 2, 3, 3]),
  66. np.array([0, 1, 0, 0, 0, 1, 0, 1]),
  67. np.array([1, 0, 1, 1, 0, 0, 1, 0]),
  68. ],
  69. )
  70. msg = "[Kk]ey length.*greater than MultiIndex lexsort depth"
  71. with pytest.raises(KeyError, match=msg):
  72. index.slice_locs((1, 0, 1), (2, 1, 0))
  73. # works
  74. sorted_index, _ = index.sortlevel(0)
  75. # should there be a test case here???
  76. sorted_index.slice_locs((1, 0, 1), (2, 1, 0))
  77. def test_slice_locs_not_contained(self):
  78. # some searchsorted action
  79. index = MultiIndex(
  80. levels=[[0, 2, 4, 6], [0, 2, 4]],
  81. codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]],
  82. )
  83. result = index.slice_locs((1, 0), (5, 2))
  84. assert result == (3, 6)
  85. result = index.slice_locs(1, 5)
  86. assert result == (3, 6)
  87. result = index.slice_locs((2, 2), (5, 2))
  88. assert result == (3, 6)
  89. result = index.slice_locs(2, 5)
  90. assert result == (3, 6)
  91. result = index.slice_locs((1, 0), (6, 3))
  92. assert result == (3, 8)
  93. result = index.slice_locs(-1, 10)
  94. assert result == (0, len(index))
  95. @pytest.mark.parametrize(
  96. "index_arr,expected,start_idx,end_idx",
  97. [
  98. ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None),
  99. ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"),
  100. ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")),
  101. ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None),
  102. ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"),
  103. ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")),
  104. ],
  105. )
  106. def test_slice_locs_with_missing_value(
  107. self, index_arr, expected, start_idx, end_idx
  108. ):
  109. # issue 19132
  110. idx = MultiIndex.from_arrays(index_arr)
  111. result = idx.slice_locs(start=start_idx, end=end_idx)
  112. assert result == expected
  113. class TestPutmask:
  114. def test_putmask_with_wrong_mask(self, idx):
  115. # GH18368
  116. msg = "putmask: mask and data must be the same size"
  117. with pytest.raises(ValueError, match=msg):
  118. idx.putmask(np.ones(len(idx) + 1, np.bool_), 1)
  119. with pytest.raises(ValueError, match=msg):
  120. idx.putmask(np.ones(len(idx) - 1, np.bool_), 1)
  121. with pytest.raises(ValueError, match=msg):
  122. idx.putmask("foo", 1)
  123. def test_putmask_multiindex_other(self):
  124. # GH#43212 `value` is also a MultiIndex
  125. left = MultiIndex.from_tuples([(np.nan, 6), (np.nan, 6), ("a", 4)])
  126. right = MultiIndex.from_tuples([("a", 1), ("a", 1), ("d", 1)])
  127. mask = np.array([True, True, False])
  128. result = left.putmask(mask, right)
  129. expected = MultiIndex.from_tuples([right[0], right[1], left[2]])
  130. tm.assert_index_equal(result, expected)
  131. def test_putmask_keep_dtype(self, any_numeric_ea_dtype):
  132. # GH#49830
  133. midx = MultiIndex.from_arrays(
  134. [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]]
  135. )
  136. midx2 = MultiIndex.from_arrays(
  137. [pd.Series([5, 6, 7], dtype=any_numeric_ea_dtype), [-1, -2, -3]]
  138. )
  139. result = midx.putmask([True, False, False], midx2)
  140. expected = MultiIndex.from_arrays(
  141. [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]]
  142. )
  143. tm.assert_index_equal(result, expected)
  144. def test_putmask_keep_dtype_shorter_value(self, any_numeric_ea_dtype):
  145. # GH#49830
  146. midx = MultiIndex.from_arrays(
  147. [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]]
  148. )
  149. midx2 = MultiIndex.from_arrays(
  150. [pd.Series([5], dtype=any_numeric_ea_dtype), [-1]]
  151. )
  152. result = midx.putmask([True, False, False], midx2)
  153. expected = MultiIndex.from_arrays(
  154. [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]]
  155. )
  156. tm.assert_index_equal(result, expected)
  157. class TestGetIndexer:
  158. def test_get_indexer(self):
  159. major_axis = Index(np.arange(4))
  160. minor_axis = Index(np.arange(2))
  161. major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp)
  162. minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp)
  163. index = MultiIndex(
  164. levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]
  165. )
  166. idx1 = index[:5]
  167. idx2 = index[[1, 3, 5]]
  168. r1 = idx1.get_indexer(idx2)
  169. tm.assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp))
  170. r1 = idx2.get_indexer(idx1, method="pad")
  171. e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp)
  172. tm.assert_almost_equal(r1, e1)
  173. r2 = idx2.get_indexer(idx1[::-1], method="pad")
  174. tm.assert_almost_equal(r2, e1[::-1])
  175. rffill1 = idx2.get_indexer(idx1, method="ffill")
  176. tm.assert_almost_equal(r1, rffill1)
  177. r1 = idx2.get_indexer(idx1, method="backfill")
  178. e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp)
  179. tm.assert_almost_equal(r1, e1)
  180. r2 = idx2.get_indexer(idx1[::-1], method="backfill")
  181. tm.assert_almost_equal(r2, e1[::-1])
  182. rbfill1 = idx2.get_indexer(idx1, method="bfill")
  183. tm.assert_almost_equal(r1, rbfill1)
  184. # pass non-MultiIndex
  185. r1 = idx1.get_indexer(idx2.values)
  186. rexp1 = idx1.get_indexer(idx2)
  187. tm.assert_almost_equal(r1, rexp1)
  188. r1 = idx1.get_indexer([1, 2, 3])
  189. assert (r1 == [-1, -1, -1]).all()
  190. # create index with duplicates
  191. idx1 = Index(list(range(10)) + list(range(10)))
  192. idx2 = Index(list(range(20)))
  193. msg = "Reindexing only valid with uniquely valued Index objects"
  194. with pytest.raises(InvalidIndexError, match=msg):
  195. idx1.get_indexer(idx2)
  196. def test_get_indexer_nearest(self):
  197. midx = MultiIndex.from_tuples([("a", 1), ("b", 2)])
  198. msg = (
  199. "method='nearest' not implemented yet for MultiIndex; "
  200. "see GitHub issue 9365"
  201. )
  202. with pytest.raises(NotImplementedError, match=msg):
  203. midx.get_indexer(["a"], method="nearest")
  204. msg = "tolerance not implemented yet for MultiIndex"
  205. with pytest.raises(NotImplementedError, match=msg):
  206. midx.get_indexer(["a"], method="pad", tolerance=2)
  207. def test_get_indexer_categorical_time(self):
  208. # https://github.com/pandas-dev/pandas/issues/21390
  209. midx = MultiIndex.from_product(
  210. [
  211. Categorical(["a", "b", "c"]),
  212. Categorical(date_range("2012-01-01", periods=3, freq="H")),
  213. ]
  214. )
  215. result = midx.get_indexer(midx)
  216. tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp))
  217. @pytest.mark.parametrize(
  218. "index_arr,labels,expected",
  219. [
  220. (
  221. [[1, np.nan, 2], [3, 4, 5]],
  222. [1, np.nan, 2],
  223. np.array([-1, -1, -1], dtype=np.intp),
  224. ),
  225. ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)),
  226. ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)),
  227. (
  228. [[1, 2, 3], [np.nan, 4, 5]],
  229. [np.nan, 4, 5],
  230. np.array([-1, -1, -1], dtype=np.intp),
  231. ),
  232. ],
  233. )
  234. def test_get_indexer_with_missing_value(self, index_arr, labels, expected):
  235. # issue 19132
  236. idx = MultiIndex.from_arrays(index_arr)
  237. result = idx.get_indexer(labels)
  238. tm.assert_numpy_array_equal(result, expected)
  239. def test_get_indexer_methods(self):
  240. # https://github.com/pandas-dev/pandas/issues/29896
  241. # test getting an indexer for another index with different methods
  242. # confirms that getting an indexer without a filling method, getting an
  243. # indexer and backfilling, and getting an indexer and padding all behave
  244. # correctly in the case where all of the target values fall in between
  245. # several levels in the MultiIndex into which they are getting an indexer
  246. #
  247. # visually, the MultiIndexes used in this test are:
  248. # mult_idx_1:
  249. # 0: -1 0
  250. # 1: 2
  251. # 2: 3
  252. # 3: 4
  253. # 4: 0 0
  254. # 5: 2
  255. # 6: 3
  256. # 7: 4
  257. # 8: 1 0
  258. # 9: 2
  259. # 10: 3
  260. # 11: 4
  261. #
  262. # mult_idx_2:
  263. # 0: 0 1
  264. # 1: 3
  265. # 2: 4
  266. mult_idx_1 = MultiIndex.from_product([[-1, 0, 1], [0, 2, 3, 4]])
  267. mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]])
  268. indexer = mult_idx_1.get_indexer(mult_idx_2)
  269. expected = np.array([-1, 6, 7], dtype=indexer.dtype)
  270. tm.assert_almost_equal(expected, indexer)
  271. backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill")
  272. expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype)
  273. tm.assert_almost_equal(expected, backfill_indexer)
  274. # ensure the legacy "bfill" option functions identically to "backfill"
  275. backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill")
  276. expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype)
  277. tm.assert_almost_equal(expected, backfill_indexer)
  278. pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad")
  279. expected = np.array([4, 6, 7], dtype=pad_indexer.dtype)
  280. tm.assert_almost_equal(expected, pad_indexer)
  281. # ensure the legacy "ffill" option functions identically to "pad"
  282. pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill")
  283. expected = np.array([4, 6, 7], dtype=pad_indexer.dtype)
  284. tm.assert_almost_equal(expected, pad_indexer)
  285. def test_get_indexer_three_or_more_levels(self):
  286. # https://github.com/pandas-dev/pandas/issues/29896
  287. # tests get_indexer() on MultiIndexes with 3+ levels
  288. # visually, these are
  289. # mult_idx_1:
  290. # 0: 1 2 5
  291. # 1: 7
  292. # 2: 4 5
  293. # 3: 7
  294. # 4: 6 5
  295. # 5: 7
  296. # 6: 3 2 5
  297. # 7: 7
  298. # 8: 4 5
  299. # 9: 7
  300. # 10: 6 5
  301. # 11: 7
  302. #
  303. # mult_idx_2:
  304. # 0: 1 1 8
  305. # 1: 1 5 9
  306. # 2: 1 6 7
  307. # 3: 2 1 6
  308. # 4: 2 7 6
  309. # 5: 2 7 8
  310. # 6: 3 6 8
  311. mult_idx_1 = MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]])
  312. mult_idx_2 = MultiIndex.from_tuples(
  313. [
  314. (1, 1, 8),
  315. (1, 5, 9),
  316. (1, 6, 7),
  317. (2, 1, 6),
  318. (2, 7, 7),
  319. (2, 7, 8),
  320. (3, 6, 8),
  321. ]
  322. )
  323. # sanity check
  324. assert mult_idx_1.is_monotonic_increasing
  325. assert mult_idx_1.is_unique
  326. assert mult_idx_2.is_monotonic_increasing
  327. assert mult_idx_2.is_unique
  328. # show the relationships between the two
  329. assert mult_idx_2[0] < mult_idx_1[0]
  330. assert mult_idx_1[3] < mult_idx_2[1] < mult_idx_1[4]
  331. assert mult_idx_1[5] == mult_idx_2[2]
  332. assert mult_idx_1[5] < mult_idx_2[3] < mult_idx_1[6]
  333. assert mult_idx_1[5] < mult_idx_2[4] < mult_idx_1[6]
  334. assert mult_idx_1[5] < mult_idx_2[5] < mult_idx_1[6]
  335. assert mult_idx_1[-1] < mult_idx_2[6]
  336. indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2)
  337. expected = np.array([-1, -1, 5, -1, -1, -1, -1], dtype=indexer_no_fill.dtype)
  338. tm.assert_almost_equal(expected, indexer_no_fill)
  339. # test with backfilling
  340. indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill")
  341. expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype=indexer_backfilled.dtype)
  342. tm.assert_almost_equal(expected, indexer_backfilled)
  343. # now, the same thing, but forward-filled (aka "padded")
  344. indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method="pad")
  345. expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype=indexer_padded.dtype)
  346. tm.assert_almost_equal(expected, indexer_padded)
  347. # now, do the indexing in the other direction
  348. assert mult_idx_2[0] < mult_idx_1[0] < mult_idx_2[1]
  349. assert mult_idx_2[0] < mult_idx_1[1] < mult_idx_2[1]
  350. assert mult_idx_2[0] < mult_idx_1[2] < mult_idx_2[1]
  351. assert mult_idx_2[0] < mult_idx_1[3] < mult_idx_2[1]
  352. assert mult_idx_2[1] < mult_idx_1[4] < mult_idx_2[2]
  353. assert mult_idx_2[2] == mult_idx_1[5]
  354. assert mult_idx_2[5] < mult_idx_1[6] < mult_idx_2[6]
  355. assert mult_idx_2[5] < mult_idx_1[7] < mult_idx_2[6]
  356. assert mult_idx_2[5] < mult_idx_1[8] < mult_idx_2[6]
  357. assert mult_idx_2[5] < mult_idx_1[9] < mult_idx_2[6]
  358. assert mult_idx_2[5] < mult_idx_1[10] < mult_idx_2[6]
  359. assert mult_idx_2[5] < mult_idx_1[11] < mult_idx_2[6]
  360. indexer = mult_idx_2.get_indexer(mult_idx_1)
  361. expected = np.array(
  362. [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], dtype=indexer.dtype
  363. )
  364. tm.assert_almost_equal(expected, indexer)
  365. backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill")
  366. expected = np.array(
  367. [1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype=backfill_indexer.dtype
  368. )
  369. tm.assert_almost_equal(expected, backfill_indexer)
  370. pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad")
  371. expected = np.array(
  372. [0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype
  373. )
  374. tm.assert_almost_equal(expected, pad_indexer)
  375. def test_get_indexer_crossing_levels(self):
  376. # https://github.com/pandas-dev/pandas/issues/29896
  377. # tests a corner case with get_indexer() with MultiIndexes where, when we
  378. # need to "carry" across levels, proper tuple ordering is respected
  379. #
  380. # the MultiIndexes used in this test, visually, are:
  381. # mult_idx_1:
  382. # 0: 1 1 1 1
  383. # 1: 2
  384. # 2: 2 1
  385. # 3: 2
  386. # 4: 1 2 1 1
  387. # 5: 2
  388. # 6: 2 1
  389. # 7: 2
  390. # 8: 2 1 1 1
  391. # 9: 2
  392. # 10: 2 1
  393. # 11: 2
  394. # 12: 2 2 1 1
  395. # 13: 2
  396. # 14: 2 1
  397. # 15: 2
  398. #
  399. # mult_idx_2:
  400. # 0: 1 3 2 2
  401. # 1: 2 3 2 2
  402. mult_idx_1 = MultiIndex.from_product([[1, 2]] * 4)
  403. mult_idx_2 = MultiIndex.from_tuples([(1, 3, 2, 2), (2, 3, 2, 2)])
  404. # show the tuple orderings, which get_indexer() should respect
  405. assert mult_idx_1[7] < mult_idx_2[0] < mult_idx_1[8]
  406. assert mult_idx_1[-1] < mult_idx_2[1]
  407. indexer = mult_idx_1.get_indexer(mult_idx_2)
  408. expected = np.array([-1, -1], dtype=indexer.dtype)
  409. tm.assert_almost_equal(expected, indexer)
  410. backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill")
  411. expected = np.array([8, -1], dtype=backfill_indexer.dtype)
  412. tm.assert_almost_equal(expected, backfill_indexer)
  413. pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill")
  414. expected = np.array([7, 15], dtype=pad_indexer.dtype)
  415. tm.assert_almost_equal(expected, pad_indexer)
  416. def test_get_indexer_kwarg_validation(self):
  417. # GH#41918
  418. mi = MultiIndex.from_product([range(3), ["A", "B"]])
  419. msg = "limit argument only valid if doing pad, backfill or nearest"
  420. with pytest.raises(ValueError, match=msg):
  421. mi.get_indexer(mi[:-1], limit=4)
  422. msg = "tolerance argument only valid if doing pad, backfill or nearest"
  423. with pytest.raises(ValueError, match=msg):
  424. mi.get_indexer(mi[:-1], tolerance="piano")
  425. def test_get_indexer_nan(self):
  426. # GH#37222
  427. idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"])
  428. idx2 = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"])
  429. expected = np.array([-1, 1])
  430. result = idx2.get_indexer(idx1)
  431. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  432. result = idx1.get_indexer(idx2)
  433. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  434. def test_getitem(idx):
  435. # scalar
  436. assert idx[2] == ("bar", "one")
  437. # slice
  438. result = idx[2:5]
  439. expected = idx[[2, 3, 4]]
  440. assert result.equals(expected)
  441. # boolean
  442. result = idx[[True, False, True, False, True, True]]
  443. result2 = idx[np.array([True, False, True, False, True, True])]
  444. expected = idx[[0, 2, 4, 5]]
  445. assert result.equals(expected)
  446. assert result2.equals(expected)
  447. def test_getitem_group_select(idx):
  448. sorted_idx, _ = idx.sortlevel(0)
  449. assert sorted_idx.get_loc("baz") == slice(3, 4)
  450. assert sorted_idx.get_loc("foo") == slice(0, 2)
  451. @pytest.mark.parametrize("ind1", [[True] * 5, Index([True] * 5)])
  452. @pytest.mark.parametrize(
  453. "ind2",
  454. [[True, False, True, False, False], Index([True, False, True, False, False])],
  455. )
  456. def test_getitem_bool_index_all(ind1, ind2):
  457. # GH#22533
  458. idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), (40, 4), (50, 5)])
  459. tm.assert_index_equal(idx[ind1], idx)
  460. expected = MultiIndex.from_tuples([(10, 1), (30, 3)])
  461. tm.assert_index_equal(idx[ind2], expected)
  462. @pytest.mark.parametrize("ind1", [[True], Index([True])])
  463. @pytest.mark.parametrize("ind2", [[False], Index([False])])
  464. def test_getitem_bool_index_single(ind1, ind2):
  465. # GH#22533
  466. idx = MultiIndex.from_tuples([(10, 1)])
  467. tm.assert_index_equal(idx[ind1], idx)
  468. expected = MultiIndex(
  469. levels=[np.array([], dtype=np.int64), np.array([], dtype=np.int64)],
  470. codes=[[], []],
  471. )
  472. tm.assert_index_equal(idx[ind2], expected)
  473. class TestGetLoc:
  474. def test_get_loc(self, idx):
  475. assert idx.get_loc(("foo", "two")) == 1
  476. assert idx.get_loc(("baz", "two")) == 3
  477. with pytest.raises(KeyError, match=r"^15$"):
  478. idx.get_loc(("bar", "two"))
  479. with pytest.raises(KeyError, match=r"^'quux'$"):
  480. idx.get_loc("quux")
  481. # 3 levels
  482. index = MultiIndex(
  483. levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))],
  484. codes=[
  485. np.array([0, 0, 1, 2, 2, 2, 3, 3]),
  486. np.array([0, 1, 0, 0, 0, 1, 0, 1]),
  487. np.array([1, 0, 1, 1, 0, 0, 1, 0]),
  488. ],
  489. )
  490. with pytest.raises(KeyError, match=r"^\(1, 1\)$"):
  491. index.get_loc((1, 1))
  492. assert index.get_loc((2, 0)) == slice(3, 5)
  493. def test_get_loc_duplicates(self):
  494. index = Index([2, 2, 2, 2])
  495. result = index.get_loc(2)
  496. expected = slice(0, 4)
  497. assert result == expected
  498. index = Index(["c", "a", "a", "b", "b"])
  499. rs = index.get_loc("c")
  500. xp = 0
  501. assert rs == xp
  502. with pytest.raises(KeyError, match="2"):
  503. index.get_loc(2)
  504. def test_get_loc_level(self):
  505. index = MultiIndex(
  506. levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))],
  507. codes=[
  508. np.array([0, 0, 1, 2, 2, 2, 3, 3]),
  509. np.array([0, 1, 0, 0, 0, 1, 0, 1]),
  510. np.array([1, 0, 1, 1, 0, 0, 1, 0]),
  511. ],
  512. )
  513. loc, new_index = index.get_loc_level((0, 1))
  514. expected = slice(1, 2)
  515. exp_index = index[expected].droplevel(0).droplevel(0)
  516. assert loc == expected
  517. assert new_index.equals(exp_index)
  518. loc, new_index = index.get_loc_level((0, 1, 0))
  519. expected = 1
  520. assert loc == expected
  521. assert new_index is None
  522. with pytest.raises(KeyError, match=r"^\(2, 2\)$"):
  523. index.get_loc_level((2, 2))
  524. # GH 22221: unused label
  525. with pytest.raises(KeyError, match=r"^2$"):
  526. index.drop(2).get_loc_level(2)
  527. # Unused label on unsorted level:
  528. with pytest.raises(KeyError, match=r"^2$"):
  529. index.drop(1, level=2).get_loc_level(2, level=2)
  530. index = MultiIndex(
  531. levels=[[2000], list(range(4))],
  532. codes=[np.array([0, 0, 0, 0]), np.array([0, 1, 2, 3])],
  533. )
  534. result, new_index = index.get_loc_level((2000, slice(None, None)))
  535. expected = slice(None, None)
  536. assert result == expected
  537. assert new_index.equals(index.droplevel(0))
  538. @pytest.mark.parametrize("dtype1", [int, float, bool, str])
  539. @pytest.mark.parametrize("dtype2", [int, float, bool, str])
  540. def test_get_loc_multiple_dtypes(self, dtype1, dtype2):
  541. # GH 18520
  542. levels = [np.array([0, 1]).astype(dtype1), np.array([0, 1]).astype(dtype2)]
  543. idx = MultiIndex.from_product(levels)
  544. assert idx.get_loc(idx[2]) == 2
  545. @pytest.mark.parametrize("level", [0, 1])
  546. @pytest.mark.parametrize("dtypes", [[int, float], [float, int]])
  547. def test_get_loc_implicit_cast(self, level, dtypes):
  548. # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa
  549. levels = [["a", "b"], ["c", "d"]]
  550. key = ["b", "d"]
  551. lev_dtype, key_dtype = dtypes
  552. levels[level] = np.array([0, 1], dtype=lev_dtype)
  553. key[level] = key_dtype(1)
  554. idx = MultiIndex.from_product(levels)
  555. assert idx.get_loc(tuple(key)) == 3
  556. @pytest.mark.parametrize("dtype", [bool, object])
  557. def test_get_loc_cast_bool(self, dtype):
  558. # GH 19086 : int is casted to bool, but not vice-versa (for object dtype)
  559. # With bool dtype, we don't cast in either direction.
  560. levels = [Index([False, True], dtype=dtype), np.arange(2, dtype="int64")]
  561. idx = MultiIndex.from_product(levels)
  562. if dtype is bool:
  563. with pytest.raises(KeyError, match=r"^\(0, 1\)$"):
  564. assert idx.get_loc((0, 1)) == 1
  565. with pytest.raises(KeyError, match=r"^\(1, 0\)$"):
  566. assert idx.get_loc((1, 0)) == 2
  567. else:
  568. # We use python object comparisons, which treat 0 == False and 1 == True
  569. assert idx.get_loc((0, 1)) == 1
  570. assert idx.get_loc((1, 0)) == 2
  571. with pytest.raises(KeyError, match=r"^\(False, True\)$"):
  572. idx.get_loc((False, True))
  573. with pytest.raises(KeyError, match=r"^\(True, False\)$"):
  574. idx.get_loc((True, False))
  575. @pytest.mark.parametrize("level", [0, 1])
  576. def test_get_loc_nan(self, level, nulls_fixture):
  577. # GH 18485 : NaN in MultiIndex
  578. levels = [["a", "b"], ["c", "d"]]
  579. key = ["b", "d"]
  580. levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture))
  581. key[level] = nulls_fixture
  582. idx = MultiIndex.from_product(levels)
  583. assert idx.get_loc(tuple(key)) == 3
  584. def test_get_loc_missing_nan(self):
  585. # GH 8569
  586. idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])
  587. assert isinstance(idx.get_loc(1), slice)
  588. with pytest.raises(KeyError, match=r"^3$"):
  589. idx.get_loc(3)
  590. with pytest.raises(KeyError, match=r"^nan$"):
  591. idx.get_loc(np.nan)
  592. with pytest.raises(InvalidIndexError, match=r"\[nan\]"):
  593. # listlike/non-hashable raises TypeError
  594. idx.get_loc([np.nan])
  595. def test_get_loc_with_values_including_missing_values(self):
  596. # issue 19132
  597. idx = MultiIndex.from_product([[np.nan, 1]] * 2)
  598. expected = slice(0, 2, None)
  599. assert idx.get_loc(np.nan) == expected
  600. idx = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]])
  601. expected = np.array([True, False, False, True])
  602. tm.assert_numpy_array_equal(idx.get_loc(np.nan), expected)
  603. idx = MultiIndex.from_product([[np.nan, 1]] * 3)
  604. expected = slice(2, 4, None)
  605. assert idx.get_loc((np.nan, 1)) == expected
  606. def test_get_loc_duplicates2(self):
  607. # TODO: de-duplicate with test_get_loc_duplicates above?
  608. index = MultiIndex(
  609. levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]],
  610. codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
  611. names=["tag", "day"],
  612. )
  613. assert index.get_loc("D") == slice(0, 3)
  614. def test_get_loc_past_lexsort_depth(self):
  615. # GH#30053
  616. idx = MultiIndex(
  617. levels=[["a"], [0, 7], [1]],
  618. codes=[[0, 0], [1, 0], [0, 0]],
  619. names=["x", "y", "z"],
  620. sortorder=0,
  621. )
  622. key = ("a", 7)
  623. with tm.assert_produces_warning(PerformanceWarning):
  624. # PerformanceWarning: indexing past lexsort depth may impact performance
  625. result = idx.get_loc(key)
  626. assert result == slice(0, 1, None)
  627. def test_multiindex_get_loc_list_raises(self):
  628. # GH#35878
  629. idx = MultiIndex.from_tuples([("a", 1), ("b", 2)])
  630. msg = r"\[\]"
  631. with pytest.raises(InvalidIndexError, match=msg):
  632. idx.get_loc([])
  633. def test_get_loc_nested_tuple_raises_keyerror(self):
  634. # raise KeyError, not TypeError
  635. mi = MultiIndex.from_product([range(3), range(4), range(5), range(6)])
  636. key = ((2, 3, 4), "foo")
  637. with pytest.raises(KeyError, match=re.escape(str(key))):
  638. mi.get_loc(key)
  639. class TestWhere:
  640. def test_where(self):
  641. i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
  642. msg = r"\.where is not supported for MultiIndex operations"
  643. with pytest.raises(NotImplementedError, match=msg):
  644. i.where(True)
  645. def test_where_array_like(self, listlike_box):
  646. mi = MultiIndex.from_tuples([("A", 1), ("A", 2)])
  647. cond = [False, True]
  648. msg = r"\.where is not supported for MultiIndex operations"
  649. with pytest.raises(NotImplementedError, match=msg):
  650. mi.where(listlike_box(cond))
  651. class TestContains:
  652. def test_contains_top_level(self):
  653. midx = MultiIndex.from_product([["A", "B"], [1, 2]])
  654. assert "A" in midx
  655. assert "A" not in midx._engine
  656. def test_contains_with_nat(self):
  657. # MI with a NaT
  658. mi = MultiIndex(
  659. levels=[["C"], date_range("2012-01-01", periods=5)],
  660. codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
  661. names=[None, "B"],
  662. )
  663. assert ("C", pd.Timestamp("2012-01-01")) in mi
  664. for val in mi.values:
  665. assert val in mi
  666. def test_contains(self, idx):
  667. assert ("foo", "two") in idx
  668. assert ("bar", "two") not in idx
  669. assert None not in idx
  670. def test_contains_with_missing_value(self):
  671. # GH#19132
  672. idx = MultiIndex.from_arrays([[1, np.nan, 2]])
  673. assert np.nan in idx
  674. idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
  675. assert np.nan not in idx
  676. assert (1, np.nan) in idx
  677. def test_multiindex_contains_dropped(self):
  678. # GH#19027
  679. # test that dropped MultiIndex levels are not in the MultiIndex
  680. # despite continuing to be in the MultiIndex's levels
  681. idx = MultiIndex.from_product([[1, 2], [3, 4]])
  682. assert 2 in idx
  683. idx = idx.drop(2)
  684. # drop implementation keeps 2 in the levels
  685. assert 2 in idx.levels[0]
  686. # but it should no longer be in the index itself
  687. assert 2 not in idx
  688. # also applies to strings
  689. idx = MultiIndex.from_product([["a", "b"], ["c", "d"]])
  690. assert "a" in idx
  691. idx = idx.drop("a")
  692. assert "a" in idx.levels[0]
  693. assert "a" not in idx
  694. def test_contains_td64_level(self):
  695. # GH#24570
  696. tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min")
  697. idx = MultiIndex.from_arrays([tx, np.arange(len(tx))])
  698. assert tx[0] in idx
  699. assert "element_not_exit" not in idx
  700. assert "0 day 09:30:00" in idx
  701. @pytest.mark.slow
  702. def test_large_mi_contains(self):
  703. # GH#10645
  704. result = MultiIndex.from_arrays([range(10**6), range(10**6)])
  705. assert (10**6, 0) not in result
  706. def test_timestamp_multiindex_indexer():
  707. # https://github.com/pandas-dev/pandas/issues/26944
  708. idx = MultiIndex.from_product(
  709. [
  710. date_range("2019-01-01T00:15:33", periods=100, freq="H", name="date"),
  711. ["x"],
  712. [3],
  713. ]
  714. )
  715. df = pd.DataFrame({"foo": np.arange(len(idx))}, idx)
  716. result = df.loc[pd.IndexSlice["2019-1-2":, "x", :], "foo"]
  717. qidx = MultiIndex.from_product(
  718. [
  719. date_range(
  720. start="2019-01-02T00:15:33",
  721. end="2019-01-05T03:15:33",
  722. freq="H",
  723. name="date",
  724. ),
  725. ["x"],
  726. [3],
  727. ]
  728. )
  729. should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo")
  730. tm.assert_series_equal(result, should_be)
  731. @pytest.mark.parametrize(
  732. "index_arr,expected,target,algo",
  733. [
  734. ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"),
  735. ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"),
  736. ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"),
  737. ],
  738. )
  739. def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
  740. # issue 19132
  741. idx = MultiIndex.from_arrays(index_arr)
  742. result = idx.get_slice_bound(target, side=algo)
  743. assert result == expected
  744. @pytest.mark.parametrize(
  745. "index_arr,expected,start_idx,end_idx",
  746. [
  747. ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1),
  748. ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)),
  749. ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3),
  750. ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)),
  751. ],
  752. )
  753. def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx):
  754. # issue 19132
  755. idx = MultiIndex.from_arrays(index_arr)
  756. result = idx.slice_indexer(start=start_idx, end=end_idx)
  757. assert result == expected
  758. def test_pyint_engine():
  759. # GH#18519 : when combinations of codes cannot be represented in 64
  760. # bits, the index underlying the MultiIndex engine works with Python
  761. # integers, rather than uint64.
  762. N = 5
  763. keys = [
  764. tuple(arr)
  765. for arr in [
  766. [0] * 10 * N,
  767. [1] * 10 * N,
  768. [2] * 10 * N,
  769. [np.nan] * N + [2] * 9 * N,
  770. [0] * N + [2] * 9 * N,
  771. [np.nan] * N + [2] * 8 * N + [0] * N,
  772. ]
  773. ]
  774. # Each level contains 4 elements (including NaN), so it is represented
  775. # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
  776. # 64 bit engine and truncating the first levels, the fourth and fifth
  777. # keys would collide; if truncating the last levels, the fifth and
  778. # sixth; if rotating bits rather than shifting, the third and fifth.
  779. for idx, key_value in enumerate(keys):
  780. index = MultiIndex.from_tuples(keys)
  781. assert index.get_loc(key_value) == idx
  782. expected = np.arange(idx + 1, dtype=np.intp)
  783. result = index.get_indexer([keys[i] for i in expected])
  784. tm.assert_numpy_array_equal(result, expected)
  785. # With missing key:
  786. idces = range(len(keys))
  787. expected = np.array([-1] + list(idces), dtype=np.intp)
  788. missing = tuple([0, 1] * 5 * N)
  789. result = index.get_indexer([missing] + [keys[i] for i in idces])
  790. tm.assert_numpy_array_equal(result, expected)
  791. @pytest.mark.parametrize(
  792. "keys,expected",
  793. [
  794. ((slice(None), [5, 4]), [1, 0]),
  795. ((slice(None), [4, 5]), [0, 1]),
  796. (([True, False, True], [4, 6]), [0, 2]),
  797. (([True, False, True], [6, 4]), [0, 2]),
  798. ((2, [4, 5]), [0, 1]),
  799. ((2, [5, 4]), [1, 0]),
  800. (([2], [4, 5]), [0, 1]),
  801. (([2], [5, 4]), [1, 0]),
  802. ],
  803. )
  804. def test_get_locs_reordering(keys, expected):
  805. # GH48384
  806. idx = MultiIndex.from_arrays(
  807. [
  808. [2, 2, 1],
  809. [4, 5, 6],
  810. ]
  811. )
  812. result = idx.get_locs(keys)
  813. expected = np.array(expected, dtype=np.intp)
  814. tm.assert_numpy_array_equal(result, expected)
  815. def test_get_indexer_for_multiindex_with_nans(nulls_fixture):
  816. # GH37222
  817. idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"])
  818. idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=["id1", "id2"])
  819. result = idx2.get_indexer(idx1)
  820. expected = np.array([-1, 1], dtype=np.intp)
  821. tm.assert_numpy_array_equal(result, expected)
  822. result = idx1.get_indexer(idx2)
  823. expected = np.array([-1, 1], dtype=np.intp)
  824. tm.assert_numpy_array_equal(result, expected)