test_multiindex.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. import numpy as np
  2. import pytest
  3. import pandas._libs.index as _index
  4. from pandas.errors import PerformanceWarning
  5. import pandas as pd
  6. from pandas import (
  7. CategoricalDtype,
  8. DataFrame,
  9. Index,
  10. MultiIndex,
  11. Series,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.arrays.boolean import BooleanDtype
  15. class TestMultiIndexBasic:
  16. def test_multiindex_perf_warn(self):
  17. df = DataFrame(
  18. {
  19. "jim": [0, 0, 1, 1],
  20. "joe": ["x", "x", "z", "y"],
  21. "jolie": np.random.rand(4),
  22. }
  23. ).set_index(["jim", "joe"])
  24. with tm.assert_produces_warning(PerformanceWarning):
  25. df.loc[(1, "z")]
  26. df = df.iloc[[2, 1, 3, 0]]
  27. with tm.assert_produces_warning(PerformanceWarning):
  28. df.loc[(0,)]
  29. def test_indexing_over_hashtable_size_cutoff(self):
  30. n = 10000
  31. old_cutoff = _index._SIZE_CUTOFF
  32. _index._SIZE_CUTOFF = 20000
  33. s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
  34. # hai it works!
  35. assert s[("a", 5)] == 5
  36. assert s[("a", 6)] == 6
  37. assert s[("a", 7)] == 7
  38. _index._SIZE_CUTOFF = old_cutoff
  39. def test_multi_nan_indexing(self):
  40. # GH 3588
  41. df = DataFrame(
  42. {
  43. "a": ["R1", "R2", np.nan, "R4"],
  44. "b": ["C1", "C2", "C3", "C4"],
  45. "c": [10, 15, np.nan, 20],
  46. }
  47. )
  48. result = df.set_index(["a", "b"], drop=False)
  49. expected = DataFrame(
  50. {
  51. "a": ["R1", "R2", np.nan, "R4"],
  52. "b": ["C1", "C2", "C3", "C4"],
  53. "c": [10, 15, np.nan, 20],
  54. },
  55. index=[
  56. Index(["R1", "R2", np.nan, "R4"], name="a"),
  57. Index(["C1", "C2", "C3", "C4"], name="b"),
  58. ],
  59. )
  60. tm.assert_frame_equal(result, expected)
  61. def test_exclusive_nat_column_indexing(self):
  62. # GH 38025
  63. # test multi indexing when one column exclusively contains NaT values
  64. df = DataFrame(
  65. {
  66. "a": [pd.NaT, pd.NaT, pd.NaT, pd.NaT],
  67. "b": ["C1", "C2", "C3", "C4"],
  68. "c": [10, 15, np.nan, 20],
  69. }
  70. )
  71. df = df.set_index(["a", "b"])
  72. expected = DataFrame(
  73. {
  74. "c": [10, 15, np.nan, 20],
  75. },
  76. index=[
  77. Index([pd.NaT, pd.NaT, pd.NaT, pd.NaT], name="a"),
  78. Index(["C1", "C2", "C3", "C4"], name="b"),
  79. ],
  80. )
  81. tm.assert_frame_equal(df, expected)
  82. def test_nested_tuples_duplicates(self):
  83. # GH#30892
  84. dti = pd.to_datetime(["20190101", "20190101", "20190102"])
  85. idx = Index(["a", "a", "c"])
  86. mi = MultiIndex.from_arrays([dti, idx], names=["index1", "index2"])
  87. df = DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi)
  88. expected = DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi)
  89. df2 = df.copy(deep=True)
  90. df2.loc[(dti[0], "a"), "c2"] = 1.0
  91. tm.assert_frame_equal(df2, expected)
  92. df3 = df.copy(deep=True)
  93. df3.loc[[(dti[0], "a")], "c2"] = 1.0
  94. tm.assert_frame_equal(df3, expected)
  95. def test_multiindex_with_datatime_level_preserves_freq(self):
  96. # https://github.com/pandas-dev/pandas/issues/35563
  97. idx = Index(range(2), name="A")
  98. dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
  99. mi = MultiIndex.from_product([idx, dti])
  100. df = DataFrame(np.random.randn(14, 2), index=mi)
  101. result = df.loc[0].index
  102. tm.assert_index_equal(result, dti)
  103. assert result.freq == dti.freq
  104. def test_multiindex_complex(self):
  105. # GH#42145
  106. complex_data = [1 + 2j, 4 - 3j, 10 - 1j]
  107. non_complex_data = [3, 4, 5]
  108. result = DataFrame(
  109. {
  110. "x": complex_data,
  111. "y": non_complex_data,
  112. "z": non_complex_data,
  113. }
  114. )
  115. result.set_index(["x", "y"], inplace=True)
  116. expected = DataFrame(
  117. {"z": non_complex_data},
  118. index=MultiIndex.from_arrays(
  119. [complex_data, non_complex_data],
  120. names=("x", "y"),
  121. ),
  122. )
  123. tm.assert_frame_equal(result, expected)
  124. def test_rename_multiindex_with_duplicates(self):
  125. # GH 38015
  126. mi = MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")])
  127. df = DataFrame(index=mi)
  128. df = df.rename(index={"A": "Apple"}, level=0)
  129. mi2 = MultiIndex.from_tuples([("Apple", "cat"), ("B", "cat"), ("B", "cat")])
  130. expected = DataFrame(index=mi2)
  131. tm.assert_frame_equal(df, expected)
  132. def test_series_align_multiindex_with_nan_overlap_only(self):
  133. # GH 38439
  134. mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
  135. mi2 = MultiIndex.from_arrays([[np.nan, 82.0], [np.nan, np.nan]])
  136. ser1 = Series([1, 2], index=mi1)
  137. ser2 = Series([1, 2], index=mi2)
  138. result1, result2 = ser1.align(ser2)
  139. mi = MultiIndex.from_arrays([[81.0, 82.0, np.nan], [np.nan, np.nan, np.nan]])
  140. expected1 = Series([1.0, np.nan, 2.0], index=mi)
  141. expected2 = Series([np.nan, 2.0, 1.0], index=mi)
  142. tm.assert_series_equal(result1, expected1)
  143. tm.assert_series_equal(result2, expected2)
  144. def test_series_align_multiindex_with_nan(self):
  145. # GH 38439
  146. mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
  147. mi2 = MultiIndex.from_arrays([[np.nan, 81.0], [np.nan, np.nan]])
  148. ser1 = Series([1, 2], index=mi1)
  149. ser2 = Series([1, 2], index=mi2)
  150. result1, result2 = ser1.align(ser2)
  151. mi = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
  152. expected1 = Series([1, 2], index=mi)
  153. expected2 = Series([2, 1], index=mi)
  154. tm.assert_series_equal(result1, expected1)
  155. tm.assert_series_equal(result2, expected2)
  156. def test_nunique_smoke(self):
  157. # GH 34019
  158. n = DataFrame([[1, 2], [1, 2]]).set_index([0, 1]).index.nunique()
  159. assert n == 1
  160. def test_multiindex_repeated_keys(self):
  161. # GH19414
  162. tm.assert_series_equal(
  163. Series([1, 2], MultiIndex.from_arrays([["a", "b"]])).loc[
  164. ["a", "a", "b", "b"]
  165. ],
  166. Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])),
  167. )
  168. def test_multiindex_with_na_missing_key(self):
  169. # GH46173
  170. df = DataFrame.from_dict(
  171. {
  172. ("foo",): [1, 2, 3],
  173. ("bar",): [5, 6, 7],
  174. (None,): [8, 9, 0],
  175. }
  176. )
  177. with pytest.raises(KeyError, match="missing_key"):
  178. df[[("missing_key",)]]
  179. def test_multiindex_dtype_preservation(self):
  180. # GH51261
  181. columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"])
  182. df = DataFrame(["value"], columns=columns).astype("category")
  183. df_no_multiindex = df["A"]
  184. assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype)
  185. # geopandas 1763 analogue
  186. df = DataFrame(
  187. [[1, 0], [0, 1]],
  188. columns=[
  189. ["foo", "foo"],
  190. ["location", "location"],
  191. ["x", "y"],
  192. ],
  193. ).assign(bools=Series([True, False], dtype="boolean"))
  194. assert isinstance(df["bools"].dtype, BooleanDtype)