# test_integrity.py 8.3 KB
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
  5. import pandas as pd
  6. from pandas import (
  7. Index,
  8. IntervalIndex,
  9. MultiIndex,
  10. RangeIndex,
  11. )
  12. import pandas._testing as tm
  13. def test_labels_dtypes():
  14. # GH 8456
  15. i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
  16. assert i.codes[0].dtype == "int8"
  17. assert i.codes[1].dtype == "int8"
  18. i = MultiIndex.from_product([["a"], range(40)])
  19. assert i.codes[1].dtype == "int8"
  20. i = MultiIndex.from_product([["a"], range(400)])
  21. assert i.codes[1].dtype == "int16"
  22. i = MultiIndex.from_product([["a"], range(40000)])
  23. assert i.codes[1].dtype == "int32"
  24. i = MultiIndex.from_product([["a"], range(1000)])
  25. assert (i.codes[0] >= 0).all()
  26. assert (i.codes[1] >= 0).all()
  27. def test_values_boxed():
  28. tuples = [
  29. (1, pd.Timestamp("2000-01-01")),
  30. (2, pd.NaT),
  31. (3, pd.Timestamp("2000-01-03")),
  32. (1, pd.Timestamp("2000-01-04")),
  33. (2, pd.Timestamp("2000-01-02")),
  34. (3, pd.Timestamp("2000-01-03")),
  35. ]
  36. result = MultiIndex.from_tuples(tuples)
  37. expected = construct_1d_object_array_from_listlike(tuples)
  38. tm.assert_numpy_array_equal(result.values, expected)
  39. # Check that code branches for boxed values produce identical results
  40. tm.assert_numpy_array_equal(result.values[:4], result[:4].values)
  41. def test_values_multiindex_datetimeindex():
  42. # Test to ensure we hit the boxing / nobox part of MI.values
  43. ints = np.arange(10**18, 10**18 + 5)
  44. naive = pd.DatetimeIndex(ints)
  45. aware = pd.DatetimeIndex(ints, tz="US/Central")
  46. idx = MultiIndex.from_arrays([naive, aware])
  47. result = idx.values
  48. outer = pd.DatetimeIndex([x[0] for x in result])
  49. tm.assert_index_equal(outer, naive)
  50. inner = pd.DatetimeIndex([x[1] for x in result])
  51. tm.assert_index_equal(inner, aware)
  52. # n_lev > n_lab
  53. result = idx[:2].values
  54. outer = pd.DatetimeIndex([x[0] for x in result])
  55. tm.assert_index_equal(outer, naive[:2])
  56. inner = pd.DatetimeIndex([x[1] for x in result])
  57. tm.assert_index_equal(inner, aware[:2])
  58. def test_values_multiindex_periodindex():
  59. # Test to ensure we hit the boxing / nobox part of MI.values
  60. ints = np.arange(2007, 2012)
  61. pidx = pd.PeriodIndex(ints, freq="D")
  62. idx = MultiIndex.from_arrays([ints, pidx])
  63. result = idx.values
  64. outer = Index([x[0] for x in result])
  65. tm.assert_index_equal(outer, Index(ints, dtype=np.int64))
  66. inner = pd.PeriodIndex([x[1] for x in result])
  67. tm.assert_index_equal(inner, pidx)
  68. # n_lev > n_lab
  69. result = idx[:2].values
  70. outer = Index([x[0] for x in result])
  71. tm.assert_index_equal(outer, Index(ints[:2], dtype=np.int64))
  72. inner = pd.PeriodIndex([x[1] for x in result])
  73. tm.assert_index_equal(inner, pidx[:2])
  74. def test_consistency():
  75. # need to construct an overflow
  76. major_axis = list(range(70000))
  77. minor_axis = list(range(10))
  78. major_codes = np.arange(70000)
  79. minor_codes = np.repeat(range(10), 7000)
  80. # the fact that is works means it's consistent
  81. index = MultiIndex(
  82. levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]
  83. )
  84. # inconsistent
  85. major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3])
  86. minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1])
  87. index = MultiIndex(
  88. levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]
  89. )
  90. assert index.is_unique is False
  91. @pytest.mark.slow
  92. def test_hash_collisions():
  93. # non-smoke test that we don't get hash collisions
  94. index = MultiIndex.from_product(
  95. [np.arange(1000), np.arange(1000)], names=["one", "two"]
  96. )
  97. result = index.get_indexer(index.values)
  98. tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp"))
  99. for i in [0, 1, len(index) - 2, len(index) - 1]:
  100. result = index.get_loc(index[i])
  101. assert result == i
  102. def test_dims():
  103. pass
  104. def test_take_invalid_kwargs():
  105. vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]]
  106. idx = MultiIndex.from_product(vals, names=["str", "dt"])
  107. indices = [1, 2]
  108. msg = r"take\(\) got an unexpected keyword argument 'foo'"
  109. with pytest.raises(TypeError, match=msg):
  110. idx.take(indices, foo=2)
  111. msg = "the 'out' parameter is not supported"
  112. with pytest.raises(ValueError, match=msg):
  113. idx.take(indices, out=indices)
  114. msg = "the 'mode' parameter is not supported"
  115. with pytest.raises(ValueError, match=msg):
  116. idx.take(indices, mode="clip")
  117. def test_isna_behavior(idx):
  118. # should not segfault GH5123
  119. # NOTE: if MI representation changes, may make sense to allow
  120. # isna(MI)
  121. msg = "isna is not defined for MultiIndex"
  122. with pytest.raises(NotImplementedError, match=msg):
  123. pd.isna(idx)
  124. def test_large_multiindex_error():
  125. # GH12527
  126. df_below_1000000 = pd.DataFrame(
  127. 1, index=MultiIndex.from_product([[1, 2], range(499999)]), columns=["dest"]
  128. )
  129. with pytest.raises(KeyError, match=r"^\(-1, 0\)$"):
  130. df_below_1000000.loc[(-1, 0), "dest"]
  131. with pytest.raises(KeyError, match=r"^\(3, 0\)$"):
  132. df_below_1000000.loc[(3, 0), "dest"]
  133. df_above_1000000 = pd.DataFrame(
  134. 1, index=MultiIndex.from_product([[1, 2], range(500001)]), columns=["dest"]
  135. )
  136. with pytest.raises(KeyError, match=r"^\(-1, 0\)$"):
  137. df_above_1000000.loc[(-1, 0), "dest"]
  138. with pytest.raises(KeyError, match=r"^\(3, 0\)$"):
  139. df_above_1000000.loc[(3, 0), "dest"]
  140. def test_million_record_attribute_error():
  141. # GH 18165
  142. r = list(range(1000000))
  143. df = pd.DataFrame(
  144. {"a": r, "b": r}, index=MultiIndex.from_tuples([(x, x) for x in r])
  145. )
  146. msg = "'Series' object has no attribute 'foo'"
  147. with pytest.raises(AttributeError, match=msg):
  148. df["a"].foo()
  149. def test_can_hold_identifiers(idx):
  150. key = idx[0]
  151. assert idx._can_hold_identifiers_and_holds_name(key) is True
  152. def test_metadata_immutable(idx):
  153. levels, codes = idx.levels, idx.codes
  154. # shouldn't be able to set at either the top level or base level
  155. mutable_regex = re.compile("does not support mutable operations")
  156. with pytest.raises(TypeError, match=mutable_regex):
  157. levels[0] = levels[0]
  158. with pytest.raises(TypeError, match=mutable_regex):
  159. levels[0][0] = levels[0][0]
  160. # ditto for labels
  161. with pytest.raises(TypeError, match=mutable_regex):
  162. codes[0] = codes[0]
  163. with pytest.raises(ValueError, match="assignment destination is read-only"):
  164. codes[0][0] = codes[0][0]
  165. # and for names
  166. names = idx.names
  167. with pytest.raises(TypeError, match=mutable_regex):
  168. names[0] = names[0]
  169. def test_level_setting_resets_attributes():
  170. ind = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
  171. assert ind.is_monotonic_increasing
  172. ind = ind.set_levels([["A", "B"], [1, 3, 2]])
  173. # if this fails, probably didn't reset the cache correctly.
  174. assert not ind.is_monotonic_increasing
  175. def test_rangeindex_fallback_coercion_bug():
  176. # GH 12893
  177. df1 = pd.DataFrame(np.arange(100).reshape((10, 10)))
  178. df2 = pd.DataFrame(np.arange(100).reshape((10, 10)))
  179. df = pd.concat({"df1": df1.stack(), "df2": df2.stack()}, axis=1)
  180. df.index.names = ["fizz", "buzz"]
  181. str(df)
  182. expected = pd.DataFrame(
  183. {"df2": np.arange(100), "df1": np.arange(100)},
  184. index=MultiIndex.from_product([range(10), range(10)], names=["fizz", "buzz"]),
  185. )
  186. tm.assert_frame_equal(df, expected, check_like=True)
  187. result = df.index.get_level_values("fizz")
  188. expected = Index(np.arange(10, dtype=np.int64), name="fizz").repeat(10)
  189. tm.assert_index_equal(result, expected)
  190. result = df.index.get_level_values("buzz")
  191. expected = Index(np.tile(np.arange(10, dtype=np.int64), 10), name="buzz")
  192. tm.assert_index_equal(result, expected)
  193. def test_memory_usage(idx):
  194. result = idx.memory_usage()
  195. if len(idx):
  196. idx.get_loc(idx[0])
  197. result2 = idx.memory_usage()
  198. result3 = idx.memory_usage(deep=True)
  199. # RangeIndex, IntervalIndex
  200. # don't have engines
  201. if not isinstance(idx, (RangeIndex, IntervalIndex)):
  202. assert result2 > result
  203. if idx.inferred_type == "object":
  204. assert result3 > result2
  205. else:
  206. # we report 0 for no-length
  207. assert result == 0
  208. def test_nlevels(idx):
  209. assert idx.nlevels == 2