# test_set_index.py

"""
See also: test_reindex.py:TestReindexSetIndex
"""
  4. from datetime import (
  5. datetime,
  6. timedelta,
  7. )
  8. import numpy as np
  9. import pytest
  10. from pandas import (
  11. Categorical,
  12. DataFrame,
  13. DatetimeIndex,
  14. Index,
  15. MultiIndex,
  16. Series,
  17. date_range,
  18. period_range,
  19. to_datetime,
  20. )
  21. import pandas._testing as tm
  22. class TestSetIndex:
  23. def test_set_index_multiindex(self):
  24. # segfault in GH#3308
  25. d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
  26. df = DataFrame(d)
  27. tuples = [(0, 1), (0, 2), (1, 2)]
  28. df["tuples"] = tuples
  29. index = MultiIndex.from_tuples(df["tuples"])
  30. # it works!
  31. df.set_index(index)
  32. def test_set_index_empty_column(self):
  33. # GH#1971
  34. df = DataFrame(
  35. [
  36. {"a": 1, "p": 0},
  37. {"a": 2, "m": 10},
  38. {"a": 3, "m": 11, "p": 20},
  39. {"a": 4, "m": 12, "p": 21},
  40. ],
  41. columns=["a", "m", "p", "x"],
  42. )
  43. result = df.set_index(["a", "x"])
  44. expected = df[["m", "p"]]
  45. expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"])
  46. tm.assert_frame_equal(result, expected)
  47. def test_set_index_empty_dataframe(self):
  48. # GH#38419
  49. df1 = DataFrame(
  50. {"a": Series(dtype="datetime64[ns]"), "b": Series(dtype="int64"), "c": []}
  51. )
  52. df2 = df1.set_index(["a", "b"])
  53. result = df2.index.to_frame().dtypes
  54. expected = df1[["a", "b"]].dtypes
  55. tm.assert_series_equal(result, expected)
  56. def test_set_index_multiindexcolumns(self):
  57. columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)])
  58. df = DataFrame(np.random.randn(3, 3), columns=columns)
  59. result = df.set_index(df.columns[0])
  60. expected = df.iloc[:, 1:]
  61. expected.index = df.iloc[:, 0].values
  62. expected.index.names = [df.columns[0]]
  63. tm.assert_frame_equal(result, expected)
  64. def test_set_index_timezone(self):
  65. # GH#12358
  66. # tz-aware Series should retain the tz
  67. idx = DatetimeIndex(["2014-01-01 10:10:10"], tz="UTC").tz_convert("Europe/Rome")
  68. df = DataFrame({"A": idx})
  69. assert df.set_index(idx).index[0].hour == 11
  70. assert DatetimeIndex(Series(df.A))[0].hour == 11
  71. assert df.set_index(df.A).index[0].hour == 11
  72. def test_set_index_cast_datetimeindex(self):
  73. df = DataFrame(
  74. {
  75. "A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],
  76. "B": np.random.randn(1000),
  77. }
  78. )
  79. idf = df.set_index("A")
  80. assert isinstance(idf.index, DatetimeIndex)
  81. def test_set_index_dst(self):
  82. di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific")
  83. df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index()
  84. # single level
  85. res = df.set_index("index")
  86. exp = DataFrame(
  87. data={"a": [0, 1, 2], "b": [3, 4, 5]},
  88. index=Index(di, name="index"),
  89. )
  90. exp.index = exp.index._with_freq(None)
  91. tm.assert_frame_equal(res, exp)
  92. # GH#12920
  93. res = df.set_index(["index", "a"])
  94. exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"])
  95. exp = DataFrame({"b": [3, 4, 5]}, index=exp_index)
  96. tm.assert_frame_equal(res, exp)
  97. def test_set_index(self, float_string_frame):
  98. df = float_string_frame
  99. idx = Index(np.arange(len(df))[::-1])
  100. df = df.set_index(idx)
  101. tm.assert_index_equal(df.index, idx)
  102. with pytest.raises(ValueError, match="Length mismatch"):
  103. df.set_index(idx[::2])
  104. def test_set_index_names(self):
  105. df = tm.makeDataFrame()
  106. df.index.name = "name"
  107. assert df.set_index(df.index).index.names == ["name"]
  108. mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"])
  109. mi2 = MultiIndex.from_arrays(
  110. df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"]
  111. )
  112. df = df.set_index(["A", "B"])
  113. assert df.set_index(df.index).index.names == ["A", "B"]
  114. # Check that set_index isn't converting a MultiIndex into an Index
  115. assert isinstance(df.set_index(df.index).index, MultiIndex)
  116. # Check actual equality
  117. tm.assert_index_equal(df.set_index(df.index).index, mi)
  118. idx2 = df.index.rename(["C", "D"])
  119. # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
  120. # than a pair of tuples
  121. assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)
  122. # Check equality
  123. tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)
  124. # A has duplicate values, C does not
  125. @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
  126. @pytest.mark.parametrize("inplace", [True, False])
  127. @pytest.mark.parametrize("drop", [True, False])
  128. def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):
  129. df = frame_of_index_cols
  130. if isinstance(keys, list):
  131. idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)
  132. else:
  133. idx = Index(df[keys], name=keys)
  134. expected = df.drop(keys, axis=1) if drop else df
  135. expected.index = idx
  136. if inplace:
  137. result = df.copy()
  138. return_value = result.set_index(keys, drop=drop, inplace=True)
  139. assert return_value is None
  140. else:
  141. result = df.set_index(keys, drop=drop)
  142. tm.assert_frame_equal(result, expected)
  143. # A has duplicate values, C does not
  144. @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
  145. @pytest.mark.parametrize("drop", [True, False])
  146. def test_set_index_append(self, frame_of_index_cols, drop, keys):
  147. df = frame_of_index_cols
  148. keys = keys if isinstance(keys, list) else [keys]
  149. idx = MultiIndex.from_arrays(
  150. [df.index] + [df[x] for x in keys], names=[None] + keys
  151. )
  152. expected = df.drop(keys, axis=1) if drop else df.copy()
  153. expected.index = idx
  154. result = df.set_index(keys, drop=drop, append=True)
  155. tm.assert_frame_equal(result, expected)
  156. # A has duplicate values, C does not
  157. @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
  158. @pytest.mark.parametrize("drop", [True, False])
  159. def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys):
  160. # append to existing multiindex
  161. df = frame_of_index_cols.set_index(["D"], drop=drop, append=True)
  162. keys = keys if isinstance(keys, list) else [keys]
  163. expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True)
  164. result = df.set_index(keys, drop=drop, append=True)
  165. tm.assert_frame_equal(result, expected)
  166. def test_set_index_after_mutation(self):
  167. # GH#1590
  168. df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]})
  169. expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key"))
  170. df2 = df.loc[df.index.map(lambda indx: indx >= 1)]
  171. result = df2.set_index("key")
  172. tm.assert_frame_equal(result, expected)
  173. # MultiIndex constructor does not work directly on Series -> lambda
  174. # Add list-of-list constructor because list is ambiguous -> lambda
  175. # also test index name if append=True (name is duplicate here for B)
  176. @pytest.mark.parametrize(
  177. "box",
  178. [
  179. Series,
  180. Index,
  181. np.array,
  182. list,
  183. lambda x: [list(x)],
  184. lambda x: MultiIndex.from_arrays([x]),
  185. ],
  186. )
  187. @pytest.mark.parametrize(
  188. "append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)]
  189. )
  190. @pytest.mark.parametrize("drop", [True, False])
  191. def test_set_index_pass_single_array(
  192. self, frame_of_index_cols, drop, append, index_name, box
  193. ):
  194. df = frame_of_index_cols
  195. df.index.name = index_name
  196. key = box(df["B"])
  197. if box == list:
  198. # list of strings gets interpreted as list of keys
  199. msg = "['one', 'two', 'three', 'one', 'two']"
  200. with pytest.raises(KeyError, match=msg):
  201. df.set_index(key, drop=drop, append=append)
  202. else:
  203. # np.array/list-of-list "forget" the name of B
  204. name_mi = getattr(key, "names", None)
  205. name = [getattr(key, "name", None)] if name_mi is None else name_mi
  206. result = df.set_index(key, drop=drop, append=append)
  207. # only valid column keys are dropped
  208. # since B is always passed as array above, nothing is dropped
  209. expected = df.set_index(["B"], drop=False, append=append)
  210. expected.index.names = [index_name] + name if append else name
  211. tm.assert_frame_equal(result, expected)
  212. # MultiIndex constructor does not work directly on Series -> lambda
  213. # also test index name if append=True (name is duplicate here for A & B)
  214. @pytest.mark.parametrize(
  215. "box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]
  216. )
  217. @pytest.mark.parametrize(
  218. "append, index_name",
  219. [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)],
  220. )
  221. @pytest.mark.parametrize("drop", [True, False])
  222. def test_set_index_pass_arrays(
  223. self, frame_of_index_cols, drop, append, index_name, box
  224. ):
  225. df = frame_of_index_cols
  226. df.index.name = index_name
  227. keys = ["A", box(df["B"])]
  228. # np.array/list "forget" the name of B
  229. names = ["A", None if box in [np.array, list, tuple, iter] else "B"]
  230. result = df.set_index(keys, drop=drop, append=append)
  231. # only valid column keys are dropped
  232. # since B is always passed as array above, only A is dropped, if at all
  233. expected = df.set_index(["A", "B"], drop=False, append=append)
  234. expected = expected.drop("A", axis=1) if drop else expected
  235. expected.index.names = [index_name] + names if append else names
  236. tm.assert_frame_equal(result, expected)
  237. # MultiIndex constructor does not work directly on Series -> lambda
  238. # We also emulate a "constructor" for the label -> lambda
  239. # also test index name if append=True (name is duplicate here for A)
  240. @pytest.mark.parametrize(
  241. "box2",
  242. [
  243. Series,
  244. Index,
  245. np.array,
  246. list,
  247. iter,
  248. lambda x: MultiIndex.from_arrays([x]),
  249. lambda x: x.name,
  250. ],
  251. )
  252. @pytest.mark.parametrize(
  253. "box1",
  254. [
  255. Series,
  256. Index,
  257. np.array,
  258. list,
  259. iter,
  260. lambda x: MultiIndex.from_arrays([x]),
  261. lambda x: x.name,
  262. ],
  263. )
  264. @pytest.mark.parametrize(
  265. "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)]
  266. )
  267. @pytest.mark.parametrize("drop", [True, False])
  268. def test_set_index_pass_arrays_duplicate(
  269. self, frame_of_index_cols, drop, append, index_name, box1, box2
  270. ):
  271. df = frame_of_index_cols
  272. df.index.name = index_name
  273. keys = [box1(df["A"]), box2(df["A"])]
  274. result = df.set_index(keys, drop=drop, append=append)
  275. # if either box is iter, it has been consumed; re-read
  276. keys = [box1(df["A"]), box2(df["A"])]
  277. # need to adapt first drop for case that both keys are 'A' --
  278. # cannot drop the same column twice;
  279. # plain == would give ambiguous Boolean error for containers
  280. first_drop = (
  281. False
  282. if (
  283. isinstance(keys[0], str)
  284. and keys[0] == "A"
  285. and isinstance(keys[1], str)
  286. and keys[1] == "A"
  287. )
  288. else drop
  289. )
  290. # to test against already-tested behaviour, we add sequentially,
  291. # hence second append always True; must wrap keys in list, otherwise
  292. # box = list would be interpreted as keys
  293. expected = df.set_index([keys[0]], drop=first_drop, append=append)
  294. expected = expected.set_index([keys[1]], drop=drop, append=True)
  295. tm.assert_frame_equal(result, expected)
  296. @pytest.mark.parametrize("append", [True, False])
  297. @pytest.mark.parametrize("drop", [True, False])
  298. def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append):
  299. df = frame_of_index_cols
  300. keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"])
  301. result = df.set_index(keys, drop=drop, append=append)
  302. # setting with a MultiIndex will never drop columns
  303. expected = df.set_index(["A", "B"], drop=False, append=append)
  304. tm.assert_frame_equal(result, expected)
  305. def test_construction_with_categorical_index(self):
  306. ci = tm.makeCategoricalIndex(10)
  307. ci.name = "B"
  308. # with Categorical
  309. df = DataFrame({"A": np.random.randn(10), "B": ci.values})
  310. idf = df.set_index("B")
  311. tm.assert_index_equal(idf.index, ci)
  312. # from a CategoricalIndex
  313. df = DataFrame({"A": np.random.randn(10), "B": ci})
  314. idf = df.set_index("B")
  315. tm.assert_index_equal(idf.index, ci)
  316. # round-trip
  317. idf = idf.reset_index().set_index("B")
  318. tm.assert_index_equal(idf.index, ci)
  319. def test_set_index_preserve_categorical_dtype(self):
  320. # GH#13743, GH#13854
  321. df = DataFrame(
  322. {
  323. "A": [1, 2, 1, 1, 2],
  324. "B": [10, 16, 22, 28, 34],
  325. "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
  326. "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
  327. }
  328. )
  329. for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]:
  330. result = df.set_index(cols).reset_index()
  331. result = result.reindex(columns=df.columns)
  332. tm.assert_frame_equal(result, df)
  333. def test_set_index_datetime(self):
  334. # GH#3950
  335. df = DataFrame(
  336. {
  337. "label": ["a", "a", "a", "b", "b", "b"],
  338. "datetime": [
  339. "2011-07-19 07:00:00",
  340. "2011-07-19 08:00:00",
  341. "2011-07-19 09:00:00",
  342. "2011-07-19 07:00:00",
  343. "2011-07-19 08:00:00",
  344. "2011-07-19 09:00:00",
  345. ],
  346. "value": range(6),
  347. }
  348. )
  349. df.index = to_datetime(df.pop("datetime"), utc=True)
  350. df.index = df.index.tz_convert("US/Pacific")
  351. expected = DatetimeIndex(
  352. ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
  353. name="datetime",
  354. )
  355. expected = expected.tz_localize("UTC").tz_convert("US/Pacific")
  356. df = df.set_index("label", append=True)
  357. tm.assert_index_equal(df.index.levels[0], expected)
  358. tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label"))
  359. assert df.index.names == ["datetime", "label"]
  360. df = df.swaplevel(0, 1)
  361. tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label"))
  362. tm.assert_index_equal(df.index.levels[1], expected)
  363. assert df.index.names == ["label", "datetime"]
  364. df = DataFrame(np.random.random(6))
  365. idx1 = DatetimeIndex(
  366. [
  367. "2011-07-19 07:00:00",
  368. "2011-07-19 08:00:00",
  369. "2011-07-19 09:00:00",
  370. "2011-07-19 07:00:00",
  371. "2011-07-19 08:00:00",
  372. "2011-07-19 09:00:00",
  373. ],
  374. tz="US/Eastern",
  375. )
  376. idx2 = DatetimeIndex(
  377. [
  378. "2012-04-01 09:00",
  379. "2012-04-01 09:00",
  380. "2012-04-01 09:00",
  381. "2012-04-02 09:00",
  382. "2012-04-02 09:00",
  383. "2012-04-02 09:00",
  384. ],
  385. tz="US/Eastern",
  386. )
  387. idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
  388. idx3 = idx3._with_freq(None)
  389. df = df.set_index(idx1)
  390. df = df.set_index(idx2, append=True)
  391. df = df.set_index(idx3, append=True)
  392. expected1 = DatetimeIndex(
  393. ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
  394. tz="US/Eastern",
  395. )
  396. expected2 = DatetimeIndex(
  397. ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern"
  398. )
  399. tm.assert_index_equal(df.index.levels[0], expected1)
  400. tm.assert_index_equal(df.index.levels[1], expected2)
  401. tm.assert_index_equal(df.index.levels[2], idx3)
  402. # GH#7092
  403. tm.assert_index_equal(df.index.get_level_values(0), idx1)
  404. tm.assert_index_equal(df.index.get_level_values(1), idx2)
  405. tm.assert_index_equal(df.index.get_level_values(2), idx3)
  406. def test_set_index_period(self):
  407. # GH#6631
  408. df = DataFrame(np.random.random(6))
  409. idx1 = period_range("2011-01-01", periods=3, freq="M")
  410. idx1 = idx1.append(idx1)
  411. idx2 = period_range("2013-01-01 09:00", periods=2, freq="H")
  412. idx2 = idx2.append(idx2).append(idx2)
  413. idx3 = period_range("2005", periods=6, freq="A")
  414. df = df.set_index(idx1)
  415. df = df.set_index(idx2, append=True)
  416. df = df.set_index(idx3, append=True)
  417. expected1 = period_range("2011-01-01", periods=3, freq="M")
  418. expected2 = period_range("2013-01-01 09:00", periods=2, freq="H")
  419. tm.assert_index_equal(df.index.levels[0], expected1)
  420. tm.assert_index_equal(df.index.levels[1], expected2)
  421. tm.assert_index_equal(df.index.levels[2], idx3)
  422. tm.assert_index_equal(df.index.get_level_values(0), idx1)
  423. tm.assert_index_equal(df.index.get_level_values(1), idx2)
  424. tm.assert_index_equal(df.index.get_level_values(2), idx3)
  425. class TestSetIndexInvalid:
  426. def test_set_index_verify_integrity(self, frame_of_index_cols):
  427. df = frame_of_index_cols
  428. with pytest.raises(ValueError, match="Index has duplicate keys"):
  429. df.set_index("A", verify_integrity=True)
  430. # with MultiIndex
  431. with pytest.raises(ValueError, match="Index has duplicate keys"):
  432. df.set_index([df["A"], df["A"]], verify_integrity=True)
  433. @pytest.mark.parametrize("append", [True, False])
  434. @pytest.mark.parametrize("drop", [True, False])
  435. def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
  436. df = frame_of_index_cols
  437. with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):
  438. # column names are A-E, as well as one tuple
  439. df.set_index(["foo", "bar", "baz"], drop=drop, append=append)
  440. # non-existent key in list with arrays
  441. with pytest.raises(KeyError, match="X"):
  442. df.set_index([df["A"], df["B"], "X"], drop=drop, append=append)
  443. msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"
  444. # tuples always raise KeyError
  445. with pytest.raises(KeyError, match=msg):
  446. df.set_index(tuple(df["A"]), drop=drop, append=append)
  447. # also within a list
  448. with pytest.raises(KeyError, match=msg):
  449. df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append)
  450. @pytest.mark.parametrize("append", [True, False])
  451. @pytest.mark.parametrize("drop", [True, False])
  452. @pytest.mark.parametrize("box", [set], ids=["set"])
  453. def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append):
  454. df = frame_of_index_cols
  455. msg = 'The parameter "keys" may be a column key, .*'
  456. # forbidden type, e.g. set
  457. with pytest.raises(TypeError, match=msg):
  458. df.set_index(box(df["A"]), drop=drop, append=append)
  459. # forbidden type in list, e.g. set
  460. with pytest.raises(TypeError, match=msg):
  461. df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append)
  462. # MultiIndex constructor does not work directly on Series -> lambda
  463. @pytest.mark.parametrize(
  464. "box",
  465. [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])],
  466. ids=["Series", "Index", "np.array", "iter", "MultiIndex"],
  467. )
  468. @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"])
  469. @pytest.mark.parametrize("append", [True, False])
  470. @pytest.mark.parametrize("drop", [True, False])
  471. def test_set_index_raise_on_len(
  472. self, frame_of_index_cols, box, length, drop, append
  473. ):
  474. # GH 24984
  475. df = frame_of_index_cols # has length 5
  476. values = np.random.randint(0, 10, (length,))
  477. msg = "Length mismatch: Expected 5 rows, received array of length.*"
  478. # wrong length directly
  479. with pytest.raises(ValueError, match=msg):
  480. df.set_index(box(values), drop=drop, append=append)
  481. # wrong length in list
  482. with pytest.raises(ValueError, match=msg):
  483. df.set_index(["A", df.A, box(values)], drop=drop, append=append)
  484. class TestSetIndexCustomLabelType:
  485. def test_set_index_custom_label_type(self):
  486. # GH#24969
  487. class Thing:
  488. def __init__(self, name, color) -> None:
  489. self.name = name
  490. self.color = color
  491. def __str__(self) -> str:
  492. return f"<Thing {repr(self.name)}>"
  493. # necessary for pretty KeyError
  494. __repr__ = __str__
  495. thing1 = Thing("One", "red")
  496. thing2 = Thing("Two", "blue")
  497. df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
  498. expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))
  499. # use custom label directly
  500. result = df.set_index(thing2)
  501. tm.assert_frame_equal(result, expected)
  502. # custom label wrapped in list
  503. result = df.set_index([thing2])
  504. tm.assert_frame_equal(result, expected)
  505. # missing key
  506. thing3 = Thing("Three", "pink")
  507. msg = "<Thing 'Three'>"
  508. with pytest.raises(KeyError, match=msg):
  509. # missing label directly
  510. df.set_index(thing3)
  511. with pytest.raises(KeyError, match=msg):
  512. # missing label in list
  513. df.set_index([thing3])
  514. def test_set_index_custom_label_hashable_iterable(self):
  515. # GH#24969
  516. # actual example discussed in GH 24984 was e.g. for shapely.geometry
  517. # objects (e.g. a collection of Points) that can be both hashable and
  518. # iterable; using frozenset as a stand-in for testing here
  519. class Thing(frozenset):
  520. # need to stabilize repr for KeyError (due to random order in sets)
  521. def __repr__(self) -> str:
  522. tmp = sorted(self)
  523. joined_reprs = ", ".join(map(repr, tmp))
  524. # double curly brace prints one brace in format string
  525. return f"frozenset({{{joined_reprs}}})"
  526. thing1 = Thing(["One", "red"])
  527. thing2 = Thing(["Two", "blue"])
  528. df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
  529. expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))
  530. # use custom label directly
  531. result = df.set_index(thing2)
  532. tm.assert_frame_equal(result, expected)
  533. # custom label wrapped in list
  534. result = df.set_index([thing2])
  535. tm.assert_frame_equal(result, expected)
  536. # missing key
  537. thing3 = Thing(["Three", "pink"])
  538. msg = r"frozenset\(\{'Three', 'pink'\}\)"
  539. with pytest.raises(KeyError, match=msg):
  540. # missing label directly
  541. df.set_index(thing3)
  542. with pytest.raises(KeyError, match=msg):
  543. # missing label in list
  544. df.set_index([thing3])
  545. def test_set_index_custom_label_type_raises(self):
  546. # GH#24969
  547. # purposefully inherit from something unhashable
  548. class Thing(set):
  549. def __init__(self, name, color) -> None:
  550. self.name = name
  551. self.color = color
  552. def __str__(self) -> str:
  553. return f"<Thing {repr(self.name)}>"
  554. thing1 = Thing("One", "red")
  555. thing2 = Thing("Two", "blue")
  556. df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])
  557. msg = 'The parameter "keys" may be a column key, .*'
  558. with pytest.raises(TypeError, match=msg):
  559. # use custom label directly
  560. df.set_index(thing2)
  561. with pytest.raises(TypeError, match=msg):
  562. # custom label wrapped in list
  563. df.set_index([thing2])
  564. def test_set_index_periodindex(self):
  565. # GH#6631
  566. df = DataFrame(np.random.random(6))
  567. idx1 = period_range("2011/01/01", periods=6, freq="M")
  568. idx2 = period_range("2013", periods=6, freq="A")
  569. df = df.set_index(idx1)
  570. tm.assert_index_equal(df.index, idx1)
  571. df = df.set_index(idx2)
  572. tm.assert_index_equal(df.index, idx2)