# test_duplicate_labels.py (16 KB)

"""Tests dealing with the NDFrame.flags.allows_duplicate_labels flag."""
  2. import operator
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. import pandas._testing as tm
# Shared xfail marker for tests whose expected behavior is not implemented yet.
# It is used in this module both as a decorator and through
# ``pytest.param(..., marks=not_implemented)``.
not_implemented = pytest.mark.xfail(reason="Not implemented.")
  8. # ----------------------------------------------------------------------------
  9. # Preservation
  10. class TestPreserves:
  11. @pytest.mark.parametrize(
  12. "cls, data",
  13. [
  14. (pd.Series, np.array([])),
  15. (pd.Series, [1, 2]),
  16. (pd.DataFrame, {}),
  17. (pd.DataFrame, {"A": [1, 2]}),
  18. ],
  19. )
  20. def test_construction_ok(self, cls, data):
  21. result = cls(data)
  22. assert result.flags.allows_duplicate_labels is True
  23. result = cls(data).set_flags(allows_duplicate_labels=False)
  24. assert result.flags.allows_duplicate_labels is False
  25. @pytest.mark.parametrize(
  26. "func",
  27. [
  28. operator.itemgetter(["a"]),
  29. operator.methodcaller("add", 1),
  30. operator.methodcaller("rename", str.upper),
  31. operator.methodcaller("rename", "name"),
  32. operator.methodcaller("abs"),
  33. np.abs,
  34. ],
  35. )
  36. def test_preserved_series(self, func):
  37. s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
  38. assert func(s).flags.allows_duplicate_labels is False
  39. @pytest.mark.parametrize(
  40. "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])]
  41. )
  42. # TODO: frame
  43. @not_implemented
  44. def test_align(self, other):
  45. s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
  46. a, b = s.align(other)
  47. assert a.flags.allows_duplicate_labels is False
  48. assert b.flags.allows_duplicate_labels is False
  49. def test_preserved_frame(self):
  50. df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
  51. allows_duplicate_labels=False
  52. )
  53. assert df.loc[["a"]].flags.allows_duplicate_labels is False
  54. assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False
  55. def test_to_frame(self):
  56. ser = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False)
  57. assert ser.to_frame().flags.allows_duplicate_labels is False
  58. @pytest.mark.parametrize("func", ["add", "sub"])
  59. @pytest.mark.parametrize("frame", [False, True])
  60. @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")])
  61. def test_binops(self, func, other, frame):
  62. df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags(
  63. allows_duplicate_labels=False
  64. )
  65. if frame:
  66. df = df.to_frame()
  67. if isinstance(other, pd.Series) and frame:
  68. other = other.to_frame()
  69. func = operator.methodcaller(func, other)
  70. assert df.flags.allows_duplicate_labels is False
  71. assert func(df).flags.allows_duplicate_labels is False
  72. def test_preserve_getitem(self):
  73. df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
  74. assert df[["A"]].flags.allows_duplicate_labels is False
  75. assert df["A"].flags.allows_duplicate_labels is False
  76. assert df.loc[0].flags.allows_duplicate_labels is False
  77. assert df.loc[[0]].flags.allows_duplicate_labels is False
  78. assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False
  79. def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write):
  80. if not using_copy_on_write:
  81. request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior."))
  82. # NDFrame.__getitem__ will cache the first df['A']. May need to
  83. # invalidate that cache? Update the cached entries?
  84. df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
  85. assert df["A"].flags.allows_duplicate_labels is False
  86. df.flags.allows_duplicate_labels = True
  87. assert df["A"].flags.allows_duplicate_labels is True
  88. @pytest.mark.parametrize(
  89. "objs, kwargs",
  90. [
  91. # Series
  92. (
  93. [
  94. pd.Series(1, index=["a", "b"]).set_flags(
  95. allows_duplicate_labels=False
  96. ),
  97. pd.Series(2, index=["c", "d"]).set_flags(
  98. allows_duplicate_labels=False
  99. ),
  100. ],
  101. {},
  102. ),
  103. (
  104. [
  105. pd.Series(1, index=["a", "b"]).set_flags(
  106. allows_duplicate_labels=False
  107. ),
  108. pd.Series(2, index=["a", "b"]).set_flags(
  109. allows_duplicate_labels=False
  110. ),
  111. ],
  112. {"ignore_index": True},
  113. ),
  114. (
  115. [
  116. pd.Series(1, index=["a", "b"]).set_flags(
  117. allows_duplicate_labels=False
  118. ),
  119. pd.Series(2, index=["a", "b"]).set_flags(
  120. allows_duplicate_labels=False
  121. ),
  122. ],
  123. {"axis": 1},
  124. ),
  125. # Frame
  126. (
  127. [
  128. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
  129. allows_duplicate_labels=False
  130. ),
  131. pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags(
  132. allows_duplicate_labels=False
  133. ),
  134. ],
  135. {},
  136. ),
  137. (
  138. [
  139. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
  140. allows_duplicate_labels=False
  141. ),
  142. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
  143. allows_duplicate_labels=False
  144. ),
  145. ],
  146. {"ignore_index": True},
  147. ),
  148. (
  149. [
  150. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
  151. allows_duplicate_labels=False
  152. ),
  153. pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags(
  154. allows_duplicate_labels=False
  155. ),
  156. ],
  157. {"axis": 1},
  158. ),
  159. # Series / Frame
  160. (
  161. [
  162. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
  163. allows_duplicate_labels=False
  164. ),
  165. pd.Series(
  166. [1, 2],
  167. index=["a", "b"],
  168. name="B",
  169. ).set_flags(
  170. allows_duplicate_labels=False,
  171. ),
  172. ],
  173. {"axis": 1},
  174. ),
  175. ],
  176. )
  177. def test_concat(self, objs, kwargs):
  178. result = pd.concat(objs, **kwargs)
  179. assert result.flags.allows_duplicate_labels is False
  180. @pytest.mark.parametrize(
  181. "left, right, kwargs, expected",
  182. [
  183. # false false false
  184. pytest.param(
  185. pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
  186. allows_duplicate_labels=False
  187. ),
  188. pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags(
  189. allows_duplicate_labels=False
  190. ),
  191. {"left_index": True, "right_index": True},
  192. False,
  193. marks=not_implemented,
  194. ),
  195. # false true false
  196. pytest.param(
  197. pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
  198. allows_duplicate_labels=False
  199. ),
  200. pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
  201. {"left_index": True, "right_index": True},
  202. False,
  203. marks=not_implemented,
  204. ),
  205. # true true true
  206. (
  207. pd.DataFrame({"A": [0, 1]}, index=["a", "b"]),
  208. pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
  209. {"left_index": True, "right_index": True},
  210. True,
  211. ),
  212. ],
  213. )
  214. def test_merge(self, left, right, kwargs, expected):
  215. result = pd.merge(left, right, **kwargs)
  216. assert result.flags.allows_duplicate_labels is expected
  217. @not_implemented
  218. def test_groupby(self):
  219. # XXX: This is under tested
  220. # TODO:
  221. # - apply
  222. # - transform
  223. # - Should passing a grouper that disallows duplicates propagate?
  224. df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False)
  225. result = df.groupby([0, 0, 1]).agg("count")
  226. assert result.flags.allows_duplicate_labels is False
  227. @pytest.mark.parametrize("frame", [True, False])
  228. @not_implemented
  229. def test_window(self, frame):
  230. df = pd.Series(
  231. 1,
  232. index=pd.date_range("2000", periods=12),
  233. name="A",
  234. allows_duplicate_labels=False,
  235. )
  236. if frame:
  237. df = df.to_frame()
  238. assert df.rolling(3).mean().flags.allows_duplicate_labels is False
  239. assert df.ewm(3).mean().flags.allows_duplicate_labels is False
  240. assert df.expanding(3).mean().flags.allows_duplicate_labels is False
  241. # ----------------------------------------------------------------------------
  242. # Raises
  243. class TestRaises:
  244. @pytest.mark.parametrize(
  245. "cls, axes",
  246. [
  247. (pd.Series, {"index": ["a", "a"], "dtype": float}),
  248. (pd.DataFrame, {"index": ["a", "a"]}),
  249. (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}),
  250. (pd.DataFrame, {"columns": ["b", "b"]}),
  251. ],
  252. )
  253. def test_set_flags_with_duplicates(self, cls, axes):
  254. result = cls(**axes)
  255. assert result.flags.allows_duplicate_labels is True
  256. msg = "Index has duplicates."
  257. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  258. cls(**axes).set_flags(allows_duplicate_labels=False)
  259. @pytest.mark.parametrize(
  260. "data",
  261. [
  262. pd.Series(index=[0, 0], dtype=float),
  263. pd.DataFrame(index=[0, 0]),
  264. pd.DataFrame(columns=[0, 0]),
  265. ],
  266. )
  267. def test_setting_allows_duplicate_labels_raises(self, data):
  268. msg = "Index has duplicates."
  269. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  270. data.flags.allows_duplicate_labels = False
  271. assert data.flags.allows_duplicate_labels is True
  272. def test_series_raises(self):
  273. a = pd.Series(0, index=["a", "b"])
  274. b = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
  275. msg = "Index has duplicates."
  276. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  277. pd.concat([a, b])
  278. @pytest.mark.parametrize(
  279. "getter, target",
  280. [
  281. (operator.itemgetter(["A", "A"]), None),
  282. # loc
  283. (operator.itemgetter(["a", "a"]), "loc"),
  284. pytest.param(operator.itemgetter(("a", ["A", "A"])), "loc"),
  285. (operator.itemgetter((["a", "a"], "A")), "loc"),
  286. # iloc
  287. (operator.itemgetter([0, 0]), "iloc"),
  288. pytest.param(operator.itemgetter((0, [0, 0])), "iloc"),
  289. pytest.param(operator.itemgetter(([0, 0], 0)), "iloc"),
  290. ],
  291. )
  292. def test_getitem_raises(self, getter, target):
  293. df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
  294. allows_duplicate_labels=False
  295. )
  296. if target:
  297. # df, df.loc, or df.iloc
  298. target = getattr(df, target)
  299. else:
  300. target = df
  301. msg = "Index has duplicates."
  302. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  303. getter(target)
  304. @pytest.mark.parametrize(
  305. "objs, kwargs",
  306. [
  307. (
  308. [
  309. pd.Series(1, index=[0, 1], name="a").set_flags(
  310. allows_duplicate_labels=False
  311. ),
  312. pd.Series(2, index=[0, 1], name="a").set_flags(
  313. allows_duplicate_labels=False
  314. ),
  315. ],
  316. {"axis": 1},
  317. )
  318. ],
  319. )
  320. def test_concat_raises(self, objs, kwargs):
  321. msg = "Index has duplicates."
  322. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  323. pd.concat(objs, **kwargs)
  324. @not_implemented
  325. def test_merge_raises(self):
  326. a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags(
  327. allows_duplicate_labels=False
  328. )
  329. b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"])
  330. msg = "Index has duplicates."
  331. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  332. pd.merge(a, b, left_index=True, right_index=True)
  333. @pytest.mark.parametrize(
  334. "idx",
  335. [
  336. pd.Index([1, 1]),
  337. pd.Index(["a", "a"]),
  338. pd.Index([1.1, 1.1]),
  339. pd.PeriodIndex([pd.Period("2000", "D")] * 2),
  340. pd.DatetimeIndex([pd.Timestamp("2000")] * 2),
  341. pd.TimedeltaIndex([pd.Timedelta("1D")] * 2),
  342. pd.CategoricalIndex(["a", "a"]),
  343. pd.IntervalIndex([pd.Interval(0, 1)] * 2),
  344. pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]),
  345. ],
  346. ids=lambda x: type(x).__name__,
  347. )
  348. def test_raises_basic(idx):
  349. msg = "Index has duplicates."
  350. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  351. pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False)
  352. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  353. pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False)
  354. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  355. pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False)
  356. def test_format_duplicate_labels_message():
  357. idx = pd.Index(["a", "b", "a", "b", "c"])
  358. result = idx._format_duplicate_message()
  359. expected = pd.DataFrame(
  360. {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label")
  361. )
  362. tm.assert_frame_equal(result, expected)
  363. def test_format_duplicate_labels_message_multi():
  364. idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]])
  365. result = idx._format_duplicate_message()
  366. expected = pd.DataFrame(
  367. {"positions": [[0, 2], [1, 3]]},
  368. index=pd.MultiIndex.from_product([["A"], ["a", "b"]]),
  369. )
  370. tm.assert_frame_equal(result, expected)
  371. def test_dataframe_insert_raises():
  372. df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
  373. msg = "Cannot specify"
  374. with pytest.raises(ValueError, match=msg):
  375. df.insert(0, "A", [3, 4], allow_duplicates=True)
  376. @pytest.mark.parametrize(
  377. "method, frame_only",
  378. [
  379. (operator.methodcaller("set_index", "A", inplace=True), True),
  380. (operator.methodcaller("reset_index", inplace=True), True),
  381. (operator.methodcaller("rename", lambda x: x, inplace=True), False),
  382. ],
  383. )
  384. def test_inplace_raises(method, frame_only):
  385. df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags(
  386. allows_duplicate_labels=False
  387. )
  388. s = df["A"]
  389. s.flags.allows_duplicate_labels = False
  390. msg = "Cannot specify"
  391. with pytest.raises(ValueError, match=msg):
  392. method(df)
  393. if not frame_only:
  394. with pytest.raises(ValueError, match=msg):
  395. method(s)
  396. def test_pickle():
  397. a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False)
  398. b = tm.round_trip_pickle(a)
  399. tm.assert_series_equal(a, b)
  400. a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False)
  401. b = tm.round_trip_pickle(a)
  402. tm.assert_frame_equal(a, b)