test_nonunique_indexes.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Series,
  7. date_range,
  8. )
  9. import pandas._testing as tm
  10. def check(result, expected=None):
  11. if expected is not None:
  12. tm.assert_frame_equal(result, expected)
  13. result.dtypes
  14. str(result)
  15. class TestDataFrameNonuniqueIndexes:
  16. def test_setattr_columns_vs_construct_with_columns(self):
  17. # assignment
  18. # GH 3687
  19. arr = np.random.randn(3, 2)
  20. idx = list(range(2))
  21. df = DataFrame(arr, columns=["A", "A"])
  22. df.columns = idx
  23. expected = DataFrame(arr, columns=idx)
  24. check(df, expected)
  25. def test_setattr_columns_vs_construct_with_columns_datetimeindx(self):
  26. idx = date_range("20130101", periods=4, freq="Q-NOV")
  27. df = DataFrame(
  28. [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
  29. )
  30. df.columns = idx
  31. expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
  32. check(df, expected)
  33. def test_insert_with_duplicate_columns(self):
  34. # insert
  35. df = DataFrame(
  36. [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
  37. columns=["foo", "bar", "foo", "hello"],
  38. )
  39. df["string"] = "bah"
  40. expected = DataFrame(
  41. [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
  42. columns=["foo", "bar", "foo", "hello", "string"],
  43. )
  44. check(df, expected)
  45. with pytest.raises(ValueError, match="Length of value"):
  46. df.insert(0, "AnotherColumn", range(len(df.index) - 1))
  47. # insert same dtype
  48. df["foo2"] = 3
  49. expected = DataFrame(
  50. [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
  51. columns=["foo", "bar", "foo", "hello", "string", "foo2"],
  52. )
  53. check(df, expected)
  54. # set (non-dup)
  55. df["foo2"] = 4
  56. expected = DataFrame(
  57. [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
  58. columns=["foo", "bar", "foo", "hello", "string", "foo2"],
  59. )
  60. check(df, expected)
  61. df["foo2"] = 3
  62. # delete (non dup)
  63. del df["bar"]
  64. expected = DataFrame(
  65. [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
  66. columns=["foo", "foo", "hello", "string", "foo2"],
  67. )
  68. check(df, expected)
  69. # try to delete again (its not consolidated)
  70. del df["hello"]
  71. expected = DataFrame(
  72. [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
  73. columns=["foo", "foo", "string", "foo2"],
  74. )
  75. check(df, expected)
  76. # consolidate
  77. df = df._consolidate()
  78. expected = DataFrame(
  79. [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
  80. columns=["foo", "foo", "string", "foo2"],
  81. )
  82. check(df, expected)
  83. # insert
  84. df.insert(2, "new_col", 5.0)
  85. expected = DataFrame(
  86. [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
  87. columns=["foo", "foo", "new_col", "string", "foo2"],
  88. )
  89. check(df, expected)
  90. # insert a dup
  91. with pytest.raises(ValueError, match="cannot insert"):
  92. df.insert(2, "new_col", 4.0)
  93. df.insert(2, "new_col", 4.0, allow_duplicates=True)
  94. expected = DataFrame(
  95. [
  96. [1, 1, 4.0, 5.0, "bah", 3],
  97. [1, 2, 4.0, 5.0, "bah", 3],
  98. [2, 3, 4.0, 5.0, "bah", 3],
  99. ],
  100. columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
  101. )
  102. check(df, expected)
  103. # delete (dup)
  104. del df["foo"]
  105. expected = DataFrame(
  106. [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
  107. columns=["new_col", "new_col", "string", "foo2"],
  108. )
  109. tm.assert_frame_equal(df, expected)
  110. def test_dup_across_dtypes(self):
  111. # dup across dtypes
  112. df = DataFrame(
  113. [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
  114. columns=["foo", "bar", "foo", "hello"],
  115. )
  116. check(df)
  117. df["foo2"] = 7.0
  118. expected = DataFrame(
  119. [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
  120. columns=["foo", "bar", "foo", "hello", "foo2"],
  121. )
  122. check(df, expected)
  123. result = df["foo"]
  124. expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
  125. check(result, expected)
  126. # multiple replacements
  127. df["foo"] = "string"
  128. expected = DataFrame(
  129. [
  130. ["string", 1, "string", 5, 7.0],
  131. ["string", 1, "string", 5, 7.0],
  132. ["string", 1, "string", 5, 7.0],
  133. ],
  134. columns=["foo", "bar", "foo", "hello", "foo2"],
  135. )
  136. check(df, expected)
  137. del df["foo"]
  138. expected = DataFrame(
  139. [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
  140. )
  141. check(df, expected)
  142. def test_column_dups_indexes(self):
  143. # check column dups with index equal and not equal to df's index
  144. df = DataFrame(
  145. np.random.randn(5, 3),
  146. index=["a", "b", "c", "d", "e"],
  147. columns=["A", "B", "A"],
  148. )
  149. for index in [df.index, pd.Index(list("edcba"))]:
  150. this_df = df.copy()
  151. expected_ser = Series(index.values, index=this_df.index)
  152. expected_df = DataFrame(
  153. {"A": expected_ser, "B": this_df["B"]},
  154. columns=["A", "B", "A"],
  155. )
  156. this_df["A"] = index
  157. check(this_df, expected_df)
  158. def test_changing_dtypes_with_duplicate_columns(self):
  159. # multiple assignments that change dtypes
  160. # the location indexer is a slice
  161. # GH 6120
  162. df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
  163. expected = DataFrame(1.0, index=range(5), columns=["that", "that"])
  164. df["that"] = 1.0
  165. check(df, expected)
  166. df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
  167. expected = DataFrame(1, index=range(5), columns=["that", "that"])
  168. df["that"] = 1
  169. check(df, expected)
  170. def test_dup_columns_comparisons(self):
  171. # equality
  172. df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
  173. df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])
  174. # not-comparing like-labelled
  175. msg = (
  176. r"Can only compare identically-labeled \(both index and columns\) "
  177. "DataFrame objects"
  178. )
  179. with pytest.raises(ValueError, match=msg):
  180. df1 == df2
  181. df1r = df1.reindex_like(df2)
  182. result = df1r == df2
  183. expected = DataFrame(
  184. [[False, True], [True, False], [False, False], [True, False]],
  185. columns=["A", "A"],
  186. )
  187. tm.assert_frame_equal(result, expected)
  188. def test_mixed_column_selection(self):
  189. # mixed column selection
  190. # GH 5639
  191. dfbool = DataFrame(
  192. {
  193. "one": Series([True, True, False], index=["a", "b", "c"]),
  194. "two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
  195. "three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
  196. }
  197. )
  198. expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
  199. result = dfbool[["one", "three", "one"]]
  200. check(result, expected)
  201. def test_multi_axis_dups(self):
  202. # multi-axis dups
  203. # GH 6121
  204. df = DataFrame(
  205. np.arange(25.0).reshape(5, 5),
  206. index=["a", "b", "c", "d", "e"],
  207. columns=["A", "B", "C", "D", "E"],
  208. )
  209. z = df[["A", "C", "A"]].copy()
  210. expected = z.loc[["a", "c", "a"]]
  211. df = DataFrame(
  212. np.arange(25.0).reshape(5, 5),
  213. index=["a", "b", "c", "d", "e"],
  214. columns=["A", "B", "C", "D", "E"],
  215. )
  216. z = df[["A", "C", "A"]]
  217. result = z.loc[["a", "c", "a"]]
  218. check(result, expected)
  219. def test_columns_with_dups(self):
  220. # GH 3468 related
  221. # basic
  222. df = DataFrame([[1, 2]], columns=["a", "a"])
  223. df.columns = ["a", "a.1"]
  224. str(df)
  225. expected = DataFrame([[1, 2]], columns=["a", "a.1"])
  226. tm.assert_frame_equal(df, expected)
  227. df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"])
  228. df.columns = ["b", "a", "a.1"]
  229. str(df)
  230. expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"])
  231. tm.assert_frame_equal(df, expected)
  232. def test_columns_with_dup_index(self):
  233. # with a dup index
  234. df = DataFrame([[1, 2]], columns=["a", "a"])
  235. df.columns = ["b", "b"]
  236. str(df)
  237. expected = DataFrame([[1, 2]], columns=["b", "b"])
  238. tm.assert_frame_equal(df, expected)
  239. def test_multi_dtype(self):
  240. # multi-dtype
  241. df = DataFrame(
  242. [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
  243. columns=["a", "a", "b", "b", "d", "c", "c"],
  244. )
  245. df.columns = list("ABCDEFG")
  246. str(df)
  247. expected = DataFrame(
  248. [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG")
  249. )
  250. tm.assert_frame_equal(df, expected)
  251. def test_multi_dtype2(self):
  252. df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
  253. df.columns = ["a", "a.1", "a.2", "a.3"]
  254. str(df)
  255. expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
  256. tm.assert_frame_equal(df, expected)
  257. def test_dups_across_blocks(self, using_array_manager):
  258. # dups across blocks
  259. df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
  260. df_int = DataFrame(np.random.randn(10, 3).astype("int64"))
  261. df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
  262. df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
  263. df_dt = DataFrame(
  264. pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns
  265. )
  266. df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
  267. if not using_array_manager:
  268. assert len(df._mgr.blknos) == len(df.columns)
  269. assert len(df._mgr.blklocs) == len(df.columns)
  270. # testing iloc
  271. for i in range(len(df.columns)):
  272. df.iloc[:, i]
  273. def test_dup_columns_across_dtype(self):
  274. # dup columns across dtype GH 2079/2194
  275. vals = [[1, -1, 2.0], [2, -2, 3.0]]
  276. rs = DataFrame(vals, columns=["A", "A", "B"])
  277. xp = DataFrame(vals)
  278. xp.columns = ["A", "A", "B"]
  279. tm.assert_frame_equal(rs, xp)
  280. def test_set_value_by_index(self):
  281. # See gh-12344
  282. warn = None
  283. msg = "will attempt to set the values inplace"
  284. df = DataFrame(np.arange(9).reshape(3, 3).T)
  285. df.columns = list("AAA")
  286. expected = df.iloc[:, 2]
  287. with tm.assert_produces_warning(warn, match=msg):
  288. df.iloc[:, 0] = 3
  289. tm.assert_series_equal(df.iloc[:, 2], expected)
  290. df = DataFrame(np.arange(9).reshape(3, 3).T)
  291. df.columns = [2, float(2), str(2)]
  292. expected = df.iloc[:, 1]
  293. with tm.assert_produces_warning(warn, match=msg):
  294. df.iloc[:, 0] = 3
  295. tm.assert_series_equal(df.iloc[:, 1], expected)