test_index.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. """
  2. Tests that work on both the Python and C engines but do not have a
  3. specific classification into the other test modules.
  4. """
  5. from datetime import datetime
  6. from io import StringIO
  7. import os
  8. import pytest
  9. from pandas import (
  10. DataFrame,
  11. Index,
  12. MultiIndex,
  13. )
  14. import pandas._testing as tm
  15. xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
  16. # GH#43650: Some expected failures with the pyarrow engine can occasionally
  17. # cause a deadlock instead, so we skip these instead of xfailing
  18. skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  19. @pytest.mark.parametrize(
  20. "data,kwargs,expected",
  21. [
  22. (
  23. """foo,2,3,4,5
  24. bar,7,8,9,10
  25. baz,12,13,14,15
  26. qux,12,13,14,15
  27. foo2,12,13,14,15
  28. bar2,12,13,14,15
  29. """,
  30. {"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
  31. DataFrame(
  32. [
  33. [2, 3, 4, 5],
  34. [7, 8, 9, 10],
  35. [12, 13, 14, 15],
  36. [12, 13, 14, 15],
  37. [12, 13, 14, 15],
  38. [12, 13, 14, 15],
  39. ],
  40. index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
  41. columns=["A", "B", "C", "D"],
  42. ),
  43. ),
  44. (
  45. """foo,one,2,3,4,5
  46. foo,two,7,8,9,10
  47. foo,three,12,13,14,15
  48. bar,one,12,13,14,15
  49. bar,two,12,13,14,15
  50. """,
  51. {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
  52. DataFrame(
  53. [
  54. [2, 3, 4, 5],
  55. [7, 8, 9, 10],
  56. [12, 13, 14, 15],
  57. [12, 13, 14, 15],
  58. [12, 13, 14, 15],
  59. ],
  60. index=MultiIndex.from_tuples(
  61. [
  62. ("foo", "one"),
  63. ("foo", "two"),
  64. ("foo", "three"),
  65. ("bar", "one"),
  66. ("bar", "two"),
  67. ],
  68. names=["index1", "index2"],
  69. ),
  70. columns=["A", "B", "C", "D"],
  71. ),
  72. ),
  73. ],
  74. )
  75. def test_pass_names_with_index(all_parsers, data, kwargs, expected):
  76. parser = all_parsers
  77. result = parser.read_csv(StringIO(data), **kwargs)
  78. tm.assert_frame_equal(result, expected)
  79. @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
  80. def test_multi_index_no_level_names(all_parsers, index_col):
  81. data = """index1,index2,A,B,C,D
  82. foo,one,2,3,4,5
  83. foo,two,7,8,9,10
  84. foo,three,12,13,14,15
  85. bar,one,12,13,14,15
  86. bar,two,12,13,14,15
  87. """
  88. headless_data = "\n".join(data.split("\n")[1:])
  89. names = ["A", "B", "C", "D"]
  90. parser = all_parsers
  91. result = parser.read_csv(
  92. StringIO(headless_data), index_col=index_col, header=None, names=names
  93. )
  94. expected = parser.read_csv(StringIO(data), index_col=index_col)
  95. # No index names in headless data.
  96. expected.index.names = [None] * 2
  97. tm.assert_frame_equal(result, expected)
  98. @xfail_pyarrow
  99. def test_multi_index_no_level_names_implicit(all_parsers):
  100. parser = all_parsers
  101. data = """A,B,C,D
  102. foo,one,2,3,4,5
  103. foo,two,7,8,9,10
  104. foo,three,12,13,14,15
  105. bar,one,12,13,14,15
  106. bar,two,12,13,14,15
  107. """
  108. result = parser.read_csv(StringIO(data))
  109. expected = DataFrame(
  110. [
  111. [2, 3, 4, 5],
  112. [7, 8, 9, 10],
  113. [12, 13, 14, 15],
  114. [12, 13, 14, 15],
  115. [12, 13, 14, 15],
  116. ],
  117. columns=["A", "B", "C", "D"],
  118. index=MultiIndex.from_tuples(
  119. [
  120. ("foo", "one"),
  121. ("foo", "two"),
  122. ("foo", "three"),
  123. ("bar", "one"),
  124. ("bar", "two"),
  125. ]
  126. ),
  127. )
  128. tm.assert_frame_equal(result, expected)
  129. @xfail_pyarrow
  130. @pytest.mark.parametrize(
  131. "data,expected,header",
  132. [
  133. ("a,b", DataFrame(columns=["a", "b"]), [0]),
  134. (
  135. "a,b\nc,d",
  136. DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
  137. [0, 1],
  138. ),
  139. ],
  140. )
  141. @pytest.mark.parametrize("round_trip", [True, False])
  142. def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
  143. # see gh-14545
  144. parser = all_parsers
  145. data = expected.to_csv(index=False) if round_trip else data
  146. result = parser.read_csv(StringIO(data), header=header)
  147. tm.assert_frame_equal(result, expected)
  148. @xfail_pyarrow
  149. def test_no_unnamed_index(all_parsers):
  150. parser = all_parsers
  151. data = """ id c0 c1 c2
  152. 0 1 0 a b
  153. 1 2 0 c d
  154. 2 2 2 e f
  155. """
  156. result = parser.read_csv(StringIO(data), sep=" ")
  157. expected = DataFrame(
  158. [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
  159. columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
  160. )
  161. tm.assert_frame_equal(result, expected)
  162. def test_read_duplicate_index_explicit(all_parsers):
  163. data = """index,A,B,C,D
  164. foo,2,3,4,5
  165. bar,7,8,9,10
  166. baz,12,13,14,15
  167. qux,12,13,14,15
  168. foo,12,13,14,15
  169. bar,12,13,14,15
  170. """
  171. parser = all_parsers
  172. result = parser.read_csv(StringIO(data), index_col=0)
  173. expected = DataFrame(
  174. [
  175. [2, 3, 4, 5],
  176. [7, 8, 9, 10],
  177. [12, 13, 14, 15],
  178. [12, 13, 14, 15],
  179. [12, 13, 14, 15],
  180. [12, 13, 14, 15],
  181. ],
  182. columns=["A", "B", "C", "D"],
  183. index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
  184. )
  185. tm.assert_frame_equal(result, expected)
  186. @xfail_pyarrow
  187. def test_read_duplicate_index_implicit(all_parsers):
  188. data = """A,B,C,D
  189. foo,2,3,4,5
  190. bar,7,8,9,10
  191. baz,12,13,14,15
  192. qux,12,13,14,15
  193. foo,12,13,14,15
  194. bar,12,13,14,15
  195. """
  196. parser = all_parsers
  197. result = parser.read_csv(StringIO(data))
  198. expected = DataFrame(
  199. [
  200. [2, 3, 4, 5],
  201. [7, 8, 9, 10],
  202. [12, 13, 14, 15],
  203. [12, 13, 14, 15],
  204. [12, 13, 14, 15],
  205. [12, 13, 14, 15],
  206. ],
  207. columns=["A", "B", "C", "D"],
  208. index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
  209. )
  210. tm.assert_frame_equal(result, expected)
  211. @xfail_pyarrow
  212. def test_read_csv_no_index_name(all_parsers, csv_dir_path):
  213. parser = all_parsers
  214. csv2 = os.path.join(csv_dir_path, "test2.csv")
  215. result = parser.read_csv(csv2, index_col=0, parse_dates=True)
  216. expected = DataFrame(
  217. [
  218. [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
  219. [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
  220. [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
  221. [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
  222. [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
  223. ],
  224. columns=["A", "B", "C", "D", "E"],
  225. index=Index(
  226. [
  227. datetime(2000, 1, 3),
  228. datetime(2000, 1, 4),
  229. datetime(2000, 1, 5),
  230. datetime(2000, 1, 6),
  231. datetime(2000, 1, 7),
  232. ]
  233. ),
  234. )
  235. tm.assert_frame_equal(result, expected)
  236. @xfail_pyarrow
  237. def test_empty_with_index(all_parsers):
  238. # see gh-10184
  239. data = "x,y"
  240. parser = all_parsers
  241. result = parser.read_csv(StringIO(data), index_col=0)
  242. expected = DataFrame(columns=["y"], index=Index([], name="x"))
  243. tm.assert_frame_equal(result, expected)
  244. @skip_pyarrow
  245. def test_empty_with_multi_index(all_parsers):
  246. # see gh-10467
  247. data = "x,y,z"
  248. parser = all_parsers
  249. result = parser.read_csv(StringIO(data), index_col=["x", "y"])
  250. expected = DataFrame(
  251. columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
  252. )
  253. tm.assert_frame_equal(result, expected)
  254. @skip_pyarrow
  255. def test_empty_with_reversed_multi_index(all_parsers):
  256. data = "x,y,z"
  257. parser = all_parsers
  258. result = parser.read_csv(StringIO(data), index_col=[1, 0])
  259. expected = DataFrame(
  260. columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
  261. )
  262. tm.assert_frame_equal(result, expected)