test_index_col.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. """
  2. Tests that the specified index column (a.k.a "index_col")
  3. is properly handled or inferred during parsing for all of
  4. the parsers defined in parsers.py
  5. """
  6. from io import StringIO
  7. import numpy as np
  8. import pytest
  9. from pandas import (
  10. DataFrame,
  11. Index,
  12. MultiIndex,
  13. )
  14. import pandas._testing as tm
# TODO(1.4): Change me to xfails at release time
# Reusable marker: decorating a test with it requests the "pyarrow_skip"
# fixture for that test. The fixture is defined outside this file
# (presumably it skips the test for the pyarrow engine -- confirm in conftest).
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  17. @pytest.mark.parametrize("with_header", [True, False])
  18. def test_index_col_named(all_parsers, with_header):
  19. parser = all_parsers
  20. no_header = """\
  21. KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
  22. KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
  23. KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
  24. KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
  25. KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
  26. KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
  27. header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
  28. if with_header:
  29. data = header + no_header
  30. result = parser.read_csv(StringIO(data), index_col="ID")
  31. expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
  32. tm.assert_frame_equal(result, expected)
  33. else:
  34. data = no_header
  35. msg = "Index ID invalid"
  36. with pytest.raises(ValueError, match=msg):
  37. parser.read_csv(StringIO(data), index_col="ID")
  38. def test_index_col_named2(all_parsers):
  39. parser = all_parsers
  40. data = """\
  41. 1,2,3,4,hello
  42. 5,6,7,8,world
  43. 9,10,11,12,foo
  44. """
  45. expected = DataFrame(
  46. {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
  47. index=Index(["hello", "world", "foo"], name="message"),
  48. )
  49. names = ["a", "b", "c", "d", "message"]
  50. result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
  51. tm.assert_frame_equal(result, expected)
  52. def test_index_col_is_true(all_parsers):
  53. # see gh-9798
  54. data = "a,b\n1,2"
  55. parser = all_parsers
  56. msg = "The value of index_col couldn't be 'True'"
  57. with pytest.raises(ValueError, match=msg):
  58. parser.read_csv(StringIO(data), index_col=True)
  59. @skip_pyarrow
  60. def test_infer_index_col(all_parsers):
  61. data = """A,B,C
  62. foo,1,2,3
  63. bar,4,5,6
  64. baz,7,8,9
  65. """
  66. parser = all_parsers
  67. result = parser.read_csv(StringIO(data))
  68. expected = DataFrame(
  69. [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  70. index=["foo", "bar", "baz"],
  71. columns=["A", "B", "C"],
  72. )
  73. tm.assert_frame_equal(result, expected)
  74. @skip_pyarrow
  75. @pytest.mark.parametrize(
  76. "index_col,kwargs",
  77. [
  78. (None, {"columns": ["x", "y", "z"]}),
  79. (False, {"columns": ["x", "y", "z"]}),
  80. (0, {"columns": ["y", "z"], "index": Index([], name="x")}),
  81. (1, {"columns": ["x", "z"], "index": Index([], name="y")}),
  82. ("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
  83. ("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
  84. (
  85. [0, 1],
  86. {
  87. "columns": ["z"],
  88. "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
  89. },
  90. ),
  91. (
  92. ["x", "y"],
  93. {
  94. "columns": ["z"],
  95. "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
  96. },
  97. ),
  98. (
  99. [1, 0],
  100. {
  101. "columns": ["z"],
  102. "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
  103. },
  104. ),
  105. (
  106. ["y", "x"],
  107. {
  108. "columns": ["z"],
  109. "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
  110. },
  111. ),
  112. ],
  113. )
  114. def test_index_col_empty_data(all_parsers, index_col, kwargs):
  115. data = "x,y,z"
  116. parser = all_parsers
  117. result = parser.read_csv(StringIO(data), index_col=index_col)
  118. expected = DataFrame(**kwargs)
  119. tm.assert_frame_equal(result, expected)
  120. @skip_pyarrow
  121. def test_empty_with_index_col_false(all_parsers):
  122. # see gh-10413
  123. data = "x,y"
  124. parser = all_parsers
  125. result = parser.read_csv(StringIO(data), index_col=False)
  126. expected = DataFrame(columns=["x", "y"])
  127. tm.assert_frame_equal(result, expected)
  128. @skip_pyarrow
  129. @pytest.mark.parametrize(
  130. "index_names",
  131. [
  132. ["", ""],
  133. ["foo", ""],
  134. ["", "bar"],
  135. ["foo", "bar"],
  136. ["NotReallyUnnamed", "Unnamed: 0"],
  137. ],
  138. )
  139. def test_multi_index_naming(all_parsers, index_names):
  140. parser = all_parsers
  141. # We don't want empty index names being replaced with "Unnamed: 0"
  142. data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
  143. result = parser.read_csv(StringIO(data), index_col=[0, 1])
  144. expected = DataFrame(
  145. {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
  146. )
  147. expected.index.names = [name if name else None for name in index_names]
  148. tm.assert_frame_equal(result, expected)
  149. @skip_pyarrow
  150. def test_multi_index_naming_not_all_at_beginning(all_parsers):
  151. parser = all_parsers
  152. data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
  153. result = parser.read_csv(StringIO(data), index_col=[0, 2])
  154. expected = DataFrame(
  155. {"Unnamed: 2": ["c", "d", "c", "d"]},
  156. index=MultiIndex(
  157. levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
  158. ),
  159. )
  160. tm.assert_frame_equal(result, expected)
  161. @skip_pyarrow
  162. def test_no_multi_index_level_names_empty(all_parsers):
  163. # GH 10984
  164. parser = all_parsers
  165. midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
  166. expected = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"])
  167. with tm.ensure_clean() as path:
  168. expected.to_csv(path)
  169. result = parser.read_csv(path, index_col=[0, 1, 2])
  170. tm.assert_frame_equal(result, expected)
  171. @skip_pyarrow
  172. def test_header_with_index_col(all_parsers):
  173. # GH 33476
  174. parser = all_parsers
  175. data = """
  176. I11,A,A
  177. I12,B,B
  178. I2,1,3
  179. """
  180. midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
  181. idx = Index(["I2"])
  182. expected = DataFrame([[1, 3]], index=idx, columns=midx)
  183. result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
  184. tm.assert_frame_equal(result, expected)
  185. col_idx = Index(["A", "A.1"])
  186. idx = Index(["I12", "I2"], name="I11")
  187. expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)
  188. result = parser.read_csv(StringIO(data), index_col="I11", header=0)
  189. tm.assert_frame_equal(result, expected)
  190. @pytest.mark.slow
  191. def test_index_col_large_csv(all_parsers):
  192. # https://github.com/pandas-dev/pandas/issues/37094
  193. parser = all_parsers
  194. N = 1_000_001
  195. df = DataFrame({"a": range(N), "b": np.random.randn(N)})
  196. with tm.ensure_clean() as path:
  197. df.to_csv(path, index=False)
  198. result = parser.read_csv(path, index_col=[0])
  199. tm.assert_frame_equal(result, df.set_index("a"))
  200. @skip_pyarrow
  201. def test_index_col_multiindex_columns_no_data(all_parsers):
  202. # GH#38292
  203. parser = all_parsers
  204. result = parser.read_csv(
  205. StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0
  206. )
  207. expected = DataFrame(
  208. [],
  209. index=Index([]),
  210. columns=MultiIndex.from_arrays(
  211. [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
  212. ),
  213. )
  214. tm.assert_frame_equal(result, expected)
  215. @skip_pyarrow
  216. def test_index_col_header_no_data(all_parsers):
  217. # GH#38292
  218. parser = all_parsers
  219. result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0)
  220. expected = DataFrame(
  221. [],
  222. columns=["a1", "a2"],
  223. index=Index([], name="a0"),
  224. )
  225. tm.assert_frame_equal(result, expected)
  226. @skip_pyarrow
  227. def test_multiindex_columns_no_data(all_parsers):
  228. # GH#38292
  229. parser = all_parsers
  230. result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1])
  231. expected = DataFrame(
  232. [], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]])
  233. )
  234. tm.assert_frame_equal(result, expected)
  235. @skip_pyarrow
  236. def test_multiindex_columns_index_col_with_data(all_parsers):
  237. # GH#38292
  238. parser = all_parsers
  239. result = parser.read_csv(
  240. StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0
  241. )
  242. expected = DataFrame(
  243. [["data", "data"]],
  244. columns=MultiIndex.from_arrays(
  245. [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
  246. ),
  247. index=Index(["data"]),
  248. )
  249. tm.assert_frame_equal(result, expected)
  250. @skip_pyarrow
  251. def test_infer_types_boolean_sum(all_parsers):
  252. # GH#44079
  253. parser = all_parsers
  254. result = parser.read_csv(
  255. StringIO("0,1"),
  256. names=["a", "b"],
  257. index_col=["a"],
  258. dtype={"a": "UInt8"},
  259. )
  260. expected = DataFrame(
  261. data={
  262. "a": [
  263. 0,
  264. ],
  265. "b": [1],
  266. }
  267. ).set_index("a")
  268. # Not checking index type now, because the C parser will return a
  269. # index column of dtype 'object', and the Python parser will return a
  270. # index column of dtype 'int64'.
  271. tm.assert_frame_equal(result, expected, check_index_type=False)
  272. @skip_pyarrow
  273. @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
  274. def test_specify_dtype_for_index_col(all_parsers, dtype, val):
  275. # GH#9435
  276. data = "a,b\n01,2"
  277. parser = all_parsers
  278. result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
  279. expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
  280. tm.assert_frame_equal(result, expected)
  281. @skip_pyarrow
  282. def test_multiindex_columns_not_leading_index_col(all_parsers):
  283. # GH#38549
  284. parser = all_parsers
  285. data = """a,b,c,d
  286. e,f,g,h
  287. x,y,1,2
  288. """
  289. result = parser.read_csv(
  290. StringIO(data),
  291. header=[0, 1],
  292. index_col=1,
  293. )
  294. cols = MultiIndex.from_tuples(
  295. [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"]
  296. )
  297. expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
  298. tm.assert_frame_equal(result, expected)