test_parse_dates.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. """
  2. Tests the usecols functionality during parsing
  3. for all of the parsers defined in parsers.py
  4. """
  5. from io import StringIO
  6. import pytest
  7. from pandas import (
  8. DataFrame,
  9. Index,
  10. Timestamp,
  11. )
  12. import pandas._testing as tm
  13. _msg_validate_usecols_arg = (
  14. "'usecols' must either be list-like "
  15. "of all strings, all unicode, all "
  16. "integers or a callable."
  17. )
  18. _msg_validate_usecols_names = (
  19. "Usecols do not match columns, columns expected but not found: {0}"
  20. )
  21. # TODO(1.4): Change these to xfails whenever parse_dates support(which was
  22. # intentionally disable to keep small PR sizes) is added back
  23. pytestmark = pytest.mark.usefixtures("pyarrow_skip")
  24. @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
  25. def test_usecols_with_parse_dates(all_parsers, usecols):
  26. # see gh-9755
  27. data = """a,b,c,d,e
  28. 0,1,2014-01-01,09:00,4
  29. 0,1,2014-01-02,10:00,4"""
  30. parser = all_parsers
  31. parse_dates = [[1, 2]]
  32. cols = {
  33. "a": [0, 0],
  34. "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
  35. }
  36. expected = DataFrame(cols, columns=["c_d", "a"])
  37. result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
  38. tm.assert_frame_equal(result, expected)
  39. def test_usecols_with_parse_dates2(all_parsers):
  40. # see gh-13604
  41. parser = all_parsers
  42. data = """2008-02-07 09:40,1032.43
  43. 2008-02-07 09:50,1042.54
  44. 2008-02-07 10:00,1051.65"""
  45. names = ["date", "values"]
  46. usecols = names[:]
  47. parse_dates = [0]
  48. index = Index(
  49. [
  50. Timestamp("2008-02-07 09:40"),
  51. Timestamp("2008-02-07 09:50"),
  52. Timestamp("2008-02-07 10:00"),
  53. ],
  54. name="date",
  55. )
  56. cols = {"values": [1032.43, 1042.54, 1051.65]}
  57. expected = DataFrame(cols, index=index)
  58. result = parser.read_csv(
  59. StringIO(data),
  60. parse_dates=parse_dates,
  61. index_col=0,
  62. usecols=usecols,
  63. header=None,
  64. names=names,
  65. )
  66. tm.assert_frame_equal(result, expected)
  67. def test_usecols_with_parse_dates3(all_parsers):
  68. # see gh-14792
  69. parser = all_parsers
  70. data = """a,b,c,d,e,f,g,h,i,j
  71. 2016/09/21,1,1,2,3,4,5,6,7,8"""
  72. usecols = list("abcdefghij")
  73. parse_dates = [0]
  74. cols = {
  75. "a": Timestamp("2016-09-21"),
  76. "b": [1],
  77. "c": [1],
  78. "d": [2],
  79. "e": [3],
  80. "f": [4],
  81. "g": [5],
  82. "h": [6],
  83. "i": [7],
  84. "j": [8],
  85. }
  86. expected = DataFrame(cols, columns=usecols)
  87. result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
  88. tm.assert_frame_equal(result, expected)
  89. def test_usecols_with_parse_dates4(all_parsers):
  90. data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
  91. usecols = list("abcdefghij")
  92. parse_dates = [[0, 1]]
  93. parser = all_parsers
  94. cols = {
  95. "a_b": "2016/09/21 1",
  96. "c": [1],
  97. "d": [2],
  98. "e": [3],
  99. "f": [4],
  100. "g": [5],
  101. "h": [6],
  102. "i": [7],
  103. "j": [8],
  104. }
  105. expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
  106. result = parser.read_csv(
  107. StringIO(data),
  108. usecols=usecols,
  109. parse_dates=parse_dates,
  110. )
  111. tm.assert_frame_equal(result, expected)
  112. @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
  113. @pytest.mark.parametrize(
  114. "names",
  115. [
  116. list("abcde"), # Names span all columns in original data.
  117. list("acd"), # Names span only the selected columns.
  118. ],
  119. )
  120. def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
  121. # see gh-9755
  122. s = """0,1,2014-01-01,09:00,4
  123. 0,1,2014-01-02,10:00,4"""
  124. parse_dates = [[1, 2]]
  125. parser = all_parsers
  126. cols = {
  127. "a": [0, 0],
  128. "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
  129. }
  130. expected = DataFrame(cols, columns=["c_d", "a"])
  131. result = parser.read_csv(
  132. StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
  133. )
  134. tm.assert_frame_equal(result, expected)