123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- """
- Tests the usecols functionality during parsing
- for all of the parsers defined in parsers.py
- """
- from io import StringIO
- import pytest
- from pandas import (
- DataFrame,
- Index,
- Timestamp,
- )
- import pandas._testing as tm
# Message text describing the accepted forms of the ``usecols`` argument.
# NOTE(review): not referenced by the tests visible in this module —
# presumably kept for parity with sibling usecols test modules; confirm.
_msg_validate_usecols_arg = (
    "'usecols' must either be list-like of all strings, "
    "all unicode, all integers or a callable."
)

# Message template for requested columns that are missing from the data;
# the ``{0}`` placeholder is meant to be filled in with ``str.format``.
_msg_validate_usecols_names = (
    "Usecols do not match columns, "
    "columns expected but not found: {0}"
)
# TODO(1.4): Change these to xfails whenever parse_dates support (which was
# intentionally disabled to keep PR sizes small) is added back
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
    """Check ``parse_dates=[[1, 2]]`` together with positional ``usecols``.

    A five-column CSV with a header row is read with each parametrized
    ``usecols`` ordering; the result must equal a frame holding a combined
    ``c_d`` timestamp column followed by column ``a``.
    """
    # see gh-9755
    csv_text = (
        "a,b,c,d,e\n"
        "0,1,2014-01-01,09:00,4\n"
        "0,1,2014-01-02,10:00,4"
    )

    combined_stamps = [
        Timestamp("2014-01-01 09:00:00"),
        Timestamp("2014-01-02 10:00:00"),
    ]
    expected = DataFrame(
        {"a": [0, 0], "c_d": combined_stamps},
        columns=["c_d", "a"],
    )

    result = all_parsers.read_csv(
        StringIO(csv_text),
        usecols=usecols,
        parse_dates=[[1, 2]],
    )
    tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates2(all_parsers):
    """Check ``parse_dates`` on a column that is also used as the index.

    The data has no header line, so ``names`` supplies the labels and
    ``usecols`` is a copy of those same labels. Column 0 is parsed as dates
    and set as the index; the result must equal a single ``values`` column
    indexed by a ``date`` index of timestamps.
    """
    # see gh-13604
    column_names = ["date", "values"]
    csv_text = (
        "2008-02-07 09:40,1032.43\n"
        "2008-02-07 09:50,1042.54\n"
        "2008-02-07 10:00,1051.65"
    )

    stamps = [
        Timestamp(text)
        for text in ("2008-02-07 09:40", "2008-02-07 09:50", "2008-02-07 10:00")
    ]
    expected = DataFrame(
        {"values": [1032.43, 1042.54, 1051.65]},
        index=Index(stamps, name="date"),
    )

    result = all_parsers.read_csv(
        StringIO(csv_text),
        parse_dates=[0],
        index_col=0,
        # Pass a separate list object for ``usecols``, as the labels are
        # also handed to ``names``.
        usecols=list(column_names),
        header=None,
        names=column_names,
    )
    tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates3(all_parsers):
    """Check ``parse_dates=[0]`` when ``usecols`` names every column.

    A one-row, ten-column CSV is read with all ten labels in ``usecols``;
    the result must have column ``a`` as a timestamp and the remaining
    columns as the integers from the data row, in the original order.
    """
    # see gh-14792
    labels = list("abcdefghij")
    csv_text = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"

    # Column "a" is the parsed date; "b".."j" carry the rest of the row.
    row_values = [1, 1, 2, 3, 4, 5, 6, 7, 8]
    expected_data = {"a": Timestamp("2016-09-21")}
    expected_data.update(
        {label: [value] for label, value in zip("bcdefghij", row_values)}
    )
    expected = DataFrame(expected_data, columns=labels)

    result = all_parsers.read_csv(
        StringIO(csv_text),
        usecols=labels,
        parse_dates=[0],
    )
    tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates4(all_parsers):
    """Check ``parse_dates=[[0, 1]]`` when ``usecols`` names every column.

    The expected frame starts with a combined ``a_b`` column holding the
    string ``"2016/09/21 1"``, followed by columns ``c`` through ``j`` with
    the integers 1 through 8.
    """
    csv_text = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    all_labels = list("abcdefghij")
    trailing_labels = list("cdefghij")

    expected_data = {"a_b": "2016/09/21 1"}
    # "c".."j" hold 1..8 respectively, one row each.
    expected_data.update(
        {label: [number] for number, label in enumerate(trailing_labels, start=1)}
    )
    expected = DataFrame(expected_data, columns=["a_b"] + trailing_labels)

    result = all_parsers.read_csv(
        StringIO(csv_text),
        usecols=all_labels,
        parse_dates=[[0, 1]],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
    "names",
    [
        list("abcde"),  # Names span all columns in original data.
        list("acd"),  # Names span only the selected columns.
    ],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
    """Check ``parse_dates=[[1, 2]]`` with ``usecols`` and explicit ``names``.

    The data has no header line; for every parametrized combination of
    ``usecols`` and ``names`` the result must equal a frame holding a
    combined ``c_d`` timestamp column followed by column ``a``.
    """
    # see gh-9755
    csv_text = (
        "0,1,2014-01-01,09:00,4\n"
        "0,1,2014-01-02,10:00,4"
    )

    combined_stamps = [
        Timestamp("2014-01-01 09:00:00"),
        Timestamp("2014-01-02 10:00:00"),
    ]
    expected = DataFrame(
        {"a": [0, 0], "c_d": combined_stamps},
        columns=["c_d", "a"],
    )

    result = all_parsers.read_csv(
        StringIO(csv_text),
        names=names,
        parse_dates=[[1, 2]],
        usecols=usecols,
    )
    tm.assert_frame_equal(result, expected)
|