# test_converters.py (4.9 KB)
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
  5. from io import StringIO
  6. from dateutil.parser import parse
  7. import numpy as np
  8. import pytest
  9. import pandas as pd
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. )
  14. import pandas._testing as tm
  15. pytestmark = pytest.mark.usefixtures("pyarrow_skip")
  16. def test_converters_type_must_be_dict(all_parsers):
  17. parser = all_parsers
  18. data = """index,A,B,C,D
  19. foo,2,3,4,5
  20. """
  21. with pytest.raises(TypeError, match="Type converters.+"):
  22. parser.read_csv(StringIO(data), converters=0)
  23. @pytest.mark.parametrize("column", [3, "D"])
  24. @pytest.mark.parametrize(
  25. "converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer.
  26. )
  27. def test_converters(all_parsers, column, converter):
  28. parser = all_parsers
  29. data = """A,B,C,D
  30. a,1,2,01/01/2009
  31. b,3,4,01/02/2009
  32. c,4,5,01/03/2009
  33. """
  34. result = parser.read_csv(StringIO(data), converters={column: converter})
  35. expected = parser.read_csv(StringIO(data))
  36. expected["D"] = expected["D"].map(converter)
  37. tm.assert_frame_equal(result, expected)
  38. def test_converters_no_implicit_conv(all_parsers):
  39. # see gh-2184
  40. parser = all_parsers
  41. data = """000102,1.2,A\n001245,2,B"""
  42. converters = {0: lambda x: x.strip()}
  43. result = parser.read_csv(StringIO(data), header=None, converters=converters)
  44. # Column 0 should not be casted to numeric and should remain as object.
  45. expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
  46. tm.assert_frame_equal(result, expected)
  47. def test_converters_euro_decimal_format(all_parsers):
  48. # see gh-583
  49. converters = {}
  50. parser = all_parsers
  51. data = """Id;Number1;Number2;Text1;Text2;Number3
  52. 1;1521,1541;187101,9543;ABC;poi;4,7387
  53. 2;121,12;14897,76;DEF;uyt;0,3773
  54. 3;878,158;108013,434;GHI;rez;2,7356"""
  55. converters["Number1"] = converters["Number2"] = converters[
  56. "Number3"
  57. ] = lambda x: float(x.replace(",", "."))
  58. result = parser.read_csv(StringIO(data), sep=";", converters=converters)
  59. expected = DataFrame(
  60. [
  61. [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
  62. [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
  63. [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
  64. ],
  65. columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
  66. )
  67. tm.assert_frame_equal(result, expected)
  68. def test_converters_corner_with_nans(all_parsers):
  69. parser = all_parsers
  70. data = """id,score,days
  71. 1,2,12
  72. 2,2-5,
  73. 3,,14+
  74. 4,6-12,2"""
  75. # Example converters.
  76. def convert_days(x):
  77. x = x.strip()
  78. if not x:
  79. return np.nan
  80. is_plus = x.endswith("+")
  81. if is_plus:
  82. x = int(x[:-1]) + 1
  83. else:
  84. x = int(x)
  85. return x
  86. def convert_days_sentinel(x):
  87. x = x.strip()
  88. if not x:
  89. return np.nan
  90. is_plus = x.endswith("+")
  91. if is_plus:
  92. x = int(x[:-1]) + 1
  93. else:
  94. x = int(x)
  95. return x
  96. def convert_score(x):
  97. x = x.strip()
  98. if not x:
  99. return np.nan
  100. if x.find("-") > 0:
  101. val_min, val_max = map(int, x.split("-"))
  102. val = 0.5 * (val_min + val_max)
  103. else:
  104. val = float(x)
  105. return val
  106. results = []
  107. for day_converter in [convert_days, convert_days_sentinel]:
  108. result = parser.read_csv(
  109. StringIO(data),
  110. converters={"score": convert_score, "days": day_converter},
  111. na_values=["", None],
  112. )
  113. assert pd.isna(result["days"][1])
  114. results.append(result)
  115. tm.assert_frame_equal(results[0], results[1])
  116. @pytest.mark.parametrize("conv_f", [lambda x: x, str])
  117. def test_converter_index_col_bug(all_parsers, conv_f):
  118. # see gh-1835 , GH#40589
  119. parser = all_parsers
  120. data = "A;B\n1;2\n3;4"
  121. rs = parser.read_csv(
  122. StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
  123. )
  124. xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
  125. tm.assert_frame_equal(rs, xp)
  126. def test_converter_identity_object(all_parsers):
  127. # GH#40589
  128. parser = all_parsers
  129. data = "A,B\n1,2\n3,4"
  130. rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})
  131. xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
  132. tm.assert_frame_equal(rs, xp)
  133. def test_converter_multi_index(all_parsers):
  134. # GH 42446
  135. parser = all_parsers
  136. data = "A,B,B\nX,Y,Z\n1,2,3"
  137. result = parser.read_csv(
  138. StringIO(data),
  139. header=list(range(2)),
  140. converters={
  141. ("A", "X"): np.int32,
  142. ("B", "Y"): np.int32,
  143. ("B", "Z"): np.float32,
  144. },
  145. )
  146. expected = DataFrame(
  147. {
  148. ("A", "X"): np.int32([1]),
  149. ("B", "Y"): np.int32([2]),
  150. ("B", "Z"): np.float32([3]),
  151. }
  152. )
  153. tm.assert_frame_equal(result, expected)