  1. """
  2. Tests that NA values are properly handled during
  3. parsing for all of the parsers defined in parsers.py
  4. """
  5. from io import StringIO
  6. import numpy as np
  7. import pytest
  8. from pandas._libs.parsers import STR_NA_VALUES
  9. from pandas import (
  10. DataFrame,
  11. Index,
  12. MultiIndex,
  13. )
  14. import pandas._testing as tm
  15. skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  16. xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
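

# A minimal usage sketch (added for illustration, not part of the original
# suite; the leading underscore keeps pytest from collecting it). It relies
# only on the public pandas.read_csv API and the module-level StringIO import:
# the default NA sentinels ("", "NA", "NaN", ...) become NaN, and ``na_values``
# adds extra sentinels on top of that default set.
def _na_values_usage_sketch():
    import pandas as pd

    csv = StringIO("A,B\n1,NA\n-999,3\n")
    # "-999" is treated as missing only because we list it in na_values.
    df = pd.read_csv(csv, na_values=["-999"])
    assert df["A"].isna().tolist() == [False, True]
    assert df["B"].isna().tolist() == [True, False]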


@skip_pyarrow
def test_string_nas(all_parsers):
    parser = all_parsers
    data = """A,B,C
a,b,c
d,,f
,g,h
"""
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_detect_string_na(all_parsers):
    parser = all_parsers
    data = """A,B
foo,bar
NA,baz
NaN,nan
"""
    expected = DataFrame(
        [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
    )
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "na_values",
    [
        ["-999.0", "-999"],
        [-999, -999.0],
        [-999.0, -999],
        ["-999.0"],
        ["-999"],
        [-999.0],
        [-999],
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        """A,B
-999,1.2
2,-999
3,4.5
""",
        """A,B
-999,1.200
2,-999.000
3,4.500
""",
    ],
)
def test_non_string_na_values(all_parsers, data, na_values):
    # see gh-3611: with an odd float format, we can't match
    # the string "999.0" exactly but still need float matching
    parser = all_parsers
    expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])

    result = parser.read_csv(StringIO(data), na_values=na_values)
    tm.assert_frame_equal(result, expected)
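

# Hedged illustration of the numeric matching exercised above (gh-3611): a
# float sentinel such as -999.0 is matched by value, so "-999.000" in the file
# is still recognized even though the strings differ. Standalone sketch, not a
# collected test.
def _numeric_na_sentinel_sketch():
    import pandas as pd

    df = pd.read_csv(StringIO("A\n-999.000\n1.5\n"), na_values=[-999.0])
    assert df["A"].isna().tolist() == [True, False]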


@skip_pyarrow
def test_default_na_values(all_parsers):
    _NA_VALUES = {
        "-1.#IND",
        "1.#QNAN",
        "1.#IND",
        "-1.#QNAN",
        "#N/A",
        "N/A",
        "n/a",
        "NA",
        "<NA>",
        "#NA",
        "NULL",
        "null",
        "NaN",
        "nan",
        "-NaN",
        "-nan",
        "#N/A N/A",
        "",
        "None",
    }
    assert _NA_VALUES == STR_NA_VALUES

    parser = all_parsers
    nv = len(_NA_VALUES)

    def f(i, v):
        if i == 0:
            buf = ""
        elif i > 0:
            buf = "".join([","] * i)

        buf = f"{buf}{v}"

        if i < nv - 1:
            joined = "".join([","] * (nv - i - 1))
            buf = f"{buf}{joined}"

        return buf

    data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))

    result = parser.read_csv(data, header=None)
    tm.assert_frame_equal(result, expected)
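

# Sketch (not a collected test) of how the default sentinel set above interacts
# with keep_default_na: with keep_default_na=False and no na_values given,
# nothing is treated as missing and the raw strings are kept.
def _keep_default_na_sketch():
    import pandas as pd

    raw = "A\nNA\nfoo\n"
    kept = pd.read_csv(StringIO(raw))
    dropped = pd.read_csv(StringIO(raw), keep_default_na=False)
    assert kept["A"].isna().tolist() == [True, False]
    assert dropped["A"].tolist() == ["NA", "foo"]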


@skip_pyarrow
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = DataFrame(
        [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
    )
    result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
    tm.assert_frame_equal(result, expected)


def test_bool_na_values(all_parsers):
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        {
            "A": np.array([True, np.nan, False], dtype=object),
            "B": np.array([False, True, np.nan], dtype=object),
            "C": [True, False, True],
        }
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_na_value_dict(all_parsers):
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    parser = all_parsers
    df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
    expected = DataFrame(
        {
            "A": [np.nan, "bar", np.nan, "bar"],
            "B": [np.nan, "foo", np.nan, "foo"],
            "C": [np.nan, "foo", np.nan, "foo"],
        }
    )
    tm.assert_frame_equal(df, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "index_col,expected",
    [
        (
            [0],
            DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
        ),
        (
            [0, 2],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
        (
            ["a", "c"],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
    ],
)
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
    data = """\
a,b,c,d
0,NA,1,5
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        (
            {},
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
        (
            {"na_values": {"A": [], "C": []}, "keep_default_na": False},
            DataFrame(
                {
                    "A": ["a", "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            {"na_values": ["a"], "keep_default_na": False},
            DataFrame(
                {
                    "A": [np.nan, "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            {"na_values": {"A": [], "C": []}},
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
    ],
)
def test_na_values_keep_default(all_parsers, kwargs, expected):
    data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_no_na_values_no_keep_default(all_parsers):
    # see gh-4318: passing na_values=None and
    # keep_default_na=False yields 'None' as a na_value
    data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), keep_default_na=False)

    expected = DataFrame(
        {
            "A": ["a", "b", "", "d", "e", "nan", "g"],
            "B": [1, 2, 3, 4, 5, 6, 7],
            "C": ["None", "two", "None", "nan", "five", "", "seven"],
        }
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_no_keep_default_na_dict_na_values(all_parsers):
    # see gh-19227
    data = "a,b\n,2"

    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
    )
    expected = DataFrame({"a": [""], "b": [np.nan]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    # see gh-19227
    #
    # Scalar values shouldn't cause the parsing to crash or fail.
    data = "a,b\n1,2"

    parser = all_parsers
    df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
    expected = DataFrame({"a": [1], "b": [np.nan]})
    tm.assert_frame_equal(df, expected)


@skip_pyarrow
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
    # see gh-19227
    data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
    parser = all_parsers
    expected = DataFrame(
        {
            0: [np.nan, 729639.0],
            1: [np.nan, "qwer"],
            2: ["/blaha", np.nan],
            3: ["kjsdkj", "asdfkj"],
            4: [412.166, 466.681],
            5: ["225.874", ""],
            6: [np.nan, 252.373],
        }
    )

    result = parser.read_csv(
        StringIO(data),
        header=None,
        keep_default_na=False,
        na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "na_filter,row_data",
    [
        (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
        (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
    ],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
    data = """\
A,B
1,A
nan,B
3,C
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)

    expected = DataFrame(row_data, columns=["A", "B"])
    tm.assert_frame_equal(result, expected)
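

# Sketch (not a collected test) of the na_filter switch parametrized above:
# na_filter=False disables NA detection entirely, which can speed up reading
# files known to contain no missing data, so even a default sentinel like "nan"
# comes through as a plain string.
def _na_filter_sketch():
    import pandas as pd

    df = pd.read_csv(StringIO("A\nnan\n1\n"), na_filter=False)
    assert df["A"].tolist() == ["nan", "1"]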


@skip_pyarrow
def test_na_trailing_columns(all_parsers):
    parser = all_parsers
    data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""

    # Trailing columns should be all NaN.
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [
            ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
            ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
        ],
        columns=[
            "Date",
            "Currency",
            "Symbol",
            "Type",
            "Units",
            "UnitPrice",
            "Cost",
            "Tax",
        ],
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "na_values,row_data",
    [
        (1, [[np.nan, 2.0], [2.0, np.nan]]),
        ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
    ],
)
def test_na_values_scalar(all_parsers, na_values, row_data):
    # see gh-12224
    parser = all_parsers
    names = ["a", "b"]
    data = "1,2\n2,1"

    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
    expected = DataFrame(row_data, columns=names)
    tm.assert_frame_equal(result, expected)
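

# Sketch (not a collected test) of the scalar and dict forms exercised above
# (gh-12224): a scalar na_values applies to every column, while a dict restricts
# each sentinel to the column it is keyed on.
def _scalar_vs_dict_na_values_sketch():
    import pandas as pd

    raw = "a,b\n1,2\n2,1"
    whole_frame = pd.read_csv(StringIO(raw), na_values=1)
    per_column = pd.read_csv(StringIO(raw), na_values={"a": 2})
    assert whole_frame.isna().to_numpy().tolist() == [[True, False], [False, True]]
    assert per_column.isna().to_numpy().tolist() == [[False, False], [True, False]]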


@skip_pyarrow
def test_na_values_dict_aliasing(all_parsers):
    parser = all_parsers
    na_values = {"a": 2, "b": 1}
    na_values_copy = na_values.copy()

    names = ["a", "b"]
    data = "1,2\n2,1"

    expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)

    tm.assert_frame_equal(result, expected)
    tm.assert_dict_equal(na_values, na_values_copy)


@skip_pyarrow
def test_na_values_dict_col_index(all_parsers):
    # see gh-14203
    data = "a\nfoo\n1"
    parser = all_parsers
    na_values = {0: "foo"}

    result = parser.read_csv(StringIO(data), na_values=na_values)
    expected = DataFrame({"a": [np.nan, 1]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            str(2**63) + "\n" + str(2**63 + 1),
            {"na_values": [2**63]},
            DataFrame([str(2**63), str(2**63 + 1)]),
        ),
        (str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
        (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
    ],
)
def test_na_values_uint64(all_parsers, data, kwargs, expected):
    # see gh-14983
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_empty_na_values_no_default_with_index(all_parsers):
    # see gh-15835
    data = "a,1\nb,2"

    parser = all_parsers
    expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))

    result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
)
def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
    # see gh-5239
    #
    # Don't parse NA-values in index unless na_filter=True
    parser = all_parsers
    data = "a,b,c\n1,,3\n4,5,6"

    expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
    result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


def test_inf_na_values_with_int_index(all_parsers):
    # see gh-17128
    parser = all_parsers
    data = "idx,col1,col2\n1,3,4\n2,inf,-inf"

    # Don't fail with OverflowError with inf's and integer index column.
    out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
    expected = DataFrame(
        {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
    )
    tm.assert_frame_equal(out, expected)


@skip_pyarrow
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
    # see gh-20377
    parser = all_parsers
    data = "a,b,c\n1,,3\n4,5,6"

    # na_filter=True --> missing value becomes NaN.
    # na_filter=False --> missing value remains empty string.
    empty = np.nan if na_filter else ""
    expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})

    result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
    tm.assert_frame_equal(result, expected)
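

# Sketch (not a collected test) of the dtype=str interaction above (gh-20377):
# even when every column is read as str, the default NA detection still turns
# empty fields into NaN unless na_filter is switched off.
def _dtype_str_na_filter_sketch():
    import pandas as pd

    raw = "a,b\n1,\n4,5"
    filtered = pd.read_csv(StringIO(raw), dtype=str)
    unfiltered = pd.read_csv(StringIO(raw), dtype=str, na_filter=False)
    assert filtered["b"].isna().tolist() == [True, False]
    assert unfiltered["b"].tolist() == ["", "5"]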


@skip_pyarrow
@pytest.mark.parametrize(
    "data, na_values",
    [
        ("false,1\n,1\ntrue", None),
        ("false,1\nnull,1\ntrue", None),
        ("false,1\nnan,1\ntrue", None),
        ("false,1\nfoo,1\ntrue", "foo"),
        ("false,1\nfoo,1\ntrue", ["foo"]),
        ("false,1\nfoo,1\ntrue", {"a": "foo"}),
    ],
)
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    parser = all_parsers
    msg = (
        "(Bool column has NA values in column [0a])|"
        "(cannot safely convert passed user dtype of "
        "bool for object dtyped data in column 0)"
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            StringIO(data),
            header=None,
            names=["a", "b"],
            dtype={"a": "bool"},
            na_values=na_values,
        )


@skip_pyarrow
def test_str_nan_dropped(all_parsers):
    # see gh-21131
    parser = all_parsers

    data = """File: small.csv,,
10010010233,0123,654
foo,,bar
01001000155,4530,898"""

    result = parser.read_csv(
        StringIO(data),
        header=None,
        names=["col1", "col2", "col3"],
        dtype={"col1": str, "col2": str, "col3": str},
    ).dropna()

    expected = DataFrame(
        {
            "col1": ["10010010233", "01001000155"],
            "col2": ["0123", "4530"],
            "col3": ["654", "898"],
        },
        index=[1, 3],
    )

    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_nan_multi_index(all_parsers):
    # GH 42446
    parser = all_parsers
    data = "A,B,B\nX,Y,Z\n1,2,inf"

    result = parser.read_csv(
        StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
    )

    expected = DataFrame(
        {
            ("A", "X"): [1],
            ("B", "Y"): [2],
            ("B", "Z"): [np.nan],
        }
    )

    tm.assert_frame_equal(result, expected)
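

# Sketch (not a collected test) of the tuple-keyed form above (GH 42446): with a
# two-row header, the na_values dict can target a single MultiIndex column via
# its (level_0, level_1) key.
def _multiindex_na_values_sketch():
    import pandas as pd

    raw = "A,B,B\nX,Y,Z\n1,2,inf"
    df = pd.read_csv(StringIO(raw), header=[0, 1], na_values={("B", "Z"): "inf"})
    assert df[("B", "Z")].isna().tolist() == [True]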


@xfail_pyarrow
def test_bool_and_nan_to_bool(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    with pytest.raises(ValueError, match="NA values"):
        parser.read_csv(StringIO(data), dtype="bool")


def test_bool_and_nan_to_int(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    with pytest.raises(ValueError, match="convert|NoneType"):
        parser.read_csv(StringIO(data), dtype="int")


def test_bool_and_nan_to_float(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    result = parser.read_csv(StringIO(data), dtype="float")
    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
    tm.assert_frame_equal(result, expected)