test_to_numeric.py 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956
import decimal

import numpy as np
from numpy import iinfo
import pytest

import pandas as pd
from pandas import (
    ArrowDtype,
    DataFrame,
    Index,
    Series,
    to_numeric,
)
import pandas._testing as tm
  14. @pytest.fixture(params=[None, "ignore", "raise", "coerce"])
  15. def errors(request):
  16. return request.param
  17. @pytest.fixture(params=[True, False])
  18. def signed(request):
  19. return request.param
  20. @pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
  21. def transform(request):
  22. return request.param
  23. @pytest.fixture(params=[47393996303418497800, 100000000000000000000])
  24. def large_val(request):
  25. return request.param
  26. @pytest.fixture(params=[True, False])
  27. def multiple_elts(request):
  28. return request.param
  29. @pytest.fixture(
  30. params=[
  31. (lambda x: Index(x, name="idx"), tm.assert_index_equal),
  32. (lambda x: Series(x, name="ser"), tm.assert_series_equal),
  33. (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal),
  34. ]
  35. )
  36. def transform_assert_equal(request):
  37. return request.param
  38. @pytest.mark.parametrize(
  39. "input_kwargs,result_kwargs",
  40. [
  41. ({}, {"dtype": np.int64}),
  42. ({"errors": "coerce", "downcast": "integer"}, {"dtype": np.int8}),
  43. ],
  44. )
  45. def test_empty(input_kwargs, result_kwargs):
  46. # see gh-16302
  47. ser = Series([], dtype=object)
  48. result = to_numeric(ser, **input_kwargs)
  49. expected = Series([], **result_kwargs)
  50. tm.assert_series_equal(result, expected)
  51. @pytest.mark.parametrize("last_val", ["7", 7])
  52. def test_series(last_val):
  53. ser = Series(["1", "-3.14", last_val])
  54. result = to_numeric(ser)
  55. expected = Series([1, -3.14, 7])
  56. tm.assert_series_equal(result, expected)
  57. @pytest.mark.parametrize(
  58. "data",
  59. [
  60. [1, 3, 4, 5],
  61. [1.0, 3.0, 4.0, 5.0],
  62. # Bool is regarded as numeric.
  63. [True, False, True, True],
  64. ],
  65. )
  66. def test_series_numeric(data):
  67. ser = Series(data, index=list("ABCD"), name="EFG")
  68. result = to_numeric(ser)
  69. tm.assert_series_equal(result, ser)
  70. @pytest.mark.parametrize(
  71. "data,msg",
  72. [
  73. ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'),
  74. (
  75. ["orange", 1, -3.14, "apple"],
  76. 'Unable to parse string "orange" at position 0',
  77. ),
  78. ],
  79. )
  80. def test_error(data, msg):
  81. ser = Series(data)
  82. with pytest.raises(ValueError, match=msg):
  83. to_numeric(ser, errors="raise")
  84. @pytest.mark.parametrize(
  85. "errors,exp_data", [("ignore", [1, -3.14, "apple"]), ("coerce", [1, -3.14, np.nan])]
  86. )
  87. def test_ignore_error(errors, exp_data):
  88. ser = Series([1, -3.14, "apple"])
  89. result = to_numeric(ser, errors=errors)
  90. expected = Series(exp_data)
  91. tm.assert_series_equal(result, expected)
  92. @pytest.mark.parametrize(
  93. "errors,exp",
  94. [
  95. ("raise", 'Unable to parse string "apple" at position 2'),
  96. ("ignore", [True, False, "apple"]),
  97. # Coerces to float.
  98. ("coerce", [1.0, 0.0, np.nan]),
  99. ],
  100. )
  101. def test_bool_handling(errors, exp):
  102. ser = Series([True, False, "apple"])
  103. if isinstance(exp, str):
  104. with pytest.raises(ValueError, match=exp):
  105. to_numeric(ser, errors=errors)
  106. else:
  107. result = to_numeric(ser, errors=errors)
  108. expected = Series(exp)
  109. tm.assert_series_equal(result, expected)
  110. def test_list():
  111. ser = ["1", "-3.14", "7"]
  112. res = to_numeric(ser)
  113. expected = np.array([1, -3.14, 7])
  114. tm.assert_numpy_array_equal(res, expected)
  115. @pytest.mark.parametrize(
  116. "data,arr_kwargs",
  117. [
  118. ([1, 3, 4, 5], {"dtype": np.int64}),
  119. ([1.0, 3.0, 4.0, 5.0], {}),
  120. # Boolean is regarded as numeric.
  121. ([True, False, True, True], {}),
  122. ],
  123. )
  124. def test_list_numeric(data, arr_kwargs):
  125. result = to_numeric(data)
  126. expected = np.array(data, **arr_kwargs)
  127. tm.assert_numpy_array_equal(result, expected)
  128. @pytest.mark.parametrize("kwargs", [{"dtype": "O"}, {}])
  129. def test_numeric(kwargs):
  130. data = [1, -3.14, 7]
  131. ser = Series(data, **kwargs)
  132. result = to_numeric(ser)
  133. expected = Series(data)
  134. tm.assert_series_equal(result, expected)
  135. @pytest.mark.parametrize(
  136. "columns",
  137. [
  138. # One column.
  139. "a",
  140. # Multiple columns.
  141. ["a", "b"],
  142. ],
  143. )
  144. def test_numeric_df_columns(columns):
  145. # see gh-14827
  146. df = DataFrame(
  147. {
  148. "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"],
  149. "b": [1.0, 2.0, 3.0, 4.0],
  150. }
  151. )
  152. expected = DataFrame({"a": [1.2, 3.14, np.inf, 0.1], "b": [1.0, 2.0, 3.0, 4.0]})
  153. df_copy = df.copy()
  154. df_copy[columns] = df_copy[columns].apply(to_numeric)
  155. tm.assert_frame_equal(df_copy, expected)
  156. @pytest.mark.parametrize(
  157. "data,exp_data",
  158. [
  159. (
  160. [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1],
  161. [[3.14, 1.0], 1.6, 0.1],
  162. ),
  163. ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]),
  164. ],
  165. )
  166. def test_numeric_embedded_arr_likes(data, exp_data):
  167. # Test to_numeric with embedded lists and arrays
  168. df = DataFrame({"a": data})
  169. df["a"] = df["a"].apply(to_numeric)
  170. expected = DataFrame({"a": exp_data})
  171. tm.assert_frame_equal(df, expected)
  172. def test_all_nan():
  173. ser = Series(["a", "b", "c"])
  174. result = to_numeric(ser, errors="coerce")
  175. expected = Series([np.nan, np.nan, np.nan])
  176. tm.assert_series_equal(result, expected)
  177. def test_type_check(errors):
  178. # see gh-11776
  179. df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
  180. kwargs = {"errors": errors} if errors is not None else {}
  181. with pytest.raises(TypeError, match="1-d array"):
  182. to_numeric(df, **kwargs)
  183. @pytest.mark.parametrize("val", [1, 1.1, 20001])
  184. def test_scalar(val, signed, transform):
  185. val = -val if signed else val
  186. assert to_numeric(transform(val)) == float(val)
  187. def test_really_large_scalar(large_val, signed, transform, errors):
  188. # see gh-24910
  189. kwargs = {"errors": errors} if errors is not None else {}
  190. val = -large_val if signed else large_val
  191. val = transform(val)
  192. val_is_string = isinstance(val, str)
  193. if val_is_string and errors in (None, "raise"):
  194. msg = "Integer out of range. at position 0"
  195. with pytest.raises(ValueError, match=msg):
  196. to_numeric(val, **kwargs)
  197. else:
  198. expected = float(val) if (errors == "coerce" and val_is_string) else val
  199. tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
  200. def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
  201. # see gh-24910
  202. kwargs = {"errors": errors} if errors is not None else {}
  203. val = -large_val if signed else large_val
  204. val = transform(val)
  205. extra_elt = "string"
  206. arr = [val] + multiple_elts * [extra_elt]
  207. val_is_string = isinstance(val, str)
  208. coercing = errors == "coerce"
  209. if errors in (None, "raise") and (val_is_string or multiple_elts):
  210. if val_is_string:
  211. msg = "Integer out of range. at position 0"
  212. else:
  213. msg = 'Unable to parse string "string" at position 1'
  214. with pytest.raises(ValueError, match=msg):
  215. to_numeric(arr, **kwargs)
  216. else:
  217. result = to_numeric(arr, **kwargs)
  218. exp_val = float(val) if (coercing and val_is_string) else val
  219. expected = [exp_val]
  220. if multiple_elts:
  221. if coercing:
  222. expected.append(np.nan)
  223. exp_dtype = float
  224. else:
  225. expected.append(extra_elt)
  226. exp_dtype = object
  227. else:
  228. exp_dtype = float if isinstance(exp_val, (int, float)) else object
  229. tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
  230. def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors):
  231. # see gh-24910
  232. #
  233. # Even if we discover that we have to hold float, does not mean
  234. # we should be lenient on subsequent elements that fail to be integer.
  235. kwargs = {"errors": errors} if errors is not None else {}
  236. arr = [str(-large_val if signed else large_val)]
  237. if multiple_elts:
  238. arr.insert(0, large_val)
  239. if errors in (None, "raise"):
  240. index = int(multiple_elts)
  241. msg = f"Integer out of range. at position {index}"
  242. with pytest.raises(ValueError, match=msg):
  243. to_numeric(arr, **kwargs)
  244. else:
  245. result = to_numeric(arr, **kwargs)
  246. if errors == "coerce":
  247. expected = [float(i) for i in arr]
  248. exp_dtype = float
  249. else:
  250. expected = arr
  251. exp_dtype = object
  252. tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
  253. @pytest.mark.parametrize(
  254. "errors,checker",
  255. [
  256. ("raise", 'Unable to parse string "fail" at position 0'),
  257. ("ignore", lambda x: x == "fail"),
  258. ("coerce", lambda x: np.isnan(x)),
  259. ],
  260. )
  261. def test_scalar_fail(errors, checker):
  262. scalar = "fail"
  263. if isinstance(checker, str):
  264. with pytest.raises(ValueError, match=checker):
  265. to_numeric(scalar, errors=errors)
  266. else:
  267. assert checker(to_numeric(scalar, errors=errors))
  268. @pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]])
  269. def test_numeric_dtypes(data, transform_assert_equal):
  270. transform, assert_equal = transform_assert_equal
  271. data = transform(data)
  272. result = to_numeric(data)
  273. assert_equal(result, data)
  274. @pytest.mark.parametrize(
  275. "data,exp",
  276. [
  277. (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")),
  278. (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])),
  279. ],
  280. )
  281. def test_str(data, exp, transform_assert_equal):
  282. transform, assert_equal = transform_assert_equal
  283. result = to_numeric(transform(data))
  284. expected = transform(exp)
  285. assert_equal(result, expected)
  286. def test_datetime_like(tz_naive_fixture, transform_assert_equal):
  287. transform, assert_equal = transform_assert_equal
  288. idx = pd.date_range("20130101", periods=3, tz=tz_naive_fixture)
  289. result = to_numeric(transform(idx))
  290. expected = transform(idx.asi8)
  291. assert_equal(result, expected)
  292. def test_timedelta(transform_assert_equal):
  293. transform, assert_equal = transform_assert_equal
  294. idx = pd.timedelta_range("1 days", periods=3, freq="D")
  295. result = to_numeric(transform(idx))
  296. expected = transform(idx.asi8)
  297. assert_equal(result, expected)
  298. def test_period(request, transform_assert_equal):
  299. transform, assert_equal = transform_assert_equal
  300. idx = pd.period_range("2011-01", periods=3, freq="M", name="")
  301. inp = transform(idx)
  302. if not isinstance(inp, Index):
  303. request.node.add_marker(
  304. pytest.mark.xfail(reason="Missing PeriodDtype support in to_numeric")
  305. )
  306. result = to_numeric(inp)
  307. expected = transform(idx.asi8)
  308. assert_equal(result, expected)
  309. @pytest.mark.parametrize(
  310. "errors,expected",
  311. [
  312. ("raise", "Invalid object type at position 0"),
  313. ("ignore", Series([[10.0, 2], 1.0, "apple"])),
  314. ("coerce", Series([np.nan, 1.0, np.nan])),
  315. ],
  316. )
  317. def test_non_hashable(errors, expected):
  318. # see gh-13324
  319. ser = Series([[10.0, 2], 1.0, "apple"])
  320. if isinstance(expected, str):
  321. with pytest.raises(TypeError, match=expected):
  322. to_numeric(ser, errors=errors)
  323. else:
  324. result = to_numeric(ser, errors=errors)
  325. tm.assert_series_equal(result, expected)
  326. def test_downcast_invalid_cast():
  327. # see gh-13352
  328. data = ["1", 2, 3]
  329. invalid_downcast = "unsigned-integer"
  330. msg = "invalid downcasting method provided"
  331. with pytest.raises(ValueError, match=msg):
  332. to_numeric(data, downcast=invalid_downcast)
  333. def test_errors_invalid_value():
  334. # see gh-26466
  335. data = ["1", 2, 3]
  336. invalid_error_value = "invalid"
  337. msg = "invalid error value specified"
  338. with pytest.raises(ValueError, match=msg):
  339. to_numeric(data, errors=invalid_error_value)
  340. @pytest.mark.parametrize(
  341. "data",
  342. [
  343. ["1", 2, 3],
  344. [1, 2, 3],
  345. np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
  346. ],
  347. )
  348. @pytest.mark.parametrize(
  349. "kwargs,exp_dtype",
  350. [
  351. # Basic function tests.
  352. ({}, np.int64),
  353. ({"downcast": None}, np.int64),
  354. # Support below np.float32 is rare and far between.
  355. ({"downcast": "float"}, np.dtype(np.float32).char),
  356. # Basic dtype support.
  357. ({"downcast": "unsigned"}, np.dtype(np.typecodes["UnsignedInteger"][0])),
  358. ],
  359. )
  360. def test_downcast_basic(data, kwargs, exp_dtype):
  361. # see gh-13352
  362. result = to_numeric(data, **kwargs)
  363. expected = np.array([1, 2, 3], dtype=exp_dtype)
  364. tm.assert_numpy_array_equal(result, expected)
  365. @pytest.mark.parametrize("signed_downcast", ["integer", "signed"])
  366. @pytest.mark.parametrize(
  367. "data",
  368. [
  369. ["1", 2, 3],
  370. [1, 2, 3],
  371. np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
  372. ],
  373. )
  374. def test_signed_downcast(data, signed_downcast):
  375. # see gh-13352
  376. smallest_int_dtype = np.dtype(np.typecodes["Integer"][0])
  377. expected = np.array([1, 2, 3], dtype=smallest_int_dtype)
  378. res = to_numeric(data, downcast=signed_downcast)
  379. tm.assert_numpy_array_equal(res, expected)
  380. def test_ignore_downcast_invalid_data():
  381. # If we can't successfully cast the given
  382. # data to a numeric dtype, do not bother
  383. # with the downcast parameter.
  384. data = ["foo", 2, 3]
  385. expected = np.array(data, dtype=object)
  386. res = to_numeric(data, errors="ignore", downcast="unsigned")
  387. tm.assert_numpy_array_equal(res, expected)
  388. def test_ignore_downcast_neg_to_unsigned():
  389. # Cannot cast to an unsigned integer
  390. # because we have a negative number.
  391. data = ["-1", 2, 3]
  392. expected = np.array([-1, 2, 3], dtype=np.int64)
  393. res = to_numeric(data, downcast="unsigned")
  394. tm.assert_numpy_array_equal(res, expected)
  395. # Warning in 32 bit platforms
  396. @pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
  397. @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
  398. @pytest.mark.parametrize(
  399. "data,expected",
  400. [
  401. (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)),
  402. (
  403. [10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
  404. np.array(
  405. [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64
  406. ),
  407. ),
  408. ],
  409. )
  410. def test_ignore_downcast_cannot_convert_float(data, expected, downcast):
  411. # Cannot cast to an integer (signed or unsigned)
  412. # because we have a float number.
  413. res = to_numeric(data, downcast=downcast)
  414. tm.assert_numpy_array_equal(res, expected)
  415. @pytest.mark.parametrize(
  416. "downcast,expected_dtype",
  417. [("integer", np.int16), ("signed", np.int16), ("unsigned", np.uint16)],
  418. )
  419. def test_downcast_not8bit(downcast, expected_dtype):
  420. # the smallest integer dtype need not be np.(u)int8
  421. data = ["256", 257, 258]
  422. expected = np.array([256, 257, 258], dtype=expected_dtype)
  423. res = to_numeric(data, downcast=downcast)
  424. tm.assert_numpy_array_equal(res, expected)
  425. @pytest.mark.parametrize(
  426. "dtype,downcast,min_max",
  427. [
  428. ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]),
  429. ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]),
  430. ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]),
  431. ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]),
  432. ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]),
  433. ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]),
  434. ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]),
  435. ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]),
  436. ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]),
  437. ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]),
  438. ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]),
  439. ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]),
  440. ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]),
  441. ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]),
  442. ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]),
  443. ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]),
  444. ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]),
  445. ],
  446. )
  447. def test_downcast_limits(dtype, downcast, min_max):
  448. # see gh-14404: test the limits of each downcast.
  449. series = to_numeric(Series(min_max), downcast=downcast)
  450. assert series.dtype == dtype
  451. def test_downcast_float64_to_float32():
  452. # GH-43693: Check float64 preservation when >= 16,777,217
  453. series = Series([16777217.0, np.finfo(np.float64).max, np.nan], dtype=np.float64)
  454. result = to_numeric(series, downcast="float")
  455. assert series.dtype == result.dtype
  456. @pytest.mark.parametrize(
  457. "ser,expected",
  458. [
  459. (
  460. Series([0, 9223372036854775808]),
  461. Series([0, 9223372036854775808], dtype=np.uint64),
  462. )
  463. ],
  464. )
  465. def test_downcast_uint64(ser, expected):
  466. # see gh-14422:
  467. # BUG: to_numeric doesn't work uint64 numbers
  468. result = to_numeric(ser, downcast="unsigned")
  469. tm.assert_series_equal(result, expected)
  470. @pytest.mark.parametrize(
  471. "data,exp_data",
  472. [
  473. (
  474. [200, 300, "", "NaN", 30000000000000000000],
  475. [200, 300, np.nan, np.nan, 30000000000000000000],
  476. ),
  477. (
  478. ["12345678901234567890", "1234567890", "ITEM"],
  479. [12345678901234567890, 1234567890, np.nan],
  480. ),
  481. ],
  482. )
  483. def test_coerce_uint64_conflict(data, exp_data):
  484. # see gh-17007 and gh-17125
  485. #
  486. # Still returns float despite the uint64-nan conflict,
  487. # which would normally force the casting to object.
  488. result = to_numeric(Series(data), errors="coerce")
  489. expected = Series(exp_data, dtype=float)
  490. tm.assert_series_equal(result, expected)
  491. @pytest.mark.parametrize(
  492. "errors,exp",
  493. [
  494. ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])),
  495. ("raise", "Unable to parse string"),
  496. ],
  497. )
  498. def test_non_coerce_uint64_conflict(errors, exp):
  499. # see gh-17007 and gh-17125
  500. #
  501. # For completeness.
  502. ser = Series(["12345678901234567890", "1234567890", "ITEM"])
  503. if isinstance(exp, str):
  504. with pytest.raises(ValueError, match=exp):
  505. to_numeric(ser, errors=errors)
  506. else:
  507. result = to_numeric(ser, errors=errors)
  508. tm.assert_series_equal(result, ser)
  509. @pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"])
  510. @pytest.mark.parametrize("dc2", ["integer", "float", "unsigned"])
  511. def test_downcast_empty(dc1, dc2):
  512. # GH32493
  513. tm.assert_numpy_array_equal(
  514. to_numeric([], downcast=dc1),
  515. to_numeric([], downcast=dc2),
  516. check_dtype=False,
  517. )
  518. def test_failure_to_convert_uint64_string_to_NaN():
  519. # GH 32394
  520. result = to_numeric("uint64", errors="coerce")
  521. assert np.isnan(result)
  522. ser = Series([32, 64, np.nan])
  523. result = to_numeric(Series(["32", "64", "uint64"]), errors="coerce")
  524. tm.assert_series_equal(result, ser)
  525. @pytest.mark.parametrize(
  526. "strrep",
  527. [
  528. "243.164",
  529. "245.968",
  530. "249.585",
  531. "259.745",
  532. "265.742",
  533. "272.567",
  534. "279.196",
  535. "280.366",
  536. "275.034",
  537. "271.351",
  538. "272.889",
  539. "270.627",
  540. "280.828",
  541. "290.383",
  542. "308.153",
  543. "319.945",
  544. "336.0",
  545. "344.09",
  546. "351.385",
  547. "356.178",
  548. "359.82",
  549. "361.03",
  550. "367.701",
  551. "380.812",
  552. "387.98",
  553. "391.749",
  554. "391.171",
  555. "385.97",
  556. "385.345",
  557. "386.121",
  558. "390.996",
  559. "399.734",
  560. "413.073",
  561. "421.532",
  562. "430.221",
  563. "437.092",
  564. "439.746",
  565. "446.01",
  566. "451.191",
  567. "460.463",
  568. "469.779",
  569. "472.025",
  570. "479.49",
  571. "474.864",
  572. "467.54",
  573. "471.978",
  574. ],
  575. )
  576. def test_precision_float_conversion(strrep):
  577. # GH 31364
  578. result = to_numeric(strrep)
  579. assert result == float(strrep)
  580. @pytest.mark.parametrize(
  581. "values, expected",
  582. [
  583. (["1", "2", None], Series([1, 2, np.nan], dtype="Int64")),
  584. (["1", "2", "3"], Series([1, 2, 3], dtype="Int64")),
  585. (["1", "2", 3], Series([1, 2, 3], dtype="Int64")),
  586. (["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")),
  587. (["1", None, 3.5], Series([1, np.nan, 3.5], dtype="Float64")),
  588. (["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")),
  589. ],
  590. )
  591. def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
  592. # https://github.com/pandas-dev/pandas/issues/37262
  593. s = Series(values, dtype=nullable_string_dtype)
  594. result = to_numeric(s)
  595. tm.assert_series_equal(result, expected)
  596. def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype):
  597. # GH#52146
  598. values = ["a", "1"]
  599. ser = Series(values, dtype=nullable_string_dtype)
  600. result = to_numeric(ser, errors="coerce")
  601. expected = Series([pd.NA, 1], dtype="Int64")
  602. tm.assert_series_equal(result, expected)
  603. def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype):
  604. # GH#52146
  605. values = ["a", "1"]
  606. ser = Series(values, dtype=nullable_string_dtype)
  607. expected = ser.copy()
  608. result = to_numeric(ser, errors="ignore")
  609. tm.assert_series_equal(result, expected)
  610. @pytest.mark.parametrize(
  611. "data, input_dtype, downcast, expected_dtype",
  612. (
  613. ([1, 1], "Int64", "integer", "Int8"),
  614. ([1.0, pd.NA], "Float64", "integer", "Int8"),
  615. ([1.0, 1.1], "Float64", "integer", "Float64"),
  616. ([1, pd.NA], "Int64", "integer", "Int8"),
  617. ([450, 300], "Int64", "integer", "Int16"),
  618. ([1, 1], "Float64", "integer", "Int8"),
  619. ([np.iinfo(np.int64).max - 1, 1], "Int64", "integer", "Int64"),
  620. ([1, 1], "Int64", "signed", "Int8"),
  621. ([1.0, 1.0], "Float32", "signed", "Int8"),
  622. ([1.0, 1.1], "Float64", "signed", "Float64"),
  623. ([1, pd.NA], "Int64", "signed", "Int8"),
  624. ([450, -300], "Int64", "signed", "Int16"),
  625. ([np.iinfo(np.uint64).max - 1, 1], "UInt64", "signed", "UInt64"),
  626. ([1, 1], "Int64", "unsigned", "UInt8"),
  627. ([1.0, 1.0], "Float32", "unsigned", "UInt8"),
  628. ([1.0, 1.1], "Float64", "unsigned", "Float64"),
  629. ([1, pd.NA], "Int64", "unsigned", "UInt8"),
  630. ([450, -300], "Int64", "unsigned", "Int64"),
  631. ([-1, -1], "Int32", "unsigned", "Int32"),
  632. ([1, 1], "Float64", "float", "Float32"),
  633. ([1, 1.1], "Float64", "float", "Float32"),
  634. ([1, 1], "Float32", "float", "Float32"),
  635. ([1, 1.1], "Float32", "float", "Float32"),
  636. ),
  637. )
  638. def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype):
  639. arr = pd.array(data, dtype=input_dtype)
  640. result = to_numeric(arr, downcast=downcast)
  641. expected = pd.array(data, dtype=expected_dtype)
  642. tm.assert_extension_array_equal(result, expected)
  643. def test_downcast_nullable_mask_is_copied():
  644. # GH38974
  645. arr = pd.array([1, 2, pd.NA], dtype="Int64")
  646. result = to_numeric(arr, downcast="integer")
  647. expected = pd.array([1, 2, pd.NA], dtype="Int8")
  648. tm.assert_extension_array_equal(result, expected)
  649. arr[1] = pd.NA # should not modify result
  650. tm.assert_extension_array_equal(result, expected)
  651. def test_to_numeric_scientific_notation():
  652. # GH 15898
  653. result = to_numeric("1.7e+308")
  654. expected = np.float64(1.7e308)
  655. assert result == expected
  656. @pytest.mark.parametrize("val", [9876543210.0, 2.0**128])
  657. def test_to_numeric_large_float_not_downcast_to_float_32(val):
  658. # GH 19729
  659. expected = Series([val])
  660. result = to_numeric(expected, downcast="float")
  661. tm.assert_series_equal(result, expected)
  662. @pytest.mark.parametrize(
  663. "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")]
  664. )
  665. def test_to_numeric_dtype_backend(val, dtype):
  666. # GH#50505
  667. ser = Series([val], dtype=object)
  668. result = to_numeric(ser, dtype_backend="numpy_nullable")
  669. expected = Series([val], dtype=dtype)
  670. tm.assert_series_equal(result, expected)
  671. @pytest.mark.parametrize(
  672. "val, dtype",
  673. [
  674. (1, "Int64"),
  675. (1.5, "Float64"),
  676. (True, "boolean"),
  677. (1, "int64[pyarrow]"),
  678. (1.5, "float64[pyarrow]"),
  679. (True, "bool[pyarrow]"),
  680. ],
  681. )
  682. def test_to_numeric_dtype_backend_na(val, dtype):
  683. # GH#50505
  684. if "pyarrow" in dtype:
  685. pytest.importorskip("pyarrow")
  686. dtype_backend = "pyarrow"
  687. else:
  688. dtype_backend = "numpy_nullable"
  689. ser = Series([val, None], dtype=object)
  690. result = to_numeric(ser, dtype_backend=dtype_backend)
  691. expected = Series([val, pd.NA], dtype=dtype)
  692. tm.assert_series_equal(result, expected)
  693. @pytest.mark.parametrize(
  694. "val, dtype, downcast",
  695. [
  696. (1, "Int8", "integer"),
  697. (1.5, "Float32", "float"),
  698. (1, "Int8", "signed"),
  699. (1, "int8[pyarrow]", "integer"),
  700. (1.5, "float[pyarrow]", "float"),
  701. (1, "int8[pyarrow]", "signed"),
  702. ],
  703. )
  704. def test_to_numeric_dtype_backend_downcasting(val, dtype, downcast):
  705. # GH#50505
  706. if "pyarrow" in dtype:
  707. pytest.importorskip("pyarrow")
  708. dtype_backend = "pyarrow"
  709. else:
  710. dtype_backend = "numpy_nullable"
  711. ser = Series([val, None], dtype=object)
  712. result = to_numeric(ser, dtype_backend=dtype_backend, downcast=downcast)
  713. expected = Series([val, pd.NA], dtype=dtype)
  714. tm.assert_series_equal(result, expected)
  715. @pytest.mark.parametrize(
  716. "smaller, dtype_backend",
  717. [["UInt8", "numpy_nullable"], ["uint8[pyarrow]", "pyarrow"]],
  718. )
  719. def test_to_numeric_dtype_backend_downcasting_uint(smaller, dtype_backend):
  720. # GH#50505
  721. if dtype_backend == "pyarrow":
  722. pytest.importorskip("pyarrow")
  723. ser = Series([1, pd.NA], dtype="UInt64")
  724. result = to_numeric(ser, dtype_backend=dtype_backend, downcast="unsigned")
  725. expected = Series([1, pd.NA], dtype=smaller)
  726. tm.assert_series_equal(result, expected)
  727. @pytest.mark.parametrize(
  728. "dtype",
  729. [
  730. "Int64",
  731. "UInt64",
  732. "Float64",
  733. "boolean",
  734. "int64[pyarrow]",
  735. "uint64[pyarrow]",
  736. "float64[pyarrow]",
  737. "bool[pyarrow]",
  738. ],
  739. )
  740. def test_to_numeric_dtype_backend_already_nullable(dtype):
  741. # GH#50505
  742. if "pyarrow" in dtype:
  743. pytest.importorskip("pyarrow")
  744. ser = Series([1, pd.NA], dtype=dtype)
  745. result = to_numeric(ser, dtype_backend="numpy_nullable")
  746. expected = Series([1, pd.NA], dtype=dtype)
  747. tm.assert_series_equal(result, expected)
  748. def test_to_numeric_dtype_backend_error(dtype_backend):
  749. # GH#50505
  750. ser = Series(["a", "b", ""])
  751. expected = ser.copy()
  752. with pytest.raises(ValueError, match="Unable to parse string"):
  753. to_numeric(ser, dtype_backend=dtype_backend)
  754. result = to_numeric(ser, dtype_backend=dtype_backend, errors="ignore")
  755. tm.assert_series_equal(result, expected)
  756. result = to_numeric(ser, dtype_backend=dtype_backend, errors="coerce")
  757. if dtype_backend == "pyarrow":
  758. dtype = "double[pyarrow]"
  759. else:
  760. dtype = "Float64"
  761. expected = Series([np.nan, np.nan, np.nan], dtype=dtype)
  762. tm.assert_series_equal(result, expected)
  763. def test_invalid_dtype_backend():
  764. ser = Series([1, 2, 3])
  765. msg = (
  766. "dtype_backend numpy is invalid, only 'numpy_nullable' and "
  767. "'pyarrow' are allowed."
  768. )
  769. with pytest.raises(ValueError, match=msg):
  770. to_numeric(ser, dtype_backend="numpy")
  771. def test_coerce_pyarrow_backend():
  772. # GH 52588
  773. pa = pytest.importorskip("pyarrow")
  774. ser = Series(list("12x"), dtype=ArrowDtype(pa.string()))
  775. result = to_numeric(ser, errors="coerce", dtype_backend="pyarrow")
  776. expected = Series([1, 2, None], dtype=ArrowDtype(pa.int64()))
  777. tm.assert_series_equal(result, expected)