12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956 |
- import decimal
- import numpy as np
- from numpy import iinfo
- import pytest
- import pandas as pd
- from pandas import (
- ArrowDtype,
- DataFrame,
- Index,
- Series,
- to_numeric,
- )
- import pandas._testing as tm
@pytest.fixture(params=[None, "ignore", "raise", "coerce"])
def errors(request):
    """Parametrized ``errors`` setting; tests using it omit the kwarg for ``None``."""
    chosen = request.param
    return chosen
@pytest.fixture(params=[True, False])
def signed(request):
    """Boolean flag; tests using it negate their value when it is true."""
    flag = request.param
    return flag
@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
def transform(request):
    """Callable applied to a value before conversion: identity or ``str``."""
    func = request.param
    return func
@pytest.fixture(params=[47393996303418497800, 100000000000000000000])
def large_val(request):
    """One of two very large Python integers used by the large-value tests."""
    value = request.param
    return value
@pytest.fixture(params=[True, False])
def multiple_elts(request):
    """Boolean flag controlling whether a test array gets an extra element."""
    flag = request.param
    return flag
@pytest.fixture(
    params=[
        (lambda x: Index(x, name="idx"), tm.assert_index_equal),
        (lambda x: Series(x, name="ser"), tm.assert_series_equal),
        (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal),
    ]
)
def transform_assert_equal(request):
    """Pair of (container constructor, matching equality assertion).

    Covers ``Index``, ``Series`` and plain NumPy array containers.
    """
    pair = request.param
    return pair
@pytest.mark.parametrize(
    "input_kwargs,result_kwargs",
    [
        ({}, {"dtype": np.int64}),
        ({"errors": "coerce", "downcast": "integer"}, {"dtype": np.int8}),
    ],
)
def test_empty(input_kwargs, result_kwargs):
    """An empty object Series converts to an empty Series of the expected dtype."""
    # see gh-16302
    empty_obj = Series([], dtype=object)
    tm.assert_series_equal(
        to_numeric(empty_obj, **input_kwargs),
        Series([], **result_kwargs),
    )
@pytest.mark.parametrize("last_val", ["7", 7])
def test_series(last_val):
    """A Series of numeric strings (optionally ending in an int) becomes float."""
    raw = Series(["1", "-3.14", last_val])
    converted = to_numeric(raw)
    tm.assert_series_equal(converted, Series([1, -3.14, 7]))
@pytest.mark.parametrize(
    "data",
    [
        [1, 3, 4, 5],
        [1.0, 3.0, 4.0, 5.0],
        # Bool is regarded as numeric.
        [True, False, True, True],
    ],
)
def test_series_numeric(data):
    """An already-numeric Series (with index and name) round-trips unchanged."""
    original = Series(data, index=list("ABCD"), name="EFG")
    tm.assert_series_equal(to_numeric(original), original)
@pytest.mark.parametrize(
    "data,msg",
    [
        ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'),
        (
            ["orange", 1, -3.14, "apple"],
            'Unable to parse string "orange" at position 0',
        ),
    ],
)
def test_error(data, msg):
    """With ``errors="raise"`` the first unparseable string is reported."""
    unparseable = Series(data)
    with pytest.raises(ValueError, match=msg):
        to_numeric(unparseable, errors="raise")
@pytest.mark.parametrize(
    "errors,exp_data",
    [
        ("ignore", [1, -3.14, "apple"]),
        ("coerce", [1, -3.14, np.nan]),
    ],
)
def test_ignore_error(errors, exp_data):
    """``ignore`` leaves the input as is; ``coerce`` turns bad entries into NaN."""
    raw = Series([1, -3.14, "apple"])
    converted = to_numeric(raw, errors=errors)
    tm.assert_series_equal(converted, Series(exp_data))
@pytest.mark.parametrize(
    "errors,exp",
    [
        ("raise", 'Unable to parse string "apple" at position 2'),
        ("ignore", [True, False, "apple"]),
        # Coerces to float.
        ("coerce", [1.0, 0.0, np.nan]),
    ],
)
def test_bool_handling(errors, exp):
    """Mixed bool/string input under each ``errors`` mode.

    A string ``exp`` is the expected error message; otherwise it is the
    expected data of the converted Series.
    """
    mixed = Series([True, False, "apple"])

    if isinstance(exp, str):
        with pytest.raises(ValueError, match=exp):
            to_numeric(mixed, errors=errors)
        return

    tm.assert_series_equal(to_numeric(mixed, errors=errors), Series(exp))
def test_list():
    """A plain list of numeric strings converts to a float NumPy array."""
    strings = ["1", "-3.14", "7"]
    tm.assert_numpy_array_equal(to_numeric(strings), np.array([1, -3.14, 7]))
@pytest.mark.parametrize(
    "data,arr_kwargs",
    [
        ([1, 3, 4, 5], {"dtype": np.int64}),
        ([1.0, 3.0, 4.0, 5.0], {}),
        # Boolean is regarded as numeric.
        ([True, False, True, True], {}),
    ],
)
def test_list_numeric(data, arr_kwargs):
    """A list that is already numeric converts to the equivalent NumPy array."""
    converted = to_numeric(data)
    tm.assert_numpy_array_equal(converted, np.array(data, **arr_kwargs))
@pytest.mark.parametrize("kwargs", [{"dtype": "O"}, {}])
def test_numeric(kwargs):
    """Numeric data stored with or without object dtype converts to the same Series."""
    values = [1, -3.14, 7]
    converted = to_numeric(Series(values, **kwargs))
    tm.assert_series_equal(converted, Series(values))
@pytest.mark.parametrize(
    "columns",
    [
        # One column.
        "a",
        # Multiple columns.
        ["a", "b"],
    ],
)
def test_numeric_df_columns(columns):
    """Applying ``to_numeric`` to one or several DataFrame columns yields floats."""
    # see gh-14827
    frame = DataFrame(
        {
            "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"],
            "b": [1.0, 2.0, 3.0, 4.0],
        }
    )
    expected = DataFrame(
        {
            "a": [1.2, 3.14, np.inf, 0.1],
            "b": [1.0, 2.0, 3.0, 4.0],
        }
    )

    # Work on a copy so the source frame is left untouched.
    converted = frame.copy()
    converted[columns] = converted[columns].apply(to_numeric)
    tm.assert_frame_equal(converted, expected)
@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1],
            [[3.14, 1.0], 1.6, 0.1],
        ),
        ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]),
    ],
)
def test_numeric_embedded_arr_likes(data, exp_data):
    """Element-wise ``to_numeric`` handles cells holding lists or arrays."""
    # Test to_numeric with embedded lists and arrays
    frame = DataFrame({"a": data})
    frame["a"] = frame["a"].apply(to_numeric)
    tm.assert_frame_equal(frame, DataFrame({"a": exp_data}))
def test_all_nan():
    """Coercing a Series of non-numeric strings yields an all-NaN Series."""
    letters = Series(["a", "b", "c"])
    coerced = to_numeric(letters, errors="coerce")
    tm.assert_series_equal(coerced, Series([np.nan, np.nan, np.nan]))
def test_type_check(errors):
    """Passing a DataFrame raises ``TypeError`` regardless of ``errors``."""
    # see gh-11776
    frame = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
    kwargs = {} if errors is None else {"errors": errors}
    with pytest.raises(TypeError, match="1-d array"):
        to_numeric(frame, **kwargs)
@pytest.mark.parametrize("val", [1, 1.1, 20001])
def test_scalar(val, signed, transform):
    """A scalar (raw or stringified, optionally negated) equals its float value."""
    if signed:
        val = -val
    converted = to_numeric(transform(val))
    assert converted == float(val)
- def test_really_large_scalar(large_val, signed, transform, errors):
- # see gh-24910
- kwargs = {"errors": errors} if errors is not None else {}
- val = -large_val if signed else large_val
- val = transform(val)
- val_is_string = isinstance(val, str)
- if val_is_string and errors in (None, "raise"):
- msg = "Integer out of range. at position 0"
- with pytest.raises(ValueError, match=msg):
- to_numeric(val, **kwargs)
- else:
- expected = float(val) if (errors == "coerce" and val_is_string) else val
- tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
- def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
- # see gh-24910
- kwargs = {"errors": errors} if errors is not None else {}
- val = -large_val if signed else large_val
- val = transform(val)
- extra_elt = "string"
- arr = [val] + multiple_elts * [extra_elt]
- val_is_string = isinstance(val, str)
- coercing = errors == "coerce"
- if errors in (None, "raise") and (val_is_string or multiple_elts):
- if val_is_string:
- msg = "Integer out of range. at position 0"
- else:
- msg = 'Unable to parse string "string" at position 1'
- with pytest.raises(ValueError, match=msg):
- to_numeric(arr, **kwargs)
- else:
- result = to_numeric(arr, **kwargs)
- exp_val = float(val) if (coercing and val_is_string) else val
- expected = [exp_val]
- if multiple_elts:
- if coercing:
- expected.append(np.nan)
- exp_dtype = float
- else:
- expected.append(extra_elt)
- exp_dtype = object
- else:
- exp_dtype = float if isinstance(exp_val, (int, float)) else object
- tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
- def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors):
- # see gh-24910
- #
- # Even if we discover that we have to hold float, does not mean
- # we should be lenient on subsequent elements that fail to be integer.
- kwargs = {"errors": errors} if errors is not None else {}
- arr = [str(-large_val if signed else large_val)]
- if multiple_elts:
- arr.insert(0, large_val)
- if errors in (None, "raise"):
- index = int(multiple_elts)
- msg = f"Integer out of range. at position {index}"
- with pytest.raises(ValueError, match=msg):
- to_numeric(arr, **kwargs)
- else:
- result = to_numeric(arr, **kwargs)
- if errors == "coerce":
- expected = [float(i) for i in arr]
- exp_dtype = float
- else:
- expected = arr
- exp_dtype = object
- tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
@pytest.mark.parametrize(
    "errors,checker",
    [
        ("raise", 'Unable to parse string "fail" at position 0'),
        ("ignore", lambda x: x == "fail"),
        ("coerce", lambda x: np.isnan(x)),
    ],
)
def test_scalar_fail(errors, checker):
    """An unparseable scalar string under each ``errors`` mode.

    A string ``checker`` is the expected error message; a callable one is
    a predicate the result must satisfy.
    """
    bad_scalar = "fail"

    if isinstance(checker, str):
        with pytest.raises(ValueError, match=checker):
            to_numeric(bad_scalar, errors=errors)
        return

    assert checker(to_numeric(bad_scalar, errors=errors))
@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]])
def test_numeric_dtypes(data, transform_assert_equal):
    """Numeric containers come back from ``to_numeric`` unchanged."""
    build, check_equal = transform_assert_equal
    container = build(data)
    check_equal(to_numeric(container), container)
@pytest.mark.parametrize(
    "data,exp",
    [
        (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")),
        (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])),
    ],
)
def test_str(data, exp, transform_assert_equal):
    """Numeric strings in each container type convert to int64 or float."""
    build, check_equal = transform_assert_equal
    converted = to_numeric(build(data))
    check_equal(converted, build(exp))
- def test_datetime_like(tz_naive_fixture, transform_assert_equal):
- transform, assert_equal = transform_assert_equal
- idx = pd.date_range("20130101", periods=3, tz=tz_naive_fixture)
- result = to_numeric(transform(idx))
- expected = transform(idx.asi8)
- assert_equal(result, expected)
- def test_timedelta(transform_assert_equal):
- transform, assert_equal = transform_assert_equal
- idx = pd.timedelta_range("1 days", periods=3, freq="D")
- result = to_numeric(transform(idx))
- expected = transform(idx.asi8)
- assert_equal(result, expected)
- def test_period(request, transform_assert_equal):
- transform, assert_equal = transform_assert_equal
- idx = pd.period_range("2011-01", periods=3, freq="M", name="")
- inp = transform(idx)
- if not isinstance(inp, Index):
- request.node.add_marker(
- pytest.mark.xfail(reason="Missing PeriodDtype support in to_numeric")
- )
- result = to_numeric(inp)
- expected = transform(idx.asi8)
- assert_equal(result, expected)
@pytest.mark.parametrize(
    "errors,expected",
    [
        ("raise", "Invalid object type at position 0"),
        ("ignore", Series([[10.0, 2], 1.0, "apple"])),
        ("coerce", Series([np.nan, 1.0, np.nan])),
    ],
)
def test_non_hashable(errors, expected):
    """A Series containing a list element under each ``errors`` mode.

    A string ``expected`` is the ``TypeError`` message; otherwise it is
    the expected converted Series.
    """
    # see gh-13324
    with_list = Series([[10.0, 2], 1.0, "apple"])

    if isinstance(expected, str):
        with pytest.raises(TypeError, match=expected):
            to_numeric(with_list, errors=errors)
        return

    tm.assert_series_equal(to_numeric(with_list, errors=errors), expected)
def test_downcast_invalid_cast():
    """An unrecognised ``downcast`` value raises ``ValueError``."""
    # see gh-13352
    values = ["1", 2, 3]
    bad_downcast = "unsigned-integer"

    with pytest.raises(ValueError, match="invalid downcasting method provided"):
        to_numeric(values, downcast=bad_downcast)
def test_errors_invalid_value():
    """An unrecognised ``errors`` value raises ``ValueError``."""
    # see gh-26466
    values = ["1", 2, 3]
    bad_errors = "invalid"

    with pytest.raises(ValueError, match="invalid error value specified"):
        to_numeric(values, errors=bad_errors)
@pytest.mark.parametrize(
    "data",
    [
        ["1", 2, 3],
        [1, 2, 3],
        np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
    ],
)
@pytest.mark.parametrize(
    "kwargs,exp_dtype",
    [
        # Basic function tests.
        ({}, np.int64),
        ({"downcast": None}, np.int64),
        # Support below np.float32 is rare and far between.
        ({"downcast": "float"}, np.dtype(np.float32).char),
        # Basic dtype support.
        ({"downcast": "unsigned"}, np.dtype(np.typecodes["UnsignedInteger"][0])),
    ],
)
def test_downcast_basic(data, kwargs, exp_dtype):
    """Each ``downcast`` option yields ``[1, 2, 3]`` in the expected dtype."""
    # see gh-13352
    converted = to_numeric(data, **kwargs)
    tm.assert_numpy_array_equal(converted, np.array([1, 2, 3], dtype=exp_dtype))
@pytest.mark.parametrize("signed_downcast", ["integer", "signed"])
@pytest.mark.parametrize(
    "data",
    [
        ["1", 2, 3],
        [1, 2, 3],
        np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
    ],
)
def test_signed_downcast(data, signed_downcast):
    """Signed downcasting picks the first dtype in NumPy's "Integer" typecodes."""
    # see gh-13352
    first_int_code = np.typecodes["Integer"][0]
    expected = np.array([1, 2, 3], dtype=np.dtype(first_int_code))

    converted = to_numeric(data, downcast=signed_downcast)
    tm.assert_numpy_array_equal(converted, expected)
def test_ignore_downcast_invalid_data():
    """``downcast`` is skipped when the data cannot be converted at all."""
    # If we can't successfully cast the given
    # data to a numeric dtype, do not bother
    # with the downcast parameter.
    values = ["foo", 2, 3]
    unchanged = np.array(values, dtype=object)

    converted = to_numeric(values, errors="ignore", downcast="unsigned")
    tm.assert_numpy_array_equal(converted, unchanged)
def test_ignore_downcast_neg_to_unsigned():
    """Unsigned downcasting is skipped when a negative value is present."""
    # Cannot cast to an unsigned integer
    # because we have a negative number.
    values = ["-1", 2, 3]
    still_int64 = np.array([-1, 2, 3], dtype=np.int64)

    converted = to_numeric(values, downcast="unsigned")
    tm.assert_numpy_array_equal(converted, still_int64)
# Warning in 32 bit platforms
@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
@pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
@pytest.mark.parametrize(
    "data,expected",
    [
        (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)),
        (
            [10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
            np.array(
                [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64
            ),
        ),
    ],
)
def test_ignore_downcast_cannot_convert_float(data, expected, downcast):
    """Integer downcasting is skipped when a non-integral float is present."""
    # Cannot cast to an integer (signed or unsigned)
    # because we have a float number.
    converted = to_numeric(data, downcast=downcast)
    tm.assert_numpy_array_equal(converted, expected)
@pytest.mark.parametrize(
    "downcast,expected_dtype",
    [
        ("integer", np.int16),
        ("signed", np.int16),
        ("unsigned", np.uint16),
    ],
)
def test_downcast_not8bit(downcast, expected_dtype):
    """Values above 255 downcast to a 16-bit dtype rather than 8-bit."""
    # the smallest integer dtype need not be np.(u)int8
    values = ["256", 257, 258]
    expected = np.array([256, 257, 258], dtype=expected_dtype)

    converted = to_numeric(values, downcast=downcast)
    tm.assert_numpy_array_equal(converted, expected)
@pytest.mark.parametrize(
    "dtype,downcast,min_max",
    [
        ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]),
        ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]),
        ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]),
        ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]),
        ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]),
        ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]),
        ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]),
        ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]),
        ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]),
        ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]),
        ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]),
        ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]),
        ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]),
        ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]),
        ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]),
        ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]),
        ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]),
    ],
)
def test_downcast_limits(dtype, downcast, min_max):
    """Downcasting a (min, max) pair lands on the expected integer dtype."""
    # see gh-14404: test the limits of each downcast.
    converted = to_numeric(Series(min_max), downcast=downcast)
    assert converted.dtype == dtype
def test_downcast_float64_to_float32():
    """Float downcasting keeps the input dtype for this float64 Series."""
    # GH-43693: Check float64 preservation when >= 16,777,217
    wide = Series([16777217.0, np.finfo(np.float64).max, np.nan], dtype=np.float64)
    converted = to_numeric(wide, downcast="float")
    assert wide.dtype == converted.dtype
@pytest.mark.parametrize(
    "ser,expected",
    [
        (
            Series([0, 9223372036854775808]),
            Series([0, 9223372036854775808], dtype=np.uint64),
        )
    ],
)
def test_downcast_uint64(ser, expected):
    """Unsigned downcasting of a Series holding 2**63 yields uint64."""
    # see gh-14422:
    # BUG: to_numeric doesn't work uint64 numbers
    converted = to_numeric(ser, downcast="unsigned")
    tm.assert_series_equal(converted, expected)
@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            [200, 300, "", "NaN", 30000000000000000000],
            [200, 300, np.nan, np.nan, 30000000000000000000],
        ),
        (
            ["12345678901234567890", "1234567890", "ITEM"],
            [12345678901234567890, 1234567890, np.nan],
        ),
    ],
)
def test_coerce_uint64_conflict(data, exp_data):
    """Coercion with very large values plus NaN produces a float Series."""
    # see gh-17007 and gh-17125
    #
    # Still returns float despite the uint64-nan conflict,
    # which would normally force the casting to object.
    coerced = to_numeric(Series(data), errors="coerce")
    tm.assert_series_equal(coerced, Series(exp_data, dtype=float))
@pytest.mark.parametrize(
    "errors,exp",
    [
        ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])),
        ("raise", "Unable to parse string"),
    ],
)
def test_non_coerce_uint64_conflict(errors, exp):
    """Same uint64/string data under ``ignore`` and ``raise``.

    A string ``exp`` is the expected error message; otherwise the result
    is compared with the untouched input Series.
    """
    # see gh-17007 and gh-17125
    #
    # For completeness.
    raw = Series(["12345678901234567890", "1234567890", "ITEM"])

    if isinstance(exp, str):
        with pytest.raises(ValueError, match=exp):
            to_numeric(raw, errors=errors)
        return

    tm.assert_series_equal(to_numeric(raw, errors=errors), raw)
@pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"])
@pytest.mark.parametrize("dc2", ["integer", "float", "unsigned"])
def test_downcast_empty(dc1, dc2):
    """Empty input gives equal results (ignoring dtype) for any two downcasts."""
    # GH32493
    first = to_numeric([], downcast=dc1)
    second = to_numeric([], downcast=dc2)
    tm.assert_numpy_array_equal(first, second, check_dtype=False)
def test_failure_to_convert_uint64_string_to_NaN():
    """The literal string ``"uint64"`` coerces to NaN, as a scalar and in a Series."""
    # GH 32394
    scalar_result = to_numeric("uint64", errors="coerce")
    assert np.isnan(scalar_result)

    expected = Series([32, 64, np.nan])
    series_result = to_numeric(Series(["32", "64", "uint64"]), errors="coerce")
    tm.assert_series_equal(series_result, expected)
@pytest.mark.parametrize(
    "strrep",
    [
        "243.164",
        "245.968",
        "249.585",
        "259.745",
        "265.742",
        "272.567",
        "279.196",
        "280.366",
        "275.034",
        "271.351",
        "272.889",
        "270.627",
        "280.828",
        "290.383",
        "308.153",
        "319.945",
        "336.0",
        "344.09",
        "351.385",
        "356.178",
        "359.82",
        "361.03",
        "367.701",
        "380.812",
        "387.98",
        "391.749",
        "391.171",
        "385.97",
        "385.345",
        "386.121",
        "390.996",
        "399.734",
        "413.073",
        "421.532",
        "430.221",
        "437.092",
        "439.746",
        "446.01",
        "451.191",
        "460.463",
        "469.779",
        "472.025",
        "479.49",
        "474.864",
        "467.54",
        "471.978",
    ],
)
def test_precision_float_conversion(strrep):
    """Parsing a decimal string matches Python's own ``float`` exactly."""
    # GH 31364
    parsed = to_numeric(strrep)
    assert parsed == float(strrep)
@pytest.mark.parametrize(
    "values, expected",
    [
        (["1", "2", None], Series([1, 2, np.nan], dtype="Int64")),
        (["1", "2", "3"], Series([1, 2, 3], dtype="Int64")),
        (["1", "2", 3], Series([1, 2, 3], dtype="Int64")),
        (["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")),
        (["1", None, 3.5], Series([1, np.nan, 3.5], dtype="Float64")),
        (["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")),
    ],
)
def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
    """Nullable string Series convert to ``Int64`` or ``Float64``."""
    # https://github.com/pandas-dev/pandas/issues/37262
    strings = Series(values, dtype=nullable_string_dtype)
    tm.assert_series_equal(to_numeric(strings), expected)
- def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype):
- # GH#52146
- values = ["a", "1"]
- ser = Series(values, dtype=nullable_string_dtype)
- result = to_numeric(ser, errors="coerce")
- expected = Series([pd.NA, 1], dtype="Int64")
- tm.assert_series_equal(result, expected)
- def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype):
- # GH#52146
- values = ["a", "1"]
- ser = Series(values, dtype=nullable_string_dtype)
- expected = ser.copy()
- result = to_numeric(ser, errors="ignore")
- tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "data, input_dtype, downcast, expected_dtype",
    (
        ([1, 1], "Int64", "integer", "Int8"),
        ([1.0, pd.NA], "Float64", "integer", "Int8"),
        ([1.0, 1.1], "Float64", "integer", "Float64"),
        ([1, pd.NA], "Int64", "integer", "Int8"),
        ([450, 300], "Int64", "integer", "Int16"),
        ([1, 1], "Float64", "integer", "Int8"),
        ([np.iinfo(np.int64).max - 1, 1], "Int64", "integer", "Int64"),
        ([1, 1], "Int64", "signed", "Int8"),
        ([1.0, 1.0], "Float32", "signed", "Int8"),
        ([1.0, 1.1], "Float64", "signed", "Float64"),
        ([1, pd.NA], "Int64", "signed", "Int8"),
        ([450, -300], "Int64", "signed", "Int16"),
        ([np.iinfo(np.uint64).max - 1, 1], "UInt64", "signed", "UInt64"),
        ([1, 1], "Int64", "unsigned", "UInt8"),
        ([1.0, 1.0], "Float32", "unsigned", "UInt8"),
        ([1.0, 1.1], "Float64", "unsigned", "Float64"),
        ([1, pd.NA], "Int64", "unsigned", "UInt8"),
        ([450, -300], "Int64", "unsigned", "Int64"),
        ([-1, -1], "Int32", "unsigned", "Int32"),
        ([1, 1], "Float64", "float", "Float32"),
        ([1, 1.1], "Float64", "float", "Float32"),
        ([1, 1], "Float32", "float", "Float32"),
        ([1, 1.1], "Float32", "float", "Float32"),
    ),
)
def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype):
    """Downcasting a nullable array produces the expected nullable dtype."""
    source = pd.array(data, dtype=input_dtype)
    converted = to_numeric(source, downcast=downcast)
    tm.assert_extension_array_equal(converted, pd.array(data, dtype=expected_dtype))
def test_downcast_nullable_mask_is_copied():
    """Mutating the input after downcasting must not alter the result."""
    # GH38974
    source = pd.array([1, 2, pd.NA], dtype="Int64")
    expected = pd.array([1, 2, pd.NA], dtype="Int8")

    converted = to_numeric(source, downcast="integer")
    tm.assert_extension_array_equal(converted, expected)

    source[1] = pd.NA  # should not modify result
    tm.assert_extension_array_equal(converted, expected)
def test_to_numeric_scientific_notation():
    """A scientific-notation string parses to the matching float64 value."""
    # GH 15898
    parsed = to_numeric("1.7e+308")
    assert parsed == np.float64(1.7e308)
@pytest.mark.parametrize("val", [9876543210.0, 2.0**128])
def test_to_numeric_large_float_not_downcast_to_float_32(val):
    """Float downcasting leaves these large values as they were."""
    # GH 19729
    original = Series([val])
    converted = to_numeric(original, downcast="float")
    tm.assert_series_equal(converted, original)
@pytest.mark.parametrize(
    "val, dtype",
    [
        (1, "Int64"),
        (1.5, "Float64"),
        (True, "boolean"),
    ],
)
def test_to_numeric_dtype_backend(val, dtype):
    """``dtype_backend="numpy_nullable"`` returns the matching nullable dtype."""
    # GH#50505
    as_object = Series([val], dtype=object)
    converted = to_numeric(as_object, dtype_backend="numpy_nullable")
    tm.assert_series_equal(converted, Series([val], dtype=dtype))
@pytest.mark.parametrize(
    "val, dtype",
    [
        (1, "Int64"),
        (1.5, "Float64"),
        (True, "boolean"),
        (1, "int64[pyarrow]"),
        (1.5, "float64[pyarrow]"),
        (True, "bool[pyarrow]"),
    ],
)
def test_to_numeric_dtype_backend_na(val, dtype):
    """A ``None`` entry becomes ``pd.NA`` in the requested nullable backend."""
    # GH#50505
    wants_arrow = "pyarrow" in dtype
    if wants_arrow:
        pytest.importorskip("pyarrow")
    dtype_backend = "pyarrow" if wants_arrow else "numpy_nullable"

    as_object = Series([val, None], dtype=object)
    converted = to_numeric(as_object, dtype_backend=dtype_backend)
    tm.assert_series_equal(converted, Series([val, pd.NA], dtype=dtype))
@pytest.mark.parametrize(
    "val, dtype, downcast",
    [
        (1, "Int8", "integer"),
        (1.5, "Float32", "float"),
        (1, "Int8", "signed"),
        (1, "int8[pyarrow]", "integer"),
        (1.5, "float[pyarrow]", "float"),
        (1, "int8[pyarrow]", "signed"),
    ],
)
def test_to_numeric_dtype_backend_downcasting(val, dtype, downcast):
    """``downcast`` combined with ``dtype_backend`` yields the smaller nullable dtype."""
    # GH#50505
    wants_arrow = "pyarrow" in dtype
    if wants_arrow:
        pytest.importorskip("pyarrow")
    dtype_backend = "pyarrow" if wants_arrow else "numpy_nullable"

    as_object = Series([val, None], dtype=object)
    converted = to_numeric(as_object, dtype_backend=dtype_backend, downcast=downcast)
    tm.assert_series_equal(converted, Series([val, pd.NA], dtype=dtype))
@pytest.mark.parametrize(
    "smaller, dtype_backend",
    [["UInt8", "numpy_nullable"], ["uint8[pyarrow]", "pyarrow"]],
)
def test_to_numeric_dtype_backend_downcasting_uint(smaller, dtype_backend):
    """Unsigned downcasting of ``UInt64`` data gives the 8-bit unsigned dtype."""
    # GH#50505
    if dtype_backend == "pyarrow":
        pytest.importorskip("pyarrow")

    wide = Series([1, pd.NA], dtype="UInt64")
    converted = to_numeric(wide, dtype_backend=dtype_backend, downcast="unsigned")
    tm.assert_series_equal(converted, Series([1, pd.NA], dtype=smaller))
@pytest.mark.parametrize(
    "dtype",
    [
        "Int64",
        "UInt64",
        "Float64",
        "boolean",
        "int64[pyarrow]",
        "uint64[pyarrow]",
        "float64[pyarrow]",
        "bool[pyarrow]",
    ],
)
def test_to_numeric_dtype_backend_already_nullable(dtype):
    """Input that already has a nullable dtype keeps that dtype."""
    # GH#50505
    if "pyarrow" in dtype:
        pytest.importorskip("pyarrow")

    nullable = Series([1, pd.NA], dtype=dtype)
    converted = to_numeric(nullable, dtype_backend="numpy_nullable")
    tm.assert_series_equal(converted, Series([1, pd.NA], dtype=dtype))
def test_to_numeric_dtype_backend_error(dtype_backend):
    """Unparseable strings with ``dtype_backend`` under raise, ignore and coerce."""
    # GH#50505
    strings = Series(["a", "b", ""])
    snapshot = strings.copy()

    # Default error handling raises.
    with pytest.raises(ValueError, match="Unable to parse string"):
        to_numeric(strings, dtype_backend=dtype_backend)

    # "ignore" hands the input back unchanged.
    ignored = to_numeric(strings, dtype_backend=dtype_backend, errors="ignore")
    tm.assert_series_equal(ignored, snapshot)

    # "coerce" yields an all-missing float Series in the chosen backend.
    coerced = to_numeric(strings, dtype_backend=dtype_backend, errors="coerce")
    dtype = "double[pyarrow]" if dtype_backend == "pyarrow" else "Float64"
    tm.assert_series_equal(coerced, Series([np.nan, np.nan, np.nan], dtype=dtype))
def test_invalid_dtype_backend():
    """An unsupported ``dtype_backend`` value raises ``ValueError``."""
    numbers = Series([1, 2, 3])
    expected_msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=expected_msg):
        to_numeric(numbers, dtype_backend="numpy")
def test_coerce_pyarrow_backend():
    """Coercing an Arrow string Series gives Arrow int64 with a missing value."""
    # GH 52588
    pa = pytest.importorskip("pyarrow")

    arrow_strings = Series(list("12x"), dtype=ArrowDtype(pa.string()))
    coerced = to_numeric(arrow_strings, errors="coerce", dtype_backend="pyarrow")

    expected = Series([1, 2, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(coerced, expected)
|