123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310 |
- from __future__ import annotations
- from typing import Literal
- import numpy as np
- from pandas._libs import lib
- from pandas._typing import (
- DateTimeErrorChoices,
- DtypeBackend,
- npt,
- )
- from pandas.util._validators import check_dtype_backend
- from pandas.core.dtypes.cast import maybe_downcast_numeric
- from pandas.core.dtypes.common import (
- ensure_object,
- is_bool_dtype,
- is_datetime_or_timedelta_dtype,
- is_decimal,
- is_integer_dtype,
- is_number,
- is_numeric_dtype,
- is_scalar,
- is_string_dtype,
- needs_i8_conversion,
- )
- from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
- )
- import pandas as pd
- from pandas.core.arrays import BaseMaskedArray
- from pandas.core.arrays.string_ import StringDtype
def to_numeric(
    arg,
    errors: DateTimeErrorChoices = "raise",
    downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.
    downcast : str, default None
        Can be 'integer', 'signed', 'unsigned', or 'float'.
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.
    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input. Series if Series, otherwise ndarray.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    Downcasting of nullable integer and floating dtypes is supported:

    >>> s = pd.Series([1, 2, 3], dtype="Int64")
    >>> pd.to_numeric(s, downcast="integer")
    0    1
    1    2
    2    3
    dtype: Int8
    >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
    >>> pd.to_numeric(s, downcast="float")
    0    1.0
    1    2.1
    2    3.0
    dtype: Float32
    """
    # Validate user-facing options up front so bad arguments fail fast,
    # independent of the input data.
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")

    check_dtype_backend(dtype_backend)

    # Remember what kind of container came in so the result can be wrapped
    # back into the same kind (Series / Index / scalar / raw array) on return.
    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndex):
        is_index = True
        if needs_i8_conversion(arg.dtype):
            # datetime64/timedelta64-like Index: work on the raw int64 view
            values = arg.view("i8")
        else:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype="O")
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            # already numeric: nothing to convert
            return arg
        is_scalars = True
        # wrap the lone scalar so the array path below can handle it;
        # unwrapped again at the end via values[0]
        values = np.array([arg], dtype="O")
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError("arg must be a list, tuple, 1-d array, or Series")
    else:
        values = arg

    # Kept so errors='ignore'/'coerce' can fall back to the untouched input.
    orig_values = values

    # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
    # save mask to reconstruct the full array after casting
    mask: npt.NDArray[np.bool_] | None = None
    if isinstance(values, BaseMaskedArray):
        mask = values._mask
        values = values._data[~mask]

    values_dtype = getattr(values, "dtype", None)
    if isinstance(values_dtype, pd.ArrowDtype):
        # Arrow-backed input: likewise operate only on the non-null values,
        # recording the null positions in `mask` for reconstruction below.
        mask = values.isna()
        values = values.dropna().to_numpy()
    new_mask: np.ndarray | None = None
    if is_numeric_dtype(values_dtype):
        # already numeric: skip string parsing entirely
        pass
    elif is_datetime_or_timedelta_dtype(values_dtype):
        # treat datetimes/timedeltas as their underlying int64 representation
        values = values.view(np.int64)
    else:
        values = ensure_object(values)
        coerce_numeric = errors not in ("ignore", "raise")
        try:
            # `new_mask` is only returned when a masked-nullable result was
            # requested (dtype_backend set, or string dtype input).
            values, new_mask = lib.maybe_convert_numeric(  # type: ignore[call-overload]  # noqa
                values,
                set(),
                coerce_numeric=coerce_numeric,
                convert_to_masked_nullable=dtype_backend is not lib.no_default
                or isinstance(values_dtype, StringDtype),
            )
        except (ValueError, TypeError):
            if errors == "raise":
                raise
            # errors == 'ignore': hand back the input unchanged
            values = orig_values

    if new_mask is not None:
        # Remove unnecessary values, is expected later anyway and enables
        # downcasting
        values = values[~new_mask]
    elif (
        dtype_backend is not lib.no_default
        and new_mask is None
        or isinstance(values_dtype, StringDtype)
    ):
        # caller asked for a nullable backend but conversion produced no
        # mask: synthesize an all-valid mask so reconstruction still runs
        new_mask = np.zeros(values.shape, dtype=np.bool_)

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values.dtype):
        typecodes: str | None = None

        if downcast in ("integer", "signed"):
            typecodes = np.typecodes["Integer"]
        elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
            # unsigned downcast only makes sense for non-negative data
            typecodes = np.typecodes["UnsignedInteger"]
        elif downcast == "float":
            typecodes = np.typecodes["Float"]

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for typecode in typecodes:
                dtype = np.dtype(typecode)
                if dtype.itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_numeric(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
    # masked array
    if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
        if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
            # GH 52588
            mask = new_mask
        else:
            # defensive copy: don't mutate the input array's mask
            mask = mask.copy()
        assert isinstance(mask, np.ndarray)
        # scatter the converted non-null values back into a full-length array
        data = np.zeros(mask.shape, dtype=values.dtype)
        data[~mask] = values

        # local import mirrors the original; avoids a module-level cycle
        # with pandas.core.arrays — TODO confirm
        from pandas.core.arrays import (
            ArrowExtensionArray,
            BooleanArray,
            FloatingArray,
            IntegerArray,
        )

        klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray]
        if is_integer_dtype(data.dtype):
            klass = IntegerArray
        elif is_bool_dtype(data.dtype):
            klass = BooleanArray
        else:
            klass = FloatingArray
        values = klass(data, mask)
        if dtype_backend == "pyarrow" or isinstance(values_dtype, pd.ArrowDtype):
            # round-trip through Arrow when a pyarrow backend was requested
            # or the input was Arrow-backed
            values = ArrowExtensionArray(values.__arrow_array__())

    # Re-wrap the result in the same kind of container the caller passed in.
    if is_series:
        return arg._constructor(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
|