numeric.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. from __future__ import annotations
  2. from typing import Literal
  3. import numpy as np
  4. from pandas._libs import lib
  5. from pandas._typing import (
  6. DateTimeErrorChoices,
  7. DtypeBackend,
  8. npt,
  9. )
  10. from pandas.util._validators import check_dtype_backend
  11. from pandas.core.dtypes.cast import maybe_downcast_numeric
  12. from pandas.core.dtypes.common import (
  13. ensure_object,
  14. is_bool_dtype,
  15. is_datetime_or_timedelta_dtype,
  16. is_decimal,
  17. is_integer_dtype,
  18. is_number,
  19. is_numeric_dtype,
  20. is_scalar,
  21. is_string_dtype,
  22. needs_i8_conversion,
  23. )
  24. from pandas.core.dtypes.generic import (
  25. ABCIndex,
  26. ABCSeries,
  27. )
  28. import pandas as pd
  29. from pandas.core.arrays import BaseMaskedArray
  30. from pandas.core.arrays.string_ import StringDtype
  31. def to_numeric(
  32. arg,
  33. errors: DateTimeErrorChoices = "raise",
  34. downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
  35. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  36. ):
  37. """
  38. Convert argument to a numeric type.
  39. The default return dtype is `float64` or `int64`
  40. depending on the data supplied. Use the `downcast` parameter
  41. to obtain other dtypes.
  42. Please note that precision loss may occur if really large numbers
  43. are passed in. Due to the internal limitations of `ndarray`, if
  44. numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
  45. or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
  46. passed in, it is very likely they will be converted to float so that
  47. they can be stored in an `ndarray`. These warnings apply similarly to
  48. `Series` since it internally leverages `ndarray`.
  49. Parameters
  50. ----------
  51. arg : scalar, list, tuple, 1-d array, or Series
  52. Argument to be converted.
  53. errors : {'ignore', 'raise', 'coerce'}, default 'raise'
  54. - If 'raise', then invalid parsing will raise an exception.
  55. - If 'coerce', then invalid parsing will be set as NaN.
  56. - If 'ignore', then invalid parsing will return the input.
  57. downcast : str, default None
  58. Can be 'integer', 'signed', 'unsigned', or 'float'.
  59. If not None, and if the data has been successfully cast to a
  60. numerical dtype (or if the data was numeric to begin with),
  61. downcast that resulting data to the smallest numerical dtype
  62. possible according to the following rules:
  63. - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
  64. - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
  65. - 'float': smallest float dtype (min.: np.float32)
  66. As this behaviour is separate from the core conversion to
  67. numeric values, any errors raised during the downcasting
  68. will be surfaced regardless of the value of the 'errors' input.
  69. In addition, downcasting will only occur if the size
  70. of the resulting data's dtype is strictly larger than
  71. the dtype it is to be cast to, so if none of the dtypes
  72. checked satisfy that specification, no downcasting will be
  73. performed on the data.
  74. dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
  75. Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
  76. arrays, nullable dtypes are used for all dtypes that have a nullable
  77. implementation when "numpy_nullable" is set, pyarrow is used for all
  78. dtypes if "pyarrow" is set.
  79. The dtype_backends are still experimential.
  80. .. versionadded:: 2.0
  81. Returns
  82. -------
  83. ret
  84. Numeric if parsing succeeded.
  85. Return type depends on input. Series if Series, otherwise ndarray.
  86. See Also
  87. --------
  88. DataFrame.astype : Cast argument to a specified dtype.
  89. to_datetime : Convert argument to datetime.
  90. to_timedelta : Convert argument to timedelta.
  91. numpy.ndarray.astype : Cast a numpy array to a specified type.
  92. DataFrame.convert_dtypes : Convert dtypes.
  93. Examples
  94. --------
  95. Take separate series and convert to numeric, coercing when told to
  96. >>> s = pd.Series(['1.0', '2', -3])
  97. >>> pd.to_numeric(s)
  98. 0 1.0
  99. 1 2.0
  100. 2 -3.0
  101. dtype: float64
  102. >>> pd.to_numeric(s, downcast='float')
  103. 0 1.0
  104. 1 2.0
  105. 2 -3.0
  106. dtype: float32
  107. >>> pd.to_numeric(s, downcast='signed')
  108. 0 1
  109. 1 2
  110. 2 -3
  111. dtype: int8
  112. >>> s = pd.Series(['apple', '1.0', '2', -3])
  113. >>> pd.to_numeric(s, errors='ignore')
  114. 0 apple
  115. 1 1.0
  116. 2 2
  117. 3 -3
  118. dtype: object
  119. >>> pd.to_numeric(s, errors='coerce')
  120. 0 NaN
  121. 1 1.0
  122. 2 2.0
  123. 3 -3.0
  124. dtype: float64
  125. Downcasting of nullable integer and floating dtypes is supported:
  126. >>> s = pd.Series([1, 2, 3], dtype="Int64")
  127. >>> pd.to_numeric(s, downcast="integer")
  128. 0 1
  129. 1 2
  130. 2 3
  131. dtype: Int8
  132. >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
  133. >>> pd.to_numeric(s, downcast="float")
  134. 0 1.0
  135. 1 2.1
  136. 2 3.0
  137. dtype: Float32
  138. """
  139. if downcast not in (None, "integer", "signed", "unsigned", "float"):
  140. raise ValueError("invalid downcasting method provided")
  141. if errors not in ("ignore", "raise", "coerce"):
  142. raise ValueError("invalid error value specified")
  143. check_dtype_backend(dtype_backend)
  144. is_series = False
  145. is_index = False
  146. is_scalars = False
  147. if isinstance(arg, ABCSeries):
  148. is_series = True
  149. values = arg.values
  150. elif isinstance(arg, ABCIndex):
  151. is_index = True
  152. if needs_i8_conversion(arg.dtype):
  153. values = arg.view("i8")
  154. else:
  155. values = arg.values
  156. elif isinstance(arg, (list, tuple)):
  157. values = np.array(arg, dtype="O")
  158. elif is_scalar(arg):
  159. if is_decimal(arg):
  160. return float(arg)
  161. if is_number(arg):
  162. return arg
  163. is_scalars = True
  164. values = np.array([arg], dtype="O")
  165. elif getattr(arg, "ndim", 1) > 1:
  166. raise TypeError("arg must be a list, tuple, 1-d array, or Series")
  167. else:
  168. values = arg
  169. orig_values = values
  170. # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
  171. # save mask to reconstruct the full array after casting
  172. mask: npt.NDArray[np.bool_] | None = None
  173. if isinstance(values, BaseMaskedArray):
  174. mask = values._mask
  175. values = values._data[~mask]
  176. values_dtype = getattr(values, "dtype", None)
  177. if isinstance(values_dtype, pd.ArrowDtype):
  178. mask = values.isna()
  179. values = values.dropna().to_numpy()
  180. new_mask: np.ndarray | None = None
  181. if is_numeric_dtype(values_dtype):
  182. pass
  183. elif is_datetime_or_timedelta_dtype(values_dtype):
  184. values = values.view(np.int64)
  185. else:
  186. values = ensure_object(values)
  187. coerce_numeric = errors not in ("ignore", "raise")
  188. try:
  189. values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] # noqa
  190. values,
  191. set(),
  192. coerce_numeric=coerce_numeric,
  193. convert_to_masked_nullable=dtype_backend is not lib.no_default
  194. or isinstance(values_dtype, StringDtype),
  195. )
  196. except (ValueError, TypeError):
  197. if errors == "raise":
  198. raise
  199. values = orig_values
  200. if new_mask is not None:
  201. # Remove unnecessary values, is expected later anyway and enables
  202. # downcasting
  203. values = values[~new_mask]
  204. elif (
  205. dtype_backend is not lib.no_default
  206. and new_mask is None
  207. or isinstance(values_dtype, StringDtype)
  208. ):
  209. new_mask = np.zeros(values.shape, dtype=np.bool_)
  210. # attempt downcast only if the data has been successfully converted
  211. # to a numerical dtype and if a downcast method has been specified
  212. if downcast is not None and is_numeric_dtype(values.dtype):
  213. typecodes: str | None = None
  214. if downcast in ("integer", "signed"):
  215. typecodes = np.typecodes["Integer"]
  216. elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
  217. typecodes = np.typecodes["UnsignedInteger"]
  218. elif downcast == "float":
  219. typecodes = np.typecodes["Float"]
  220. # pandas support goes only to np.float32,
  221. # as float dtypes smaller than that are
  222. # extremely rare and not well supported
  223. float_32_char = np.dtype(np.float32).char
  224. float_32_ind = typecodes.index(float_32_char)
  225. typecodes = typecodes[float_32_ind:]
  226. if typecodes is not None:
  227. # from smallest to largest
  228. for typecode in typecodes:
  229. dtype = np.dtype(typecode)
  230. if dtype.itemsize <= values.dtype.itemsize:
  231. values = maybe_downcast_numeric(values, dtype)
  232. # successful conversion
  233. if values.dtype == dtype:
  234. break
  235. # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
  236. # masked array
  237. if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
  238. if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
  239. # GH 52588
  240. mask = new_mask
  241. else:
  242. mask = mask.copy()
  243. assert isinstance(mask, np.ndarray)
  244. data = np.zeros(mask.shape, dtype=values.dtype)
  245. data[~mask] = values
  246. from pandas.core.arrays import (
  247. ArrowExtensionArray,
  248. BooleanArray,
  249. FloatingArray,
  250. IntegerArray,
  251. )
  252. klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray]
  253. if is_integer_dtype(data.dtype):
  254. klass = IntegerArray
  255. elif is_bool_dtype(data.dtype):
  256. klass = BooleanArray
  257. else:
  258. klass = FloatingArray
  259. values = klass(data, mask)
  260. if dtype_backend == "pyarrow" or isinstance(values_dtype, pd.ArrowDtype):
  261. values = ArrowExtensionArray(values.__arrow_array__())
  262. if is_series:
  263. return arg._constructor(values, index=arg.index, name=arg.name)
  264. elif is_index:
  265. # because we want to coerce to numeric if possible,
  266. # do not use _shallow_copy
  267. return pd.Index(values, name=arg.name)
  268. elif is_scalars:
  269. return values[0]
  270. else:
  271. return values