boolean.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. from __future__ import annotations
  2. import numbers
  3. from typing import (
  4. TYPE_CHECKING,
  5. cast,
  6. )
  7. import numpy as np
  8. from pandas._libs import (
  9. lib,
  10. missing as libmissing,
  11. )
  12. from pandas._typing import (
  13. Dtype,
  14. DtypeObj,
  15. type_t,
  16. )
  17. from pandas.core.dtypes.common import (
  18. is_list_like,
  19. is_numeric_dtype,
  20. )
  21. from pandas.core.dtypes.dtypes import register_extension_dtype
  22. from pandas.core.dtypes.missing import isna
  23. from pandas.core import ops
  24. from pandas.core.array_algos import masked_accumulations
  25. from pandas.core.arrays.masked import (
  26. BaseMaskedArray,
  27. BaseMaskedDtype,
  28. )
  29. if TYPE_CHECKING:
  30. import pyarrow
  31. from pandas._typing import npt
  32. @register_extension_dtype
  33. class BooleanDtype(BaseMaskedDtype):
  34. """
  35. Extension dtype for boolean data.
  36. .. warning::
  37. BooleanDtype is considered experimental. The implementation and
  38. parts of the API may change without warning.
  39. Attributes
  40. ----------
  41. None
  42. Methods
  43. -------
  44. None
  45. Examples
  46. --------
  47. >>> pd.BooleanDtype()
  48. BooleanDtype
  49. """
  50. name = "boolean"
  51. # https://github.com/python/mypy/issues/4125
  52. # error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
  53. @property
  54. def type(self) -> type: # type: ignore[override]
  55. return np.bool_
  56. @property
  57. def kind(self) -> str:
  58. return "b"
  59. @property
  60. def numpy_dtype(self) -> np.dtype:
  61. return np.dtype("bool")
  62. @classmethod
  63. def construct_array_type(cls) -> type_t[BooleanArray]:
  64. """
  65. Return the array type associated with this dtype.
  66. Returns
  67. -------
  68. type
  69. """
  70. return BooleanArray
  71. def __repr__(self) -> str:
  72. return "BooleanDtype"
  73. @property
  74. def _is_boolean(self) -> bool:
  75. return True
  76. @property
  77. def _is_numeric(self) -> bool:
  78. return True
  79. def __from_arrow__(
  80. self, array: pyarrow.Array | pyarrow.ChunkedArray
  81. ) -> BooleanArray:
  82. """
  83. Construct BooleanArray from pyarrow Array/ChunkedArray.
  84. """
  85. import pyarrow
  86. if array.type != pyarrow.bool_():
  87. raise TypeError(f"Expected array of boolean type, got {array.type} instead")
  88. if isinstance(array, pyarrow.Array):
  89. chunks = [array]
  90. else:
  91. # pyarrow.ChunkedArray
  92. chunks = array.chunks
  93. results = []
  94. for arr in chunks:
  95. buflist = arr.buffers()
  96. data = pyarrow.BooleanArray.from_buffers(
  97. arr.type, len(arr), [None, buflist[1]], offset=arr.offset
  98. ).to_numpy(zero_copy_only=False)
  99. if arr.null_count != 0:
  100. mask = pyarrow.BooleanArray.from_buffers(
  101. arr.type, len(arr), [None, buflist[0]], offset=arr.offset
  102. ).to_numpy(zero_copy_only=False)
  103. mask = ~mask
  104. else:
  105. mask = np.zeros(len(arr), dtype=bool)
  106. bool_arr = BooleanArray(data, mask)
  107. results.append(bool_arr)
  108. if not results:
  109. return BooleanArray(
  110. np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
  111. )
  112. else:
  113. return BooleanArray._concat_same_type(results)
  114. def coerce_to_array(
  115. values, mask=None, copy: bool = False
  116. ) -> tuple[np.ndarray, np.ndarray]:
  117. """
  118. Coerce the input values array to numpy arrays with a mask.
  119. Parameters
  120. ----------
  121. values : 1D list-like
  122. mask : bool 1D array, optional
  123. copy : bool, default False
  124. if True, copy the input
  125. Returns
  126. -------
  127. tuple of (values, mask)
  128. """
  129. if isinstance(values, BooleanArray):
  130. if mask is not None:
  131. raise ValueError("cannot pass mask for BooleanArray input")
  132. values, mask = values._data, values._mask
  133. if copy:
  134. values = values.copy()
  135. mask = mask.copy()
  136. return values, mask
  137. mask_values = None
  138. if isinstance(values, np.ndarray) and values.dtype == np.bool_:
  139. if copy:
  140. values = values.copy()
  141. elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype):
  142. mask_values = isna(values)
  143. values_bool = np.zeros(len(values), dtype=bool)
  144. values_bool[~mask_values] = values[~mask_values].astype(bool)
  145. if not np.all(
  146. values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
  147. ):
  148. raise TypeError("Need to pass bool-like values")
  149. values = values_bool
  150. else:
  151. values_object = np.asarray(values, dtype=object)
  152. inferred_dtype = lib.infer_dtype(values_object, skipna=True)
  153. integer_like = ("floating", "integer", "mixed-integer-float")
  154. if inferred_dtype not in ("boolean", "empty") + integer_like:
  155. raise TypeError("Need to pass bool-like values")
  156. # mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
  157. # within this branch, it assumes it can also be None
  158. mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
  159. values = np.zeros(len(values), dtype=bool)
  160. values[~mask_values] = values_object[~mask_values].astype(bool)
  161. # if the values were integer-like, validate it were actually 0/1's
  162. if (inferred_dtype in integer_like) and not (
  163. np.all(
  164. values[~mask_values].astype(float)
  165. == values_object[~mask_values].astype(float)
  166. )
  167. ):
  168. raise TypeError("Need to pass bool-like values")
  169. if mask is None and mask_values is None:
  170. mask = np.zeros(values.shape, dtype=bool)
  171. elif mask is None:
  172. mask = mask_values
  173. else:
  174. if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
  175. if mask_values is not None:
  176. mask = mask | mask_values
  177. else:
  178. if copy:
  179. mask = mask.copy()
  180. else:
  181. mask = np.array(mask, dtype=bool)
  182. if mask_values is not None:
  183. mask = mask | mask_values
  184. if values.shape != mask.shape:
  185. raise ValueError("values.shape and mask.shape must match")
  186. return values, mask
  187. class BooleanArray(BaseMaskedArray):
  188. """
  189. Array of boolean (True/False) data with missing values.
  190. This is a pandas Extension array for boolean data, under the hood
  191. represented by 2 numpy arrays: a boolean array with the data and
  192. a boolean array with the mask (True indicating missing).
  193. BooleanArray implements Kleene logic (sometimes called three-value
  194. logic) for logical operations. See :ref:`boolean.kleene` for more.
  195. To construct an BooleanArray from generic array-like input, use
  196. :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
  197. below).
  198. .. warning::
  199. BooleanArray is considered experimental. The implementation and
  200. parts of the API may change without warning.
  201. Parameters
  202. ----------
  203. values : numpy.ndarray
  204. A 1-d boolean-dtype array with the data.
  205. mask : numpy.ndarray
  206. A 1-d boolean-dtype array indicating missing values (True
  207. indicates missing).
  208. copy : bool, default False
  209. Whether to copy the `values` and `mask` arrays.
  210. Attributes
  211. ----------
  212. None
  213. Methods
  214. -------
  215. None
  216. Returns
  217. -------
  218. BooleanArray
  219. Examples
  220. --------
  221. Create an BooleanArray with :func:`pandas.array`:
  222. >>> pd.array([True, False, None], dtype="boolean")
  223. <BooleanArray>
  224. [True, False, <NA>]
  225. Length: 3, dtype: boolean
  226. """
  227. # The value used to fill '_data' to avoid upcasting
  228. _internal_fill_value = False
  229. # Fill values used for any/all
  230. # Incompatible types in assignment (expression has type "bool", base class
  231. # "BaseMaskedArray" defined the type as "<typing special form>")
  232. _truthy_value = True # type: ignore[assignment]
  233. _falsey_value = False # type: ignore[assignment]
  234. _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
  235. _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
  236. def __init__(
  237. self, values: np.ndarray, mask: np.ndarray, copy: bool = False
  238. ) -> None:
  239. if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
  240. raise TypeError(
  241. "values should be boolean numpy array. Use "
  242. "the 'pd.array' function instead"
  243. )
  244. self._dtype = BooleanDtype()
  245. super().__init__(values, mask, copy=copy)
  246. @property
  247. def dtype(self) -> BooleanDtype:
  248. return self._dtype
  249. @classmethod
  250. def _from_sequence_of_strings(
  251. cls,
  252. strings: list[str],
  253. *,
  254. dtype: Dtype | None = None,
  255. copy: bool = False,
  256. true_values: list[str] | None = None,
  257. false_values: list[str] | None = None,
  258. ) -> BooleanArray:
  259. true_values_union = cls._TRUE_VALUES.union(true_values or [])
  260. false_values_union = cls._FALSE_VALUES.union(false_values or [])
  261. def map_string(s) -> bool:
  262. if s in true_values_union:
  263. return True
  264. elif s in false_values_union:
  265. return False
  266. else:
  267. raise ValueError(f"{s} cannot be cast to bool")
  268. scalars = np.array(strings, dtype=object)
  269. mask = isna(scalars)
  270. scalars[~mask] = list(map(map_string, scalars[~mask]))
  271. return cls._from_sequence(scalars, dtype=dtype, copy=copy)
  272. _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
  273. @classmethod
  274. def _coerce_to_array(
  275. cls, value, *, dtype: DtypeObj, copy: bool = False
  276. ) -> tuple[np.ndarray, np.ndarray]:
  277. if dtype:
  278. assert dtype == "boolean"
  279. return coerce_to_array(value, copy=copy)
  280. def _logical_method(self, other, op):
  281. assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
  282. other_is_scalar = lib.is_scalar(other)
  283. mask = None
  284. if isinstance(other, BooleanArray):
  285. other, mask = other._data, other._mask
  286. elif is_list_like(other):
  287. other = np.asarray(other, dtype="bool")
  288. if other.ndim > 1:
  289. raise NotImplementedError("can only perform ops with 1-d structures")
  290. other, mask = coerce_to_array(other, copy=False)
  291. elif isinstance(other, np.bool_):
  292. other = other.item()
  293. if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
  294. raise TypeError(
  295. "'other' should be pandas.NA or a bool. "
  296. f"Got {type(other).__name__} instead."
  297. )
  298. if not other_is_scalar and len(self) != len(other):
  299. raise ValueError("Lengths must match")
  300. if op.__name__ in {"or_", "ror_"}:
  301. result, mask = ops.kleene_or(self._data, other, self._mask, mask)
  302. elif op.__name__ in {"and_", "rand_"}:
  303. result, mask = ops.kleene_and(self._data, other, self._mask, mask)
  304. else:
  305. # i.e. xor, rxor
  306. result, mask = ops.kleene_xor(self._data, other, self._mask, mask)
  307. # i.e. BooleanArray
  308. return self._maybe_mask_result(result, mask)
  309. def _accumulate(
  310. self, name: str, *, skipna: bool = True, **kwargs
  311. ) -> BaseMaskedArray:
  312. data = self._data
  313. mask = self._mask
  314. if name in ("cummin", "cummax"):
  315. op = getattr(masked_accumulations, name)
  316. data, mask = op(data, mask, skipna=skipna, **kwargs)
  317. return type(self)(data, mask, copy=False)
  318. else:
  319. from pandas.core.arrays import IntegerArray
  320. return IntegerArray(data.astype(int), mask)._accumulate(
  321. name, skipna=skipna, **kwargs
  322. )