array.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
"""
Test extension array for storing nested data in a pandas container.

The JSONArray stores lists of dictionaries. The storage mechanism is a list,
not an ndarray.

Note
----
We currently store lists of UserDicts. Pandas has a few places
internally that specifically check for dicts, and does non-scalar things
in that case. We *want* the dictionaries to be treated as scalars, so we
hack around pandas by using UserDicts.
"""
  12. from __future__ import annotations
  13. from collections import (
  14. UserDict,
  15. abc,
  16. )
  17. import itertools
  18. import numbers
  19. import random
  20. import string
  21. import sys
  22. from typing import (
  23. Any,
  24. Mapping,
  25. )
  26. import numpy as np
  27. from pandas._typing import type_t
  28. from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
  29. from pandas.core.dtypes.common import (
  30. is_bool_dtype,
  31. is_list_like,
  32. pandas_dtype,
  33. )
  34. import pandas as pd
  35. from pandas.api.extensions import (
  36. ExtensionArray,
  37. ExtensionDtype,
  38. )
  39. from pandas.core.indexers import unpack_tuple_and_ellipses
  40. class JSONDtype(ExtensionDtype):
  41. type = abc.Mapping
  42. name = "json"
  43. na_value: Mapping[str, Any] = UserDict()
  44. @classmethod
  45. def construct_array_type(cls) -> type_t[JSONArray]:
  46. """
  47. Return the array type associated with this dtype.
  48. Returns
  49. -------
  50. type
  51. """
  52. return JSONArray
  53. class JSONArray(ExtensionArray):
  54. dtype = JSONDtype()
  55. __array_priority__ = 1000
  56. def __init__(self, values, dtype=None, copy=False) -> None:
  57. for val in values:
  58. if not isinstance(val, self.dtype.type):
  59. raise TypeError("All values must be of type " + str(self.dtype.type))
  60. self.data = values
  61. # Some aliases for common attribute names to ensure pandas supports
  62. # these
  63. self._items = self._data = self.data
  64. # those aliases are currently not working due to assumptions
  65. # in internal code (GH-20735)
  66. # self._values = self.values = self.data
  67. @classmethod
  68. def _from_sequence(cls, scalars, dtype=None, copy=False):
  69. return cls(scalars)
  70. @classmethod
  71. def _from_factorized(cls, values, original):
  72. return cls([UserDict(x) for x in values if x != ()])
  73. def __getitem__(self, item):
  74. if isinstance(item, tuple):
  75. item = unpack_tuple_and_ellipses(item)
  76. if isinstance(item, numbers.Integral):
  77. return self.data[item]
  78. elif isinstance(item, slice) and item == slice(None):
  79. # Make sure we get a view
  80. return type(self)(self.data)
  81. elif isinstance(item, slice):
  82. # slice
  83. return type(self)(self.data[item])
  84. elif not is_list_like(item):
  85. # e.g. "foo" or 2.5
  86. # exception message copied from numpy
  87. raise IndexError(
  88. r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
  89. r"(`None`) and integer or boolean arrays are valid indices"
  90. )
  91. else:
  92. item = pd.api.indexers.check_array_indexer(self, item)
  93. if is_bool_dtype(item.dtype):
  94. return self._from_sequence([x for x, m in zip(self, item) if m])
  95. # integer
  96. return type(self)([self.data[i] for i in item])
  97. def __setitem__(self, key, value):
  98. if isinstance(key, numbers.Integral):
  99. self.data[key] = value
  100. else:
  101. if not isinstance(value, (type(self), abc.Sequence)):
  102. # broadcast value
  103. value = itertools.cycle([value])
  104. if isinstance(key, np.ndarray) and key.dtype == "bool":
  105. # masking
  106. for i, (k, v) in enumerate(zip(key, value)):
  107. if k:
  108. assert isinstance(v, self.dtype.type)
  109. self.data[i] = v
  110. else:
  111. for k, v in zip(key, value):
  112. assert isinstance(v, self.dtype.type)
  113. self.data[k] = v
  114. def __len__(self) -> int:
  115. return len(self.data)
  116. def __eq__(self, other):
  117. return NotImplemented
  118. def __ne__(self, other):
  119. return NotImplemented
  120. def __array__(self, dtype=None):
  121. if dtype is None:
  122. dtype = object
  123. if dtype == object:
  124. # on py38 builds it looks like numpy is inferring to a non-1D array
  125. return construct_1d_object_array_from_listlike(list(self))
  126. return np.asarray(self.data, dtype=dtype)
  127. @property
  128. def nbytes(self) -> int:
  129. return sys.getsizeof(self.data)
  130. def isna(self):
  131. return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)
  132. def take(self, indexer, allow_fill=False, fill_value=None):
  133. # re-implement here, since NumPy has trouble setting
  134. # sized objects like UserDicts into scalar slots of
  135. # an ndarary.
  136. indexer = np.asarray(indexer)
  137. msg = (
  138. "Index is out of bounds or cannot do a "
  139. "non-empty take from an empty array."
  140. )
  141. if allow_fill:
  142. if fill_value is None:
  143. fill_value = self.dtype.na_value
  144. # bounds check
  145. if (indexer < -1).any():
  146. raise ValueError
  147. try:
  148. output = [
  149. self.data[loc] if loc != -1 else fill_value for loc in indexer
  150. ]
  151. except IndexError as err:
  152. raise IndexError(msg) from err
  153. else:
  154. try:
  155. output = [self.data[loc] for loc in indexer]
  156. except IndexError as err:
  157. raise IndexError(msg) from err
  158. return self._from_sequence(output)
  159. def copy(self):
  160. return type(self)(self.data[:])
  161. def astype(self, dtype, copy=True):
  162. # NumPy has issues when all the dicts are the same length.
  163. # np.array([UserDict(...), UserDict(...)]) fails,
  164. # but np.array([{...}, {...}]) works, so cast.
  165. from pandas.core.arrays.string_ import StringDtype
  166. dtype = pandas_dtype(dtype)
  167. # needed to add this check for the Series constructor
  168. if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
  169. if copy:
  170. return self.copy()
  171. return self
  172. elif isinstance(dtype, StringDtype):
  173. value = self.astype(str) # numpy doesn't like nested dicts
  174. return dtype.construct_array_type()._from_sequence(value, copy=False)
  175. return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
  176. def unique(self):
  177. # Parent method doesn't work since np.array will try to infer
  178. # a 2-dim object.
  179. return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])
  180. @classmethod
  181. def _concat_same_type(cls, to_concat):
  182. data = list(itertools.chain.from_iterable(x.data for x in to_concat))
  183. return cls(data)
  184. def _values_for_factorize(self):
  185. frozen = self._values_for_argsort()
  186. if len(frozen) == 0:
  187. # factorize_array expects 1-d array, this is a len-0 2-d array.
  188. frozen = frozen.ravel()
  189. return frozen, ()
  190. def _values_for_argsort(self):
  191. # Bypass NumPy's shape inference to get a (N,) array of tuples.
  192. frozen = [tuple(x.items()) for x in self]
  193. return construct_1d_object_array_from_listlike(frozen)
  194. def make_data():
  195. # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
  196. return [
  197. UserDict(
  198. [
  199. (random.choice(string.ascii_letters), random.randint(0, 100))
  200. for _ in range(random.randint(0, 10))
  201. ]
  202. )
  203. for _ in range(100)
  204. ]