array.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. """
  2. Test extension array for storing nested data in a pandas container.
  3. The ListArray stores an ndarray of lists.
  4. """
  5. from __future__ import annotations
  6. import numbers
  7. import random
  8. import string
  9. import numpy as np
  10. from pandas._typing import type_t
  11. from pandas.core.dtypes.base import ExtensionDtype
  12. import pandas as pd
  13. from pandas.api.types import (
  14. is_object_dtype,
  15. is_string_dtype,
  16. )
  17. from pandas.core.arrays import ExtensionArray
  18. class ListDtype(ExtensionDtype):
  19. type = list
  20. name = "list"
  21. na_value = np.nan
  22. @classmethod
  23. def construct_array_type(cls) -> type_t[ListArray]:
  24. """
  25. Return the array type associated with this dtype.
  26. Returns
  27. -------
  28. type
  29. """
  30. return ListArray
  31. class ListArray(ExtensionArray):
  32. dtype = ListDtype()
  33. __array_priority__ = 1000
  34. def __init__(self, values, dtype=None, copy=False) -> None:
  35. if not isinstance(values, np.ndarray):
  36. raise TypeError("Need to pass a numpy array as values")
  37. for val in values:
  38. if not isinstance(val, self.dtype.type) and not pd.isna(val):
  39. raise TypeError("All values must be of type " + str(self.dtype.type))
  40. self.data = values
  41. @classmethod
  42. def _from_sequence(cls, scalars, dtype=None, copy=False):
  43. data = np.empty(len(scalars), dtype=object)
  44. data[:] = scalars
  45. return cls(data)
  46. def __getitem__(self, item):
  47. if isinstance(item, numbers.Integral):
  48. return self.data[item]
  49. else:
  50. # slice, list-like, mask
  51. return type(self)(self.data[item])
  52. def __len__(self) -> int:
  53. return len(self.data)
  54. def isna(self):
  55. return np.array(
  56. [not isinstance(x, list) and np.isnan(x) for x in self.data], dtype=bool
  57. )
  58. def take(self, indexer, allow_fill=False, fill_value=None):
  59. # re-implement here, since NumPy has trouble setting
  60. # sized objects like UserDicts into scalar slots of
  61. # an ndarary.
  62. indexer = np.asarray(indexer)
  63. msg = (
  64. "Index is out of bounds or cannot do a "
  65. "non-empty take from an empty array."
  66. )
  67. if allow_fill:
  68. if fill_value is None:
  69. fill_value = self.dtype.na_value
  70. # bounds check
  71. if (indexer < -1).any():
  72. raise ValueError
  73. try:
  74. output = [
  75. self.data[loc] if loc != -1 else fill_value for loc in indexer
  76. ]
  77. except IndexError as err:
  78. raise IndexError(msg) from err
  79. else:
  80. try:
  81. output = [self.data[loc] for loc in indexer]
  82. except IndexError as err:
  83. raise IndexError(msg) from err
  84. return self._from_sequence(output)
  85. def copy(self):
  86. return type(self)(self.data[:])
  87. def astype(self, dtype, copy=True):
  88. if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
  89. if copy:
  90. return self.copy()
  91. return self
  92. elif is_string_dtype(dtype) and not is_object_dtype(dtype):
  93. # numpy has problems with astype(str) for nested elements
  94. return np.array([str(x) for x in self.data], dtype=dtype)
  95. return np.array(self.data, dtype=dtype, copy=copy)
  96. @classmethod
  97. def _concat_same_type(cls, to_concat):
  98. data = np.concatenate([x.data for x in to_concat])
  99. return cls(data)
  100. def make_data():
  101. # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
  102. data = np.empty(100, dtype=object)
  103. data[:] = [
  104. [random.choice(string.ascii_letters) for _ in range(random.randint(0, 10))]
  105. for _ in range(100)
  106. ]
  107. return data