123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
- """
- Test extension array for storing nested data in a pandas container.
- The ListArray stores an ndarray of lists.
- """
- from __future__ import annotations
- import numbers
- import random
- import string
- import numpy as np
- from pandas._typing import type_t
- from pandas.core.dtypes.base import ExtensionDtype
- import pandas as pd
- from pandas.api.types import (
- is_object_dtype,
- is_string_dtype,
- )
- from pandas.core.arrays import ExtensionArray
class ListDtype(ExtensionDtype):
    """Extension dtype describing arrays whose scalar elements are lists."""

    # String identifier of the dtype.
    name = "list"
    # Python type of a single (non-missing) element.
    type = list
    # Sentinel used to represent a missing element.
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[ListArray]:
        """
        Return the array class that goes with this dtype.

        Returns
        -------
        type
            The ``ListArray`` class.
        """
        return ListArray
class ListArray(ExtensionArray):
    """
    Extension array whose elements are Python lists or missing values.

    The elements are kept in ``self.data``, a ``numpy.ndarray``.
    """

    dtype = ListDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False) -> None:
        """
        Wrap an existing NumPy array of lists.

        Parameters
        ----------
        values : numpy.ndarray
            Array whose elements are lists or missing values.
        dtype : optional
            Accepted for signature compatibility; not used.
        copy : bool, default False
            If True, store a copy of ``values`` instead of ``values`` itself.

        Raises
        ------
        TypeError
            If ``values`` is not a NumPy array, or if any element is neither
            a list nor recognised as missing by ``pd.isna``.
        """
        if not isinstance(values, np.ndarray):
            raise TypeError("Need to pass a numpy array as values")
        for val in values:
            if not isinstance(val, self.dtype.type) and not pd.isna(val):
                raise TypeError("All values must be of type " + str(self.dtype.type))
        self.data = values.copy() if copy else values

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Build a ListArray from a sized iterable of lists / missing values.

        Parameters
        ----------
        scalars : sized iterable
            Elements to store.
        dtype, copy : optional
            Accepted for signature compatibility; not used.

        Returns
        -------
        ListArray
        """
        data = np.empty(len(scalars), dtype=object)
        # Fill one slot at a time. A bulk ``data[:] = scalars`` lets NumPy
        # interpret equal-length lists as a 2-D block, which then cannot be
        # broadcast into the 1-D object array.
        for pos, val in enumerate(scalars):
            data[pos] = val
        return cls(data)

    def __getitem__(self, item):
        """
        Return one element for an integer key, otherwise a new ListArray.
        """
        if isinstance(item, numbers.Integral):
            return self.data[item]
        # slice, list-like, mask
        return type(self)(self.data[item])

    def __len__(self) -> int:
        """Return the number of elements."""
        return len(self.data)

    def isna(self):
        """
        Return a boolean ndarray marking the missing elements.

        A list is never missing; any other element is tested with
        ``pd.isna``, the same check ``__init__`` uses to accept it (so
        ``None`` is handled as well as ``nan``).
        """
        return np.array(
            [not isinstance(x, list) and bool(pd.isna(x)) for x in self.data],
            dtype=bool,
        )

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Select elements by position.

        Parameters
        ----------
        indexer : sequence of int
            Positions to take.
        allow_fill : bool, default False
            If True, ``-1`` in ``indexer`` marks a slot to fill with
            ``fill_value`` and any value below ``-1`` is rejected. If False,
            each position is used to index ``self.data`` directly.
        fill_value : optional
            Value for ``-1`` slots when ``allow_fill`` is True; defaults to
            ``self.dtype.na_value``.

        Returns
        -------
        ListArray

        Raises
        ------
        ValueError
            If ``allow_fill`` is True and ``indexer`` has a value below -1.
        IndexError
            If a position is out of bounds.
        """
        # re-implement here, since NumPy has trouble setting
        # sized objects like lists into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError(
                    "Invalid value in 'indexer': with allow_fill=True all "
                    "values must be >= -1"
                )
            try:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            except IndexError as err:
                raise IndexError(msg) from err
        else:
            try:
                output = [self.data[loc] for loc in indexer]
            except IndexError as err:
                raise IndexError(msg) from err

        return self._from_sequence(output)

    def copy(self):
        """
        Return a new ListArray backed by a copy of the underlying ndarray.

        The ndarray is copied (``self.data[:]`` would only be a view sharing
        the same buffer); the list objects it refers to are not duplicated.
        """
        return type(self)(self.data.copy())

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        Returns a ListArray when ``dtype`` equals this array's dtype (``self``
        itself if ``copy`` is False), otherwise a NumPy array.
        """
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # numpy has problems with astype(str) for nested elements
            return np.array([str(x) for x in self.data], dtype=dtype)
        elif not copy:
            # ``np.array(..., copy=False)`` raises under NumPy 2 whenever a
            # copy is unavoidable; ``asarray`` copies only if needed.
            return np.asarray(self.data, dtype=dtype)
        return np.array(self.data, dtype=dtype, copy=copy)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the ``data`` of several ListArrays into a new one."""
        data = np.concatenate([x.data for x in to_concat])
        return cls(data)
def make_data(size=100):
    """
    Build an object-dtype array of random lists of ASCII letters.

    Parameters
    ----------
    size : int, default 100
        Number of lists to generate.

    Returns
    -------
    numpy.ndarray
        Object-dtype array of length ``size``; each element is a list of
        between 0 and 10 single-character strings drawn from
        ``string.ascii_letters``.
    """
    data = np.empty(size, dtype=object)
    # Assign slot by slot: a bulk ``data[:] = lists`` makes NumPy treat
    # equal-length lists as a 2-D block that cannot be broadcast into the
    # 1-D object array (guaranteed to happen when ``size`` is 1).
    for pos in range(size):
        data[pos] = [
            random.choice(string.ascii_letters)
            for _ in range(random.randint(0, 10))
        ]
    return data
|