"""
A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
"""

from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
import enum
from typing import (
    Any,
    Iterable,
    Sequence,
    TypedDict,
)
class DlpackDeviceType(enum.IntEnum):
    """Integer enum for device type codes matching DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    # values 5 and 6 are intentionally absent — the codes mirror DLPack's
    # device-type numbering, which has gaps (see class docstring).
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10
class DtypeKind(enum.IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    # jump to 20 keeps the low codes aligned with DLPack and leaves room for
    # future extension (see the notes in ``Column.dtype``)
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
class ColumnNullType(enum.IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN/NaT.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
class ColumnBuffers(TypedDict):
    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[Buffer, Any]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[Buffer, Any] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[Buffer, Any] | None
class CategoricalDescription(TypedDict):
    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool

    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: bool

    # Python-level only (e.g. ``{int: str}``).
    # None if not a dictionary-style categorical.
    categories: Column | None
class Buffer(ABC):
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    @property
    @abstractmethod
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """

    @property
    @abstractmethod
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """

    @abstractmethod
    def __dlpack__(self):
        """
        Produce DLPack capsule (see array API standard).

        Raises:
            - TypeError : if the buffer contains unsupported dtypes.
            - NotImplementedError : if DLPack support is not implemented

        Useful to have to connect to array libraries. Support optional because
        it's not completely trivial to implement for a Python-only library.
        """
        # abstract *and* raising: subclasses that override must still be able
        # to signal "no DLPack support" via NotImplementedError
        raise NotImplementedError("__dlpack__")

    @abstractmethod
    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        Uses device type codes matching DLPack.

        Note: must be implemented even if ``__dlpack__`` is not.
        """
class Column(ABC):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.

         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to DataFrame.num_rows() if column is a single chunk;
        equal to size of this current chunk otherwise.
        """

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Offset of first element.

        May be > 0 if using chunks; for example for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """

    @property
    @abstractmethod
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:
            - Kind specifiers are aligned with DLPack where possible (hence the
              jump to 20, leave enough room for future extension)
            - Masks must be specified as boolean with either bit width 1 (for
              bit masks) or 8 (for byte masks).
            - Dtype width in bits was preferred over bytes
            - Endianness isn't too useful, but included now in case in the
              future we need to support non-native endianness
            - Went with Apache Arrow format strings over NumPy format strings
              because they're more complete from a dataframe perspective
            - Format strings are mostly useful for datetime specification, and
              for categoricals.
            - For categoricals, the format string describes the type of the
              categorical in the data buffer. In case of a separate encoding of
              the categorical (e.g. an integer to string mapping), this can
              be derived from ``self.describe_categorical``.
            - Data types not included: complex, Arrow-style null, binary,
              decimal, and nested (list, struct, map, union) dtypes.
        """

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Returns the dictionary with description on how to interpret the data buffer:
            - "is_ordered" : bool, whether the ordering of dictionary indices
              is semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of categorical values
              to other objects exists
            - "categories" : Column representing the (implicit) mapping of
              indices to category values (e.g. an array of cat1, cat2, ...).
              None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """

    @property
    @abstractmethod
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Value : if kind is "sentinel value", the actual value. If kind is a bit
        mask or a byte mask, the value (0 or 1) indicating a missing value.
        None otherwise.
        """

    @property
    @abstractmethod
    def null_count(self) -> int | None:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the column. See `DataFrame.metadata` for more details.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.
        """

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:
            - "data": a two-element tuple whose first element is a buffer
              containing the data and whose second element is the data
              buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
              containing mask values indicating missing data and whose second
              element is the mask value buffer's associated dtype. None if the
              null representation is not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
              containing the offset values for variable-size binary data
              (e.g., variable-length strings) and whose second element is the
              offsets buffer's associated dtype. None if the data buffer does
              not have an associated offsets buffer.
        """

    # def get_children(self) -> Iterable[Column]:
    #     """
    #     Children columns underneath the column, each object in this iterator
    #     must adhere to the column specification.
    #     """
    #     pass
class DataFrame(ABC):
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods and
    attributes defined on this DataFrame class could be returned from the
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """

    version = 0  # version of the protocol

    @abstractmethod
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        """Construct a new interchange object, potentially changing the parameters."""

    @property
    @abstractmethod
    def metadata(self) -> dict[str, Any]:
        """
        The metadata for the data frame, as a dictionary with string keys. The
        contents of `metadata` may be anything, they are meant for a library
        to store information that it needs to, e.g., roundtrip losslessly or
        for two implementations to share data that is not (yet) part of the
        interchange protocol specification. For avoiding collisions with other
        entries, please name the keys with the name of the library
        followed by a period and the desired name, e.g., ``pandas.indexcol``.
        """

    @abstractmethod
    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """

    @abstractmethod
    def num_rows(self) -> int | None:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """

    @abstractmethod
    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """

    @abstractmethod
    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """

    @abstractmethod
    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column whose name is the indicated name.
        """

    @abstractmethod
    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """

    @abstractmethod
    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """

    @abstractmethod
    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """

    @abstractmethod
    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (None), yields the chunks that the data is stored as by the
        producer. If given, ``n_chunks`` must be a multiple of
        ``self.num_chunks()``, meaning the producer must subdivide each chunk
        before yielding it.
        """
|