dataframe_protocol.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. """
  2. A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
  3. """
  4. from __future__ import annotations
  5. from abc import (
  6. ABC,
  7. abstractmethod,
  8. )
  9. import enum
  10. from typing import (
  11. Any,
  12. Iterable,
  13. Sequence,
  14. TypedDict,
  15. )
  16. class DlpackDeviceType(enum.IntEnum):
  17. """Integer enum for device type codes matching DLPack."""
  18. CPU = 1
  19. CUDA = 2
  20. CPU_PINNED = 3
  21. OPENCL = 4
  22. VULKAN = 7
  23. METAL = 8
  24. VPI = 9
  25. ROCM = 10
  26. class DtypeKind(enum.IntEnum):
  27. """
  28. Integer enum for data types.
  29. Attributes
  30. ----------
  31. INT : int
  32. Matches to signed integer data type.
  33. UINT : int
  34. Matches to unsigned integer data type.
  35. FLOAT : int
  36. Matches to floating point data type.
  37. BOOL : int
  38. Matches to boolean data type.
  39. STRING : int
  40. Matches to string data type (UTF-8 encoded).
  41. DATETIME : int
  42. Matches to datetime data type.
  43. CATEGORICAL : int
  44. Matches to categorical data type.
  45. """
  46. INT = 0
  47. UINT = 1
  48. FLOAT = 2
  49. BOOL = 20
  50. STRING = 21 # UTF-8
  51. DATETIME = 22
  52. CATEGORICAL = 23
  53. class ColumnNullType(enum.IntEnum):
  54. """
  55. Integer enum for null type representation.
  56. Attributes
  57. ----------
  58. NON_NULLABLE : int
  59. Non-nullable column.
  60. USE_NAN : int
  61. Use explicit float NaN value.
  62. USE_SENTINEL : int
  63. Sentinel value besides NaN/NaT.
  64. USE_BITMASK : int
  65. The bit is set/unset representing a null on a certain position.
  66. USE_BYTEMASK : int
  67. The byte is set/unset representing a null on a certain position.
  68. """
  69. NON_NULLABLE = 0
  70. USE_NAN = 1
  71. USE_SENTINEL = 2
  72. USE_BITMASK = 3
  73. USE_BYTEMASK = 4
  74. class ColumnBuffers(TypedDict):
  75. # first element is a buffer containing the column data;
  76. # second element is the data buffer's associated dtype
  77. data: tuple[Buffer, Any]
  78. # first element is a buffer containing mask values indicating missing data;
  79. # second element is the mask value buffer's associated dtype.
  80. # None if the null representation is not a bit or byte mask
  81. validity: tuple[Buffer, Any] | None
  82. # first element is a buffer containing the offset values for
  83. # variable-size binary data (e.g., variable-length strings);
  84. # second element is the offsets buffer's associated dtype.
  85. # None if the data buffer does not have an associated offsets buffer
  86. offsets: tuple[Buffer, Any] | None
  87. class CategoricalDescription(TypedDict):
  88. # whether the ordering of dictionary indices is semantically meaningful
  89. is_ordered: bool
  90. # whether a dictionary-style mapping of categorical values to other objects exists
  91. is_dictionary: bool
  92. # Python-level only (e.g. ``{int: str}``).
  93. # None if not a dictionary-style categorical.
  94. categories: Column | None
  95. class Buffer(ABC):
  96. """
  97. Data in the buffer is guaranteed to be contiguous in memory.
  98. Note that there is no dtype attribute present, a buffer can be thought of
  99. as simply a block of memory. However, if the column that the buffer is
  100. attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
  101. implemented, then that dtype information will be contained in the return
  102. value from ``__dlpack__``.
  103. This distinction is useful to support both data exchange via DLPack on a
  104. buffer and (b) dtypes like variable-length strings which do not have a
  105. fixed number of bytes per element.
  106. """
  107. @property
  108. @abstractmethod
  109. def bufsize(self) -> int:
  110. """
  111. Buffer size in bytes.
  112. """
  113. @property
  114. @abstractmethod
  115. def ptr(self) -> int:
  116. """
  117. Pointer to start of the buffer as an integer.
  118. """
  119. @abstractmethod
  120. def __dlpack__(self):
  121. """
  122. Produce DLPack capsule (see array API standard).
  123. Raises:
  124. - TypeError : if the buffer contains unsupported dtypes.
  125. - NotImplementedError : if DLPack support is not implemented
  126. Useful to have to connect to array libraries. Support optional because
  127. it's not completely trivial to implement for a Python-only library.
  128. """
  129. raise NotImplementedError("__dlpack__")
  130. @abstractmethod
  131. def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
  132. """
  133. Device type and device ID for where the data in the buffer resides.
  134. Uses device type codes matching DLPack.
  135. Note: must be implemented even if ``__dlpack__`` is not.
  136. """
  137. class Column(ABC):
  138. """
  139. A column object, with only the methods and properties required by the
  140. interchange protocol defined.
  141. A column can contain one or more chunks. Each chunk can contain up to three
  142. buffers - a data buffer, a mask buffer (depending on null representation),
  143. and an offsets buffer (if variable-size binary; e.g., variable-length
  144. strings).
  145. TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
  146. Instead, it seems to use "children" for both columns with a bit mask,
  147. and for nested dtypes. Unclear whether this is elegant or confusing.
  148. This design requires checking the null representation explicitly.
  149. The Arrow design requires checking:
  150. 1. the ARROW_FLAG_NULLABLE (for sentinel values)
  151. 2. if a column has two children, combined with one of those children
  152. having a null dtype.
  153. Making the mask concept explicit seems useful. One null dtype would
  154. not be enough to cover both bit and byte masks, so that would mean
  155. even more checking if we did it the Arrow way.
  156. TBD: there's also the "chunk" concept here, which is implicit in Arrow as
  157. multiple buffers per array (= column here). Semantically it may make
  158. sense to have both: chunks were meant for example for lazy evaluation
  159. of data which doesn't fit in memory, while multiple buffers per column
  160. could also come from doing a selection operation on a single
  161. contiguous buffer.
  162. Given these concepts, one would expect chunks to be all of the same
  163. size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
  164. while multiple buffers could have data-dependent lengths. Not an issue
  165. in pandas if one column is backed by a single NumPy array, but in
  166. Arrow it seems possible.
  167. Are multiple chunks *and* multiple buffers per column necessary for
  168. the purposes of this interchange protocol, or must producers either
  169. reuse the chunk concept for this or copy the data?
  170. Note: this Column object can only be produced by ``__dataframe__``, so
  171. doesn't need its own version or ``__column__`` protocol.
  172. """
  173. @abstractmethod
  174. def size(self) -> int:
  175. """
  176. Size of the column, in elements.
  177. Corresponds to DataFrame.num_rows() if column is a single chunk;
  178. equal to size of this current chunk otherwise.
  179. """
  180. @property
  181. @abstractmethod
  182. def offset(self) -> int:
  183. """
  184. Offset of first element.
  185. May be > 0 if using chunks; for example for a column with N chunks of
  186. equal size M (only the last chunk may be shorter),
  187. ``offset = n * M``, ``n = 0 .. N-1``.
  188. """
  189. @property
  190. @abstractmethod
  191. def dtype(self) -> tuple[DtypeKind, int, str, str]:
  192. """
  193. Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
  194. Bit-width : the number of bits as an integer
  195. Format string : data type description format string in Apache Arrow C
  196. Data Interface format.
  197. Endianness : current only native endianness (``=``) is supported
  198. Notes:
  199. - Kind specifiers are aligned with DLPack where possible (hence the
  200. jump to 20, leave enough room for future extension)
  201. - Masks must be specified as boolean with either bit width 1 (for bit
  202. masks) or 8 (for byte masks).
  203. - Dtype width in bits was preferred over bytes
  204. - Endianness isn't too useful, but included now in case in the future
  205. we need to support non-native endianness
  206. - Went with Apache Arrow format strings over NumPy format strings
  207. because they're more complete from a dataframe perspective
  208. - Format strings are mostly useful for datetime specification, and
  209. for categoricals.
  210. - For categoricals, the format string describes the type of the
  211. categorical in the data buffer. In case of a separate encoding of
  212. the categorical (e.g. an integer to string mapping), this can
  213. be derived from ``self.describe_categorical``.
  214. - Data types not included: complex, Arrow-style null, binary, decimal,
  215. and nested (list, struct, map, union) dtypes.
  216. """
  217. @property
  218. @abstractmethod
  219. def describe_categorical(self) -> CategoricalDescription:
  220. """
  221. If the dtype is categorical, there are two options:
  222. - There are only values in the data buffer.
  223. - There is a separate non-categorical Column encoding for categorical values.
  224. Raises TypeError if the dtype is not categorical
  225. Returns the dictionary with description on how to interpret the data buffer:
  226. - "is_ordered" : bool, whether the ordering of dictionary indices is
  227. semantically meaningful.
  228. - "is_dictionary" : bool, whether a mapping of
  229. categorical values to other objects exists
  230. - "categories" : Column representing the (implicit) mapping of indices to
  231. category values (e.g. an array of cat1, cat2, ...).
  232. None if not a dictionary-style categorical.
  233. TBD: are there any other in-memory representations that are needed?
  234. """
  235. @property
  236. @abstractmethod
  237. def describe_null(self) -> tuple[ColumnNullType, Any]:
  238. """
  239. Return the missing value (or "null") representation the column dtype
  240. uses, as a tuple ``(kind, value)``.
  241. Value : if kind is "sentinel value", the actual value. If kind is a bit
  242. mask or a byte mask, the value (0 or 1) indicating a missing value. None
  243. otherwise.
  244. """
  245. @property
  246. @abstractmethod
  247. def null_count(self) -> int | None:
  248. """
  249. Number of null elements, if known.
  250. Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
  251. """
  252. @property
  253. @abstractmethod
  254. def metadata(self) -> dict[str, Any]:
  255. """
  256. The metadata for the column. See `DataFrame.metadata` for more details.
  257. """
  258. @abstractmethod
  259. def num_chunks(self) -> int:
  260. """
  261. Return the number of chunks the column consists of.
  262. """
  263. @abstractmethod
  264. def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
  265. """
  266. Return an iterator yielding the chunks.
  267. See `DataFrame.get_chunks` for details on ``n_chunks``.
  268. """
  269. @abstractmethod
  270. def get_buffers(self) -> ColumnBuffers:
  271. """
  272. Return a dictionary containing the underlying buffers.
  273. The returned dictionary has the following contents:
  274. - "data": a two-element tuple whose first element is a buffer
  275. containing the data and whose second element is the data
  276. buffer's associated dtype.
  277. - "validity": a two-element tuple whose first element is a buffer
  278. containing mask values indicating missing data and
  279. whose second element is the mask value buffer's
  280. associated dtype. None if the null representation is
  281. not a bit or byte mask.
  282. - "offsets": a two-element tuple whose first element is a buffer
  283. containing the offset values for variable-size binary
  284. data (e.g., variable-length strings) and whose second
  285. element is the offsets buffer's associated dtype. None
  286. if the data buffer does not have an associated offsets
  287. buffer.
  288. """
  289. # def get_children(self) -> Iterable[Column]:
  290. # """
  291. # Children columns underneath the column, each object in this iterator
  292. # must adhere to the column specification.
  293. # """
  294. # pass
  295. class DataFrame(ABC):
  296. """
  297. A data frame class, with only the methods required by the interchange
  298. protocol defined.
  299. A "data frame" represents an ordered collection of named columns.
  300. A column's "name" must be a unique string.
  301. Columns may be accessed by name or by position.
  302. This could be a public data frame class, or an object with the methods and
  303. attributes defined on this DataFrame class could be returned from the
  304. ``__dataframe__`` method of a public data frame class in a library adhering
  305. to the dataframe interchange protocol specification.
  306. """
  307. version = 0 # version of the protocol
  308. @abstractmethod
  309. def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
  310. """Construct a new interchange object, potentially changing the parameters."""
  311. @property
  312. @abstractmethod
  313. def metadata(self) -> dict[str, Any]:
  314. """
  315. The metadata for the data frame, as a dictionary with string keys. The
  316. contents of `metadata` may be anything, they are meant for a library
  317. to store information that it needs to, e.g., roundtrip losslessly or
  318. for two implementations to share data that is not (yet) part of the
  319. interchange protocol specification. For avoiding collisions with other
  320. entries, please add name the keys with the name of the library
  321. followed by a period and the desired name, e.g, ``pandas.indexcol``.
  322. """
  323. @abstractmethod
  324. def num_columns(self) -> int:
  325. """
  326. Return the number of columns in the DataFrame.
  327. """
  328. @abstractmethod
  329. def num_rows(self) -> int | None:
  330. # TODO: not happy with Optional, but need to flag it may be expensive
  331. # why include it if it may be None - what do we expect consumers
  332. # to do here?
  333. """
  334. Return the number of rows in the DataFrame, if available.
  335. """
  336. @abstractmethod
  337. def num_chunks(self) -> int:
  338. """
  339. Return the number of chunks the DataFrame consists of.
  340. """
  341. @abstractmethod
  342. def column_names(self) -> Iterable[str]:
  343. """
  344. Return an iterator yielding the column names.
  345. """
  346. @abstractmethod
  347. def get_column(self, i: int) -> Column:
  348. """
  349. Return the column at the indicated position.
  350. """
  351. @abstractmethod
  352. def get_column_by_name(self, name: str) -> Column:
  353. """
  354. Return the column whose name is the indicated name.
  355. """
  356. @abstractmethod
  357. def get_columns(self) -> Iterable[Column]:
  358. """
  359. Return an iterator yielding the columns.
  360. """
  361. @abstractmethod
  362. def select_columns(self, indices: Sequence[int]) -> DataFrame:
  363. """
  364. Create a new DataFrame by selecting a subset of columns by index.
  365. """
  366. @abstractmethod
  367. def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
  368. """
  369. Create a new DataFrame by selecting a subset of columns by name.
  370. """
  371. @abstractmethod
  372. def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
  373. """
  374. Return an iterator yielding the chunks.
  375. By default (None), yields the chunks that the data is stored as by the
  376. producer. If given, ``n_chunks`` must be a multiple of
  377. ``self.num_chunks()``, meaning the producer must subdivide each chunk
  378. before yielding it.
  379. """