column.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. from __future__ import annotations
  2. from typing import Any
  3. import numpy as np
  4. from pandas._libs.lib import infer_dtype
  5. from pandas._libs.tslibs import iNaT
  6. from pandas.errors import NoBufferPresent
  7. from pandas.util._decorators import cache_readonly
  8. import pandas as pd
  9. from pandas.api.types import (
  10. is_categorical_dtype,
  11. is_string_dtype,
  12. )
  13. from pandas.core.interchange.buffer import PandasBuffer
  14. from pandas.core.interchange.dataframe_protocol import (
  15. Column,
  16. ColumnBuffers,
  17. ColumnNullType,
  18. DtypeKind,
  19. )
  20. from pandas.core.interchange.utils import (
  21. ArrowCTypes,
  22. Endianness,
  23. dtype_to_arrow_c_fmt,
  24. )
  25. _NP_KINDS = {
  26. "i": DtypeKind.INT,
  27. "u": DtypeKind.UINT,
  28. "f": DtypeKind.FLOAT,
  29. "b": DtypeKind.BOOL,
  30. "U": DtypeKind.STRING,
  31. "M": DtypeKind.DATETIME,
  32. "m": DtypeKind.DATETIME,
  33. }
  34. _NULL_DESCRIPTION = {
  35. DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
  36. DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
  37. DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
  38. DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
  39. DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
  40. # Null values for categoricals are stored as `-1` sentinel values
  41. # in the category date (e.g., `col.values.codes` is int8 np.ndarray)
  42. DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
  43. # follow Arrow in using 1 as valid value and 0 for missing/null value
  44. DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
  45. }
  46. _NO_VALIDITY_BUFFER = {
  47. ColumnNullType.NON_NULLABLE: "This column is non-nullable",
  48. ColumnNullType.USE_NAN: "This column uses NaN as null",
  49. ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
  50. }
  51. class PandasColumn(Column):
  52. """
  53. A column object, with only the methods and properties required by the
  54. interchange protocol defined.
  55. A column can contain one or more chunks. Each chunk can contain up to three
  56. buffers - a data buffer, a mask buffer (depending on null representation),
  57. and an offsets buffer (if variable-size binary; e.g., variable-length
  58. strings).
  59. Note: this Column object can only be produced by ``__dataframe__``, so
  60. doesn't need its own version or ``__column__`` protocol.
  61. """
  62. def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
  63. """
  64. Note: doesn't deal with extension arrays yet, just assume a regular
  65. Series/ndarray for now.
  66. """
  67. if not isinstance(column, pd.Series):
  68. raise NotImplementedError(f"Columns of type {type(column)} not handled yet")
  69. # Store the column as a private attribute
  70. self._col = column
  71. self._allow_copy = allow_copy
  72. def size(self) -> int:
  73. """
  74. Size of the column, in elements.
  75. """
  76. return self._col.size
  77. @property
  78. def offset(self) -> int:
  79. """
  80. Offset of first element. Always zero.
  81. """
  82. # TODO: chunks are implemented now, probably this should return something
  83. return 0
  84. @cache_readonly
  85. def dtype(self) -> tuple[DtypeKind, int, str, str]:
  86. dtype = self._col.dtype
  87. if is_categorical_dtype(dtype):
  88. codes = self._col.values.codes
  89. (
  90. _,
  91. bitwidth,
  92. c_arrow_dtype_f_str,
  93. _,
  94. ) = self._dtype_from_pandasdtype(codes.dtype)
  95. return (
  96. DtypeKind.CATEGORICAL,
  97. bitwidth,
  98. c_arrow_dtype_f_str,
  99. Endianness.NATIVE,
  100. )
  101. elif is_string_dtype(dtype):
  102. if infer_dtype(self._col) == "string":
  103. return (
  104. DtypeKind.STRING,
  105. 8,
  106. dtype_to_arrow_c_fmt(dtype),
  107. Endianness.NATIVE,
  108. )
  109. raise NotImplementedError("Non-string object dtypes are not supported yet")
  110. else:
  111. return self._dtype_from_pandasdtype(dtype)
  112. def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
  113. """
  114. See `self.dtype` for details.
  115. """
  116. # Note: 'c' (complex) not handled yet (not in array spec v1).
  117. # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
  118. # datetime and timedelta both map to datetime (is timedelta handled?)
  119. kind = _NP_KINDS.get(dtype.kind, None)
  120. if kind is None:
  121. # Not a NumPy dtype. Check if it's a categorical maybe
  122. raise ValueError(f"Data type {dtype} not supported by interchange protocol")
  123. return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder
  124. @property
  125. def describe_categorical(self):
  126. """
  127. If the dtype is categorical, there are two options:
  128. - There are only values in the data buffer.
  129. - There is a separate non-categorical Column encoding for categorical values.
  130. Raises TypeError if the dtype is not categorical
  131. Content of returned dict:
  132. - "is_ordered" : bool, whether the ordering of dictionary indices is
  133. semantically meaningful.
  134. - "is_dictionary" : bool, whether a dictionary-style mapping of
  135. categorical values to other objects exists
  136. - "categories" : Column representing the (implicit) mapping of indices to
  137. category values (e.g. an array of cat1, cat2, ...).
  138. None if not a dictionary-style categorical.
  139. """
  140. if not self.dtype[0] == DtypeKind.CATEGORICAL:
  141. raise TypeError(
  142. "describe_categorical only works on a column with categorical dtype!"
  143. )
  144. return {
  145. "is_ordered": self._col.cat.ordered,
  146. "is_dictionary": True,
  147. "categories": PandasColumn(pd.Series(self._col.cat.categories)),
  148. }
  149. @property
  150. def describe_null(self):
  151. kind = self.dtype[0]
  152. try:
  153. null, value = _NULL_DESCRIPTION[kind]
  154. except KeyError:
  155. raise NotImplementedError(f"Data type {kind} not yet supported")
  156. return null, value
  157. @cache_readonly
  158. def null_count(self) -> int:
  159. """
  160. Number of null elements. Should always be known.
  161. """
  162. return self._col.isna().sum().item()
  163. @property
  164. def metadata(self) -> dict[str, pd.Index]:
  165. """
  166. Store specific metadata of the column.
  167. """
  168. return {"pandas.index": self._col.index}
  169. def num_chunks(self) -> int:
  170. """
  171. Return the number of chunks the column consists of.
  172. """
  173. return 1
  174. def get_chunks(self, n_chunks: int | None = None):
  175. """
  176. Return an iterator yielding the chunks.
  177. See `DataFrame.get_chunks` for details on ``n_chunks``.
  178. """
  179. if n_chunks and n_chunks > 1:
  180. size = len(self._col)
  181. step = size // n_chunks
  182. if size % n_chunks != 0:
  183. step += 1
  184. for start in range(0, step * n_chunks, step):
  185. yield PandasColumn(
  186. self._col.iloc[start : start + step], self._allow_copy
  187. )
  188. else:
  189. yield self
  190. def get_buffers(self) -> ColumnBuffers:
  191. """
  192. Return a dictionary containing the underlying buffers.
  193. The returned dictionary has the following contents:
  194. - "data": a two-element tuple whose first element is a buffer
  195. containing the data and whose second element is the data
  196. buffer's associated dtype.
  197. - "validity": a two-element tuple whose first element is a buffer
  198. containing mask values indicating missing data and
  199. whose second element is the mask value buffer's
  200. associated dtype. None if the null representation is
  201. not a bit or byte mask.
  202. - "offsets": a two-element tuple whose first element is a buffer
  203. containing the offset values for variable-size binary
  204. data (e.g., variable-length strings) and whose second
  205. element is the offsets buffer's associated dtype. None
  206. if the data buffer does not have an associated offsets
  207. buffer.
  208. """
  209. buffers: ColumnBuffers = {
  210. "data": self._get_data_buffer(),
  211. "validity": None,
  212. "offsets": None,
  213. }
  214. try:
  215. buffers["validity"] = self._get_validity_buffer()
  216. except NoBufferPresent:
  217. pass
  218. try:
  219. buffers["offsets"] = self._get_offsets_buffer()
  220. except NoBufferPresent:
  221. pass
  222. return buffers
  223. def _get_data_buffer(
  224. self,
  225. ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple
  226. """
  227. Return the buffer containing the data and the buffer's associated dtype.
  228. """
  229. if self.dtype[0] in (
  230. DtypeKind.INT,
  231. DtypeKind.UINT,
  232. DtypeKind.FLOAT,
  233. DtypeKind.BOOL,
  234. DtypeKind.DATETIME,
  235. ):
  236. buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
  237. dtype = self.dtype
  238. elif self.dtype[0] == DtypeKind.CATEGORICAL:
  239. codes = self._col.values._codes
  240. buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
  241. dtype = self._dtype_from_pandasdtype(codes.dtype)
  242. elif self.dtype[0] == DtypeKind.STRING:
  243. # Marshal the strings from a NumPy object array into a byte array
  244. buf = self._col.to_numpy()
  245. b = bytearray()
  246. # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
  247. for obj in buf:
  248. if isinstance(obj, str):
  249. b.extend(obj.encode(encoding="utf-8"))
  250. # Convert the byte array to a Pandas "buffer" using
  251. # a NumPy array as the backing store
  252. buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))
  253. # Define the dtype for the returned buffer
  254. dtype = (
  255. DtypeKind.STRING,
  256. 8,
  257. ArrowCTypes.STRING,
  258. Endianness.NATIVE,
  259. ) # note: currently only support native endianness
  260. else:
  261. raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
  262. return buffer, dtype
  263. def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
  264. """
  265. Return the buffer containing the mask values indicating missing data and
  266. the buffer's associated dtype.
  267. Raises NoBufferPresent if null representation is not a bit or byte mask.
  268. """
  269. null, invalid = self.describe_null
  270. if self.dtype[0] == DtypeKind.STRING:
  271. # For now, use byte array as the mask.
  272. # TODO: maybe store as bit array to save space?..
  273. buf = self._col.to_numpy()
  274. # Determine the encoding for valid values
  275. valid = invalid == 0
  276. invalid = not valid
  277. mask = np.zeros(shape=(len(buf),), dtype=np.bool_)
  278. for i, obj in enumerate(buf):
  279. mask[i] = valid if isinstance(obj, str) else invalid
  280. # Convert the mask array to a Pandas "buffer" using
  281. # a NumPy array as the backing store
  282. buffer = PandasBuffer(mask)
  283. # Define the dtype of the returned buffer
  284. dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
  285. return buffer, dtype
  286. try:
  287. msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask"
  288. except KeyError:
  289. # TODO: implement for other bit/byte masks?
  290. raise NotImplementedError("See self.describe_null")
  291. raise NoBufferPresent(msg)
  292. def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
  293. """
  294. Return the buffer containing the offset values for variable-size binary
  295. data (e.g., variable-length strings) and the buffer's associated dtype.
  296. Raises NoBufferPresent if the data buffer does not have an associated
  297. offsets buffer.
  298. """
  299. if self.dtype[0] == DtypeKind.STRING:
  300. # For each string, we need to manually determine the next offset
  301. values = self._col.to_numpy()
  302. ptr = 0
  303. offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
  304. for i, v in enumerate(values):
  305. # For missing values (in this case, `np.nan` values)
  306. # we don't increment the pointer
  307. if isinstance(v, str):
  308. b = v.encode(encoding="utf-8")
  309. ptr += len(b)
  310. offsets[i + 1] = ptr
  311. # Convert the offsets to a Pandas "buffer" using
  312. # the NumPy array as the backing store
  313. buffer = PandasBuffer(offsets)
  314. # Assemble the buffer dtype info
  315. dtype = (
  316. DtypeKind.INT,
  317. 64,
  318. ArrowCTypes.INT64,
  319. Endianness.NATIVE,
  320. ) # note: currently only support native endianness
  321. else:
  322. raise NoBufferPresent(
  323. "This column has a fixed-length dtype so "
  324. "it does not have an offsets buffer"
  325. )
  326. return buffer, dtype