# from_dataframe.py
from __future__ import annotations

import ctypes
import re
from typing import Any

import numpy as np

from pandas.compat._optional import import_optional_dependency

import pandas as pd
from pandas.core.interchange.dataframe_protocol import (
    Buffer,
    Column,
    ColumnNullType,
    DataFrame as DataFrameXchg,
    DtypeKind,
)
from pandas.core.interchange.utils import (
    ArrowCTypes,
    Endianness,
)

_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
    DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
    DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
    DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
    DtypeKind.BOOL: {1: bool, 8: bool},
}

def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
    """
    Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. a ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    if isinstance(df, pd.DataFrame):
        return df

    if not hasattr(df, "__dataframe__"):
        raise ValueError("`df` does not support __dataframe__")

    # Propagate `allow_copy` so that the chunk-joining check below can see it.
    return _from_dataframe(
        df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
    )
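
# Illustrative usage (not part of the original module): any object exposing a
# ``__dataframe__`` method is accepted; a pandas DataFrame is returned as-is.
#
#   >>> source = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
#   >>> from_dataframe(source.__dataframe__())
#      a    b
#   0  1  3.0
#   1  2  4.0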

def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True):
    """
    Build a ``pd.DataFrame`` from the DataFrame interchange object.

    Parameters
    ----------
    df : DataFrameXchg
        Object supporting the interchange protocol, i.e. a ``__dataframe__`` method.
    allow_copy : bool, default: True
        Whether to allow copying the memory to perform the conversion
        (if false then a zero-copy approach is requested).

    Returns
    -------
    pd.DataFrame
    """
    pandas_dfs = []
    for chunk in df.get_chunks():
        pandas_df = protocol_df_chunk_to_pandas(chunk)
        pandas_dfs.append(pandas_df)

    if not allow_copy and len(pandas_dfs) > 1:
        raise RuntimeError(
            "To join chunks a copy is required which is forbidden by allow_copy=False"
        )
    if not pandas_dfs:
        # No chunks at all: return an empty frame rather than letting
        # ``pd.concat`` raise on an empty list.
        pandas_df = pd.DataFrame()
    elif len(pandas_dfs) == 1:
        pandas_df = pandas_dfs[0]
    else:
        pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)

    index_obj = df.metadata.get("pandas.index", None)
    if index_obj is not None:
        pandas_df.index = index_obj

    return pandas_df

def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
    """
    Convert interchange protocol chunk to ``pd.DataFrame``.

    Parameters
    ----------
    df : DataFrameXchg

    Returns
    -------
    pd.DataFrame
    """
    # We need a dict of columns here, with each column being a NumPy array (at
    # least for now, deal with non-NumPy dtypes later).
    columns: dict[str, Any] = {}
    buffers = []  # hold on to buffers, keeps memory alive
    for name in df.column_names():
        if not isinstance(name, str):
            raise ValueError(f"Column {name} is not a string")
        if name in columns:
            raise ValueError(f"Column {name} is not unique")
        col = df.get_column_by_name(name)
        dtype = col.dtype[0]
        if dtype in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
        ):
            columns[name], buf = primitive_column_to_ndarray(col)
        elif dtype == DtypeKind.CATEGORICAL:
            columns[name], buf = categorical_column_to_series(col)
        elif dtype == DtypeKind.STRING:
            columns[name], buf = string_column_to_ndarray(col)
        elif dtype == DtypeKind.DATETIME:
            columns[name], buf = datetime_column_to_ndarray(col)
        else:
            raise NotImplementedError(f"Data type {dtype} not handled yet")

        buffers.append(buf)

    pandas_df = pd.DataFrame(columns)
    pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
    return pandas_df
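
# Illustrative usage (hypothetical, pandas-backed source): pandas exposes a
# single chunk, so the chunk converts to the whole frame.
#
#   >>> chunk = next(iter(pd.DataFrame({"a": [1]}).__dataframe__().get_chunks()))
#   >>> protocol_df_chunk_to_pandas(chunk)
#      a
#   0  1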

def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding one of the primitive dtypes to a NumPy array.

    A primitive type is one of: int, uint, float, bool.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    data_buff, data_dtype = buffers["data"]
    data = buffer_to_ndarray(
        data_buff, data_dtype, offset=col.offset, length=col.size()
    )

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers
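
# Illustrative sketch (assumes a pandas-backed interchange column): the
# returned ndarray may be a zero-copy view over the interchange buffer, which
# is why the buffers dict is returned alongside it as the memory owner.
#
#   >>> col = pd.DataFrame({"x": [1, 2, 3]}).__dataframe__().get_column_by_name("x")
#   >>> arr, owner = primitive_column_to_ndarray(col)
#   >>> arr
#   array([1, 2, 3])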

def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
    """
    Convert a column holding categorical data to a pandas Series.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of pd.Series holding the data and the memory owner object
        that keeps the memory alive.
    """
    categorical = col.describe_categorical

    if not categorical["is_dictionary"]:
        raise NotImplementedError("Non-dictionary categoricals not supported yet")

    cat_column = categorical["categories"]
    if hasattr(cat_column, "_col"):
        # Item "Column" of "Optional[Column]" has no attribute "_col"
        # Item "None" of "Optional[Column]" has no attribute "_col"
        categories = np.array(cat_column._col)  # type: ignore[union-attr]
    else:
        raise NotImplementedError(
            "Interchanging categorical columns isn't supported yet, and our "
            "fallback of using the `col._col` attribute (a ndarray) failed."
        )

    buffers = col.get_buffers()

    codes_buff, codes_dtype = buffers["data"]
    codes = buffer_to_ndarray(
        codes_buff, codes_dtype, offset=col.offset, length=col.size()
    )

    # Doing modulo here so that out-of-bounds sentinel values in `codes`
    # don't raise an ``IndexError``
    if len(categories) > 0:
        values = categories[codes % len(categories)]
    else:
        values = codes

    cat = pd.Categorical(
        values, categories=categories, ordered=categorical["is_ordered"]
    )
    data = pd.Series(cat)

    data = set_nulls(data, col, buffers["validity"])
    return data, buffers
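
# Illustrative note on the modulo trick above (hypothetical values): a null
# sentinel such as -1 in `codes` would be out of bounds for fancy indexing,
# but taking codes modulo len(categories) keeps every index valid; set_nulls()
# then overwrites the sentinel positions anyway.
#
#   >>> categories = np.array(["a", "b"])
#   >>> codes = np.array([0, -1, 1])  # -1 marks a missing value
#   >>> categories[codes % len(categories)]
#   array(['a', 'b', 'b'], dtype='<U1')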

def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding string data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    null_kind, sentinel_val = col.describe_null

    if null_kind not in (
        ColumnNullType.NON_NULLABLE,
        ColumnNullType.USE_BITMASK,
        ColumnNullType.USE_BYTEMASK,
    ):
        raise NotImplementedError(
            f"{null_kind} null kind is not yet supported for string columns."
        )

    buffers = col.get_buffers()

    assert buffers["offsets"], "String buffers must contain offsets"
    # Retrieve the data buffer containing the UTF-8 code units
    data_buff, protocol_data_dtype = buffers["data"]
    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
    assert protocol_data_dtype[1] == 8
    assert protocol_data_dtype[2] in (
        ArrowCTypes.STRING,
        ArrowCTypes.LARGE_STRING,
    )  # format_str == utf-8

    # Convert the buffers to NumPy arrays. In order to go from STRING to
    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    data_dtype = (
        DtypeKind.UINT,
        8,
        ArrowCTypes.UINT8,
        Endianness.NATIVE,
    )
    # Specify zero offset as we don't want to chunk the string data
    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)

    # Retrieve the offsets buffer containing the index offsets demarcating
    # the beginning and the end of each string
    offset_buff, offset_dtype = buffers["offsets"]
    # The offsets buffer holds start-stop positions of strings in the data
    # buffer, so it has one more element than the column has strings; pass
    # `col.size() + 1` here to read a properly sized offsets buffer.
    offsets = buffer_to_ndarray(
        offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
    )

    null_pos = None
    if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert buffers["validity"], "Validity buffers cannot be empty for masks"
        valid_buff, valid_dtype = buffers["validity"]
        null_pos = buffer_to_ndarray(
            valid_buff, valid_dtype, offset=col.offset, length=col.size()
        )
        if sentinel_val == 0:
            null_pos = ~null_pos

    # Assemble the strings from the code units
    str_list: list[None | float | str] = [None] * col.size()
    for i in range(col.size()):
        # Check for missing values
        if null_pos is not None and null_pos[i]:
            str_list[i] = np.nan
            continue

        # Extract a range of code units
        units = data[offsets[i] : offsets[i + 1]]

        # Convert the code units to bytes
        str_bytes = bytes(units)

        # Create the string
        string = str_bytes.decode(encoding="utf-8")

        # Add to our list of strings
        str_list[i] = string

    # Convert the string list to a NumPy array
    return np.asarray(str_list, dtype="object"), buffers
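
# Illustrative sketch of the Arrow string layout consumed above (hypothetical
# values): the data buffer is one contiguous run of UTF-8 bytes, and
# ``offsets[i]:offsets[i + 1]`` brackets string ``i``.
#
#   >>> data = np.frombuffer(b"foobarbaz", dtype=np.uint8)
#   >>> offsets = np.array([0, 3, 6, 9])
#   >>> [bytes(data[offsets[i] : offsets[i + 1]]).decode() for i in range(3)]
#   ['foo', 'bar', 'baz']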

def parse_datetime_format_str(format_str, data):
    """Parse datetime `format_str` to interpret the `data`."""
    # timestamp 'ts{unit}:tz'
    timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
    if timestamp_meta:
        unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
        if tz != "":
            raise NotImplementedError("Timezones are not supported yet")
        if unit != "s":
            # the format string describes only a first letter of the unit, so
            # add one extra letter to convert the unit to numpy-style:
            # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
            unit += "s"
        data = data.astype(f"datetime64[{unit}]")
        return data

    # date 'td{Days/Ms}'
    date_meta = re.match(r"td([Dm])", format_str)
    if date_meta:
        unit = date_meta.group(1)
        if unit == "D":
            # NumPy doesn't support DAY unit, so converting days to seconds
            # (converting to uint64 to avoid overflow)
            data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
        elif unit == "m":
            data = data.astype("datetime64[ms]")
        else:
            raise NotImplementedError(f"Date unit is not supported: {unit}")
        return data

    raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
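
# Illustrative examples of the format strings handled above (hypothetical
# inputs): 'ts{unit}:' is a timezone-naive timestamp and 'tdD' a
# day-resolution date; both carry counts of units since the Unix epoch.
#
#   >>> parse_datetime_format_str("tss:", np.array([0, 86400], dtype=np.uint64))
#   array(['1970-01-01T00:00:00', '1970-01-02T00:00:00'], dtype='datetime64[s]')
#   >>> parse_datetime_format_str("tdD", np.array([0, 1], dtype=np.uint64))
#   array(['1970-01-01T00:00:00', '1970-01-02T00:00:00'], dtype='datetime64[s]')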

def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
    """
    Convert a column holding DateTime data to a NumPy array.

    Parameters
    ----------
    col : Column

    Returns
    -------
    tuple
        Tuple of np.ndarray holding the data and the memory owner object
        that keeps the memory alive.
    """
    buffers = col.get_buffers()

    _, _, format_str, _ = col.dtype
    dbuf, dtype = buffers["data"]
    # Consider the dtype being `uint` to get the number of units passed since
    # the Unix epoch (1970-01-01)
    data = buffer_to_ndarray(
        dbuf,
        (
            DtypeKind.UINT,
            dtype[1],
            getattr(ArrowCTypes, f"UINT{dtype[1]}"),
            Endianness.NATIVE,
        ),
        offset=col.offset,
        length=col.size(),
    )

    data = parse_datetime_format_str(format_str, data)
    data = set_nulls(data, col, buffers["validity"])
    return data, buffers

def buffer_to_ndarray(
    buffer: Buffer,
    dtype: tuple[DtypeKind, int, str, str],
    *,
    length: int,
    offset: int = 0,
) -> np.ndarray:
    """
    Build a NumPy array from the passed buffer.

    Parameters
    ----------
    buffer : Buffer
        Buffer to build a NumPy array from.
    dtype : tuple
        Data type of the buffer conforming to the protocol dtypes format.
    length : int
        The number of elements (or, for a bit-mask buffer, bits) to read
        from the buffer.
    offset : int, default: 0
        Number of elements to offset from the start of the buffer.

    Returns
    -------
    np.ndarray

    Notes
    -----
    The returned array doesn't own the memory. The caller of this function is
    responsible for keeping the memory owner object alive as long as
    the returned NumPy array is being used.
    """
    kind, bit_width, _, _ = dtype

    column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")

    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
    # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
    # it since https://github.com/numpy/numpy/pull/19083
    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)

    if bit_width == 1:
        assert length is not None, "`length` must be specified for a bit-mask buffer."
        pa = import_optional_dependency("pyarrow")
        arr = pa.BooleanArray.from_buffers(
            pa.bool_(),
            length,
            [None, pa.foreign_buffer(buffer.ptr, length)],
            offset=offset,
        )
        return np.asarray(arr)
    else:
        data_pointer = ctypes.cast(
            buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
        )
        return np.ctypeslib.as_array(data_pointer, shape=(length,))
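
# Illustrative sketch (the ``_FakeBuffer`` class is a hypothetical stand-in
# for a protocol Buffer): any object exposing ``ptr`` and ``bufsize`` works,
# here backed by a NumPy array that must outlive the result.
#
#   >>> src = np.array([10, 20, 30], dtype=np.int64)
#   >>> class _FakeBuffer:
#   ...     def __init__(self, arr):
#   ...         self.ptr = arr.__array_interface__["data"][0]
#   ...         self.bufsize = arr.nbytes
#   >>> buffer_to_ndarray(
#   ...     _FakeBuffer(src),
#   ...     (DtypeKind.INT, 64, ArrowCTypes.INT64, Endianness.NATIVE),
#   ...     length=3,
#   ... )
#   array([10, 20, 30])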

def set_nulls(
    data: np.ndarray | pd.Series,
    col: Column,
    validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
    allow_modify_inplace: bool = True,
):
    """
    Set null values for the data according to the column null kind.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        Data to set nulls in.
    col : Column
        Column object that describes the `data`.
    validity : tuple(Buffer, dtype) or None
        The "validity" entry of ``col.get_buffers()``. We do not call
        ``col.get_buffers()`` here so as not to take ownership of the memory
        of the buffer objects.
    allow_modify_inplace : bool, default: True
        Whether to modify the `data` inplace when zero-copy is possible (True) or
        always modify a copy of the `data` (False).

    Returns
    -------
    np.ndarray or pd.Series
        Data with the nulls being set.
    """
    null_kind, sentinel_val = col.describe_null
    null_pos = None

    if null_kind == ColumnNullType.USE_SENTINEL:
        null_pos = pd.Series(data) == sentinel_val
    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
        assert validity, "Expected to have a validity buffer for the mask"
        valid_buff, valid_dtype = validity
        null_pos = buffer_to_ndarray(
            valid_buff, valid_dtype, offset=col.offset, length=col.size()
        )
        if sentinel_val == 0:
            null_pos = ~null_pos
    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
        pass
    else:
        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")

    if null_pos is not None and np.any(null_pos):
        if not allow_modify_inplace:
            data = data.copy()
        try:
            data[null_pos] = None
        except TypeError:
            # TypeError happens if the `data` dtype appears to be non-nullable
            # in numpy notation (bool, int, uint). If this happens,
            # cast the `data` to a nullable float dtype.
            data = data.astype(float)
            data[null_pos] = None

    return data
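
# Illustrative behaviour of the fallback path above (hypothetical sentinel
# column): integer arrays can't hold None, so the data is promoted to float
# and the null positions become NaN.
#
#   >>> raw = np.array([1, -1, 3])          # -1 marks a missing value
#   >>> null_pos = pd.Series(raw) == -1     # what USE_SENTINEL computes
#   >>> out = raw.astype(float)
#   >>> out[null_pos] = None
#   >>> out
#   array([ 1., nan,  3.])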