utils.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. """
  2. Utility functions and objects for implementing the interchange API.
  3. """
  4. from __future__ import annotations
  5. import re
  6. import typing
  7. import numpy as np
  8. from pandas._typing import DtypeObj
  9. import pandas as pd
  10. from pandas.api.types import is_datetime64_dtype
  11. class ArrowCTypes:
  12. """
  13. Enum for Apache Arrow C type format strings.
  14. The Arrow C data interface:
  15. https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
  16. """
  17. NULL = "n"
  18. BOOL = "b"
  19. INT8 = "c"
  20. UINT8 = "C"
  21. INT16 = "s"
  22. UINT16 = "S"
  23. INT32 = "i"
  24. UINT32 = "I"
  25. INT64 = "l"
  26. UINT64 = "L"
  27. FLOAT16 = "e"
  28. FLOAT32 = "f"
  29. FLOAT64 = "g"
  30. STRING = "u" # utf-8
  31. LARGE_STRING = "U" # utf-8
  32. DATE32 = "tdD"
  33. DATE64 = "tdm"
  34. # Resoulution:
  35. # - seconds -> 's'
  36. # - milliseconds -> 'm'
  37. # - microseconds -> 'u'
  38. # - nanoseconds -> 'n'
  39. TIMESTAMP = "ts{resolution}:{tz}"
  40. TIME = "tt{resolution}"
  41. class Endianness:
  42. """Enum indicating the byte-order of a data-type."""
  43. LITTLE = "<"
  44. BIG = ">"
  45. NATIVE = "="
  46. NA = "|"
  47. def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
  48. """
  49. Represent pandas `dtype` as a format string in Apache Arrow C notation.
  50. Parameters
  51. ----------
  52. dtype : np.dtype
  53. Datatype of pandas DataFrame to represent.
  54. Returns
  55. -------
  56. str
  57. Format string in Apache Arrow C notation of the given `dtype`.
  58. """
  59. if isinstance(dtype, pd.CategoricalDtype):
  60. return ArrowCTypes.INT64
  61. elif dtype == np.dtype("O"):
  62. return ArrowCTypes.STRING
  63. format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
  64. if format_str is not None:
  65. return format_str
  66. if is_datetime64_dtype(dtype):
  67. # Selecting the first char of resolution string:
  68. # dtype.str -> '<M8[ns]'
  69. resolution = re.findall(r"\[(.*)\]", typing.cast(np.dtype, dtype).str)[0][:1]
  70. return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")
  71. raise NotImplementedError(
  72. f"Conversion of {dtype} to Arrow C format string is not implemented."
  73. )