dataframe.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. from __future__ import annotations
  2. from collections import abc
  3. from typing import TYPE_CHECKING
  4. from pandas.core.interchange.column import PandasColumn
  5. from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
  6. if TYPE_CHECKING:
  7. from pandas import (
  8. DataFrame,
  9. Index,
  10. )
  11. class PandasDataFrameXchg(DataFrameXchg):
  12. """
  13. A data frame class, with only the methods required by the interchange
  14. protocol defined.
  15. Instances of this (private) class are returned from
  16. ``pd.DataFrame.__dataframe__`` as objects with the methods and
  17. attributes defined on this class.
  18. """
  19. def __init__(
  20. self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True
  21. ) -> None:
  22. """
  23. Constructor - an instance of this (private) class is returned from
  24. `pd.DataFrame.__dataframe__`.
  25. """
  26. self._df = df
  27. # ``nan_as_null`` is a keyword intended for the consumer to tell the
  28. # producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
  29. # This currently has no effect; once support for nullable extension
  30. # dtypes is added, this value should be propagated to columns.
  31. self._nan_as_null = nan_as_null
  32. self._allow_copy = allow_copy
  33. def __dataframe__(
  34. self, nan_as_null: bool = False, allow_copy: bool = True
  35. ) -> PandasDataFrameXchg:
  36. return PandasDataFrameXchg(self._df, nan_as_null, allow_copy)
  37. @property
  38. def metadata(self) -> dict[str, Index]:
  39. # `index` isn't a regular column, and the protocol doesn't support row
  40. # labels - so we export it as Pandas-specific metadata here.
  41. return {"pandas.index": self._df.index}
  42. def num_columns(self) -> int:
  43. return len(self._df.columns)
  44. def num_rows(self) -> int:
  45. return len(self._df)
  46. def num_chunks(self) -> int:
  47. return 1
  48. def column_names(self) -> Index:
  49. return self._df.columns
  50. def get_column(self, i: int) -> PandasColumn:
  51. return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy)
  52. def get_column_by_name(self, name: str) -> PandasColumn:
  53. return PandasColumn(self._df[name], allow_copy=self._allow_copy)
  54. def get_columns(self) -> list[PandasColumn]:
  55. return [
  56. PandasColumn(self._df[name], allow_copy=self._allow_copy)
  57. for name in self._df.columns
  58. ]
  59. def select_columns(self, indices) -> PandasDataFrameXchg:
  60. if not isinstance(indices, abc.Sequence):
  61. raise ValueError("`indices` is not a sequence")
  62. if not isinstance(indices, list):
  63. indices = list(indices)
  64. return PandasDataFrameXchg(
  65. self._df.iloc[:, indices], self._nan_as_null, self._allow_copy
  66. )
  67. def select_columns_by_name(self, names) -> PandasDataFrameXchg:
  68. if not isinstance(names, abc.Sequence):
  69. raise ValueError("`names` is not a sequence")
  70. if not isinstance(names, list):
  71. names = list(names)
  72. return PandasDataFrameXchg(
  73. self._df.loc[:, names], self._nan_as_null, self._allow_copy
  74. )
  75. def get_chunks(self, n_chunks=None):
  76. """
  77. Return an iterator yielding the chunks.
  78. """
  79. if n_chunks and n_chunks > 1:
  80. size = len(self._df)
  81. step = size // n_chunks
  82. if size % n_chunks != 0:
  83. step += 1
  84. for start in range(0, step * n_chunks, step):
  85. yield PandasDataFrameXchg(
  86. self._df.iloc[start : start + step, :],
  87. self._nan_as_null,
  88. self._allow_copy,
  89. )
  90. else:
  91. yield self