# test_impl.py -- tests for the pandas DataFrame interchange protocol.

  1. from datetime import datetime
  2. import random
  3. import numpy as np
  4. import pytest
  5. from pandas._libs.tslibs import iNaT
  6. import pandas.util._test_decorators as td
  7. import pandas as pd
  8. import pandas._testing as tm
  9. from pandas.core.interchange.column import PandasColumn
  10. from pandas.core.interchange.dataframe_protocol import (
  11. ColumnNullType,
  12. DtypeKind,
  13. )
  14. from pandas.core.interchange.from_dataframe import from_dataframe
  15. test_data_categorical = {
  16. "ordered": pd.Categorical(list("testdata") * 30, ordered=True),
  17. "unordered": pd.Categorical(list("testdata") * 30, ordered=False),
  18. }
  19. NCOLS, NROWS = 100, 200
  20. def _make_data(make_one):
  21. return {
  22. f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [make_one() for _ in range(NROWS)]
  23. for i in range(NCOLS)
  24. }
  25. int_data = _make_data(lambda: random.randint(-100, 100))
  26. uint_data = _make_data(lambda: random.randint(1, 100))
  27. bool_data = _make_data(lambda: random.choice([True, False]))
  28. float_data = _make_data(lambda: random.random())
  29. datetime_data = _make_data(
  30. lambda: datetime(
  31. year=random.randint(1900, 2100),
  32. month=random.randint(1, 12),
  33. day=random.randint(1, 20),
  34. )
  35. )
  36. string_data = {
  37. "separator data": [
  38. "abC|DeF,Hik",
  39. "234,3245.67",
  40. "gSaf,qWer|Gre",
  41. "asd3,4sad|",
  42. np.NaN,
  43. ]
  44. }
  45. @pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)])
  46. def test_categorical_dtype(data):
  47. df = pd.DataFrame({"A": (test_data_categorical[data[0]])})
  48. col = df.__dataframe__().get_column_by_name("A")
  49. assert col.dtype[0] == DtypeKind.CATEGORICAL
  50. assert col.null_count == 0
  51. assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
  52. assert col.num_chunks() == 1
  53. desc_cat = col.describe_categorical
  54. assert desc_cat["is_ordered"] == data[1]
  55. assert desc_cat["is_dictionary"] is True
  56. assert isinstance(desc_cat["categories"], PandasColumn)
  57. tm.assert_series_equal(
  58. desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
  59. )
  60. tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
  61. def test_categorical_pyarrow():
  62. # GH 49889
  63. pa = pytest.importorskip("pyarrow", "11.0.0")
  64. arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
  65. table = pa.table({"weekday": pa.array(arr).dictionary_encode()})
  66. exchange_df = table.__dataframe__()
  67. result = from_dataframe(exchange_df)
  68. weekday = pd.Categorical(
  69. arr, categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
  70. )
  71. expected = pd.DataFrame({"weekday": weekday})
  72. tm.assert_frame_equal(result, expected)
  73. def test_empty_categorical_pyarrow():
  74. # https://github.com/pandas-dev/pandas/issues/53077
  75. pa = pytest.importorskip("pyarrow", "11.0.0")
  76. arr = [None]
  77. table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()})
  78. exchange_df = table.__dataframe__()
  79. result = pd.api.interchange.from_dataframe(exchange_df)
  80. expected = pd.DataFrame({"arr": pd.Categorical([np.nan])})
  81. tm.assert_frame_equal(result, expected)
  82. def test_large_string_pyarrow():
  83. # GH 52795
  84. pa = pytest.importorskip("pyarrow", "11.0.0")
  85. arr = ["Mon", "Tue"]
  86. table = pa.table({"weekday": pa.array(arr, "large_string")})
  87. exchange_df = table.__dataframe__()
  88. result = from_dataframe(exchange_df)
  89. expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
  90. tm.assert_frame_equal(result, expected)
  91. # check round-trip
  92. assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
  93. @pytest.mark.parametrize(
  94. ("offset", "length", "expected_values"),
  95. [
  96. (0, None, [3.3, float("nan"), 2.1]),
  97. (1, None, [float("nan"), 2.1]),
  98. (2, None, [2.1]),
  99. (0, 2, [3.3, float("nan")]),
  100. (0, 1, [3.3]),
  101. (1, 1, [float("nan")]),
  102. ],
  103. )
  104. def test_bitmasks_pyarrow(offset, length, expected_values):
  105. # GH 52795
  106. pa = pytest.importorskip("pyarrow", "11.0.0")
  107. arr = [3.3, None, 2.1]
  108. table = pa.table({"arr": arr}).slice(offset, length)
  109. exchange_df = table.__dataframe__()
  110. result = from_dataframe(exchange_df)
  111. expected = pd.DataFrame({"arr": expected_values})
  112. tm.assert_frame_equal(result, expected)
  113. # check round-trip
  114. assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
  115. @pytest.mark.parametrize(
  116. "data", [int_data, uint_data, float_data, bool_data, datetime_data]
  117. )
  118. def test_dataframe(data):
  119. df = pd.DataFrame(data)
  120. df2 = df.__dataframe__()
  121. assert df2.num_columns() == NCOLS
  122. assert df2.num_rows() == NROWS
  123. assert list(df2.column_names()) == list(data.keys())
  124. indices = (0, 2)
  125. names = tuple(list(data.keys())[idx] for idx in indices)
  126. result = from_dataframe(df2.select_columns(indices))
  127. expected = from_dataframe(df2.select_columns_by_name(names))
  128. tm.assert_frame_equal(result, expected)
  129. assert isinstance(result.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
  130. assert isinstance(expected.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
  131. def test_missing_from_masked():
  132. df = pd.DataFrame(
  133. {
  134. "x": np.array([1, 2, 3, 4, 0]),
  135. "y": np.array([1.5, 2.5, 3.5, 4.5, 0]),
  136. "z": np.array([True, False, True, True, True]),
  137. }
  138. )
  139. df2 = df.__dataframe__()
  140. rng = np.random.RandomState(42)
  141. dict_null = {col: rng.randint(low=0, high=len(df)) for col in df.columns}
  142. for col, num_nulls in dict_null.items():
  143. null_idx = df.index[
  144. rng.choice(np.arange(len(df)), size=num_nulls, replace=False)
  145. ]
  146. df.loc[null_idx, col] = None
  147. df2 = df.__dataframe__()
  148. assert df2.get_column_by_name("x").null_count == dict_null["x"]
  149. assert df2.get_column_by_name("y").null_count == dict_null["y"]
  150. assert df2.get_column_by_name("z").null_count == dict_null["z"]
  151. @pytest.mark.parametrize(
  152. "data",
  153. [
  154. {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]},
  155. {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]},
  156. {
  157. "x": np.array([True, True, False]),
  158. "y": np.array([1, 2, 0]),
  159. "z": np.array([9.2, 10.5, 11.8]),
  160. },
  161. ],
  162. )
  163. def test_mixed_data(data):
  164. df = pd.DataFrame(data)
  165. df2 = df.__dataframe__()
  166. for col_name in df.columns:
  167. assert df2.get_column_by_name(col_name).null_count == 0
  168. def test_mixed_missing():
  169. df = pd.DataFrame(
  170. {
  171. "x": np.array([True, None, False, None, True]),
  172. "y": np.array([None, 2, None, 1, 2]),
  173. "z": np.array([9.2, 10.5, None, 11.8, None]),
  174. }
  175. )
  176. df2 = df.__dataframe__()
  177. for col_name in df.columns:
  178. assert df2.get_column_by_name(col_name).null_count == 2
  179. def test_string():
  180. test_str_data = string_data["separator data"] + [""]
  181. df = pd.DataFrame({"A": test_str_data})
  182. col = df.__dataframe__().get_column_by_name("A")
  183. assert col.size() == 6
  184. assert col.null_count == 1
  185. assert col.dtype[0] == DtypeKind.STRING
  186. assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
  187. df_sliced = df[1:]
  188. col = df_sliced.__dataframe__().get_column_by_name("A")
  189. assert col.size() == 5
  190. assert col.null_count == 1
  191. assert col.dtype[0] == DtypeKind.STRING
  192. assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
  193. def test_nonstring_object():
  194. df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
  195. col = df.__dataframe__().get_column_by_name("A")
  196. with pytest.raises(NotImplementedError, match="not supported yet"):
  197. col.dtype
  198. def test_datetime():
  199. df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
  200. col = df.__dataframe__().get_column_by_name("A")
  201. assert col.size() == 2
  202. assert col.null_count == 1
  203. assert col.dtype[0] == DtypeKind.DATETIME
  204. assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT)
  205. tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
  206. @td.skip_if_np_lt("1.23")
  207. def test_categorical_to_numpy_dlpack():
  208. # https://github.com/pandas-dev/pandas/issues/48393
  209. df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])})
  210. col = df.__dataframe__().get_column_by_name("A")
  211. result = np.from_dlpack(col.get_buffers()["data"][0])
  212. expected = np.array([0, 1, 0], dtype="int8")
  213. tm.assert_numpy_array_equal(result, expected)