"""
A verbatim copy (vendored) of the spec tests.
Taken from https://github.com/data-apis/dataframe-api
"""
import ctypes
import math

import pytest
  8. @pytest.mark.parametrize(
  9. "test_data",
  10. [
  11. {"a": ["foo", "bar"], "b": ["baz", "qux"]},
  12. {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]},
  13. {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]},
  14. ],
  15. ids=["str_data", "float_data", "int_data"],
  16. )
  17. def test_only_one_dtype(test_data, df_from_dict):
  18. columns = list(test_data.keys())
  19. df = df_from_dict(test_data)
  20. dfX = df.__dataframe__()
  21. column_size = len(test_data[columns[0]])
  22. for column in columns:
  23. null_count = dfX.get_column_by_name(column).null_count
  24. assert null_count == 0
  25. assert isinstance(null_count, int)
  26. assert dfX.get_column_by_name(column).size() == column_size
  27. assert dfX.get_column_by_name(column).offset == 0
  28. def test_mixed_dtypes(df_from_dict):
  29. df = df_from_dict(
  30. {
  31. "a": [1, 2, 3], # dtype kind INT = 0
  32. "b": [3, 4, 5], # dtype kind INT = 0
  33. "c": [1.5, 2.5, 3.5], # dtype kind FLOAT = 2
  34. "d": [9, 10, 11], # dtype kind INT = 0
  35. "e": [True, False, True], # dtype kind BOOLEAN = 20
  36. "f": ["a", "", "c"], # dtype kind STRING = 21
  37. }
  38. )
  39. dfX = df.__dataframe__()
  40. # for meanings of dtype[0] see the spec; we cannot import the spec here as this
  41. # file is expected to be vendored *anywhere*;
  42. # values for dtype[0] are explained above
  43. columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21}
  44. for column, kind in columns.items():
  45. colX = dfX.get_column_by_name(column)
  46. assert colX.null_count == 0
  47. assert isinstance(colX.null_count, int)
  48. assert colX.size() == 3
  49. assert colX.offset == 0
  50. assert colX.dtype[0] == kind
  51. assert dfX.get_column_by_name("c").dtype[1] == 64
  52. def test_na_float(df_from_dict):
  53. df = df_from_dict({"a": [1.0, math.nan, 2.0]})
  54. dfX = df.__dataframe__()
  55. colX = dfX.get_column_by_name("a")
  56. assert colX.null_count == 1
  57. assert isinstance(colX.null_count, int)
  58. def test_noncategorical(df_from_dict):
  59. df = df_from_dict({"a": [1, 2, 3]})
  60. dfX = df.__dataframe__()
  61. colX = dfX.get_column_by_name("a")
  62. with pytest.raises(TypeError, match=".*categorical.*"):
  63. colX.describe_categorical
  64. def test_categorical(df_from_dict):
  65. df = df_from_dict(
  66. {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]},
  67. is_categorical=True,
  68. )
  69. colX = df.__dataframe__().get_column_by_name("weekday")
  70. categorical = colX.describe_categorical
  71. assert isinstance(categorical["is_ordered"], bool)
  72. assert isinstance(categorical["is_dictionary"], bool)
  73. def test_dataframe(df_from_dict):
  74. df = df_from_dict(
  75. {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}
  76. )
  77. dfX = df.__dataframe__()
  78. assert dfX.num_columns() == 3
  79. assert dfX.num_rows() == 3
  80. assert dfX.num_chunks() == 1
  81. assert list(dfX.column_names()) == ["x", "y", "z"]
  82. assert list(dfX.select_columns((0, 2)).column_names()) == list(
  83. dfX.select_columns_by_name(("x", "z")).column_names()
  84. )
  85. @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
  86. def test_df_get_chunks(size, n_chunks, df_from_dict):
  87. df = df_from_dict({"x": list(range(size))})
  88. dfX = df.__dataframe__()
  89. chunks = list(dfX.get_chunks(n_chunks))
  90. assert len(chunks) == n_chunks
  91. assert sum(chunk.num_rows() for chunk in chunks) == size
  92. @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
  93. def test_column_get_chunks(size, n_chunks, df_from_dict):
  94. df = df_from_dict({"x": list(range(size))})
  95. dfX = df.__dataframe__()
  96. chunks = list(dfX.get_column(0).get_chunks(n_chunks))
  97. assert len(chunks) == n_chunks
  98. assert sum(chunk.size() for chunk in chunks) == size
  99. def test_get_columns(df_from_dict):
  100. df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]})
  101. dfX = df.__dataframe__()
  102. for colX in dfX.get_columns():
  103. assert colX.size() == 2
  104. assert colX.num_chunks() == 1
  105. # for meanings of dtype[0] see the spec; we cannot import the spec here as this
  106. # file is expected to be vendored *anywhere*
  107. assert dfX.get_column(0).dtype[0] == 0 # INT
  108. assert dfX.get_column(1).dtype[0] == 2 # FLOAT
  109. def test_buffer(df_from_dict):
  110. arr = [0, 1, -1]
  111. df = df_from_dict({"a": arr})
  112. dfX = df.__dataframe__()
  113. colX = dfX.get_column(0)
  114. bufX = colX.get_buffers()
  115. dataBuf, dataDtype = bufX["data"]
  116. assert dataBuf.bufsize > 0
  117. assert dataBuf.ptr != 0
  118. device, _ = dataBuf.__dlpack_device__()
  119. # for meanings of dtype[0] see the spec; we cannot import the spec here as this
  120. # file is expected to be vendored *anywhere*
  121. assert dataDtype[0] == 0 # INT
  122. if device == 1: # CPU-only as we're going to directly read memory here
  123. bitwidth = dataDtype[1]
  124. ctype = {
  125. 8: ctypes.c_int8,
  126. 16: ctypes.c_int16,
  127. 32: ctypes.c_int32,
  128. 64: ctypes.c_int64,
  129. }[bitwidth]
  130. for idx, truth in enumerate(arr):
  131. val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value
  132. assert val == truth, f"Buffer at index {idx} mismatch"