# test_arrow_compat.py
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. import pandas._testing as tm
  5. pa = pytest.importorskip("pyarrow", minversion="1.0.1")
  6. from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask
  7. arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES]
  8. arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
  9. arrays += [pd.array([True, False, True, None], dtype="boolean")]
  10. @pytest.fixture(params=arrays, ids=[a.dtype.name for a in arrays])
  11. def data(request):
  12. """
  13. Fixture returning parametrized array from given dtype, including integer,
  14. float and boolean
  15. """
  16. return request.param
  17. def test_arrow_array(data):
  18. arr = pa.array(data)
  19. expected = pa.array(
  20. data.to_numpy(object, na_value=None),
  21. type=pa.from_numpy_dtype(data.dtype.numpy_dtype),
  22. )
  23. assert arr.equals(expected)
  24. def test_arrow_roundtrip(data):
  25. df = pd.DataFrame({"a": data})
  26. table = pa.table(df)
  27. assert table.field("a").type == str(data.dtype.numpy_dtype)
  28. result = table.to_pandas()
  29. assert result["a"].dtype == data.dtype
  30. tm.assert_frame_equal(result, df)
  31. def test_dataframe_from_arrow_types_mapper():
  32. def types_mapper(arrow_type):
  33. if pa.types.is_boolean(arrow_type):
  34. return pd.BooleanDtype()
  35. elif pa.types.is_integer(arrow_type):
  36. return pd.Int64Dtype()
  37. bools_array = pa.array([True, None, False], type=pa.bool_())
  38. ints_array = pa.array([1, None, 2], type=pa.int64())
  39. small_ints_array = pa.array([-1, 0, 7], type=pa.int8())
  40. record_batch = pa.RecordBatch.from_arrays(
  41. [bools_array, ints_array, small_ints_array], ["bools", "ints", "small_ints"]
  42. )
  43. result = record_batch.to_pandas(types_mapper=types_mapper)
  44. bools = pd.Series([True, None, False], dtype="boolean")
  45. ints = pd.Series([1, None, 2], dtype="Int64")
  46. small_ints = pd.Series([-1, 0, 7], dtype="Int64")
  47. expected = pd.DataFrame({"bools": bools, "ints": ints, "small_ints": small_ints})
  48. tm.assert_frame_equal(result, expected)
  49. def test_arrow_load_from_zero_chunks(data):
  50. # GH-41040
  51. df = pd.DataFrame({"a": data[0:0]})
  52. table = pa.table(df)
  53. assert table.field("a").type == str(data.dtype.numpy_dtype)
  54. table = pa.table(
  55. [pa.chunked_array([], type=table.field("a").type)], schema=table.schema
  56. )
  57. result = table.to_pandas()
  58. assert result["a"].dtype == data.dtype
  59. tm.assert_frame_equal(result, df)
  60. def test_arrow_from_arrow_uint():
  61. # https://github.com/pandas-dev/pandas/issues/31896
  62. # possible mismatch in types
  63. dtype = pd.UInt32Dtype()
  64. result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
  65. expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")
  66. tm.assert_extension_array_equal(result, expected)
  67. def test_arrow_sliced(data):
  68. # https://github.com/pandas-dev/pandas/issues/38525
  69. df = pd.DataFrame({"a": data})
  70. table = pa.table(df)
  71. result = table.slice(2, None).to_pandas()
  72. expected = df.iloc[2:].reset_index(drop=True)
  73. tm.assert_frame_equal(result, expected)
  74. # no missing values
  75. df2 = df.fillna(data[0])
  76. table = pa.table(df2)
  77. result = table.slice(2, None).to_pandas()
  78. expected = df2.iloc[2:].reset_index(drop=True)
  79. tm.assert_frame_equal(result, expected)
  80. @pytest.fixture
  81. def np_dtype_to_arrays(any_real_numpy_dtype):
  82. """
  83. Fixture returning actual and expected dtype, pandas and numpy arrays and
  84. mask from a given numpy dtype
  85. """
  86. np_dtype = np.dtype(any_real_numpy_dtype)
  87. pa_type = pa.from_numpy_dtype(np_dtype)
  88. # None ensures the creation of a bitmask buffer.
  89. pa_array = pa.array([0, 1, 2, None], type=pa_type)
  90. # Since masked Arrow buffer slots are not required to contain a specific
  91. # value, assert only the first three values of the created np.array
  92. np_expected = np.array([0, 1, 2], dtype=np_dtype)
  93. mask_expected = np.array([True, True, True, False])
  94. return np_dtype, pa_array, np_expected, mask_expected
  95. def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
  96. """
  97. Test conversion from pyarrow array to numpy array.
  98. Modifies the pyarrow buffer to contain padding and offset, which are
  99. considered valid buffers by pyarrow.
  100. Also tests empty pyarrow arrays with non empty buffers.
  101. See https://github.com/pandas-dev/pandas/issues/40896
  102. """
  103. np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
  104. data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
  105. tm.assert_numpy_array_equal(data[:3], np_expected)
  106. tm.assert_numpy_array_equal(mask, mask_expected)
  107. mask_buffer = pa_array.buffers()[0]
  108. data_buffer = pa_array.buffers()[1]
  109. data_buffer_bytes = pa_array.buffers()[1].to_pybytes()
  110. # Add trailing padding to the buffer.
  111. data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
  112. pa_array_trail = pa.Array.from_buffers(
  113. type=pa_array.type,
  114. length=len(pa_array),
  115. buffers=[mask_buffer, data_buffer_trail],
  116. offset=pa_array.offset,
  117. )
  118. pa_array_trail.validate()
  119. data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
  120. tm.assert_numpy_array_equal(data[:3], np_expected)
  121. tm.assert_numpy_array_equal(mask, mask_expected)
  122. # Add offset to the buffer.
  123. offset = b"\x00" * (pa_array.type.bit_width // 8)
  124. data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
  125. mask_buffer_offset = pa.py_buffer(b"\x0E")
  126. pa_array_offset = pa.Array.from_buffers(
  127. type=pa_array.type,
  128. length=len(pa_array),
  129. buffers=[mask_buffer_offset, data_buffer_offset],
  130. offset=pa_array.offset + 1,
  131. )
  132. pa_array_offset.validate()
  133. data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
  134. tm.assert_numpy_array_equal(data[:3], np_expected)
  135. tm.assert_numpy_array_equal(mask, mask_expected)
  136. # Empty array
  137. np_expected_empty = np.array([], dtype=np_dtype)
  138. mask_expected_empty = np.array([], dtype=np.bool_)
  139. pa_array_offset = pa.Array.from_buffers(
  140. type=pa_array.type,
  141. length=0,
  142. buffers=[mask_buffer, data_buffer],
  143. offset=pa_array.offset,
  144. )
  145. pa_array_offset.validate()
  146. data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
  147. tm.assert_numpy_array_equal(data[:3], np_expected_empty)
  148. tm.assert_numpy_array_equal(mask, mask_expected_empty)
  149. def test_from_arrow_type_error(data):
  150. # ensure that __from_arrow__ returns a TypeError when getting a wrong
  151. # array type
  152. arr = pa.array(data).cast("string")
  153. with pytest.raises(TypeError, match=None):
  154. # we don't test the exact error message, only the fact that it raises
  155. # a TypeError is relevant
  156. data.dtype.__from_arrow__(arr)