# test_construction.py
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_integer
from pandas.core.arrays import IntegerArray
from pandas.core.arrays.integer import (
    Int8Dtype,
    Int32Dtype,
    Int64Dtype,
)
  12. @pytest.fixture(params=[pd.array, IntegerArray._from_sequence])
  13. def constructor(request):
  14. """Fixture returning parametrized IntegerArray from given sequence.
  15. Used to test dtype conversions.
  16. """
  17. return request.param
  18. def test_uses_pandas_na():
  19. a = pd.array([1, None], dtype=Int64Dtype())
  20. assert a[1] is pd.NA
  21. def test_from_dtype_from_float(data):
  22. # construct from our dtype & string dtype
  23. dtype = data.dtype
  24. # from float
  25. expected = pd.Series(data)
  26. result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype))
  27. tm.assert_series_equal(result, expected)
  28. # from int / list
  29. expected = pd.Series(data)
  30. result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
  31. tm.assert_series_equal(result, expected)
  32. # from int / array
  33. expected = pd.Series(data).dropna().reset_index(drop=True)
  34. dropped = np.array(data.dropna()).astype(np.dtype(dtype.type))
  35. result = pd.Series(dropped, dtype=str(dtype))
  36. tm.assert_series_equal(result, expected)
  37. def test_conversions(data_missing):
  38. # astype to object series
  39. df = pd.DataFrame({"A": data_missing})
  40. result = df["A"].astype("object")
  41. expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A")
  42. tm.assert_series_equal(result, expected)
  43. # convert to object ndarray
  44. # we assert that we are exactly equal
  45. # including type conversions of scalars
  46. result = df["A"].astype("object").values
  47. expected = np.array([pd.NA, 1], dtype=object)
  48. tm.assert_numpy_array_equal(result, expected)
  49. for r, e in zip(result, expected):
  50. if pd.isnull(r):
  51. assert pd.isnull(e)
  52. elif is_integer(r):
  53. assert r == e
  54. assert is_integer(e)
  55. else:
  56. assert r == e
  57. assert type(r) == type(e)
  58. def test_integer_array_constructor():
  59. values = np.array([1, 2, 3, 4], dtype="int64")
  60. mask = np.array([False, False, False, True], dtype="bool")
  61. result = IntegerArray(values, mask)
  62. expected = pd.array([1, 2, 3, np.nan], dtype="Int64")
  63. tm.assert_extension_array_equal(result, expected)
  64. msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
  65. with pytest.raises(TypeError, match=msg):
  66. IntegerArray(values.tolist(), mask)
  67. with pytest.raises(TypeError, match=msg):
  68. IntegerArray(values, mask.tolist())
  69. with pytest.raises(TypeError, match=msg):
  70. IntegerArray(values.astype(float), mask)
  71. msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
  72. with pytest.raises(TypeError, match=msg):
  73. IntegerArray(values)
  74. def test_integer_array_constructor_copy():
  75. values = np.array([1, 2, 3, 4], dtype="int64")
  76. mask = np.array([False, False, False, True], dtype="bool")
  77. result = IntegerArray(values, mask)
  78. assert result._data is values
  79. assert result._mask is mask
  80. result = IntegerArray(values, mask, copy=True)
  81. assert result._data is not values
  82. assert result._mask is not mask
  83. @pytest.mark.parametrize(
  84. "a, b",
  85. [
  86. ([1, None], [1, np.nan]),
  87. ([None], [np.nan]),
  88. ([None, np.nan], [np.nan, np.nan]),
  89. ([np.nan, np.nan], [np.nan, np.nan]),
  90. ],
  91. )
  92. def test_to_integer_array_none_is_nan(a, b):
  93. result = pd.array(a, dtype="Int64")
  94. expected = pd.array(b, dtype="Int64")
  95. tm.assert_extension_array_equal(result, expected)
  96. @pytest.mark.parametrize(
  97. "values",
  98. [
  99. ["foo", "bar"],
  100. "foo",
  101. 1,
  102. 1.0,
  103. pd.date_range("20130101", periods=2),
  104. np.array(["foo"]),
  105. [[1, 2], [3, 4]],
  106. [np.nan, {"a": 1}],
  107. ],
  108. )
  109. def test_to_integer_array_error(values):
  110. # error in converting existing arrays to IntegerArrays
  111. msg = "|".join(
  112. [
  113. r"cannot be converted to IntegerDtype",
  114. r"invalid literal for int\(\) with base 10:",
  115. r"values must be a 1D list-like",
  116. r"Cannot pass scalar",
  117. r"int\(\) argument must be a string",
  118. ]
  119. )
  120. with pytest.raises((ValueError, TypeError), match=msg):
  121. pd.array(values, dtype="Int64")
  122. with pytest.raises((ValueError, TypeError), match=msg):
  123. IntegerArray._from_sequence(values)
  124. def test_to_integer_array_inferred_dtype(constructor):
  125. # if values has dtype -> respect it
  126. result = constructor(np.array([1, 2], dtype="int8"))
  127. assert result.dtype == Int8Dtype()
  128. result = constructor(np.array([1, 2], dtype="int32"))
  129. assert result.dtype == Int32Dtype()
  130. # if values have no dtype -> always int64
  131. result = constructor([1, 2])
  132. assert result.dtype == Int64Dtype()
  133. def test_to_integer_array_dtype_keyword(constructor):
  134. result = constructor([1, 2], dtype="Int8")
  135. assert result.dtype == Int8Dtype()
  136. # if values has dtype -> override it
  137. result = constructor(np.array([1, 2], dtype="int8"), dtype="Int32")
  138. assert result.dtype == Int32Dtype()
  139. def test_to_integer_array_float():
  140. result = IntegerArray._from_sequence([1.0, 2.0])
  141. expected = pd.array([1, 2], dtype="Int64")
  142. tm.assert_extension_array_equal(result, expected)
  143. with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
  144. IntegerArray._from_sequence([1.5, 2.0])
  145. # for float dtypes, the itemsize is not preserved
  146. result = IntegerArray._from_sequence(np.array([1.0, 2.0], dtype="float32"))
  147. assert result.dtype == Int64Dtype()
  148. def test_to_integer_array_str():
  149. result = IntegerArray._from_sequence(["1", "2", None])
  150. expected = pd.array([1, 2, np.nan], dtype="Int64")
  151. tm.assert_extension_array_equal(result, expected)
  152. with pytest.raises(
  153. ValueError, match=r"invalid literal for int\(\) with base 10: .*"
  154. ):
  155. IntegerArray._from_sequence(["1", "2", ""])
  156. with pytest.raises(
  157. ValueError, match=r"invalid literal for int\(\) with base 10: .*"
  158. ):
  159. IntegerArray._from_sequence(["1.5", "2.0"])
  160. @pytest.mark.parametrize(
  161. "bool_values, int_values, target_dtype, expected_dtype",
  162. [
  163. ([False, True], [0, 1], Int64Dtype(), Int64Dtype()),
  164. ([False, True], [0, 1], "Int64", Int64Dtype()),
  165. ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()),
  166. ],
  167. )
  168. def test_to_integer_array_bool(
  169. constructor, bool_values, int_values, target_dtype, expected_dtype
  170. ):
  171. result = constructor(bool_values, dtype=target_dtype)
  172. assert result.dtype == expected_dtype
  173. expected = pd.array(int_values, dtype=target_dtype)
  174. tm.assert_extension_array_equal(result, expected)
  175. @pytest.mark.parametrize(
  176. "values, to_dtype, result_dtype",
  177. [
  178. (np.array([1], dtype="int64"), None, Int64Dtype),
  179. (np.array([1, np.nan]), None, Int64Dtype),
  180. (np.array([1, np.nan]), "int8", Int8Dtype),
  181. ],
  182. )
  183. def test_to_integer_array(values, to_dtype, result_dtype):
  184. # convert existing arrays to IntegerArrays
  185. result = IntegerArray._from_sequence(values, dtype=to_dtype)
  186. assert result.dtype == result_dtype()
  187. expected = pd.array(values, dtype=result_dtype())
  188. tm.assert_extension_array_equal(result, expected)