"""Tests for the top-level ``pd.array`` constructor."""
import datetime
import decimal

import numpy as np
import pytest
import pytz

from pandas.core.dtypes.base import _registry as registry

import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import register_extension_dtype
from pandas.arrays import (
    BooleanArray,
    DatetimeArray,
    FloatingArray,
    IntegerArray,
    IntervalArray,
    SparseArray,
    TimedeltaArray,
)
from pandas.core.arrays import (
    PandasArray,
    period_array,
)
from pandas.tests.extension.decimal import (
    DecimalArray,
    DecimalDtype,
    to_decimal,
)


@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # Basic NumPy defaults.
        ([1, 2], None, IntegerArray._from_sequence([1, 2])),
        ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])),
        (
            np.array([1.0, 2.0], dtype="float64"),
            None,
            FloatingArray._from_sequence([1.0, 2.0]),
        ),
        # String alias passes through to NumPy
        ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))),
        ([1, 2], "int64", PandasArray(np.array([1, 2], dtype=np.int64))),
        # GH#44715 FloatingArray does not support float16, so fall back to PandasArray
        (
            np.array([1, 2], dtype=np.float16),
            None,
            PandasArray(np.array([1, 2], dtype=np.float16)),
        ),
        # idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
        (
            PandasArray(np.array([1, 2], dtype=np.int32)),
            None,
            PandasArray(np.array([1, 2], dtype=np.int32)),
        ),
        # Period alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1H", "2H"],
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        (
            pd.TimedeltaIndex(["1H", "2H"]),
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        (
            pd.TimedeltaIndex(["1H", "2H"]),
            None,
            TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        (
            # preserve non-nano, i.e. don't cast to PandasArray
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            None,
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        (
            # preserve non-nano, i.e. don't cast to PandasArray
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            np.dtype("m8[s]"),
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        # Category
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
        # IntegerNA
        ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
        (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
        # String
        (
            ["a", None],
            "string",
            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
        ),
        (
            ["a", None],
            pd.StringDtype(),
            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
        ),
        # Boolean
        ([True, None], "boolean", BooleanArray._from_sequence([True, None])),
        ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])),
        # Index
        (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
        # Series[EA] returns the EA
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # "3rd party" EAs work
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # pass an ExtensionArray, but a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """``pd.array(data, dtype=dtype)`` equals ``expected`` for each case."""
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)


def test_array_copy():
    """``pd.array`` copies unless ``copy=False`` is passed explicitly."""
    a = np.array([1, 2])
    # default is to copy
    b = pd.array(a, dtype=a.dtype)
    assert not tm.shares_memory(a, b)

    # copy=True
    b = pd.array(a, dtype=a.dtype, copy=True)
    assert not tm.shares_memory(a, b)

    # copy=False
    b = pd.array(a, dtype=a.dtype, copy=False)
    assert tm.shares_memory(a, b)


# Shared pytz timezone object used by the tz-aware inference cases below.
cet = pytz.timezone("CET")


@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
        # datetime
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            DatetimeArray(np.array([1, 2], dtype="M8[ns]")),
        ),
        (
            np.array([1, 2], dtype="M8[us]"),
            DatetimeArray._simple_new(
                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
            ),
        ),
        # datetimetz
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet)
            ),
        ),
        # timedelta
        (
            [pd.Timedelta("1H"), pd.Timedelta("2H")],
            TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            TimedeltaArray(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            TimedeltaArray(np.array([1, 2], dtype="m8[us]")),
        ),
        # integer
        ([1, 2], IntegerArray._from_sequence([1, 2])),
        ([1, None], IntegerArray._from_sequence([1, None])),
        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA])),
        ([1, np.nan], IntegerArray._from_sequence([1, np.nan])),
        # float
        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2])),
        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA])),
        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA])),
        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA])),
        # integer-like float
        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA])),
        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA])),
        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA])),
        # mixed-integer-float
        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
        ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])),
        # string
        (
            ["a", "b"],
            pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]),
        ),
        (
            ["a", None],
            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
        ),
        # Boolean
        ([True, False], BooleanArray._from_sequence([True, False])),
        ([True, None], BooleanArray._from_sequence([True, None])),
    ],
)
def test_array_inference(data, expected):
    """``pd.array(data)`` with no dtype equals ``expected`` for each case."""
    result = pd.array(data)
    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        # mix of frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "A")],
        # mix of closed
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # Mix of timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # Mix of tz-aware and tz-naive
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """Inconsistent inputs come back as an object-dtype ``PandasArray``."""
    result = pd.array(data)
    expected = PandasArray(np.array(data, dtype=object))
    tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
    """Non-1-dimensional input raises ``ValueError``."""
    with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"):
        pd.array(data, dtype="int64")


def test_scalar_raises():
    """Passing a scalar to ``pd.array`` raises ``ValueError``."""
    with pytest.raises(ValueError, match="Cannot pass scalar '1'"):
        pd.array(1)


def test_dataframe_raises():
    """Passing a DataFrame to ``pd.array`` raises ``TypeError``."""
    # GH#51167 don't accidentally cast to StringArray by doing inference on columns
    df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    msg = "Cannot pass DataFrame to 'pandas.array'"
    with pytest.raises(TypeError, match=msg):
        pd.array(df)


def test_bounds_check():
    """A negative value with dtype ``"UInt16"`` raises ``TypeError``."""
    # GH21796
    with pytest.raises(
        TypeError, match=r"cannot safely cast non-equivalent int(32|64) to uint16"
    ):
        pd.array([-1, 2, 3], dtype="UInt16")


# ---------------------------------------------------------------------------
# A couple dummy classes to ensure that Series and Indexes are unboxed before
# getting to the EA classes.


@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    """``DecimalDtype`` subclass registered as ``"decimal2"``, backed by DecimalArray2."""

    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return DecimalArray2


class DecimalArray2(DecimalArray):
    """``DecimalArray`` subclass whose ``_from_sequence`` rejects Series/Index."""

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Build the array from ``scalars``, refusing boxed (Series/Index) input.

        Raises
        ------
        TypeError
            If ``scalars`` is a ``pd.Series`` or ``pd.Index``.
        """
        if isinstance(scalars, (pd.Series, pd.Index)):
            raise TypeError("scalars should not be of type pd.Series or pd.Index")

        return super()._from_sequence(scalars, dtype=dtype, copy=copy)


def test_array_unboxes(index_or_series):
    """``pd.array`` hands unboxed values, not the Series/Index, to the EA."""
    box = index_or_series

    data = box([decimal.Decimal("1"), decimal.Decimal("2")])
    # sanity check: the guard in DecimalArray2 really does reject boxed input
    with pytest.raises(
        TypeError, match="scalars should not be of type pd.Series or pd.Index"
    ):
        DecimalArray2._from_sequence(data)

    result = pd.array(data, dtype="decimal2")
    expected = DecimalArray2._from_sequence(data.values)
    tm.assert_equal(result, expected)


@pytest.fixture
def registry_without_decimal():
    """Temporarily remove DecimalDtype from the extension dtype registry."""
    idx = registry.dtypes.index(DecimalDtype)
    registry.dtypes.pop(idx)
    yield
    # Put the dtype back at its original position (not the end) so the
    # registry order is unchanged once this fixture is torn down.
    registry.dtypes.insert(idx, DecimalDtype)


def test_array_not_registered(registry_without_decimal):
    """``pd.array`` accepts a dtype class that is absent from the registry."""
    # check we aren't on it
    assert registry.find("decimal") is None
    data = [decimal.Decimal("1"), decimal.Decimal("2")]

    result = pd.array(data, dtype=DecimalDtype)
    expected = DecimalArray._from_sequence(data)
    tm.assert_equal(result, expected)


def test_array_to_numpy_na():
    """``to_numpy(na_value=True, dtype=bool)`` on a string array gives all True."""
    # GH#40638
    arr = pd.array([pd.NA, 1], dtype="string")
    result = arr.to_numpy(na_value=True, dtype=bool)
    expected = np.array([True, True])
    tm.assert_numpy_array_equal(result, expected)