test_constructors.py 114 KB


  1. import array
  2. from collections import (
  3. OrderedDict,
  4. abc,
  5. defaultdict,
  6. namedtuple,
  7. )
  8. from dataclasses import make_dataclass
  9. from datetime import (
  10. date,
  11. datetime,
  12. timedelta,
  13. )
  14. import functools
  15. import random
  16. import re
  17. from typing import Iterator
  18. import warnings
  19. import numpy as np
  20. from numpy import ma
  21. from numpy.ma import mrecords
  22. import pytest
  23. import pytz
  24. from pandas.errors import IntCastingNaNError
  25. import pandas.util._test_decorators as td
  26. from pandas.core.dtypes.common import is_integer_dtype
  27. from pandas.core.dtypes.dtypes import (
  28. DatetimeTZDtype,
  29. IntervalDtype,
  30. PandasDtype,
  31. PeriodDtype,
  32. )
  33. import pandas as pd
  34. from pandas import (
  35. Categorical,
  36. CategoricalIndex,
  37. DataFrame,
  38. DatetimeIndex,
  39. Index,
  40. Interval,
  41. MultiIndex,
  42. Period,
  43. RangeIndex,
  44. Series,
  45. Timedelta,
  46. Timestamp,
  47. cut,
  48. date_range,
  49. isna,
  50. )
  51. import pandas._testing as tm
  52. from pandas.arrays import (
  53. DatetimeArray,
  54. IntervalArray,
  55. PeriodArray,
  56. SparseArray,
  57. TimedeltaArray,
  58. )
  # Float dtype names used by the mixed-dtype constructor test.
  MIXED_FLOAT_DTYPES = [f"float{bits}" for bits in (16, 32, 64)]
  # Integer dtype names: unsigned widths first, then signed, 8 to 64 bits each.
  MIXED_INT_DTYPES = [
      f"{kind}{bits}" for kind in ("uint", "int") for bits in (8, 16, 32, 64)
  ]
  70. class TestDataFrameConstructors:
  71. def test_constructor_from_ndarray_with_str_dtype(self):
  72. # If we don't ravel/reshape around ensure_str_array, we end up
  73. # with an array of strings each of which is e.g. "[0 1 2]"
  74. arr = np.arange(12).reshape(4, 3)
  75. df = DataFrame(arr, dtype=str)
  76. expected = DataFrame(arr.astype(str))
  77. tm.assert_frame_equal(df, expected)
  78. def test_constructor_from_2d_datetimearray(self, using_array_manager):
  79. dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
  80. dta = dti._data.reshape(3, 2)
  81. df = DataFrame(dta)
  82. expected = DataFrame({0: dta[:, 0], 1: dta[:, 1]})
  83. tm.assert_frame_equal(df, expected)
  84. if not using_array_manager:
  85. # GH#44724 big performance hit if we de-consolidate
  86. assert len(df._mgr.blocks) == 1
  87. def test_constructor_dict_with_tzaware_scalar(self):
  88. # GH#42505
  89. dt = Timestamp("2019-11-03 01:00:00-0700").tz_convert("America/Los_Angeles")
  90. df = DataFrame({"dt": dt}, index=[0])
  91. expected = DataFrame({"dt": [dt]})
  92. tm.assert_frame_equal(df, expected)
  93. # Non-homogeneous
  94. df = DataFrame({"dt": dt, "value": [1]})
  95. expected = DataFrame({"dt": [dt], "value": [1]})
  96. tm.assert_frame_equal(df, expected)
  def test_construct_ndarray_with_nas_and_int_dtype(self):
      """Casting an array holding NaN to an integer dtype raises for frame and Series."""
      # GH#26919 match Series by not casting np.nan to meaningless int
      values = np.array([[1, np.nan], [2, 3]])
      msg = r"Cannot convert non-finite values \(NA or inf\) to integer"

      with pytest.raises(IntCastingNaNError, match=msg):
          DataFrame(values, dtype="i8")

      # check this matches Series behavior
      first_row = values[0]
      with pytest.raises(IntCastingNaNError, match=msg):
          Series(first_row, dtype="i8", name=0)
  106. def test_construct_from_list_of_datetimes(self):
  107. df = DataFrame([datetime.now(), datetime.now()])
  108. assert df[0].dtype == np.dtype("M8[ns]")
  109. def test_constructor_from_tzaware_datetimeindex(self):
  110. # don't cast a DatetimeIndex WITH a tz, leave as object
  111. # GH#6032
  112. naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B")
  113. idx = naive.tz_localize("US/Pacific")
  114. expected = Series(np.array(idx.tolist(), dtype="object"), name="B")
  115. assert expected.dtype == idx.dtype
  116. # convert index to series
  117. result = Series(idx)
  118. tm.assert_series_equal(result, expected)
  def test_array_of_dt64_nat_with_td64dtype_raises(self, frame_or_series):
      """An object array holding a datetime64 NaT cannot be cast to ``m8[ns]``."""
      # GH#39462
      values = np.array([np.datetime64("NaT", "ns")], dtype=object)
      if frame_or_series is DataFrame:
          # the frame constructor is exercised with 2D input
          values = values.reshape(1, 1)

      msg = "Invalid type for timedelta scalar: <class 'numpy.datetime64'>"
      with pytest.raises(TypeError, match=msg):
          frame_or_series(values, dtype="m8[ns]")
  @pytest.mark.parametrize("kind", ["m", "M"])
  def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series):
      """With ``dtype=object``, dt64/td64 values are boxed as Timestamp/Timedelta."""
      # with dtype=object, we should cast dt64 values to Timestamps, not pydatetimes
      dtype, scalar_type = (
          ("M8[ns]", Timestamp) if kind == "M" else ("m8[ns]", Timedelta)
      )

      def assert_boxed_object(obj):
          # first backing array must be object dtype holding pandas scalars
          assert obj._mgr.arrays[0].dtype == object
          assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)

      values = np.arange(6, dtype="i8").view(dtype).reshape(3, 2)
      if frame_or_series is Series:
          values = values[:, 0]

      assert_boxed_object(frame_or_series(values, dtype=object))

      # go through a different path in internals.construction
      assert_boxed_object(frame_or_series(frame_or_series(values), dtype=object))
      assert_boxed_object(
          frame_or_series(frame_or_series(values), dtype=PandasDtype(object))
      )

      if frame_or_series is DataFrame:
          # other paths through internals.construction
          rows = [Series(row) for row in values]
          assert_boxed_object(frame_or_series(rows, dtype=object))
  156. def test_series_with_name_not_matching_column(self):
  157. # GH#9232
  158. x = Series(range(5), name=1)
  159. y = Series(range(5), name=0)
  160. result = DataFrame(x, columns=[0])
  161. expected = DataFrame([], columns=[0])
  162. tm.assert_frame_equal(result, expected)
  163. result = DataFrame(y, columns=[1])
  164. expected = DataFrame([], columns=[1])
  165. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "constructor",
      [
          lambda: DataFrame(),
          lambda: DataFrame(None),
          lambda: DataFrame(()),
          lambda: DataFrame([]),
          lambda: DataFrame(_ for _ in []),
          lambda: DataFrame(range(0)),
          lambda: DataFrame(data=None),
          lambda: DataFrame(data=()),
          lambda: DataFrame(data=[]),
          lambda: DataFrame(data=(_ for _ in [])),
          lambda: DataFrame(data=range(0)),
      ],
  )
  def test_empty_constructor(self, constructor):
      """Every empty, non-dict input produces the same frame as ``DataFrame()``."""
      reference = DataFrame()
      built = constructor()
      # both axes must be empty
      for axis in (built.index, built.columns):
          assert len(axis) == 0
      tm.assert_frame_equal(built, reference)
  @pytest.mark.parametrize(
      "constructor",
      [
          lambda: DataFrame({}),
          lambda: DataFrame(data={}),
      ],
  )
  def test_empty_constructor_object_index(self, constructor):
      """An empty dict gives a frame with empty ``RangeIndex`` axes."""
      empty_axis = RangeIndex(0)
      expected = DataFrame(index=empty_axis, columns=RangeIndex(0))
      built = constructor()
      for axis in (built.index, built.columns):
          assert len(axis) == 0
      tm.assert_frame_equal(built, expected, check_index_type=True)
  @pytest.mark.parametrize(
      "emptylike,expected_index,expected_columns",
      [
          ([[]], RangeIndex(1), RangeIndex(0)),
          ([[], []], RangeIndex(2), RangeIndex(0)),
          ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)),
      ],
  )
  def test_emptylike_constructor(self, emptylike, expected_index, expected_columns):
      """Lists of empty rows give one index entry per row and no columns."""
      built = DataFrame(emptylike)
      reference = DataFrame(index=expected_index, columns=expected_columns)
      tm.assert_frame_equal(built, reference)
  213. def test_constructor_mixed(self, float_string_frame):
  214. index, data = tm.getMixedTypeDict()
  215. # TODO(wesm), incomplete test?
  216. indexed_frame = DataFrame(data, index=index) # noqa
  217. unindexed_frame = DataFrame(data) # noqa
  218. assert float_string_frame["foo"].dtype == np.object_
  def test_constructor_cast_failure(self):
      """An impossible ``dtype`` cast raises, as does assigning a 2D array to a column."""
      # as of 2.0, we raise if we can't respect "dtype", previously we
      # silently ignored
      with pytest.raises(ValueError, match="could not convert string to float"):
          DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)

      # GH 3010, constructing with odd arrays
      frame = DataFrame(np.ones((4, 2)))

      # this is ok
      frame["foo"] = np.ones((4, 2)).tolist()

      # this is not ok
      shape_msg = "Expected a 1D array, got an array with shape \\(4, 2\\)"
      with pytest.raises(ValueError, match=shape_msg):
          frame["test"] = np.ones((4, 2))

      # this is ok
      frame["foo2"] = np.ones((4, 2)).tolist()
  235. def test_constructor_dtype_copy(self):
  236. orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]})
  237. new_df = DataFrame(orig_df, dtype=float, copy=True)
  238. new_df["col1"] = 200.0
  239. assert orig_df["col1"][0] == 1.0
  240. def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write):
  241. df = DataFrame([[1, 2]])
  242. should_be_view = DataFrame(df, dtype=df[0].dtype)
  243. if using_copy_on_write:
  244. should_be_view.iloc[0, 0] = 99
  245. assert df.values[0, 0] == 1
  246. else:
  247. should_be_view[0][0] = 99
  248. assert df.values[0, 0] == 99
  249. def test_constructor_dtype_nocast_view_2d_array(
  250. self, using_array_manager, using_copy_on_write
  251. ):
  252. df = DataFrame([[1, 2], [3, 4]], dtype="int64")
  253. if not using_array_manager and not using_copy_on_write:
  254. should_be_view = DataFrame(df.values, dtype=df[0].dtype)
  255. should_be_view[0][0] = 97
  256. assert df.values[0, 0] == 97
  257. else:
  258. # INFO(ArrayManager) DataFrame(ndarray) doesn't necessarily preserve
  259. # a view on the array to ensure contiguous 1D arrays
  260. df2 = DataFrame(df.values, dtype=df[0].dtype)
  261. assert df2._mgr.arrays[0].flags.c_contiguous
  @td.skip_array_manager_invalid_test
  def test_1d_object_array_does_not_copy(self):
      """``copy=False`` on a 1D object ndarray keeps the frame sharing its memory."""
      # https://github.com/pandas-dev/pandas/issues/39272
      source = np.array(["a", "b"], dtype="object")
      frame = DataFrame(source, copy=False)
      assert np.shares_memory(frame.values, source)
  @td.skip_array_manager_invalid_test
  def test_2d_object_array_does_not_copy(self):
      """``copy=False`` on a 2D object ndarray keeps the frame sharing its memory."""
      # https://github.com/pandas-dev/pandas/issues/39272
      source = np.array([["a", "b"], ["c", "d"]], dtype="object")
      frame = DataFrame(source, copy=False)
      assert np.shares_memory(frame.values, source)
  274. def test_constructor_dtype_list_data(self):
  275. df = DataFrame([[1, "2"], [None, "a"]], dtype=object)
  276. assert df.loc[1, 0] is None
  277. assert df.loc[0, 1] == "2"
  def test_constructor_list_of_2d_raises(self):
      """A list of 2D objects (frames or arrays) is rejected with the 3D shape in the message."""
      # https://github.com/pandas-dev/pandas/issues/32289
      empty_frame = DataFrame()
      empty_array = np.empty((0, 0))

      with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"):
          DataFrame([empty_frame])

      with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"):
          DataFrame([empty_array])

      two_rows = DataFrame({"A": [1, 2]})
      with pytest.raises(ValueError, match=r"shape=\(2, 2, 1\)"):
          DataFrame([two_rows, two_rows])
  @pytest.mark.parametrize(
      "typ, ad",
      [
          # mixed floating and integer coexist in the same frame
          ["float", {}],
          # add lots of types
          ["float", {"A": 1, "B": "foo", "C": "bar"}],
          # GH 622
          ["int", {}],
      ],
  )
  def test_constructor_mixed_dtypes(self, typ, ad):
      """Columns built from differently sized numeric arrays keep their dtypes.

      Parameters
      ----------
      typ : {"int", "float"}
          Which family of dtypes (MIXED_INT_DTYPES or MIXED_FLOAT_DTYPES) to
          build one array per dtype for.
      ad : dict
          Extra column data placed in the frame alongside the generated
          arrays. It is not modified.
      """
      if typ == "int":
          dtypes = MIXED_INT_DTYPES
          # integer data for integer dtypes (casting rand() output to an
          # integer dtype would give all-zero columns)
          arrays = [np.array(np.random.randint(10, size=10), dtype=d) for d in dtypes]
      elif typ == "float":
          dtypes = MIXED_FLOAT_DTYPES
          arrays = [np.array(np.random.rand(10), dtype=d) for d in dtypes]
      else:
          # fail clearly instead of hitting an unbound ``dtypes`` below
          raise ValueError(f"unexpected typ: {typ!r}")

      for d, a in zip(dtypes, arrays):
          assert a.dtype == d

      # Build a new dict rather than updating ``ad``: pytest hands the same
      # parameter object to the test without copying it.
      data = {**ad, **dict(zip(dtypes, arrays))}
      df = DataFrame(data)

      for d in MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES:
          if d in df:
              assert df.dtypes[d] == d
  315. def test_constructor_complex_dtypes(self):
  316. # GH10952
  317. a = np.random.rand(10).astype(np.complex64)
  318. b = np.random.rand(10).astype(np.complex128)
  319. df = DataFrame({"a": a, "b": b})
  320. assert a.dtype == df.a.dtype
  321. assert b.dtype == df.b.dtype
  322. def test_constructor_dtype_str_na_values(self, string_dtype):
  323. # https://github.com/pandas-dev/pandas/issues/21083
  324. df = DataFrame({"A": ["x", None]}, dtype=string_dtype)
  325. result = df.isna()
  326. expected = DataFrame({"A": [False, True]})
  327. tm.assert_frame_equal(result, expected)
  328. assert df.iloc[1, 0] is None
  329. df = DataFrame({"A": ["x", np.nan]}, dtype=string_dtype)
  330. assert np.isnan(df.iloc[1, 0])
  331. def test_constructor_rec(self, float_frame):
  332. rec = float_frame.to_records(index=False)
  333. rec.dtype.names = list(rec.dtype.names)[::-1]
  334. index = float_frame.index
  335. df = DataFrame(rec)
  336. tm.assert_index_equal(df.columns, Index(rec.dtype.names))
  337. df2 = DataFrame(rec, index=index)
  338. tm.assert_index_equal(df2.columns, Index(rec.dtype.names))
  339. tm.assert_index_equal(df2.index, index)
  340. # case with columns != the ones we would infer from the data
  341. rng = np.arange(len(rec))[::-1]
  342. df3 = DataFrame(rec, index=rng, columns=["C", "B"])
  343. expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"])
  344. tm.assert_frame_equal(df3, expected)
  345. def test_constructor_bool(self):
  346. df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)})
  347. assert df.values.dtype == np.bool_
  348. def test_constructor_overflow_int64(self):
  349. # see gh-14881
  350. values = np.array([2**64 - i for i in range(1, 10)], dtype=np.uint64)
  351. result = DataFrame({"a": values})
  352. assert result["a"].dtype == np.uint64
  353. # see gh-2355
  354. data_scores = [
  355. (6311132704823138710, 273),
  356. (2685045978526272070, 23),
  357. (8921811264899370420, 45),
  358. (17019687244989530680, 270),
  359. (9930107427299601010, 273),
  360. ]
  361. dtype = [("uid", "u8"), ("score", "u8")]
  362. data = np.zeros((len(data_scores),), dtype=dtype)
  363. data[:] = data_scores
  364. df_crawls = DataFrame(data)
  365. assert df_crawls["uid"].dtype == np.uint64
  @pytest.mark.parametrize(
      "values",
      [
          np.array([2**64], dtype=object),
          np.array([2**65]),
          [2**64 + 1],
          np.array([-(2**63) - 4], dtype=object),
          np.array([-(2**64) - 1]),
          [-(2**65) - 2],
      ],
  )
  def test_constructor_int_overflow(self, values):
      """Integers outside the 64-bit range are kept exactly in an object column."""
      # see gh-18584
      first = values[0]
      column = DataFrame(values)[0]
      assert column.dtype == object
      assert column[0] == first
  @pytest.mark.parametrize(
      "values",
      [
          np.array([1], dtype=np.uint16),
          np.array([1], dtype=np.uint32),
          np.array([1], dtype=np.uint64),
          [np.uint16(1)],
          [np.uint32(1)],
          [np.uint64(1)],
      ],
  )
  def test_constructor_numpy_uints(self, values):
      """NumPy unsigned integers keep their exact dtype in the resulting column."""
      # GH#47294
      first = values[0]
      column = DataFrame(values)[0]
      assert column.dtype == first.dtype
      assert column[0] == first
  400. def test_constructor_ordereddict(self):
  401. nitems = 100
  402. nums = list(range(nitems))
  403. random.shuffle(nums)
  404. expected = [f"A{i:d}" for i in nums]
  405. df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems)))
  406. assert expected == list(df.columns)
  def test_constructor_dict(self):
      """Dict-of-Series construction aligns on the index and honours ``columns``."""
      full = tm.makeTimeSeries(nper=30)
      # test expects index shifted by 5
      short = tm.makeTimeSeries(nper=30)[5:]
      by_name = {"col1": full, "col2": short}

      frame = DataFrame(by_name)

      # col2 is padded with NaN
      assert len(full) == 30
      assert len(short) == 25

      tm.assert_series_equal(frame["col1"], full.rename("col1"))

      padded = np.concatenate([[np.nan] * 5, short.values])
      exp = Series(padded, index=full.index, name="col2")
      tm.assert_series_equal(exp, frame["col2"])

      # only requested columns are kept; unknown ones are all-NA
      frame = DataFrame(by_name, columns=["col2", "col3", "col4"])
      assert len(frame) == len(short)
      assert "col1" not in frame
      assert isna(frame["col3"]).all()

      # Corner cases
      assert len(DataFrame()) == 0

      # mix dict and array, wrong size - no spec for which error should raise
      # first
      msg = "Mixing dicts with non-Series may lead to ambiguous ordering."
      with pytest.raises(ValueError, match=msg):
          DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]})
  436. def test_constructor_dict_length1(self):
  437. # Length-one dict micro-optimization
  438. frame = DataFrame({"A": {"1": 1, "2": 2}})
  439. tm.assert_index_equal(frame.index, Index(["1", "2"]))
  440. def test_constructor_dict_with_index(self):
  441. # empty dict plus index
  442. idx = Index([0, 1, 2])
  443. frame = DataFrame({}, index=idx)
  444. assert frame.index is idx
  445. def test_constructor_dict_with_index_and_columns(self):
  446. # empty dict with index and columns
  447. idx = Index([0, 1, 2])
  448. frame = DataFrame({}, index=idx, columns=idx)
  449. assert frame.index is idx
  450. assert frame.columns is idx
  451. assert len(frame._series) == 3
  452. def test_constructor_dict_of_empty_lists(self):
  453. # with dict of empty list and Series
  454. frame = DataFrame({"A": [], "B": []}, columns=["A", "B"])
  455. tm.assert_index_equal(frame.index, RangeIndex(0), exact=True)
  456. def test_constructor_dict_with_none(self):
  457. # GH 14381
  458. # Dict with None value
  459. frame_none = DataFrame({"a": None}, index=[0])
  460. frame_none_list = DataFrame({"a": [None]}, index=[0])
  461. assert frame_none._get_value(0, "a") is None
  462. assert frame_none_list._get_value(0, "a") is None
  463. tm.assert_frame_equal(frame_none, frame_none_list)
  def test_constructor_dict_errors(self):
      """All-scalar dict values without an index raise, with or without ``columns``."""
      # GH10856
      # dict with scalar values should raise error, even if columns passed
      msg = "If using all scalar values, you must pass an index"
      for extra_kwargs in ({}, {"columns": ["a"]}):
          with pytest.raises(ValueError, match=msg):
              DataFrame({"a": 0.7}, **extra_kwargs)
  @pytest.mark.parametrize("scalar", [2, np.nan, None, "D"])
  def test_constructor_invalid_items_unused(self, scalar):
      """A scalar dict value is tolerated when its key is not among ``columns``."""
      # No error if invalid (scalar) value is in fact not used:
      built = DataFrame({"a": scalar}, columns=["b"])
      tm.assert_frame_equal(built, DataFrame(columns=["b"]))
  @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
  def test_constructor_dict_nan_key(self, value):
      """Missing-like values work as dict keys and index labels."""
      # GH 18455
      cols = [1, value, 3]
      idx = ["a", value]
      column_values = [[0, 3], [1, 4], [2, 5]]
      data = {
          col: Series(vals, index=idx) for col, vals in zip(cols, column_values)
      }
      expected = DataFrame(
          np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols
      )

      # neither index nor columns given: sort both axes before comparing
      result = DataFrame(data).sort_values(1).sort_values("a", axis=1)
      tm.assert_frame_equal(result, expected)

      # index given: only the columns need sorting
      result = DataFrame(data, index=idx).sort_values("a", axis=1)
      tm.assert_frame_equal(result, expected)

      # both axes given
      result = DataFrame(data, index=idx, columns=cols)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("value", [np.nan, None, float("nan")])
  def test_constructor_dict_nan_tuple_key(self, value):
      """Tuples containing missing-like values work as dict keys and index labels."""
      # GH 18455
      cols = Index([(11, 21), (value, 22), (13, value)])
      idx = Index([("a", value), (value, 2)])
      column_values = [[0, 3], [1, 4], [2, 5]]
      data = {
          cols[pos]: Series(vals, index=idx)
          for pos, vals in enumerate(column_values)
      }
      expected = DataFrame(
          np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols
      )
      row_key = ("a", value)

      # neither index nor columns given: sort both axes before comparing
      result = DataFrame(data).sort_values((11, 21)).sort_values(row_key, axis=1)
      tm.assert_frame_equal(result, expected)

      # index given: only the columns need sorting
      result = DataFrame(data, index=idx).sort_values(row_key, axis=1)
      tm.assert_frame_equal(result, expected)

      # both axes given
      result = DataFrame(data, index=idx, columns=cols)
      tm.assert_frame_equal(result, expected)
  510. def test_constructor_dict_order_insertion(self):
  511. datetime_series = tm.makeTimeSeries(nper=30)
  512. datetime_series_short = tm.makeTimeSeries(nper=25)
  513. # GH19018
  514. # initialization ordering: by insertion order if python>= 3.6
  515. d = {"b": datetime_series_short, "a": datetime_series}
  516. frame = DataFrame(data=d)
  517. expected = DataFrame(data=d, columns=list("ba"))
  518. tm.assert_frame_equal(frame, expected)
  519. def test_constructor_dict_nan_key_and_columns(self):
  520. # GH 16894
  521. result = DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2])
  522. expected = DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2])
  523. tm.assert_frame_equal(result, expected)
  524. def test_constructor_multi_index(self):
  525. # GH 4078
  526. # construction error with mi and all-nan frame
  527. tuples = [(2, 3), (3, 3), (3, 3)]
  528. mi = MultiIndex.from_tuples(tuples)
  529. df = DataFrame(index=mi, columns=mi)
  530. assert isna(df).values.ravel().all()
  531. tuples = [(3, 3), (2, 3), (3, 3)]
  532. mi = MultiIndex.from_tuples(tuples)
  533. df = DataFrame(index=mi, columns=mi)
  534. assert isna(df).values.ravel().all()
  535. def test_constructor_2d_index(self):
  536. # GH 25416
  537. # handling of 2d index in construction
  538. df = DataFrame([[1]], columns=[[1]], index=[1, 2])
  539. expected = DataFrame(
  540. [1, 1],
  541. index=Index([1, 2], dtype="int64"),
  542. columns=MultiIndex(levels=[[1]], codes=[[0]]),
  543. )
  544. tm.assert_frame_equal(df, expected)
  545. df = DataFrame([[1]], columns=[[1]], index=[[1, 2]])
  546. expected = DataFrame(
  547. [1, 1],
  548. index=MultiIndex(levels=[[1, 2]], codes=[[0, 1]]),
  549. columns=MultiIndex(levels=[[1]], codes=[[0]]),
  550. )
  551. tm.assert_frame_equal(df, expected)
  def test_constructor_error_msgs(self):
      """Each invalid constructor call raises ``ValueError`` with its specific message."""
      abc = ["A", "B", "C"]
      # (expected message pattern, zero-argument callable that must raise)
      cases = [
          # passing an empty array with columns specified.
          (
              "Empty data passed with indices specified.",
              lambda: DataFrame(np.empty(0), columns=list("abc")),
          ),
          # mix dict and array, wrong size
          (
              "Mixing dicts with non-Series may lead to ambiguous ordering.",
              lambda: DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}),
          ),
          # wrong size ndarray, GH 3105
          (
              r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)",
              lambda: DataFrame(
                  np.arange(12).reshape((4, 3)),
                  columns=["foo", "bar", "baz"],
                  index=date_range("2000-01-01", periods=3),
              ),
          ),
          (
              r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)",
              lambda: DataFrame(
                  index=[0], columns=range(0, 4), data=np.array([[4, 5, 6]])
              ),
          ),
          (
              r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)",
              lambda: DataFrame(
                  index=[0], columns=range(0, 4), data=np.array([4, 5, 6])
              ),
          ),
          # higher dim raise exception
          (
              "Must pass 2-d input",
              lambda: DataFrame(np.zeros((3, 3, 3)), columns=abc, index=[1]),
          ),
          # wrong size axis labels
          (
              r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)",
              lambda: DataFrame(np.random.rand(2, 3), columns=abc, index=[1]),
          ),
          (
              r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)",
              lambda: DataFrame(
                  np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]
              ),
          ),
          # gh-26429
          (
              "2 columns passed, passed data had 10 columns",
              lambda: DataFrame(
                  (range(10), range(10, 20)), columns=("ones", "twos")
              ),
          ),
          (
              "If using all scalar values, you must pass an index",
              lambda: DataFrame({"a": False, "b": True}),
          ),
      ]

      for msg, build in cases:
          with pytest.raises(ValueError, match=msg):
              build()
  594. def test_constructor_subclass_dict(self, dict_subclass):
  595. # Test for passing dict subclass to constructor
  596. data = {
  597. "col1": dict_subclass((x, 10.0 * x) for x in range(10)),
  598. "col2": dict_subclass((x, 20.0 * x) for x in range(10)),
  599. }
  600. df = DataFrame(data)
  601. refdf = DataFrame({col: dict(val.items()) for col, val in data.items()})
  602. tm.assert_frame_equal(refdf, df)
  603. data = dict_subclass(data.items())
  604. df = DataFrame(data)
  605. tm.assert_frame_equal(refdf, df)
  606. def test_constructor_defaultdict(self, float_frame):
  607. # try with defaultdict
  608. data = {}
  609. float_frame.loc[: float_frame.index[10], "B"] = np.nan
  610. for k, v in float_frame.items():
  611. dct = defaultdict(dict)
  612. dct.update(v.to_dict())
  613. data[k] = dct
  614. frame = DataFrame(data)
  615. expected = frame.reindex(index=float_frame.index)
  616. tm.assert_frame_equal(float_frame, expected)
  617. def test_constructor_dict_block(self):
  618. expected = np.array([[4.0, 3.0, 2.0, 1.0]])
  619. df = DataFrame(
  620. {"d": [4.0], "c": [3.0], "b": [2.0], "a": [1.0]},
  621. columns=["d", "c", "b", "a"],
  622. )
  623. tm.assert_numpy_array_equal(df.values, expected)
  624. def test_constructor_dict_cast(self):
  625. # cast float tests
  626. test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
  627. frame = DataFrame(test_data, dtype=float)
  628. assert len(frame) == 3
  629. assert frame["B"].dtype == np.float64
  630. assert frame["A"].dtype == np.float64
  631. frame = DataFrame(test_data)
  632. assert len(frame) == 3
  633. assert frame["B"].dtype == np.object_
  634. assert frame["A"].dtype == np.float64
  def test_constructor_dict_cast2(self):
      """Non-numeric strings cannot be cast to float and raise ``ValueError``."""
      # can't cast to float
      strings_by_row = dict(zip(range(20), tm.makeStringIndex(20)))
      floats_by_row = dict(zip(range(15), np.random.randn(15)))
      nested = {"A": strings_by_row, "B": floats_by_row}
      with pytest.raises(ValueError, match="could not convert string"):
          DataFrame(nested, dtype=float)
  643. def test_constructor_dict_dont_upcast(self):
  644. d = {"Col1": {"Row1": "A String", "Row2": np.nan}}
  645. df = DataFrame(d)
  646. assert isinstance(df["Col1"]["Row2"], float)
  647. def test_constructor_dict_dont_upcast2(self):
  648. dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2])
  649. assert isinstance(dm[1][1], int)
  650. def test_constructor_dict_of_tuples(self):
  651. # GH #1491
  652. data = {"a": (1, 2, 3), "b": (4, 5, 6)}
  653. result = DataFrame(data)
  654. expected = DataFrame({k: list(v) for k, v in data.items()})
  655. tm.assert_frame_equal(result, expected, check_dtype=False)
  656. def test_constructor_dict_of_ranges(self):
  657. # GH 26356
  658. data = {"a": range(3), "b": range(3, 6)}
  659. result = DataFrame(data)
  660. expected = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
  661. tm.assert_frame_equal(result, expected)
  662. def test_constructor_dict_of_iterators(self):
  663. # GH 26349
  664. data = {"a": iter(range(3)), "b": reversed(range(3))}
  665. result = DataFrame(data)
  666. expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]})
  667. tm.assert_frame_equal(result, expected)
  668. def test_constructor_dict_of_generators(self):
  669. # GH 26349
  670. data = {"a": (i for i in (range(3))), "b": (i for i in reversed(range(3)))}
  671. result = DataFrame(data)
  672. expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]})
  673. tm.assert_frame_equal(result, expected)
  674. def test_constructor_dict_multiindex(self):
  675. d = {
  676. ("a", "a"): {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2},
  677. ("b", "a"): {("i", "i"): 6, ("i", "j"): 5, ("j", "i"): 4},
  678. ("b", "c"): {("i", "i"): 7, ("i", "j"): 8, ("j", "i"): 9},
  679. }
  680. _d = sorted(d.items())
  681. df = DataFrame(d)
  682. expected = DataFrame(
  683. [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d])
  684. ).T
  685. expected.index = MultiIndex.from_tuples(expected.index)
  686. tm.assert_frame_equal(
  687. df,
  688. expected,
  689. )
  690. d["z"] = {"y": 123.0, ("i", "i"): 111, ("i", "j"): 111, ("j", "i"): 111}
  691. _d.insert(0, ("z", d["z"]))
  692. expected = DataFrame(
  693. [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False)
  694. ).T
  695. expected.index = Index(expected.index, tupleize_cols=False)
  696. df = DataFrame(d)
  697. df = df.reindex(columns=expected.columns, index=expected.index)
  698. tm.assert_frame_equal(df, expected)
  699. def test_constructor_dict_datetime64_index(self):
  700. # GH 10160
  701. dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
  702. def create_data(constructor):
  703. return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)}
  704. data_datetime64 = create_data(np.datetime64)
  705. data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d"))
  706. data_Timestamp = create_data(Timestamp)
  707. expected = DataFrame(
  708. [
  709. {0: 0, 1: None, 2: None, 3: None},
  710. {0: None, 1: 2, 2: None, 3: None},
  711. {0: None, 1: None, 2: 4, 3: None},
  712. {0: None, 1: None, 2: None, 3: 6},
  713. ],
  714. index=[Timestamp(dt) for dt in dates_as_str],
  715. )
  716. result_datetime64 = DataFrame(data_datetime64)
  717. result_datetime = DataFrame(data_datetime)
  718. result_Timestamp = DataFrame(data_Timestamp)
  719. tm.assert_frame_equal(result_datetime64, expected)
  720. tm.assert_frame_equal(result_datetime, expected)
  721. tm.assert_frame_equal(result_Timestamp, expected)
  @pytest.mark.parametrize(
      "klass,name",
      [
          (lambda x: np.timedelta64(x, "D"), "timedelta64"),
          (lambda x: timedelta(days=x), "pytimedelta"),
          (lambda x: Timedelta(x, "D"), "Timedelta[ns]"),
          (lambda x: Timedelta(x, "D").as_unit("s"), "Timedelta[s]"),
      ],
  )
  def test_constructor_dict_timedelta64_index(self, klass, name):
      """Timedelta-like inner keys of any flavour give the same frame."""
      # GH 10160
      td_as_int = [1, 2, 3, 4]
      nested = {i: {klass(s): 2 * i} for i, s in enumerate(td_as_int)}

      rows = [
          {0: 0, 1: None, 2: None, 3: None},
          {0: None, 1: 2, 2: None, 3: None},
          {0: None, 1: None, 2: 4, 3: None},
          {0: None, 1: None, 2: None, 3: 6},
      ]
      labels = [Timedelta(td, "D") for td in td_as_int]
      expected = DataFrame(rows, index=labels)

      tm.assert_frame_equal(DataFrame(nested), expected)
  746. def test_constructor_period_dict(self):
  747. # PeriodIndex
  748. a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M")
  749. b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D")
  750. df = DataFrame({"a": a, "b": b})
  751. assert df["a"].dtype == a.dtype
  752. assert df["b"].dtype == b.dtype
  753. # list of periods
  754. df = DataFrame({"a": a.astype(object).tolist(), "b": b.astype(object).tolist()})
  755. assert df["a"].dtype == a.dtype
  756. assert df["b"].dtype == b.dtype
  757. def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype):
  758. ea_scalar, ea_dtype = ea_scalar_and_dtype
  759. df = DataFrame({"a": ea_scalar}, index=[0])
  760. assert df["a"].dtype == ea_dtype
  761. expected = DataFrame(index=[0], columns=["a"], data=ea_scalar)
  762. tm.assert_frame_equal(df, expected)
  @pytest.mark.parametrize(
      "data,dtype",
      [
          (Period("2020-01"), PeriodDtype("M")),
          (Interval(left=0, right=5), IntervalDtype("int64", "right")),
          (
              Timestamp("2011-01-01", tz="US/Eastern"),
              DatetimeTZDtype(tz="US/Eastern"),
          ),
      ],
  )
  def test_constructor_extension_scalar_data(self, data, dtype):
      """An extension scalar broadcast over index and columns keeps its dtype."""
      # GH 34832
      frame = DataFrame(index=[0, 1], columns=["a", "b"], data=data)
      for column in ("a", "b"):
          assert frame[column].dtype == dtype

      repeated = pd.array([data] * 2, dtype=dtype)
      tm.assert_frame_equal(frame, DataFrame({"a": repeated, "b": repeated}))
  782. def test_nested_dict_frame_constructor(self):
  783. rng = pd.period_range("1/1/2000", periods=5)
  784. df = DataFrame(np.random.randn(10, 5), columns=rng)
  785. data = {}
  786. for col in df.columns:
  787. for row in df.index:
  788. data.setdefault(col, {})[row] = df._get_value(row, col)
  789. result = DataFrame(data, columns=rng)
  790. tm.assert_frame_equal(result, df)
  791. data = {}
  792. for col in df.columns:
  793. for row in df.index:
  794. data.setdefault(row, {})[col] = df._get_value(row, col)
  795. result = DataFrame(data, index=rng).T
  796. tm.assert_frame_equal(result, df)
  def _check_basic_constructor(self, empty):
      """
      Shared constructor checks for a NumPy-style array factory.

      ``empty`` is called as ``empty(shape)`` or ``empty(shape, dtype=float)``
      and the result is handed to ``DataFrame``; the main input has shape
      (2, 3).  For any factory other than ``np.ones`` the helper returns
      right after checking that the int64 cast raises.
      """
      cols = ["A", "B", "C"]
      mat = empty((2, 3), dtype=float)

      # 2-D input
      frame = DataFrame(mat, columns=cols, index=[1, 2])
      assert len(frame.index) == 2
      assert len(frame.columns) == 3

      # 1-D input
      frame = DataFrame(empty((3,)), columns=["A"], index=[1, 2, 3])
      assert len(frame.index) == 3
      assert len(frame.columns) == 1

      if empty is not np.ones:
          msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
          with pytest.raises(IntCastingNaNError, match=msg):
              DataFrame(mat, columns=cols, index=[1, 2], dtype=np.int64)
          return

      frame = DataFrame(mat, columns=cols, index=[1, 2], dtype=np.int64)
      assert frame.values.dtype == np.int64

      # wrong size axis labels
      msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
      with pytest.raises(ValueError, match=msg):
          DataFrame(mat, columns=cols, index=[1])
      msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)"
      with pytest.raises(ValueError, match=msg):
          DataFrame(mat, columns=["A", "B"], index=[1, 2])

      # higher dim raise exception
      with pytest.raises(ValueError, match="Must pass 2-d input"):
          DataFrame(empty((3, 3, 3)), columns=cols, index=[1])

      # automatic labeling
      default_rows = Index(range(2))
      default_cols = Index(range(3))

      frame = DataFrame(mat)
      tm.assert_index_equal(frame.index, default_rows, exact=True)
      tm.assert_index_equal(frame.columns, default_cols, exact=True)

      frame = DataFrame(mat, index=[1, 2])
      tm.assert_index_equal(frame.columns, default_cols, exact=True)

      frame = DataFrame(mat, columns=cols)
      tm.assert_index_equal(frame.index, default_rows, exact=True)

      # 0-length axis
      assert len(DataFrame(empty((0, 3))).index) == 0
      assert len(DataFrame(empty((3, 0))).columns) == 0
  def test_constructor_ndarray(self):
      """Run the shared array checks for ``np.ones``, then a 1-D list of str."""
      self._check_basic_constructor(np.ones)

      str_frame = DataFrame(["foo", "bar"], columns=["A"], index=[0, 1])
      assert len(str_frame) == 2
  def test_constructor_maskedarray(self):
      """Float masked arrays: shared checks, unmasked values, fully masked."""
      self._check_basic_constructor(ma.masked_all)
      labels = {"columns": ["A", "B", "C"], "index": [1, 2]}

      # Check non-masked values
      partly_masked = ma.masked_all((2, 3), dtype=float)
      partly_masked[0, 0] = 1.0
      partly_masked[1, 2] = 2.0
      frame = DataFrame(partly_masked, **labels)
      assert frame["A"][1] == 1.0
      assert frame["C"][2] == 2.0

      # Fully masked input: no element of the frame compares equal to itself
      fully_masked = ma.masked_all((2, 3), dtype=float)
      frame = DataFrame(fully_masked, **labels)
      self_equal = np.asarray(frame == frame)
      assert np.all(~self_equal)
  def test_constructor_maskedarray_nonfloat(self):
      """Masked int, datetime64 and bool arrays: fully masked, cast, partly set."""
      labels = {"columns": ["A", "B", "C"], "index": [1, 2]}

      # ---- masked int promoted to float ----
      masked_int = ma.masked_all((2, 3), dtype=int)

      # 2-D input
      frame = DataFrame(masked_int, **labels)
      assert len(frame.index) == 2
      assert len(frame.columns) == 3
      assert np.all(~np.asarray(frame == frame))

      # cast type
      frame = DataFrame(masked_int, dtype=np.float64, **labels)
      assert frame.values.dtype == np.float64

      # Check non-masked values
      partly_int = ma.copy(masked_int)
      partly_int[0, 0] = 1
      partly_int[1, 2] = 2
      frame = DataFrame(partly_int, **labels)
      assert frame["A"][1] == 1
      assert frame["C"][2] == 2

      # ---- masked np.datetime64 stays (use NaT as null) ----
      masked_dt = ma.masked_all((2, 3), dtype="M8[ns]")

      # 2-D input
      frame = DataFrame(masked_dt, **labels)
      assert len(frame.index) == 2
      assert len(frame.columns) == 3
      assert isna(frame).values.all()

      # cast type
      msg = r"datetime64\[ns\] values and dtype=int64 is not supported"
      with pytest.raises(TypeError, match=msg), warnings.catch_warnings():
          warnings.filterwarnings(
              "ignore",
              category=DeprecationWarning,
              message="elementwise comparison failed",
          )
          DataFrame(masked_dt, dtype=np.int64, **labels)

      # Check non-masked values
      partly_dt = ma.copy(masked_dt)
      partly_dt[0, 0] = 1
      partly_dt[1, 2] = 2
      frame = DataFrame(partly_dt, **labels)
      assert frame["A"].view("i8")[1] == 1
      assert frame["C"].view("i8")[2] == 2

      # ---- masked bool promoted to object ----
      masked_bool = ma.masked_all((2, 3), dtype=bool)

      # 2-D input
      frame = DataFrame(masked_bool, **labels)
      assert len(frame.index) == 2
      assert len(frame.columns) == 3
      assert np.all(~np.asarray(frame == frame))

      # cast type
      frame = DataFrame(masked_bool, dtype=object, **labels)
      assert frame.values.dtype == object

      # Check non-masked values
      partly_bool = ma.copy(masked_bool)
      partly_bool[0, 0] = True
      partly_bool[1, 2] = False
      frame = DataFrame(partly_bool, **labels)
      assert frame["A"][1] is True
      assert frame["C"][2] is False
  918. def test_constructor_maskedarray_hardened(self):
  919. # Check numpy masked arrays with hard masks -- from GH24574
  920. mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask()
  921. result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2])
  922. expected = DataFrame(
  923. {"A": [np.nan, np.nan], "B": [np.nan, np.nan]},
  924. columns=["A", "B"],
  925. index=[1, 2],
  926. dtype=float,
  927. )
  928. tm.assert_frame_equal(result, expected)
  929. # Check case where mask is hard but no data are masked
  930. mat_hard = ma.ones((2, 2), dtype=float).harden_mask()
  931. result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2])
  932. expected = DataFrame(
  933. {"A": [1.0, 1.0], "B": [1.0, 1.0]},
  934. columns=["A", "B"],
  935. index=[1, 2],
  936. dtype=float,
  937. )
  938. tm.assert_frame_equal(result, expected)
  def test_constructor_maskedrecarray_dtype(self):
      """Passing a MaskedRecords array raises TypeError, even with a dtype."""
      record_dtype = [("date", "<f8"), ("price", "<f8")]
      masked = np.ma.array(np.ma.zeros(5, dtype=record_dtype), mask=[False] * 5)
      records = masked.view(mrecords.mrecarray)

      # Support for MaskedRecords deprecated GH#40363
      with pytest.raises(TypeError, match=r"Pass \{name: data\[name\]"):
          DataFrame(records, dtype=int)
  948. def test_constructor_corner_shape(self):
  949. df = DataFrame(index=[])
  950. assert df.values.shape == (0, 0)
  @pytest.mark.parametrize(
      "data, index, columns, dtype, expected",
      [
          (None, list(range(10)), ["a", "b"], object, np.object_),
          (None, None, ["a", "b"], "int64", np.dtype("int64")),
          (None, list(range(10)), ["a", "b"], int, np.dtype("float64")),
          ({}, None, ["foo", "bar"], None, np.object_),
          ({"b": 1}, list(range(10)), list("abc"), int, np.dtype("float64")),
      ],
  )
  def test_constructor_dtype(self, data, index, columns, dtype, expected):
      """Check the dtype of ``.values`` for each data/index/columns/dtype case."""
      frame = DataFrame(data=data, index=index, columns=columns, dtype=dtype)
      values_dtype = frame.values.dtype
      assert values_dtype == expected
  @pytest.mark.parametrize(
      "data,input_dtype,expected_dtype",
      (
          ([True, False, None], "boolean", pd.BooleanDtype),
          ([1.0, 2.0, None], "Float64", pd.Float64Dtype),
          ([1, 2, None], "Int64", pd.Int64Dtype),
          (["a", "b", "c"], "string", pd.StringDtype),
      ),
  )
  def test_constructor_dtype_nullable_extension_arrays(
      self, data, input_dtype, expected_dtype
  ):
      """A nullable dtype given by name yields a column of that extension dtype."""
      column = DataFrame({"a": data}, dtype=input_dtype)["a"]
      assert column.dtype == expected_dtype()
  978. def test_constructor_scalar_inference(self):
  979. data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"}
  980. df = DataFrame(data, index=np.arange(10))
  981. assert df["int"].dtype == np.int64
  982. assert df["bool"].dtype == np.bool_
  983. assert df["float"].dtype == np.float64
  984. assert df["complex"].dtype == np.complex128
  985. assert df["object"].dtype == np.object_
  def test_constructor_arrays_and_scalars(self):
      """A scalar is broadcast next to an array; all-scalar dicts need an index."""
      result = DataFrame({"a": np.random.randn(10), "b": True})
      expected = DataFrame({"a": result["a"].values, "b": [True] * 10})
      tm.assert_frame_equal(result, expected)

      only_scalars = {"a": False, "b": True}
      with pytest.raises(ValueError, match="must pass an index"):
          DataFrame(only_scalars)
  992. def test_constructor_DataFrame(self, float_frame):
  993. df = DataFrame(float_frame)
  994. tm.assert_frame_equal(df, float_frame)
  995. df_casted = DataFrame(float_frame, dtype=np.int64)
  996. assert df_casted.values.dtype == np.int64
  997. def test_constructor_empty_dataframe(self):
  998. # GH 20624
  999. actual = DataFrame(DataFrame(), dtype="object")
  1000. expected = DataFrame([], dtype="object")
  1001. tm.assert_frame_equal(actual, expected)
  def test_constructor_more(self, float_frame):
      """
      Assorted constructor checks.

      Covers 1-D array input, frames built without data, a failing
      object-to-float cast, a round-trip through ``float_frame._series``
      and the common dtype of mixed int/float columns.
      """
      # used to be in test_matrix.py
      arr = np.random.randn(10)
      dm = DataFrame(arr, columns=["A"], index=np.arange(10))
      assert dm.values.ndim == 2

      # zero-length 1-D input still gives 2-D values
      # (the original asserted this twice in a row; once is enough)
      dm = DataFrame(np.random.randn(0))
      assert dm.values.ndim == 2

      # no data specified
      dm = DataFrame(columns=["A", "B"], index=np.arange(10))
      assert dm.values.shape == (10, 2)

      dm = DataFrame(columns=["A", "B"])
      assert dm.values.shape == (0, 2)

      dm = DataFrame(index=np.arange(10))
      assert dm.values.shape == (10, 0)

      # can't cast
      mat = np.array(["foo", "bar"], dtype=object).reshape(2, 1)
      msg = "could not convert string to float: 'foo'"
      with pytest.raises(ValueError, match=msg):
          DataFrame(mat, index=[0, 1], columns=[0], dtype=float)

      dm = DataFrame(DataFrame(float_frame._series))
      tm.assert_frame_equal(dm, float_frame)

      # int cast
      dm = DataFrame(
          {"A": np.ones(10, dtype=int), "B": np.ones(10, dtype=np.float64)},
          index=np.arange(10),
      )
      assert len(dm.columns) == 2
      assert dm.values.dtype == np.float64
  1032. def test_constructor_empty_list(self):
  1033. df = DataFrame([], index=[])
  1034. expected = DataFrame(index=[])
  1035. tm.assert_frame_equal(df, expected)
  1036. # GH 9939
  1037. df = DataFrame([], columns=["A", "B"])
  1038. expected = DataFrame({}, columns=["A", "B"])
  1039. tm.assert_frame_equal(df, expected)
  1040. # Empty generator: list(empty_gen()) == []
  1041. def empty_gen():
  1042. yield from ()
  1043. df = DataFrame(empty_gen(), columns=["A", "B"])
  1044. tm.assert_frame_equal(df, expected)
  1045. def test_constructor_list_of_lists(self):
  1046. # GH #484
  1047. df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"])
  1048. assert is_integer_dtype(df["num"])
  1049. assert df["str"].dtype == np.object_
  1050. # GH 4851
  1051. # list of 0-dim ndarrays
  1052. expected = DataFrame({0: np.arange(10)})
  1053. data = [np.array(x) for x in range(10)]
  1054. result = DataFrame(data)
  1055. tm.assert_frame_equal(result, expected)
  1056. def test_nested_pandasarray_matches_nested_ndarray(self):
  1057. # GH#43986
  1058. ser = Series([1, 2])
  1059. arr = np.array([None, None], dtype=object)
  1060. arr[0] = ser
  1061. arr[1] = ser * 2
  1062. df = DataFrame(arr)
  1063. expected = DataFrame(pd.array(arr))
  1064. tm.assert_frame_equal(df, expected)
  1065. assert df.shape == (2, 1)
  1066. tm.assert_numpy_array_equal(df[0].values, arr)
  1067. def test_constructor_list_like_data_nested_list_column(self):
  1068. # GH 32173
  1069. arrays = [list("abcd"), list("cdef")]
  1070. result = DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
  1071. mi = MultiIndex.from_arrays(arrays)
  1072. expected = DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi)
  1073. tm.assert_frame_equal(result, expected)
  def test_constructor_wrong_length_nested_list_column(self):
      """Nested-list columns with 3 labels for 4-wide data raise ValueError."""
      # GH 32173
      rows = [[1, 2, 3, 4], [4, 5, 6, 7]]
      too_few = [list("abc"), list("cde")]

      with pytest.raises(ValueError, match="3 columns passed, passed data had 4"):
          DataFrame(rows, columns=too_few)
  def test_constructor_unequal_length_nested_list_column(self):
      """Nested-list columns whose levels differ in length raise ValueError."""
      # GH 32173
      rows = [[1, 2, 3, 4], [4, 5, 6, 7]]
      ragged_levels = [list("abcd"), list("cde")]

      # exception raised inside MultiIndex constructor
      with pytest.raises(ValueError, match="all arrays must be same length"):
          DataFrame(rows, columns=ragged_levels)
  @pytest.mark.parametrize(
      "data",
      [
          [[Timestamp("2021-01-01")]],
          [{"x": Timestamp("2021-01-01")}],
          {"x": [Timestamp("2021-01-01")]},
          {"x": Timestamp("2021-01-01")},
      ],
  )
  def test_constructor_one_element_data_list(self, data):
      """One-element data is broadcast over a three-row index."""
      # GH#42810
      stamp = Timestamp("2021-01-01")
      expected = DataFrame({"x": [stamp, stamp, stamp]})

      result = DataFrame(data, index=[0, 1, 2], columns=["x"])
      tm.assert_frame_equal(result, expected)
  1101. def test_constructor_sequence_like(self):
  1102. # GH 3783
  1103. # collections.Sequence like
  1104. class DummyContainer(abc.Sequence):
  1105. def __init__(self, lst) -> None:
  1106. self._lst = lst
  1107. def __getitem__(self, n):
  1108. return self._lst.__getitem__(n)
  1109. def __len__(self) -> int:
  1110. return self._lst.__len__()
  1111. lst_containers = [DummyContainer([1, "a"]), DummyContainer([2, "b"])]
  1112. columns = ["num", "str"]
  1113. result = DataFrame(lst_containers, columns=columns)
  1114. expected = DataFrame([[1, "a"], [2, "b"]], columns=columns)
  1115. tm.assert_frame_equal(result, expected, check_dtype=False)
  1116. def test_constructor_stdlib_array(self):
  1117. # GH 4297
  1118. # support Array
  1119. result = DataFrame({"A": array.array("i", range(10))})
  1120. expected = DataFrame({"A": list(range(10))})
  1121. tm.assert_frame_equal(result, expected, check_dtype=False)
  1122. expected = DataFrame([list(range(10)), list(range(10))])
  1123. result = DataFrame([array.array("i", range(10)), array.array("i", range(10))])
  1124. tm.assert_frame_equal(result, expected, check_dtype=False)
  1125. def test_constructor_range(self):
  1126. # GH26342
  1127. result = DataFrame(range(10))
  1128. expected = DataFrame(list(range(10)))
  1129. tm.assert_frame_equal(result, expected)
  1130. def test_constructor_list_of_ranges(self):
  1131. result = DataFrame([range(10), range(10)])
  1132. expected = DataFrame([list(range(10)), list(range(10))])
  1133. tm.assert_frame_equal(result, expected)
  1134. def test_constructor_iterable(self):
  1135. # GH 21987
  1136. class Iter:
  1137. def __iter__(self) -> Iterator:
  1138. for i in range(10):
  1139. yield [1, 2, 3]
  1140. expected = DataFrame([[1, 2, 3]] * 10)
  1141. result = DataFrame(Iter())
  1142. tm.assert_frame_equal(result, expected)
  1143. def test_constructor_iterator(self):
  1144. result = DataFrame(iter(range(10)))
  1145. expected = DataFrame(list(range(10)))
  1146. tm.assert_frame_equal(result, expected)
  1147. def test_constructor_list_of_iterators(self):
  1148. result = DataFrame([iter(range(10)), iter(range(10))])
  1149. expected = DataFrame([list(range(10)), list(range(10))])
  1150. tm.assert_frame_equal(result, expected)
  1151. def test_constructor_generator(self):
  1152. # related #2305
  1153. gen1 = (i for i in range(10))
  1154. gen2 = (i for i in range(10))
  1155. expected = DataFrame([list(range(10)), list(range(10))])
  1156. result = DataFrame([gen1, gen2])
  1157. tm.assert_frame_equal(result, expected)
  1158. gen = ([i, "a"] for i in range(10))
  1159. result = DataFrame(gen)
  1160. expected = DataFrame({0: range(10), 1: "a"})
  1161. tm.assert_frame_equal(result, expected, check_dtype=False)
  1162. def test_constructor_list_of_dicts(self):
  1163. result = DataFrame([{}])
  1164. expected = DataFrame(index=RangeIndex(1), columns=[])
  1165. tm.assert_frame_equal(result, expected)
  1166. def test_constructor_ordered_dict_nested_preserve_order(self):
  1167. # see gh-18166
  1168. nested1 = OrderedDict([("b", 1), ("a", 2)])
  1169. nested2 = OrderedDict([("b", 2), ("a", 5)])
  1170. data = OrderedDict([("col2", nested1), ("col1", nested2)])
  1171. result = DataFrame(data)
  1172. data = {"col2": [1, 2], "col1": [2, 5]}
  1173. expected = DataFrame(data=data, index=["b", "a"])
  1174. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("dict_type", [dict, OrderedDict])
  def test_constructor_ordered_dict_preserve_order(self, dict_type):
      """Insertion order of the mapping becomes the column order."""
      # see gh-13304
      expected = DataFrame([[2, 1]], columns=["b", "a"])

      # mapping of column -> list of values
      columns_map = dict_type()
      columns_map["b"] = [2]
      columns_map["a"] = [1]
      tm.assert_frame_equal(DataFrame(columns_map), expected)

      # a single record mapping column -> scalar
      record = dict_type()
      record["b"] = 2
      record["a"] = 1
      tm.assert_frame_equal(DataFrame([record]), expected)
  @pytest.mark.parametrize("dict_type", [dict, OrderedDict])
  def test_constructor_ordered_dict_conflicting_orders(self, dict_type):
      """The first record decides the column order for all later records."""
      # the first dict element sets the ordering for the DataFrame,
      # even if there are conflicting orders from subsequent ones
      b_then_a = dict_type()
      b_then_a["b"] = 2
      b_then_a["a"] = 1

      a_then_b = dict_type()
      a_then_b["a"] = 1
      a_then_b["b"] = 2

      plain_record = {"b": 2, "a": 1}

      two_rows = DataFrame([b_then_a, a_then_b])
      tm.assert_frame_equal(two_rows, DataFrame([[2, 1], [2, 1]], columns=["b", "a"]))

      three_rows = DataFrame([b_then_a, a_then_b, plain_record])
      tm.assert_frame_equal(
          three_rows, DataFrame([[2, 1], [2, 1], [2, 1]], columns=["b", "a"])
      )
  1206. def test_constructor_list_of_series_aligned_index(self):
  1207. series = [Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)]
  1208. result = DataFrame(series)
  1209. expected = DataFrame(
  1210. {"b": [0, 1, 2], "a": [0, 1, 2], "c": [0, 1, 2]},
  1211. columns=["b", "a", "c"],
  1212. index=["0", "1", "2"],
  1213. )
  1214. tm.assert_frame_equal(result, expected)
  1215. def test_constructor_list_of_derived_dicts(self):
  1216. class CustomDict(dict):
  1217. pass
  1218. d = {"a": 1.5, "b": 3}
  1219. data_custom = [CustomDict(d)]
  1220. data = [d]
  1221. result_custom = DataFrame(data_custom)
  1222. result = DataFrame(data)
  1223. tm.assert_frame_equal(result, result_custom)
  def test_constructor_ragged(self):
      """Dict values of different lengths raise ValueError."""
      ragged = {"A": np.random.randn(10), "B": np.random.randn(8)}
      msg = "All arrays must be of the same length"
      with pytest.raises(ValueError, match=msg):
          DataFrame(ragged)
  1228. def test_constructor_scalar(self):
  1229. idx = Index(range(3))
  1230. df = DataFrame({"a": 0}, index=idx)
  1231. expected = DataFrame({"a": [0, 0, 0]}, index=idx)
  1232. tm.assert_frame_equal(df, expected, check_dtype=False)
  1233. def test_constructor_Series_copy_bug(self, float_frame):
  1234. df = DataFrame(float_frame["A"], index=float_frame.index, columns=["A"])
  1235. df.copy()
  def test_constructor_mixed_dict_and_Series(self):
      """Mixing dict, Series and list values in one dict input."""
      mixed = {
          "A": {"foo": 1, "bar": 2, "baz": 3},
          "B": Series([4, 3, 2, 1], index=["bar", "qux", "baz", "foo"]),
      }
      assert DataFrame(mixed).index.is_monotonic_increasing

      # ordering ambiguous, raise exception
      with pytest.raises(ValueError, match="ambiguous ordering"):
          DataFrame({"A": ["a", "b"], "B": {"a": "a", "b": "b"}})

      # this is OK though
      labels = ["a", "b"]
      result = DataFrame({"A": ["a", "b"], "B": Series(["a", "b"], index=labels)})
      expected = DataFrame({"A": ["a", "b"], "B": ["a", "b"]}, index=labels)
      tm.assert_frame_equal(result, expected)
  1249. def test_constructor_mixed_type_rows(self):
  1250. # Issue 25075
  1251. data = [[1, 2], (3, 4)]
  1252. result = DataFrame(data)
  1253. expected = DataFrame([[1, 2], [3, 4]])
  1254. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "tuples,lists",
      [
          ((), []),
          # NOTE(review): ``(())`` is the same value as ``()``, so this case
          # repeats the one above.
          ((()), []),
          (((), ()), [(), ()]),
          (((), ()), [[], []]),
          (([], []), [[], []]),
          (([1], [2]), [[1], [2]]),  # GH 32776
          (([1, 2, 3], [4, 5, 6]), [[1, 2, 3], [4, 5, 6]]),
      ],
  )
  def test_constructor_tuple(self, tuples, lists):
      """Tuple input gives the same frame as the matching list input."""
      # GH 25691
      from_tuples = DataFrame(tuples)
      from_lists = DataFrame(lists)
      tm.assert_frame_equal(from_tuples, from_lists)
  1272. def test_constructor_list_of_tuples(self):
  1273. result = DataFrame({"A": [(1, 2), (3, 4)]})
  1274. expected = DataFrame({"A": Series([(1, 2), (3, 4)])})
  1275. tm.assert_frame_equal(result, expected)
  1276. def test_constructor_list_of_namedtuples(self):
  1277. # GH11181
  1278. named_tuple = namedtuple("Pandas", list("ab"))
  1279. tuples = [named_tuple(1, 3), named_tuple(2, 4)]
  1280. expected = DataFrame({"a": [1, 2], "b": [3, 4]})
  1281. result = DataFrame(tuples)
  1282. tm.assert_frame_equal(result, expected)
  1283. # with columns
  1284. expected = DataFrame({"y": [1, 2], "z": [3, 4]})
  1285. result = DataFrame(tuples, columns=["y", "z"])
  1286. tm.assert_frame_equal(result, expected)
  1287. def test_constructor_list_of_dataclasses(self):
  1288. # GH21910
  1289. Point = make_dataclass("Point", [("x", int), ("y", int)])
  1290. data = [Point(0, 3), Point(1, 3)]
  1291. expected = DataFrame({"x": [0, 1], "y": [3, 3]})
  1292. result = DataFrame(data)
  1293. tm.assert_frame_equal(result, expected)
  1294. def test_constructor_list_of_dataclasses_with_varying_types(self):
  1295. # GH21910
  1296. # varying types
  1297. Point = make_dataclass("Point", [("x", int), ("y", int)])
  1298. HLine = make_dataclass("HLine", [("x0", int), ("x1", int), ("y", int)])
  1299. data = [Point(0, 3), HLine(1, 3, 3)]
  1300. expected = DataFrame(
  1301. {"x": [0, np.nan], "y": [3, 3], "x0": [np.nan, 1], "x1": [np.nan, 3]}
  1302. )
  1303. result = DataFrame(data)
  1304. tm.assert_frame_equal(result, expected)
  def test_constructor_list_of_dataclasses_error_thrown(self):
      """A dataclass instance followed by a plain dict raises TypeError."""
      # GH21910
      Point = make_dataclass("Point", [("x", int), ("y", int)])
      mixed_records = [Point(0, 0), {"x": 1, "y": 0}]

      # expect TypeError
      pattern = re.escape("asdict() should be called on dataclass instances")
      with pytest.raises(TypeError, match=pattern):
          DataFrame(mixed_records)
  1312. def test_constructor_list_of_dict_order(self):
  1313. # GH10056
  1314. data = [
  1315. {"First": 1, "Second": 4, "Third": 7, "Fourth": 10},
  1316. {"Second": 5, "First": 2, "Fourth": 11, "Third": 8},
  1317. {"Second": 6, "First": 3, "Fourth": 12, "Third": 9, "YYY": 14, "XXX": 13},
  1318. ]
  1319. expected = DataFrame(
  1320. {
  1321. "First": [1, 2, 3],
  1322. "Second": [4, 5, 6],
  1323. "Third": [7, 8, 9],
  1324. "Fourth": [10, 11, 12],
  1325. "YYY": [None, None, 14],
  1326. "XXX": [None, None, 13],
  1327. }
  1328. )
  1329. result = DataFrame(data)
  1330. tm.assert_frame_equal(result, expected)
  def test_constructor_Series_named(self):
      """How a Series name (or its absence) maps to column labels."""
      named = Series([1, 2, 3], index=["a", "b", "c"], name="x")
      df = DataFrame(named)
      assert df.columns[0] == "x"
      tm.assert_index_equal(df.index, named.index)

      # ndarray like
      arr = np.random.randn(10)
      with_name = Series(arr, name="x")
      tm.assert_frame_equal(DataFrame(with_name), DataFrame({"x": with_name}))

      without_name = Series(arr, index=range(3, 13))
      tm.assert_frame_equal(DataFrame(without_name), DataFrame({0: without_name}))

      msg = r"Shape of passed values is \(10, 1\), indices imply \(10, 2\)"
      with pytest.raises(ValueError, match=msg):
          DataFrame(without_name, columns=[1, 2])

      # #2234
      empty_named = Series([], name="x", dtype=object)
      assert DataFrame(empty_named).columns[0] == "x"

      # series with name and w/o
      s1 = Series(arr, name="x")
      result = DataFrame([s1, arr]).T
      expected = DataFrame({"x": s1, "Unnamed 0": arr}, columns=["x", "Unnamed 0"])
      tm.assert_frame_equal(result, expected)

      # this is a bit non-intuitive here; the series collapse down to arrays
      result = DataFrame([arr, s1]).T
      expected = DataFrame({1: s1, 0: arr}, columns=[0, 1])
      tm.assert_frame_equal(result, expected)
  1362. def test_constructor_Series_named_and_columns(self):
  1363. # GH 9232 validation
  1364. s0 = Series(range(5), name=0)
  1365. s1 = Series(range(5), name=1)
  1366. # matching name and column gives standard frame
  1367. tm.assert_frame_equal(DataFrame(s0, columns=[0]), s0.to_frame())
  1368. tm.assert_frame_equal(DataFrame(s1, columns=[1]), s1.to_frame())
  1369. # non-matching produces empty frame
  1370. assert DataFrame(s0, columns=[1]).empty
  1371. assert DataFrame(s1, columns=[0]).empty
  1372. def test_constructor_Series_differently_indexed(self):
  1373. # name
  1374. s1 = Series([1, 2, 3], index=["a", "b", "c"], name="x")
  1375. # no name
  1376. s2 = Series([1, 2, 3], index=["a", "b", "c"])
  1377. other_index = Index(["a", "b"])
  1378. df1 = DataFrame(s1, index=other_index)
  1379. exp1 = DataFrame(s1.reindex(other_index))
  1380. assert df1.columns[0] == "x"
  1381. tm.assert_frame_equal(df1, exp1)
  1382. df2 = DataFrame(s2, index=other_index)
  1383. exp2 = DataFrame(s2.reindex(other_index))
  1384. assert df2.columns[0] == 0
  1385. tm.assert_index_equal(df2.index, other_index)
  1386. tm.assert_frame_equal(df2, exp2)
  @pytest.mark.parametrize(
      "name_in1,name_in2,name_in3,name_out",
      [
          ("idx", "idx", "idx", "idx"),
          ("idx", "idx", None, None),
          ("idx", None, None, None),
          ("idx1", "idx2", None, None),
          ("idx1", "idx1", "idx2", None),
          ("idx1", "idx2", "idx3", None),
          (None, None, None, None),
      ],
  )
  def test_constructor_index_names(self, name_in1, name_in2, name_in3, name_out):
      """Name of the combined index for three Series with named indexes."""
      # GH13475
      index_labels = (["a", "b", "c"], ["b", "c", "d"], ["c", "d", "e"])
      index_names = (name_in1, name_in2, name_in3)

      series = {}
      for col, labels, idx_name in zip("xyz", index_labels, index_names):
          series[col] = Series([0, 1, 2], index=Index(labels, name=idx_name))
      result = DataFrame(series)

      # each Series is shifted by one label, so the gaps are NaN
      missing = np.nan
      expected = DataFrame(
          {
              "x": [0, 1, 2, missing, missing],
              "y": [missing, 0, 1, 2, missing],
              "z": [missing, missing, 0, 1, 2],
          },
          index=Index(["a", "b", "c", "d", "e"], name=name_out),
      )
      tm.assert_frame_equal(result, expected)
  1420. def test_constructor_manager_resize(self, float_frame):
  1421. index = list(float_frame.index[:5])
  1422. columns = list(float_frame.columns[:3])
  1423. result = DataFrame(float_frame._mgr, index=index, columns=columns)
  1424. tm.assert_index_equal(result.index, Index(index))
  1425. tm.assert_index_equal(result.columns, Index(columns))
  def test_constructor_mix_series_nonseries(self, float_frame):
      """A Series and a same-length list can be mixed; a shorter list raises."""
      col_a = float_frame["A"]
      col_b_values = list(float_frame["B"])

      result = DataFrame({"A": col_a, "B": col_b_values}, columns=["A", "B"])
      tm.assert_frame_equal(result, float_frame.loc[:, ["A", "B"]])

      with pytest.raises(ValueError, match="does not match index length"):
          DataFrame({"A": float_frame["A"], "B": list(float_frame["B"])[:-2]})
  def test_constructor_miscast_na_int_dtype(self):
      """Data containing NaN cannot be built with an int64 dtype."""
      rows_with_nan = [[np.nan, 1], [1, 0]]
      msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
      with pytest.raises(IntCastingNaNError, match=msg):
          DataFrame(rows_with_nan, dtype=np.int64)
  1438. def test_constructor_column_duplicates(self):
  1439. # it works! #2079
  1440. df = DataFrame([[8, 5]], columns=["a", "a"])
  1441. edf = DataFrame([[8, 5]])
  1442. edf.columns = ["a", "a"]
  1443. tm.assert_frame_equal(df, edf)
  1444. idf = DataFrame.from_records([(8, 5)], columns=["a", "a"])
  1445. tm.assert_frame_equal(idf, edf)
  1446. def test_constructor_empty_with_string_dtype(self):
  1447. # GH 9428
  1448. expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
  1449. df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
  1450. tm.assert_frame_equal(df, expected)
  1451. df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
  1452. tm.assert_frame_equal(df, expected)
  1453. df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.unicode_)
  1454. tm.assert_frame_equal(df, expected)
  1455. df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
  1456. tm.assert_frame_equal(df, expected)
  1457. def test_constructor_empty_with_string_extension(self, nullable_string_dtype):
  1458. # GH 34915
  1459. expected = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
  1460. df = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
  1461. tm.assert_frame_equal(df, expected)
  def test_constructor_single_value(self):
      """A scalar ``data`` needs index and columns and is broadcast to both."""
      # expecting single value upcasting here
      for scalar, dtype_name in ((0.0, "float64"), (0, "int64")):
          df = DataFrame(scalar, index=[1, 2, 3], columns=["a", "b", "c"])
          zeros = np.zeros(df.shape).astype(dtype_name)
          tm.assert_frame_equal(df, DataFrame(zeros, df.index, df.columns))

      df = DataFrame("a", index=[1, 2], columns=["a", "c"])
      expected = DataFrame(
          np.array([["a", "a"], ["a", "a"]], dtype=object),
          index=[1, 2],
          columns=["a", "c"],
      )
      tm.assert_frame_equal(df, expected)

      # a scalar with only one axis given is rejected
      msg = "DataFrame constructor not properly called!"
      with pytest.raises(ValueError, match=msg):
          DataFrame("a", [1, 2])
      with pytest.raises(ValueError, match=msg):
          DataFrame("a", columns=["a", "c"])

      msg = "incompatible data and dtype"
      with pytest.raises(TypeError, match=msg):
          DataFrame("a", [1, 2], ["a", "c"], float)
  1489. def test_constructor_with_datetimes(self):
  1490. intname = np.dtype(np.int_).name
  1491. floatname = np.dtype(np.float_).name
  1492. datetime64name = np.dtype("M8[ns]").name
  1493. objectname = np.dtype(np.object_).name
  1494. # single item
  1495. df = DataFrame(
  1496. {
  1497. "A": 1,
  1498. "B": "foo",
  1499. "C": "bar",
  1500. "D": Timestamp("20010101"),
  1501. "E": datetime(2001, 1, 2, 0, 0),
  1502. },
  1503. index=np.arange(10),
  1504. )
  1505. result = df.dtypes
  1506. expected = Series(
  1507. [np.dtype("int64")]
  1508. + [np.dtype(objectname)] * 2
  1509. + [np.dtype(datetime64name)] * 2,
  1510. index=list("ABCDE"),
  1511. )
  1512. tm.assert_series_equal(result, expected)
  1513. # check with ndarray construction ndim==0 (e.g. we are passing a ndim 0
  1514. # ndarray with a dtype specified)
  1515. df = DataFrame(
  1516. {
  1517. "a": 1.0,
  1518. "b": 2,
  1519. "c": "foo",
  1520. floatname: np.array(1.0, dtype=floatname),
  1521. intname: np.array(1, dtype=intname),
  1522. },
  1523. index=np.arange(10),
  1524. )
  1525. result = df.dtypes
  1526. expected = Series(
  1527. [np.dtype("float64")]
  1528. + [np.dtype("int64")]
  1529. + [np.dtype("object")]
  1530. + [np.dtype("float64")]
  1531. + [np.dtype(intname)],
  1532. index=["a", "b", "c", floatname, intname],
  1533. )
  1534. tm.assert_series_equal(result, expected)
  1535. # check with ndarray construction ndim>0
  1536. df = DataFrame(
  1537. {
  1538. "a": 1.0,
  1539. "b": 2,
  1540. "c": "foo",
  1541. floatname: np.array([1.0] * 10, dtype=floatname),
  1542. intname: np.array([1] * 10, dtype=intname),
  1543. },
  1544. index=np.arange(10),
  1545. )
  1546. result = df.dtypes
  1547. expected = Series(
  1548. [np.dtype("float64")]
  1549. + [np.dtype("int64")]
  1550. + [np.dtype("object")]
  1551. + [np.dtype("float64")]
  1552. + [np.dtype(intname)],
  1553. index=["a", "b", "c", floatname, intname],
  1554. )
  1555. tm.assert_series_equal(result, expected)
  1556. def test_constructor_with_datetimes1(self):
  1557. # GH 2809
  1558. ind = date_range(start="2000-01-01", freq="D", periods=10)
  1559. datetimes = [ts.to_pydatetime() for ts in ind]
  1560. datetime_s = Series(datetimes)
  1561. assert datetime_s.dtype == "M8[ns]"
  1562. def test_constructor_with_datetimes2(self):
  1563. # GH 2810
  1564. ind = date_range(start="2000-01-01", freq="D", periods=10)
  1565. datetimes = [ts.to_pydatetime() for ts in ind]
  1566. dates = [ts.date() for ts in ind]
  1567. df = DataFrame(datetimes, columns=["datetimes"])
  1568. df["dates"] = dates
  1569. result = df.dtypes
  1570. expected = Series(
  1571. [np.dtype("datetime64[ns]"), np.dtype("object")],
  1572. index=["datetimes", "dates"],
  1573. )
  1574. tm.assert_series_equal(result, expected)
  1575. def test_constructor_with_datetimes3(self):
  1576. # GH 7594
  1577. # don't coerce tz-aware
  1578. tz = pytz.timezone("US/Eastern")
  1579. dt = tz.localize(datetime(2012, 1, 1))
  1580. df = DataFrame({"End Date": dt}, index=[0])
  1581. assert df.iat[0, 0] == dt
  1582. tm.assert_series_equal(
  1583. df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"})
  1584. )
  1585. df = DataFrame([{"End Date": dt}])
  1586. assert df.iat[0, 0] == dt
  1587. tm.assert_series_equal(
  1588. df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"})
  1589. )
  1590. def test_constructor_with_datetimes4(self):
  1591. # tz-aware (UTC and other tz's)
  1592. # GH 8411
  1593. dr = date_range("20130101", periods=3)
  1594. df = DataFrame({"value": dr})
  1595. assert df.iat[0, 0].tz is None
  1596. dr = date_range("20130101", periods=3, tz="UTC")
  1597. df = DataFrame({"value": dr})
  1598. assert str(df.iat[0, 0].tz) == "UTC"
  1599. dr = date_range("20130101", periods=3, tz="US/Eastern")
  1600. df = DataFrame({"value": dr})
  1601. assert str(df.iat[0, 0].tz) == "US/Eastern"
  1602. def test_constructor_with_datetimes5(self):
  1603. # GH 7822
  1604. # preserver an index with a tz on dict construction
  1605. i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern")
  1606. expected = DataFrame({"a": i.to_series().reset_index(drop=True)})
  1607. df = DataFrame()
  1608. df["a"] = i
  1609. tm.assert_frame_equal(df, expected)
  1610. df = DataFrame({"a": i})
  1611. tm.assert_frame_equal(df, expected)
  1612. def test_constructor_with_datetimes6(self):
  1613. # multiples
  1614. i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern")
  1615. i_no_tz = date_range("1/1/2011", periods=5, freq="10s")
  1616. df = DataFrame({"a": i, "b": i_no_tz})
  1617. expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz})
  1618. tm.assert_frame_equal(df, expected)
  1619. @pytest.mark.parametrize(
  1620. "arr",
  1621. [
  1622. np.array([None, None, None, None, datetime.now(), None]),
  1623. np.array([None, None, datetime.now(), None]),
  1624. [[np.datetime64("NaT")], [None]],
  1625. [[np.datetime64("NaT")], [pd.NaT]],
  1626. [[None], [np.datetime64("NaT")]],
  1627. [[None], [pd.NaT]],
  1628. [[pd.NaT], [np.datetime64("NaT")]],
  1629. [[pd.NaT], [None]],
  1630. ],
  1631. )
  1632. def test_constructor_datetimes_with_nulls(self, arr):
  1633. # gh-15869, GH#11220
  1634. result = DataFrame(arr).dtypes
  1635. expected = Series([np.dtype("datetime64[ns]")])
  1636. tm.assert_series_equal(result, expected)
  1637. @pytest.mark.parametrize("order", ["K", "A", "C", "F"])
  1638. @pytest.mark.parametrize(
  1639. "unit",
  1640. ["M", "D", "h", "m", "s", "ms", "us", "ns"],
  1641. )
  1642. def test_constructor_datetimes_non_ns(self, order, unit):
  1643. dtype = f"datetime64[{unit}]"
  1644. na = np.array(
  1645. [
  1646. ["2015-01-01", "2015-01-02", "2015-01-03"],
  1647. ["2017-01-01", "2017-01-02", "2017-02-03"],
  1648. ],
  1649. dtype=dtype,
  1650. order=order,
  1651. )
  1652. df = DataFrame(na)
  1653. expected = DataFrame(na.astype("M8[ns]"))
  1654. if unit in ["M", "D", "h", "m"]:
  1655. with pytest.raises(TypeError, match="Cannot cast"):
  1656. expected.astype(dtype)
  1657. # instead the constructor casts to the closest supported reso, i.e. "s"
  1658. expected = expected.astype("datetime64[s]")
  1659. else:
  1660. expected = expected.astype(dtype=dtype)
  1661. tm.assert_frame_equal(df, expected)
  1662. @pytest.mark.parametrize("order", ["K", "A", "C", "F"])
  1663. @pytest.mark.parametrize(
  1664. "unit",
  1665. [
  1666. "D",
  1667. "h",
  1668. "m",
  1669. "s",
  1670. "ms",
  1671. "us",
  1672. "ns",
  1673. ],
  1674. )
  1675. def test_constructor_timedelta_non_ns(self, order, unit):
  1676. dtype = f"timedelta64[{unit}]"
  1677. na = np.array(
  1678. [
  1679. [np.timedelta64(1, "D"), np.timedelta64(2, "D")],
  1680. [np.timedelta64(4, "D"), np.timedelta64(5, "D")],
  1681. ],
  1682. dtype=dtype,
  1683. order=order,
  1684. )
  1685. df = DataFrame(na)
  1686. if unit in ["D", "h", "m"]:
  1687. # we get the nearest supported unit, i.e. "s"
  1688. exp_unit = "s"
  1689. else:
  1690. exp_unit = unit
  1691. exp_dtype = np.dtype(f"m8[{exp_unit}]")
  1692. expected = DataFrame(
  1693. [
  1694. [Timedelta(1, "D"), Timedelta(2, "D")],
  1695. [Timedelta(4, "D"), Timedelta(5, "D")],
  1696. ],
  1697. dtype=exp_dtype,
  1698. )
  1699. # TODO(2.0): ideally we should get the same 'expected' without passing
  1700. # dtype=exp_dtype.
  1701. tm.assert_frame_equal(df, expected)
  1702. def test_constructor_for_list_with_dtypes(self):
  1703. # test list of lists/ndarrays
  1704. df = DataFrame([np.arange(5) for x in range(5)])
  1705. result = df.dtypes
  1706. expected = Series([np.dtype("int")] * 5)
  1707. tm.assert_series_equal(result, expected)
  1708. df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
  1709. result = df.dtypes
  1710. expected = Series([np.dtype("int32")] * 5)
  1711. tm.assert_series_equal(result, expected)
  1712. # overflow issue? (we always expected int64 upcasting here)
  1713. df = DataFrame({"a": [2**31, 2**31 + 1]})
  1714. assert df.dtypes.iloc[0] == np.dtype("int64")
  1715. # GH #2751 (construction with no index specified), make sure we cast to
  1716. # platform values
  1717. df = DataFrame([1, 2])
  1718. assert df.dtypes.iloc[0] == np.dtype("int64")
  1719. df = DataFrame([1.0, 2.0])
  1720. assert df.dtypes.iloc[0] == np.dtype("float64")
  1721. df = DataFrame({"a": [1, 2]})
  1722. assert df.dtypes.iloc[0] == np.dtype("int64")
  1723. df = DataFrame({"a": [1.0, 2.0]})
  1724. assert df.dtypes.iloc[0] == np.dtype("float64")
  1725. df = DataFrame({"a": 1}, index=range(3))
  1726. assert df.dtypes.iloc[0] == np.dtype("int64")
  1727. df = DataFrame({"a": 1.0}, index=range(3))
  1728. assert df.dtypes.iloc[0] == np.dtype("float64")
  1729. # with object list
  1730. df = DataFrame(
  1731. {
  1732. "a": [1, 2, 4, 7],
  1733. "b": [1.2, 2.3, 5.1, 6.3],
  1734. "c": list("abcd"),
  1735. "d": [datetime(2000, 1, 1) for i in range(4)],
  1736. "e": [1.0, 2, 4.0, 7],
  1737. }
  1738. )
  1739. result = df.dtypes
  1740. expected = Series(
  1741. [
  1742. np.dtype("int64"),
  1743. np.dtype("float64"),
  1744. np.dtype("object"),
  1745. np.dtype("datetime64[ns]"),
  1746. np.dtype("float64"),
  1747. ],
  1748. index=list("abcde"),
  1749. )
  1750. tm.assert_series_equal(result, expected)
  1751. def test_constructor_frame_copy(self, float_frame):
  1752. cop = DataFrame(float_frame, copy=True)
  1753. cop["A"] = 5
  1754. assert (cop["A"] == 5).all()
  1755. assert not (float_frame["A"] == 5).all()
  1756. def test_constructor_frame_shallow_copy(self, float_frame):
  1757. # constructing a DataFrame from DataFrame with copy=False should still
  1758. # give a "shallow" copy (share data, not attributes)
  1759. # https://github.com/pandas-dev/pandas/issues/49523
  1760. orig = float_frame.copy()
  1761. cop = DataFrame(float_frame)
  1762. assert cop._mgr is not float_frame._mgr
  1763. # Overwriting index of copy doesn't change original
  1764. cop.index = np.arange(len(cop))
  1765. tm.assert_frame_equal(float_frame, orig)
  1766. def test_constructor_ndarray_copy(
  1767. self, float_frame, using_array_manager, using_copy_on_write
  1768. ):
  1769. if not using_array_manager:
  1770. arr = float_frame.values.copy()
  1771. df = DataFrame(arr)
  1772. arr[5] = 5
  1773. if using_copy_on_write:
  1774. assert not (df.values[5] == 5).all()
  1775. else:
  1776. assert (df.values[5] == 5).all()
  1777. df = DataFrame(arr, copy=True)
  1778. arr[6] = 6
  1779. assert not (df.values[6] == 6).all()
  1780. else:
  1781. arr = float_frame.values.copy()
  1782. # default: copy to ensure contiguous arrays
  1783. df = DataFrame(arr)
  1784. assert df._mgr.arrays[0].flags.c_contiguous
  1785. arr[0, 0] = 100
  1786. assert df.iloc[0, 0] != 100
  1787. # manually specify copy=False
  1788. df = DataFrame(arr, copy=False)
  1789. assert not df._mgr.arrays[0].flags.c_contiguous
  1790. arr[0, 0] = 1000
  1791. assert df.iloc[0, 0] == 1000
  1792. def test_constructor_series_copy(self, float_frame):
  1793. series = float_frame._series
  1794. df = DataFrame({"A": series["A"]}, copy=True)
  1795. # TODO can be replaced with `df.loc[:, "A"] = 5` after deprecation about
  1796. # inplace mutation is enforced
  1797. df.loc[df.index[0] : df.index[-1], "A"] = 5
  1798. assert not (series["A"] == 5).all()
  1799. @pytest.mark.parametrize(
  1800. "df",
  1801. [
  1802. DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]),
  1803. DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan]),
  1804. DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]),
  1805. DataFrame(
  1806. [[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]
  1807. ),
  1808. DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1, 2, 2]),
  1809. ],
  1810. )
  1811. def test_constructor_with_nas(self, df):
  1812. # GH 5016
  1813. # na's in indices
  1814. # GH 21428 (non-unique columns)
  1815. for i in range(len(df.columns)):
  1816. df.iloc[:, i]
  1817. indexer = np.arange(len(df.columns))[isna(df.columns)]
  1818. # No NaN found -> error
  1819. if len(indexer) == 0:
  1820. with pytest.raises(KeyError, match="^nan$"):
  1821. df.loc[:, np.nan]
  1822. # single nan should result in Series
  1823. elif len(indexer) == 1:
  1824. tm.assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan])
  1825. # multiple nans should result in DataFrame
  1826. else:
  1827. tm.assert_frame_equal(df.iloc[:, indexer], df.loc[:, np.nan])
  1828. def test_constructor_lists_to_object_dtype(self):
  1829. # from #1074
  1830. d = DataFrame({"a": [np.nan, False]})
  1831. assert d["a"].dtype == np.object_
  1832. assert not d["a"][1]
  1833. def test_constructor_ndarray_categorical_dtype(self):
  1834. cat = Categorical(["A", "B", "C"])
  1835. arr = np.array(cat).reshape(-1, 1)
  1836. arr = np.broadcast_to(arr, (3, 4))
  1837. result = DataFrame(arr, dtype=cat.dtype)
  1838. expected = DataFrame({0: cat, 1: cat, 2: cat, 3: cat})
  1839. tm.assert_frame_equal(result, expected)
  1840. def test_constructor_categorical(self):
  1841. # GH8626
  1842. # dict creation
  1843. df = DataFrame({"A": list("abc")}, dtype="category")
  1844. expected = Series(list("abc"), dtype="category", name="A")
  1845. tm.assert_series_equal(df["A"], expected)
  1846. # to_frame
  1847. s = Series(list("abc"), dtype="category")
  1848. result = s.to_frame()
  1849. expected = Series(list("abc"), dtype="category", name=0)
  1850. tm.assert_series_equal(result[0], expected)
  1851. result = s.to_frame(name="foo")
  1852. expected = Series(list("abc"), dtype="category", name="foo")
  1853. tm.assert_series_equal(result["foo"], expected)
  1854. # list-like creation
  1855. df = DataFrame(list("abc"), dtype="category")
  1856. expected = Series(list("abc"), dtype="category", name=0)
  1857. tm.assert_series_equal(df[0], expected)
  1858. def test_construct_from_1item_list_of_categorical(self):
  1859. # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove
  1860. # Categorical special case
  1861. # ndim != 1
  1862. cat = Categorical(list("abc"))
  1863. df = DataFrame([cat])
  1864. expected = DataFrame([cat.astype(object)])
  1865. tm.assert_frame_equal(df, expected)
  1866. def test_construct_from_list_of_categoricals(self):
  1867. # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove
  1868. # Categorical special case
  1869. df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))])
  1870. expected = DataFrame([["a", "b", "c"], ["a", "b", "d"]])
  1871. tm.assert_frame_equal(df, expected)
  1872. def test_from_nested_listlike_mixed_types(self):
  1873. # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove
  1874. # Categorical special case
  1875. # mixed
  1876. df = DataFrame([Categorical(list("abc")), list("def")])
  1877. expected = DataFrame([["a", "b", "c"], ["d", "e", "f"]])
  1878. tm.assert_frame_equal(df, expected)
  1879. def test_construct_from_listlikes_mismatched_lengths(self):
  1880. df = DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))])
  1881. expected = DataFrame([list("abc"), list("abdefg")])
  1882. tm.assert_frame_equal(df, expected)
  1883. def test_constructor_categorical_series(self):
  1884. items = [1, 2, 3, 1]
  1885. exp = Series(items).astype("category")
  1886. res = Series(items, dtype="category")
  1887. tm.assert_series_equal(res, exp)
  1888. items = ["a", "b", "c", "a"]
  1889. exp = Series(items).astype("category")
  1890. res = Series(items, dtype="category")
  1891. tm.assert_series_equal(res, exp)
  1892. # insert into frame with different index
  1893. # GH 8076
  1894. index = date_range("20000101", periods=3)
  1895. expected = Series(
  1896. Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"])
  1897. )
  1898. expected.index = index
  1899. expected = DataFrame({"x": expected})
  1900. df = DataFrame({"x": Series(["a", "b", "c"], dtype="category")}, index=index)
  1901. tm.assert_frame_equal(df, expected)
  1902. @pytest.mark.parametrize(
  1903. "dtype",
  1904. tm.ALL_NUMERIC_DTYPES
  1905. + tm.DATETIME64_DTYPES
  1906. + tm.TIMEDELTA64_DTYPES
  1907. + tm.BOOL_DTYPES,
  1908. )
  1909. def test_check_dtype_empty_numeric_column(self, dtype):
  1910. # GH24386: Ensure dtypes are set correctly for an empty DataFrame.
  1911. # Empty DataFrame is generated via dictionary data with non-overlapping columns.
  1912. data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype)
  1913. assert data.b.dtype == dtype
  1914. @pytest.mark.parametrize(
  1915. "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES
  1916. )
  1917. def test_check_dtype_empty_string_column(self, request, dtype, using_array_manager):
  1918. # GH24386: Ensure dtypes are set correctly for an empty DataFrame.
  1919. # Empty DataFrame is generated via dictionary data with non-overlapping columns.
  1920. data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype)
  1921. if using_array_manager and dtype in tm.BYTES_DTYPES:
  1922. # TODO(ArrayManager) astype to bytes dtypes does not yet give object dtype
  1923. td.mark_array_manager_not_yet_implemented(request)
  1924. assert data.b.dtype.name == "object"
  1925. def test_to_frame_with_falsey_names(self):
  1926. # GH 16114
  1927. result = Series(name=0, dtype=object).to_frame().dtypes
  1928. expected = Series({0: object})
  1929. tm.assert_series_equal(result, expected)
  1930. result = DataFrame(Series(name=0, dtype=object)).dtypes
  1931. tm.assert_series_equal(result, expected)
  1932. @pytest.mark.arm_slow
  1933. @pytest.mark.parametrize("dtype", [None, "uint8", "category"])
  1934. def test_constructor_range_dtype(self, dtype):
  1935. expected = DataFrame({"A": [0, 1, 2, 3, 4]}, dtype=dtype or "int64")
  1936. # GH 26342
  1937. result = DataFrame(range(5), columns=["A"], dtype=dtype)
  1938. tm.assert_frame_equal(result, expected)
  1939. # GH 16804
  1940. result = DataFrame({"A": range(5)}, dtype=dtype)
  1941. tm.assert_frame_equal(result, expected)
  1942. def test_frame_from_list_subclass(self):
  1943. # GH21226
  1944. class List(list):
  1945. pass
  1946. expected = DataFrame([[1, 2, 3], [4, 5, 6]])
  1947. result = DataFrame(List([List([1, 2, 3]), List([4, 5, 6])]))
  1948. tm.assert_frame_equal(result, expected)
  1949. @pytest.mark.parametrize(
  1950. "extension_arr",
  1951. [
  1952. Categorical(list("aabbc")),
  1953. SparseArray([1, np.nan, np.nan, np.nan]),
  1954. IntervalArray([Interval(0, 1), Interval(1, 5)]),
  1955. PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")),
  1956. ],
  1957. )
  1958. def test_constructor_with_extension_array(self, extension_arr):
  1959. # GH11363
  1960. expected = DataFrame(Series(extension_arr))
  1961. result = DataFrame(extension_arr)
  1962. tm.assert_frame_equal(result, expected)
  1963. def test_datetime_date_tuple_columns_from_dict(self):
  1964. # GH 10863
  1965. v = date.today()
  1966. tup = v, v
  1967. result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup])
  1968. expected = DataFrame([0, 1, 2], columns=Index(Series([tup])))
  1969. tm.assert_frame_equal(result, expected)
  1970. def test_construct_with_two_categoricalindex_series(self):
  1971. # GH 14600
  1972. s1 = Series([39, 6, 4], index=CategoricalIndex(["female", "male", "unknown"]))
  1973. s2 = Series(
  1974. [2, 152, 2, 242, 150],
  1975. index=CategoricalIndex(["f", "female", "m", "male", "unknown"]),
  1976. )
  1977. result = DataFrame([s1, s2])
  1978. expected = DataFrame(
  1979. np.array([[39, 6, 4, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]]),
  1980. columns=["female", "male", "unknown", "f", "m"],
  1981. )
  1982. tm.assert_frame_equal(result, expected)
  1983. def test_constructor_series_nonexact_categoricalindex(self):
  1984. # GH 42424
  1985. ser = Series(range(0, 100))
  1986. ser1 = cut(ser, 10).value_counts().head(5)
  1987. ser2 = cut(ser, 10).value_counts().tail(5)
  1988. result = DataFrame({"1": ser1, "2": ser2})
  1989. index = CategoricalIndex(
  1990. [
  1991. Interval(-0.099, 9.9, closed="right"),
  1992. Interval(9.9, 19.8, closed="right"),
  1993. Interval(19.8, 29.7, closed="right"),
  1994. Interval(29.7, 39.6, closed="right"),
  1995. Interval(39.6, 49.5, closed="right"),
  1996. Interval(49.5, 59.4, closed="right"),
  1997. Interval(59.4, 69.3, closed="right"),
  1998. Interval(69.3, 79.2, closed="right"),
  1999. Interval(79.2, 89.1, closed="right"),
  2000. Interval(89.1, 99, closed="right"),
  2001. ],
  2002. ordered=True,
  2003. )
  2004. expected = DataFrame(
  2005. {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
  2006. )
  2007. tm.assert_frame_equal(expected, result)
  2008. def test_from_M8_structured(self):
  2009. dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))]
  2010. arr = np.array(dates, dtype=[("Date", "M8[us]"), ("Forecasting", "M8[us]")])
  2011. df = DataFrame(arr)
  2012. assert df["Date"][0] == dates[0][0]
  2013. assert df["Forecasting"][0] == dates[0][1]
  2014. s = Series(arr["Date"])
  2015. assert isinstance(s[0], Timestamp)
  2016. assert s[0] == dates[0][0]
  2017. def test_from_datetime_subclass(self):
  2018. # GH21142 Verify whether Datetime subclasses are also of dtype datetime
  2019. class DatetimeSubclass(datetime):
  2020. pass
  2021. data = DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]})
  2022. assert data.datetime.dtype == "datetime64[ns]"
  2023. def test_with_mismatched_index_length_raises(self):
  2024. # GH#33437
  2025. dti = date_range("2016-01-01", periods=3, tz="US/Pacific")
  2026. msg = "Shape of passed values|Passed arrays should have the same length"
  2027. with pytest.raises(ValueError, match=msg):
  2028. DataFrame(dti, index=range(4))
  2029. def test_frame_ctor_datetime64_column(self):
  2030. rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
  2031. dates = np.asarray(rng)
  2032. df = DataFrame({"A": np.random.randn(len(rng)), "B": dates})
  2033. assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]"))
  2034. def test_dataframe_constructor_infer_multiindex(self):
  2035. index_lists = [["a", "a", "b", "b"], ["x", "y", "x", "y"]]
  2036. multi = DataFrame(
  2037. np.random.randn(4, 4),
  2038. index=[np.array(x) for x in index_lists],
  2039. )
  2040. assert isinstance(multi.index, MultiIndex)
  2041. assert not isinstance(multi.columns, MultiIndex)
  2042. multi = DataFrame(np.random.randn(4, 4), columns=index_lists)
  2043. assert isinstance(multi.columns, MultiIndex)
  2044. @pytest.mark.parametrize(
  2045. "input_vals",
  2046. [
  2047. ([1, 2]),
  2048. (["1", "2"]),
  2049. (list(date_range("1/1/2011", periods=2, freq="H"))),
  2050. (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
  2051. ([Interval(left=0, right=5)]),
  2052. ],
  2053. )
  2054. def test_constructor_list_str(self, input_vals, string_dtype):
  2055. # GH#16605
  2056. # Ensure that data elements are converted to strings when
  2057. # dtype is str, 'str', or 'U'
  2058. result = DataFrame({"A": input_vals}, dtype=string_dtype)
  2059. expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
  2060. tm.assert_frame_equal(result, expected)
  2061. def test_constructor_list_str_na(self, string_dtype):
  2062. result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
  2063. expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
  2064. tm.assert_frame_equal(result, expected)
    @pytest.mark.parametrize("copy", [False, True])
    def test_dict_nocopy(
        self,
        request,
        copy,
        any_numeric_ea_dtype,
        any_numpy_dtype,
        using_array_manager,
        using_copy_on_write,
    ):
        # Build a DataFrame from a dict holding two ndarrays (``a``, ``b``) and
        # one extension array (``c``), then check how the frame's underlying
        # arrays relate to those inputs: right after construction, and again
        # after ``iloc`` assignments. What is checked depends on ``copy`` and on
        # ``using_copy_on_write``.
        if (
            using_array_manager
            and not copy
            and any_numpy_dtype not in tm.STRING_DTYPES + tm.BYTES_DTYPES
        ):
            # TODO(ArrayManager) properly honor copy keyword for dict input
            td.mark_array_manager_not_yet_implemented(request)

        a = np.array([1, 2], dtype=any_numpy_dtype)
        b = np.array([3, 4], dtype=any_numpy_dtype)
        if b.dtype.kind in ["S", "U"]:
            # These get cast, making the checks below more cumbersome
            return

        c = pd.array([1, 2], dtype=any_numeric_ea_dtype)
        # keep the original values of ``c`` to compare against later
        c_orig = c.copy()
        df = DataFrame({"a": a, "b": b, "c": c}, copy=copy)

        def get_base(obj):
            # Return the ``.base`` of the numpy array backing ``obj``.
            if isinstance(obj, np.ndarray):
                return obj.base
            elif isinstance(obj.dtype, np.dtype):
                # i.e. DatetimeArray, TimedeltaArray
                return obj._ndarray.base
            else:
                raise TypeError

        def check_views(c_only: bool = False) -> None:
            # written to work for either BlockManager or ArrayManager

            # Check that the underlying data behind df["c"] is still `c`
            #  after setting with iloc.  Since we don't know which entry in
            #  df._mgr.arrays corresponds to df["c"], we just check that exactly
            #  one of these arrays is `c`.  GH#38939
            assert sum(x is c for x in df._mgr.arrays) == 1
            if c_only:
                # If we ever stop consolidating in setitem_with_indexer,
                #  this will become unnecessary.
                return

            # exactly one numpy-dtype array in the manager has ``a`` as its base
            assert (
                sum(
                    get_base(x) is a
                    for x in df._mgr.arrays
                    if isinstance(x.dtype, np.dtype)
                )
                == 1
            )
            # ... and exactly one has ``b`` as its base
            assert (
                sum(
                    get_base(x) is b
                    for x in df._mgr.arrays
                    if isinstance(x.dtype, np.dtype)
                )
                == 1
            )

        if not copy:
            # constructor preserves views
            check_views()

        # TODO: most of the rest of this test belongs in indexing tests
        df.iloc[0, 0] = 0
        df.iloc[0, 1] = 0
        if not copy:
            check_views(True)

        # FIXME(GH#35417): until GH#35417, iloc.setitem into EA values does not preserve
        #  view, so we have to check in the other direction
        df.iloc[:, 2] = pd.array([45, 46], dtype=c.dtype)
        assert df.dtypes.iloc[2] == c.dtype
        if not copy and not using_copy_on_write:
            check_views(True)

        if copy:
            # the input arrays must still hold their original values
            if a.dtype.kind == "M":
                assert a[0] == a.dtype.type(1, "ns")
                assert b[0] == b.dtype.type(3, "ns")
            else:
                assert a[0] == a.dtype.type(1)
                assert b[0] == b.dtype.type(3)
            # FIXME(GH#35417): enable after GH#35417
            assert c[0] == c_orig[0]  # i.e. df.iloc[0, 2]=45 did *not* update c
        elif not using_copy_on_write:
            # TODO: we can call check_views if we stop consolidating
            #  in setitem_with_indexer
            assert c[0] == 45  # i.e. df.iloc[0, 2]=45 *did* update c
            # TODO: we can check b[0] == 0 if we stop consolidating in
            #  setitem_with_indexer (except for datetimelike?)
  2154. def test_from_series_with_name_with_columns(self):
  2155. # GH 7893
  2156. result = DataFrame(Series(1, name="foo"), columns=["bar"])
  2157. expected = DataFrame(columns=["bar"])
  2158. tm.assert_frame_equal(result, expected)
  2159. def test_nested_list_columns(self):
  2160. # GH 14467
  2161. result = DataFrame(
  2162. [[1, 2, 3], [4, 5, 6]], columns=[["A", "A", "A"], ["a", "b", "c"]]
  2163. )
  2164. expected = DataFrame(
  2165. [[1, 2, 3], [4, 5, 6]],
  2166. columns=MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("A", "c")]),
  2167. )
  2168. tm.assert_frame_equal(result, expected)
  2169. def test_from_2d_object_array_of_periods_or_intervals(self):
  2170. # Period analogue to GH#26825
  2171. pi = pd.period_range("2016-04-05", periods=3)
  2172. data = pi._data.astype(object).reshape(1, -1)
  2173. df = DataFrame(data)
  2174. assert df.shape == (1, 3)
  2175. assert (df.dtypes == pi.dtype).all()
  2176. assert (df == pi).all().all()
  2177. ii = pd.IntervalIndex.from_breaks([3, 4, 5, 6])
  2178. data2 = ii._data.astype(object).reshape(1, -1)
  2179. df2 = DataFrame(data2)
  2180. assert df2.shape == (1, 3)
  2181. assert (df2.dtypes == ii.dtype).all()
  2182. assert (df2 == ii).all().all()
  2183. # mixed
  2184. data3 = np.r_[data, data2, data, data2].T
  2185. df3 = DataFrame(data3)
  2186. expected = DataFrame({0: pi, 1: ii, 2: pi, 3: ii})
  2187. tm.assert_frame_equal(df3, expected)
  2188. @pytest.mark.parametrize(
  2189. "col_a, col_b",
  2190. [
  2191. ([[1], [2]], np.array([[1], [2]])),
  2192. (np.array([[1], [2]]), [[1], [2]]),
  2193. (np.array([[1], [2]]), np.array([[1], [2]])),
  2194. ],
  2195. )
  2196. def test_error_from_2darray(self, col_a, col_b):
  2197. msg = "Per-column arrays must each be 1-dimensional"
  2198. with pytest.raises(ValueError, match=msg):
  2199. DataFrame({"a": col_a, "b": col_b})
  2200. def test_from_dict_with_missing_copy_false(self):
  2201. # GH#45369 filled columns should not be views of one another
  2202. df = DataFrame(index=[1, 2, 3], columns=["a", "b", "c"], copy=False)
  2203. assert not np.shares_memory(df["a"]._values, df["b"]._values)
  2204. df.iloc[0, 0] = 0
  2205. expected = DataFrame(
  2206. {
  2207. "a": [0, np.nan, np.nan],
  2208. "b": [np.nan, np.nan, np.nan],
  2209. "c": [np.nan, np.nan, np.nan],
  2210. },
  2211. index=[1, 2, 3],
  2212. dtype=object,
  2213. )
  2214. tm.assert_frame_equal(df, expected)
  2215. def test_construction_empty_array_multi_column_raises(self):
  2216. # GH#46822
  2217. msg = "Empty data passed with indices specified."
  2218. with pytest.raises(ValueError, match=msg):
  2219. DataFrame(data=np.array([]), columns=["a", "b"])
  2220. class TestDataFrameConstructorIndexInference:
  2221. def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
  2222. rng1 = pd.period_range("1/1/1999", "1/1/2012", freq="M")
  2223. s1 = Series(np.random.randn(len(rng1)), rng1)
  2224. rng2 = pd.period_range("1/1/1980", "12/1/2001", freq="M")
  2225. s2 = Series(np.random.randn(len(rng2)), rng2)
  2226. df = DataFrame({"s1": s1, "s2": s2})
  2227. exp = pd.period_range("1/1/1980", "1/1/2012", freq="M")
  2228. tm.assert_index_equal(df.index, exp)
  2229. def test_frame_from_dict_with_mixed_tzaware_indexes(self):
  2230. # GH#44091
  2231. dti = date_range("2016-01-01", periods=3)
  2232. ser1 = Series(range(3), index=dti)
  2233. ser2 = Series(range(3), index=dti.tz_localize("UTC"))
  2234. ser3 = Series(range(3), index=dti.tz_localize("US/Central"))
  2235. ser4 = Series(range(3))
  2236. # no tz-naive, but we do have mixed tzs and a non-DTI
  2237. df1 = DataFrame({"A": ser2, "B": ser3, "C": ser4})
  2238. exp_index = Index(
  2239. list(ser2.index) + list(ser3.index) + list(ser4.index), dtype=object
  2240. )
  2241. tm.assert_index_equal(df1.index, exp_index)
  2242. df2 = DataFrame({"A": ser2, "C": ser4, "B": ser3})
  2243. exp_index3 = Index(
  2244. list(ser2.index) + list(ser4.index) + list(ser3.index), dtype=object
  2245. )
  2246. tm.assert_index_equal(df2.index, exp_index3)
  2247. df3 = DataFrame({"B": ser3, "A": ser2, "C": ser4})
  2248. exp_index3 = Index(
  2249. list(ser3.index) + list(ser2.index) + list(ser4.index), dtype=object
  2250. )
  2251. tm.assert_index_equal(df3.index, exp_index3)
  2252. df4 = DataFrame({"C": ser4, "B": ser3, "A": ser2})
  2253. exp_index4 = Index(
  2254. list(ser4.index) + list(ser3.index) + list(ser2.index), dtype=object
  2255. )
  2256. tm.assert_index_equal(df4.index, exp_index4)
  2257. # TODO: not clear if these raising is desired (no extant tests),
  2258. # but this is de facto behavior 2021-12-22
  2259. msg = "Cannot join tz-naive with tz-aware DatetimeIndex"
  2260. with pytest.raises(TypeError, match=msg):
  2261. DataFrame({"A": ser2, "B": ser3, "C": ser4, "D": ser1})
  2262. with pytest.raises(TypeError, match=msg):
  2263. DataFrame({"A": ser2, "B": ser3, "D": ser1})
  2264. with pytest.raises(TypeError, match=msg):
  2265. DataFrame({"D": ser1, "A": ser2, "B": ser3})
  2266. @pytest.mark.parametrize(
  2267. "key_val, col_vals, col_type",
  2268. [
  2269. ["3", ["3", "4"], "utf8"],
  2270. [3, [3, 4], "int8"],
  2271. ],
  2272. )
  2273. def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
  2274. # GH 53617
  2275. pa = pytest.importorskip("pyarrow")
  2276. cols = pd.arrays.ArrowExtensionArray(
  2277. pa.array(col_vals, type=pa.dictionary(pa.int8(), getattr(pa, col_type)()))
  2278. )
  2279. result = DataFrame({key_val: [1, 2]}, columns=cols)
  2280. expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
  2281. expected.iloc[:, 1] = expected.iloc[:, 1].astype(object)
  2282. tm.assert_frame_equal(result, expected)
  2283. class TestDataFrameConstructorWithDtypeCoercion:
  2284. def test_floating_values_integer_dtype(self):
  2285. # GH#40110 make DataFrame behavior with arraylike floating data and
  2286. # inty dtype match Series behavior
  2287. arr = np.random.randn(10, 5)
  2288. # GH#49599 in 2.0 we raise instead of either
  2289. # a) silently ignoring dtype and returningfloat (the old Series behavior) or
  2290. # b) rounding (the old DataFrame behavior)
  2291. msg = "Trying to coerce float values to integers"
  2292. with pytest.raises(ValueError, match=msg):
  2293. DataFrame(arr, dtype="i8")
  2294. df = DataFrame(arr.round(), dtype="i8")
  2295. assert (df.dtypes == "i8").all()
  2296. # with NaNs, we go through a different path with a different warning
  2297. arr[0, 0] = np.nan
  2298. msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
  2299. with pytest.raises(IntCastingNaNError, match=msg):
  2300. DataFrame(arr, dtype="i8")
  2301. with pytest.raises(IntCastingNaNError, match=msg):
  2302. Series(arr[0], dtype="i8")
  2303. # The future (raising) behavior matches what we would get via astype:
  2304. msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
  2305. with pytest.raises(IntCastingNaNError, match=msg):
  2306. DataFrame(arr).astype("i8")
  2307. with pytest.raises(IntCastingNaNError, match=msg):
  2308. Series(arr[0]).astype("i8")
  2309. class TestDataFrameConstructorWithDatetimeTZ:
  2310. @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
  2311. def test_construction_preserves_tzaware_dtypes(self, tz):
  2312. # after GH#7822
  2313. # these retain the timezones on dict construction
  2314. dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
  2315. dr_tz = dr.tz_localize(tz)
  2316. df = DataFrame({"A": "foo", "B": dr_tz}, index=dr)
  2317. tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo)
  2318. assert df["B"].dtype == tz_expected
  2319. # GH#2810 (with timezones)
  2320. datetimes_naive = [ts.to_pydatetime() for ts in dr]
  2321. datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
  2322. df = DataFrame({"dr": dr})
  2323. df["dr_tz"] = dr_tz
  2324. df["datetimes_naive"] = datetimes_naive
  2325. df["datetimes_with_tz"] = datetimes_with_tz
  2326. result = df.dtypes
  2327. expected = Series(
  2328. [
  2329. np.dtype("datetime64[ns]"),
  2330. DatetimeTZDtype(tz=tz),
  2331. np.dtype("datetime64[ns]"),
  2332. DatetimeTZDtype(tz=tz),
  2333. ],
  2334. index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"],
  2335. )
  2336. tm.assert_series_equal(result, expected)
  2337. @pytest.mark.parametrize("pydt", [True, False])
  2338. def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture, pydt):
  2339. # GH#25843, GH#41555, GH#33401
  2340. tz = tz_aware_fixture
  2341. ts = Timestamp("2019", tz=tz)
  2342. if pydt:
  2343. ts = ts.to_pydatetime()
  2344. msg = (
  2345. "Cannot convert timezone-aware data to timezone-naive dtype. "
  2346. r"Use pd.Series\(values\).dt.tz_localize\(None\) instead."
  2347. )
  2348. with pytest.raises(ValueError, match=msg):
  2349. DataFrame({0: [ts]}, dtype="datetime64[ns]")
  2350. msg2 = "Cannot unbox tzaware Timestamp to tznaive dtype"
  2351. with pytest.raises(TypeError, match=msg2):
  2352. DataFrame({0: ts}, index=[0], dtype="datetime64[ns]")
  2353. with pytest.raises(ValueError, match=msg):
  2354. DataFrame([ts], dtype="datetime64[ns]")
  2355. with pytest.raises(ValueError, match=msg):
  2356. DataFrame(np.array([ts], dtype=object), dtype="datetime64[ns]")
  2357. with pytest.raises(TypeError, match=msg2):
  2358. DataFrame(ts, index=[0], columns=[0], dtype="datetime64[ns]")
  2359. with pytest.raises(ValueError, match=msg):
  2360. DataFrame([Series([ts])], dtype="datetime64[ns]")
  2361. with pytest.raises(ValueError, match=msg):
  2362. DataFrame([[ts]], columns=[0], dtype="datetime64[ns]")
  2363. def test_from_dict(self):
  2364. # 8260
  2365. # support datetime64 with tz
  2366. idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
  2367. dr = date_range("20130110", periods=3)
  2368. # construction
  2369. df = DataFrame({"A": idx, "B": dr})
  2370. assert df["A"].dtype, "M8[ns, US/Eastern"
  2371. assert df["A"].name == "A"
  2372. tm.assert_series_equal(df["A"], Series(idx, name="A"))
  2373. tm.assert_series_equal(df["B"], Series(dr, name="B"))
  2374. def test_from_index(self):
  2375. # from index
  2376. idx2 = date_range("20130101", periods=3, tz="US/Eastern", name="foo")
  2377. df2 = DataFrame(idx2)
  2378. tm.assert_series_equal(df2["foo"], Series(idx2, name="foo"))
  2379. df2 = DataFrame(Series(idx2))
  2380. tm.assert_series_equal(df2["foo"], Series(idx2, name="foo"))
  2381. idx2 = date_range("20130101", periods=3, tz="US/Eastern")
  2382. df2 = DataFrame(idx2)
  2383. tm.assert_series_equal(df2[0], Series(idx2, name=0))
  2384. df2 = DataFrame(Series(idx2))
  2385. tm.assert_series_equal(df2[0], Series(idx2, name=0))
  2386. def test_frame_dict_constructor_datetime64_1680(self):
  2387. dr = date_range("1/1/2012", periods=10)
  2388. s = Series(dr, index=dr)
  2389. # it works!
  2390. DataFrame({"a": "foo", "b": s}, index=dr)
  2391. DataFrame({"a": "foo", "b": s.values}, index=dr)
  2392. def test_frame_datetime64_mixed_index_ctor_1681(self):
  2393. dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
  2394. ts = Series(dr)
  2395. # it works!
  2396. d = DataFrame({"A": "foo", "B": ts}, index=dr)
  2397. assert d["B"].isna().all()
  2398. def test_frame_timeseries_column(self):
  2399. # GH19157
  2400. dr = date_range(start="20130101T10:00:00", periods=3, freq="T", tz="US/Eastern")
  2401. result = DataFrame(dr, columns=["timestamps"])
  2402. expected = DataFrame(
  2403. {
  2404. "timestamps": [
  2405. Timestamp("20130101T10:00:00", tz="US/Eastern"),
  2406. Timestamp("20130101T10:01:00", tz="US/Eastern"),
  2407. Timestamp("20130101T10:02:00", tz="US/Eastern"),
  2408. ]
  2409. }
  2410. )
  2411. tm.assert_frame_equal(result, expected)
  2412. def test_nested_dict_construction(self):
  2413. # GH22227
  2414. columns = ["Nevada", "Ohio"]
  2415. pop = {
  2416. "Nevada": {2001: 2.4, 2002: 2.9},
  2417. "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
  2418. }
  2419. result = DataFrame(pop, index=[2001, 2002, 2003], columns=columns)
  2420. expected = DataFrame(
  2421. [(2.4, 1.7), (2.9, 3.6), (np.nan, np.nan)],
  2422. columns=columns,
  2423. index=Index([2001, 2002, 2003]),
  2424. )
  2425. tm.assert_frame_equal(result, expected)
  2426. def test_from_tzaware_object_array(self):
  2427. # GH#26825 2D object array of tzaware timestamps should not raise
  2428. dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
  2429. data = dti._data.astype(object).reshape(1, -1)
  2430. df = DataFrame(data)
  2431. assert df.shape == (1, 3)
  2432. assert (df.dtypes == dti.dtype).all()
  2433. assert (df == dti).all().all()
  2434. def test_from_tzaware_mixed_object_array(self):
  2435. # GH#26825
  2436. arr = np.array(
  2437. [
  2438. [
  2439. Timestamp("2013-01-01 00:00:00"),
  2440. Timestamp("2013-01-02 00:00:00"),
  2441. Timestamp("2013-01-03 00:00:00"),
  2442. ],
  2443. [
  2444. Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
  2445. pd.NaT,
  2446. Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
  2447. ],
  2448. [
  2449. Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
  2450. pd.NaT,
  2451. Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
  2452. ],
  2453. ],
  2454. dtype=object,
  2455. ).T
  2456. res = DataFrame(arr, columns=["A", "B", "C"])
  2457. expected_dtypes = [
  2458. "datetime64[ns]",
  2459. "datetime64[ns, US/Eastern]",
  2460. "datetime64[ns, CET]",
  2461. ]
  2462. assert (res.dtypes == expected_dtypes).all()
  2463. def test_from_2d_ndarray_with_dtype(self):
  2464. # GH#12513
  2465. array_dim2 = np.arange(10).reshape((5, 2))
  2466. df = DataFrame(array_dim2, dtype="datetime64[ns, UTC]")
  2467. expected = DataFrame(array_dim2).astype("datetime64[ns, UTC]")
  2468. tm.assert_frame_equal(df, expected)
  2469. @pytest.mark.parametrize("typ", [set, frozenset])
  2470. def test_construction_from_set_raises(self, typ):
  2471. # https://github.com/pandas-dev/pandas/issues/32582
  2472. values = typ({1, 2, 3})
  2473. msg = f"'{typ.__name__}' type is unordered"
  2474. with pytest.raises(TypeError, match=msg):
  2475. DataFrame({"a": values})
  2476. with pytest.raises(TypeError, match=msg):
  2477. Series(values)
  2478. def test_construction_from_ndarray_datetimelike(self):
  2479. # ensure the underlying arrays are properly wrapped as EA when
  2480. # constructed from 2D ndarray
  2481. arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3)
  2482. df = DataFrame(arr)
  2483. assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays)
  2484. def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
  2485. arr = np.random.randn(10, 2)
  2486. dtype = pd.array([2.0]).dtype
  2487. msg = r"len\(arrays\) must match len\(columns\)"
  2488. with pytest.raises(ValueError, match=msg):
  2489. DataFrame(arr, columns=["foo"], dtype=dtype)
  2490. arr2 = pd.array([2.0, 3.0, 4.0])
  2491. with pytest.raises(ValueError, match=msg):
  2492. DataFrame(arr2, columns=["foo", "bar"])
  2493. def test_columns_indexes_raise_on_sets(self):
  2494. # GH 47215
  2495. data = [[1, 2, 3], [4, 5, 6]]
  2496. with pytest.raises(ValueError, match="index cannot be a set"):
  2497. DataFrame(data, index={"a", "b"})
  2498. with pytest.raises(ValueError, match="columns cannot be a set"):
  2499. DataFrame(data, columns={"a", "b", "c"})
  2500. def get1(obj): # TODO: make a helper in tm?
  2501. if isinstance(obj, Series):
  2502. return obj.iloc[0]
  2503. else:
  2504. return obj.iloc[0, 0]
  2505. class TestFromScalar:
  2506. @pytest.fixture(params=[list, dict, None])
  2507. def box(self, request):
  2508. return request.param
  2509. @pytest.fixture
  2510. def constructor(self, frame_or_series, box):
  2511. extra = {"index": range(2)}
  2512. if frame_or_series is DataFrame:
  2513. extra["columns"] = ["A"]
  2514. if box is None:
  2515. return functools.partial(frame_or_series, **extra)
  2516. elif box is dict:
  2517. if frame_or_series is Series:
  2518. return lambda x, **kwargs: frame_or_series(
  2519. {0: x, 1: x}, **extra, **kwargs
  2520. )
  2521. else:
  2522. return lambda x, **kwargs: frame_or_series({"A": x}, **extra, **kwargs)
  2523. else:
  2524. if frame_or_series is Series:
  2525. return lambda x, **kwargs: frame_or_series([x, x], **extra, **kwargs)
  2526. else:
  2527. return lambda x, **kwargs: frame_or_series(
  2528. {"A": [x, x]}, **extra, **kwargs
  2529. )
  2530. @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
  2531. def test_from_nat_scalar(self, dtype, constructor):
  2532. obj = constructor(pd.NaT, dtype=dtype)
  2533. assert np.all(obj.dtypes == dtype)
  2534. assert np.all(obj.isna())
  2535. def test_from_timedelta_scalar_preserves_nanos(self, constructor):
  2536. td = Timedelta(1)
  2537. obj = constructor(td, dtype="m8[ns]")
  2538. assert get1(obj) == td
  2539. def test_from_timestamp_scalar_preserves_nanos(self, constructor, fixed_now_ts):
  2540. ts = fixed_now_ts + Timedelta(1)
  2541. obj = constructor(ts, dtype="M8[ns]")
  2542. assert get1(obj) == ts
  2543. def test_from_timedelta64_scalar_object(self, constructor):
  2544. td = Timedelta(1)
  2545. td64 = td.to_timedelta64()
  2546. obj = constructor(td64, dtype=object)
  2547. assert isinstance(get1(obj), np.timedelta64)
  2548. @pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64])
  2549. def test_from_scalar_datetimelike_mismatched(self, constructor, cls):
  2550. scalar = cls("NaT", "ns")
  2551. dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls]
  2552. if cls is np.datetime64:
  2553. msg1 = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]"
  2554. else:
  2555. msg1 = r"dtype timedelta64\[ns\] cannot be converted to datetime64\[ns\]"
  2556. msg = "|".join(["Cannot cast", msg1])
  2557. with pytest.raises(TypeError, match=msg):
  2558. constructor(scalar, dtype=dtype)
  2559. scalar = cls(4, "ns")
  2560. with pytest.raises(TypeError, match=msg):
  2561. constructor(scalar, dtype=dtype)
  2562. @pytest.mark.xfail(
  2563. reason="Timestamp constructor has been updated to cast dt64 to non-nano, "
  2564. "but DatetimeArray._from_sequence has not"
  2565. )
  2566. @pytest.mark.parametrize("cls", [datetime, np.datetime64])
  2567. def test_from_out_of_bounds_ns_datetime(self, constructor, cls):
  2568. # scalar that won't fit in nanosecond dt64, but will fit in microsecond
  2569. scalar = datetime(9999, 1, 1)
  2570. exp_dtype = "M8[us]" # pydatetime objects default to this reso
  2571. if cls is np.datetime64:
  2572. scalar = np.datetime64(scalar, "D")
  2573. exp_dtype = "M8[s]" # closest reso to input
  2574. result = constructor(scalar)
  2575. item = get1(result)
  2576. dtype = result.dtype if isinstance(result, Series) else result.dtypes.iloc[0]
  2577. assert type(item) is Timestamp
  2578. assert item.asm8.dtype == exp_dtype
  2579. assert dtype == exp_dtype
  2580. def test_out_of_s_bounds_datetime64(self, constructor):
  2581. scalar = np.datetime64(np.iinfo(np.int64).max, "D")
  2582. result = constructor(scalar)
  2583. item = get1(result)
  2584. assert type(item) is np.datetime64
  2585. dtype = result.dtype if isinstance(result, Series) else result.dtypes.iloc[0]
  2586. assert dtype == object
  2587. @pytest.mark.xfail(
  2588. reason="TimedeltaArray constructor has been updated to cast td64 to non-nano, "
  2589. "but TimedeltaArray._from_sequence has not"
  2590. )
  2591. @pytest.mark.parametrize("cls", [timedelta, np.timedelta64])
  2592. def test_from_out_of_bounds_ns_timedelta(self, constructor, cls):
  2593. # scalar that won't fit in nanosecond td64, but will fit in microsecond
  2594. scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
  2595. exp_dtype = "m8[us]" # smallest reso that fits
  2596. if cls is np.timedelta64:
  2597. scalar = np.timedelta64(scalar, "D")
  2598. exp_dtype = "m8[s]" # closest reso to input
  2599. result = constructor(scalar)
  2600. item = get1(result)
  2601. dtype = result.dtype if isinstance(result, Series) else result.dtypes.iloc[0]
  2602. assert type(item) is Timedelta
  2603. assert item.asm8.dtype == exp_dtype
  2604. assert dtype == exp_dtype
  2605. @pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64])
  2606. def test_out_of_s_bounds_timedelta64(self, constructor, cls):
  2607. scalar = cls(np.iinfo(np.int64).max, "D")
  2608. result = constructor(scalar)
  2609. item = get1(result)
  2610. assert type(item) is cls
  2611. dtype = result.dtype if isinstance(result, Series) else result.dtypes.iloc[0]
  2612. assert dtype == object
  2613. def test_tzaware_data_tznaive_dtype(self, constructor, box, frame_or_series):
  2614. tz = "US/Eastern"
  2615. ts = Timestamp("2019", tz=tz)
  2616. if box is None or (frame_or_series is DataFrame and box is dict):
  2617. msg = "Cannot unbox tzaware Timestamp to tznaive dtype"
  2618. err = TypeError
  2619. else:
  2620. msg = (
  2621. "Cannot convert timezone-aware data to timezone-naive dtype. "
  2622. r"Use pd.Series\(values\).dt.tz_localize\(None\) instead."
  2623. )
  2624. err = ValueError
  2625. with pytest.raises(err, match=msg):
  2626. constructor(ts, dtype="M8[ns]")
  2627. # TODO: better location for this test?
  2628. class TestAllowNonNano:
  2629. # Until 2.0, we do not preserve non-nano dt64/td64 when passed as ndarray,
  2630. # but do preserve it when passed as DTA/TDA
  2631. @pytest.fixture(params=[True, False])
  2632. def as_td(self, request):
  2633. return request.param
  2634. @pytest.fixture
  2635. def arr(self, as_td):
  2636. values = np.arange(5).astype(np.int64).view("M8[s]")
  2637. if as_td:
  2638. values = values - values[0]
  2639. return TimedeltaArray._simple_new(values, dtype=values.dtype)
  2640. else:
  2641. return DatetimeArray._simple_new(values, dtype=values.dtype)
  2642. def test_index_allow_non_nano(self, arr):
  2643. idx = Index(arr)
  2644. assert idx.dtype == arr.dtype
  2645. def test_dti_tdi_allow_non_nano(self, arr, as_td):
  2646. if as_td:
  2647. idx = pd.TimedeltaIndex(arr)
  2648. else:
  2649. idx = DatetimeIndex(arr)
  2650. assert idx.dtype == arr.dtype
  2651. def test_series_allow_non_nano(self, arr):
  2652. ser = Series(arr)
  2653. assert ser.dtype == arr.dtype
  2654. def test_frame_allow_non_nano(self, arr):
  2655. df = DataFrame(arr)
  2656. assert df.dtypes[0] == arr.dtype
  2657. def test_frame_from_dict_allow_non_nano(self, arr):
  2658. df = DataFrame({0: arr})
  2659. assert df.dtypes[0] == arr.dtype