test_to_dict.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. from collections import (
  2. OrderedDict,
  3. defaultdict,
  4. )
  5. from datetime import datetime
  6. import numpy as np
  7. import pytest
  8. import pytz
  9. from pandas import (
  10. NA,
  11. DataFrame,
  12. Index,
  13. MultiIndex,
  14. Series,
  15. Timestamp,
  16. )
  17. import pandas._testing as tm
  18. class TestDataFrameToDict:
  19. def test_to_dict_timestamp(self):
  20. # GH#11247
  21. # split/records producing np.datetime64 rather than Timestamps
  22. # on datetime64[ns] dtypes only
  23. tsmp = Timestamp("20130101")
  24. test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
  25. test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})
  26. expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
  27. expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]
  28. assert test_data.to_dict(orient="records") == expected_records
  29. assert test_data_mixed.to_dict(orient="records") == expected_records_mixed
  30. expected_series = {
  31. "A": Series([tsmp, tsmp], name="A"),
  32. "B": Series([tsmp, tsmp], name="B"),
  33. }
  34. expected_series_mixed = {
  35. "A": Series([tsmp, tsmp], name="A"),
  36. "B": Series([1, 2], name="B"),
  37. }
  38. tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
  39. tm.assert_dict_equal(
  40. test_data_mixed.to_dict(orient="series"), expected_series_mixed
  41. )
  42. expected_split = {
  43. "index": [0, 1],
  44. "data": [[tsmp, tsmp], [tsmp, tsmp]],
  45. "columns": ["A", "B"],
  46. }
  47. expected_split_mixed = {
  48. "index": [0, 1],
  49. "data": [[tsmp, 1], [tsmp, 2]],
  50. "columns": ["A", "B"],
  51. }
  52. tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
  53. tm.assert_dict_equal(
  54. test_data_mixed.to_dict(orient="split"), expected_split_mixed
  55. )
  56. def test_to_dict_index_not_unique_with_index_orient(self):
  57. # GH#22801
  58. # Data loss when indexes are not unique. Raise ValueError.
  59. df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
  60. msg = "DataFrame index must be unique for orient='index'"
  61. with pytest.raises(ValueError, match=msg):
  62. df.to_dict(orient="index")
  63. def test_to_dict_invalid_orient(self):
  64. df = DataFrame({"A": [0, 1]})
  65. msg = "orient 'xinvalid' not understood"
  66. with pytest.raises(ValueError, match=msg):
  67. df.to_dict(orient="xinvalid")
  68. @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"])
  69. def test_to_dict_short_orient_raises(self, orient):
  70. # GH#32515
  71. df = DataFrame({"A": [0, 1]})
  72. with pytest.raises(ValueError, match="not understood"):
  73. df.to_dict(orient=orient)
  74. @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
  75. def test_to_dict(self, mapping):
  76. # orient= should only take the listed options
  77. # see GH#32515
  78. test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
  79. # GH#16122
  80. recons_data = DataFrame(test_data).to_dict(into=mapping)
  81. for k, v in test_data.items():
  82. for k2, v2 in v.items():
  83. assert v2 == recons_data[k][k2]
  84. recons_data = DataFrame(test_data).to_dict("list", mapping)
  85. for k, v in test_data.items():
  86. for k2, v2 in v.items():
  87. assert v2 == recons_data[k][int(k2) - 1]
  88. recons_data = DataFrame(test_data).to_dict("series", mapping)
  89. for k, v in test_data.items():
  90. for k2, v2 in v.items():
  91. assert v2 == recons_data[k][k2]
  92. recons_data = DataFrame(test_data).to_dict("split", mapping)
  93. expected_split = {
  94. "columns": ["A", "B"],
  95. "index": ["1", "2", "3"],
  96. "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
  97. }
  98. tm.assert_dict_equal(recons_data, expected_split)
  99. recons_data = DataFrame(test_data).to_dict("records", mapping)
  100. expected_records = [
  101. {"A": 1.0, "B": "1"},
  102. {"A": 2.0, "B": "2"},
  103. {"A": np.nan, "B": "3"},
  104. ]
  105. assert isinstance(recons_data, list)
  106. assert len(recons_data) == 3
  107. for left, right in zip(recons_data, expected_records):
  108. tm.assert_dict_equal(left, right)
  109. # GH#10844
  110. recons_data = DataFrame(test_data).to_dict("index")
  111. for k, v in test_data.items():
  112. for k2, v2 in v.items():
  113. assert v2 == recons_data[k2][k]
  114. df = DataFrame(test_data)
  115. df["duped"] = df[df.columns[0]]
  116. recons_data = df.to_dict("index")
  117. comp_data = test_data.copy()
  118. comp_data["duped"] = comp_data[df.columns[0]]
  119. for k, v in comp_data.items():
  120. for k2, v2 in v.items():
  121. assert v2 == recons_data[k2][k]
  122. @pytest.mark.parametrize("mapping", [list, defaultdict, []])
  123. def test_to_dict_errors(self, mapping):
  124. # GH#16122
  125. df = DataFrame(np.random.randn(3, 3))
  126. msg = "|".join(
  127. [
  128. "unsupported type: <class 'list'>",
  129. r"to_dict\(\) only accepts initialized defaultdicts",
  130. ]
  131. )
  132. with pytest.raises(TypeError, match=msg):
  133. df.to_dict(into=mapping)
  134. def test_to_dict_not_unique_warning(self):
  135. # GH#16927: When converting to a dict, if a column has a non-unique name
  136. # it will be dropped, throwing a warning.
  137. df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
  138. with tm.assert_produces_warning(UserWarning):
  139. df.to_dict()
  140. # orient - orient argument to to_dict function
  141. # item_getter - function for extracting value from
  142. # the resulting dict using column name and index
  143. @pytest.mark.parametrize(
  144. "orient,item_getter",
  145. [
  146. ("dict", lambda d, col, idx: d[col][idx]),
  147. ("records", lambda d, col, idx: d[idx][col]),
  148. ("list", lambda d, col, idx: d[col][idx]),
  149. ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
  150. ("index", lambda d, col, idx: d[idx][col]),
  151. ],
  152. )
  153. def test_to_dict_box_scalars(self, orient, item_getter):
  154. # GH#14216, GH#23753
  155. # make sure that we are boxing properly
  156. df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
  157. result = df.to_dict(orient=orient)
  158. assert isinstance(item_getter(result, "a", 0), int)
  159. assert isinstance(item_getter(result, "b", 0), float)
  160. def test_to_dict_tz(self):
  161. # GH#18372 When converting to dict with orient='records' columns of
  162. # datetime that are tz-aware were not converted to required arrays
  163. data = [
  164. (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
  165. (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
  166. ]
  167. df = DataFrame(list(data), columns=["d"])
  168. result = df.to_dict(orient="records")
  169. expected = [
  170. {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
  171. {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
  172. ]
  173. tm.assert_dict_equal(result[0], expected[0])
  174. tm.assert_dict_equal(result[1], expected[1])
  175. @pytest.mark.parametrize(
  176. "into, expected",
  177. [
  178. (
  179. dict,
  180. {
  181. 0: {"int_col": 1, "float_col": 1.0},
  182. 1: {"int_col": 2, "float_col": 2.0},
  183. 2: {"int_col": 3, "float_col": 3.0},
  184. },
  185. ),
  186. (
  187. OrderedDict,
  188. OrderedDict(
  189. [
  190. (0, {"int_col": 1, "float_col": 1.0}),
  191. (1, {"int_col": 2, "float_col": 2.0}),
  192. (2, {"int_col": 3, "float_col": 3.0}),
  193. ]
  194. ),
  195. ),
  196. (
  197. defaultdict(dict),
  198. defaultdict(
  199. dict,
  200. {
  201. 0: {"int_col": 1, "float_col": 1.0},
  202. 1: {"int_col": 2, "float_col": 2.0},
  203. 2: {"int_col": 3, "float_col": 3.0},
  204. },
  205. ),
  206. ),
  207. ],
  208. )
  209. def test_to_dict_index_dtypes(self, into, expected):
  210. # GH#18580
  211. # When using to_dict(orient='index') on a dataframe with int
  212. # and float columns only the int columns were cast to float
  213. df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})
  214. result = df.to_dict(orient="index", into=into)
  215. cols = ["int_col", "float_col"]
  216. result = DataFrame.from_dict(result, orient="index")[cols]
  217. expected = DataFrame.from_dict(expected, orient="index")[cols]
  218. tm.assert_frame_equal(result, expected)
  219. def test_to_dict_numeric_names(self):
  220. # GH#24940
  221. df = DataFrame({str(i): [i] for i in range(5)})
  222. result = set(df.to_dict("records")[0].keys())
  223. expected = set(df.columns)
  224. assert result == expected
  225. def test_to_dict_wide(self):
  226. # GH#24939
  227. df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)})
  228. result = df.to_dict("records")[0]
  229. expected = {f"A_{i:d}": i for i in range(256)}
  230. assert result == expected
  231. @pytest.mark.parametrize(
  232. "data,dtype",
  233. (
  234. ([True, True, False], bool),
  235. [
  236. [
  237. datetime(2018, 1, 1),
  238. datetime(2019, 2, 2),
  239. datetime(2020, 3, 3),
  240. ],
  241. Timestamp,
  242. ],
  243. [[1.0, 2.0, 3.0], float],
  244. [[1, 2, 3], int],
  245. [["X", "Y", "Z"], str],
  246. ),
  247. )
  248. def test_to_dict_orient_dtype(self, data, dtype):
  249. # GH22620 & GH21256
  250. df = DataFrame({"a": data})
  251. d = df.to_dict(orient="records")
  252. assert all(type(record["a"]) is dtype for record in d)
  253. @pytest.mark.parametrize(
  254. "data,expected_dtype",
  255. (
  256. [np.uint64(2), int],
  257. [np.int64(-9), int],
  258. [np.float64(1.1), float],
  259. [np.bool_(True), bool],
  260. [np.datetime64("2005-02-25"), Timestamp],
  261. ),
  262. )
  263. def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype):
  264. # GH22620 & GH21256
  265. df = DataFrame({"a": data}, index=[0])
  266. d = df.to_dict(orient="records")
  267. result = type(d[0]["a"])
  268. assert result is expected_dtype
  269. def test_to_dict_mixed_numeric_frame(self):
  270. # GH 12859
  271. df = DataFrame({"a": [1.0], "b": [9.0]})
  272. result = df.reset_index().to_dict("records")
  273. expected = [{"index": 0, "a": 1.0, "b": 9.0}]
  274. assert result == expected
  275. @pytest.mark.parametrize(
  276. "index",
  277. [
  278. None,
  279. Index(["aa", "bb"]),
  280. Index(["aa", "bb"], name="cc"),
  281. MultiIndex.from_tuples([("a", "b"), ("a", "c")]),
  282. MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]),
  283. ],
  284. )
  285. @pytest.mark.parametrize(
  286. "columns",
  287. [
  288. ["x", "y"],
  289. Index(["x", "y"]),
  290. Index(["x", "y"], name="z"),
  291. MultiIndex.from_tuples([("x", 1), ("y", 2)]),
  292. MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]),
  293. ],
  294. )
  295. def test_to_dict_orient_tight(self, index, columns):
  296. df = DataFrame.from_records(
  297. [[1, 3], [2, 4]],
  298. columns=columns,
  299. index=index,
  300. )
  301. roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")
  302. tm.assert_frame_equal(df, roundtrip)
  303. @pytest.mark.parametrize(
  304. "orient",
  305. ["dict", "list", "split", "records", "index", "tight"],
  306. )
  307. @pytest.mark.parametrize(
  308. "data,expected_types",
  309. (
  310. (
  311. {
  312. "a": [np.int64(1), 1, np.int64(3)],
  313. "b": [np.float64(1.0), 2.0, np.float64(3.0)],
  314. "c": [np.float64(1.0), 2, np.int64(3)],
  315. "d": [np.float64(1.0), "a", np.int64(3)],
  316. "e": [np.float64(1.0), ["a"], np.int64(3)],
  317. "f": [np.float64(1.0), ("a",), np.int64(3)],
  318. },
  319. {
  320. "a": [int, int, int],
  321. "b": [float, float, float],
  322. "c": [float, float, float],
  323. "d": [float, str, int],
  324. "e": [float, list, int],
  325. "f": [float, tuple, int],
  326. },
  327. ),
  328. (
  329. {
  330. "a": [1, 2, 3],
  331. "b": [1.1, 2.2, 3.3],
  332. },
  333. {
  334. "a": [int, int, int],
  335. "b": [float, float, float],
  336. },
  337. ),
  338. ( # Make sure we have one df which is all object type cols
  339. {
  340. "a": [1, "hello", 3],
  341. "b": [1.1, "world", 3.3],
  342. },
  343. {
  344. "a": [int, str, int],
  345. "b": [float, str, float],
  346. },
  347. ),
  348. ),
  349. )
  350. def test_to_dict_returns_native_types(self, orient, data, expected_types):
  351. # GH 46751
  352. # Tests we get back native types for all orient types
  353. df = DataFrame(data)
  354. result = df.to_dict(orient)
  355. if orient == "dict":
  356. assertion_iterator = (
  357. (i, key, value)
  358. for key, index_value_map in result.items()
  359. for i, value in index_value_map.items()
  360. )
  361. elif orient == "list":
  362. assertion_iterator = (
  363. (i, key, value)
  364. for key, values in result.items()
  365. for i, value in enumerate(values)
  366. )
  367. elif orient in {"split", "tight"}:
  368. assertion_iterator = (
  369. (i, key, result["data"][i][j])
  370. for i in result["index"]
  371. for j, key in enumerate(result["columns"])
  372. )
  373. elif orient == "records":
  374. assertion_iterator = (
  375. (i, key, value)
  376. for i, record in enumerate(result)
  377. for key, value in record.items()
  378. )
  379. elif orient == "index":
  380. assertion_iterator = (
  381. (i, key, value)
  382. for i, record in result.items()
  383. for key, value in record.items()
  384. )
  385. for i, key, value in assertion_iterator:
  386. assert value == data[key][i]
  387. assert type(value) is expected_types[key][i]
  388. @pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"])
  389. def test_to_dict_index_false_error(self, orient):
  390. # GH#46398
  391. df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
  392. msg = "'index=False' is only valid when 'orient' is 'split' or 'tight'"
  393. with pytest.raises(ValueError, match=msg):
  394. df.to_dict(orient=orient, index=False)
  395. @pytest.mark.parametrize(
  396. "orient, expected",
  397. [
  398. ("split", {"columns": ["col1", "col2"], "data": [[1, 3], [2, 4]]}),
  399. (
  400. "tight",
  401. {
  402. "columns": ["col1", "col2"],
  403. "data": [[1, 3], [2, 4]],
  404. "column_names": [None],
  405. },
  406. ),
  407. ],
  408. )
  409. def test_to_dict_index_false(self, orient, expected):
  410. # GH#46398
  411. df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"])
  412. result = df.to_dict(orient=orient, index=False)
  413. tm.assert_dict_equal(result, expected)
  414. @pytest.mark.parametrize(
  415. "orient, expected",
  416. [
  417. ("dict", {"a": {0: 1, 1: None}}),
  418. ("list", {"a": [1, None]}),
  419. ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}),
  420. (
  421. "tight",
  422. {
  423. "index": [0, 1],
  424. "columns": ["a"],
  425. "data": [[1], [None]],
  426. "index_names": [None],
  427. "column_names": [None],
  428. },
  429. ),
  430. ("records", [{"a": 1}, {"a": None}]),
  431. ("index", {0: {"a": 1}, 1: {"a": None}}),
  432. ],
  433. )
  434. def test_to_dict_na_to_none(self, orient, expected):
  435. # GH#50795
  436. df = DataFrame({"a": [1, NA]}, dtype="Int64")
  437. result = df.to_dict(orient=orient)
  438. assert result == expected
  439. def test_to_dict_masked_native_python(self):
  440. # GH#34665
  441. df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1})
  442. result = df.to_dict(orient="records")
  443. assert type(result[0]["a"]) is int
  444. df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1})
  445. result = df.to_dict(orient="records")
  446. assert type(result[0]["a"]) is int