# test_normalize.py — tests for pandas.json_normalize and nested_to_record
import json

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
    json_normalize,
)
import pandas._testing as tm

from pandas.io.json._normalize import nested_to_record
  12. @pytest.fixture
  13. def deep_nested():
  14. # deeply nested data
  15. return [
  16. {
  17. "country": "USA",
  18. "states": [
  19. {
  20. "name": "California",
  21. "cities": [
  22. {"name": "San Francisco", "pop": 12345},
  23. {"name": "Los Angeles", "pop": 12346},
  24. ],
  25. },
  26. {
  27. "name": "Ohio",
  28. "cities": [
  29. {"name": "Columbus", "pop": 1234},
  30. {"name": "Cleveland", "pop": 1236},
  31. ],
  32. },
  33. ],
  34. },
  35. {
  36. "country": "Germany",
  37. "states": [
  38. {"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
  39. {
  40. "name": "Nordrhein-Westfalen",
  41. "cities": [
  42. {"name": "Duesseldorf", "pop": 1238},
  43. {"name": "Koeln", "pop": 1239},
  44. ],
  45. },
  46. ],
  47. },
  48. ]
  49. @pytest.fixture
  50. def state_data():
  51. return [
  52. {
  53. "counties": [
  54. {"name": "Dade", "population": 12345},
  55. {"name": "Broward", "population": 40000},
  56. {"name": "Palm Beach", "population": 60000},
  57. ],
  58. "info": {"governor": "Rick Scott"},
  59. "shortname": "FL",
  60. "state": "Florida",
  61. },
  62. {
  63. "counties": [
  64. {"name": "Summit", "population": 1234},
  65. {"name": "Cuyahoga", "population": 1337},
  66. ],
  67. "info": {"governor": "John Kasich"},
  68. "shortname": "OH",
  69. "state": "Ohio",
  70. },
  71. ]
  72. @pytest.fixture
  73. def author_missing_data():
  74. return [
  75. {"info": None},
  76. {
  77. "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
  78. "author_name": {"first": "Jane", "last_name": "Doe"},
  79. },
  80. ]
  81. @pytest.fixture
  82. def missing_metadata():
  83. return [
  84. {
  85. "name": "Alice",
  86. "addresses": [
  87. {
  88. "number": 9562,
  89. "street": "Morris St.",
  90. "city": "Massillon",
  91. "state": "OH",
  92. "zip": 44646,
  93. }
  94. ],
  95. "previous_residences": {"cities": [{"city_name": "Foo York City"}]},
  96. },
  97. {
  98. "addresses": [
  99. {
  100. "number": 8449,
  101. "street": "Spring St.",
  102. "city": "Elizabethton",
  103. "state": "TN",
  104. "zip": 37643,
  105. }
  106. ],
  107. "previous_residences": {"cities": [{"city_name": "Barmingham"}]},
  108. },
  109. ]
  110. @pytest.fixture
  111. def max_level_test_input_data():
  112. """
  113. input data to test json_normalize with max_level param
  114. """
  115. return [
  116. {
  117. "CreatedBy": {"Name": "User001"},
  118. "Lookup": {
  119. "TextField": "Some text",
  120. "UserField": {"Id": "ID001", "Name": "Name001"},
  121. },
  122. "Image": {"a": "b"},
  123. }
  124. ]
  125. class TestJSONNormalize:
  126. def test_simple_records(self):
  127. recs = [
  128. {"a": 1, "b": 2, "c": 3},
  129. {"a": 4, "b": 5, "c": 6},
  130. {"a": 7, "b": 8, "c": 9},
  131. {"a": 10, "b": 11, "c": 12},
  132. ]
  133. result = json_normalize(recs)
  134. expected = DataFrame(recs)
  135. tm.assert_frame_equal(result, expected)
  136. def test_simple_normalize(self, state_data):
  137. result = json_normalize(state_data[0], "counties")
  138. expected = DataFrame(state_data[0]["counties"])
  139. tm.assert_frame_equal(result, expected)
  140. result = json_normalize(state_data, "counties")
  141. expected = []
  142. for rec in state_data:
  143. expected.extend(rec["counties"])
  144. expected = DataFrame(expected)
  145. tm.assert_frame_equal(result, expected)
  146. result = json_normalize(state_data, "counties", meta="state")
  147. expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
  148. tm.assert_frame_equal(result, expected)
  149. def test_empty_array(self):
  150. result = json_normalize([])
  151. expected = DataFrame()
  152. tm.assert_frame_equal(result, expected)
  153. @pytest.mark.parametrize(
  154. "data, record_path, exception_type",
  155. [
  156. ([{"a": 0}, {"a": 1}], None, None),
  157. ({"a": [{"a": 0}, {"a": 1}]}, "a", None),
  158. ('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
  159. (None, None, NotImplementedError),
  160. ],
  161. )
  162. def test_accepted_input(self, data, record_path, exception_type):
  163. if exception_type is not None:
  164. with pytest.raises(exception_type, match=tm.EMPTY_STRING_PATTERN):
  165. json_normalize(data, record_path=record_path)
  166. else:
  167. result = json_normalize(data, record_path=record_path)
  168. expected = DataFrame([0, 1], columns=["a"])
  169. tm.assert_frame_equal(result, expected)
  170. def test_simple_normalize_with_separator(self, deep_nested):
  171. # GH 14883
  172. result = json_normalize({"A": {"A": 1, "B": 2}})
  173. expected = DataFrame([[1, 2]], columns=["A.A", "A.B"])
  174. tm.assert_frame_equal(result.reindex_like(expected), expected)
  175. result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_")
  176. expected = DataFrame([[1, 2]], columns=["A_A", "A_B"])
  177. tm.assert_frame_equal(result.reindex_like(expected), expected)
  178. result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3")
  179. expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"])
  180. tm.assert_frame_equal(result.reindex_like(expected), expected)
  181. result = json_normalize(
  182. deep_nested,
  183. ["states", "cities"],
  184. meta=["country", ["states", "name"]],
  185. sep="_",
  186. )
  187. expected = Index(["name", "pop", "country", "states_name"]).sort_values()
  188. assert result.columns.sort_values().equals(expected)
  189. def test_normalize_with_multichar_separator(self):
  190. # GH #43831
  191. data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}}
  192. result = json_normalize(data, sep="__")
  193. expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"])
  194. tm.assert_frame_equal(result, expected)
  195. def test_value_array_record_prefix(self):
  196. # GH 21536
  197. result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
  198. expected = DataFrame([[1], [2]], columns=["Prefix.0"])
  199. tm.assert_frame_equal(result, expected)
  200. def test_nested_object_record_path(self):
  201. # GH 22706
  202. data = {
  203. "state": "Florida",
  204. "info": {
  205. "governor": "Rick Scott",
  206. "counties": [
  207. {"name": "Dade", "population": 12345},
  208. {"name": "Broward", "population": 40000},
  209. {"name": "Palm Beach", "population": 60000},
  210. ],
  211. },
  212. }
  213. result = json_normalize(data, record_path=["info", "counties"])
  214. expected = DataFrame(
  215. [["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
  216. columns=["name", "population"],
  217. )
  218. tm.assert_frame_equal(result, expected)
  219. def test_more_deeply_nested(self, deep_nested):
  220. result = json_normalize(
  221. deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
  222. )
  223. ex_data = {
  224. "country": ["USA"] * 4 + ["Germany"] * 3,
  225. "states.name": [
  226. "California",
  227. "California",
  228. "Ohio",
  229. "Ohio",
  230. "Bayern",
  231. "Nordrhein-Westfalen",
  232. "Nordrhein-Westfalen",
  233. ],
  234. "name": [
  235. "San Francisco",
  236. "Los Angeles",
  237. "Columbus",
  238. "Cleveland",
  239. "Munich",
  240. "Duesseldorf",
  241. "Koeln",
  242. ],
  243. "pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
  244. }
  245. expected = DataFrame(ex_data, columns=result.columns)
  246. tm.assert_frame_equal(result, expected)
  247. def test_shallow_nested(self):
  248. data = [
  249. {
  250. "state": "Florida",
  251. "shortname": "FL",
  252. "info": {"governor": "Rick Scott"},
  253. "counties": [
  254. {"name": "Dade", "population": 12345},
  255. {"name": "Broward", "population": 40000},
  256. {"name": "Palm Beach", "population": 60000},
  257. ],
  258. },
  259. {
  260. "state": "Ohio",
  261. "shortname": "OH",
  262. "info": {"governor": "John Kasich"},
  263. "counties": [
  264. {"name": "Summit", "population": 1234},
  265. {"name": "Cuyahoga", "population": 1337},
  266. ],
  267. },
  268. ]
  269. result = json_normalize(
  270. data, "counties", ["state", "shortname", ["info", "governor"]]
  271. )
  272. ex_data = {
  273. "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
  274. "state": ["Florida"] * 3 + ["Ohio"] * 2,
  275. "shortname": ["FL", "FL", "FL", "OH", "OH"],
  276. "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
  277. "population": [12345, 40000, 60000, 1234, 1337],
  278. }
  279. expected = DataFrame(ex_data, columns=result.columns)
  280. tm.assert_frame_equal(result, expected)
  281. def test_nested_meta_path_with_nested_record_path(self, state_data):
  282. # GH 27220
  283. result = json_normalize(
  284. data=state_data,
  285. record_path=["counties"],
  286. meta=["state", "shortname", ["info", "governor"]],
  287. errors="ignore",
  288. )
  289. ex_data = {
  290. "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
  291. "population": [12345, 40000, 60000, 1234, 1337],
  292. "state": ["Florida"] * 3 + ["Ohio"] * 2,
  293. "shortname": ["FL"] * 3 + ["OH"] * 2,
  294. "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
  295. }
  296. expected = DataFrame(ex_data)
  297. tm.assert_frame_equal(result, expected)
  298. def test_meta_name_conflict(self):
  299. data = [
  300. {
  301. "foo": "hello",
  302. "bar": "there",
  303. "data": [
  304. {"foo": "something", "bar": "else"},
  305. {"foo": "something2", "bar": "else2"},
  306. ],
  307. }
  308. ]
  309. msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
  310. with pytest.raises(ValueError, match=msg):
  311. json_normalize(data, "data", meta=["foo", "bar"])
  312. result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")
  313. for val in ["metafoo", "metabar", "foo", "bar"]:
  314. assert val in result
  315. def test_meta_parameter_not_modified(self):
  316. # GH 18610
  317. data = [
  318. {
  319. "foo": "hello",
  320. "bar": "there",
  321. "data": [
  322. {"foo": "something", "bar": "else"},
  323. {"foo": "something2", "bar": "else2"},
  324. ],
  325. }
  326. ]
  327. COLUMNS = ["foo", "bar"]
  328. result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")
  329. assert COLUMNS == ["foo", "bar"]
  330. for val in ["metafoo", "metabar", "foo", "bar"]:
  331. assert val in result
  332. def test_record_prefix(self, state_data):
  333. result = json_normalize(state_data[0], "counties")
  334. expected = DataFrame(state_data[0]["counties"])
  335. tm.assert_frame_equal(result, expected)
  336. result = json_normalize(
  337. state_data, "counties", meta="state", record_prefix="county_"
  338. )
  339. expected = []
  340. for rec in state_data:
  341. expected.extend(rec["counties"])
  342. expected = DataFrame(expected)
  343. expected = expected.rename(columns=lambda x: "county_" + x)
  344. expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
  345. tm.assert_frame_equal(result, expected)
  346. def test_non_ascii_key(self):
  347. testjson = (
  348. b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
  349. + b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
  350. ).decode("utf8")
  351. testdata = {
  352. b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
  353. "sub.A": [1, 3],
  354. "sub.B": [2, 4],
  355. }
  356. expected = DataFrame(testdata)
  357. result = json_normalize(json.loads(testjson))
  358. tm.assert_frame_equal(result, expected)
  359. def test_missing_field(self, author_missing_data):
  360. # GH20030:
  361. result = json_normalize(author_missing_data)
  362. ex_data = [
  363. {
  364. "info": np.nan,
  365. "info.created_at": np.nan,
  366. "info.last_updated": np.nan,
  367. "author_name.first": np.nan,
  368. "author_name.last_name": np.nan,
  369. },
  370. {
  371. "info": None,
  372. "info.created_at": "11/08/1993",
  373. "info.last_updated": "26/05/2012",
  374. "author_name.first": "Jane",
  375. "author_name.last_name": "Doe",
  376. },
  377. ]
  378. expected = DataFrame(ex_data)
  379. tm.assert_frame_equal(result, expected)
  380. @pytest.mark.parametrize(
  381. "max_level,expected",
  382. [
  383. (
  384. 0,
  385. [
  386. {
  387. "TextField": "Some text",
  388. "UserField": {"Id": "ID001", "Name": "Name001"},
  389. "CreatedBy": {"Name": "User001"},
  390. "Image": {"a": "b"},
  391. },
  392. {
  393. "TextField": "Some text",
  394. "UserField": {"Id": "ID001", "Name": "Name001"},
  395. "CreatedBy": {"Name": "User001"},
  396. "Image": {"a": "b"},
  397. },
  398. ],
  399. ),
  400. (
  401. 1,
  402. [
  403. {
  404. "TextField": "Some text",
  405. "UserField.Id": "ID001",
  406. "UserField.Name": "Name001",
  407. "CreatedBy": {"Name": "User001"},
  408. "Image": {"a": "b"},
  409. },
  410. {
  411. "TextField": "Some text",
  412. "UserField.Id": "ID001",
  413. "UserField.Name": "Name001",
  414. "CreatedBy": {"Name": "User001"},
  415. "Image": {"a": "b"},
  416. },
  417. ],
  418. ),
  419. ],
  420. )
  421. def test_max_level_with_records_path(self, max_level, expected):
  422. # GH23843: Enhanced JSON normalize
  423. test_input = [
  424. {
  425. "CreatedBy": {"Name": "User001"},
  426. "Lookup": [
  427. {
  428. "TextField": "Some text",
  429. "UserField": {"Id": "ID001", "Name": "Name001"},
  430. },
  431. {
  432. "TextField": "Some text",
  433. "UserField": {"Id": "ID001", "Name": "Name001"},
  434. },
  435. ],
  436. "Image": {"a": "b"},
  437. "tags": [
  438. {"foo": "something", "bar": "else"},
  439. {"foo": "something2", "bar": "else2"},
  440. ],
  441. }
  442. ]
  443. result = json_normalize(
  444. test_input,
  445. record_path=["Lookup"],
  446. meta=[["CreatedBy"], ["Image"]],
  447. max_level=max_level,
  448. )
  449. expected_df = DataFrame(data=expected, columns=result.columns.values)
  450. tm.assert_equal(expected_df, result)
  451. def test_nested_flattening_consistent(self):
  452. # see gh-21537
  453. df1 = json_normalize([{"A": {"B": 1}}])
  454. df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy")
  455. # They should be the same.
  456. tm.assert_frame_equal(df1, df2)
  457. def test_nonetype_record_path(self, nulls_fixture):
  458. # see gh-30148
  459. # should not raise TypeError
  460. result = json_normalize(
  461. [
  462. {"state": "Texas", "info": nulls_fixture},
  463. {"state": "Florida", "info": [{"i": 2}]},
  464. ],
  465. record_path=["info"],
  466. )
  467. expected = DataFrame({"i": 2}, index=[0])
  468. tm.assert_equal(result, expected)
  469. @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"'])
  470. def test_non_list_record_path_errors(self, value):
  471. # see gh-30148, GH 26284
  472. parsed_value = json.loads(value)
  473. test_input = {"state": "Texas", "info": parsed_value}
  474. test_path = "info"
  475. msg = (
  476. f"{test_input} has non list value {parsed_value} for path {test_path}. "
  477. "Must be list or null."
  478. )
  479. with pytest.raises(TypeError, match=msg):
  480. json_normalize([test_input], record_path=[test_path])
  481. def test_meta_non_iterable(self):
  482. # GH 31507
  483. data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]"""
  484. result = json_normalize(json.loads(data), record_path=["data"], meta=["id"])
  485. expected = DataFrame(
  486. {"one": [1], "two": [2], "id": np.array([99], dtype=object)}
  487. )
  488. tm.assert_frame_equal(result, expected)
  489. def test_generator(self, state_data):
  490. # GH35923 Fix pd.json_normalize to not skip the first element of a
  491. # generator input
  492. def generator_data():
  493. yield from state_data[0]["counties"]
  494. result = json_normalize(generator_data())
  495. expected = DataFrame(state_data[0]["counties"])
  496. tm.assert_frame_equal(result, expected)
  497. def test_top_column_with_leading_underscore(self):
  498. # 49861
  499. data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
  500. result = json_normalize(data, sep="_")
  501. expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])
  502. tm.assert_frame_equal(result, expected)
  503. class TestNestedToRecord:
  504. def test_flat_stays_flat(self):
  505. recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
  506. result = nested_to_record(recs)
  507. expected = recs
  508. assert result == expected
  509. def test_one_level_deep_flattens(self):
  510. data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}
  511. result = nested_to_record(data)
  512. expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}
  513. assert result == expected
  514. def test_nested_flattens(self):
  515. data = {
  516. "flat1": 1,
  517. "dict1": {"c": 1, "d": 2},
  518. "nested": {"e": {"c": 1, "d": 2}, "d": 2},
  519. }
  520. result = nested_to_record(data)
  521. expected = {
  522. "dict1.c": 1,
  523. "dict1.d": 2,
  524. "flat1": 1,
  525. "nested.d": 2,
  526. "nested.e.c": 1,
  527. "nested.e.d": 2,
  528. }
  529. assert result == expected
  530. def test_json_normalize_errors(self, missing_metadata):
  531. # GH14583:
  532. # If meta keys are not always present a new option to set
  533. # errors='ignore' has been implemented
  534. msg = (
  535. "Key 'name' not found. To replace missing values of "
  536. "'name' with np.nan, pass in errors='ignore'"
  537. )
  538. with pytest.raises(KeyError, match=msg):
  539. json_normalize(
  540. data=missing_metadata,
  541. record_path="addresses",
  542. meta="name",
  543. errors="raise",
  544. )
  545. def test_missing_meta(self, missing_metadata):
  546. # GH25468
  547. # If metadata is nullable with errors set to ignore, the null values
  548. # should be numpy.nan values
  549. result = json_normalize(
  550. data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
  551. )
  552. ex_data = [
  553. [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
  554. [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
  555. ]
  556. columns = ["number", "street", "city", "state", "zip", "name"]
  557. expected = DataFrame(ex_data, columns=columns)
  558. tm.assert_frame_equal(result, expected)
  559. def test_missing_nested_meta(self):
  560. # GH44312
  561. # If errors="ignore" and nested metadata is null, we should return nan
  562. data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
  563. result = json_normalize(
  564. data,
  565. record_path="value",
  566. meta=["meta", ["nested_meta", "leaf"]],
  567. errors="ignore",
  568. )
  569. ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]]
  570. columns = ["rec", "meta", "nested_meta.leaf"]
  571. expected = DataFrame(ex_data, columns=columns).astype(
  572. {"nested_meta.leaf": object}
  573. )
  574. tm.assert_frame_equal(result, expected)
  575. # If errors="raise" and nested metadata is null, we should raise with the
  576. # key of the first missing level
  577. with pytest.raises(KeyError, match="'leaf' not found"):
  578. json_normalize(
  579. data,
  580. record_path="value",
  581. meta=["meta", ["nested_meta", "leaf"]],
  582. errors="raise",
  583. )
  584. def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
  585. # GH41876
  586. # Ensure errors='raise' works as intended even when a record_path of length
  587. # greater than one is passed in
  588. msg = (
  589. "Key 'name' not found. To replace missing values of "
  590. "'name' with np.nan, pass in errors='ignore'"
  591. )
  592. with pytest.raises(KeyError, match=msg):
  593. json_normalize(
  594. data=missing_metadata,
  595. record_path=["previous_residences", "cities"],
  596. meta="name",
  597. errors="raise",
  598. )
  599. def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
  600. # GH41876
  601. # Ensure errors='ignore' works as intended even when a record_path of length
  602. # greater than one is passed in
  603. result = json_normalize(
  604. data=missing_metadata,
  605. record_path=["previous_residences", "cities"],
  606. meta="name",
  607. errors="ignore",
  608. )
  609. ex_data = [
  610. ["Foo York City", "Alice"],
  611. ["Barmingham", np.nan],
  612. ]
  613. columns = ["city_name", "name"]
  614. expected = DataFrame(ex_data, columns=columns)
  615. tm.assert_frame_equal(result, expected)
  616. def test_donot_drop_nonevalues(self):
  617. # GH21356
  618. data = [
  619. {"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
  620. {
  621. "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
  622. "author_name": {"first": "Jane", "last_name": "Doe"},
  623. },
  624. ]
  625. result = nested_to_record(data)
  626. expected = [
  627. {
  628. "info": None,
  629. "author_name.first": "Smith",
  630. "author_name.last_name": "Appleseed",
  631. },
  632. {
  633. "author_name.first": "Jane",
  634. "author_name.last_name": "Doe",
  635. "info.created_at": "11/08/1993",
  636. "info.last_updated": "26/05/2012",
  637. },
  638. ]
  639. assert result == expected
  640. def test_nonetype_top_level_bottom_level(self):
  641. # GH21158: If inner level json has a key with a null value
  642. # make sure it does not do a new_d.pop twice and except
  643. data = {
  644. "id": None,
  645. "location": {
  646. "country": {
  647. "state": {
  648. "id": None,
  649. "town.info": {
  650. "id": None,
  651. "region": None,
  652. "x": 49.151580810546875,
  653. "y": -33.148521423339844,
  654. "z": 27.572303771972656,
  655. },
  656. }
  657. }
  658. },
  659. }
  660. result = nested_to_record(data)
  661. expected = {
  662. "id": None,
  663. "location.country.state.id": None,
  664. "location.country.state.town.info.id": None,
  665. "location.country.state.town.info.region": None,
  666. "location.country.state.town.info.x": 49.151580810546875,
  667. "location.country.state.town.info.y": -33.148521423339844,
  668. "location.country.state.town.info.z": 27.572303771972656,
  669. }
  670. assert result == expected
  671. def test_nonetype_multiple_levels(self):
  672. # GH21158: If inner level json has a key with a null value
  673. # make sure it does not do a new_d.pop twice and except
  674. data = {
  675. "id": None,
  676. "location": {
  677. "id": None,
  678. "country": {
  679. "id": None,
  680. "state": {
  681. "id": None,
  682. "town.info": {
  683. "region": None,
  684. "x": 49.151580810546875,
  685. "y": -33.148521423339844,
  686. "z": 27.572303771972656,
  687. },
  688. },
  689. },
  690. },
  691. }
  692. result = nested_to_record(data)
  693. expected = {
  694. "id": None,
  695. "location.id": None,
  696. "location.country.id": None,
  697. "location.country.state.id": None,
  698. "location.country.state.town.info.region": None,
  699. "location.country.state.town.info.x": 49.151580810546875,
  700. "location.country.state.town.info.y": -33.148521423339844,
  701. "location.country.state.town.info.z": 27.572303771972656,
  702. }
  703. assert result == expected
  704. @pytest.mark.parametrize(
  705. "max_level, expected",
  706. [
  707. (
  708. None,
  709. [
  710. {
  711. "CreatedBy.Name": "User001",
  712. "Lookup.TextField": "Some text",
  713. "Lookup.UserField.Id": "ID001",
  714. "Lookup.UserField.Name": "Name001",
  715. "Image.a": "b",
  716. }
  717. ],
  718. ),
  719. (
  720. 0,
  721. [
  722. {
  723. "CreatedBy": {"Name": "User001"},
  724. "Lookup": {
  725. "TextField": "Some text",
  726. "UserField": {"Id": "ID001", "Name": "Name001"},
  727. },
  728. "Image": {"a": "b"},
  729. }
  730. ],
  731. ),
  732. (
  733. 1,
  734. [
  735. {
  736. "CreatedBy.Name": "User001",
  737. "Lookup.TextField": "Some text",
  738. "Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
  739. "Image.a": "b",
  740. }
  741. ],
  742. ),
  743. ],
  744. )
  745. def test_with_max_level(self, max_level, expected, max_level_test_input_data):
  746. # GH23843: Enhanced JSON normalize
  747. output = nested_to_record(max_level_test_input_data, max_level=max_level)
  748. assert output == expected
  749. def test_with_large_max_level(self):
  750. # GH23843: Enhanced JSON normalize
  751. max_level = 100
  752. input_data = [
  753. {
  754. "CreatedBy": {
  755. "user": {
  756. "name": {"firstname": "Leo", "LastName": "Thomson"},
  757. "family_tree": {
  758. "father": {
  759. "name": "Father001",
  760. "father": {
  761. "Name": "Father002",
  762. "father": {
  763. "name": "Father003",
  764. "father": {"Name": "Father004"},
  765. },
  766. },
  767. }
  768. },
  769. }
  770. }
  771. }
  772. ]
  773. expected = [
  774. {
  775. "CreatedBy.user.name.firstname": "Leo",
  776. "CreatedBy.user.name.LastName": "Thomson",
  777. "CreatedBy.user.family_tree.father.name": "Father001",
  778. "CreatedBy.user.family_tree.father.father.Name": "Father002",
  779. "CreatedBy.user.family_tree.father.father.father.name": "Father003",
  780. "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", # noqa: E501
  781. }
  782. ]
  783. output = nested_to_record(input_data, max_level=max_level)
  784. assert output == expected
  785. def test_series_non_zero_index(self):
  786. # GH 19020
  787. data = {
  788. 0: {"id": 1, "name": "Foo", "elements": {"a": 1}},
  789. 1: {"id": 2, "name": "Bar", "elements": {"b": 2}},
  790. 2: {"id": 3, "name": "Baz", "elements": {"c": 3}},
  791. }
  792. s = Series(data)
  793. s.index = [1, 2, 3]
  794. result = json_normalize(s)
  795. expected = DataFrame(
  796. {
  797. "id": [1, 2, 3],
  798. "name": ["Foo", "Bar", "Baz"],
  799. "elements.a": [1.0, np.nan, np.nan],
  800. "elements.b": [np.nan, 2.0, np.nan],
  801. "elements.c": [np.nan, np.nan, 3.0],
  802. }
  803. )
  804. tm.assert_frame_equal(result, expected)