# ---------------------------------------------------------------------
# JSON normalization routines
from __future__ import annotations

from collections import (
    abc,
    defaultdict,
)
import copy
import sys
from typing import (
    Any,
    DefaultDict,
    Iterable,
)

import numpy as np

from pandas._libs.writers import convert_json_to_lines
from pandas._typing import (
    IgnoreRaise,
    Scalar,
)

import pandas as pd
from pandas import DataFrame


def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.
    """
    # Determine whether we have a JSON list to turn into lines; otherwise just
    # return the JSON object as-is.  Only lists can be converted.
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)
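

# A minimal usage sketch (the _example_* helper below is illustrative only,
# not part of the module): a serialized top-level JSON list is rewritten as
# line-delimited JSON, while anything that is not a list is returned
# unchanged.  The exact line splitting is done by the C helper
# convert_json_to_lines, so only weak assertions are made about it here.
def _example_convert_to_line_delimits() -> None:
    # Not a JSON list: returned as-is.
    assert convert_to_line_delimits('{"a": 1}') == '{"a": 1}'
    # A JSON list: the outer brackets are stripped and each element should
    # end up on its own line.
    lines = convert_to_line_delimits('[{"a": 1},{"b": 2}]')
    assert '{"a": 1}' in lines and '{"b": 2}' in lines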


def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize

    Converts a nested dict into a flat dict ("record"); unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix: the prefix, optional, default: ""
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    level: int, optional, default: 0
        The number of levels in the json string.
    max_level: int, optional, default: None
        The max depth to normalize.

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True
    new_ds = []
    for d in ds:
        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # each key gets renamed with prefix
            if not isinstance(k, str):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + sep + k

            # flatten if type is dict and
            # current dict level < maximum level provided and
            # only dicts get recurse-flattened
            # only at level>1 do we rename the rest of the keys
            if not isinstance(v, dict) or (
                max_level is not None and level >= max_level
            ):
                if level != 0:  # so we skip copying for top level, common case
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue

            v = new_d.pop(k)
            new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
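

# A minimal sketch of the max_level behaviour documented above (the
# _example_* helper is illustrative only): with max_level=1 a single level of
# nesting is flattened and deeper dicts are kept as values; the expected
# output assumes the default sep=".".
def _example_nested_to_record_max_level() -> None:
    nested = {"flat1": 1, "nested": {"e": {"c": 1, "d": 2}, "d": 2}}
    assert nested_to_record(nested, max_level=1) == {
        "flat1": 1,
        "nested.e": {"c": 1, "d": 2},
        "nested.d": 2,
    }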


def _normalise_json(
    data: Any,
    key_string: str,
    normalized_dict: dict[str, Any],
    separator: str,
) -> dict[str, Any]:
    """
    Main recursive function
    Designed for the most basic use case of pd.json_normalize(data)
    intended as a performance improvement, see #15621

    Parameters
    ----------
    data : Any
        Type dependent on types contained within nested Json
    key_string : str
        New key (with separator(s) in) for data
    normalized_dict : dict
        The new normalized/flattened Json dict
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    """
    if isinstance(data, dict):
        for key, value in data.items():
            new_key = f"{key_string}{separator}{key}"

            if not key_string:
                if sys.version_info < (3, 9):
                    from pandas.util._str_methods import removeprefix

                    new_key = removeprefix(new_key, separator)
                else:
                    new_key = new_key.removeprefix(separator)

            _normalise_json(
                data=value,
                key_string=new_key,
                normalized_dict=normalized_dict,
                separator=separator,
            )
    else:
        normalized_dict[key_string] = data
    return normalized_dict
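

# A minimal sketch of the recursion above (the _example_* helper is
# illustrative only): the caller threads an accumulator dict through, nested
# keys are joined with the separator, and the leading separator is stripped
# because key_string starts out empty.
def _example_normalise_json() -> None:
    out = _normalise_json(
        data={"foo": {"bar": 0}, "baz": 1},
        key_string="",
        normalized_dict={},
        separator=".",
    )
    assert out == {"foo.bar": 0, "baz": 1}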


def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
    nested_dict_ = _normalise_json(
        data={k: v for k, v in data.items() if isinstance(v, dict)},
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    return {**top_dict_, **nested_dict_}
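

# A minimal sketch of the ordering contract above (the _example_* helper is
# illustrative only): scalar top-level keys come first, followed by the
# flattened nested keys.
def _example_normalise_json_ordered() -> None:
    out = _normalise_json_ordered(data={"a": {"b": 1}, "flat": 0}, separator=".")
    assert list(out) == ["flat", "a.b"]
    assert out == {"flat": 0, "a.b": 1}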


def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    An optimized, basic json_normalize

    Converts a nested dict into a flat dict ("record"); unlike
    json_normalize and nested_to_record it doesn't do anything clever,
    but for the most basic use cases it enhances performance,
    e.g. pd.json_normalize(data).

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `ds`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    normalised_json_object = {}
    # expect a dictionary, as most jsons are. However, lists are perfectly valid
    if isinstance(ds, dict):
        normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
    elif isinstance(ds, list):
        normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
        return normalised_json_list
    return normalised_json_object
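

# A minimal sketch of the list branch above (the _example_* helper is
# illustrative only): each row is flattened independently, which is what the
# json_normalize fast path below relies on.
def _example_simple_json_normalize_list() -> None:
    rows = [{"a": {"b": 1}}, {"a": {"b": 2}}]
    assert _simple_json_normalize(rows) == [{"a.b": 1}, {"a.b": 2}]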


def json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If not None, prefix records with dotted path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If not None, prefix records with dotted path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels (depth of dict) to normalize.
        If None, normalizes all levels.

    Returns
    -------
    frame : DataFrame
        Normalized data, with the semi-structured JSON flattened into columns.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """Internal function to pull field"""
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            if errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull the field for records; similar to
        _pull_field, but the result is required to be a list.  Raises an
        error if the value at the path is not a list (and not null).
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result
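
    # For example, with the default errors="raise":
    #   _pull_field({"a": {"b": 1}}, ["a", "b"]) returns 1, and
    #   _pull_records({"counties": [{"name": "Dade"}]}, "counties") returns
    #   [{"name": "Dade"}]; a missing meta key raises KeyError, or becomes
    #   np.nan when errors="ignore".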

    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.json_normalize(data) or pd.json_normalize(data, sep=sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:2}
            #
            # TODO: handle record values which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]
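    # For example, meta=["state", ["info", "governor"]] gives
    # _meta = [["state"], ["info", "governor"]] and, with the default sep=".",
    # meta_keys = ["state", "info.governor"].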

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        result[k] = np.array(v, dtype=object).repeat(lengths)

    return result
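

# A minimal end-to-end sketch of the record_path/meta machinery above (the
# _example_* helper and the "county." prefix are illustrative only); the data
# mirrors the "counties" example in the json_normalize docstring.
def _example_json_normalize_records() -> DataFrame:
    data = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
            ],
        }
    ]
    # One row per county; the state-level meta fields are repeated per record.
    result = json_normalize(
        data,
        record_path="counties",
        meta=["state", "shortname", ["info", "governor"]],
        record_prefix="county.",
    )
    assert list(result.columns) == [
        "county.name",
        "county.population",
        "state",
        "shortname",
        "info.governor",
    ]
    return result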