- # ---------------------------------------------------------------------
- # JSON normalization routines
- from __future__ import annotations
- from collections import (
- abc,
- defaultdict,
- )
- import copy
- import sys
- from typing import (
- Any,
- DefaultDict,
- Iterable,
- )
- import numpy as np
- from pandas._libs.writers import convert_json_to_lines
- from pandas._typing import (
- IgnoreRaise,
- Scalar,
- )
- import pandas as pd
- from pandas import DataFrame
def convert_to_line_delimits(s: str) -> str:
    """
    Convert a JSON *array* string to line-delimited JSON.

    Only a top-level JSON list (``"[...]"``) can be split into lines; any
    other JSON value (object, scalar, or an empty string) is returned
    unchanged.

    Parameters
    ----------
    s : str
        Serialized JSON text.

    Returns
    -------
    str
        Line-delimited JSON if *s* was a JSON list, otherwise *s* itself.
    """
    # Guard the empty string before indexing s[0]/s[-1].
    if not s:
        return s
    # BUG FIX: the original check `not s[0] == "[" and s[-1] == "]"` parsed as
    # `(not s[0] == "[") and (s[-1] == "]")`, so e.g. '{"a": 1}' was NOT
    # returned early and was handed to convert_json_to_lines. The intent is:
    # bail out unless the string is bracketed like a JSON list.
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    # Strip the surrounding brackets; the C helper splits the elements.
    s = s[1:-1]

    return convert_json_to_lines(s)
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize.

    Converts a nested dict into a flat dict ("record"); unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : str, default ""
        Accumulated key path of the enclosing dicts (used by recursion).
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    level : int, default 0
        Current recursion depth; 0 means top level.
    max_level : int, optional, default: None
        The max depth to normalize; None normalizes all levels.

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True
    new_ds = []
    for d in ds:
        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # Each key gets renamed with the accumulated prefix. Keep the
            # ORIGINAL key `k` around: `new_d` was deep-copied from `d`, so
            # pops below must use `k`, not its stringified form.
            key = k if isinstance(k, str) else str(k)
            if level == 0:
                newkey = key
            else:
                newkey = prefix + sep + key

            # flatten if type is dict and
            # current dict level < maximum level provided and
            # only dicts gets recurse-flattened
            # only at level>1 do we rename the rest of the keys
            if not isinstance(v, dict) or (
                max_level is not None and level >= max_level
            ):
                if level != 0:  # so we skip copying for top level, common case
                    # BUG FIX: pop by the original key; popping the
                    # stringified key raised KeyError for non-str keys.
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue

            v = new_d.pop(k)
            new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
- def _normalise_json(
- data: Any,
- key_string: str,
- normalized_dict: dict[str, Any],
- separator: str,
- ) -> dict[str, Any]:
- """
- Main recursive function
- Designed for the most basic use case of pd.json_normalize(data)
- intended as a performance improvement, see #15621
- Parameters
- ----------
- data : Any
- Type dependent on types contained within nested Json
- key_string : str
- New key (with separator(s) in) for data
- normalized_dict : dict
- The new normalized/flattened Json dict
- separator : str, default '.'
- Nested records will generate names separated by sep,
- e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
- """
- if isinstance(data, dict):
- for key, value in data.items():
- new_key = f"{key_string}{separator}{key}"
- if not key_string:
- if sys.version_info < (3, 9):
- from pandas.util._str_methods import removeprefix
- new_key = removeprefix(new_key, separator)
- else:
- new_key = new_key.removeprefix(separator)
- _normalise_json(
- data=value,
- key_string=new_key,
- normalized_dict=normalized_dict,
- separator=separator,
- )
- else:
- normalized_dict[key_string] = data
- return normalized_dict
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth.

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Split the top level once: scalar-valued keys stay as-is, dict-valued
    # keys are flattened recursively.
    top_level: dict[str, Any] = {}
    nested_input: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested_input[key] = value
        else:
            top_level[key] = value

    flattened = _normalise_json(
        data=nested_input,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # Top-level scalars first, flattened nested entries after.
    return {**top_level, **flattened}
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    A optimized basic json_normalize.

    Converts a nested dict into a flat dict ("record"); unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    frame : DataFrame
    d - dict or list of dicts, matching `normalised_json_object`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    # Most JSON payloads are a single object; a list of objects is handled
    # element by element.
    if isinstance(ds, dict):
        return _normalise_json_ordered(data=ds, separator=sep)
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]
    # Any other input yields an empty record.
    return {}
def json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
        Prefix.0
    0          1
    1          2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """
        Internal function to pull field.

        Walks `spec` (a single key or a list of keys forming a path) into
        `js`. On a missing key: raises for record extraction, otherwise
        honors the enclosing `errors` setting (np.nan when 'ignore').
        """
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        # A None along the path means the key chain is broken;
                        # surface it as a missing key.
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            if errors == "ignore":
                # Caller opted into best-effort meta extraction.
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result

    # Coerce the input to a list of dicts (records).
    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        # NOTE(review): the generator below yields whole LISTS, and any
        # non-empty list is truthy — so this fires for any record with at
        # least one value, not only when a dict value is present. Looks
        # harmless because nested_to_record is idempotent for flat records,
        # but confirm the intent before tightening it.
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Normalize every meta entry to a list-of-keys path.
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    # meta_vals maps the joined meta key to one value per visited object;
    # values are repeated per `lengths` when attached to the result.
    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        """Descend `path` into `data`, collecting records and meta values."""
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            # Still descending: remember meta fields that live at this level,
            # then recurse into the next path component.
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            # Final path component: pull the record list from each object.
            for obj in data:
                recs = _pull_records(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        # Meta value was captured at a shallower level.
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # Repeat each meta value once per record extracted from its object;
        # object dtype avoids premature numeric coercion.
        result[k] = np.array(v, dtype=object).repeat(lengths)

    return result