_table_schema.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. """
  2. Table Schema builders
  3. https://specs.frictionlessdata.io/table-schema/
  4. """
  5. from __future__ import annotations
  6. from typing import (
  7. TYPE_CHECKING,
  8. Any,
  9. cast,
  10. )
  11. import warnings
  12. from pandas._libs.json import loads
  13. from pandas._libs.tslibs import timezones
  14. from pandas._typing import (
  15. DtypeObj,
  16. JSONSerializable,
  17. )
  18. from pandas.util._exceptions import find_stack_level
  19. from pandas.core.dtypes.base import _registry as registry
  20. from pandas.core.dtypes.common import (
  21. is_bool_dtype,
  22. is_categorical_dtype,
  23. is_datetime64_dtype,
  24. is_datetime64tz_dtype,
  25. is_extension_array_dtype,
  26. is_integer_dtype,
  27. is_numeric_dtype,
  28. is_period_dtype,
  29. is_string_dtype,
  30. is_timedelta64_dtype,
  31. )
  32. from pandas.core.dtypes.dtypes import CategoricalDtype
  33. from pandas import DataFrame
  34. import pandas.core.common as com
  35. if TYPE_CHECKING:
  36. from pandas import Series
  37. from pandas.core.indexes.multi import MultiIndex
# Version string that build_table_schema writes under the "pandas_version"
# key of the schemas it generates (when called with ``version=True``).
TABLE_SCHEMA_VERSION = "1.4.0"
  39. def as_json_table_type(x: DtypeObj) -> str:
  40. """
  41. Convert a NumPy / pandas type to its corresponding json_table.
  42. Parameters
  43. ----------
  44. x : np.dtype or ExtensionDtype
  45. Returns
  46. -------
  47. str
  48. the Table Schema data types
  49. Notes
  50. -----
  51. This table shows the relationship between NumPy / pandas dtypes,
  52. and Table Schema dtypes.
  53. ============== =================
  54. Pandas type Table Schema type
  55. ============== =================
  56. int64 integer
  57. float64 number
  58. bool boolean
  59. datetime64[ns] datetime
  60. timedelta64[ns] duration
  61. object str
  62. categorical any
  63. =============== =================
  64. """
  65. if is_integer_dtype(x):
  66. return "integer"
  67. elif is_bool_dtype(x):
  68. return "boolean"
  69. elif is_numeric_dtype(x):
  70. return "number"
  71. elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
  72. return "datetime"
  73. elif is_timedelta64_dtype(x):
  74. return "duration"
  75. elif is_categorical_dtype(x):
  76. return "any"
  77. elif is_extension_array_dtype(x):
  78. return "any"
  79. elif is_string_dtype(x):
  80. return "string"
  81. else:
  82. return "any"
  83. def set_default_names(data):
  84. """Sets index names to 'index' for regular, or 'level_x' for Multi"""
  85. if com.all_not_none(*data.index.names):
  86. nms = data.index.names
  87. if len(nms) == 1 and data.index.name == "index":
  88. warnings.warn(
  89. "Index name of 'index' is not round-trippable.",
  90. stacklevel=find_stack_level(),
  91. )
  92. elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
  93. warnings.warn(
  94. "Index names beginning with 'level_' are not round-trippable.",
  95. stacklevel=find_stack_level(),
  96. )
  97. return data
  98. data = data.copy()
  99. if data.index.nlevels > 1:
  100. data.index.names = com.fill_missing_names(data.index.names)
  101. else:
  102. data.index.name = data.index.name or "index"
  103. return data
  104. def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
  105. dtype = arr.dtype
  106. name: JSONSerializable
  107. if arr.name is None:
  108. name = "values"
  109. else:
  110. name = arr.name
  111. field: dict[str, JSONSerializable] = {
  112. "name": name,
  113. "type": as_json_table_type(dtype),
  114. }
  115. if is_categorical_dtype(dtype):
  116. cats = dtype.categories
  117. ordered = dtype.ordered
  118. field["constraints"] = {"enum": list(cats)}
  119. field["ordered"] = ordered
  120. elif is_period_dtype(dtype):
  121. field["freq"] = dtype.freq.freqstr
  122. elif is_datetime64tz_dtype(dtype):
  123. if timezones.is_utc(dtype.tz):
  124. # timezone.utc has no "zone" attr
  125. field["tz"] = "UTC"
  126. else:
  127. field["tz"] = dtype.tz.zone
  128. elif is_extension_array_dtype(dtype):
  129. field["extDtype"] = dtype.name
  130. return field
  131. def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
  132. """
  133. Converts a JSON field descriptor into its corresponding NumPy / pandas type
  134. Parameters
  135. ----------
  136. field
  137. A JSON field descriptor
  138. Returns
  139. -------
  140. dtype
  141. Raises
  142. ------
  143. ValueError
  144. If the type of the provided field is unknown or currently unsupported
  145. Examples
  146. --------
  147. >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
  148. 'int64'
  149. >>> convert_json_field_to_pandas_type(
  150. ... {
  151. ... "name": "a_categorical",
  152. ... "type": "any",
  153. ... "constraints": {"enum": ["a", "b", "c"]},
  154. ... "ordered": True,
  155. ... }
  156. ... )
  157. CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)
  158. >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
  159. 'datetime64[ns]'
  160. >>> convert_json_field_to_pandas_type(
  161. ... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
  162. ... )
  163. 'datetime64[ns, US/Central]'
  164. """
  165. typ = field["type"]
  166. if typ == "string":
  167. return "object"
  168. elif typ == "integer":
  169. return field.get("extDtype", "int64")
  170. elif typ == "number":
  171. return field.get("extDtype", "float64")
  172. elif typ == "boolean":
  173. return field.get("extDtype", "bool")
  174. elif typ == "duration":
  175. return "timedelta64"
  176. elif typ == "datetime":
  177. if field.get("tz"):
  178. return f"datetime64[ns, {field['tz']}]"
  179. elif field.get("freq"):
  180. # GH#47747 using datetime over period to minimize the change surface
  181. return f"period[{field['freq']}]"
  182. else:
  183. return "datetime64[ns]"
  184. elif typ == "any":
  185. if "constraints" in field and "ordered" in field:
  186. return CategoricalDtype(
  187. categories=field["constraints"]["enum"], ordered=field["ordered"]
  188. )
  189. elif "extDtype" in field:
  190. return registry.find(field["extDtype"])
  191. else:
  192. return "object"
  193. raise ValueError(f"Unsupported or invalid field type: {typ}")
  194. def build_table_schema(
  195. data: DataFrame | Series,
  196. index: bool = True,
  197. primary_key: bool | None = None,
  198. version: bool = True,
  199. ) -> dict[str, JSONSerializable]:
  200. """
  201. Create a Table schema from ``data``.
  202. Parameters
  203. ----------
  204. data : Series, DataFrame
  205. index : bool, default True
  206. Whether to include ``data.index`` in the schema.
  207. primary_key : bool or None, default True
  208. Column names to designate as the primary key.
  209. The default `None` will set `'primaryKey'` to the index
  210. level or levels if the index is unique.
  211. version : bool, default True
  212. Whether to include a field `pandas_version` with the version
  213. of pandas that last revised the table schema. This version
  214. can be different from the installed pandas version.
  215. Returns
  216. -------
  217. dict
  218. Notes
  219. -----
  220. See `Table Schema
  221. <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
  222. conversion types.
  223. Timedeltas as converted to ISO8601 duration format with
  224. 9 decimal places after the seconds field for nanosecond precision.
  225. Categoricals are converted to the `any` dtype, and use the `enum` field
  226. constraint to list the allowed values. The `ordered` attribute is included
  227. in an `ordered` field.
  228. Examples
  229. --------
  230. >>> from pandas.io.json._table_schema import build_table_schema
  231. >>> df = pd.DataFrame(
  232. ... {'A': [1, 2, 3],
  233. ... 'B': ['a', 'b', 'c'],
  234. ... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
  235. ... }, index=pd.Index(range(3), name='idx'))
  236. >>> build_table_schema(df)
  237. {'fields': \
  238. [{'name': 'idx', 'type': 'integer'}, \
  239. {'name': 'A', 'type': 'integer'}, \
  240. {'name': 'B', 'type': 'string'}, \
  241. {'name': 'C', 'type': 'datetime'}], \
  242. 'primaryKey': ['idx'], \
  243. 'pandas_version': '1.4.0'}
  244. """
  245. if index is True:
  246. data = set_default_names(data)
  247. schema: dict[str, Any] = {}
  248. fields = []
  249. if index:
  250. if data.index.nlevels > 1:
  251. data.index = cast("MultiIndex", data.index)
  252. for level, name in zip(data.index.levels, data.index.names):
  253. new_field = convert_pandas_type_to_json_field(level)
  254. new_field["name"] = name
  255. fields.append(new_field)
  256. else:
  257. fields.append(convert_pandas_type_to_json_field(data.index))
  258. if data.ndim > 1:
  259. for column, s in data.items():
  260. fields.append(convert_pandas_type_to_json_field(s))
  261. else:
  262. fields.append(convert_pandas_type_to_json_field(data))
  263. schema["fields"] = fields
  264. if index and data.index.is_unique and primary_key is None:
  265. if data.index.nlevels == 1:
  266. schema["primaryKey"] = [data.index.name]
  267. else:
  268. schema["primaryKey"] = data.index.names
  269. elif primary_key is not None:
  270. schema["primaryKey"] = primary_key
  271. if version:
  272. schema["pandas_version"] = TABLE_SCHEMA_VERSION
  273. return schema
  274. def parse_table_schema(json, precise_float):
  275. """
  276. Builds a DataFrame from a given schema
  277. Parameters
  278. ----------
  279. json :
  280. A JSON table schema
  281. precise_float : bool
  282. Flag controlling precision when decoding string to double values, as
  283. dictated by ``read_json``
  284. Returns
  285. -------
  286. df : DataFrame
  287. Raises
  288. ------
  289. NotImplementedError
  290. If the JSON table schema contains either timezone or timedelta data
  291. Notes
  292. -----
  293. Because :func:`DataFrame.to_json` uses the string 'index' to denote a
  294. name-less :class:`Index`, this function sets the name of the returned
  295. :class:`DataFrame` to ``None`` when said string is encountered with a
  296. normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
  297. applies to any strings beginning with 'level_'. Therefore, an
  298. :class:`Index` name of 'index' and :class:`MultiIndex` names starting
  299. with 'level_' are not supported.
  300. See Also
  301. --------
  302. build_table_schema : Inverse function.
  303. pandas.read_json
  304. """
  305. table = loads(json, precise_float=precise_float)
  306. col_order = [field["name"] for field in table["schema"]["fields"]]
  307. df = DataFrame(table["data"], columns=col_order)[col_order]
  308. dtypes = {
  309. field["name"]: convert_json_field_to_pandas_type(field)
  310. for field in table["schema"]["fields"]
  311. }
  312. # No ISO constructor for Timedelta as of yet, so need to raise
  313. if "timedelta64" in dtypes.values():
  314. raise NotImplementedError(
  315. 'table="orient" can not yet read ISO-formatted Timedelta data'
  316. )
  317. df = df.astype(dtypes)
  318. if "primaryKey" in table["schema"]:
  319. df = df.set_index(table["schema"]["primaryKey"])
  320. if len(df.index.names) == 1:
  321. if df.index.name == "index":
  322. df.index.name = None
  323. else:
  324. df.index.names = [
  325. None if x.startswith("level_") else x for x in df.index.names
  326. ]
  327. return df