gbq.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. """ Google BigQuery support """
  2. from __future__ import annotations
  3. from typing import (
  4. TYPE_CHECKING,
  5. Any,
  6. )
  7. from pandas.compat._optional import import_optional_dependency
  8. if TYPE_CHECKING:
  9. from pandas import DataFrame
  10. def _try_import():
  11. # since pandas is a dependency of pandas-gbq
  12. # we need to import on first use
  13. msg = (
  14. "pandas-gbq is required to load data from Google BigQuery. "
  15. "See the docs: https://pandas-gbq.readthedocs.io."
  16. )
  17. pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
  18. return pandas_gbq
  19. def read_gbq(
  20. query: str,
  21. project_id: str | None = None,
  22. index_col: str | None = None,
  23. col_order: list[str] | None = None,
  24. reauth: bool = False,
  25. auth_local_webserver: bool = True,
  26. dialect: str | None = None,
  27. location: str | None = None,
  28. configuration: dict[str, Any] | None = None,
  29. credentials=None,
  30. use_bqstorage_api: bool | None = None,
  31. max_results: int | None = None,
  32. progress_bar_type: str | None = None,
  33. ) -> DataFrame:
  34. """
  35. Load data from Google BigQuery.
  36. This function requires the `pandas-gbq package
  37. <https://pandas-gbq.readthedocs.io>`__.
  38. See the `How to authenticate with Google BigQuery
  39. <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
  40. guide for authentication instructions.
  41. Parameters
  42. ----------
  43. query : str
  44. SQL-Like Query to return data values.
  45. project_id : str, optional
  46. Google BigQuery Account project ID. Optional when available from
  47. the environment.
  48. index_col : str, optional
  49. Name of result column to use for index in results DataFrame.
  50. col_order : list(str), optional
  51. List of BigQuery column names in the desired order for results
  52. DataFrame.
  53. reauth : bool, default False
  54. Force Google BigQuery to re-authenticate the user. This is useful
  55. if multiple accounts are used.
  56. auth_local_webserver : bool, default True
  57. Use the `local webserver flow`_ instead of the `console flow`_
  58. when getting user credentials.
  59. .. _local webserver flow:
  60. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
  61. .. _console flow:
  62. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
  63. *New in version 0.2.0 of pandas-gbq*.
  64. .. versionchanged:: 1.5.0
  65. Default value is changed to ``True``. Google has deprecated the
  66. ``auth_local_webserver = False`` `"out of band" (copy-paste)
  67. flow
  68. <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
  69. dialect : str, default 'legacy'
  70. Note: The default value is changing to 'standard' in a future version.
  71. SQL syntax dialect to use. Value can be one of:
  72. ``'legacy'``
  73. Use BigQuery's legacy SQL dialect. For more information see
  74. `BigQuery Legacy SQL Reference
  75. <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
  76. ``'standard'``
  77. Use BigQuery's standard SQL, which is
  78. compliant with the SQL 2011 standard. For more information
  79. see `BigQuery Standard SQL Reference
  80. <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
  81. location : str, optional
  82. Location where the query job should run. See the `BigQuery locations
  83. documentation
  84. <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
  85. list of available locations. The location must match that of any
  86. datasets used in the query.
  87. *New in version 0.5.0 of pandas-gbq*.
  88. configuration : dict, optional
  89. Query config parameters for job processing.
  90. For example:
  91. configuration = {'query': {'useQueryCache': False}}
  92. For more information see `BigQuery REST API Reference
  93. <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
  94. credentials : google.auth.credentials.Credentials, optional
  95. Credentials for accessing Google APIs. Use this parameter to override
  96. default credentials, such as to use Compute Engine
  97. :class:`google.auth.compute_engine.Credentials` or Service Account
  98. :class:`google.oauth2.service_account.Credentials` directly.
  99. *New in version 0.8.0 of pandas-gbq*.
  100. use_bqstorage_api : bool, default False
  101. Use the `BigQuery Storage API
  102. <https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
  103. download query results quickly, but at an increased cost. To use this
  104. API, first `enable it in the Cloud Console
  105. <https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
  106. You must also have the `bigquery.readsessions.create
  107. <https://cloud.google.com/bigquery/docs/access-control#roles>`__
  108. permission on the project you are billing queries to.
  109. This feature requires version 0.10.0 or later of the ``pandas-gbq``
  110. package. It also requires the ``google-cloud-bigquery-storage`` and
  111. ``fastavro`` packages.
  112. max_results : int, optional
  113. If set, limit the maximum number of rows to fetch from the query
  114. results.
  115. *New in version 0.12.0 of pandas-gbq*.
  116. .. versionadded:: 1.1.0
  117. progress_bar_type : Optional, str
  118. If set, use the `tqdm <https://tqdm.github.io/>`__ library to
  119. display a progress bar while the data downloads. Install the
  120. ``tqdm`` package to use this feature.
  121. Possible values of ``progress_bar_type`` include:
  122. ``None``
  123. No progress bar.
  124. ``'tqdm'``
  125. Use the :func:`tqdm.tqdm` function to print a progress bar
  126. to :data:`sys.stderr`.
  127. ``'tqdm_notebook'``
  128. Use the :func:`tqdm.tqdm_notebook` function to display a
  129. progress bar as a Jupyter notebook widget.
  130. ``'tqdm_gui'``
  131. Use the :func:`tqdm.tqdm_gui` function to display a
  132. progress bar as a graphical dialog box.
  133. Note that this feature requires version 0.12.0 or later of the
  134. ``pandas-gbq`` package. And it requires the ``tqdm`` package. Slightly
  135. different than ``pandas-gbq``, here the default is ``None``.
  136. Returns
  137. -------
  138. df: DataFrame
  139. DataFrame representing results of query.
  140. See Also
  141. --------
  142. pandas_gbq.read_gbq : This function in the pandas-gbq library.
  143. DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
  144. """
  145. pandas_gbq = _try_import()
  146. kwargs: dict[str, str | bool | int | None] = {}
  147. # START: new kwargs. Don't populate unless explicitly set.
  148. if use_bqstorage_api is not None:
  149. kwargs["use_bqstorage_api"] = use_bqstorage_api
  150. if max_results is not None:
  151. kwargs["max_results"] = max_results
  152. kwargs["progress_bar_type"] = progress_bar_type
  153. # END: new kwargs
  154. return pandas_gbq.read_gbq(
  155. query,
  156. project_id=project_id,
  157. index_col=index_col,
  158. col_order=col_order,
  159. reauth=reauth,
  160. auth_local_webserver=auth_local_webserver,
  161. dialect=dialect,
  162. location=location,
  163. configuration=configuration,
  164. credentials=credentials,
  165. **kwargs,
  166. )
  167. def to_gbq(
  168. dataframe: DataFrame,
  169. destination_table: str,
  170. project_id: str | None = None,
  171. chunksize: int | None = None,
  172. reauth: bool = False,
  173. if_exists: str = "fail",
  174. auth_local_webserver: bool = True,
  175. table_schema: list[dict[str, str]] | None = None,
  176. location: str | None = None,
  177. progress_bar: bool = True,
  178. credentials=None,
  179. ) -> None:
  180. pandas_gbq = _try_import()
  181. pandas_gbq.to_gbq(
  182. dataframe,
  183. destination_table,
  184. project_id=project_id,
  185. chunksize=chunksize,
  186. reauth=reauth,
  187. if_exists=if_exists,
  188. auth_local_webserver=auth_local_webserver,
  189. table_schema=table_schema,
  190. location=location,
  191. progress_bar=progress_bar,
  192. credentials=credentials,
  193. )