From 9f5e257c042a04b69ee3d6de1e9f63c6f67c03d3 Mon Sep 17 00:00:00 2001
From: chelsea-lin <124939984+chelsea-lin@users.noreply.github.com>
Date: Thu, 23 Mar 2023 11:17:18 -0700
Subject: [PATCH] feat: add bool, int, float, string dtype to to_dataframe
 (#1529)

---
 google/cloud/bigquery/_pandas_helpers.py |  25 +++--
 google/cloud/bigquery/enums.py           |  14 +++
 google/cloud/bigquery/job/query.py       |  55 ++++++++++-
 google/cloud/bigquery/table.py           |  99 +++++++++++++++++++-
 tests/unit/test_table.py                 | 113 +++++++++++++++++++++++
 5 files changed, 294 insertions(+), 12 deletions(-)

diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index 3d7e7d793..dfd966c64 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -21,6 +21,7 @@ import logging
 import queue
 import warnings
+from typing import Any, Union
 
 from packaging import version
 
@@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
     return pyarrow.schema(arrow_fields)
 
 
-def default_types_mapper(date_as_object: bool = False):
+def default_types_mapper(
+    date_as_object: bool = False,
+    bool_dtype: Union[Any, None] = None,
+    int_dtype: Union[Any, None] = None,
+    float_dtype: Union[Any, None] = None,
+    string_dtype: Union[Any, None] = None,
+):
     """Create a mapping from pyarrow types to pandas types.
 
     This overrides the pandas defaults to use null-safe extension types where
@@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
     """
 
     def types_mapper(arrow_data_type):
-        if pyarrow.types.is_boolean(arrow_data_type):
-            return pandas.BooleanDtype()
+        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
+            return bool_dtype
+
+        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
+            return int_dtype
+
+        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
+            return float_dtype
+
+        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
+            return string_dtype
 
         elif (
             # If date_as_object is True, we know some DATE columns are
@@ -310,9 +326,6 @@
         ):
             return db_dtypes.DateDtype()
 
-        elif pyarrow.types.is_integer(arrow_data_type):
-            return pandas.Int64Dtype()
-
         elif pyarrow.types.is_time(arrow_data_type):
             return db_dtypes.TimeDtype()
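For orientation, the mapper returned by `default_types_mapper` is the kind of
callable that `pyarrow.Table.to_pandas` accepts through its `types_mapper`
argument. A minimal self-contained sketch of that pattern, outside this
library (the column name and dtype choice here are illustrative):

    import pandas
    import pyarrow

    # A types_mapper maps a pyarrow DataType to a pandas ExtensionDtype;
    # returning None defers to pyarrow's default conversion.
    def types_mapper(arrow_data_type):
        if pyarrow.types.is_integer(arrow_data_type):
            return pandas.Int64Dtype()
        return None

    table = pyarrow.table({"n": pyarrow.array([1, None, 3])})
    df = table.to_pandas(types_mapper=types_mapper)
    print(df["n"].dtype)  # Int64 (nullable), not float64 with NaN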
diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py
index 45d43a2a7..e4e3d22fc 100644
--- a/google/cloud/bigquery/enums.py
+++ b/google/cloud/bigquery/enums.py
@@ -77,6 +77,20 @@ class CreateDisposition(object):
     returned in the job result."""
 
 
+class DefaultPandasDTypes(enum.Enum):
+    """Default pandas DataFrame dtypes used when converting BigQuery data.
+    These sentinel values are used instead of None to maintain backward
+    compatibility and to keep this module importable when the pandas
+    package is not available. For more information:
+    https://stackoverflow.com/a/60605919/101923
+    """
+
+    BOOL_DTYPE = object()
+    """Specifies the default bool dtype"""
+
+    INT_DTYPE = object()
+    """Specifies the default integer dtype"""
+
+
 class DestinationFormat(object):
     """The exported file format. The default value is :attr:`CSV`.

diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py
index 02b887b0e..b787ca036 100644
--- a/google/cloud/bigquery/job/query.py
+++ b/google/cloud/bigquery/job/query.py
@@ -28,7 +28,7 @@ from google.cloud.bigquery.dataset import DatasetListItem
 from google.cloud.bigquery.dataset import DatasetReference
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
-from google.cloud.bigquery.enums import KeyResultStatementKind
+from google.cloud.bigquery.enums import KeyResultStatementKind, DefaultPandasDTypes
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery.query import (
@@ -53,6 +53,11 @@ from google.cloud.bigquery.job.base import _JobConfig
 from google.cloud.bigquery.job.base import _JobReference
 
+try:
+    import pandas  # type: ignore
+except ImportError:  # pragma: NO COVER
+    pandas = None
+
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     # Assumption: type checks are only used by library developers and CI environments
     # that have all optional dependencies installed, thus no conditional imports.
@@ -1624,6 +1629,10 @@ def to_dataframe(
         create_bqstorage_client: bool = True,
         max_results: Optional[int] = None,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Return a pandas DataFrame from a QueryJob
@@ -1676,6 +1685,46 @@ def to_dataframe(
 
                 .. versionadded:: 2.24.0
 
+            bool_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to use for the BigQuery Boolean type, instead of the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("bool")``. The BigQuery Boolean
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to use for BigQuery Integer types, instead of the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to use for the BigQuery Float type, instead of the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                the data type will still be ``numpy.dtype("float64")``. The BigQuery
+                Float type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``)
+                to use for the BigQuery String type, instead of the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("object")``. The BigQuery String
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data
@@ -1698,6 +1747,10 @@ def to_dataframe(
             progress_bar_type=progress_bar_type,
             create_bqstorage_client=create_bqstorage_client,
             geography_as_object=geography_as_object,
+            bool_dtype=bool_dtype,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            string_dtype=string_dtype,
         )
 
     # If changing the signature of this method, make sure to apply the same
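With the query.py changes above, a caller can choose the output dtypes at the
API surface. A rough usage sketch, assuming an authenticated client and this
patch applied (the query text and variable names are illustrative):

    from google.cloud import bigquery
    import pandas

    client = bigquery.Client()
    job = client.query("SELECT TRUE AS flag, 1 AS n, 1.5 AS x, 'a' AS s")

    # bool/int default to the nullable extension dtypes; float/string opt in here.
    df = job.to_dataframe(
        float_dtype=pandas.Float64Dtype(),
        string_dtype=pandas.StringDtype(),
    )

    # Passing None opts out of the nullable defaults for bool/int columns.
    df_numpy = job.to_dataframe(bool_dtype=None, int_dtype=None)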
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index a2110a9fb..93b0da67f 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -34,6 +34,11 @@ except ImportError:  # pragma: NO COVER
     pyarrow = None
 
+try:
+    import db_dtypes  # type: ignore
+except ImportError:  # pragma: NO COVER
+    db_dtypes = None
+
 try:
     import geopandas  # type: ignore
 except ImportError:
@@ -55,6 +60,7 @@ import google.cloud._helpers  # type: ignore
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pandas_helpers
+from google.cloud.bigquery.enums import DefaultPandasDTypes
 from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
@@ -88,6 +94,11 @@
 _TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'
 
+_NO_SUPPORTED_DTYPE = (
+    "The dtype cannot be converted to a pandas ExtensionArray "
+    "because the necessary `__from_arrow__` attribute is missing."
+)
+
 
 def _reference_getter(table):
     """A :class:`~google.cloud.bigquery.table.TableReference` pointing to
@@ -1920,6 +1931,10 @@ def to_dataframe(
         progress_bar_type: str = None,
         create_bqstorage_client: bool = True,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Create a pandas DataFrame by loading all pages of a query.
@@ -1958,6 +1973,7 @@ def to_dataframe(
                 progress bar as a graphical dialog box.
 
                 .. versionadded:: 1.11.0
+
             create_bqstorage_client (Optional[bool]):
                 If ``True`` (default), create a BigQuery Storage API client
                 using the default API settings. The BigQuery Storage API
@@ -1975,6 +1991,46 @@ def to_dataframe(
 
                 .. versionadded:: 2.24.0
 
+            bool_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to use for the BigQuery Boolean type, instead of the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("bool")``. The BigQuery Boolean
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to use for BigQuery Integer types, instead of the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to use for the BigQuery Float type, instead of the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                the data type will still be ``numpy.dtype("float64")``. The BigQuery
+                Float type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``)
+                to use for the BigQuery String type, instead of the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("object")``. The BigQuery String
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data
@@ -1987,7 +2043,9 @@ def to_dataframe(
                 the :mod:`google.cloud.bigquery_storage_v1` module is
                 required but cannot be imported. Also if
                 `geography_as_object` is `True`, but the
-                :mod:`shapely` library cannot be imported.
+                :mod:`shapely` library cannot be imported. Also if
+                `bool_dtype`, `int_dtype`, or another dtype parameter
+                is set to an unsupported dtype.
 
         """
         _pandas_helpers.verify_pandas_imports()
@@ -1995,6 +2053,24 @@ def to_dataframe(
         if geography_as_object and shapely is None:
            raise ValueError(_NO_SHAPELY_ERROR)
 
+        if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
+            bool_dtype = pandas.BooleanDtype()
+
+        if int_dtype is DefaultPandasDTypes.INT_DTYPE:
+            int_dtype = pandas.Int64Dtype()
+
+        if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
+            raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
+
+        if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
+            raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)
+
+        if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
+            raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)
+
+        if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
+            raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
+
         if dtypes is None:
             dtypes = {}
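The `hasattr(..., "__from_arrow__")` checks above rely on the pandas
extension-dtype protocol: pyarrow can only hand data to dtypes that implement
`__from_arrow__`. A quick illustrative check of the difference (not part of
the patch):

    import numpy
    import pandas

    # Extension dtypes know how to build themselves from an Arrow array...
    print(hasattr(pandas.Int64Dtype(), "__from_arrow__"))   # True
    # ...while plain NumPy dtypes do not, so the patch rejects them with ValueError.
    print(hasattr(numpy.dtype("int64"), "__from_arrow__"))  # False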
@@ -2019,15 +2095,15 @@ def to_dataframe(
             for col in record_batch
             # Type can be date32 or date64 (plus units).
             # See: https://arrow.apache.org/docs/python/api/datatypes.html
-            if str(col.type).startswith("date")
+            if pyarrow.types.is_date(col.type)
         )
 
         timestamp_as_object = not all(
             self.__can_cast_timestamp_ns(col)
             for col in record_batch
             # Type can be datetime and timestamp (plus units and time zone).
             # See: https://arrow.apache.org/docs/python/api/datatypes.html
-            if str(col.type).startswith("timestamp")
+            if pyarrow.types.is_timestamp(col.type)
         )
 
@@ -2036,7 +2112,11 @@ def to_dataframe(
         if len(record_batch) > 0:
             df = record_batch.to_pandas(
                 date_as_object=date_as_object,
                 timestamp_as_object=timestamp_as_object,
                 integer_object_nulls=True,
                 types_mapper=_pandas_helpers.default_types_mapper(
-                    date_as_object=date_as_object
+                    date_as_object=date_as_object,
+                    bool_dtype=bool_dtype,
+                    int_dtype=int_dtype,
+                    float_dtype=float_dtype,
+                    string_dtype=string_dtype,
                 ),
             )
         else:
@@ -2233,6 +2313,10 @@ def to_dataframe(
         progress_bar_type=None,
         create_bqstorage_client=True,
         geography_as_object=False,
+        bool_dtype=None,
+        int_dtype=None,
+        float_dtype=None,
+        string_dtype=None,
     ) -> "pandas.DataFrame":
         """Create an empty dataframe.
 
@@ -2241,6 +2325,11 @@ def to_dataframe(
             dtypes (Any): Ignored. Added for compatibility with RowIterator.
             progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
             create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
+            geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
+            bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            int_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            float_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            string_dtype (Any): Ignored. Added for compatibility with RowIterator.
 
         Returns:
             pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
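The replacement of `str(col.type).startswith(...)` with `pyarrow.types`
predicates covers every date and timestamp variant without fragile string
matching; an illustrative check:

    import pyarrow

    print(pyarrow.types.is_date(pyarrow.date32()))              # True
    print(pyarrow.types.is_date(pyarrow.date64()))              # True
    print(pyarrow.types.is_timestamp(pyarrow.timestamp("ns")))  # True
    print(pyarrow.types.is_timestamp(pyarrow.date32()))         # False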
SchemaField("complete", "BOOL"), + SchemaField("age", "INTEGER"), + SchemaField("seconds", "INT64"), + SchemaField("miles", "FLOAT64"), + ] + row_data = [ + ["Phred Phlyntstone", "true", "32", "23000", "1.77"], + ["Bharney Rhubble", "false", "33", "454000", "6.66"], + ["Wylma Phlyntstone", "true", "29", "341000", "2.0"], + ] + rows = [{"f": [{"v": field} for field in row]} for row in row_data] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = self._make_one(_mock_client(), api_request, path, schema) + + df = row_iterator.to_dataframe( + create_bqstorage_client=False, + bool_dtype=None, + int_dtype=None, + float_dtype=None, + string_dtype=None, + ) + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(df.complete.dtype.name, "bool") + self.assertEqual(df.age.dtype.name, "int64") + self.assertEqual(df.seconds.dtype.name, "int64") + self.assertEqual(df.miles.dtype.name, "float64") + self.assertEqual(df.name.dtype.name, "object") + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_w_unsupported_dtypes_mapper(self): + import numpy + from google.cloud.bigquery.schema import SchemaField + + schema = [ + SchemaField("name", "STRING"), + ] + row_data = [ + ["Phred Phlyntstone"], + ] + rows = [{"f": [{"v": field} for field in row]} for row in row_data] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = self._make_one(_mock_client(), api_request, path, schema) + + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + bool_dtype=numpy.dtype("bool"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + int_dtype=numpy.dtype("int64"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + float_dtype=numpy.dtype("float64"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + string_dtype=numpy.dtype("object"), + ) + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_column_dtypes(self): from google.cloud.bigquery.schema import SchemaField