From 9f5e257c042a04b69ee3d6de1e9f63c6f67c03d3 Mon Sep 17 00:00:00 2001
From: chelsea-lin <124939984+chelsea-lin@users.noreply.github.com>
Date: Thu, 23 Mar 2023 11:17:18 -0700
Subject: [PATCH] feat: add bool, int, float, string dtype to to_dataframe
 (#1529)

---
 google/cloud/bigquery/_pandas_helpers.py |  25 +++--
 google/cloud/bigquery/enums.py           |  14 +++
 google/cloud/bigquery/job/query.py       |  55 ++++++++++-
 google/cloud/bigquery/table.py           |  99 +++++++++++++++++++-
 tests/unit/test_table.py                 | 113 +++++++++++++++++++++++
 5 files changed, 294 insertions(+), 12 deletions(-)

diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index 3d7e7d793..dfd966c64 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -21,6 +21,7 @@ import logging
 import queue
 import warnings
+from typing import Any, Union
 
 from packaging import version
 
@@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
     return pyarrow.schema(arrow_fields)
 
 
-def default_types_mapper(date_as_object: bool = False):
+def default_types_mapper(
+    date_as_object: bool = False,
+    bool_dtype: Union[Any, None] = None,
+    int_dtype: Union[Any, None] = None,
+    float_dtype: Union[Any, None] = None,
+    string_dtype: Union[Any, None] = None,
+):
     """Create a mapping from pyarrow types to pandas types.
 
     This overrides the pandas defaults to use null-safe extension types where
@@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
     """
 
     def types_mapper(arrow_data_type):
-        if pyarrow.types.is_boolean(arrow_data_type):
-            return pandas.BooleanDtype()
+        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
+            return bool_dtype
+
+        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
+            return int_dtype
+
+        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
+            return float_dtype
+
+        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
+            return string_dtype
 
         elif (
             # If date_as_object is True, we know some DATE columns are
@@ -310,9 +326,6 @@
         ):
             return db_dtypes.DateDtype()
 
-        elif pyarrow.types.is_integer(arrow_data_type):
-            return pandas.Int64Dtype()
-
         elif pyarrow.types.is_time(arrow_data_type):
             return db_dtypes.TimeDtype()
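For orientation, the mapper returned by `default_types_mapper` is the kind of
callable that `pyarrow.Table.to_pandas` accepts through its `types_mapper`
argument. A minimal self-contained sketch of that pattern, outside this
library (the column name and dtype choice here are illustrative):

    import pandas
    import pyarrow

    # A types_mapper maps a pyarrow DataType to a pandas ExtensionDtype;
    # returning None defers to pyarrow's default conversion.
    def types_mapper(arrow_data_type):
        if pyarrow.types.is_integer(arrow_data_type):
            return pandas.Int64Dtype()
        return None

    table = pyarrow.table({"n": pyarrow.array([1, None, 3])})
    df = table.to_pandas(types_mapper=types_mapper)
    print(df["n"].dtype)  # Int64 (nullable), not float64 with NaN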
diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py
index 45d43a2a7..e4e3d22fc 100644
--- a/google/cloud/bigquery/enums.py
+++ b/google/cloud/bigquery/enums.py
@@ -77,6 +77,20 @@ class CreateDisposition(object):
     returned in the job result."""
 
 
+class DefaultPandasDTypes(enum.Enum):
+    """Default pandas DataFrame dtypes used when converting BigQuery data.
+    These sentinel values are used instead of None to maintain backward
+    compatibility and to keep this module importable when the pandas
+    package is not available. For more information:
+    https://stackoverflow.com/a/60605919/101923
+    """
+
+    BOOL_DTYPE = object()
+    """Specifies the default bool dtype"""
+
+    INT_DTYPE = object()
+    """Specifies the default integer dtype"""
+
+
 class DestinationFormat(object):
     """The exported file format. The default value is :attr:`CSV`.

diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py
index 02b887b0e..b787ca036 100644
--- a/google/cloud/bigquery/job/query.py
+++ b/google/cloud/bigquery/job/query.py
@@ -28,7 +28,7 @@ from google.cloud.bigquery.dataset import DatasetListItem
 from google.cloud.bigquery.dataset import DatasetReference
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
-from google.cloud.bigquery.enums import KeyResultStatementKind
+from google.cloud.bigquery.enums import KeyResultStatementKind, DefaultPandasDTypes
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery.query import (
@@ -53,6 +53,11 @@ from google.cloud.bigquery.job.base import _JobConfig
 from google.cloud.bigquery.job.base import _JobReference
 
+try:
+    import pandas  # type: ignore
+except ImportError:  # pragma: NO COVER
+    pandas = None
+
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     # Assumption: type checks are only used by library developers and CI environments
     # that have all optional dependencies installed, thus no conditional imports.
@@ -1624,6 +1629,10 @@ def to_dataframe(
         create_bqstorage_client: bool = True,
         max_results: Optional[int] = None,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Return a pandas DataFrame from a QueryJob
@@ -1676,6 +1685,46 @@ def to_dataframe(
 
                 .. versionadded:: 2.24.0
 
+            bool_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to use for the BigQuery Boolean type, instead of the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("bool")``. The BigQuery Boolean
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to use for BigQuery Integer types, instead of the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to use for the BigQuery Float type, instead of the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                the data type will still be ``numpy.dtype("float64")``. The BigQuery
+                Float type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``)
+                to use for the BigQuery String type, instead of the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("object")``. The BigQuery String
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data
@@ -1698,6 +1747,10 @@ def to_dataframe(
             progress_bar_type=progress_bar_type,
             create_bqstorage_client=create_bqstorage_client,
             geography_as_object=geography_as_object,
+            bool_dtype=bool_dtype,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            string_dtype=string_dtype,
         )
 
     # If changing the signature of this method, make sure to apply the same
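With the query.py changes above, a caller can choose the output dtypes at the
API surface. A rough usage sketch, assuming an authenticated client and this
patch applied (the query text and variable names are illustrative):

    from google.cloud import bigquery
    import pandas

    client = bigquery.Client()
    job = client.query("SELECT TRUE AS flag, 1 AS n, 1.5 AS x, 'a' AS s")

    # bool/int default to the nullable extension dtypes; float/string opt in here.
    df = job.to_dataframe(
        float_dtype=pandas.Float64Dtype(),
        string_dtype=pandas.StringDtype(),
    )

    # Passing None opts out of the nullable defaults for bool/int columns.
    df_numpy = job.to_dataframe(bool_dtype=None, int_dtype=None)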
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index a2110a9fb..93b0da67f 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -34,6 +34,11 @@ except ImportError:  # pragma: NO COVER
     pyarrow = None
 
+try:
+    import db_dtypes  # type: ignore
+except ImportError:  # pragma: NO COVER
+    db_dtypes = None
+
 try:
     import geopandas  # type: ignore
 except ImportError:
@@ -55,6 +60,7 @@ import google.cloud._helpers  # type: ignore
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pandas_helpers
+from google.cloud.bigquery.enums import DefaultPandasDTypes
 from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
@@ -88,6 +94,11 @@
 _TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'
 
+_NO_SUPPORTED_DTYPE = (
+    "The dtype cannot be converted to a pandas ExtensionArray "
+    "because the necessary `__from_arrow__` attribute is missing."
+)
+
 
 def _reference_getter(table):
     """A :class:`~google.cloud.bigquery.table.TableReference` pointing to
@@ -1920,6 +1931,10 @@ def to_dataframe(
         progress_bar_type: str = None,
         create_bqstorage_client: bool = True,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Create a pandas DataFrame by loading all pages of a query.
@@ -1958,6 +1973,7 @@ def to_dataframe(
                 progress bar as a graphical dialog box.
 
                 .. versionadded:: 1.11.0
+
             create_bqstorage_client (Optional[bool]):
                 If ``True`` (default), create a BigQuery Storage API client
                 using the default API settings. The BigQuery Storage API
@@ -1975,6 +1991,46 @@ def to_dataframe(
 
                 .. versionadded:: 2.24.0
 
+            bool_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to use for the BigQuery Boolean type, instead of the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("bool")``. The BigQuery Boolean
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to use for BigQuery Integer types, instead of the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to use for the BigQuery Float type, instead of the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                the data type will still be ``numpy.dtype("float64")``. The BigQuery
+                Float type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype]):
+                If set, indicates a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``)
+                to use for the BigQuery String type, instead of the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                the data type will be ``numpy.dtype("object")``. The BigQuery String
+                type is documented at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data
@@ -1987,7 +2043,9 @@ def to_dataframe(
                 the :mod:`google.cloud.bigquery_storage_v1` module is
                 required but cannot be imported. Also if
                 `geography_as_object` is `True`, but the
-                :mod:`shapely` library cannot be imported.
+                :mod:`shapely` library cannot be imported. Also if
+                `bool_dtype`, `int_dtype`, or another dtype parameter
+                is set to an unsupported dtype.
 
         """
         _pandas_helpers.verify_pandas_imports()
@@ -1995,6 +2053,24 @@ def to_dataframe(
         if geography_as_object and shapely is None:
            raise ValueError(_NO_SHAPELY_ERROR)
 
+        if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
+            bool_dtype = pandas.BooleanDtype()
+
+        if int_dtype is DefaultPandasDTypes.INT_DTYPE:
+            int_dtype = pandas.Int64Dtype()
+
+        if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
+            raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
+
+        if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
+            raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)
+
+        if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
+            raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)
+
+        if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
+            raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
+
         if dtypes is None:
             dtypes = {}
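The `hasattr(..., "__from_arrow__")` checks above rely on the pandas
extension-dtype protocol: pyarrow can only hand data to dtypes that implement
`__from_arrow__`. A quick illustrative check of the difference (not part of
the patch):

    import numpy
    import pandas

    # Extension dtypes know how to build themselves from an Arrow array...
    print(hasattr(pandas.Int64Dtype(), "__from_arrow__"))   # True
    # ...while plain NumPy dtypes do not, so the patch rejects them with ValueError.
    print(hasattr(numpy.dtype("int64"), "__from_arrow__"))  # False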
@@ -2019,15 +2095,15 @@ def to_dataframe(
             for col in record_batch
             # Type can be date32 or date64 (plus units).
             # See: https://arrow.apache.org/docs/python/api/datatypes.html
-            if str(col.type).startswith("date")
+            if pyarrow.types.is_date(col.type)
         )
 
         timestamp_as_object = not all(
             self.__can_cast_timestamp_ns(col)
             for col in record_batch
             # Type can be datetime and timestamp (plus units and time zone).
             # See: https://arrow.apache.org/docs/python/api/datatypes.html
-            if str(col.type).startswith("timestamp")
+            if pyarrow.types.is_timestamp(col.type)
         )
 
@@ -2036,7 +2112,11 @@ def to_dataframe(
         if len(record_batch) > 0:
             df = record_batch.to_pandas(
                 date_as_object=date_as_object,
                 timestamp_as_object=timestamp_as_object,
                 integer_object_nulls=True,
                 types_mapper=_pandas_helpers.default_types_mapper(
-                    date_as_object=date_as_object
+                    date_as_object=date_as_object,
+                    bool_dtype=bool_dtype,
+                    int_dtype=int_dtype,
+                    float_dtype=float_dtype,
+                    string_dtype=string_dtype,
                 ),
             )
         else:
@@ -2233,6 +2313,10 @@ def to_dataframe(
         progress_bar_type=None,
         create_bqstorage_client=True,
         geography_as_object=False,
+        bool_dtype=None,
+        int_dtype=None,
+        float_dtype=None,
+        string_dtype=None,
     ) -> "pandas.DataFrame":
         """Create an empty dataframe.
 
@@ -2241,6 +2325,11 @@ def to_dataframe(
             dtypes (Any): Ignored. Added for compatibility with RowIterator.
             progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
             create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
+            geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
+            bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            int_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            float_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            string_dtype (Any): Ignored. Added for compatibility with RowIterator.
 
         Returns:
             pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
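The replacement of `str(col.type).startswith(...)` with `pyarrow.types`
predicates covers every date and timestamp variant without fragile string
matching; an illustrative check:

    import pyarrow

    print(pyarrow.types.is_date(pyarrow.date32()))              # True
    print(pyarrow.types.is_date(pyarrow.date64()))              # True
    print(pyarrow.types.is_timestamp(pyarrow.timestamp("ns")))  # True
    print(pyarrow.types.is_timestamp(pyarrow.date32()))         # False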
SchemaField("complete", "BOOL"), + SchemaField("age", "INTEGER"), + SchemaField("seconds", "INT64"), + SchemaField("miles", "FLOAT64"), + ] + row_data = [ + ["Phred Phlyntstone", "true", "32", "23000", "1.77"], + ["Bharney Rhubble", "false", "33", "454000", "6.66"], + ["Wylma Phlyntstone", "true", "29", "341000", "2.0"], + ] + rows = [{"f": [{"v": field} for field in row]} for row in row_data] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = self._make_one(_mock_client(), api_request, path, schema) + + df = row_iterator.to_dataframe( + create_bqstorage_client=False, + bool_dtype=None, + int_dtype=None, + float_dtype=None, + string_dtype=None, + ) + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(df.complete.dtype.name, "bool") + self.assertEqual(df.age.dtype.name, "int64") + self.assertEqual(df.seconds.dtype.name, "int64") + self.assertEqual(df.miles.dtype.name, "float64") + self.assertEqual(df.name.dtype.name, "object") + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_w_unsupported_dtypes_mapper(self): + import numpy + from google.cloud.bigquery.schema import SchemaField + + schema = [ + SchemaField("name", "STRING"), + ] + row_data = [ + ["Phred Phlyntstone"], + ] + rows = [{"f": [{"v": field} for field in row]} for row in row_data] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + row_iterator = self._make_one(_mock_client(), api_request, path, schema) + + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + bool_dtype=numpy.dtype("bool"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + int_dtype=numpy.dtype("int64"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + float_dtype=numpy.dtype("float64"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + string_dtype=numpy.dtype("object"), + ) + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_column_dtypes(self): from google.cloud.bigquery.schema import SchemaField