Skip to content

Commit

Permalink
feat: add bool, int, float, string dtype to to_dataframe (#1529)
Browse files Browse the repository at this point in the history
  • Loading branch information
chelsea-lin authored Mar 23, 2023
1 parent a2520ca commit 5e4465d
Show file tree
Hide file tree
Showing 5 changed files with 294 additions and 12 deletions.
25 changes: 19 additions & 6 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import logging
import queue
import warnings
from typing import Any, Union

from packaging import version

Expand Down Expand Up @@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
return pyarrow.schema(arrow_fields)


def default_types_mapper(date_as_object: bool = False):
def default_types_mapper(
date_as_object: bool = False,
bool_dtype: Union[Any, None] = None,
int_dtype: Union[Any, None] = None,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
):
"""Create a mapping from pyarrow types to pandas types.
This overrides the pandas defaults to use null-safe extension types where
Expand All @@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
"""

def types_mapper(arrow_data_type):
if pyarrow.types.is_boolean(arrow_data_type):
return pandas.BooleanDtype()
if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
return bool_dtype

elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
return int_dtype

elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
return float_dtype

elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
return string_dtype

elif (
# If date_as_object is True, we know some DATE columns are
Expand All @@ -310,9 +326,6 @@ def types_mapper(arrow_data_type):
):
return db_dtypes.DateDtype()

elif pyarrow.types.is_integer(arrow_data_type):
return pandas.Int64Dtype()

elif pyarrow.types.is_time(arrow_data_type):
return db_dtypes.TimeDtype()

Expand Down
14 changes: 14 additions & 0 deletions google/cloud/bigquery/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,20 @@ class CreateDisposition(object):
returned in the job result."""


class DefaultPandasDTypes(enum.Enum):
"""Default pandas DataFrame dtypes used to convert BigQuery data. These
sentinel values are used instead of None to maintain backward compatibility,
and so that defaults can be declared even when the pandas package is not
available. For more information:
https://stackoverflow.com/a/60605919/101923
"""

BOOL_DTYPE = object()
"""Specifies default bool dtype"""

INT_DTYPE = object()
"""Specifies default integer dtype"""


class DestinationFormat(object):
"""The exported file format. The default value is :attr:`CSV`.
Expand Down
55 changes: 54 additions & 1 deletion google/cloud/bigquery/job/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from google.cloud.bigquery.dataset import DatasetListItem
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery.enums import KeyResultStatementKind
from google.cloud.bigquery.enums import KeyResultStatementKind, DefaultPandasDTypes
from google.cloud.bigquery.external_config import ExternalConfig
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.query import (
Expand All @@ -53,6 +53,11 @@
from google.cloud.bigquery.job.base import _JobConfig
from google.cloud.bigquery.job.base import _JobReference

try:
import pandas # type: ignore
except ImportError: # pragma: NO COVER
pandas = None

if typing.TYPE_CHECKING: # pragma: NO COVER
# Assumption: type checks are only used by library developers and CI environments
# that have all optional dependencies installed, thus no conditional imports.
Expand Down Expand Up @@ -1620,6 +1625,10 @@ def to_dataframe(
create_bqstorage_client: bool = True,
max_results: Optional[int] = None,
geography_as_object: bool = False,
bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Return a pandas DataFrame from a QueryJob
Expand Down Expand Up @@ -1672,6 +1681,46 @@ def to_dataframe(
.. versionadded:: 2.24.0
bool_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
to convert BigQuery Boolean type, instead of relying on the default
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
.. versionadded:: 3.7.1
int_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
to convert BigQuery Integer types, instead of relying on the default
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
Integer types can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
.. versionadded:: 3.7.1
float_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
to convert BigQuery Float type, instead of relying on the default
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("float64")``. BigQuery Float
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
.. versionadded:: 3.7.1
string_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
convert BigQuery String type, instead of relying on the default
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("object")``. BigQuery String
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
.. versionadded:: 3.7.1
Returns:
pandas.DataFrame:
A :class:`~pandas.DataFrame` populated with row data
Expand All @@ -1694,6 +1743,10 @@ def to_dataframe(
progress_bar_type=progress_bar_type,
create_bqstorage_client=create_bqstorage_client,
geography_as_object=geography_as_object,
bool_dtype=bool_dtype,
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
)

# If changing the signature of this method, make sure to apply the same
Expand Down
99 changes: 94 additions & 5 deletions google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
except ImportError: # pragma: NO COVER
pyarrow = None

try:
import db_dtypes # type: ignore
except ImportError: # pragma: NO COVER
db_dtypes = None

try:
import geopandas # type: ignore
except ImportError:
Expand All @@ -55,6 +60,7 @@
import google.cloud._helpers # type: ignore
from google.cloud.bigquery import _helpers
from google.cloud.bigquery import _pandas_helpers
from google.cloud.bigquery.enums import DefaultPandasDTypes
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
from google.cloud.bigquery.schema import _build_schema_resource
from google.cloud.bigquery.schema import _parse_schema_resource
Expand Down Expand Up @@ -88,6 +94,11 @@

_TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'

_NO_SUPPORTED_DTYPE = (
"The dtype cannot to be converted to a pandas ExtensionArray "
"because the necessary `__from_arrow__` attribute is missing."
)


def _reference_getter(table):
"""A :class:`~google.cloud.bigquery.table.TableReference` pointing to
Expand Down Expand Up @@ -1920,6 +1931,10 @@ def to_dataframe(
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
geography_as_object: bool = False,
bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Create a pandas DataFrame by loading all pages of a query.
Expand Down Expand Up @@ -1958,6 +1973,7 @@ def to_dataframe(
progress bar as a graphical dialog box.
.. versionadded:: 1.11.0
create_bqstorage_client (Optional[bool]):
If ``True`` (default), create a BigQuery Storage API client
using the default API settings. The BigQuery Storage API
Expand All @@ -1975,6 +1991,46 @@ def to_dataframe(
.. versionadded:: 2.24.0
bool_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
to convert BigQuery Boolean type, instead of relying on the default
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
.. versionadded:: 3.7.1
int_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
to convert BigQuery Integer types, instead of relying on the default
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
Integer types can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
.. versionadded:: 3.7.1
float_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
to convert BigQuery Float type, instead of relying on the default
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("float64")``. BigQuery Float
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
.. versionadded:: 3.7.1
string_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
convert BigQuery String type, instead of relying on the default
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
then the data type will be ``numpy.dtype("object")``. BigQuery String
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
.. versionadded:: 3.7.1
Returns:
pandas.DataFrame:
A :class:`~pandas.DataFrame` populated with row data and column
Expand All @@ -1987,14 +2043,34 @@ def to_dataframe(
the :mod:`google.cloud.bigquery_storage_v1` module is
required but cannot be imported. Also if
`geography_as_object` is `True`, but the
:mod:`shapely` library cannot be imported.
:mod:`shapely` library cannot be imported. Also if
`bool_dtype`, `int_dtype`, `float_dtype`, or `string_dtype` is
set to a dtype that lacks the `__from_arrow__` attribute.
"""
_pandas_helpers.verify_pandas_imports()

if geography_as_object and shapely is None:
raise ValueError(_NO_SHAPELY_ERROR)

if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
bool_dtype = pandas.BooleanDtype()

if int_dtype is DefaultPandasDTypes.INT_DTYPE:
int_dtype = pandas.Int64Dtype()

if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)

if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)

if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)

if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)

if dtypes is None:
dtypes = {}

Expand All @@ -2019,15 +2095,15 @@ def to_dataframe(
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("date")
if pyarrow.types.is_date(col.type)
)

timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be timestamp (plus units and time zone).
# Type can be datetime and timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("timestamp")
if pyarrow.types.is_timestamp(col.type)
)

if len(record_batch) > 0:
Expand All @@ -2036,7 +2112,11 @@ def to_dataframe(
timestamp_as_object=timestamp_as_object,
integer_object_nulls=True,
types_mapper=_pandas_helpers.default_types_mapper(
date_as_object=date_as_object
date_as_object=date_as_object,
bool_dtype=bool_dtype,
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
),
)
else:
Expand Down Expand Up @@ -2233,6 +2313,10 @@ def to_dataframe(
progress_bar_type=None,
create_bqstorage_client=True,
geography_as_object=False,
bool_dtype=None,
int_dtype=None,
float_dtype=None,
string_dtype=None,
) -> "pandas.DataFrame":
"""Create an empty dataframe.
Expand All @@ -2241,6 +2325,11 @@ def to_dataframe(
dtypes (Any): Ignored. Added for compatibility with RowIterator.
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
int_dtype (Any): Ignored. Added for compatibility with RowIterator.
float_dtype (Any): Ignored. Added for compatibility with RowIterator.
string_dtype (Any): Ignored. Added for compatibility with RowIterator.
Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
Expand Down
Loading

0 comments on commit 5e4465d

Please sign in to comment.