feat: add bool, int, float, string dtype to to_dataframe #1529

Merged 1 commit on Mar 23, 2023
25 changes: 19 additions & 6 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -21,6 +21,7 @@
import logging
import queue
import warnings
from typing import Any, Union

from packaging import version

@@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
return pyarrow.schema(arrow_fields)


def default_types_mapper(date_as_object: bool = False):
def default_types_mapper(
date_as_object: bool = False,
bool_dtype: Union[Any, None] = None,
int_dtype: Union[Any, None] = None,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
):
"""Create a mapping from pyarrow types to pandas types.

This overrides the pandas defaults to use null-safe extension types where
@@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
"""

def types_mapper(arrow_data_type):
if pyarrow.types.is_boolean(arrow_data_type):
return pandas.BooleanDtype()
if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
return bool_dtype

elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
return int_dtype

elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
return float_dtype

elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
return string_dtype

elif (
# If date_as_object is True, we know some DATE columns are
@@ -310,9 +326,6 @@ def types_mapper(arrow_data_type):
):
return db_dtypes.DateDtype()

elif pyarrow.types.is_integer(arrow_data_type):
return pandas.Int64Dtype()

elif pyarrow.types.is_time(arrow_data_type):
return db_dtypes.TimeDtype()

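The mapper above is handed to pyarrow, which calls it once per column type and falls back to its default conversion whenever the callable returns None. A minimal sketch of that contract, using a hypothetical in-memory table rather than BigQuery results:

import pandas
import pyarrow

# Hypothetical sample data: a nullable boolean and a nullable integer column.
table = pyarrow.table(
    {
        "flag": pyarrow.array([True, None, False]),
        "count": pyarrow.array([1, None, 3], type=pyarrow.int64()),
    }
)

def types_mapper(arrow_data_type):
    # Return an ExtensionDtype to override the conversion for this Arrow
    # type, or None to let pyarrow use its default numpy-based mapping.
    if pyarrow.types.is_boolean(arrow_data_type):
        return pandas.BooleanDtype()
    if pyarrow.types.is_integer(arrow_data_type):
        return pandas.Int64Dtype()
    return None

df = table.to_pandas(types_mapper=types_mapper)
# df.dtypes: flag -> boolean, count -> Int64 (both null-safe extension dtypes)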
14 changes: 14 additions & 0 deletions google/cloud/bigquery/enums.py
@@ -77,6 +77,20 @@ class CreateDisposition(object):
returned in the job result."""


class DefaultPandasDTypes(enum.Enum):
"""Default Pandas DataFrem DTypes to convert BigQuery data. These
Sentinel values are used instead of None to maintain backward compatibility,
and allow Pandas package is not available. For more information:
https://stackoverflow.com/a/60605919/101923
"""

BOOL_DTYPE = object()
"""Specifies default bool dtype"""

INT_DTYPE = object()
"""Specifies default integer dtype"""


class DestinationFormat(object):
"""The exported file format. The default value is :attr:`CSV`.
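The ``object()`` sentinels exist because ``None`` is itself meaningful to ``to_dataframe``: it tells the library to fall back to plain numpy dtypes. A plain ``object()`` has a unique identity, so "argument omitted" stays distinguishable from "caller explicitly passed None". A minimal sketch of the resulting three-way dispatch (``resolve_bool_dtype`` is an illustrative name, not part of this change):

import pandas

_BOOL_SENTINEL = object()  # unique identity; cannot collide with user input

def resolve_bool_dtype(bool_dtype=_BOOL_SENTINEL):
    if bool_dtype is _BOOL_SENTINEL:
        # Argument omitted: keep the historical default.
        return pandas.BooleanDtype()
    # Explicit ``None`` (numpy bool) or a caller-supplied ExtensionDtype
    # passes through unchanged.
    return bool_dtype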
55 changes: 54 additions & 1 deletion google/cloud/bigquery/job/query.py
@@ -28,7 +28,7 @@
from google.cloud.bigquery.dataset import DatasetListItem
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery.enums import KeyResultStatementKind
from google.cloud.bigquery.enums import KeyResultStatementKind, DefaultPandasDTypes
from google.cloud.bigquery.external_config import ExternalConfig
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.query import (
@@ -53,6 +53,11 @@
from google.cloud.bigquery.job.base import _JobConfig
from google.cloud.bigquery.job.base import _JobReference

try:
import pandas # type: ignore
except ImportError: # pragma: NO COVER
pandas = None

if typing.TYPE_CHECKING: # pragma: NO COVER
# Assumption: type checks are only used by library developers and CI environments
# that have all optional dependencies installed, thus no conditional imports.
@@ -1620,6 +1625,10 @@ def to_dataframe(
create_bqstorage_client: bool = True,
max_results: Optional[int] = None,
geography_as_object: bool = False,
bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Return a pandas DataFrame from a QueryJob
@@ -1672,6 +1681,46 @@
.. versionadded:: 2.24.0
bool_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
used to convert BigQuery Boolean columns, instead of relying on the default
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("bool")``. The BigQuery Boolean
type is documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
.. versionadded:: 3.7.1
int_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
used to convert BigQuery Integer columns, instead of relying on the default
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("int64")``. The BigQuery Integer
types are documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
.. versionadded:: 3.7.1
float_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
used to convert BigQuery Float columns, instead of relying on the default
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("float64")``. The BigQuery Float
type is documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
.. versionadded:: 3.7.1
string_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``)
used to convert BigQuery String columns, instead of relying on the default
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("object")``. The BigQuery String
type is documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
.. versionadded:: 3.7.1
Returns:
pandas.DataFrame:
A :class:`~pandas.DataFrame` populated with row data
@@ -1694,6 +1743,10 @@ def to_dataframe(
progress_bar_type=progress_bar_type,
create_bqstorage_client=create_bqstorage_client,
geography_as_object=geography_as_object,
bool_dtype=bool_dtype,
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
)

# If changing the signature of this method, make sure to apply the same
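Taken together, the new parameters support three modes per scalar family: omit the argument for the library defaults, pass ``None`` for plain numpy dtypes, or pass any Arrow-aware ExtensionDtype. A usage sketch (the query is placeholder data, and the nullable float and string dtypes assume a reasonably recent pandas):

import pandas
from google.cloud import bigquery

client = bigquery.Client()
job = client.query("SELECT TRUE AS flag, 1 AS n, 1.5 AS x, 'a' AS s")

# Defaults: BOOL -> pandas.BooleanDtype(), INT64 -> pandas.Int64Dtype().
df_default = job.to_dataframe()

# Explicit None opts out of the nullable extension types; only safe when
# the columns contain no NULLs.
df_numpy = job.to_dataframe(bool_dtype=None, int_dtype=None)

# Custom null-safe extension dtypes for all four scalar families.
df_custom = job.to_dataframe(
    bool_dtype=pandas.BooleanDtype(),
    int_dtype=pandas.Int32Dtype(),
    float_dtype=pandas.Float64Dtype(),
    string_dtype=pandas.StringDtype(),
)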
99 changes: 94 additions & 5 deletions google/cloud/bigquery/table.py
@@ -34,6 +34,11 @@
except ImportError: # pragma: NO COVER
pyarrow = None

try:
import db_dtypes # type: ignore
except ImportError: # pragma: NO COVER
db_dtypes = None

try:
import geopandas # type: ignore
except ImportError:
@@ -55,6 +60,7 @@
import google.cloud._helpers # type: ignore
from google.cloud.bigquery import _helpers
from google.cloud.bigquery import _pandas_helpers
from google.cloud.bigquery.enums import DefaultPandasDTypes
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
from google.cloud.bigquery.schema import _build_schema_resource
from google.cloud.bigquery.schema import _parse_schema_resource
@@ -88,6 +94,11 @@

_TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'

_NO_SUPPORTED_DTYPE = (
"The dtype cannot to be converted to a pandas ExtensionArray "
"because the necessary `__from_arrow__` attribute is missing."
)


def _reference_getter(table):
"""A :class:`~google.cloud.bigquery.table.TableReference` pointing to
@@ -1920,6 +1931,10 @@ def to_dataframe(
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
geography_as_object: bool = False,
bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Create a pandas DataFrame by loading all pages of a query.
@@ -1958,6 +1973,7 @@
progress bar as a graphical dialog box.
.. versionadded:: 1.11.0
create_bqstorage_client (Optional[bool]):
If ``True`` (default), create a BigQuery Storage API client
using the default API settings. The BigQuery Storage API
@@ -1975,6 +1991,46 @@
.. versionadded:: 2.24.0
bool_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
used to convert BigQuery Boolean columns, instead of relying on the default
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("bool")``. The BigQuery Boolean
type is documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
.. versionadded:: 3.7.1
int_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
used to convert BigQuery Integer columns, instead of relying on the default
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("int64")``. The BigQuery Integer
types are documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
.. versionadded:: 3.7.1
float_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
used to convert BigQuery Float columns, instead of relying on the default
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("float64")``. The BigQuery Float
type is documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
.. versionadded:: 3.7.1
string_dtype (Optional[pandas.Series.dtype]):
If set, indicates a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``)
used to convert BigQuery String columns, instead of relying on the default
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
the data type will be ``numpy.dtype("object")``. The BigQuery String
type is documented at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
.. versionadded:: 3.7.1
Returns:
pandas.DataFrame:
A :class:`~pandas.DataFrame` populated with row data and column
@@ -1987,14 +2043,34 @@
the :mod:`google.cloud.bigquery_storage_v1` module is
required but cannot be imported. Also if
`geography_as_object` is `True`, but the
:mod:`shapely` library cannot be imported.
:mod:`shapely` library cannot be imported. Also raised if
`bool_dtype`, `int_dtype`, or another dtype parameter
is not a supported dtype.
"""
_pandas_helpers.verify_pandas_imports()

if geography_as_object and shapely is None:
raise ValueError(_NO_SHAPELY_ERROR)

if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
bool_dtype = pandas.BooleanDtype()

if int_dtype is DefaultPandasDTypes.INT_DTYPE:
int_dtype = pandas.Int64Dtype()

if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)

if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)

if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)

if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)

if dtypes is None:
dtypes = {}

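The ``__from_arrow__`` check above is what separates Arrow-aware extension dtypes from plain numpy dtypes: pandas uses that hook to build an ExtensionArray directly from an Arrow array. A quick illustration of what the guard accepts and rejects:

import numpy
import pandas

# pandas' nullable dtypes implement the hook, so they pass the guard:
print(hasattr(pandas.Int64Dtype(), "__from_arrow__"))   # True
# plain numpy dtypes do not, so passing one raises the ValueError above:
print(hasattr(numpy.dtype("int64"), "__from_arrow__"))  # False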
@@ -2019,15 +2095,15 @@
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("date")
if pyarrow.types.is_date(col.type)
)

timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be timestamp (plus units and time zone).
# Type can be datetime and timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("timestamp")
if pyarrow.types.is_timestamp(col.type)
)

if len(record_batch) > 0:
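Switching from string-prefix matching to the ``pyarrow.types`` predicates is more robust: ``is_date`` covers both ``date32`` and ``date64`` however the type renders as a string, and ``is_timestamp`` matches any unit or time zone. For instance:

import pyarrow

for t in (pyarrow.date32(), pyarrow.date64(), pyarrow.timestamp("us", tz="UTC")):
    print(t, pyarrow.types.is_date(t), pyarrow.types.is_timestamp(t))
# date32[day] True False
# date64[ms] True False
# timestamp[us, tz=UTC] False True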
@@ -2036,7 +2112,11 @@
timestamp_as_object=timestamp_as_object,
integer_object_nulls=True,
types_mapper=_pandas_helpers.default_types_mapper(
date_as_object=date_as_object
date_as_object=date_as_object,
bool_dtype=bool_dtype,
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
),
)
else:
@@ -2233,6 +2313,10 @@ def to_dataframe(
progress_bar_type=None,
create_bqstorage_client=True,
geography_as_object=False,
bool_dtype=None,
int_dtype=None,
float_dtype=None,
string_dtype=None,
) -> "pandas.DataFrame":
"""Create an empty dataframe.
@@ -2241,6 +2325,11 @@
dtypes (Any): Ignored. Added for compatibility with RowIterator.
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
int_dtype (Any): Ignored. Added for compatibility with RowIterator.
float_dtype (Any): Ignored. Added for compatibility with RowIterator.
string_dtype (Any): Ignored. Added for compatibility with RowIterator.
Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.