Skip to content

Commit

Permalink
fix!: use nullable Int64 and boolean dtypes in to_dataframe (#786)
Browse files Browse the repository at this point in the history
To override this behavior, specify the dtypes for the desired columns with the
`dtypes` argument.

BREAKING CHANGE: uses Int64 type by default to avoid loss-of-precision in results with large integer values

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes https://issuetracker.google.com/144712110 🦕
Fixes #793
  • Loading branch information
tswast authored Aug 16, 2021
1 parent 66014c3 commit dcd78c7
Show file tree
Hide file tree
Showing 11 changed files with 340 additions and 39 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
# directories to ignore when looking for source files.
exclude_patterns = [
"_build",
"**/.nox/**/*",
"samples/AUTHORING_GUIDE.md",
"samples/CONTRIBUTING.md",
"samples/snippets/README.rst",
Expand Down
27 changes: 24 additions & 3 deletions docs/usage/pandas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ First, ensure that the :mod:`pandas` library is installed by running:
pip install --upgrade pandas
Alternatively, you can install the BigQuery python client library with
Alternatively, you can install the BigQuery Python client library with
:mod:`pandas` by running:

.. code-block:: bash
pip install --upgrade google-cloud-bigquery[pandas]
pip install --upgrade 'google-cloud-bigquery[pandas]'
To retrieve query results as a :class:`pandas.DataFrame`:

Expand All @@ -37,6 +37,27 @@ To retrieve table rows as a :class:`pandas.DataFrame`:
:start-after: [START bigquery_list_rows_dataframe]
:end-before: [END bigquery_list_rows_dataframe]

The following data types are used when creating a pandas DataFrame.

.. list-table:: Pandas Data Type Mapping
   :header-rows: 1

   * - BigQuery
     - pandas
     - Notes
   * - BOOL
     - boolean
     -
   * - DATETIME
     - datetime64[ns], object
     - object is used when there are values not representable in pandas
   * - FLOAT64
     - float64
     -
   * - INT64
     - Int64
     -

Load a Pandas DataFrame to a BigQuery Table
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand All @@ -45,7 +66,7 @@ As of version 1.3.0, you can use the
to load data from a :class:`pandas.DataFrame` to a
:class:`~google.cloud.bigquery.table.Table`. To use this function, in addition
to :mod:`pandas`, you will need to install the :mod:`pyarrow` library. You can
install the BigQuery python client library with :mod:`pandas` and
install the BigQuery Python client library with :mod:`pandas` and
:mod:`pyarrow` by running:

.. code-block:: bash
Expand Down
37 changes: 32 additions & 5 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import functools
import logging
import queue
from typing import Dict, Sequence
import warnings

try:
Expand All @@ -42,15 +43,19 @@

_LOGGER = logging.getLogger(__name__)

_NO_BQSTORAGE_ERROR = (
"The google-cloud-bigquery-storage library is not installed, "
"please install google-cloud-bigquery-storage to use bqstorage features."
)

_PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds.

_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads

# Default pandas dtypes, keyed by upper-case BigQuery type name, for column
# types whose values can be NULL without losing precision. Each BigQuery type
# is listed under both of its names (for example INT64 and INTEGER).
# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
"BOOL": "boolean",
"BOOLEAN": "boolean",
"FLOAT": "float64",
"FLOAT64": "float64",
"INT64": "Int64",
"INTEGER": "Int64",
}
_PANDAS_DTYPE_TO_BQ = {
"bool": "BOOLEAN",
"datetime64[ns, UTC]": "TIMESTAMP",
Expand Down Expand Up @@ -217,6 +222,28 @@ def bq_to_arrow_schema(bq_schema):
return pyarrow.schema(arrow_fields)


def bq_schema_to_nullsafe_pandas_dtypes(
    bq_schema: Sequence[schema.SchemaField],
) -> Dict[str, str]:
    """Pick NULL-tolerant default pandas dtypes for a BigQuery schema.

    A column is included only when its mode is NULLABLE or REQUIRED and its
    type has an entry in ``_BQ_TO_PANDAS_DTYPE_NULLSAFE``. Those dtypes (for
    example ``Int64``, which stores missing values as ``pandas.NA``) can hold
    NULL values without loss of precision.

    Args:
        bq_schema: The schema fields describing the columns to inspect.

    Returns:
        A mapping from column names to pandas dtypes. Columns that have no
        NULL-safe default are left out of the mapping.
    """
    scalar_modes = {"NULLABLE", "REQUIRED"}
    nullsafe_dtypes = {}
    for column in bq_schema:
        # Columns in any other mode get no default dtype.
        if column.mode.upper() in scalar_modes:
            pandas_dtype = _BQ_TO_PANDAS_DTYPE_NULLSAFE.get(
                column.field_type.upper()
            )
            if pandas_dtype is not None:
                nullsafe_dtypes[column.name] = pandas_dtype
    return nullsafe_dtypes


def bq_to_arrow_array(series, bq_field):
arrow_type = bq_to_arrow_data_type(bq_field)

Expand Down
11 changes: 10 additions & 1 deletion google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1933,6 +1933,13 @@ def to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=create_bqstorage_client,
)
default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(
self.schema
)

# Let the user-defined dtypes override the default ones.
# https://stackoverflow.com/a/26853961/101923
dtypes = {**default_dtypes, **dtypes}

# When converting timestamp values to nanosecond precision, the result
# can be out of pyarrow bounds. To avoid the error when converting to
Expand All @@ -1954,7 +1961,9 @@ def to_dataframe(

extra_kwargs = {"timestamp_as_object": timestamp_as_object}

df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
df = record_batch.to_pandas(
date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs
)

for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column])
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
# Keep the no-op bqstorage extra for backward compatibility.
# See: https://github.com/googleapis/python-bigquery/issues/757
"bqstorage": [],
"pandas": ["pandas>=0.23.0"],
"pandas": ["pandas>=1.0.0"],
"tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
"opentelemetry": [
"opentelemetry-api >= 0.11b0",
Expand Down
2 changes: 1 addition & 1 deletion testing/constraints-3.6.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ grpcio==1.38.1
opentelemetry-api==0.11b0
opentelemetry-instrumentation==0.11b0
opentelemetry-sdk==0.11b0
pandas==0.23.0
pandas==1.0.0
proto-plus==1.10.0
protobuf==3.12.0
pyarrow==3.0.0
Expand Down
5 changes: 1 addition & 4 deletions tests/system/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,9 @@

"""System tests for Arrow connector."""

import pyarrow
import pytest

pyarrow = pytest.importorskip(
"pyarrow", minversion="3.0.0"
) # Needs decimal256 for BIGNUMERIC columns.


@pytest.mark.parametrize(
("max_results", "scalars_table_name"),
Expand Down
72 changes: 70 additions & 2 deletions tests/system/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ def test_query_results_to_dataframe(bigquery_client):
for _, row in df.iterrows():
for col in column_names:
# all the schema fields are nullable, so None is acceptable
if not row[col] is None:
if not pandas.isna(row[col]):
assert isinstance(row[col], exp_datatypes[col])


Expand Down Expand Up @@ -597,7 +597,7 @@ def test_query_results_to_dataframe_w_bqstorage(bigquery_client):
for index, row in df.iterrows():
for col in column_names:
# all the schema fields are nullable, so None is acceptable
if not row[col] is None:
if not pandas.isna(row[col]):
assert isinstance(row[col], exp_datatypes[col])


Expand Down Expand Up @@ -795,3 +795,71 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client):
dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client)

assert len(dataframe.index) == 100


@pytest.mark.parametrize(
    ("max_results",),
    (
        (None,),  # Use BQ Storage API.
        (10,),  # Use REST API.
    ),
)
def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results):
    """Check the default pandas dtype of every column in the scalars table."""
    row_iterator = bigquery_client.list_rows(scalars_table, max_results=max_results)
    df = row_iterator.to_dataframe()

    expected_dtype_names = (
        ("bool_col", "boolean"),
        ("datetime_col", "datetime64[ns]"),
        ("float64_col", "float64"),
        ("int64_col", "Int64"),
        ("timestamp_col", "datetime64[ns, UTC]"),
        # object is used by default, but we can use "datetime64[ns]"
        # automatically when data is within the supported range.
        # https://github.com/googleapis/python-bigquery/issues/861
        ("date_col", "object"),
        # object is used by default, but we can use "timedelta64[ns]"
        # automatically
        # https://github.com/googleapis/python-bigquery/issues/862
        ("time_col", "object"),
        # decimal.Decimal is used to avoid loss of precision.
        ("bignumeric_col", "object"),
        ("numeric_col", "object"),
        # pandas uses Python string and bytes objects.
        ("bytes_col", "object"),
        ("string_col", "object"),
    )
    for column_name, dtype_name in expected_dtype_names:
        assert df.dtypes[column_name].name == dtype_name


@pytest.mark.parametrize(
    ("max_results",),
    (
        (None,),  # Use BQ Storage API.
        (10,),  # Use REST API.
    ),
)
def test_list_rows_nullable_scalars_extreme_dtypes(
    bigquery_client, scalars_extreme_table, max_results
):
    """Check the default pandas dtypes when columns hold extreme values."""
    row_iterator = bigquery_client.list_rows(
        scalars_extreme_table, max_results=max_results
    )
    df = row_iterator.to_dataframe()

    expected_dtype_names = (
        # Extreme values are out-of-bounds for pandas datetime64 values, which
        # use nanosecond precision. Values before 1677-09-21 and after
        # 2262-04-11 must be represented with object.
        # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
        ("date_col", "object"),
        ("datetime_col", "object"),
        ("timestamp_col", "object"),
        # These pandas dtypes can handle the same ranges as BigQuery.
        ("bool_col", "boolean"),
        ("float64_col", "float64"),
        ("int64_col", "Int64"),
        # object is used by default, but we can use "timedelta64[ns]"
        # automatically
        # https://github.com/googleapis/python-bigquery/issues/862
        ("time_col", "object"),
        # decimal.Decimal is used to avoid loss of precision.
        ("numeric_col", "object"),
        ("bignumeric_col", "object"),
        # pandas uses Python string and bytes objects.
        ("bytes_col", "object"),
        ("string_col", "object"),
    )
    for column_name, dtype_name in expected_dtype_names:
        assert df.dtypes[column_name].name == dtype_name
22 changes: 4 additions & 18 deletions tests/unit/job/test_query_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,6 @@
import pyarrow
import pytest

try:
import pandas
except (ImportError, AttributeError): # pragma: NO COVER
pandas = None

from google.cloud import bigquery_storage

try:
Expand All @@ -36,6 +31,8 @@
from .helpers import _make_connection
from .helpers import _make_job_resource

pandas = pytest.importorskip("pandas")


@pytest.fixture
def table_read_options_kwarg():
Expand Down Expand Up @@ -78,7 +75,6 @@ def test__contains_order_by(query, expected):
assert not mut._contains_order_by(query)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.parametrize(
"query",
(
Expand Down Expand Up @@ -413,7 +409,6 @@ def test_to_arrow_w_tqdm_wo_query_plan():
result_patch_tqdm.assert_called()


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe():
from google.cloud.bigquery.job import QueryJob as target_class

Expand Down Expand Up @@ -452,7 +447,6 @@ def test_to_dataframe():
assert list(df) == ["name", "age"] # verify the column names


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_ddl_query():
from google.cloud.bigquery.job import QueryJob as target_class

Expand All @@ -472,7 +466,6 @@ def test_to_dataframe_ddl_query():
assert len(df) == 0


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_bqstorage(table_read_options_kwarg):
from google.cloud.bigquery.job import QueryJob as target_class

Expand Down Expand Up @@ -522,7 +515,6 @@ def test_to_dataframe_bqstorage(table_read_options_kwarg):
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_bqstorage_no_pyarrow_compression():
from google.cloud.bigquery.job import QueryJob as target_class

Expand Down Expand Up @@ -565,7 +557,6 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression():
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_column_dtypes():
from google.cloud.bigquery.job import QueryJob as target_class

Expand Down Expand Up @@ -617,15 +608,14 @@ def test_to_dataframe_column_dtypes():
assert list(df) == exp_columns # verify the column names

assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]"
assert df.seconds.dtype.name == "int64"
assert df.seconds.dtype.name == "Int64"
assert df.miles.dtype.name == "float64"
assert df.km.dtype.name == "float16"
assert df.payment_type.dtype.name == "object"
assert df.complete.dtype.name == "bool"
assert df.complete.dtype.name == "boolean"
assert df.date.dtype.name == "object"


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_to_dataframe_column_date_dtypes():
from google.cloud.bigquery.job import QueryJob as target_class

Expand Down Expand Up @@ -657,7 +647,6 @@ def test_to_dataframe_column_date_dtypes():
assert df.date.dtype.name == "datetime64[ns]"


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
@mock.patch("tqdm.tqdm")
def test_to_dataframe_with_progress_bar(tqdm_mock):
Expand Down Expand Up @@ -685,7 +674,6 @@ def test_to_dataframe_with_progress_bar(tqdm_mock):
tqdm_mock.assert_called()


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
def test_to_dataframe_w_tqdm_pending():
from google.cloud.bigquery import table
Expand Down Expand Up @@ -741,7 +729,6 @@ def test_to_dataframe_w_tqdm_pending():
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
def test_to_dataframe_w_tqdm():
from google.cloud.bigquery import table
Expand Down Expand Up @@ -801,7 +788,6 @@ def test_to_dataframe_w_tqdm():
)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
def test_to_dataframe_w_tqdm_max_results():
from google.cloud.bigquery import table
Expand Down
Loading

0 comments on commit dcd78c7

Please sign in to comment.