Skip to content

Commit

Permalink
fix: load_table_from_dataframe now assumes there may be local null …
Browse files Browse the repository at this point in the history
…values (googleapis#1735)

Even if the remote schema is REQUIRED

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes googleapis#1692 🦕
  • Loading branch information
tswast authored and kiraksi committed Nov 27, 2023
1 parent 326f052 commit cbcde2f
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 23 deletions.
10 changes: 8 additions & 2 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,18 @@ def bq_to_arrow_field(bq_field, array_type=None):
if arrow_type is not None:
if array_type is not None:
arrow_type = array_type # For GEOGRAPHY, at least initially
is_nullable = bq_field.mode.upper() == "NULLABLE"
metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
bq_field.field_type.upper() if bq_field.field_type else ""
)
return pyarrow.field(
bq_field.name, arrow_type, nullable=is_nullable, metadata=metadata
bq_field.name,
arrow_type,
# Even if the remote schema is REQUIRED, there's a chance there's
# local NULL values. Arrow will gladly interpret these NULL values
# as non-NULL and give you an arbitrary value. See:
# https://github.com/googleapis/python-bigquery/issues/1692
nullable=True,
metadata=metadata,
)

warnings.warn("Unable to determine type for field '{}'.".format(bq_field.name))
Expand Down
47 changes: 40 additions & 7 deletions tests/system/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,7 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id):


def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
"""Test that a DataFrame with required columns can be uploaded if a
BigQuery schema is specified.
"""Test that a DataFrame can be uploaded to a table with required columns.
See: https://github.com/googleapis/google-cloud-python/issues/8093
"""
Expand All @@ -442,7 +441,6 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):

records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}]
dataframe = pandas.DataFrame(records, columns=["name", "age"])
job_config = bigquery.LoadJobConfig(schema=table_schema)
table_id = "{}.{}.load_table_from_dataframe_w_required".format(
bigquery_client.project, dataset_id
)
Expand All @@ -453,15 +451,50 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
bigquery.Table(table_id, schema=table_schema)
)

job_config = bigquery.LoadJobConfig(schema=table_schema)
load_job = bigquery_client.load_table_from_dataframe(
dataframe, table_id, job_config=job_config
)
load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id)
load_job.result()

table = bigquery_client.get_table(table)
assert tuple(table.schema) == table_schema
assert table.num_rows == 2
for field in table.schema:
assert field.mode == "REQUIRED"


def test_load_table_from_dataframe_w_required_but_local_nulls_fails(
bigquery_client, dataset_id
):
"""Test that a DataFrame with nulls can't be uploaded to a table with
required columns.
See: https://github.com/googleapis/python-bigquery/issues/1692
"""
table_schema = (
bigquery.SchemaField("name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
)

records = [
{"name": "Chip", "age": 2},
{"name": "Dale", "age": 3},
{"name": None, "age": None},
{"name": "Alvin", "age": 4},
]
dataframe = pandas.DataFrame(records, columns=["name", "age"])
table_id = (
"{}.{}.load_table_from_dataframe_w_required_but_local_nulls_fails".format(
bigquery_client.project, dataset_id
)
)

# Create the table before loading so that schema mismatch errors are
# identified.
helpers.retry_403(bigquery_client.create_table)(
bigquery.Table(table_id, schema=table_schema)
)

with pytest.raises(google.api_core.exceptions.BadRequest, match="null"):
bigquery_client.load_table_from_dataframe(dataframe, table_id).result()


def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id):
Expand Down
47 changes: 33 additions & 14 deletions tests/unit/test__pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,30 +1018,41 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
)

data = {
"field01": ["hello", "world"],
"field02": [b"abd", b"efg"],
"field03": [1, 2],
"field04": [3, 4],
"field05": [1.25, 9.75],
"field06": [-1.75, -3.5],
"field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")],
"field01": ["hello", None, "world"],
"field02": [b"abd", b"efg", b"hij"],
"field03": [1, 2, 3],
"field04": [4, None, 5],
"field05": [1.25, 0.0, 9.75],
"field06": [-1.75, None, -3.5],
"field07": [
decimal.Decimal("1.2345"),
decimal.Decimal("6.7891"),
-decimal.Decimal("10.111213"),
],
"field08": [
decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)),
None,
decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)),
],
"field09": [True, False],
"field10": [False, True],
"field09": [True, False, True],
"field10": [False, True, None],
"field11": [
datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=datetime.timezone.utc),
datetime.datetime(2022, 7, 14, 23, 59, 59, tzinfo=datetime.timezone.utc),
],
"field12": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)],
"field13": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)],
"field12": [datetime.date(9999, 12, 31), None, datetime.date(1970, 1, 1)],
"field13": [datetime.time(23, 59, 59, 999999), None, datetime.time(12, 0, 0)],
"field14": [
datetime.datetime(1970, 1, 1, 0, 0, 0),
None,
datetime.datetime(2012, 12, 21, 9, 7, 42),
],
"field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"],
"field15": [
None,
"POINT(30 10)",
"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
],
}
dataframe = pandas.DataFrame(data)

Expand All @@ -1050,7 +1061,11 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):

assert len(arrow_schema) == len(bq_schema)
for arrow_field in arrow_schema:
assert not arrow_field.nullable
# Even if the remote schema is REQUIRED, there's a chance there's
# local NULL values. Arrow will gladly interpret these NULL values
# as non-NULL and give you an arbitrary value. See:
# https://github.com/googleapis/python-bigquery/issues/1692
assert arrow_field.nullable


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
Expand Down Expand Up @@ -1102,7 +1117,11 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
arrow_schema = arrow_table.schema

expected_fields = [
pyarrow.field("field01", "string", nullable=False),
# Even if the remote schema is REQUIRED, there's a chance there's
# local NULL values. Arrow will gladly interpret these NULL values
# as non-NULL and give you an arbitrary value. See:
# https://github.com/googleapis/python-bigquery/issues/1692
pyarrow.field("field01", "string", nullable=True),
pyarrow.field("field02", "bool", nullable=True),
]
assert list(arrow_schema) == expected_fields
Expand Down

0 comments on commit cbcde2f

Please sign in to comment.