diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 698fbc5c..2833fe98 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.9"
+          python-version: "3.10"
       - name: Install nox
         run: |
           python -m pip install --upgrade setuptools pip wheel
diff --git a/owlbot.py b/owlbot.py
index 190298a6..e50b9e9e 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -57,6 +57,7 @@
         "noxfile.py",
         "README.rst",
         # exclude this file as we have an alternate prerelease.cfg
+        ".github/workflows/docs.yml",
         ".kokoro/presubmit/prerelease-deps.cfg",
         ".kokoro/presubmit/presubmit.cfg",
     ],
diff --git a/pandas_gbq/schema/pandas_to_bigquery.py b/pandas_gbq/schema/pandas_to_bigquery.py
index 5a979a12..5afae356 100644
--- a/pandas_gbq/schema/pandas_to_bigquery.py
+++ b/pandas_gbq/schema/pandas_to_bigquery.py
@@ -4,7 +4,7 @@
 
 import collections.abc
 import datetime
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 import warnings
 
 import db_dtypes
@@ -28,14 +28,21 @@
 # `docs/source/writing.rst`.
 _PANDAS_DTYPE_TO_BQ = {
     "bool": "BOOLEAN",
+    "boolean": "BOOLEAN",
     "datetime64[ns, UTC]": "TIMESTAMP",
+    "datetime64[us, UTC]": "TIMESTAMP",
     "datetime64[ns]": "DATETIME",
+    "datetime64[us]": "DATETIME",
     "float32": "FLOAT",
     "float64": "FLOAT",
     "int8": "INTEGER",
     "int16": "INTEGER",
     "int32": "INTEGER",
     "int64": "INTEGER",
+    "Int8": "INTEGER",
+    "Int16": "INTEGER",
+    "Int32": "INTEGER",
+    "Int64": "INTEGER",
     "uint8": "INTEGER",
     "uint16": "INTEGER",
     "uint32": "INTEGER",
@@ -103,7 +110,7 @@ def dataframe_to_bigquery_fields(
 
         # Try to automatically determine the type based on a few rows of the data.
         values = dataframe.reset_index()[column]
-        bq_field = values_to_bigquery_field(column, values)
+        bq_field = values_to_bigquery_field(column, values, default_type=default_type)
 
         if bq_field:
             bq_schema_out.append(bq_field)
@@ -114,7 +121,9 @@ def dataframe_to_bigquery_fields(
             arrow_value = pyarrow.array(values)
             bq_field = (
                 pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-                    column, arrow_value.type
+                    column,
+                    arrow_value.type,
+                    default_type=default_type,
                 )
             )
 
@@ -151,6 +160,19 @@ def dataframe_to_bigquery_fields(
 
 
 def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a pandas dtype.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        dtype:
+            A pandas / numpy dtype object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is ambiguous like the object dtype.
+    """
     bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
 
     if bq_type is not None:
@@ -164,9 +186,44 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
     return None
 
 
-def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
-    if isinstance(value, str):
-        return schema.SchemaField(name, "STRING")
+def value_to_bigquery_field(
+    name: str, value: Any, default_type: Optional[str] = None
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a single value.
+
+    Args:
+        name:
+            The name of the field.
+        value:
+            The value to infer the type from. If None, the default type is used
+            if available.
+        default_type:
+            The default field type. Defaults to None.
+
+    Returns:
+        The schema field, or None if a type cannot be inferred.
+    """
+
+    # Set the SchemaField datatype to the given default_type if the value
+    # being assessed is None.
+    if value is None:
+        return schema.SchemaField(name, default_type)
+
+    # Map from Python types to BigQuery types. This isn't super exhaustive
+    # because we rely more on pyarrow, which can check more than one value to
+    # determine the type.
+    type_mapping = {
+        str: "STRING",
+    }
+
+    # geopandas and shapely are optional dependencies, so only check if those
+    # are installed.
+    if _BaseGeometry is not None:
+        type_mapping[_BaseGeometry] = "GEOGRAPHY"
+
+    for type_, bq_type in type_mapping.items():
+        if isinstance(value, type_):
+            return schema.SchemaField(name, bq_type)
 
     # For timezone-naive datetimes, the later pyarrow conversion to try and
     # learn the type add a timezone to such datetimes, causing them to be
@@ -182,35 +239,51 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
         else:
             return schema.SchemaField(name, "DATETIME")
 
-    if _BaseGeometry is not None and isinstance(value, _BaseGeometry):
-        return schema.SchemaField(name, "GEOGRAPHY")
-
     return None
 
 
-def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+def values_to_bigquery_field(
+    name: str, values: Any, default_type: str = "STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a list of values.
+
+    This function iterates through the given values to determine the
+    corresponding schema field type.
+
+    Args:
+        name:
+            The name of the field.
+        values:
+            An iterable of values to infer the type from. If all the values
+            are None or the iterable is empty, the function returns None.
+        default_type:
+            The default field type to use if a specific type cannot be
+            determined from the values. Defaults to "STRING".
+
+    Returns:
+        The schema field, or None if a type cannot be inferred.
+    """
     value = pandas_gbq.core.pandas.first_valid(values)
 
-    # All NULL, type not determinable.
+    # All values came back as NULL, thus type not determinable by this method.
+    # Return None so we can try other methods.
     if value is None:
         return None
 
-    field = value_to_bigquery_field(name, value)
-    if field is not None:
+    field = value_to_bigquery_field(name, value, default_type=default_type)
+    if field:
        return field
 
-    if isinstance(value, str):
-        return schema.SchemaField(name, "STRING")
-
-    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
-    # which can examine more values to determine all keys.
+    # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
+    # determined by pyarrow, which can examine more values to determine all
+    # keys.
     if isinstance(value, collections.abc.Iterable) and not isinstance(
         value, collections.abc.Mapping
     ):
         # It could be that this value contains all None or is empty, so get the
         # first non-None value we can find.
         valid_item = pandas_gbq.core.pandas.first_array_valid(values)
-        field = value_to_bigquery_field(name, valid_item)
+        field = value_to_bigquery_field(name, valid_item, default_type=default_type)
         if field is not None:
             return schema.SchemaField(name, field.field_type, mode="REPEATED")
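
A minimal sketch (outside the patch itself) of how the new default_type plumbing
in pandas_to_bigquery.py is expected to behave, based on the signatures above;
the column name is illustrative only:

    from pandas_gbq.schema import pandas_to_bigquery

    # A lone None value cannot pin down a type, so the caller-supplied default
    # is used instead of giving up on the column.
    field = pandas_to_bigquery.value_to_bigquery_field(
        "state", None, default_type="STRING"
    )
    # field == SchemaField("state", "STRING")
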
diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py
index da1a1ce8..91677f9d 100644
--- a/pandas_gbq/schema/pyarrow_to_bigquery.py
+++ b/pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -37,7 +37,31 @@
 }
 
 
-def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
+def arrow_type_to_bigquery_field(
+    name, type_, default_type="STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from an arrow type.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        type_:
+            A pyarrow type object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is a type that doesn't have a clear mapping in BigQuery.
+
+        null() types are assumed to be the ``default_type``, since there are no
+        values that contradict that.
+    """
+    # If a sub-field is the null type, then assume it's the default type, as
+    # that's the best we can do.
+    # https://github.com/googleapis/python-bigquery-pandas/issues/836
+    if pyarrow.types.is_null(type_):
+        return schema.SchemaField(name, default_type)
+
     # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
     # a special case to disambiguate them. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/450
@@ -52,22 +76,49 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
         return schema.SchemaField(name, detected_type)
 
     if pyarrow.types.is_list(type_):
-        return arrow_list_type_to_bigquery(name, type_)
+        return arrow_list_type_to_bigquery(name, type_, default_type=default_type)
 
     if pyarrow.types.is_struct(type_):
         inner_fields: list[pyarrow.Field] = []
         struct_type = cast(pyarrow.StructType, type_)
         for field_index in range(struct_type.num_fields):
             field = struct_type[field_index]
-            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type))
+            inner_fields.append(
+                arrow_type_to_bigquery_field(
+                    field.name, field.type, default_type=default_type
+                )
+            )
         return schema.SchemaField(name, "RECORD", fields=inner_fields)
 
     return None
 
 
-def arrow_list_type_to_bigquery(name, type_) -> Optional[schema.SchemaField]:
-    inner_field = arrow_type_to_bigquery_field(name, type_.value_type)
+def arrow_list_type_to_bigquery(
+    name, type_, default_type="STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from an arrow list type.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        type_:
+            A pyarrow type object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is a type that doesn't have a clear mapping in BigQuery.
+
+        null() types are assumed to be the ``default_type``, since there are no
+        values that contradict that.
+    """
+    inner_field = arrow_type_to_bigquery_field(
+        name, type_.value_type, default_type=default_type
+    )
+
+    # If this is None, it means we got some type that we can't cleanly map to
+    # a BigQuery type, so bubble that status up.
    if inner_field is None:
         return None
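
A short sketch (outside the patch itself) of the null-type fallback added to
pyarrow_to_bigquery.py, assuming pyarrow is installed; the field names are
illustrative only:

    import pyarrow
    from pandas_gbq.schema import pyarrow_to_bigquery

    # A bare null type now resolves to the default type ("STRING" by default).
    pyarrow_to_bigquery.arrow_type_to_bigquery_field("note", pyarrow.null())
    # -> SchemaField("note", "STRING")

    # The same fallback applies to null sub-fields of a struct, the case from
    # https://github.com/googleapis/python-bigquery-pandas/issues/836.
    struct_type = pyarrow.struct([("state", pyarrow.null())])
    pyarrow_to_bigquery.arrow_type_to_bigquery_field("positions", struct_type)
    # -> SchemaField("positions", "RECORD") with a STRING sub-field "state"
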
diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py
index 924ce1ee..777c3825 100644
--- a/tests/unit/schema/test_pandas_to_bigquery.py
+++ b/tests/unit/schema/test_pandas_to_bigquery.py
@@ -21,13 +21,34 @@ def module_under_test():
 def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
     df_data = collections.OrderedDict(
         [
+            ("str_index", ["a", "b"]),
             ("str_column", ["hello", "world"]),
             ("int_column", [42, 8]),
+            ("nullable_int_column", pandas.Series([42, None], dtype="Int64")),
+            ("uint_column", pandas.Series([7, 13], dtype="uint8")),
             ("bool_column", [True, False]),
+            ("boolean_column", pandas.Series([True, None], dtype="boolean")),
+            (
+                "datetime_column",
+                [
+                    datetime.datetime(1999, 12, 31, 23, 59, 59, 999999),
+                    datetime.datetime(2000, 1, 1, 0, 0, 0),
+                ],
+            ),
+            (
+                "timestamp_column",
+                [
+                    datetime.datetime(
+                        1999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc
+                    ),
+                    datetime.datetime(
+                        2000, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc
+                    ),
+                ],
+            ),
         ]
     )
-    index = pandas.Index(["a", "b"], name="str_index")
-    dataframe = pandas.DataFrame(df_data, index=index)
+    dataframe = pandas.DataFrame(df_data).set_index("str_index", drop=True)
 
     returned_schema = module_under_test.dataframe_to_bigquery_fields(
         dataframe, [], index=True
@@ -37,7 +58,12 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
         schema.SchemaField("str_index", "STRING", "NULLABLE"),
         schema.SchemaField("str_column", "STRING", "NULLABLE"),
         schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
+        schema.SchemaField("nullable_int_column", "INTEGER", "NULLABLE"),
+        schema.SchemaField("uint_column", "INTEGER", "NULLABLE"),
         schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"),
+        schema.SchemaField("boolean_column", "BOOLEAN", "NULLABLE"),
+        schema.SchemaField("datetime_column", "DATETIME", "NULLABLE"),
+        schema.SchemaField("timestamp_column", "TIMESTAMP", "NULLABLE"),
     )
 
     assert returned_schema == expected_schema
@@ -45,19 +71,24 @@ def test_dataframe_to_bigquery_fields_w_multiindex(module_under_test):
     df_data = collections.OrderedDict(
         [
+            ("str_index", ["a", "a"]),
+            ("int_index", [0, 0]),
+            (
+                "dt_index",
+                [
+                    datetime.datetime(1999, 12, 31, 23, 59, 59, 999999),
+                    datetime.datetime(2000, 1, 1, 0, 0, 0),
+                ],
+            ),
             ("str_column", ["hello", "world"]),
             ("int_column", [42, 8]),
             ("bool_column", [True, False]),
         ]
     )
-    index = pandas.MultiIndex.from_tuples(
-        [
-            ("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)),
-            ("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)),
-        ],
-        names=["str_index", "int_index", "dt_index"],
+    dataframe = pandas.DataFrame(df_data).set_index(
+        ["str_index", "int_index", "dt_index"],
+        drop=True,
     )
-    dataframe = pandas.DataFrame(df_data, index=index)
 
     returned_schema = module_under_test.dataframe_to_bigquery_fields(
         dataframe, [], index=True
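
As context for the new test columns above (not from the patch itself): the dtype
names pandas reports for the nullable extension columns are exactly the lookup
keys in the expanded _PANDAS_DTYPE_TO_BQ map shown earlier:

    import pandas

    pandas.Series([42, None], dtype="Int64").dtype.name      # "Int64" -> INTEGER
    pandas.Series([True, None], dtype="boolean").dtype.name  # "boolean" -> BOOLEAN
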
if inner_field is None: return None diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py index 924ce1ee..777c3825 100644 --- a/tests/unit/schema/test_pandas_to_bigquery.py +++ b/tests/unit/schema/test_pandas_to_bigquery.py @@ -21,13 +21,34 @@ def module_under_test(): def test_dataframe_to_bigquery_fields_w_named_index(module_under_test): df_data = collections.OrderedDict( [ + ("str_index", ["a", "b"]), ("str_column", ["hello", "world"]), ("int_column", [42, 8]), + ("nullable_int_column", pandas.Series([42, None], dtype="Int64")), + ("uint_column", pandas.Series([7, 13], dtype="uint8")), ("bool_column", [True, False]), + ("boolean_column", pandas.Series([True, None], dtype="boolean")), + ( + "datetime_column", + [ + datetime.datetime(1999, 12, 31, 23, 59, 59, 999999), + datetime.datetime(2000, 1, 1, 0, 0, 0), + ], + ), + ( + "timestamp_column", + [ + datetime.datetime( + 1999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 2000, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + ], + ), ] ) - index = pandas.Index(["a", "b"], name="str_index") - dataframe = pandas.DataFrame(df_data, index=index) + dataframe = pandas.DataFrame(df_data).set_index("str_index", drop=True) returned_schema = module_under_test.dataframe_to_bigquery_fields( dataframe, [], index=True @@ -37,7 +58,12 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test): schema.SchemaField("str_index", "STRING", "NULLABLE"), schema.SchemaField("str_column", "STRING", "NULLABLE"), schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("nullable_int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("uint_column", "INTEGER", "NULLABLE"), schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"), + schema.SchemaField("boolean_column", "BOOLEAN", "NULLABLE"), + schema.SchemaField("datetime_column", "DATETIME", "NULLABLE"), + schema.SchemaField("timestamp_column", "TIMESTAMP", "NULLABLE"), ) assert returned_schema == expected_schema @@ -45,19 +71,24 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test): def test_dataframe_to_bigquery_fields_w_multiindex(module_under_test): df_data = collections.OrderedDict( [ + ("str_index", ["a", "a"]), + ("int_index", [0, 0]), + ( + "dt_index", + [ + datetime.datetime(1999, 12, 31, 23, 59, 59, 999999), + datetime.datetime(2000, 1, 1, 0, 0, 0), + ], + ), ("str_column", ["hello", "world"]), ("int_column", [42, 8]), ("bool_column", [True, False]), ] ) - index = pandas.MultiIndex.from_tuples( - [ - ("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)), - ("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)), - ], - names=["str_index", "int_index", "dt_index"], + dataframe = pandas.DataFrame(df_data).set_index( + ["str_index", "int_index", "dt_index"], + drop=True, ) - dataframe = pandas.DataFrame(df_data, index=index) returned_schema = module_under_test.dataframe_to_bigquery_fields( dataframe, [], index=True diff --git a/tests/unit/schema/test_pyarrow_to_bigquery.py b/tests/unit/schema/test_pyarrow_to_bigquery.py index 4af0760f..dc5504f9 100644 --- a/tests/unit/schema/test_pyarrow_to_bigquery.py +++ b/tests/unit/schema/test_pyarrow_to_bigquery.py @@ -42,16 +42,14 @@ def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type): def test_arrow_type_to_bigquery_field_unknown(): - assert ( - pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null()) - is None - ) + assert 
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 48e8862a..0da16baf 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -70,7 +70,7 @@ def test_schema_is_subset_fails_if_not_subset():
     [
         pytest.param(
             pandas.DataFrame(data={"col1": [object()]}),
-            {"fields": [{"name": "col1", "type": "STRING"}]},
+            {"fields": [{"name": "col1", "type": "DEFAULT_TYPE"}]},
             id="default-type-fails-pyarrow-conversion",
         ),
         (
@@ -182,13 +182,15 @@ def test_schema_is_subset_fails_if_not_subset():
                         else "object",
                     ),
                     "list_of_struct": pandas.Series(
-                        [[], [{"test": "abc"}], []],
+                        [[], [{"test": 123.0}], []],
                         dtype=pandas.ArrowDtype(
-                            pyarrow.list_(pyarrow.struct([("test", pyarrow.string())]))
+                            pyarrow.list_(pyarrow.struct([("test", pyarrow.float64())]))
                         )
                         if hasattr(pandas, "ArrowDtype")
                         else "object",
                     ),
+                    "list_of_unknown": [[], [], []],
+                    "list_of_null": [[None, None], [None], [None, None]],
                 }
             ),
             {
@@ -200,17 +202,56 @@ def test_schema_is_subset_fails_if_not_subset():
                         "type": "RECORD",
                         "mode": "REPEATED",
                         "fields": [
-                            {"name": "test", "type": "STRING", "mode": "NULLABLE"},
+                            {"name": "test", "type": "FLOAT", "mode": "NULLABLE"},
                         ],
                     },
+                    # Use DEFAULT_TYPE because there are no values to detect a type.
+                    {
+                        "name": "list_of_unknown",
+                        "type": "DEFAULT_TYPE",
+                        "mode": "REPEATED",
+                    },
+                    {
+                        "name": "list_of_null",
+                        "type": "DEFAULT_TYPE",
+                        "mode": "REPEATED",
+                    },
                 ],
             },
             id="array",
         ),
+        pytest.param(
+            # If a struct contains only nulls in a sub-field, use the default
+            # type for subfields without a type we can determine.
+            # https://github.com/googleapis/python-bigquery-pandas/issues/836
+            pandas.DataFrame(
+                {
+                    "id": [0, 1],
+                    "positions": [{"state": None}, {"state": None}],
+                },
+            ),
+            {
+                "fields": [
+                    {"name": "id", "type": "INTEGER"},
+                    {
+                        "name": "positions",
+                        "type": "RECORD",
+                        "fields": [
+                            {
+                                "name": "state",
+                                "type": "DEFAULT_TYPE",
+                                "mode": "NULLABLE",
+                            },
+                        ],
+                    },
+                ],
+            },
+            id="issue832-null-struct-field",
+        ),
     ],
 )
 def test_generate_bq_schema(dataframe, expected_schema):
-    schema = pandas_gbq.gbq._generate_bq_schema(dataframe)
+    schema = pandas_gbq.gbq._generate_bq_schema(dataframe, default_type="DEFAULT_TYPE")
 
     # NULLABLE is the default mode.
     for field in expected_schema["fields"]: