From 5251b5dbb254732ea730bab664ad319bd5be47e7 Mon Sep 17 00:00:00 2001 From: Lingqing Gan Date: Thu, 18 Apr 2024 13:14:56 -0700 Subject: [PATCH] feat: support RANGE in queries Part 2: Arrow (#1868) * feat: support range in queries as dict * fix sys tests * lint * add arrow support * fix python 3.7 test error * print dependencies in sys test * add unit test and docs * fix unit test * add func docs * add sys test for tabledata.list in arrow * add sys test for tabledata.list as iterator * lint * fix docs error * fix docstring * fix docstring * fix docstring * docs * docs * docs * move dtypes mapping code * address comment * address comment * fix pytest error * Revert "move dtypes mapping code" This reverts commit c46c65c822b3c8295d5d6650b1c9c97d35d2ba5b. * remove commented out assertions * typo and formats * add None-check for range_element_type and add unit tests * change test skip condition * fix test error * change test skip condition * change test skip condition * change decorator order * use a different way to construct test data * fix error message and add warning number check * add warning number check and comments --- google/cloud/bigquery/_helpers.py | 16 ++- google/cloud/bigquery/_pandas_helpers.py | 33 ++++++ google/cloud/bigquery/dbapi/_helpers.py | 14 ++- google/cloud/bigquery/enums.py | 9 ++ google/cloud/bigquery/job/query.py | 67 +++++++++++ google/cloud/bigquery/query.py | 11 +- google/cloud/bigquery/table.py | 137 +++++++++++++++++++++++ noxfile.py | 3 + tests/data/scalars.csv | 2 + tests/data/scalars_schema_csv.json | 10 ++ tests/system/conftest.py | 22 +++- tests/system/test_arrow.py | 27 +++++ tests/system/test_list_rows.py | 14 +++ tests/unit/test__pandas_helpers.py | 61 ++++++++++ tests/unit/test_table.py | 115 ++++++++++++++++++- 15 files changed, 516 insertions(+), 25 deletions(-) create mode 100644 tests/data/scalars.csv create mode 100644 tests/data/scalars_schema_csv.json diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 0572867d7..083eb9f9d 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -66,6 +66,8 @@ _UNIVERSE_DOMAIN_ENV = "GOOGLE_CLOUD_UNIVERSE_DOMAIN" """Environment variable for setting universe domain.""" +_SUPPORTED_RANGE_ELEMENTS = {"TIMESTAMP", "DATETIME", "DATE"} + def _get_client_universe( client_options: Optional[Union[client_options_lib.ClientOptions, dict]] @@ -310,17 +312,13 @@ def _json_from_json(value, field): def _range_element_from_json(value, field): - """Coerce 'value' to a range element value, if set or not nullable.""" + """Coerce 'value' to a range element value.""" if value == "UNBOUNDED": return None - elif field.element_type == "DATE": - return _date_from_json(value, None) - elif field.element_type == "DATETIME": - return _datetime_from_json(value, None) - elif field.element_type == "TIMESTAMP": - return _timestamp_from_json(value, None) + if field.element_type in _SUPPORTED_RANGE_ELEMENTS: + return _CELLDATA_FROM_JSON[field.element_type](value, field.element_type) else: - raise ValueError(f"Unsupported range field type: {value}") + raise ValueError(f"Unsupported range element type: {field.element_type}") def _range_from_json(value, field): @@ -344,7 +342,7 @@ def _range_from_json(value, field): end = _range_element_from_json(end, field.range_element_type) return {"start": start, "end": end} else: - raise ValueError(f"Unknown range format: {value}") + raise ValueError(f"Unknown format for range value: {value}") else: return None diff --git 
a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 3b58d3736..8395478fb 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -142,6 +142,17 @@ def bq_to_arrow_struct_data_type(field): return pyarrow.struct(arrow_fields) +def bq_to_arrow_range_data_type(field): + if field is None: + raise ValueError( + "Range element type cannot be None, must be one of " + "DATE, DATETIME, or TIMESTAMP" + ) + element_type = field.element_type.upper() + arrow_element_type = _pyarrow_helpers.bq_to_arrow_scalars(element_type)() + return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)]) + + def bq_to_arrow_data_type(field): """Return the Arrow data type, corresponding to a given BigQuery column. @@ -160,6 +171,9 @@ def bq_to_arrow_data_type(field): if field_type_upper in schema._STRUCT_TYPES: return bq_to_arrow_struct_data_type(field) + if field_type_upper == "RANGE": + return bq_to_arrow_range_data_type(field.range_element_type) + data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper) if data_type_constructor is None: return None @@ -220,6 +234,9 @@ def default_types_mapper( datetime_dtype: Union[Any, None] = None, time_dtype: Union[Any, None] = None, timestamp_dtype: Union[Any, None] = None, + range_date_dtype: Union[Any, None] = None, + range_datetime_dtype: Union[Any, None] = None, + range_timestamp_dtype: Union[Any, None] = None, ): """Create a mapping from pyarrow types to pandas types. @@ -274,6 +291,22 @@ def types_mapper(arrow_data_type): elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type): return time_dtype + elif pyarrow.types.is_struct(arrow_data_type): + if range_datetime_dtype is not None and arrow_data_type.equals( + range_datetime_dtype.pyarrow_dtype + ): + return range_datetime_dtype + + elif range_date_dtype is not None and arrow_data_type.equals( + range_date_dtype.pyarrow_dtype + ): + return range_date_dtype + + elif range_timestamp_dtype is not None and arrow_data_type.equals( + range_timestamp_dtype.pyarrow_dtype + ): + return range_timestamp_dtype + return types_mapper diff --git a/google/cloud/bigquery/dbapi/_helpers.py b/google/cloud/bigquery/dbapi/_helpers.py index 117fa8ae7..a4ab05ce8 100644 --- a/google/cloud/bigquery/dbapi/_helpers.py +++ b/google/cloud/bigquery/dbapi/_helpers.py @@ -277,12 +277,14 @@ def complex_query_parameter( param = query.ArrayQueryParameter( name, sub_type, - value - if isinstance(sub_type, query.ScalarQueryParameterType) - else [ - complex_query_parameter(None, v, sub_type._complex__src, base) - for v in value - ], + ( + value + if isinstance(sub_type, query.ScalarQueryParameterType) + else [ + complex_query_parameter(None, v, sub_type._complex__src, base) + for v in value + ] + ), ) elif type_type == STRUCT: if not isinstance(value, collections_abc.Mapping): diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py index 1abe28381..d8cbe9969 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -99,6 +99,15 @@ class DefaultPandasDTypes(enum.Enum): TIME_DTYPE = object() """Specifies default time dtype""" + RANGE_DATE_DTYPE = object() + """Specifies default range date dtype""" + + RANGE_DATETIME_DTYPE = object() + """Specifies default range datetime dtype""" + + RANGE_TIMESTAMP_DTYPE = object() + """Specifies default range timestamp dtype""" + class DestinationFormat(object): """The exported file format. The default value is :attr:`CSV`. 
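As an illustration of the mapping that `bq_to_arrow_range_data_type` introduces above, here is a minimal standalone sketch (editorial, not part of the patch): a RANGE element type resolves to its Arrow scalar, which is then wrapped in a two-field struct. The `_SCALARS` table is a hypothetical stand-in for the `_pyarrow_helpers.bq_to_arrow_scalars` lookup the real code consults.

import pyarrow

# Stand-in for _pyarrow_helpers.bq_to_arrow_scalars, restricted to the
# element types that RANGE supports.
_SCALARS = {
    "DATE": pyarrow.date32,
    "DATETIME": lambda: pyarrow.timestamp("us"),
    "TIMESTAMP": lambda: pyarrow.timestamp("us", tz="UTC"),
}

def range_struct(element_type: str) -> pyarrow.DataType:
    # RANGE<T> maps to struct<start: T, end: T> in Arrow.
    arrow_element = _SCALARS[element_type.upper()]()
    return pyarrow.struct([("start", arrow_element), ("end", arrow_element)])

assert range_struct("DATE") == pyarrow.struct(
    [("start", pyarrow.date32()), ("end", pyarrow.date32())]
)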
diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 7436b6013..09a69e11c 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1784,6 +1784,13 @@ def to_dataframe( datetime_dtype: Union[Any, None] = None, time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, timestamp_dtype: Union[Any, None] = None, + range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE, + range_datetime_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, + range_timestamp_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, ) -> "pandas.DataFrame": """Return a pandas DataFrame from a QueryJob @@ -1919,6 +1926,63 @@ def to_dataframe( .. versionadded:: 3.10.0 + range_date_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + )) + + to convert BigQuery RANGE type, instead of relying on + the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + + range_datetime_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + )) + + to convert BigQuery RANGE type, instead of relying on + the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + + range_timestamp_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + )) + + to convert BigQuery RANGE type, instead of relying + on the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. 
versionadded:: 3.21.0 + Returns: pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data @@ -1949,6 +2013,9 @@ def to_dataframe( datetime_dtype=datetime_dtype, time_dtype=time_dtype, timestamp_dtype=timestamp_dtype, + range_date_dtype=range_date_dtype, + range_datetime_dtype=range_datetime_dtype, + range_timestamp_dtype=range_timestamp_dtype, ) # If changing the signature of this method, make sure to apply the same diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py index 9c9402b74..9c59056fd 100644 --- a/google/cloud/bigquery/query.py +++ b/google/cloud/bigquery/query.py @@ -24,14 +24,13 @@ from google.cloud.bigquery._helpers import _rows_from_json from google.cloud.bigquery._helpers import _QUERY_PARAMS_FROM_JSON from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_PARAM +from google.cloud.bigquery._helpers import _SUPPORTED_RANGE_ELEMENTS _SCALAR_VALUE_TYPE = Optional[ Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date] ] -_RANGE_ELEMENT_TYPE_STR = {"TIMESTAMP", "DATETIME", "DATE"} - class ConnectionProperty: """A connection-level property to customize query behavior. @@ -388,14 +387,14 @@ def _parse_range_element_type(self, type_): google.cloud.bigquery.query.ScalarQueryParameterType: Instance """ if isinstance(type_, str): - if type_ not in _RANGE_ELEMENT_TYPE_STR: + if type_ not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a string, range element type must be one of " "'TIMESTAMP', 'DATE', or 'DATETIME'." ) return ScalarQueryParameterType(type_) elif isinstance(type_, ScalarQueryParameterType): - if type_._type not in _RANGE_ELEMENT_TYPE_STR: + if type_._type not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a ScalarQueryParameter object, range element " "type must be one of 'TIMESTAMP', 'DATE', or 'DATETIME' " @@ -960,14 +959,14 @@ class RangeQueryParameter(_AbstractQueryParameter): @classmethod def _parse_range_element_type(self, range_element_type): if isinstance(range_element_type, str): - if range_element_type not in _RANGE_ELEMENT_TYPE_STR: + if range_element_type not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a string, range_element_type must be one of " f"'TIMESTAMP', 'DATE', or 'DATETIME'. Got {range_element_type}." ) return RangeQueryParameterType(range_element_type) elif isinstance(range_element_type, RangeQueryParameterType): - if range_element_type.type_._type not in _RANGE_ELEMENT_TYPE_STR: + if range_element_type.type_._type not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a RangeQueryParameterType object, " "range_element_type must be one of 'TIMESTAMP', 'DATE', " diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 73e755e9e..2f07bcc78 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2044,6 +2044,13 @@ def to_dataframe( datetime_dtype: Union[Any, None] = None, time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, timestamp_dtype: Union[Any, None] = None, + range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE, + range_datetime_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, + range_timestamp_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, ) -> "pandas.DataFrame": """Create a pandas DataFrame by loading all pages of a query. @@ -2183,6 +2190,63 @@ def to_dataframe( .. 
versionadded:: 3.10.0 + range_date_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + )) + + to convert BigQuery RANGE type, instead of relying on + the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + + range_datetime_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + )) + + to convert BigQuery RANGE type, instead of relying on + the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + + range_timestamp_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + )) + + to convert BigQuery RANGE type, instead of relying + on the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + Returns: pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column @@ -2214,6 +2278,69 @@ def to_dataframe( if time_dtype is DefaultPandasDTypes.TIME_DTYPE: time_dtype = db_dtypes.TimeDtype() + if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: + try: + range_date_dtype = pandas.ArrowDtype( + pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but Python 3.7 + # only supports up to pandas 1.3. If pandas.ArrowDtype is not + # present, we emit a warning and set range_date_dtype to None. + msg = ( + "Unable to find class ArrowDtype in pandas, setting " + "range_date_dtype to be None. To use ArrowDtype, please " + "use pandas >= 1.5 and python >= 3.8." + ) + warnings.warn(msg) + range_date_dtype = None + + if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: + try: + range_datetime_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but Python 3.7 + # only supports up to pandas 1.3. If pandas.ArrowDtype is not + # present, we emit a warning and set range_datetime_dtype to None. + msg = ( + "Unable to find class ArrowDtype in pandas, setting " + "range_datetime_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." 
+ ) warnings.warn(msg) + range_datetime_dtype = None + + if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: + try: + range_timestamp_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but Python 3.7 + # only supports up to pandas 1.3. If pandas.ArrowDtype is not + # present, we emit a warning and set range_timestamp_dtype to None. + msg = ( + "Unable to find class ArrowDtype in pandas, setting " + "range_timestamp_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." + ) + warnings.warn(msg) + range_timestamp_dtype = None + if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) @@ -2298,6 +2425,9 @@ def to_dataframe( datetime_dtype=datetime_dtype, time_dtype=time_dtype, timestamp_dtype=timestamp_dtype, + range_date_dtype=range_date_dtype, + range_datetime_dtype=range_datetime_dtype, + range_timestamp_dtype=range_timestamp_dtype, ), ) else: @@ -2502,6 +2632,9 @@ def to_dataframe( datetime_dtype=None, time_dtype=None, timestamp_dtype=None, + range_date_dtype=None, + range_datetime_dtype=None, + range_timestamp_dtype=None, ) -> "pandas.DataFrame": """Create an empty dataframe. @@ -2519,6 +2652,9 @@ def to_dataframe( datetime_dtype (Any): Ignored. Added for compatibility with RowIterator. time_dtype (Any): Ignored. Added for compatibility with RowIterator. timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator. + range_date_dtype (Any): Ignored. Added for compatibility with RowIterator. + range_datetime_dtype (Any): Ignored. Added for compatibility with RowIterator. + range_timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. @@ -2541,6 +2677,7 @@ def to_geodataframe( dtypes (Any): Ignored. Added for compatibility with RowIterator. progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. + geography_column (str): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. diff --git a/noxfile.py b/noxfile.py index 034bb843a..78a9ab5b6 100644 --- a/noxfile.py +++ b/noxfile.py @@ -208,6 +208,9 @@ def system(session): extras = "[all]" session.install("-e", f".{extras}", "-c", constraints_path) + # print versions of all dependencies + session.run("python", "-m", "pip", "freeze") + # Run py.test against the system tests. 
session.run( "py.test", diff --git a/tests/data/scalars.csv b/tests/data/scalars.csv new file mode 100644 index 000000000..7af97583f --- /dev/null +++ b/tests/data/scalars.csv @@ -0,0 +1,2 @@ +"[2020-01-01, 2020-02-01)" + diff --git a/tests/data/scalars_schema_csv.json b/tests/data/scalars_schema_csv.json new file mode 100644 index 000000000..82b878d95 --- /dev/null +++ b/tests/data/scalars_schema_csv.json @@ -0,0 +1,10 @@ +[ + { + "mode" : "NULLABLE", + "name" : "range_date", + "type" : "RANGE", + "rangeElementType": { + "type": "DATE" + } + } + ] \ No newline at end of file diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 784a1dd5c..8efa042af 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -96,12 +96,14 @@ def load_scalars_table( project_id: str, dataset_id: str, data_path: str = "scalars.jsonl", + source_format=enums.SourceFormat.NEWLINE_DELIMITED_JSON, + schema_source="scalars_schema.json", ) -> str: - schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + schema = bigquery_client.schema_from_json(DATA_DIR / schema_source) table_id = data_path.replace(".", "_") + hex(random.randrange(1000000)) job_config = bigquery.LoadJobConfig() job_config.schema = schema - job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + job_config.source_format = source_format full_table_id = f"{project_id}.{dataset_id}.{table_id}" with open(DATA_DIR / data_path, "rb") as data_file: job = bigquery_client.load_table_from_file( @@ -151,6 +153,22 @@ def scalars_table_multi_location( return request.param, full_table_id +@pytest.fixture(scope="session") +def scalars_table_csv( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +): + full_table_id = load_scalars_table( + bigquery_client, + project_id, + dataset_id, + data_path="scalars.csv", + source_format=enums.SourceFormat.CSV, + schema_source="scalars_schema_csv.json", + ) + yield full_table_id + bigquery_client.delete_table(full_table_id, not_found_ok=True) + + @pytest.fixture def test_table_name(request, replace_non_anum=re.compile(r"[^a-zA-Z0-9_]").sub): return replace_non_anum("_", request.node.name) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py index 8b88b6844..82cf11f85 100644 --- a/tests/system/test_arrow.py +++ b/tests/system/test_arrow.py @@ -167,3 +167,30 @@ def test_arrow_extension_types_same_for_storage_and_REST_APIs_894( b"ARROW:extension:name": b"google:sqlType:geography", b"ARROW:extension:metadata": b'{"encoding": "WKT"}', } + + +def test_list_rows_range_csv( + bigquery_client: bigquery.Client, + scalars_table_csv: str, +): + table_id = scalars_table_csv + + schema = [ + bigquery.SchemaField( + "range_date", enums.SqlTypeNames.RANGE, range_element_type="DATE" + ), + ] + + arrow_table = bigquery_client.list_rows( + table_id, + selected_fields=schema, + ).to_arrow() + + schema = arrow_table.schema + + expected_type = pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + + range_type = schema.field("range_date").type + assert range_type == expected_type diff --git a/tests/system/test_list_rows.py b/tests/system/test_list_rows.py index 4c08958c3..108b842ce 100644 --- a/tests/system/test_list_rows.py +++ b/tests/system/test_list_rows.py @@ -118,3 +118,17 @@ def test_list_rows_scalars_extreme( assert value == 4 else: assert value is None + + +def test_list_rows_range(bigquery_client: bigquery.Client, scalars_table_csv: str): + rows = bigquery_client.list_rows(scalars_table_csv) + rows = list(rows) + 
row = rows[0] + expected_range = { + "start": datetime.date(2020, 1, 1), + "end": datetime.date(2020, 2, 1), + } + assert row["range_date"] == expected_range + + row_null = rows[1] + assert row_null["range_date"] is None diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 5c13669f3..58d2b73b3 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -670,6 +670,67 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): assert array.to_pylist() == list(series) +@pytest.mark.parametrize( + "bq_schema,expected", + [ + ( + schema.SchemaField( + "field1", + "RANGE", + range_element_type=schema.FieldElementType("DATE"), + mode="NULLABLE", + ), + pyarrow.struct( + [ + ("start", pyarrow.date32()), + ("end", pyarrow.date32()), + ] + ), + ), + ( + schema.SchemaField( + "field2", + "RANGE", + range_element_type=schema.FieldElementType("DATETIME"), + mode="NULLABLE", + ), + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz=None)), + ("end", pyarrow.timestamp("us", tz=None)), + ] + ), + ), + ( + schema.SchemaField( + "field3", + "RANGE", + range_element_type=schema.FieldElementType("TIMESTAMP"), + mode="NULLABLE", + ), + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ), + ), + ], +) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_bq_to_arrow_data_type_w_range(module_under_test, bq_schema, expected): + actual = module_under_test.bq_to_arrow_data_type(bq_schema) + assert actual.equals(expected) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_bq_to_arrow_data_type_w_range_no_element(module_under_test): + field = schema.SchemaField("field1", "RANGE", mode="NULLABLE") + with pytest.raises(ValueError, match="Range element type cannot be None"): + module_under_test.bq_to_arrow_data_type(field) + + @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_schema_w_unknown_type(module_under_test): fields = ( diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 3953170fd..099529f95 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3503,7 +3503,11 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - self.assertEqual(len(user_warnings), 0) + # On Python 3.7 and 3.8 the tests run with pandas < 1.5, where + # pandas.ArrowDtype is unavailable: the three default range dtypes + # fall back to object and emit one warning each, so len(user_warnings) + # = 3. On newer Python versions (and in the noextra tests) it is 0. + self.assertIn(len(user_warnings), [0, 3]) self.assertEqual(len(df), 4) @mock.patch("google.cloud.bigquery._tqdm_helpers.tqdm", new=None) @@ -3534,7 +3538,11 @@ def test_to_dataframe_no_tqdm(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - self.assertEqual(len(user_warnings), 1) + # On Python 3.7 and 3.8 the tests run with pandas < 1.5, where + # pandas.ArrowDtype is unavailable: the three default range dtypes + # fall back to object, adding three warnings to the one this test + # already expects, so len(user_warnings) = 4; otherwise it is 1. + self.assertIn(len(user_warnings), [1, 4]) # Even though the progress bar won't show, downloading the dataframe # should still work. 
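For reference, the dict values asserted in these tests come from RANGE literals that the REST API returns as strings such as "[2020-10-01, 2021-10-02)", with "UNBOUNDED" marking an open bound. A minimal sketch of the decoding, mirroring the `_range_from_json` helper from the first hunk (`parse_date` is a hypothetical stand-in for the `_CELLDATA_FROM_JSON` coercion):

import datetime

def parse_date(value):
    # Hypothetical stand-in for the _CELLDATA_FROM_JSON["DATE"] coercion.
    return datetime.date.fromisoformat(value)

def range_from_literal(literal):
    # Strip the surrounding "[" and ")", then split on the separator.
    start, end = literal[1:-1].split(", ")
    return {
        "start": None if start == "UNBOUNDED" else parse_date(start),
        "end": None if end == "UNBOUNDED" else parse_date(end),
    }

assert range_from_literal("[2020-10-01, 2021-10-02)") == {
    "start": datetime.date(2020, 10, 1),
    "end": datetime.date(2021, 10, 2),
}
assert range_from_literal("[UNBOUNDED, UNBOUNDED)") == {"start": None, "end": None}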
@@ -3653,6 +3661,9 @@ def test_to_dataframe_w_dtypes_mapper(self): SchemaField("datetime", "DATETIME"), SchemaField("time", "TIME"), SchemaField("timestamp", "TIMESTAMP"), + SchemaField("range_timestamp", "RANGE", range_element_type="TIMESTAMP"), + SchemaField("range_datetime", "RANGE", range_element_type="DATETIME"), + SchemaField("range_date", "RANGE", range_element_type="DATE"), ] row_data = [ [ @@ -3665,6 +3676,9 @@ def test_to_dataframe_w_dtypes_mapper(self): "1999-12-31T00:00:00.000000", "00:00:00.000000", "1433836800000000", + "[1433836800000000, 1433999900000000)", + "[2009-06-17T13:45:30, 2019-07-17T13:45:30)", + "[2020-10-01, 2021-10-02)", ], [ "Bharney Rhubble", @@ -3676,6 +3690,9 @@ def test_to_dataframe_w_dtypes_mapper(self): "4567-12-31T00:00:00.000000", "12:00:00.232413", "81953424000000000", + "[1433836800000000, UNBOUNDED)", + "[2009-06-17T13:45:30, UNBOUNDED)", + "[2020-10-01, UNBOUNDED)", ], [ "Wylma Phlyntstone", @@ -3687,6 +3704,9 @@ def test_to_dataframe_w_dtypes_mapper(self): "9999-12-31T23:59:59.999999", "23:59:59.999999", "253402261199999999", + "[UNBOUNDED, UNBOUNDED)", + "[UNBOUNDED, UNBOUNDED)", + "[UNBOUNDED, UNBOUNDED)", ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] @@ -3724,6 +3744,39 @@ def test_to_dataframe_w_dtypes_mapper(self): if hasattr(pandas, "ArrowDtype") else None ), + range_date_dtype=( + pandas.ArrowDtype( + pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + ) + if hasattr(pandas, "ArrowDtype") + else None + ), + range_datetime_dtype=( + pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) + ) + if hasattr(pandas, "ArrowDtype") + else None + ), + range_timestamp_dtype=( + pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) + ) + if hasattr(pandas, "ArrowDtype") + else None + ), ) self.assertIsInstance(df, pandas.DataFrame) @@ -3791,6 +3844,52 @@ def test_to_dataframe_w_dtypes_mapper(self): ], ) self.assertEqual(df.timestamp.dtype.name, "timestamp[us, tz=UTC][pyarrow]") + + self.assertEqual( + list(df.range_timestamp), + [ + { + "start": datetime.datetime( + 2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc + ), + "end": datetime.datetime( + 2015, 6, 11, 5, 18, 20, tzinfo=datetime.timezone.utc + ), + }, + { + "start": datetime.datetime( + 2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc + ), + "end": None, + }, + {"start": None, "end": None}, + ], + ) + + self.assertEqual( + list(df.range_datetime), + [ + { + "start": datetime.datetime(2009, 6, 17, 13, 45, 30), + "end": datetime.datetime(2019, 7, 17, 13, 45, 30), + }, + {"start": datetime.datetime(2009, 6, 17, 13, 45, 30), "end": None}, + {"start": None, "end": None}, + ], + ) + + self.assertEqual( + list(df.range_date), + [ + { + "start": datetime.date(2020, 10, 1), + "end": datetime.date(2021, 10, 2), + }, + {"start": datetime.date(2020, 10, 1), "end": None}, + {"start": None, "end": None}, + ], + ) + else: self.assertEqual( list(df.date), @@ -3851,6 +3950,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): SchemaField("datetime", "DATETIME"), SchemaField("time", "TIME"), SchemaField("timestamp", "TIMESTAMP"), + SchemaField("range_timestamp", "RANGE", range_element_type="TIMESTAMP"), + SchemaField("range_datetime", "RANGE", range_element_type="DATETIME"), + SchemaField("range_date", "RANGE", range_element_type="DATE"), ] row_data = [ [ @@ -3863,6 +3965,9 @@ def 
test_to_dataframe_w_none_dtypes_mapper(self): "1999-12-31T00:00:00.000000", "23:59:59.999999", "1433836800000000", + "[1433836800000000, 1433999900000000)", + "[2009-06-17T13:45:30, 2019-07-17T13:45:30)", + "[2020-10-01, 2021-10-02)", ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] @@ -3880,6 +3985,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): datetime_dtype=None, time_dtype=None, timestamp_dtype=None, + range_timestamp_dtype=None, + range_datetime_dtype=None, + range_date_dtype=None, ) self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(df.complete.dtype.name, "bool") @@ -3891,6 +3999,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): self.assertEqual(df.datetime.dtype.name, "datetime64[ns]") self.assertEqual(df.time.dtype.name, "object") self.assertEqual(df.timestamp.dtype.name, "datetime64[ns, UTC]") + self.assertEqual(df.range_timestamp.dtype.name, "object") + self.assertEqual(df.range_datetime.dtype.name, "object") + self.assertEqual(df.range_date.dtype.name, "object") def test_to_dataframe_w_unsupported_dtypes_mapper(self): pytest.importorskip("pandas")
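Taken together, the patch gives `to_dataframe()` three new keyword arguments. A usage sketch (editorial; the query and column name are illustrative, and the Arrow-backed defaults assume pandas >= 1.5):

from google.cloud import bigquery

client = bigquery.Client()
job = client.query(
    "SELECT RANGE(DATE '2020-01-01', DATE '2020-02-01') AS range_date"
)

# Default: the range column comes back as a pandas.ArrowDtype struct
# column whose cells are {"start": ..., "end": ...} values.
df = job.to_dataframe()
print(df["range_date"].dtype)  # e.g. struct<start: date32[day], end: date32[day]>[pyarrow]

# Passing None opts out and falls back to dtype object.
df_obj = job.to_dataframe(range_date_dtype=None)
print(df_obj["range_date"].dtype)  # object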