diff --git a/CHANGELOG.md b/CHANGELOG.md index 382123253..7bedb5cf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history +## [3.17.2](https://github.com/googleapis/python-bigquery/compare/v3.17.1...v3.17.2) (2024-01-30) + + +### Bug Fixes + +* Change load_table_from_json autodetect logic ([#1804](https://github.com/googleapis/python-bigquery/issues/1804)) ([6249032](https://github.com/googleapis/python-bigquery/commit/62490325f64e5d66303d9218992e28ac5f21cb3f)) + + +### Documentation + +* Update to use API ([#1781](https://github.com/googleapis/python-bigquery/issues/1781)) ([81563b0](https://github.com/googleapis/python-bigquery/commit/81563b06298fe3a64be6a89b583c3d64758ca12a)) +* Update `client_query_destination_table.py` sample to use `query_and_wait` ([#1783](https://github.com/googleapis/python-bigquery/issues/1783)) ([68ebbe1](https://github.com/googleapis/python-bigquery/commit/68ebbe12d455ce8e9b1784fb11787c2fb842ef22)) +* Update query_external_sheets_permanent_table.py to use query_and_wait API ([#1778](https://github.com/googleapis/python-bigquery/issues/1778)) ([a7be88a](https://github.com/googleapis/python-bigquery/commit/a7be88adf8a480ee61aa79789cb53df1b79bb091)) +* Update sample for query_to_arrow to use query_and_wait API ([#1776](https://github.com/googleapis/python-bigquery/issues/1776)) ([dbf10de](https://github.com/googleapis/python-bigquery/commit/dbf10dee51a7635e9b98658f205ded2de087a06f)) +* Update the query destination table legacy file to use query_and_wait API ([#1775](https://github.com/googleapis/python-bigquery/issues/1775)) ([ef89f9e](https://github.com/googleapis/python-bigquery/commit/ef89f9e58c22b3af5a7757b69daa030116012350)) +* Update to use `query_and_wait` in `client_query_w_positional_params.py` ([#1786](https://github.com/googleapis/python-bigquery/issues/1786)) 
([410f71e](https://github.com/googleapis/python-bigquery/commit/410f71e6b6e755928e363ed89c1044e14b0db9cc)) +* Update to use `query_and_wait` in `samples/client_query_w_timestamp_params.py` ([#1785](https://github.com/googleapis/python-bigquery/issues/1785)) ([ba36948](https://github.com/googleapis/python-bigquery/commit/ba3694852c13c8a29fe0f9d923353e82acfd4278)) +* Update to_geodataframe to use query_and_wait functionality ([#1800](https://github.com/googleapis/python-bigquery/issues/1800)) ([1298594](https://github.com/googleapis/python-bigquery/commit/12985942942b8f205ecd261fcdf620df9a640460)) + ## [3.17.1](https://github.com/googleapis/python-bigquery/compare/v3.17.0...v3.17.1) (2024-01-24) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index b2ea130c4..4708e753b 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -2833,8 +2833,22 @@ def load_table_from_json( new_job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON - if new_job_config.schema is None: - new_job_config.autodetect = True + # In specific conditions, we check if the table already exists, and/or + # set the autodetect value for the user. 
For exact conditions, see table + # https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + if new_job_config.schema is None and new_job_config.autodetect is None: + if new_job_config.write_disposition in ( + job.WriteDisposition.WRITE_TRUNCATE, + job.WriteDisposition.WRITE_EMPTY, + ): + new_job_config.autodetect = True + else: + try: + self.get_table(destination) + except core_exceptions.NotFound: + new_job_config.autodetect = True + else: + new_job_config.autodetect = False if project is None: project = self.project diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index 9b72eddf7..771b77a38 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "3.17.1" +__version__ = "3.17.2" diff --git a/samples/geography/to_geodataframe.py b/samples/geography/to_geodataframe.py index e36331f27..630d8d0bf 100644 --- a/samples/geography/to_geodataframe.py +++ b/samples/geography/to_geodataframe.py @@ -33,6 +33,6 @@ def get_austin_service_requests_as_geography() -> "pandas.DataFrame": LIMIT 10 """ - df = client.query(sql).to_geodataframe() + df = client.query_and_wait(sql).to_geodataframe() # [END bigquery_query_results_geodataframe] return df diff --git a/tests/system/test_client.py b/tests/system/test_client.py index d7e56f7ff..74c152cf2 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -994,6 +994,45 @@ def test_load_table_from_json_schema_autodetect(self): self.assertEqual(tuple(table.schema), table_schema) self.assertEqual(table.num_rows, 2) + # Autodetect makes best effort to infer the schema, but situations exist + # when the detected schema is wrong, and does not match existing schema. + # Thus the client sets autodetect = False when table exists and just uses + # the existing schema. 
This test case uses a special case where backend has + # no way to distinguish int from string. + def test_load_table_from_json_schema_autodetect_table_exists(self): + json_rows = [ + {"name": "123", "age": 18, "birthday": "2001-10-15", "is_awesome": False}, + {"name": "456", "age": 79, "birthday": "1940-03-10", "is_awesome": True}, + ] + + dataset_id = _make_dataset_id("bq_system_test") + self.temp_dataset(dataset_id) + table_id = "{}.{}.load_table_from_json_basic_use".format( + Config.CLIENT.project, dataset_id + ) + + # Use schema with NULLABLE fields, because schema autodetection + # defaults to field mode NULLABLE. + table_schema = ( + bigquery.SchemaField("name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("birthday", "DATE", mode="NULLABLE"), + bigquery.SchemaField("is_awesome", "BOOLEAN", mode="NULLABLE"), + ) + # create the table before loading so that the column order is predictable + table = helpers.retry_403(Config.CLIENT.create_table)( + Table(table_id, schema=table_schema) + ) + self.to_delete.insert(0, table) + + # do not pass an explicit job config to trigger automatic schema detection + load_job = Config.CLIENT.load_table_from_json(json_rows, table_id) + load_job.result() + + table = Config.CLIENT.get_table(table) + self.assertEqual(tuple(table.schema), table_schema) + self.assertEqual(table.num_rows, 2) + def test_load_avro_from_uri_then_dump_table(self): from google.cloud.bigquery.job import CreateDisposition from google.cloud.bigquery.job import SourceFormat diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 56bdbad5e..42581edc1 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -8951,6 +8951,8 @@ def test_load_table_from_dataframe_w_higher_scale_decimal128_datatype(self): SchemaField("x", "BIGNUMERIC", "NULLABLE", None), ) + # With autodetect specified, we pass the value as is. 
For more info, see + # https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 def test_load_table_from_json_basic_use(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -8962,12 +8964,28 @@ def test_load_table_from_json_basic_use(self): {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, ] + job_config = job.LoadJobConfig(autodetect=True) + load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) - with load_patch as load_table_from_file: - client.load_table_from_json(json_rows, self.TABLE_REF) + # mock: remote table already exists + get_table_reference = { + "projectId": "project_id", + "datasetId": "test_dataset", + "tableId": "test_table", + } + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock(table_reference=get_table_reference), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) load_table_from_file.assert_called_once_with( client, @@ -9066,6 +9084,174 @@ def test_load_table_from_json_w_invalid_job_config(self): err_msg = str(exc.value) assert "Expected an instance of LoadJobConfig" in err_msg + # When all following are true: + # (1) no schema provided; + # (2) no autodetect value provided; + # (3) writeDisposition == WRITE_APPEND or None; + # (4) table already exists, + # client sets autodetect == False + # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + def test_load_table_from_json_wo_schema_wo_autodetect_write_append_w_table(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.job import WriteDisposition + + client = self._make_client() + + json_rows = [ + {"name": "One", "age": 11, "birthday": 
"2008-09-10", "adult": False}, + {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, + ] + + job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + # mock: remote table already exists + get_table_reference = { + "projectId": "project_id", + "datasetId": "test_dataset", + "tableId": "test_table", + } + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock(table_reference=get_table_reference), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + size=mock.ANY, + num_retries=_DEFAULT_NUM_RETRIES, + job_id=mock.ANY, + job_id_prefix=None, + location=client.location, + project=client.project, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON + assert sent_config.schema is None + assert not sent_config.autodetect + + # When all following are true: + # (1) no schema provided; + # (2) no autodetect value provided; + # (3) writeDisposition == WRITE_APPEND or None; + # (4) table does NOT exist, + # client sets autodetect == True + # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + def test_load_table_from_json_wo_schema_wo_autodetect_write_append_wo_table(self): + import google.api_core.exceptions as core_exceptions + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.job import WriteDisposition + + client = self._make_client() + + json_rows = [ + {"name": "One", "age": 11, "birthday": 
"2008-09-10", "adult": False}, + {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, + ] + + job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + # mock: remote table doesn't exist + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=core_exceptions.NotFound(""), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + size=mock.ANY, + num_retries=_DEFAULT_NUM_RETRIES, + job_id=mock.ANY, + job_id_prefix=None, + location=client.location, + project=client.project, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON + assert sent_config.schema is None + assert sent_config.autodetect + + # When all following are true: + # (1) no schema provided; + # (2) no autodetect value provided; + # (3) writeDisposition == WRITE_TRUNCATE or WRITE_EMPTY; + # client sets autodetect == True + # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + def test_load_table_from_json_wo_schema_wo_autodetect_others(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.job import WriteDisposition + + client = self._make_client() + + json_rows = [ + {"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False}, + {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, + ] + + job_config = job.LoadJobConfig( + write_disposition=WriteDisposition.WRITE_TRUNCATE + ) + + load_patch = mock.patch( + 
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + with load_patch as load_table_from_file: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + size=mock.ANY, + num_retries=_DEFAULT_NUM_RETRIES, + job_id=mock.ANY, + job_id_prefix=None, + location=client.location, + project=client.project, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON + assert sent_config.schema is None + assert sent_config.autodetect + def test_load_table_from_json_w_explicit_job_config_override(self): from google.cloud.bigquery import job from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES @@ -9190,8 +9376,19 @@ def test_load_table_from_json_unicode_emoji_data_case(self): load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) + # mock: remote table already exists + get_table_reference = { + "projectId": "project_id", + "datasetId": "test_dataset", + "tableId": "test_table", + } + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock(table_reference=get_table_reference), + ) - with load_patch as load_table_from_file: + with load_patch as load_table_from_file, get_table_patch: client.load_table_from_json(json_rows, self.TABLE_REF) load_table_from_file.assert_called_once_with(