diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ccb0f75ec..d5fde119ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,9 +21,12 @@ Changes can also be flagged with a GitHub label for tracking purposes. The URL o
 
 ## [Unreleased](https://github.com/ethyca/fides/compare/2.54.0...main)
 
-## Changed
+### Changed
 
 - Added frequency field to DataHubSchema integration config [#5716](https://github.com/ethyca/fides/pull/5716)
 
+### Fixed
+- Fixed flaky BigQuery enterprise tests. [#5713](https://github.com/ethyca/fides/pull/5713)
+
 ## [2.53.0](https://github.com/ethyca/fides/compare/2.53.0...2.54.0)
 
 ### Added
diff --git a/tests/fixtures/bigquery_fixtures.py b/tests/fixtures/bigquery_fixtures.py
index 982e18a12b..396e2525da 100644
--- a/tests/fixtures/bigquery_fixtures.py
+++ b/tests/fixtures/bigquery_fixtures.py
@@ -60,6 +60,17 @@ def bigquery_connection_config(db: Session, bigquery_keyfile_creds) -> Generator
     connection_config.delete(db)
 
 
+@pytest.fixture(scope="function")
+def bigquery_enterprise_test_dataset_collections(
+    example_datasets: List[Dict],
+) -> List[str]:
+    """Returns the names of collections in the BigQuery Enterprise dataset"""
+    bigquery_enterprise_dataset = example_datasets[16]
+    return [
+        collection["name"] for collection in bigquery_enterprise_dataset["collections"]
+    ]
+
+
 @pytest.fixture(scope="function")
 def bigquery_enterprise_connection_config(
     db: Session, bigquery_enterprise_keyfile_creds
@@ -341,6 +352,8 @@ def bigquery_example_test_dataset_config_with_namespace_and_partitioning_meta(
 def bigquery_resources(
     bigquery_example_test_dataset_config,
 ):
+    # Increment the ids by a random number to avoid conflicts on concurrent test runs
+    random_increment = random.randint(1, 99999)
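+    # Lower bound of 1 (not 0) so max(id) + random_increment can never equal an existing id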
     bigquery_connection_config = bigquery_example_test_dataset_config.connection_config
     connector = BigQueryConnector(bigquery_connection_config)
     bigquery_client = connector.client()
@@ -351,11 +364,11 @@ def bigquery_resources(
 
         stmt = "select max(id) from customer;"
         res = connection.execute(stmt)
-        customer_id = res.all()[0][0] + 1
+        customer_id = res.all()[0][0] + random_increment
 
         stmt = "select max(id) from address;"
         res = connection.execute(stmt)
-        address_id = res.all()[0][0] + 1
+        address_id = res.all()[0][0] + random_increment
 
         city = "Test City"
         state = "TX"
@@ -382,7 +395,7 @@ def bigquery_resources(
 
         stmt = "select max(id) from employee;"
         res = connection.execute(stmt)
-        employee_id = res.all()[0][0] + 1
+        employee_id = res.all()[0][0] + random_increment
 
         employee_email = f"employee-{uuid}@example.com"
         employee_name = f"Jane {uuid}"
@@ -422,6 +435,8 @@ def bigquery_resources(
 def bigquery_resources_with_namespace_meta(
     bigquery_example_test_dataset_config_with_namespace_meta,
 ):
+    # Increment the ids by a random number to avoid conflicts on concurrent test runs
+    random_increment = random.randint(1, 99999)
     bigquery_connection_config = (
         bigquery_example_test_dataset_config_with_namespace_meta.connection_config
     )
@@ -434,11 +449,11 @@ def bigquery_resources_with_namespace_meta(
 
         stmt = "select max(id) from fidesopstest.customer;"
         res = connection.execute(stmt)
-        customer_id = res.all()[0][0] + 1
+        customer_id = res.all()[0][0] + random_increment
 
         stmt = "select max(id) from fidesopstest.address;"
         res = connection.execute(stmt)
-        address_id = res.all()[0][0] + 1
+        address_id = res.all()[0][0] + random_increment
 
         city = "Test City"
         state = "TX"
@@ -465,7 +480,7 @@ def bigquery_resources_with_namespace_meta(
 
         stmt = "select max(id) from fidesopstest.employee;"
         res = connection.execute(stmt)
-        employee_id = res.all()[0][0] + 1
+        employee_id = res.all()[0][0] + random_increment
 
         employee_email = f"employee-{uuid}@example.com"
         employee_name = f"Jane {uuid}"
@@ -505,6 +520,8 @@ def bigquery_resources_with_namespace_meta(
 def bigquery_enterprise_resources(
     bigquery_enterprise_test_dataset_config,
 ):
+    # Increment the ids by a random number to avoid conflicts on concurrent test runs
+    random_increment = random.randint(1, 99999)
     bigquery_connection_config = (
         bigquery_enterprise_test_dataset_config.connection_config
     )
@@ -515,8 +532,6 @@ def bigquery_enterprise_resources(
         # Real max id in the Stackoverflow dataset is 20081052, so we purposefully generate and id above this max
         stmt = "select max(id) from enterprise_dsr_testing.users;"
         res = connection.execute(stmt)
-        # Increment the id by a random number to avoid conflicts on concurrent test runs
-        random_increment = random.randint(0, 99999)
         user_id = res.all()[0][0] + random_increment
         display_name = (
             f"fides_testing_{user_id}"  # prefix to do manual cleanup if needed
         )
@@ -536,7 +551,6 @@ def bigquery_enterprise_resources(
         post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight."
         stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;"
         res = connection.execute(stmt)
-        random_increment = random.randint(0, 99999)
         post_id = res.all()[0][0] + random_increment
         stmt = f"""
            insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name)
@@ -547,7 +561,6 @@ def bigquery_enterprise_resources(
         # Create test comments data. Comments are responses to posts or questions on Stackoverflow, and does not include original question or post itself.
         stmt = "select max(id) from enterprise_dsr_testing.comments;"
         res = connection.execute(stmt)
-        random_increment = random.randint(0, 99999)
         comment_id = res.all()[0][0] + random_increment
         comment_text = "FYI this only works if you have pytest installed locally."
stmt = f""" @@ -557,9 +570,8 @@ def bigquery_enterprise_resources( connection.execute(stmt) # Create test post_history data - stmt = "select max(id) from enterprise_dsr_testing.comments;" + stmt = "select max(id) from enterprise_dsr_testing.post_history;" res = connection.execute(stmt) - random_increment = random.randint(0, 99999) post_history_id = res.all()[0][0] + random_increment revision_text = "this works if you have pytest" uuid = str(uuid4()) @@ -600,6 +612,8 @@ def bigquery_enterprise_resources( def bigquery_enterprise_resources_with_partitioning( bigquery_enterprise_test_dataset_config_with_partitioning_meta, ): + # Increment the ids by a random number to avoid conflicts on concurrent test runs + random_increment = random.randint(1, 99999) bigquery_connection_config = ( bigquery_enterprise_test_dataset_config_with_partitioning_meta.connection_config ) @@ -610,8 +624,6 @@ def bigquery_enterprise_resources_with_partitioning( # Real max id in the Stackoverflow dataset is 20081052, so we purposefully generate and id above this max stmt = "select max(id) from enterprise_dsr_testing.users;" res = connection.execute(stmt) - # Increment the id by a random number to avoid conflicts on concurrent test runs - random_increment = random.randint(0, 99999) user_id = res.all()[0][0] + random_increment display_name = ( f"fides_testing_{user_id}" # prefix to do manual cleanup if needed @@ -631,7 +643,6 @@ def bigquery_enterprise_resources_with_partitioning( post_body = "For me, the solution was to adopt 3 cats and dance with them under the full moon at midnight." stmt = "select max(id) from enterprise_dsr_testing.stackoverflow_posts_partitioned;" res = connection.execute(stmt) - random_increment = random.randint(0, 99999) post_id = res.all()[0][0] + random_increment stmt = f""" insert into enterprise_dsr_testing.stackoverflow_posts_partitioned (body, creation_date, id, owner_user_id, owner_display_name) @@ -642,7 +653,6 @@ def bigquery_enterprise_resources_with_partitioning( # Create test comments data. Comments are responses to posts or questions on Stackoverflow, and does not include original question or post itself. stmt = "select max(id) from enterprise_dsr_testing.comments;" res = connection.execute(stmt) - random_increment = random.randint(0, 99999) comment_id = res.all()[0][0] + random_increment comment_text = "FYI this only works if you have pytest installed locally." 
stmt = f""" @@ -652,9 +662,8 @@ def bigquery_enterprise_resources_with_partitioning( connection.execute(stmt) # Create test post_history data - stmt = "select max(id) from enterprise_dsr_testing.comments;" + stmt = "select max(id) from enterprise_dsr_testing.post_history;" res = connection.execute(stmt) - random_increment = random.randint(0, 99999) post_history_id = res.all()[0][0] + random_increment revision_text = "this works if you have pytest" uuid = str(uuid4()) diff --git a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py index 9042d4758a..d62f9fbd05 100644 --- a/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py +++ b/tests/ops/service/privacy_request/test_bigquery_enterprise_privacy_request.py @@ -1,9 +1,11 @@ +from typing import Any, Dict, List, Optional from unittest import mock import pytest from fides.api.models.audit_log import AuditLog, AuditLogAction -from fides.api.models.privacy_request import ExecutionLog +from fides.api.models.privacy_request import ExecutionLog, PrivacyRequest +from fides.api.util.collection_util import Row from tests.ops.service.privacy_request.test_request_runner_service import ( get_privacy_request_results, ) @@ -13,6 +15,107 @@ PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL = 150 +def validate_privacy_request( + pr: PrivacyRequest, + user_id: int, + bigquery_enterprise_test_dataset_collections: List[str], + access: bool = True, +) -> Dict[str, Optional[List[Row]]]: + """ + Validates the results of a privacy request with assertions. + - Checks that all collections have been queried + - Checks that all keys have a non-empty value + - Checks that only results for the user_id are returned + - Checks that the expected number of records are returned for each collection + + Note: The access boolean determines if we are looking at the access or erasure result counts. 
+ + """ + results = pr.get_raw_access_results() + + assert len(results.keys()) == len(bigquery_enterprise_test_dataset_collections) + + for key in results.keys(): + assert results[key] is not None + assert results[key] != {} + + users = results["enterprise_dsr_testing:users"] + assert len(users) == 1 + user_details = users[0] + assert user_details["id"] == user_id + + assert ( + len( + [ + comment["user_id"] + for comment in results["enterprise_dsr_testing:comments"] + ] + ) + == 16 + if access + else 1 + ) + assert ( + len( + [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]] + ) + == 39 + if access + else 1 + ) + assert ( + len( + [ + post["title"] + for post in results[ + "enterprise_dsr_testing:stackoverflow_posts_partitioned" + ] + ] + ) + == 30 + if access + else 1 + ) + + return results + + +def validate_erasure_privacy_request( + bigquery_enterprise_resources: dict[str, Any], user_id: int +) -> None: + """Validates the results of an erasure request with assertions.""" + bigquery_client = bigquery_enterprise_resources["client"] + post_history_id = bigquery_enterprise_resources["post_history_id"] + comment_id = bigquery_enterprise_resources["comment_id"] + post_id = bigquery_enterprise_resources["post_id"] + with bigquery_client.connect() as connection: + stmt = f"select text from enterprise_dsr_testing.post_history where id = {post_history_id};" + res = connection.execute(stmt).all() + for row in res: + assert row.text is None + + stmt = f"select user_display_name, text from enterprise_dsr_testing.comments where id = {comment_id};" + res = connection.execute(stmt).all() + for row in res: + assert row.user_display_name is None + assert row.text is None + + stmt = f"select owner_user_id, owner_display_name, body from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};" + res = connection.execute(stmt).all() + for row in res: + assert ( + row.owner_user_id == bigquery_enterprise_resources["user_id"] + ) # not targeted by policy + assert row.owner_display_name is None + assert row.body is None + + stmt = f"select display_name, location from enterprise_dsr_testing.users where id = {user_id};" + res = connection.execute(stmt).all() + for row in res: + assert row.display_name is None + assert row.location is None + + @pytest.mark.integration_bigquery @pytest.mark.integration_external @pytest.mark.parametrize( @@ -37,6 +140,7 @@ def test_access_request( policy_pre_execution_webhooks, policy_post_execution_webhooks, run_privacy_request_task, + bigquery_enterprise_test_dataset_collections, ): request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 request.getfixturevalue( @@ -67,44 +171,7 @@ def test_access_request( PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL, ) - results = pr.get_raw_access_results() - assert len(results.keys()) == 4 - - for key in results.keys(): - assert results[key] is not None - assert results[key] != {} - - users = results["enterprise_dsr_testing:users"] - assert len(users) == 1 - user_details = users[0] - assert user_details["id"] == user_id - - assert ( - len( - [ - comment["user_id"] - for comment in results["enterprise_dsr_testing:comments"] - ] - ) - == 16 - ) - assert ( - len( - [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]] - ) - == 39 - ) - assert ( - len( - [ - post["title"] - for post in results[ - "enterprise_dsr_testing:stackoverflow_posts_partitioned" - ] - ] - ) - == 30 - ) + validate_privacy_request(pr, user_id, bigquery_enterprise_test_dataset_collections) 
+    bigquery_client = bigquery_enterprise_resources["client"]
+    post_history_id = bigquery_enterprise_resources["post_history_id"]
+    comment_id = bigquery_enterprise_resources["comment_id"]
+    post_id = bigquery_enterprise_resources["post_id"]
+    with bigquery_client.connect() as connection:
+        stmt = f"select text from enterprise_dsr_testing.post_history where id = {post_history_id};"
+        res = connection.execute(stmt).all()
+        for row in res:
+            assert row.text is None
+
+        stmt = f"select user_display_name, text from enterprise_dsr_testing.comments where id = {comment_id};"
+        res = connection.execute(stmt).all()
+        for row in res:
+            assert row.user_display_name is None
+            assert row.text is None
+
+        stmt = f"select owner_user_id, owner_display_name, body from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
+        res = connection.execute(stmt).all()
+        for row in res:
+            assert (
+                row.owner_user_id == bigquery_enterprise_resources["user_id"]
+            )  # not targeted by policy
+            assert row.owner_display_name is None
+            assert row.body is None
+
+        stmt = f"select display_name, location from enterprise_dsr_testing.users where id = {user_id};"
+        res = connection.execute(stmt).all()
+        for row in res:
+            assert row.display_name is None
+            assert row.location is None
+
+
 @pytest.mark.integration_bigquery
 @pytest.mark.integration_external
 @pytest.mark.parametrize(
@@ -37,6 +140,7 @@ def test_access_request(
     policy_pre_execution_webhooks,
     policy_post_execution_webhooks,
     run_privacy_request_task,
+    bigquery_enterprise_test_dataset_collections,
 ):
     request.getfixturevalue(dsr_version)  # REQUIRED to test both DSR 3.0 and 2.0
     request.getfixturevalue(
@@ -67,44 +171,7 @@ def test_access_request(
         PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL,
     )
 
-    results = pr.get_raw_access_results()
-    assert len(results.keys()) == 4
-
-    for key in results.keys():
-        assert results[key] is not None
-        assert results[key] != {}
-
-    users = results["enterprise_dsr_testing:users"]
-    assert len(users) == 1
-    user_details = users[0]
-    assert user_details["id"] == user_id
-
-    assert (
-        len(
-            [
-                comment["user_id"]
-                for comment in results["enterprise_dsr_testing:comments"]
-            ]
-        )
-        == 16
-    )
-    assert (
-        len(
-            [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]]
-        )
-        == 39
-    )
-    assert (
-        len(
-            [
-                post["title"]
-                for post in results[
-                    "enterprise_dsr_testing:stackoverflow_posts_partitioned"
-                ]
-            ]
-        )
-        == 30
-    )
+    validate_privacy_request(pr, user_id, bigquery_enterprise_test_dataset_collections)
 
     log_id = pr.execution_logs[0].id
     pr_id = pr.id
@@ -156,10 +223,10 @@ def test_erasure_request(
     bigquery_fixtures,
     bigquery_enterprise_erasure_policy,
     run_privacy_request_task,
+    bigquery_enterprise_test_dataset_collections,
 ):
     request.getfixturevalue(dsr_version)  # REQUIRED to test both DSR 3.0 and 2.0
     bigquery_enterprise_resources = request.getfixturevalue(bigquery_fixtures)
-    bigquery_client = bigquery_enterprise_resources["client"]
 
     # first test access request against manually added data
     user_id = bigquery_enterprise_resources["user_id"]
@@ -184,43 +251,8 @@ def test_erasure_request(
         PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL,
     )
 
-    results = pr.get_raw_access_results()
-    assert len(results.keys()) == 4
-
-    for key in results.keys():
-        assert results[key] is not None
-        assert results[key] != {}
-
-    users = results["enterprise_dsr_testing:users"]
-    assert len(users) == 1
-    user_details = users[0]
-    assert user_details["id"] == user_id
-
-    assert (
-        len(
-            [
-                comment["user_id"]
-                for comment in results["enterprise_dsr_testing:comments"]
-            ]
-        )
-        == 1
-    )
-    assert (
-        len(
-            [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]]
-        )
-        == 1
-    )
-    assert (
-        len(
-            [
-                post["title"]
-                for post in results[
-                    "enterprise_dsr_testing:stackoverflow_posts_partitioned"
-                ]
-            ]
-        )
-        == 1
+    validate_privacy_request(
+        pr, user_id, bigquery_enterprise_test_dataset_collections, False
     )
 
     data = {
@@ -230,7 +262,7 @@ def test_erasure_request(
             "email": customer_email,
             "stackoverflow_user_id": {
                 "label": "Stackoverflow User Id",
-                "value": bigquery_enterprise_resources["user_id"],
+                "value": user_id,
             },
         },
     }
@@ -245,36 +277,7 @@ def test_erasure_request(
     )
     pr.delete(db=db)
 
-    bigquery_client = bigquery_enterprise_resources["client"]
-    post_history_id = bigquery_enterprise_resources["post_history_id"]
-    comment_id = bigquery_enterprise_resources["comment_id"]
-    post_id = bigquery_enterprise_resources["post_id"]
-    with bigquery_client.connect() as connection:
-        stmt = f"select text from enterprise_dsr_testing.post_history where id = {post_history_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert row.text is None
-
-        stmt = f"select user_display_name, text from enterprise_dsr_testing.comments where id = {comment_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert row.user_display_name is None
-            assert row.text is None
-
-        stmt = f"select owner_user_id, owner_display_name, body from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert (
-                row.owner_user_id == bigquery_enterprise_resources["user_id"]
-            )  # not targeted by policy
-            assert row.owner_display_name is None
-            assert row.body is None
-
-        stmt = f"select display_name, location from enterprise_dsr_testing.users where id = {user_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert row.display_name is None
-            assert row.location is None
+    validate_erasure_privacy_request(bigquery_enterprise_resources, user_id)
 
 
 @pytest.mark.integration_bigquery
@@ -295,6 +298,7 @@ def test_access_request_multiple_custom_identities(
     policy_pre_execution_webhooks,
     policy_post_execution_webhooks,
     run_privacy_request_task,
+    bigquery_enterprise_test_dataset_collections,
 ):
     request.getfixturevalue(dsr_version)  # REQUIRED to test both DSR 3.0 and 2.0
 
@@ -321,44 +325,7 @@ def test_access_request_multiple_custom_identities(
         PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL,
     )
 
-    results = pr.get_raw_access_results()
-    assert len(results.keys()) == 4
-
-    for key in results.keys():
-        assert results[key] is not None
-        assert results[key] != {}
-
-    users = results["enterprise_dsr_testing:users"]
-    assert len(users) == 1
-    user_details = users[0]
-    assert user_details["id"] == user_id
-
-    assert (
-        len(
-            [
-                comment["user_id"]
-                for comment in results["enterprise_dsr_testing:comments"]
-            ]
-        )
-        == 16
-    )
-    assert (
-        len(
-            [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]]
-        )
-        == 39
-    )
-    assert (
-        len(
-            [
-                post["title"]
-                for post in results[
-                    "enterprise_dsr_testing:stackoverflow_posts_partitioned"
-                ]
-            ]
-        )
-        == 30
-    )
+    validate_privacy_request(pr, user_id, bigquery_enterprise_test_dataset_collections)
 
     log_id = pr.execution_logs[0].id
     pr_id = pr.id
@@ -410,10 +377,10 @@ def test_erasure_request_multiple_custom_identities(
     bigquery_fixtures,
     bigquery_enterprise_erasure_policy,
     run_privacy_request_task,
+    bigquery_enterprise_test_dataset_collections,
 ):
     request.getfixturevalue(dsr_version)  # REQUIRED to test both DSR 3.0 and 2.0
     bigquery_enterprise_resources = request.getfixturevalue(bigquery_fixtures)
-    bigquery_client = bigquery_enterprise_resources["client"]
 
     # first test access request against manually added data
     user_id = bigquery_enterprise_resources["user_id"]
@@ -437,43 +404,8 @@ def test_erasure_request_multiple_custom_identities(
         PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL,
    )
 
-    results = pr.get_raw_access_results()
-    assert len(results.keys()) == 4
-
-    for key in results.keys():
-        assert results[key] is not None
-        assert results[key] != {}
-
-    users = results["enterprise_dsr_testing:users"]
-    assert len(users) == 1
-    user_details = users[0]
-    assert user_details["id"] == user_id
-
-    assert (
-        len(
-            [
-                comment["user_id"]
-                for comment in results["enterprise_dsr_testing:comments"]
-            ]
-        )
-        == 1
-    )
-    assert (
-        len(
-            [post["user_id"] for post in results["enterprise_dsr_testing:post_history"]]
-        )
-        == 1
-    )
-    assert (
-        len(
-            [
-                post["title"]
-                for post in results[
-                    "enterprise_dsr_testing:stackoverflow_posts_partitioned"
-                ]
-            ]
-        )
-        == 1
+    validate_privacy_request(
+        pr, user_id, bigquery_enterprise_test_dataset_collections, False
     )
 
     data = {
@@ -497,33 +429,4 @@ def test_erasure_request_multiple_custom_identities(
     )
     pr.delete(db=db)
 
-    bigquery_client = bigquery_enterprise_resources["client"]
-    post_history_id = bigquery_enterprise_resources["post_history_id"]
-    comment_id = bigquery_enterprise_resources["comment_id"]
-    post_id = bigquery_enterprise_resources["post_id"]
-    with bigquery_client.connect() as connection:
-        stmt = f"select text from enterprise_dsr_testing.post_history where id = {post_history_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert row.text is None
-
-        stmt = f"select user_display_name, text from enterprise_dsr_testing.comments where id = {comment_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert row.user_display_name is None
-            assert row.text is None
-
-        stmt = f"select owner_user_id, owner_display_name, body from enterprise_dsr_testing.stackoverflow_posts_partitioned where id = {post_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert (
-                row.owner_user_id == bigquery_enterprise_resources["user_id"]
-            )  # not targeted by policy
-            assert row.owner_display_name is None
-            assert row.body is None
-
-        stmt = f"select display_name, location from enterprise_dsr_testing.users where id = {user_id};"
-        res = connection.execute(stmt).all()
-        for row in res:
-            assert row.display_name is None
-            assert row.location is None
+    validate_erasure_privacy_request(bigquery_enterprise_resources, user_id)