Adding dataset reference validation to privacy request timeline
galvana committed Feb 10, 2025
1 parent bec5265 commit 24d6ab4
Showing 4 changed files with 135 additions and 10 deletions.
6 changes: 2 additions & 4 deletions src/fides/api/api/v1/endpoints/generic_overrides.py
@@ -86,8 +86,7 @@ async def create_dataset(
 ) -> Dict:
     """Create a new dataset"""
     try:
-        created = dataset_service.create_dataset(dataset)
-        return created.model_dump()
+        return dataset_service.create_dataset(dataset)
     except PydanticValidationError as e:
         raise HTTPException(
             status_code=HTTP_422_UNPROCESSABLE_ENTITY,
@@ -193,8 +192,7 @@ async def get_dataset(
     """Get a single dataset by fides key"""
     service = DatasetService(db)
     try:
-        dataset = service.get_dataset(fides_key)
-        return dataset.model_dump()
+        return service.get_dataset(fides_key)
     except DatasetNotFoundException as e:
         raise HTTPException(
             status_code=HTTP_404_NOT_FOUND,
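This simplification works because FastAPI serializes a returned Pydantic model itself, so the explicit model_dump() calls were redundant. A minimal, self-contained sketch of that behavior (hypothetical Item model, not from this codebase):

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class Item(BaseModel):
    fides_key: str
    name: str


@app.get("/items/{fides_key}")
async def get_item(fides_key: str) -> Item:
    # FastAPI runs the return value through its JSON encoder,
    # so returning the model directly yields the same response body
    # as returning item.model_dump()
    return Item(fides_key=fides_key, name="example")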
8 changes: 3 additions & 5 deletions src/fides/api/graph/graph.py
@@ -196,12 +196,10 @@ def __init__(self, *datasets: GraphDataset) -> None:
             )
             for dest_field_address, direction in ref_list:
                 if dest_field_address.collection_address() not in self.nodes:
-                    logger.warning(
-                        "Referenced object {} does not exist", dest_field_address
-                    )
+                    message = f"Referenced object {dest_field_address} from dataset {node_address.dataset} does not exist"
+                    logger.warning(message)
                     raise ValidationError(
-                        f"Referred to object {dest_field_address} does not exist",
-                        errors=[dest_field_address.value],
+                        message, errors=[dest_field_address.value]
                     )
                 self.edges.add(
                     Edge.create_edge(
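For reference, a tiny sketch (hypothetical dataset and field names, not from this repo) of the message the rewritten branch now produces when a field references a collection that is missing from the graph:

# Hypothetical addresses: dataset "orders" references a field in a
# "customers" dataset that was never loaded into the graph
dest_field_address = "customers:profile:email"  # dataset:collection:field
source_dataset = "orders"

message = (
    f"Referenced object {dest_field_address} "
    f"from dataset {source_dataset} does not exist"
)
print(message)
# Referenced object customers:profile:email from dataset orders does not exist

Including the source dataset in the message (and reusing the same string for both the warning log and the raised ValidationError) makes the dangling reference traceable from either side.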
28 changes: 27 additions & 1 deletion src/fides/api/service/privacy_request/request_runner_service.py
@@ -3,7 +3,6 @@

 import requests
 from loguru import logger
-from pydantic import ValidationError
 from sqlalchemy.orm import Query, Session

 from fides.api import common_exceptions
@@ -15,6 +14,7 @@
     NoCachedManualWebhookEntry,
     PrivacyRequestExit,
     PrivacyRequestPaused,
+    ValidationError,
 )
 from fides.api.db.session import get_db_session
 from fides.api.graph.config import CollectionAddress
@@ -355,6 +355,17 @@ def run_privacy_request(
             if not dataset_config.connection_config.disabled
         ]
         dataset_graph = DatasetGraph(*dataset_graphs)
+
+        # Add success log for dataset reference validation
+        privacy_request.add_success_execution_log(
+            session,
+            connection_key=None,
+            dataset_name="Dataset reference validation",
+            collection_name=None,
+            message=f"Dataset reference validation successful for privacy request: {privacy_request.id}",
+            action_type=privacy_request.policy.get_action_type(),  # type: ignore
+        )
+
         identity_data = {
             key: value["value"] if isinstance(value, dict) else value
             for key, value in privacy_request.get_cached_identity_data().items()
@@ -482,7 +493,22 @@ def run_privacy_request(
             # the appropriate checkpoint when all the Request Tasks have run.
             return

+        except ValidationError as exc:
+            # Handle validation errors from dataset graph creation
+            logger.error(f"Error validating dataset references: {str(exc)}")
+            privacy_request.add_error_execution_log(
+                session,
+                connection_key=None,
+                dataset_name="Dataset reference validation",
+                collection_name=None,
+                message=str(exc),
+                action_type=privacy_request.policy.get_action_type(),  # type: ignore
+            )
+            privacy_request.error_processing(db=session)
+            return
+
         except BaseException as exc:  # pylint: disable=broad-except
             logger.error(f"Error running privacy request: {str(exc)}")
             privacy_request.error_processing(db=session)
             # If dev mode, log traceback
             _log_exception(exc, CONFIG.dev_mode)
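Condensing the two runner hunks above into one view, the control flow this commit adds is roughly the following (a simplified sketch using the names from the diff; the real function has far more logic between graph construction and the except clause):

try:
    # DatasetGraph raises ValidationError when a field references a
    # collection that is not part of the graph (see graph.py above)
    dataset_graph = DatasetGraph(*dataset_graphs)
    privacy_request.add_success_execution_log(
        session,
        connection_key=None,
        dataset_name="Dataset reference validation",
        collection_name=None,
        message=f"Dataset reference validation successful for privacy request: {privacy_request.id}",
        action_type=privacy_request.policy.get_action_type(),
    )
except ValidationError as exc:
    # A dangling reference surfaces as an error entry on the request
    # timeline instead of an unexplained processing failure
    privacy_request.add_error_execution_log(
        session,
        connection_key=None,
        dataset_name="Dataset reference validation",
        collection_name=None,
        message=str(exc),
        action_type=privacy_request.policy.get_action_type(),
    )
    privacy_request.error_processing(db=session)

Either way, the outcome of dataset reference validation becomes a visible entry in the privacy request's execution log.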
103 changes: 103 additions & 0 deletions tests/ops/service/privacy_request/test_request_runner_service.py
@@ -8,13 +8,15 @@
 import pytest
 from pydantic import ValidationError
 from sqlalchemy.orm import Session
+from sqlalchemy.orm.attributes import flag_modified

 from fides.api.common_exceptions import (
     ClientUnsuccessfulException,
     PrivacyRequestPaused,
 )
 from fides.api.graph.graph import DatasetGraph
 from fides.api.models.application_config import ApplicationConfig
+from fides.api.models.datasetconfig import DatasetConfig
 from fides.api.models.policy import CurrentStep, PolicyPostWebhook
 from fides.api.models.privacy_request import (
     ActionType,
@@ -1372,3 +1374,104 @@ def test_async_callback_erasure_request(
     # node cannot be paused
     db.refresh(pr)
     assert pr.status == PrivacyRequestStatus.complete
+
+
+class TestDatasetReferenceValidation:
+    @pytest.mark.usefixtures("dataset_config")
+    @mock.patch(
+        "fides.api.service.privacy_request.request_runner_service.access_runner"
+    )
+    @pytest.mark.parametrize(
+        "dsr_version",
+        ["use_dsr_3_0", "use_dsr_2_0"],
+    )
+    def test_dataset_reference_validation_success(
+        self,
+        run_access,
+        db: Session,
+        privacy_request: PrivacyRequest,
+        run_privacy_request_task,
+        request,
+        dsr_version,
+    ):
+        """Test that successful dataset reference validation is logged"""
+
+        request.getfixturevalue(dsr_version)  # REQUIRED to test both DSR 3.0 and 2.0
+
+        # Run privacy request
+        run_privacy_request_task.delay(privacy_request.id).get(
+            timeout=PRIVACY_REQUEST_TASK_TIMEOUT
+        )
+
+        # Verify success log was created
+        success_logs = privacy_request.execution_logs.filter_by(
+            status="complete"
+        ).all()
+
+        validation_logs = [
+            log
+            for log in success_logs
+            if log.dataset_name == "Dataset reference validation"
+        ]
+
+        assert len(validation_logs) == 1
+        log = validation_logs[0]
+        assert log.connection_key is None
+        assert log.collection_name is None
+        assert (
+            log.message
+            == f"Dataset reference validation successful for privacy request: {privacy_request.id}"
+        )
+        assert log.action_type == privacy_request.policy.get_action_type()
+
+    @mock.patch(
+        "fides.api.service.privacy_request.request_runner_service.access_runner"
+    )
+    @pytest.mark.parametrize(
+        "dsr_version",
+        ["use_dsr_3_0", "use_dsr_2_0"],
+    )
+    def test_dataset_reference_validation_error(
+        self,
+        run_access,
+        db: Session,
+        privacy_request: PrivacyRequest,
+        dataset_config: DatasetConfig,
+        run_privacy_request_task,
+        request,
+        dsr_version,
+    ):
+        """Test that dataset reference validation errors are logged"""
+
+        request.getfixturevalue(dsr_version)  # REQUIRED to test both DSR 3.0 and 2.0
+
+        # Add an invalid dataset reference that will cause a validation error
+        dataset_config.ctl_dataset.collections[0]["fields"][0]["fides_meta"] = {
+            "references": [
+                {"dataset": "invalid_dataset", "field": "invalid_collection.field"}
+            ]
+        }
+        flag_modified(dataset_config.ctl_dataset, "collections")
+        dataset_config.save(db)
+
+        # Run privacy request
+        run_privacy_request_task.delay(privacy_request.id).get(
+            timeout=PRIVACY_REQUEST_TASK_TIMEOUT
+        )
+
+        # Verify error log was created
+        error_logs = privacy_request.execution_logs.filter_by(status="error").all()
+
+        validation_logs = [
+            log
+            for log in error_logs
+            if log.dataset_name == "Dataset reference validation"
+        ]
+
+        assert len(validation_logs) == 1
+        log = validation_logs[0]
+        assert log.connection_key is None
+        assert log.collection_name is None
+        assert (
+            "Referenced object invalid_dataset:invalid_collection:field from dataset postgres_example_subscriptions_dataset does not exist"
+            in log.message
+        )
+        assert log.action_type == privacy_request.policy.get_action_type()

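To run just these new tests locally, a standard pytest selection along these lines should work (exact invocation may vary with the repo's tooling):

pytest tests/ops/service/privacy_request/test_request_runner_service.py -k TestDatasetReferenceValidation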