From a30638ba11993563e89dbfe18a8b7c77c441fd2d Mon Sep 17 00:00:00 2001
From: Isaac To
Date: Sun, 9 Feb 2025 22:23:25 -0800
Subject: [PATCH] feat: add solution to correct `Affiliation` corruption

Provide solution to correct the corruption of `Affiliation` JSON objects
documented in https://github.com/dandi/dandi-schema/issues/276
---
 .../management/commands/correct_metadata.py   |  42 +++-
 .../api/tests/test_management/__init__.py     |   0
 .../test_management/test_commands/__init__.py |   0
 .../test_commands/test_correct_metadata.py    | 224 ++++++++++++++++++
 4 files changed, 265 insertions(+), 1 deletion(-)
 create mode 100644 dandiapi/api/tests/test_management/__init__.py
 create mode 100644 dandiapi/api/tests/test_management/test_commands/__init__.py
 create mode 100644 dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py

diff --git a/dandiapi/api/management/commands/correct_metadata.py b/dandiapi/api/management/commands/correct_metadata.py
index 269828290..8104b7b01 100644
--- a/dandiapi/api/management/commands/correct_metadata.py
+++ b/dandiapi/api/management/commands/correct_metadata.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from copy import deepcopy
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -92,3 +93,42 @@ def correct_affiliation_corruption(meta: dict) -> dict | None:
     Note: This function corrects the corruptions described in
     https://github.com/dandi/dandi-schema/issues/276
     """
+    unwanted_fields = ['contactPoint', 'includeInCitation', 'roleName']  # keys to strip
+
+    meta_corrected = deepcopy(meta)  # edit a copy; the caller's `meta` stays untouched
+    affiliation_objs = find_objs(meta_corrected, 'Affiliation')  # live references into the copy
+
+    corrected = False  # becomes True once any field has been deleted
+    for obj in affiliation_objs:
+        for field in unwanted_fields:
+            if field in obj:
+                del obj[field]
+                corrected = True
+
+    return meta_corrected if corrected else None  # None when nothing was removed
+
+
+def find_objs(instance: Any, schema_key: str) -> list[dict]:
+    """
+    Recursively find JSON objects with a specified `"schemaKey"` field within a data instance.
+
+    :param instance: The data instance to search; only dicts and lists are traversed
+    :param schema_key: The `"schemaKey"` field value to match
+    :return: The matching dicts themselves (not copies), each parent before its nested matches
+    """
+
+    def find_objs_(data: Any) -> None:
+        if isinstance(data, dict):
+            if 'schemaKey' in data and data['schemaKey'] == schema_key:
+                objs.append(data)  # record before descending: parents precede children
+            for value in data.values():  # a match may itself contain matches
+                find_objs_(value)
+        elif isinstance(data, list):
+            for item in data:
+                find_objs_(item)
+        else:
+            return  # neither dict nor list: nothing to search
+
+    objs: list[dict] = []  # filled in place by find_objs_()
+    find_objs_(instance)
+    return objs
diff --git a/dandiapi/api/tests/test_management/__init__.py b/dandiapi/api/tests/test_management/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/dandiapi/api/tests/test_management/test_commands/__init__.py b/dandiapi/api/tests/test_management/test_commands/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py b/dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py
new file mode 100644
index 000000000..d4915d60c
--- /dev/null
+++ b/dandiapi/api/tests/test_management/test_commands/test_correct_metadata.py
@@ -0,0 +1,224 @@
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import Any
+
+import pytest
+
+from dandiapi.api.management.commands.correct_metadata import (
+    correct_affiliation_corruption,
+    find_objs,
+)
+
+
+@pytest.mark.parametrize(
+    ('instance', 'schema_key', 'expected'),
+    [
+        # Single matching object.
+        pytest.param(
+            {'schemaKey': 'Test', 'data': 123},
+            'Test',
+            [{'schemaKey': 'Test', 'data': 123}],
+            id='single-match',
+        ),
+        # No match.
+        pytest.param(
+            {'schemaKey': 'NotMatch', 'data': 123},
+            'Test',
+            [],
+            id='no-match',
+        ),
+        # Empty dictionary should return an empty list.
+        pytest.param(
+            {},
+            'Test',
+            [],
+            id='empty-dict',
+        ),
+        # Empty list should return an empty list.
+        pytest.param(
+            [],
+            'Test',
+            [],
+            id='empty-list',
+        ),
+        # Nested dictionary: the matching object is nested within another dictionary.
+        pytest.param(
+            {'level1': {'schemaKey': 'Test', 'info': 'nested'}},
+            'Test',
+            [{'schemaKey': 'Test', 'info': 'nested'}],
+            id='nested-dict',
+        ),
+        # List of dictionaries: only those with matching schema key are returned.
+        pytest.param(
+            [
+                {'schemaKey': 'Test', 'data': 1},
+                {'schemaKey': 'Test', 'data': 2},
+                {'schemaKey': 'NotTest', 'data': 3},
+            ],
+            'Test',
+            [
+                {'schemaKey': 'Test', 'data': 1},
+                {'schemaKey': 'Test', 'data': 2},
+            ],
+            id='list-of-dicts',
+        ),
+        # Mixed structure: nested dictionaries and lists.
+        pytest.param(
+            {
+                'a': {'schemaKey': 'Test', 'value': 1},
+                'b': [
+                    {'schemaKey': 'NotTest', 'value': 2},
+                    {'schemaKey': 'Test', 'value': 3},
+                ],
+                'c': 'irrelevant',
+                'd': [{'e': {'schemaKey': 'Test', 'value': 4}}],
+            },
+            'Test',
+            [
+                {'schemaKey': 'Test', 'value': 1},
+                {'schemaKey': 'Test', 'value': 3},
+                {'schemaKey': 'Test', 'value': 4},
+            ],
+            id='mixed-structure',
+        ),
+        # Non-collection type: integer.
+        pytest.param(
+            42,
+            'Test',
+            [],
+            id='non-collection-int',
+        ),
+        # Non-collection type: string.
+        pytest.param(
+            'some string',
+            'Test',
+            [],
+            id='non-collection-string',
+        ),
+        # Non-collection type: float.
+        pytest.param(
+            3.14,
+            'Test',
+            [],
+            id='non-collection-float',
+        ),
+        # Non-collection type: None.
+        pytest.param(
+            None,
+            'Test',
+            [],
+            id='non-collection-None',
+        ),
+        # Nested child: an object with the schema key contains a nested child that also
+        # has the schema key.
+        pytest.param(
+            {'schemaKey': 'Test', 'child': {'schemaKey': 'Test', 'data': 'child'}},
+            'Test',
+            [
+                {'schemaKey': 'Test', 'child': {'schemaKey': 'Test', 'data': 'child'}},
+                {'schemaKey': 'Test', 'data': 'child'},
+            ],
+            id='nested-child',
+        ),
+        # List in field:
+        # The object with the given schema key has a field whose value is a list
+        # containing objects, some of which also have the given schema key.
+        pytest.param(
+            {
+                'schemaKey': 'Test',
+                'items': [
+                    {'schemaKey': 'Test', 'data': 'item1'},
+                    {'schemaKey': 'Other', 'data': 'item2'},
+                    {'schemaKey': 'Test', 'data': 'item3'},
+                ],
+            },
+            'Test',
+            [
+                # The outer object is returned first...
+                {
+                    'schemaKey': 'Test',
+                    'items': [
+                        {'schemaKey': 'Test', 'data': 'item1'},
+                        {'schemaKey': 'Other', 'data': 'item2'},
+                        {'schemaKey': 'Test', 'data': 'item3'},
+                    ],
+                },
+                # ...followed by the matching objects within the list.
+                {'schemaKey': 'Test', 'data': 'item1'},
+                {'schemaKey': 'Test', 'data': 'item3'},
+            ],
+            id='list-in-field',
+        ),
+    ],
+)
+def test_find_objs_parametrized(instance: Any, schema_key: str, expected: list[dict]) -> None:
+    result = find_objs(instance, schema_key)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    ('input_meta', 'expected_output'),
+    [
+        # No Affiliation object: nothing to change.
+        (
+            {'key': 'value'},
+            None,
+        ),
+        # Affiliation exists but has no unwanted fields: returns None.
+        (
+            {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Alice'}},
+            None,
+        ),
+        # Single unwanted field ("contactPoint") should be removed.
+        (
+            {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Alice', 'contactPoint': 'info'}},
+            {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Alice'}},
+        ),
+        # Multiple unwanted fields should all be removed.
+        (
+            {
+                'affiliation': {
+                    'schemaKey': 'Affiliation',
+                    'name': 'Test',
+                    'contactPoint': 'a',
+                    'includeInCitation': 'b',
+                    'roleName': 'c',
+                }
+            },
+            {'affiliation': {'schemaKey': 'Affiliation', 'name': 'Test'}},
+        ),
+        # Nested Affiliation objects should be corrected.
+        (
+            {
+                'users': [
+                    {'profile': {'schemaKey': 'Affiliation', 'name': 'Bob', 'roleName': 'Member'}},
+                    {'profile': {'schemaKey': 'Affiliation', 'name': 'Charlie'}},
+                ],
+                'data': {'schemaKey': 'NotAffiliation', 'contactPoint': 'should not be touched'},
+            },
+            {
+                'users': [
+                    {'profile': {'schemaKey': 'Affiliation', 'name': 'Bob'}},
+                    {'profile': {'schemaKey': 'Affiliation', 'name': 'Charlie'}},
+                ],
+                'data': {'schemaKey': 'NotAffiliation', 'contactPoint': 'should not be touched'},
+            },
+        ),
+    ],
+)
+def test_correct_affiliation_corruption(input_meta, expected_output):
+    """
+    Test `correct_affiliation_corruption()`.
+
+    Ensure that it returns the correct modified metadata (if any corrections are needed)
+    while not mutating the original input.
+    """
+    # Snapshot the input so that any mutation by the function can be detected below.
+    original_meta = deepcopy(input_meta)
+    result = correct_affiliation_corruption(input_meta)
+
+    assert result == expected_output
+
+    # Verify that the original metadata has not been mutated.
+    assert input_meta == original_meta, 'The input metadata should remain unchanged.'