From 2871769e36c783faf1017b8a80c1dab2f931e446 Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Fri, 9 Aug 2024 09:49:57 -0400 Subject: [PATCH 1/3] Deduplicate list fields in TIMDEX record Why these changes are being introduced: * Improve data quality of TIMDEX records by reducing duplication of data in list fields. How this addresses that need: * Create an attrs converter function to dedupe list of items * Create ListFields abstract class with hash method * Set hash methods in custom classes to ListFields.__hash__ * Set 'converter=dedupe' for every list field in TimdexRecord * Add unit tests verifying deduplication of list fields Side effects of this change: * Deduplication is highly likely to result in diffs when comparing transformed records before and after this change. However (and more importantly), reducing duplicates improves the data quality of TIMDEX records. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-332 --- tests/test_models.py | 273 ++++++++++++++++++++++++++++++++++++--- transmogrifier/models.py | 108 +++++++++++++--- 2 files changed, 340 insertions(+), 41 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index ec2600c..d87a73d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,15 +1,6 @@ import pytest -from transmogrifier.models import ( - AlternateTitle, - Contributor, - Date, - DateRange, - Identifier, - Link, - Note, - Subject, -) +import transmogrifier.models as timdex def test_timdex_record_required_fields_only(timdex_record_required_fields): @@ -45,14 +36,16 @@ def test_timdex_record_required_fields_only(timdex_record_required_fields): def test_timdex_record_required_subfields_only(timdex_record_required_fields): - timdex_record_required_fields.contributors = [Contributor(value="Smith, Jane")] - timdex_record_required_fields.identifiers = [Identifier(value="123")] + timdex_record_required_fields.contributors = [timdex.Contributor(value="Smith, Jane")] + 
timdex_record_required_fields.identifiers = [timdex.Identifier(value="123")] timdex_record_required_fields.links = [ - Link(url="http://dx.doi.org/10.1007/978-94-017-0726-8") + timdex.Link(url="http://dx.doi.org/10.1007/978-94-017-0726-8") ] - timdex_record_required_fields.notes = [Note(value=["This book is awesome"])] - timdex_record_required_fields.alternate_titles = [AlternateTitle(value="Alt Title")] - timdex_record_required_fields.subjects = [Subject(value=["Stuff"])] + timdex_record_required_fields.notes = [timdex.Note(value=["This book is awesome"])] + timdex_record_required_fields.alternate_titles = [ + timdex.AlternateTitle(value="Alt Title") + ] + timdex_record_required_fields.subjects = [timdex.Subject(value=["Stuff"])] assert timdex_record_required_fields.source == "A Cool Repository" assert timdex_record_required_fields.source_link == "https://example.com/123" assert timdex_record_required_fields.timdex_record_id == "cool-repo:123" @@ -337,8 +330,10 @@ def test_timdex_record_date_range_both_gt_and_gte_raises_error( match="range may have a 'gt' or 'gte' value, but not both;", ): timdex_record_required_fields.dates = [ - Date( - range=DateRange(gt="2019-01-01", gte="2019-01-01", lt="2019-06-30"), + timdex.Date( + range=timdex.DateRange( + gt="2019-01-01", gte="2019-01-01", lt="2019-06-30" + ), ) ] @@ -350,8 +345,10 @@ def test_timdex_record_date_range_both_lt_and_lte_raises_error( ValueError, match="range may have a 'lt' or 'lte' value, but not both;" ): timdex_record_required_fields.dates = [ - Date( - range=DateRange(gt="2019-01-01", lt="2019-06-30", lte="2019-06-30"), + timdex.Date( + range=timdex.DateRange( + gt="2019-01-01", lt="2019-06-30", lte="2019-06-30" + ), ) ] @@ -374,3 +371,239 @@ def test_timdex_record_not_a_list_raises_error(timdex_record_required_fields): match="'dates' must be ", ): timdex_record_required_fields.dates = "test" + + +def test_timdex_record_dedupe_alternate_titles(timdex_record_required_fields): + 
timdex_record_required_fields.alternate_titles = [ + timdex.AlternateTitle(value="My Octopus Teacher"), + timdex.AlternateTitle(value="My Octopus Teacher"), + ] + assert timdex_record_required_fields.alternate_titles == [ + timdex.AlternateTitle(value="My Octopus Teacher") + ] + + +def test_timdex_record_dedupe_call_numbers(timdex_record_required_fields): + timdex_record_required_fields.call_numbers = ["a", "a"] + assert timdex_record_required_fields.call_numbers == ["a"] + + +def test_timdex_record_dedupe_content_type(timdex_record_required_fields): + timdex_record_required_fields.content_type = ["thesis", "thesis"] + assert timdex_record_required_fields.content_type == ["thesis"] + + +def test_timdex_record_dedupe_contents(timdex_record_required_fields): + timdex_record_required_fields.contents = ["Chapter 1", "Chapter 1"] + assert timdex_record_required_fields.contents == ["Chapter 1"] + + +def test_timdex_record_dedupe_contributors(timdex_record_required_fields): + timdex_record_required_fields.contributors = [ + timdex.Contributor( + value="Joe Hisaishi", + affiliation=["Kunitachi College of Music"], + kind="Composer", + ), + timdex.Contributor( + value="Joe Hisaishi", + affiliation=["Kunitachi College of Music"], + kind="Composer", + ), + ] + assert timdex_record_required_fields.contributors == [ + timdex.Contributor( + value="Joe Hisaishi", + affiliation=["Kunitachi College of Music"], + kind="Composer", + ) + ] + + +def test_timdex_record_dedupe_dates(timdex_record_required_fields): + timdex_record_required_fields.dates = [ + timdex.Date(value="2022-01-01", kind="Publication date"), + timdex.Date(value="2022-01-01", kind="Publication date"), + timdex.Date( + range=timdex.DateRange(gt="2019-01-01", lt="2019-06-30"), + ), + timdex.Date( + range=timdex.DateRange(gt="2019-01-01", lt="2019-06-30"), + ), + ] + assert timdex_record_required_fields.dates == [ + timdex.Date(value="2022-01-01", kind="Publication date"), + timdex.Date( + 
range=timdex.DateRange(gt="2019-01-01", lt="2019-06-30"), + ), + ] + + +def test_timdex_record_dedupe_file_formats(timdex_record_required_fields): + timdex_record_required_fields.file_formats = [ + "application/pdf", + "application/pdf", + ] + assert timdex_record_required_fields.file_formats == ["application/pdf"] + + +def test_timdex_record_dedupe_funding_information(timdex_record_required_fields): + timdex_record_required_fields.funding_information = [ + timdex.Funder(funder_name="NPR Foundation"), + timdex.Funder(funder_name="NPR Foundation"), + ] + assert timdex_record_required_fields.funding_information == [ + timdex.Funder(funder_name="NPR Foundation") + ] + + +def test_timdex_record_dedupe_holdings(timdex_record_required_fields): + timdex_record_required_fields.holdings = [ + timdex.Holding( + call_number="PL2687.L8.A28 1994", + collection="Stacks", + format="Print volume", + location="Hayden Library", + ), + timdex.Holding( + call_number="PL2687.L8.A28 1994", + collection="Stacks", + format="Print volume", + location="Hayden Library", + ), + ] + assert timdex_record_required_fields.holdings == [ + timdex.Holding( + call_number="PL2687.L8.A28 1994", + collection="Stacks", + format="Print volume", + location="Hayden Library", + ) + ] + + +def test_timdex_record_dedupe_identifiers(timdex_record_required_fields): + timdex_record_required_fields.identifiers = [ + timdex.Identifier(value="9781250185969. hardcover", kind="ISBN"), + timdex.Identifier(value="9781250185969. hardcover", kind="ISBN"), + ] + assert timdex_record_required_fields.identifiers == [ + timdex.Identifier(value="9781250185969. 
hardcover", kind="ISBN") + ] + + +def test_timdex_record_dedupe_languages(timdex_record_required_fields): + timdex_record_required_fields.languages = ["Spanish", "Spanish"] + assert timdex_record_required_fields.languages == ["Spanish"] + + +def test_timdex_record_dedupe_links(timdex_record_required_fields): + timdex_record_required_fields.links = [ + timdex.Link( + url="https://geodata.libraries.mit.edu/record/gismit" + ":GISPORTAL_GISOWNER01_BOSTONWATER95", + kind="Website", + text="Website", + ), + timdex.Link( + url="https://geodata.libraries.mit.edu/record/gismit" + ":GISPORTAL_GISOWNER01_BOSTONWATER95", + kind="Website", + text="Website", + ), + ] + assert timdex_record_required_fields.links == [ + timdex.Link( + url="https://geodata.libraries.mit.edu/record/gismit" + ":GISPORTAL_GISOWNER01_BOSTONWATER95", + kind="Website", + text="Website", + ) + ] + + +def test_timdex_record_dedupe_locations(timdex_record_required_fields): + timdex_record_required_fields.locations = [ + timdex.Location(value="One Place", kind="Place of Publication"), + timdex.Location(value="One Place", kind="Place of Publication"), + ] + assert timdex_record_required_fields.locations == [ + timdex.Location(value="One Place", kind="Place of Publication") + ] + + +def test_timdex_record_dedupe_notes(timdex_record_required_fields): + timdex_record_required_fields.notes = [ + timdex.Note(value=["Survey Data"], kind="Datacite resource type"), + timdex.Note(value=["Survey Data"], kind="Datacite resource type"), + ] + assert timdex_record_required_fields.notes == [ + timdex.Note(value=["Survey Data"], kind="Datacite resource type"), + ] + + +def test_timdex_record_dedupe_publication_frequency(timdex_record_required_fields): + timdex_record_required_fields.publication_frequency = [ + "Three times a year", + "Three times a year", + ] + assert timdex_record_required_fields.publication_frequency == ["Three times a year"] + + +def test_timdex_record_dedupe_publishers(timdex_record_required_fields): + 
timdex_record_required_fields.publishers = [ + timdex.Publisher(name="Harvard Dataverse"), + timdex.Publisher(name="Harvard Dataverse"), + ] + assert timdex_record_required_fields.publishers == [ + timdex.Publisher(name="Harvard Dataverse") + ] + + +def test_timdex_record_dedupe_related_items(timdex_record_required_fields): + timdex_record_required_fields.related_items = [ + timdex.RelatedItem(description="Nature Communications", relationship="host"), + timdex.RelatedItem(description="Nature Communications", relationship="host"), + ] + assert timdex_record_required_fields.related_items == [ + timdex.RelatedItem(description="Nature Communications", relationship="host") + ] + + +def test_timdex_record_dedupe_rights(timdex_record_required_fields): + timdex_record_required_fields.rights = [ + timdex.Rights(description="MIT authentication required", kind="Access to files"), + timdex.Rights(description="MIT authentication required", kind="Access to files"), + ] + assert timdex_record_required_fields.rights == [ + timdex.Rights(description="MIT authentication required", kind="Access to files") + ] + + +def test_timdex_record_dedupe_subjects(timdex_record_required_fields): + timdex_record_required_fields.subjects = [ + timdex.Subject( + value=["Social Sciences", "Educational materials"], + kind="Subject scheme not provided", + ), + timdex.Subject( + value=["Social Sciences", "Educational materials"], + kind="Subject scheme not provided", + ), + ] + assert timdex_record_required_fields.subjects == [ + timdex.Subject( + value=["Social Sciences", "Educational materials"], + kind="Subject scheme not provided", + ) + ] + + +def test_timdex_record_dedupe_summary(timdex_record_required_fields): + timdex_record_required_fields.summary = [ + "Mitochondria is the powerhouse of the cell.", + "Mitochondria is the powerhouse of the cell.", + ] + assert timdex_record_required_fields.summary == [ + "Mitochondria is the powerhouse of the cell." 
+ ] diff --git a/transmogrifier/models.py b/transmogrifier/models.py index f3ccc0b..6cc7bf1 100644 --- a/transmogrifier/models.py +++ b/transmogrifier/models.py @@ -35,6 +35,12 @@ def list_of(item_type: Any) -> Callable: # noqa: ANN401 ) +def dedupe(item_list: list | Any) -> list | None: # noqa: ANN401 + if not isinstance(item_list, list): + return item_list + return list(dict.fromkeys(item_list)) + + def not_empty( _instance: "TimdexRecord", attribute: "attrs.Attribute", value: "list" ) -> None: @@ -43,11 +49,26 @@ def not_empty( raise ValueError(message) +@define +class ListField: + def __hash__(self) -> int: + """Hash method to create unique identifier for Location objects.""" + values = tuple( + [ + tuple(attrib) if isinstance(attrib, list) else attrib + for attrib in attrs.astuple(self) + ] + ) + return hash(values) + + @define class AlternateTitle: value: str = field(validator=instance_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Contributor: @@ -59,6 +80,8 @@ class Contributor: default=None, validator=optional(instance_of(bool)) ) + __hash__ = ListField.__hash__ + @define class DateRange: @@ -73,10 +96,13 @@ class Date: kind: str | None = field(default=None, validator=optional(instance_of(str))) note: str | None = field(default=None, validator=optional(instance_of(str))) range: DateRange | None = field( # type: ignore[misc] - default=None, validator=[optional(instance_of(DateRange)), check_range] + default=None, + validator=[optional(instance_of(DateRange)), check_range], ) value: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Funder: @@ -90,6 +116,8 @@ class Funder: award_number: str | None = field(default=None, validator=optional(instance_of(str))) award_uri: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class 
Holding: @@ -99,12 +127,16 @@ class Holding: location: str | None = field(default=None, validator=optional(instance_of(str))) note: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Identifier: value: str = field(validator=instance_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Link: @@ -113,6 +145,8 @@ class Link: restrictions: str | None = field(default=None, validator=optional(instance_of(str))) text: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Location: @@ -120,12 +154,16 @@ class Location: kind: str | None = field(default=None, validator=optional(instance_of(str))) geoshape: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Note: value: list[str] = field(validator=list_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Publisher: @@ -133,6 +171,8 @@ class Publisher: date: str | None = field(default=None, validator=optional(instance_of(str))) location: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class RelatedItem: @@ -141,6 +181,8 @@ class RelatedItem: relationship: str | None = field(default=None, validator=optional(instance_of(str))) uri: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Rights: @@ -148,12 +190,16 @@ class Rights: kind: str | None = field(default=None, validator=optional(instance_of(str))) uri: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class Subject: value: list[str] = field(validator=list_of(str)) # Required subfield 
kind: str | None = field(default=None, validator=optional(instance_of(str))) + __hash__ = ListField.__hash__ + @define class TimdexRecord: @@ -165,54 +211,74 @@ class TimdexRecord: # Optional fields alternate_titles: list[AlternateTitle] | None = field( - default=None, validator=optional(list_of(AlternateTitle)) + default=None, converter=dedupe, validator=optional(list_of(AlternateTitle)) + ) + call_numbers: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) ) - call_numbers: list[str] | None = field(default=None, validator=optional(list_of(str))) citation: str | None = field(default=None, validator=optional(instance_of(str))) - content_type: list[str] | None = field(default=None, validator=optional(list_of(str))) - contents: list[str] | None = field(default=None, validator=optional(list_of(str))) + content_type: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) + contents: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) contributors: list[Contributor] | None = field( - default=None, validator=optional(list_of(Contributor)) + default=None, converter=dedupe, validator=optional(list_of(Contributor)) + ) + dates: list[Date] | None = field( + default=None, converter=dedupe, validator=optional(list_of(Date)) ) - dates: list[Date] | None = field(default=None, validator=optional(list_of(Date))) edition: str | None = field(default=None, validator=optional(instance_of(str))) - file_formats: list[str] | None = field(default=None, validator=optional(list_of(str))) + file_formats: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) format: str | None = field(default=None, validator=optional(instance_of(str))) funding_information: list[Funder] | None = field( - default=None, validator=optional(list_of(Funder)) + default=None, converter=dedupe, validator=optional(list_of(Funder)) ) holdings: 
list[Holding] | None = field( - default=None, validator=optional(list_of(Holding)) + default=None, converter=dedupe, validator=optional(list_of(Holding)) ) identifiers: list[Identifier] | None = field( - default=None, validator=optional(list_of(Identifier)) + default=None, converter=dedupe, validator=optional(list_of(Identifier)) + ) + languages: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) + ) + links: list[Link] | None = field( + default=None, converter=dedupe, validator=optional(list_of(Link)) ) - languages: list[str] | None = field(default=None, validator=optional(list_of(str))) - links: list[Link] | None = field(default=None, validator=optional(list_of(Link))) literary_form: str | None = field(default=None, validator=optional(instance_of(str))) locations: list[Location] | None = field( - default=None, validator=optional(list_of(Location)) + default=None, converter=dedupe, validator=optional(list_of(Location)) + ) + notes: list[Note] | None = field( + default=None, converter=dedupe, validator=optional(list_of(Note)) ) - notes: list[Note] | None = field(default=None, validator=optional(list_of(Note))) numbering: str | None = field(default=None, validator=optional(instance_of(str))) physical_description: str | None = field( default=None, validator=optional(instance_of(str)) ) provider: str | None = field(default=None, validator=optional(instance_of(str))) publication_frequency: list[str] | None = field( - default=None, validator=optional(list_of(str)) + default=None, converter=dedupe, validator=optional(list_of(str)) ) publishers: list[Publisher] | None = field( - default=None, validator=optional(list_of(Publisher)) + default=None, converter=dedupe, validator=optional(list_of(Publisher)) ) related_items: list[RelatedItem] | None = field( - default=None, validator=optional(list_of(RelatedItem)) + default=None, converter=dedupe, validator=optional(list_of(RelatedItem)) + ) + rights: list[Rights] | None = field( + 
default=None, converter=dedupe, validator=optional(list_of(Rights)) ) - rights: list[Rights] | None = field(default=None, validator=optional(list_of(Rights))) subjects: list[Subject] | None = field( - default=None, validator=optional(list_of(Subject)) + default=None, converter=dedupe, validator=optional(list_of(Subject)) + ) + summary: list[str] | None = field( + default=None, converter=dedupe, validator=optional(list_of(str)) ) - summary: list[str] | None = field(default=None, validator=optional(list_of(str))) def asdict(self) -> dict[str, Any]: return asdict(self, filter=lambda _, value: value is not None) From f81475b90ba01921858358d1f3f58a8f556110ef Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Thu, 15 Aug 2024 09:33:47 -0400 Subject: [PATCH 2/3] Address comments * Remove 'ListField' class and create standalone hash method * Add additional tests for hash and dedupe methods --- tests/test_models.py | 43 ++++++++++++++++++++++++ transmogrifier/models.py | 71 +++++++++++++++++++++++----------------- 2 files changed, 84 insertions(+), 30 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index d87a73d..4172d2c 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -373,6 +373,30 @@ def test_timdex_record_not_a_list_raises_error(timdex_record_required_fields): timdex_record_required_fields.dates = "test" +def test_timdex_object_hash_diff_if_diff_class(): + """ + Asserts that TIMDEX objects of different class types + with similar attributes and attribute values will + be assigned different hashes and declared not equal. 
+ """ + identifier = timdex.Identifier(value="x", kind="y") + alternate_title = timdex.AlternateTitle(value="x", kind="y") + assert identifier != alternate_title + assert identifier.__hash__() != alternate_title.__hash__() + + +def test_timdex_object_hash_same_if_same_class(): + """ + Asserts that TIMDEX objects of the same class type + with the same attributes and attribute values will + be assigned the same hash and declared equal. + """ + identifier_0 = timdex.Identifier(value="x", kind="y") + identifier_1 = timdex.Identifier(value="x", kind="y") + assert identifier_0 == identifier_1 + assert identifier_0.__hash__() == identifier_1.__hash__() + + def test_timdex_record_dedupe_alternate_titles(timdex_record_required_fields): timdex_record_required_fields.alternate_titles = [ timdex.AlternateTitle(value="My Octopus Teacher"), @@ -607,3 +631,22 @@ def test_timdex_record_dedupe_summary(timdex_record_required_fields): assert timdex_record_required_fields.summary == [ "Mitochondria is the powerhouse of the cell." 
] + + +def test_timdex_dedupes_correctly_if_diff_class(): + items = [ + timdex.Identifier(value="x", kind="y"), + timdex.AlternateTitle(value="x", kind="y"), + ] + assert timdex.dedupe(items) == [ + timdex.Identifier(value="x", kind="y"), + timdex.AlternateTitle(value="x", kind="y"), + ] + + +def test_timdex_dedupes_correctly_if_same_class(): + items = [ + timdex.Identifier(value="x", kind="y"), + timdex.Identifier(value="x", kind="y"), + ] + assert timdex.dedupe(items) == [timdex.Identifier(value="x", kind="y")] diff --git a/transmogrifier/models.py b/transmogrifier/models.py index 6cc7bf1..7cc28d8 100644 --- a/transmogrifier/models.py +++ b/transmogrifier/models.py @@ -35,12 +35,6 @@ def list_of(item_type: Any) -> Callable: # noqa: ANN401 ) -def dedupe(item_list: list | Any) -> list | None: # noqa: ANN401 - if not isinstance(item_list, list): - return item_list - return list(dict.fromkeys(item_list)) - - def not_empty( _instance: "TimdexRecord", attribute: "attrs.Attribute", value: "list" ) -> None: @@ -49,17 +43,34 @@ def not_empty( raise ValueError(message) -@define -class ListField: - def __hash__(self) -> int: - """Hash method to create unique identifier for Location objects.""" - values = tuple( - [ - tuple(attrib) if isinstance(attrib, list) else attrib - for attrib in attrs.astuple(self) - ] - ) - return hash(values) +def timdex_object_hash(timdex_object: Any) -> int: # noqa: ANN401 + """Hash method for TIMDEX objects. + + This method is set as the hash method for TIMDEX objects. + The method generates a unique hash using a tuple + comprised of the class name and attribute values. + By making TIMDEX objects hashable, dedupe methods + can be applied to a list of TIMDEX objects. 
+ """ + values = tuple(type(timdex_object).__name__) + values += tuple( + [ + tuple(attrib) if isinstance(attrib, list) else attrib + for attrib in attrs.astuple(timdex_object) + ] + ) + return hash(values) + + +def dedupe(item_list: list | Any) -> list | None: # noqa: ANN401 + """Deduplication method for list of items. + + This method is used as a converter function for list fields + in the TimdexRecord model. + """ + if not isinstance(item_list, list): + return item_list + return list(dict.fromkeys(item_list)) @define @@ -67,7 +78,7 @@ class AlternateTitle: value: str = field(validator=instance_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -80,7 +91,7 @@ class Contributor: default=None, validator=optional(instance_of(bool)) ) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -101,7 +112,7 @@ class Date: ) value: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -116,7 +127,7 @@ class Funder: award_number: str | None = field(default=None, validator=optional(instance_of(str))) award_uri: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -127,7 +138,7 @@ class Holding: location: str | None = field(default=None, validator=optional(instance_of(str))) note: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -135,7 +146,7 @@ class Identifier: value: str = field(validator=instance_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -145,7 +156,7 @@ class Link: restrictions: str | None = field(default=None, 
validator=optional(instance_of(str))) text: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -154,7 +165,7 @@ class Location: kind: str | None = field(default=None, validator=optional(instance_of(str))) geoshape: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -162,7 +173,7 @@ class Note: value: list[str] = field(validator=list_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -171,7 +182,7 @@ class Publisher: date: str | None = field(default=None, validator=optional(instance_of(str))) location: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -181,7 +192,7 @@ class RelatedItem: relationship: str | None = field(default=None, validator=optional(instance_of(str))) uri: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -190,7 +201,7 @@ class Rights: kind: str | None = field(default=None, validator=optional(instance_of(str))) uri: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define @@ -198,7 +209,7 @@ class Subject: value: list[str] = field(validator=list_of(str)) # Required subfield kind: str | None = field(default=None, validator=optional(instance_of(str))) - __hash__ = ListField.__hash__ + __hash__ = timdex_object_hash @define From 0448d3bb51fd5edff9ad6dcf4561a0df5112bdc4 Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Thu, 15 Aug 2024 11:23:54 -0400 Subject: [PATCH 3/3] Update 'timdex_object_hash' to correctly include class name --- transmogrifier/models.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transmogrifier/models.py b/transmogrifier/models.py index 7cc28d8..949035d 100644 --- a/transmogrifier/models.py +++ b/transmogrifier/models.py @@ -52,7 +52,7 @@ def timdex_object_hash(timdex_object: Any) -> int: # noqa: ANN401 By making TIMDEX objects hashable, dedupe methods can be applied to a list of TIMDEX objects. """ - values = tuple(type(timdex_object).__name__) + values = (timdex_object.__class__.__name__,) values += tuple( [ tuple(attrib) if isinstance(attrib, list) else attrib