Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Timx 332 dedupe function #206

Merged
merged 3 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,30 @@ def test_timdex_record_not_a_list_raises_error(timdex_record_required_fields):
timdex_record_required_fields.dates = "test"


def test_timdex_object_hash_diff_if_diff_class():
"""
Asserts that TIMDEX objects of different class types
with similar attributes and attribute values will
be assigned different hashes and declared not equal.
"""
identifier = timdex.Identifier(value="x", kind="y")
alternate_title = timdex.AlternateTitle(value="x", kind="y")
assert identifier != alternate_title
assert identifier.__hash__() != alternate_title.__hash__()
jonavellecuerdo marked this conversation as resolved.
Show resolved Hide resolved


def test_timdex_object_hash_same_if_same_class():
"""
Asserts that TIMDEX objects of different class types
with similar attributes and attribute values will
be assigned the same hash and declared equal.
"""
identifier_0 = timdex.Identifier(value="x", kind="y")
identifier_1 = timdex.Identifier(value="x", kind="y")
assert identifier_0 == identifier_1
assert identifier_0.__hash__() == identifier_1.__hash__()


def test_timdex_record_dedupe_alternate_titles(timdex_record_required_fields):
timdex_record_required_fields.alternate_titles = [
timdex.AlternateTitle(value="My Octopus Teacher"),
Expand Down Expand Up @@ -607,3 +631,22 @@ def test_timdex_record_dedupe_summary(timdex_record_required_fields):
assert timdex_record_required_fields.summary == [
"Mitochondria is the powerhouse of the cell."
]


def test_timdex_dedupes_correctly_if_diff_class():
items = [
timdex.Identifier(value="x", kind="y"),
timdex.AlternateTitle(value="x", kind="y"),
]
assert timdex.dedupe(items) == [
timdex.Identifier(value="x", kind="y"),
timdex.AlternateTitle(value="x", kind="y"),
]


def test_timdex_dedupes_correctly_if_same_class():
items = [
timdex.Identifier(value="x", kind="y"),
timdex.Identifier(value="x", kind="y"),
]
assert timdex.dedupe(items) == [timdex.Identifier(value="x", kind="y")]
71 changes: 41 additions & 30 deletions transmogrifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,6 @@ def list_of(item_type: Any) -> Callable: # noqa: ANN401
)


def dedupe(item_list: list | Any) -> list | None: # noqa: ANN401
if not isinstance(item_list, list):
return item_list
return list(dict.fromkeys(item_list))


def not_empty(
_instance: "TimdexRecord", attribute: "attrs.Attribute", value: "list"
) -> None:
Expand All @@ -49,25 +43,42 @@ def not_empty(
raise ValueError(message)


@define
class ListField:
def __hash__(self) -> int:
"""Hash method to create unique identifier for Location objects."""
values = tuple(
[
tuple(attrib) if isinstance(attrib, list) else attrib
for attrib in attrs.astuple(self)
]
)
return hash(values)
def timdex_object_hash(timdex_object: Any) -> int: # noqa: ANN401
"""Hash method for TIMDEX objects.

This method is set as the hash method for TIMDEX objects.
The method generates a unique hash using a tuple
comprised of the class name and attribute values.
By making TIMDEX objects hashable, dedupe methods
can be applied to a list of TIMDEX objects.
"""
values = tuple(type(timdex_object).__name__)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple requests here.

First, I think this approach might actually creating a tuple with the individual characters of the class name, e.g.

from transmogrifier.models import *

timdex_object = AlternateTitle(value='horse',kind='animal')

tuple(type(timdex_object).__name__)
# Out[13]: ('A', 'l', 't', 'e', 'r', 'n', 'a', 't', 'e', 'T', 'i', 't', 'l', 'e')

Think this could be resolved via an explicit list, and then converting to tuple:

tuple([type(timdex_object).__name__])
# Out[2]: ('AlternateTitle',)

Second, any objection with using X.__class__ vs type(X)?

Example:

values = tuple([timdex_object.__class__.__name__])

Personally, when we know it's some kind of well-known class instance (like classes we define), I think using .__class__ is more readable. I'm having trouble finding supporting documentation or reasoning, where it sounds like type(X) and X.__class__ are mostly interchangeable; this SO post being a good example where it's kind of fluid, matter of taste, and some pretty edge-y gotchas.

I tend to think of type(X) as when I really don't know anything about an object. But in this case, where we know it's a class instance of some kind, and we wrote the classes, it feels like accessing it directly is kind of nice.

TL/DR: I'd propose we make the first change to avoid a tuple of each character, but the second type(X) vs X.__class__ happy either way.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ALL that said -- and should have led with this -- this is a nice and elegant solution to this equality checking! Nice work.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I consider values = tuple([timdex_object.__class__.__name__]) more readable as well, good suggestion!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EGAD 😱

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ghukill Thank you for catching this erroneous error 🙈 . This has been fixed by 426b185. I also prefer using __class__ for the same reason you described:

we know it's a class instance of some kind, and we wrote the classes

(Also remembering you suggested this during our check-in yesterday so apologies for forgetting!)

values += tuple(
[
tuple(attrib) if isinstance(attrib, list) else attrib
for attrib in attrs.astuple(timdex_object)
]
)
return hash(values)


def dedupe(item_list: list | Any) -> list | None: # noqa: ANN401
"""Deduplication method for list of items.

This method is used as a converter function for list fields
in the TimdexRecord model.
"""
if not isinstance(item_list, list):
return item_list
return list(dict.fromkeys(item_list))


@define
class AlternateTitle:
value: str = field(validator=instance_of(str)) # Required subfield
kind: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -80,7 +91,7 @@ class Contributor:
default=None, validator=optional(instance_of(bool))
)

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -101,7 +112,7 @@ class Date:
)
value: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -116,7 +127,7 @@ class Funder:
award_number: str | None = field(default=None, validator=optional(instance_of(str)))
award_uri: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -127,15 +138,15 @@ class Holding:
location: str | None = field(default=None, validator=optional(instance_of(str)))
note: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
class Identifier:
value: str = field(validator=instance_of(str)) # Required subfield
kind: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -145,7 +156,7 @@ class Link:
restrictions: str | None = field(default=None, validator=optional(instance_of(str)))
text: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -154,15 +165,15 @@ class Location:
kind: str | None = field(default=None, validator=optional(instance_of(str)))
geoshape: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
class Note:
value: list[str] = field(validator=list_of(str)) # Required subfield
kind: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -171,7 +182,7 @@ class Publisher:
date: str | None = field(default=None, validator=optional(instance_of(str)))
location: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -181,7 +192,7 @@ class RelatedItem:
relationship: str | None = field(default=None, validator=optional(instance_of(str)))
uri: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand All @@ -190,15 +201,15 @@ class Rights:
kind: str | None = field(default=None, validator=optional(instance_of(str)))
uri: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
class Subject:
value: list[str] = field(validator=list_of(str)) # Required subfield
kind: str | None = field(default=None, validator=optional(instance_of(str)))

__hash__ = ListField.__hash__
__hash__ = timdex_object_hash


@define
Expand Down