From a05fc00183d4e64936741b6e76bdf8f22092a89b Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Wed, 22 May 2024 10:53:30 -0400 Subject: [PATCH] Field method refactor for Springshare transform Why these changes are being introduced: * These updates are required to implement the architecture described in the following ADR: https://github.com/MITLibraries/transmogrifier/blob/main/docs/adrs/0005-field-methods.md How this addresses that need: * Refactor transform class SpringshareOaiDc to contain get_* methods for optional fields * Update tests for SpringshareOaiDc transform Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-283 --- tests/conftest.py | 34 +++ tests/sources/xml/test_springshare.py | 341 +++++++++++++++++----- transmogrifier/sources/xml/springshare.py | 32 +- 3 files changed, 316 insertions(+), 91 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9623aca..f5b4bb3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -111,6 +111,40 @@ def oai_pmh_records(): return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml") +# springshare - libguides ########################## +@pytest.fixture +def springshare_libguides_record_optional_fields_blank(): + source_records = OaiDc.parse_source_file( + "tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_blank.xml" + ) + return next(source_records) + + +@pytest.fixture +def springshare_libguides_record_optional_fields_missing(): + source_records = OaiDc.parse_source_file( + "tests/fixtures/oai_dc/springshare/libguides/libguides_record_optional_fields_missing.xml" + ) + return next(source_records) + + +# springshare - research databases ########################## +@pytest.fixture +def springshare_research_databases_record_optional_fields_blank(): + source_records = OaiDc.parse_source_file( + "tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_blank.xml" + ) + return next(source_records) + + +@pytest.fixture +def springshare_research_databases_record_optional_fields_missing(): + source_records = OaiDc.parse_source_file( + "tests/fixtures/oai_dc/springshare/research_databases/research_databases_record_optional_fields_missing.xml" + ) + return next(source_records) + + @pytest.fixture def timdex_record_required_fields(): return timdex.TimdexRecord( diff --git a/tests/sources/xml/test_springshare.py b/tests/sources/xml/test_springshare.py index f02c96f..0b0854b 100644 --- a/tests/sources/xml/test_springshare.py +++ b/tests/sources/xml/test_springshare.py @@ -1,97 +1,179 @@ +# ruff: noqa: E501 + import logging +from bs4 import BeautifulSoup + import transmogrifier.models as timdex from transmogrifier.sources.xml.springshare import SpringshareOaiDc SPRINGSHARE_FIXTURES_PREFIX = "tests/fixtures/oai_dc/springshare" - LIBGUIDES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/libguides" - RESEARCHDATABASES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/research_databases" -LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( - source="LibGuides", - source_link="https://libguides.mit.edu/materials", - timdex_record_id="libguides:guides-175846", - title="Materials Science & Engineering", - citation="Materials Science & Engineering. libguides. " - "https://libguides.mit.edu/materials", - content_type=["libguides"], - format="electronic resource", - identifiers=[ - timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") - ], - links=[ +def create_oaidc_source_record_stub(xml_insert: str = "") -> BeautifulSoup: + xml_str = f""" + + +
+ oai:libguides.com:guides/175846 + 2023-05-31T19:49:21Z + guides +
+ + + {xml_insert} + + +
+ January 1st, 2000 + """ + ) + assert SpringshareOaiDc.get_dates( + source_record=source_record, source_record_id="abc" + ) == [timdex.Date(kind="Created", value="2000-01-01T00:00:00")] + + +def test_get_dates_transforms_correctly_if_optional_fields_blank(): + source_record = create_oaidc_source_record_stub( + """ + + """ + ) + assert ( + SpringshareOaiDc.get_dates( + source_record=source_record, + source_record_id="abc", + ) + == [] + ) + + +def test_get_dates_transforms_correctly_if_optional_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert ( + SpringshareOaiDc.get_dates( + source_record=source_record, + source_record_id="abc", + ) + == [] + ) + + +def test_get_dates_transforms_correctly_and_logs_error_if_date_invalid( + caplog, +): + caplog.set_level(logging.DEBUG) + source_record = create_oaidc_source_record_stub( + """ + INVALID + """ + ) + assert ( + SpringshareOaiDc.get_dates(source_record=source_record, source_record_id="abc") + == [] + ) + assert ( + "Record ID abc has a date that cannot be parsed: Unknown string format: INVALID" + in caplog.text + ) + + +def test_get_links_success(): + source_record = create_oaidc_source_record_stub( + """ + https://libguides.mit.edu/materials + """ + ) + assert SpringshareOaiDc(source="libguides", source_records=iter(())).get_links( + source_record=source_record, + source_record_id="abc", + ) == [ timdex.Link( - url="https://libguides.mit.edu/materials", kind="LibGuide URL", text="LibGuide URL", + url="https://libguides.mit.edu/materials", ) - ], -) - -RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( - source="Research Databases", - source_link="https://libguides.mit.edu/llba", - timdex_record_id="researchdatabases:az-65257807", - title="Linguistics and Language Behavior Abstracts (LLBA)", - citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. " - "https://libguides.mit.edu/llba", - content_type=["researchdatabases"], - format="electronic resource", - identifiers=[ - timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") - ], - links=[ - timdex.Link( - url="https://libguides.mit.edu/llba", - kind="Research Database URL", - text="Research Database URL", + ] + + +def test_get_links_transforms_correctly_if_required_fields_blank(): + source_record = create_oaidc_source_record_stub( + """ + + """ + ) + assert ( + SpringshareOaiDc(source="libguides", source_records=iter(())).get_links( + source_record=source_record, source_record_id="abc" ) - ], -) + == [] + ) -def test_springshare_get_dates_valid(): - source_records = SpringshareOaiDc.parse_source_file( - f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_valid_dates.xml" - ) - transformer_instance = SpringshareOaiDc("libguides", source_records) - for xml in transformer_instance.source_records: - date_field_value = transformer_instance.get_dates("test_get_dates", xml) - assert date_field_value == [ - timdex.Date( - kind="Created", note=None, range=None, value="2000-01-01T00:00:00" - ) - ] +def test_get_links_transforms_correctly_if_required_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert ( + SpringshareOaiDc(source="libguides", source_records=iter(())).get_links( + source_record=source_record, source_record_id="abc" + ) + == [] + ) -def test_springshare_get_dates_invalid_logged_and_skipped(caplog): - caplog.set_level(logging.DEBUG) - source_records = SpringshareOaiDc.parse_source_file( - f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_invalid_dates.xml" +def test_get_source_link_success(): + source_record = create_oaidc_source_record_stub( + """ + https://libguides.mit.edu/materials + """ + ) + assert ( + SpringshareOaiDc.get_source_link("", "", source_record=source_record) + == "https://libguides.mit.edu/materials" ) - transformer_instance = SpringshareOaiDc("libguides", source_records) - for xml in transformer_instance.source_records: - date_field_value = transformer_instance.get_dates("test_get_dates", xml) - assert date_field_value == [] - assert "has a date that cannot be parsed" in caplog.text -def test_springshare_get_links_missing_identifier_logged_and_skipped(caplog): - caplog.set_level(logging.DEBUG) - source_records = SpringshareOaiDc.parse_source_file( - f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_record_missing_required_fields.xml" +def test_get_source_link_transforms_correctly_if_required_fields_blank(): + source_record = create_oaidc_source_record_stub( + """ + + """ + ) + assert SpringshareOaiDc.get_source_link("", "", source_record=source_record) == "" + + +def test_get_source_link_transforms_correctly_if_required_fields_missing(): + source_record = create_oaidc_source_record_stub( + """ + + """ ) - transformer_instance = SpringshareOaiDc("libguides", source_records) - for xml in transformer_instance.source_records: - links_field_value = transformer_instance.get_links("test_get_links", xml) - assert links_field_value == [] - assert "has links that cannot be generated" in caplog.text + assert SpringshareOaiDc.get_source_link("", "", source_record=source_record) == "" -def test_libguide_transform_with_all_fields_transforms_correctly(): +########################### +# Springshare - LibGuides +########################### + + +def test_springshare_libguides_transform_with_all_fields_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_all_fields.xml" ) @@ -135,23 +217,66 @@ def test_libguide_transform_with_all_fields_transforms_correctly(): ) -def test_libguides_transform_with_optional_fields_blank_transforms_correctly(): +def test_springshare_libguides_transform_with_optional_fields_blank_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_blank.xml" ) output_records = SpringshareOaiDc("libguides", source_records) - assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + assert next(output_records) == timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/materials", + timdex_record_id="libguides:guides-175846", + title="Materials Science & Engineering", + citation="Materials Science & Engineering. libguides. " + "https://libguides.mit.edu/materials", + content_type=["libguides"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/materials", + kind="LibGuide URL", + text="LibGuide URL", + ) + ], + ) -def test_libguides_transform_with_optional_fields_missing_transforms_correctly(): +def test_springshare_libguides_transform_with_optional_fields_missing_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_missing.xml" ) output_records = SpringshareOaiDc("libguides", source_records) - assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + assert next(output_records) == timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/materials", + timdex_record_id="libguides:guides-175846", + title="Materials Science & Engineering", + citation="Materials Science & Engineering. libguides. " + "https://libguides.mit.edu/materials", + content_type=["libguides"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/materials", + kind="LibGuide URL", + text="LibGuide URL", + ) + ], + ) -def test_research_databases_transform_with_all_fields_transforms_correctly(): +#################################### +# Springshare - Research Databases +#################################### + + +def test_springshare_research_databases_transform_with_all_fields_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{RESEARCHDATABASES_FIXTURES_PREFIX}/research_databases_record_all_fields.xml" ) @@ -191,23 +316,81 @@ def test_research_databases_transform_with_all_fields_transforms_correctly(): ) -def test_research_databases_transform_with_optional_fields_blank_transforms_correctly(): +def test_springshare_research_databases_transform_with_optional_fields_blank_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( RESEARCHDATABASES_FIXTURES_PREFIX + "/research_databases_record_optional_fields_blank.xml" ) output_records = SpringshareOaiDc("researchdatabases", source_records) - assert ( - next(output_records) == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + assert next(output_records) == timdex.TimdexRecord( + source="Research Databases", + source_link="https://libguides.mit.edu/llba", + timdex_record_id="researchdatabases:az-65257807", + title="Linguistics and Language Behavior Abstracts (LLBA)", + citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. " + "https://libguides.mit.edu/llba", + content_type=["researchdatabases"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/llba", + kind="Research Database URL", + text="Research Database URL", + ) + ], ) -def test_research_databases_transform_with_optional_fields_missing_transforms_correctly(): +def test_springshare_research_databases_transform_with_optional_fields_missing_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( RESEARCHDATABASES_FIXTURES_PREFIX + "/research_databases_record_optional_fields_missing.xml" ) output_records = SpringshareOaiDc("researchdatabases", source_records) + assert next(output_records) == timdex.TimdexRecord( + source="Research Databases", + source_link="https://libguides.mit.edu/llba", + timdex_record_id="researchdatabases:az-65257807", + title="Linguistics and Language Behavior Abstracts (LLBA)", + citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. " + "https://libguides.mit.edu/llba", + content_type=["researchdatabases"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/llba", + kind="Research Database URL", + text="Research Database URL", + ) + ], + ) + + +def test_research_databases_get_dates_transforms_correctly_if_optional_fields_blank( + springshare_research_databases_record_optional_fields_blank, +): + assert ( + SpringshareOaiDc.get_dates( + source_record=springshare_research_databases_record_optional_fields_blank, + source_record_id="abc", + ) + == [] + ) + + +def test_research_databases_get_dates_transforms_correctly_if_optional_fields_missing( + springshare_research_databases_record_optional_fields_missing, +): assert ( - next(output_records) == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + SpringshareOaiDc.get_dates( + source_record=springshare_research_databases_record_optional_fields_missing, + source_record_id="abc", + ) + == [] ) diff --git a/transmogrifier/sources/xml/springshare.py b/transmogrifier/sources/xml/springshare.py index b313987..240328f 100644 --- a/transmogrifier/sources/xml/springshare.py +++ b/transmogrifier/sources/xml/springshare.py @@ -20,7 +20,8 @@ class SpringshareOaiDc(OaiDc): - researchdatabases """ - def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: + @classmethod + def get_dates(cls, source_record: Tag, source_record_id: str) -> list[timdex.Date]: """ Overrides OaiDc's default get_dates() logic for Springshare records. @@ -31,11 +32,11 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: Additionally, only a single date will is expected. Args: - source_record_id: Source record id - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. + source_record_id: Source record ID. """ dates = [] - if date := xml.find("dc:date", string=True): + if date := source_record.find("dc:date", string=True): try: date_iso_str = date_parser(str(date.string).strip()).isoformat() if validate_date( @@ -51,16 +52,16 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: ) return dates - def get_links(self, source_record_id: str, xml: Tag) -> list[timdex.Link] | None: + def get_links(self, source_record: Tag, source_record_id: str) -> list[timdex.Link]: """ Overrides OaiDc's default get_links() logic for Springshare records. Args: - source_record_id: Source record id - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. + source_record_id: Source record ID. """ links = [] - if identifier := xml.find("dc:identifier", string=True): + if identifier := source_record.find("dc:identifier", string=True): singular_source_name = self.source_name.rstrip("s") links.append( timdex.Link( @@ -69,6 +70,8 @@ def get_links(self, source_record_id: str, xml: Tag) -> list[timdex.Link] | None url=str(identifier.string), ) ) + return links + logger.debug( "Record ID %s has links that cannot be generated: missing dc:identifier", source_record_id, @@ -77,7 +80,10 @@ def get_links(self, source_record_id: str, xml: Tag) -> list[timdex.Link] | None @classmethod def get_source_link( - cls, _source_base_url: str, _source_record_id: str, xml: Tag + cls, + _source_base_url: str, + _source_record_id: str, + source_record: Tag, ) -> str: """ Override for default source_link behavior. @@ -99,8 +105,10 @@ def get_source_link( link. Args: + source_record_id: Source record ID. source_base_url: Source base URL. - source_record_id: Record identifier for the source record. - xml: A BeautifulSoup Tag representing a single XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ - return str(xml.find("dc:identifier").string) + if source_link := source_record.find("dc:identifier", string=True): + return str(source_link.string) + return ""