From 97b7a196add9844454bd9d9faa239da445e5f315 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Thu, 13 Jun 2024 13:03:05 -0400 Subject: [PATCH] Add final set of DspaceMets field methods (#193) * Add final set of DspaceMets field methods Why these changes are being introduced: * Finish refactoring DspaceMets to use field methods How this addresses that need: * Add field methods and associated private methods for contributors, dates, edition, file_formats, format, identifiers, languages, links, numbering, publishers, related_items, rights, subjects, and summary * Add unit tests for new field methods * Update create_dspace_mets_source_record_stub function to use different inserts Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-286 * Shift comments to docstrings --- tests/sources/xml/test_dspace_mets.py | 359 +++++++++++++++++++++- transmogrifier/sources/xml/dspace_mets.py | 254 +++++++++------ 2 files changed, 511 insertions(+), 102 deletions(-) diff --git a/tests/sources/xml/test_dspace_mets.py b/tests/sources/xml/test_dspace_mets.py index bbf6b6e..206fed3 100644 --- a/tests/sources/xml/test_dspace_mets.py +++ b/tests/sources/xml/test_dspace_mets.py @@ -4,13 +4,15 @@ from transmogrifier.sources.xml.dspace_mets import DspaceMets -def create_dspace_mets_source_record_stub(xml_insert: str = "") -> BeautifulSoup: +def create_dspace_mets_source_record_stub( + dmdsec_insert: str = "", filesec_insert: str = "" +) -> BeautifulSoup: xml_string = f"""
- abc123 + oai:dspace:abc123
BeautifulSoup xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-1.xsd"> - {xml_insert} + {dmdsec_insert} + + {filesec_insert} + @@ -210,7 +215,7 @@ def test_dspace_mets_with_attribute_and_subfield_variations_transforms_correctly def test_get_alternate_titles_success(): source_record = create_dspace_mets_source_record_stub( - """ + dmdsec_insert=""" A Slightly Different Title @@ -223,7 +228,7 @@ def test_get_alternate_titles_success(): def test_get_alternate_titles_transforms_correctly_if_fields_blank(): source_record = create_dspace_mets_source_record_stub( - '' + dmdsec_insert='' ) assert DspaceMets.get_alternate_titles(source_record) is None @@ -236,7 +241,7 @@ def test_get_alternate_titles_transforms_correctly_if_fields_missing(): def test_get_alternate_titles_multiple_titles_success(): source_record = create_dspace_mets_source_record_stub( - """ + dmdsec_insert=""" Title 1" @@ -260,7 +265,7 @@ def test_get_citation_success(): 'Transport and Machine Learning-assisted Investigation of Magnetic Materials." ' "Massachusetts Institute of Technology © 2022." ) - source_record = create_dspace_mets_source_record_stub(xml_string) + source_record = create_dspace_mets_source_record_stub(dmdsec_insert=xml_string) assert DspaceMets.get_citation(source_record) == ( 'Tatsumi, Yuki. "Magneto-thermal Transport and Machine Learning-assisted ' 'Investigation of Magnetic Materials." Massachusetts Institute of Technology ' @@ -270,7 +275,7 @@ def test_get_citation_success(): def test_get_citation_transforms_correctly_if_fields_blank(): source_record = create_dspace_mets_source_record_stub( - '' + dmdsec_insert='' ) assert DspaceMets.get_citation(source_record) is None @@ -282,16 +287,350 @@ def test_get_citation_transforms_correctly_if_fields_missing(): def test_get_content_type_success(): source_record = create_dspace_mets_source_record_stub( - "Thesis" + dmdsec_insert="Thesis" ) assert DspaceMets.get_content_type(source_record) == ["Thesis"] def test_get_content_type_transforms_correctly_if_fields_blank(): - source_record = create_dspace_mets_source_record_stub("") + source_record = create_dspace_mets_source_record_stub(dmdsec_insert="") assert DspaceMets.get_content_type(source_record) is None def test_get_content_type_transforms_correctly_if_fields_missing(): source_record = create_dspace_mets_source_record_stub() assert DspaceMets.get_content_type(source_record) is None + + +def test_get_contribtuors_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + + + advisor + + Checkelsky, Joseph + + + + author + + Tatsumi, Yuki + + + + department + + Massachusetts Institute of Technology. Department + + + Smith, Susie Q. + + """ + ) + assert DspaceMets.get_contributors(source_record) == [ + timdex.Contributor( + value="Checkelsky, Joseph", + kind="advisor", + ), + timdex.Contributor( + value="Tatsumi, Yuki", + kind="author", + ), + timdex.Contributor( + value="Massachusetts Institute of Technology. Department", + kind="department", + ), + timdex.Contributor( + value="Smith, Susie Q.", + kind="Not specified", + ), + ] + + +def test_get_contributors_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_contributors(source_record) is None + + +def test_get_contributors_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_contributors(source_record) is None + + +def test_get_dates_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + + 2021-09 + + """ + ) + assert DspaceMets.get_dates(source_record) == [ + timdex.Date(kind="Publication date", value="2021-09") + ] + + +def test_get_dates_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_dates(source_record) is None + + +def test_get_file_formats_success(): + source_record = create_dspace_mets_source_record_stub( + filesec_insert=""" + + + + + + + + + + + """ + ) + assert DspaceMets.get_file_formats(source_record) == ["application/pdf"] + + +def test_get_file_formats_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + '' + ) + assert DspaceMets.get_file_formats(source_record) is None + + +def test_get_file_formats_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_file_formats(source_record) is None + + +def test_get_format_success(): + assert DspaceMets.get_format() == "electronic resource" + + +def test_get_identifiers_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + https://hdl.handle.net/1721.1/142832 + """ + ) + assert DspaceMets.get_identifiers(source_record) == [ + timdex.Identifier(value="https://hdl.handle.net/1721.1/142832", kind="uri"), + ] + + +def test_get_identifiers_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_identifiers(source_record) is None + + +def test_get_identifers_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_identifiers(source_record) is None + + +def test_get_languages_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + + en_US + + """ + ) + assert DspaceMets.get_languages(source_record) == ["en_US"] + + +def test_get_languages_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_languages(source_record) is None + + +def test_get_languages_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_languages(source_record) is None + + +def test_get_links_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + https://hdl.handle.net/1721.1/142832 + """ + ) + assert DspaceMets.get_links(source_record) == [ + timdex.Link( + url="https://hdl.handle.net/1721.1/142832", + kind="Digital object URL", + text="Digital object URL", + ), + ] + + +def test_get_links_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_links(source_record) is None + + +def test_get_links_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_links(source_record) is None + + +def test_get_numbering_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="MIT-CSAIL-TR-2018-016' + ) + assert DspaceMets.get_numbering(source_record) == "MIT-CSAIL-TR-2018-016" + + +def test_get_numbering_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert='' + ) + assert DspaceMets.get_numbering(source_record) is None + + +def test_get_numbering_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_numbering(source_record) is None + + +def test_get_publishers_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + + Massachusetts Institute of Technology + + """ + ) + assert DspaceMets.get_publishers(source_record) == [ + timdex.Publisher(name="Massachusetts Institute of Technology"), + ] + + +def test_get_publishers_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_publishers(source_record) is None + + +def test_get_publishers_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_publishers(source_record) is None + + +def test_get_related_items_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + Nature Communications + """ + ) + assert DspaceMets.get_related_items(source_record) == [ + timdex.RelatedItem(description="Nature Communications", relationship="host"), + ] + + +def test_get_related_items_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert='' + ) + assert DspaceMets.get_related_items(source_record) is None + + +def test_get_related_items_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_related_items(source_record) is None + + +def test_get_rights_items_success(): + dmdsec_insert = ( + '' + "In Copyright - Educational Use Permitted" + ) + source_record = create_dspace_mets_source_record_stub(dmdsec_insert) + assert DspaceMets.get_rights(source_record) == [ + timdex.Rights( + description="In Copyright - Educational Use Permitted", + kind="useAndReproduction", + ), + ] + + +def test_get_rights_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_rights(source_record) is None + + +def test_get_rights_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_rights(source_record) is None + + +def test_get_subjects_items_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert=""" + + Metallurgy and Materials Science + + """ + ) + assert DspaceMets.get_subjects(source_record) == [ + timdex.Subject( + value=["Metallurgy and Materials Science"], + kind="Subject scheme not provided", + ), + ] + + +def test_get_subjects_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_subjects(source_record) is None + + +def test_get_subjects_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_subjects(source_record) is None + + +def test_get_summary_items_success(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="Heat is carried by different." + ) + assert DspaceMets.get_summary(source_record) == ["Heat is carried by different."] + + +def test_get_summary_transforms_correctly_if_fields_blank(): + source_record = create_dspace_mets_source_record_stub( + dmdsec_insert="" + ) + assert DspaceMets.get_summary(source_record) is None + + +def test_get_summary_transforms_correctly_if_fields_missing(): + source_record = create_dspace_mets_source_record_stub() + assert DspaceMets.get_summary(source_record) is None diff --git a/transmogrifier/sources/xml/dspace_mets.py b/transmogrifier/sources/xml/dspace_mets.py index b05f147..d80988b 100644 --- a/transmogrifier/sources/xml/dspace_mets.py +++ b/transmogrifier/sources/xml/dspace_mets.py @@ -26,8 +26,6 @@ def get_optional_fields(self, source_record: Tag) -> dict: """ fields: dict = {} - source_record_id = self.get_source_record_id(source_record) - # alternate_titles fields["alternate_titles"] = self.get_alternate_titles(source_record) @@ -41,41 +39,18 @@ def get_optional_fields(self, source_record: Tag) -> dict: # mapped to the OAI-PMH METS output. # contributors - for contributor in source_record.find_all("mods:name"): - if name := contributor.find("mods:namePart", string=True): - if role := contributor.find("mods:roleTerm", string=True): - kind = role.string - else: - kind = "Not specified" - fields.setdefault("contributors", []).append( - timdex.Contributor( - kind=kind, - value=name.string, - ) - ) + fields["contributors"] = self.get_contributors(source_record) # dates - # Only publication date is mapped from DSpace, other relevant date field (dc. - # coverage.temporal) is not mapped to the OAI-PMH METS output. - if publication_date := source_record.find("mods:dateIssued", string=True): - publication_date_value = str(publication_date.string.strip()) - if validate_date(publication_date_value, source_record_id): - fields["dates"] = [ - timdex.Date(kind="Publication date", value=publication_date_value) - ] + fields["dates"] = self.get_dates(source_record) # edition field not used in DSpace # file_formats - # Only maps formats with attribute use="ORIGINAL" because other formats such as - # USE="TEXT" are used internally by DSpace and not made publicly available. - for file_group in source_record.find_all("fileGrp", USE="ORIGINAL"): - file = file_group.find("file") - if file and file.get("MIMETYPE"): - fields.setdefault("file_formats", []).append(file["MIMETYPE"]) + fields["file_formats"] = self.get_file_formats(source_record) # format - fields["format"] = "electronic resource" + fields["format"] = self.get_format() # funding_information: relevant field in DSpace (dc.description.sponsorship) is # not mapped to the OAI-PMH METS output. @@ -84,31 +59,13 @@ def get_optional_fields(self, source_record: Tag) -> dict: # identifiers # Exludes citation because we have a separate field for that - for identifier in [ - i - for i in source_record.find_all("mods:identifier", string=True) - if i.get("type") != "citation" - ]: - fields.setdefault("identifiers", []).append( - timdex.Identifier( - kind=identifier.get("type") or "Not specified", - value=identifier.string, - ) - ) + fields["identifiers"] = self.get_identifiers(source_record) # languages - for language in source_record.find_all("mods:languageTerm", string=True): - fields.setdefault("languages", []).append(language.string) + fields["languages"] = self.get_languages(source_record) # links - for link in source_record.find_all("mods:identifier", string=True, type="uri"): - fields.setdefault("links", []).append( - timdex.Link( - kind="Digital object URL", - text="Digital object URL", - url=link.string, - ) - ) + fields["links"] = self.get_links(source_record) # literary_form field not used in DSpace @@ -119,10 +76,7 @@ def get_optional_fields(self, source_record: Tag) -> dict: # the OAI-PMH METS output. # numbering - if numbering := source_record.find( - "mods:relatedItem", string=True, type="series" - ): - fields["numbering"] = numbering.string + fields["numbering"] = self.get_numbering(source_record) # physical_description: relevant fields in DSpace (dc.format, dc.format.extent, # dc.format.medium) are not mapped to the OAI-PMH METS output. @@ -130,49 +84,19 @@ def get_optional_fields(self, source_record: Tag) -> dict: # publication_frequency field not used in DSpace # publishers - for publisher in source_record.find_all("mods:publisher", string=True): - fields.setdefault("publishers", []).append( - timdex.Publisher(name=publisher.string) - ) + fields["publishers"] = self.get_publishers(source_record) # related_items - # Excludes related items with type of "series" because the data in that field - # seems to more accurately map to the numbering field. - for related_item in [ - ri - for ri in source_record.find_all("mods:relatedItem", string=True) - if ri.get("type") != "series" - ]: - fields.setdefault("related_items", []).append( - timdex.RelatedItem( - description=related_item.string, - relationship=related_item.get("type") or "Not specified", - ) - ) + fields["related_items"] = self.get_related_items(source_record) # rights - # Note: rights uri field in DSpace (dc.rights.uri) is not mapped to the OAI-PMH - # METS output. - for right in source_record.find_all("mods:accessCondition", string=True): - fields.setdefault("rights", []).append( - timdex.Rights(description=right.string, kind=right.get("type") or None) - ) + fields["rights"] = self.get_rights(source_record) # subjects - # Note: subject fields with schemes in DSpace (dc.subject.) are not - # mapped to the OAI-PMH METS output. - if topics := source_record.find_all("mods:topic", string=True): - fields["subjects"] = [ - timdex.Subject( - kind="Subject scheme not provided", value=[t.string for t in topics] - ) - ] + fields["subjects"] = self.get_subjects(source_record) # summary - fields["summary"] = [ - summary.string - for summary in source_record.find_all("mods:abstract", string=True) - ] or None + fields["summary"] = self.get_summary(source_record) return fields @@ -212,6 +136,152 @@ def get_content_type(cls, source_record: Tag) -> list[str] | None: for content_type in source_record.find_all("mods:genre", string=True) ] or None + @classmethod + def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None: + contributors = [] + for contributor in source_record.find_all("mods:name"): + if name := contributor.find("mods:namePart", string=True): + if role := contributor.find("mods:roleTerm", string=True): + kind = str(role.string) + else: + kind = "Not specified" + contributors.append( + timdex.Contributor( + kind=kind, + value=str(name.string), + ) + ) + return contributors or None + + @classmethod + def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None: + """ + Field method for dates. + + Only publication date is mapped from DSpace, other relevant date field + (dc.coverage.temporal) is not mapped to the OAI-PMH METS output. + """ + if publication_date := source_record.find("mods:dateIssued", string=True): + publication_date_value = str(publication_date.string.strip()) + if validate_date( + publication_date_value, cls.get_source_record_id(source_record) + ): + return [ + timdex.Date(kind="Publication date", value=publication_date_value) + ] + return None + + @classmethod + def get_file_formats(cls, source_record: Tag) -> list[str] | None: + """ + Field method for file_formats. + + Only maps formats with attribute use="ORIGINAL" because other formats such as + USE="TEXT" are used internally by DSpace and not made publicly available. + """ + file_formats = [] + for file_group in source_record.find_all("fileGrp", USE="ORIGINAL"): + file = file_group.find("file") + if file and file.get("MIMETYPE"): + file_formats.append(file["MIMETYPE"]) + return file_formats or None + + @classmethod + def get_format(cls) -> str: + return "electronic resource" + + @classmethod + def get_identifiers(cls, source_record: Tag) -> list[timdex.Identifier] | None: + return [ + timdex.Identifier( + kind=identifier.get("type") or "Not specified", + value=str(identifier.string), + ) + for identifier in source_record.find_all("mods:identifier", string=True) + if identifier.get("type") != "citation" + ] or None + + @classmethod + def get_languages(cls, source_record: Tag) -> list[str] | None: + return [ + str(language.string) + for language in source_record.find_all("mods:languageTerm", string=True) + ] or None + + @classmethod + def get_links(cls, source_record: Tag) -> list[timdex.Link] | None: + return [ + timdex.Link( + kind="Digital object URL", + text="Digital object URL", + url=str(link.string), + ) + for link in source_record.find_all("mods:identifier", string=True, type="uri") + ] or None + + @classmethod + def get_numbering(cls, source_record: Tag) -> str | None: + if numbering := source_record.find( + "mods:relatedItem", string=True, type="series" + ): + return str(numbering.string) + return None + + @classmethod + def get_publishers(cls, source_record: Tag) -> list[timdex.Publisher] | None: + return [ + timdex.Publisher(name=str(publisher.string)) + for publisher in source_record.find_all("mods:publisher", string=True) + ] or None + + @classmethod + def get_related_items(cls, source_record: Tag) -> list[timdex.RelatedItem] | None: + return [ + timdex.RelatedItem( + description=str(related_item.string), + relationship=related_item.get("type") or "Not specified", + ) + for related_item in source_record.find_all("mods:relatedItem", string=True) + if related_item.get("type") != "series" + ] or None + + @classmethod + def get_rights(cls, source_record: Tag) -> list[timdex.Rights] | None: + """ + Field method for rights. + + Rights uri field in DSpace (dc.rights.uri) is not mapped to the OAI-PMH + METS output. + """ + return [ + timdex.Rights(description=str(right.string), kind=right.get("type") or None) + for right in source_record.find_all("mods:accessCondition", string=True) + ] or None + + @classmethod + def get_subjects(cls, source_record: Tag) -> list[timdex.Subject] | None: + """ + Field method for subjects. + + Subject fields with schemes in DSpace (dc.subject.) are not + mapped to the OAI-PMH METS output. + """ + if subjects := source_record.find_all("mods:topic", string=True): + return [ + timdex.Subject( + kind="Subject scheme not provided", + value=[str(subject.string) for subject in subjects], + ) + ] + return None + + @classmethod + def get_summary(cls, source_record: Tag) -> list[str] | None: + return [ + str(summary.string) + for summary in source_record.find_all("mods:abstract", string=True) + ] or None + @classmethod def get_main_titles(cls, source_record: Tag) -> list[str]: """ @@ -224,9 +294,9 @@ def get_main_titles(cls, source_record: Tag) -> list[str]: record. """ return [ - t.string - for t in source_record.find_all("mods:title", string=True) - if not t.get("type") + str(title.string) + for title in source_record.find_all("mods:title", string=True) + if not title.get("type") ] @classmethod