diff --git a/tests/sources/xml/test_dspace_dim.py b/tests/sources/xml/test_dspace_dim.py index 55acc77..5ae06e2 100644 --- a/tests/sources/xml/test_dspace_dim.py +++ b/tests/sources/xml/test_dspace_dim.py @@ -1,7 +1,35 @@ +from bs4 import BeautifulSoup + import transmogrifier.models as timdex from transmogrifier.sources.xml.dspace_dim import DspaceDim +def create_dspace_dim_source_record_stub(xml_insert: str = "") -> BeautifulSoup: + xml_string = f""" + + +
+ oai:darchive.mblwhoilibrary.org:1912/2641 + 2020-01-28T19:30:01Z + com_1912_3 + col_1912_534 +
+ + + {xml_insert} + + +
+
+ """ + return BeautifulSoup(xml_string, "xml") + + def test_dspace_dim_transform_with_all_fields_transforms_correctly(): source_records = DspaceDim.parse_source_file( "tests/fixtures/dspace/dspace_dim_record_all_fields.xml" @@ -216,3 +244,231 @@ def test_dspace_dim_transform_with_optional_fields_missing_transforms_correctly( format="electronic resource", content_type=["Not specified"], ) + + +def test_get_alternate_titles_success(): + source_record = create_dspace_dim_source_record_stub( + """ + An Alternative Title + """ + ) + assert DspaceDim.get_alternate_titles(source_record) == [ + timdex.AlternateTitle(value="An Alternative Title", kind="alternative") + ] + + +def test_get_alternate_titles_transforms_correctly_if_fields_blank(): + source_record = create_dspace_dim_source_record_stub( + '' + ) + assert DspaceDim.get_alternate_titles(source_record) is None + + +def test_get_alternate_titles_transforms_correctly_if_fields_missing(): + source_record = create_dspace_dim_source_record_stub() + assert DspaceDim.get_alternate_titles(source_record) is None + + +def test_get_alternate_titles_multiple_titles_success(): + + source_record = create_dspace_dim_source_record_stub( + """ + Title 1 + Title 2 + Title 3 + """ + ) + assert DspaceDim.get_alternate_titles(source_record) == [ + timdex.AlternateTitle(value="Title 2"), + timdex.AlternateTitle(value="Title 3"), + ] + + +def test_get_citation_success(): + source_record = create_dspace_dim_source_record_stub( + """ + Journal of Geophysical Research: Solid Earth 121 (2016): 5859-5879 + """ + ) + assert ( + DspaceDim.get_citation(source_record) + == "Journal of Geophysical Research: Solid Earth 121 (2016): 5859-5879" + ) + + +def test_get_citation_transforms_correctly_if_fields_blank(): + source_record = create_dspace_dim_source_record_stub( + '' + ) + assert DspaceDim.get_citation(source_record) is None + + +def test_get_citation_transforms_correctly_if_fields_missing(): + source_record = create_dspace_dim_source_record_stub() + assert DspaceDim.get_citation(source_record) is None + + +def test_get_content_type_success(): + source_record = create_dspace_dim_source_record_stub( + """ + Moving Image + Dataset + """ + ) + assert DspaceDim.get_content_type(source_record) == [ + "Moving Image", + "Dataset", + ] + + +def test_get_content_type_transforms_correctly_if_fields_blank(): + source_record = create_dspace_dim_source_record_stub( + '' + ) + assert DspaceDim.get_content_type(source_record) is None + + +def test_get_content_type_transforms_correctly_if_fields_missing(): + source_record = create_dspace_dim_source_record_stub() + assert DspaceDim.get_content_type(source_record) is None + + +def test_get_contents_success(): + source_record = create_dspace_dim_source_record_stub( + """ + Chapter 1 + """ + ) + assert DspaceDim.get_contents(source_record) == ["Chapter 1"] + + +def test_get_contents_transforms_correctly_if_fields_blank(): + source_record = create_dspace_dim_source_record_stub( + '' + ) + assert DspaceDim.get_contents(source_record) is None + + +def test_get_contents_transforms_correctly_if_fields_missing(): + source_record = create_dspace_dim_source_record_stub() + assert DspaceDim.get_contents(source_record) is None + + +def test_get_contributors_success(): + source_record = create_dspace_dim_source_record_stub( + """ + LaFountain, James R. + Oldenbourg, Rudolf + Jamerson, James + """ + ) + assert DspaceDim.get_contributors(source_record) == [ + timdex.Contributor(value="Jamerson, James", kind="Creator"), + timdex.Contributor( + value="LaFountain, James R.", + kind="author", + ), + timdex.Contributor( + value="Oldenbourg, Rudolf", + kind="author", + ), + ] + + +def test_get_contributors_transforms_correctly_if_fields_blank(): + source_record = create_dspace_dim_source_record_stub( + """ + + + """ + ) + assert DspaceDim.get_contributors(source_record) is None + + +def test_get_contributors_transforms_correctly_if_fields_missing(): + source_record = create_dspace_dim_source_record_stub() + assert DspaceDim.get_contributors(source_record) is None + + +def test_get_dates_success(): + source_record = create_dspace_dim_source_record_stub( + """ + 1201-01-01 - 1965-12-21 + 1201-01-01/1965-12-21 + 2009-01-08T16:24:37Z + 2009-01-08T16:24:37Z + 2002-11 + """ + ) + assert DspaceDim.get_dates(source_record) == [ + timdex.Date(kind="accessioned", value="2009-01-08T16:24:37Z"), + timdex.Date(kind="available", value="2009-01-08T16:24:37Z"), + timdex.Date(kind="Publication date", value="2002-11"), + timdex.Date( + kind="coverage", + note="1201-01-01 - 1965-12-21", + ), + timdex.Date( + kind="coverage", + range=timdex.DateRange(gte="1201-01-01", lte="1965-12-21"), + ), + ] + + +def test_get_dates_transforms_correctly_if_fields_blank(): + source_record = create_dspace_dim_source_record_stub( + """ + + + """ + ) + assert DspaceDim.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_fields_missing(): + source_record = create_dspace_dim_source_record_stub() + assert DspaceDim.get_dates(source_record) is None + + +def test_get_file_formats_success(): + source_record = create_dspace_dim_source_record_stub( + """ + application/msword + image/tiff + video/quicktime + """ + ) + assert DspaceDim.get_file_formats(source_record) == [ + "application/msword", + "image/tiff", + "video/quicktime", + ] + + +def test_get_file_formats_transforms_correctly_if_fields_blank(): + source_record = create_dspace_dim_source_record_stub( + '' + ) + assert DspaceDim.get_file_formats(source_record) is None + + +def test_get_file_formats_transforms_correctly_if_fields_missing(): + source_record = create_dspace_dim_source_record_stub() + assert DspaceDim.get_file_formats(source_record) is None + + +def test_get_format_success(): + assert DspaceDim.get_format() == "electronic resource" diff --git a/transmogrifier/sources/xml/dspace_dim.py b/transmogrifier/sources/xml/dspace_dim.py index 0e4714c..fcd943d 100644 --- a/transmogrifier/sources/xml/dspace_dim.py +++ b/transmogrifier/sources/xml/dspace_dim.py @@ -1,4 +1,5 @@ import logging +from collections.abc import Iterator from bs4 import Tag # type: ignore[import-untyped] @@ -12,128 +13,46 @@ class DspaceDim(XMLTransformer): """DSpace DIM transformer.""" - def get_optional_fields(self, xml: Tag) -> dict | None: + def get_optional_fields(self, source_record: Tag) -> dict | None: """ Retrieve optional TIMDEX fields from a DSpace DIM XML record. Overrides metaclass get_optional_fields() method. Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. + source_record: A BeautifulSoup Tag representing a single DSpace DIM XML + record. """ fields: dict = {} - source_record_id = self.get_source_record_id(xml) - # alternate_titles - for alternate_title in [ - t - for t in xml.find_all("dim:field", element="title") - if "qualifier" in t.attrs and t.string - ]: - fields.setdefault("alternate_titles", []).append( - timdex.AlternateTitle( - value=alternate_title.string, - kind=alternate_title["qualifier"] or None, - ) - ) - # If the record has more than one main title, add extras to alternate_titles - for index, title in enumerate(self.get_main_titles(xml)): - if index > 0: - fields.setdefault("alternate_titles", []).append( - timdex.AlternateTitle(value=title) - ) + fields["alternate_titles"] = self.get_alternate_titles(source_record) # citation - citation = xml.find("dim:field", element="identifier", qualifier="citation") - fields["citation"] = citation.string if citation and citation.string else None + fields["citation"] = self.get_citation(source_record) # content_type - if content_types := self.get_content_types(xml): - if self.valid_content_types(content_types): - fields["content_type"] = content_types - else: - return None + fields["content_type"] = self.get_content_type(source_record) # contents - fields["contents"] = [ - t.string - for t in xml.find_all( - "dim:field", element="description", qualifier="tableofcontents" - ) - if t.string - ] or None + fields["contents"] = self.get_contents(source_record) # contributors - for creator in [ - c for c in xml.find_all("dim:field", element="creator") if c.string - ]: - fields.setdefault("contributors", []).append( - timdex.Contributor( - value=creator.string, - kind="Creator", - ) - ) - - for contributor in [ - c for c in xml.find_all("dim:field", element="contributor") if c.string - ]: - fields.setdefault("contributors", []).append( - timdex.Contributor( - value=contributor.string, - kind=contributor.get("qualifier") or "Not specified", - ) - ) + fields["contributors"] = self.get_contributors(source_record) # dates - for date in xml.find_all("dim:field", element="date", string=True): - date_value = str(date.string.strip()) - if validate_date(date_value, source_record_id): - if date.get("qualifier") == "issued": - d = timdex.Date(value=date_value, kind="Publication date") - else: - d = timdex.Date(value=date_value, kind=date.get("qualifier") or None) - fields.setdefault("dates", []).append(d) - - for coverage in [ - c.string - for c in xml.find_all("dim:field", element="coverage", qualifier="temporal") - if c.string - ]: - if "/" in coverage: - split = coverage.index("/") - gte_date = coverage[:split] - lte_date = coverage[split + 1 :] - if validate_date_range( - gte_date, - lte_date, - source_record_id, - ): - d = timdex.Date( - range=timdex.DateRange( - gte=gte_date, - lte=lte_date, - ), - kind="coverage", - ) - else: - d = timdex.Date(note=coverage.string, kind="coverage") - fields.setdefault("dates", []).append(d) + fields["dates"] = self.get_dates(source_record) # file_formats - fields["file_formats"] = [ - f.string - for f in xml.find_all("dim:field", element="format") - if f.get("qualifier") == "mimetype" and f.string - ] or None + fields["file_formats"] = self.get_file_formats(source_record) # format - fields["format"] = "electronic resource" + fields["format"] = self.get_format() # funding_information for funding_reference in [ f - for f in xml.find_all( + for f in source_record.find_all( "dim:field", element="description", qualifier="sponsorship" ) if f.string @@ -145,7 +64,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) # identifiers - identifiers = xml.find_all("dim:field", element="identifier") + identifiers = source_record.find_all("dim:field", element="identifier") for identifier in [ i for i in identifiers if i.get("qualifier") != "citation" and i.string ]: @@ -158,7 +77,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # language fields["languages"] = [ - la.string for la in xml.find_all("dim:field", element="language") if la.string + la.string + for la in source_record.find_all("dim:field", element="language") + if la.string ] or None # links, uses identifiers list retrieved for identifiers field @@ -176,12 +97,14 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # locations fields["locations"] = [ timdex.Location(value=lo.string) - for lo in xml.find_all("dim:field", element="coverage", qualifier="spatial") + for lo in source_record.find_all( + "dim:field", element="coverage", qualifier="spatial" + ) if lo.string ] or None # notes - descriptions = xml.find_all("dim:field", element="description") + descriptions = source_record.find_all("dim:field", element="description") for description in [ d for d in descriptions @@ -204,13 +127,13 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # publishers fields["publishers"] = [ timdex.Publisher(name=p.string) - for p in xml.find_all("dim:field", element="publisher") + for p in source_record.find_all("dim:field", element="publisher") if p.string ] or None # related_items for related_item in [ - r for r in xml.find_all("dim:field", element="relation") if r.string + r for r in source_record.find_all("dim:field", element="relation") if r.string ]: if related_item.get("qualifier") == "uri": ri = timdex.RelatedItem( @@ -225,7 +148,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # rights for rights in [ - r for r in xml.find_all("dim:field", element="rights") if r.string + r for r in source_record.find_all("dim:field", element="rights") if r.string ]: if rights.get("qualifier") == "uri": rg = timdex.Rights(uri=rights.string) @@ -238,7 +161,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # subjects subjects_dict: dict[str, list[str]] = {} for subject in [ - s for s in xml.find_all("dim:field", element="subject") if s.string + s for s in source_record.find_all("dim:field", element="subject") if s.string ]: if not subject.get("qualifier"): subjects_dict.setdefault("Subject scheme not provided", []).append( @@ -260,47 +183,172 @@ def get_optional_fields(self, xml: Tag) -> dict | None: return fields @classmethod - def get_content_types(cls, xml: Tag) -> list[str] | None: - """ - Retrieve content types from a DSpace DIM XML record. + def get_alternate_titles( + cls, source_record: Tag + ) -> list[timdex.AlternateTitle] | None: + alternate_titles = [ + timdex.AlternateTitle( + value=str(alternate_title.string), + kind=alternate_title["qualifier"], + ) + for alternate_title in source_record.find_all( + "dim:field", element="title", string=True + ) + if alternate_title.get("qualifier") + ] + # If the record has more than one main title, add extras to alternate_titles + alternate_titles.extend( + [ + timdex.AlternateTitle(value=title) + for title in cls.get_main_titles(source_record)[1:] + ] + ) + return alternate_titles or None - May be overridden by source subclasses that retrieve content type values - differently. + @classmethod + def get_citation(cls, source_record: Tag) -> str | None: + if citation := source_record.find( + "dim:field", element="identifier", qualifier="citation", string=True + ): + return citation.string + return None - Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. - """ + @classmethod + def get_content_type(cls, source_record: Tag) -> list[str] | None: return [ - t.string for t in xml.find_all("dim:field", element="type", string=True) + str(content_type.string) + for content_type in source_record.find_all( + "dim:field", element="type", string=True + ) ] or None @classmethod - def get_main_titles(cls, xml: Tag) -> list[str]: + def get_contents(cls, source_record: Tag) -> list[str] | None: + return [ + contents.string + for contents in source_record.find_all( + "dim:field", + element="description", + qualifier="tableofcontents", + string=True, + ) + ] or None + + @classmethod + def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None: + contributors: list[timdex.Contributor] = [] + contributors.extend(cls._get_creators(source_record)) + contributors.extend(cls._get_contributors_by_contributor_element(source_record)) + return contributors or None + + @classmethod + def _get_creators(cls, source_record: Tag) -> Iterator[timdex.Contributor]: + for creator in source_record.find_all( + "dim:field", element="creator", string=True + ): + yield timdex.Contributor( + value=str(creator.string), + kind="Creator", + ) + + @classmethod + def _get_contributors_by_contributor_element( + cls, source_record: Tag + ) -> Iterator[timdex.Contributor]: + for contributor in source_record.find_all( + "dim:field", element="contributor", string=True + ): + yield timdex.Contributor( + value=str(contributor.string), + kind=contributor.get("qualifier") or "Not specified", + ) + + @classmethod + def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None: + dates = [] + for date in source_record.find_all("dim:field", element="date", string=True): + date_value = str(date.string.strip()) + if validate_date(date_value, cls.get_source_record_id(source_record)): + if date.get("qualifier") == "issued": + date_object = timdex.Date(value=date_value, kind="Publication date") + else: + date_object = timdex.Date( + value=date_value, kind=date.get("qualifier") or None + ) + dates.append(date_object) + dates.extend(cls._get_coverage_dates(source_record)) + return dates or None + + @classmethod + def _get_coverage_dates(cls, source_record: Tag) -> Iterator[timdex.Date]: + for coverage_value in [ + str(coverage.string) + for coverage in source_record.find_all( + "dim:field", element="coverage", qualifier="temporal", string=True + ) + ]: + if "/" in coverage_value: + split = coverage_value.index("/") + gte_date = coverage_value[:split] + lte_date = coverage_value[split + 1 :] + if validate_date_range( + gte_date, + lte_date, + cls.get_source_record_id(source_record), + ): + yield timdex.Date( + range=timdex.DateRange( + gte=gte_date, + lte=lte_date, + ), + kind="coverage", + ) + else: + yield timdex.Date(note=coverage_value, kind="coverage") + + @classmethod + def get_file_formats(cls, source_record: Tag) -> list[str] | None: + return [ + str(file_format.string) + for file_format in source_record.find_all( + "dim:field", element="format", string=True + ) + if file_format.get("qualifier") == "mimetype" + ] or None + + @classmethod + def get_format(cls) -> str: + return "electronic resource" + + @classmethod + def get_main_titles(cls, source_record: Tag) -> list[str]: """ Retrieve main title(s) from a DSpace DIM XML record. Overrides metaclass get_main_titles() method. Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. + source_record: A BeautifulSoup Tag representing a single DSpace DIM XML + record. """ return [ t.string - for t in xml.find_all("dim:field", element="title", string=True) + for t in source_record.find_all("dim:field", element="title", string=True) if "qualifier" not in t.attrs ] @classmethod - def get_source_record_id(cls, xml: Tag) -> str: + def get_source_record_id(cls, source_record: Tag) -> str: """ Get the source record ID from a DSpace DIM XML record. Overrides metaclass get_source_record_id() method. Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. + source_record: A BeautifulSoup Tag representing a single DSpace DIM XML + record. """ - return xml.header.identifier.string.split(":")[2] + return source_record.header.identifier.string.split(":")[2] @classmethod def valid_content_types(cls, _content_type_list: list[str]) -> bool: diff --git a/transmogrifier/sources/xml/whoas.py b/transmogrifier/sources/xml/whoas.py index 4736626..668b509 100644 --- a/transmogrifier/sources/xml/whoas.py +++ b/transmogrifier/sources/xml/whoas.py @@ -1,5 +1,6 @@ from bs4 import Tag # type: ignore[import-untyped] +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.sources.xml.dspace_dim import DspaceDim INVALID_CONTENT_TYPES = [ @@ -23,18 +24,17 @@ class Whoas(DspaceDim): """Whoas transformer class.""" @classmethod - def get_content_types(cls, xml: Tag) -> list[str]: - """ - Retrieve content types from a DSpace DIM XML record. - - Overrides the base DspaceDim.get_content_types() method. - - Args: - xml: A BeautifulSoup Tag representing a single DSpace DIM XML record. - """ - return [ - t.string for t in xml.find_all("dim:field", element="type", string=True) + def get_content_type(cls, source_record: Tag) -> list[str] | None: + content_types = [ + str(content_type.string) + for content_type in source_record.find_all( + "dim:field", element="type", string=True + ) ] or ["no content type in source record"] + if cls.valid_content_types(content_types): + return content_types + message = f'Record skipped based on content type: "{content_types}"' + raise SkippedRecordEvent(message, cls.get_source_record_id(source_record)) @classmethod def valid_content_types(cls, content_type_list: list[str]) -> bool: