diff --git a/tests/sources/xml/test_dspace_dim.py b/tests/sources/xml/test_dspace_dim.py
index 55acc77..5ae06e2 100644
--- a/tests/sources/xml/test_dspace_dim.py
+++ b/tests/sources/xml/test_dspace_dim.py
@@ -1,7 +1,35 @@
+from bs4 import BeautifulSoup
+
import transmogrifier.models as timdex
from transmogrifier.sources.xml.dspace_dim import DspaceDim
+def create_dspace_dim_source_record_stub(xml_insert: str = "") -> BeautifulSoup:
+ xml_string = f"""
+
+
+
+ oai:darchive.mblwhoilibrary.org:1912/2641
+ 2020-01-28T19:30:01Z
+ com_1912_3
+ col_1912_534
+
+
+
+ {xml_insert}
+
+
+
+
+ """
+ return BeautifulSoup(xml_string, "xml")
+
+
def test_dspace_dim_transform_with_all_fields_transforms_correctly():
source_records = DspaceDim.parse_source_file(
"tests/fixtures/dspace/dspace_dim_record_all_fields.xml"
@@ -216,3 +244,231 @@ def test_dspace_dim_transform_with_optional_fields_missing_transforms_correctly(
format="electronic resource",
content_type=["Not specified"],
)
+
+
+def test_get_alternate_titles_success():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ An Alternative Title
+ """
+ )
+ assert DspaceDim.get_alternate_titles(source_record) == [
+ timdex.AlternateTitle(value="An Alternative Title", kind="alternative")
+ ]
+
+
+def test_get_alternate_titles_transforms_correctly_if_fields_blank():
+ source_record = create_dspace_dim_source_record_stub(
+ ''
+ )
+ assert DspaceDim.get_alternate_titles(source_record) is None
+
+
+def test_get_alternate_titles_transforms_correctly_if_fields_missing():
+ source_record = create_dspace_dim_source_record_stub()
+ assert DspaceDim.get_alternate_titles(source_record) is None
+
+
+def test_get_alternate_titles_multiple_titles_success():
+
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ Title 1
+ Title 2
+ Title 3
+ """
+ )
+ assert DspaceDim.get_alternate_titles(source_record) == [
+ timdex.AlternateTitle(value="Title 2"),
+ timdex.AlternateTitle(value="Title 3"),
+ ]
+
+
+def test_get_citation_success():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ Journal of Geophysical Research: Solid Earth 121 (2016): 5859-5879
+ """
+ )
+ assert (
+ DspaceDim.get_citation(source_record)
+ == "Journal of Geophysical Research: Solid Earth 121 (2016): 5859-5879"
+ )
+
+
+def test_get_citation_transforms_correctly_if_fields_blank():
+ source_record = create_dspace_dim_source_record_stub(
+ ''
+ )
+ assert DspaceDim.get_citation(source_record) is None
+
+
+def test_get_citation_transforms_correctly_if_fields_missing():
+ source_record = create_dspace_dim_source_record_stub()
+ assert DspaceDim.get_citation(source_record) is None
+
+
+def test_get_content_type_success():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ Moving Image
+ Dataset
+ """
+ )
+ assert DspaceDim.get_content_type(source_record) == [
+ "Moving Image",
+ "Dataset",
+ ]
+
+
+def test_get_content_type_transforms_correctly_if_fields_blank():
+ source_record = create_dspace_dim_source_record_stub(
+ ''
+ )
+ assert DspaceDim.get_content_type(source_record) is None
+
+
+def test_get_content_type_transforms_correctly_if_fields_missing():
+ source_record = create_dspace_dim_source_record_stub()
+ assert DspaceDim.get_content_type(source_record) is None
+
+
+def test_get_contents_success():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ Chapter 1
+ """
+ )
+ assert DspaceDim.get_contents(source_record) == ["Chapter 1"]
+
+
+def test_get_contents_transforms_correctly_if_fields_blank():
+ source_record = create_dspace_dim_source_record_stub(
+ ''
+ )
+ assert DspaceDim.get_contents(source_record) is None
+
+
+def test_get_contents_transforms_correctly_if_fields_missing():
+ source_record = create_dspace_dim_source_record_stub()
+ assert DspaceDim.get_contents(source_record) is None
+
+
+def test_get_contributors_success():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ LaFountain, James R.
+ Oldenbourg, Rudolf
+ Jamerson, James
+ """
+ )
+ assert DspaceDim.get_contributors(source_record) == [
+ timdex.Contributor(value="Jamerson, James", kind="Creator"),
+ timdex.Contributor(
+ value="LaFountain, James R.",
+ kind="author",
+ ),
+ timdex.Contributor(
+ value="Oldenbourg, Rudolf",
+ kind="author",
+ ),
+ ]
+
+
+def test_get_contributors_transforms_correctly_if_fields_blank():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+
+
+ """
+ )
+ assert DspaceDim.get_contributors(source_record) is None
+
+
+def test_get_contributors_transforms_correctly_if_fields_missing():
+ source_record = create_dspace_dim_source_record_stub()
+ assert DspaceDim.get_contributors(source_record) is None
+
+
+def test_get_dates_success():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ 1201-01-01 - 1965-12-21
+ 1201-01-01/1965-12-21
+ 2009-01-08T16:24:37Z
+ 2009-01-08T16:24:37Z
+ 2002-11
+ """
+ )
+ assert DspaceDim.get_dates(source_record) == [
+ timdex.Date(kind="accessioned", value="2009-01-08T16:24:37Z"),
+ timdex.Date(kind="available", value="2009-01-08T16:24:37Z"),
+ timdex.Date(kind="Publication date", value="2002-11"),
+ timdex.Date(
+ kind="coverage",
+ note="1201-01-01 - 1965-12-21",
+ ),
+ timdex.Date(
+ kind="coverage",
+ range=timdex.DateRange(gte="1201-01-01", lte="1965-12-21"),
+ ),
+ ]
+
+
+def test_get_dates_transforms_correctly_if_fields_blank():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+
+
+ """
+ )
+ assert DspaceDim.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_if_fields_missing():
+ source_record = create_dspace_dim_source_record_stub()
+ assert DspaceDim.get_dates(source_record) is None
+
+
+def test_get_file_formats_success():
+ source_record = create_dspace_dim_source_record_stub(
+ """
+ application/msword
+ image/tiff
+ video/quicktime
+ """
+ )
+ assert DspaceDim.get_file_formats(source_record) == [
+ "application/msword",
+ "image/tiff",
+ "video/quicktime",
+ ]
+
+
+def test_get_file_formats_transforms_correctly_if_fields_blank():
+ source_record = create_dspace_dim_source_record_stub(
+ ''
+ )
+ assert DspaceDim.get_file_formats(source_record) is None
+
+
+def test_get_file_formats_transforms_correctly_if_fields_missing():
+ source_record = create_dspace_dim_source_record_stub()
+ assert DspaceDim.get_file_formats(source_record) is None
+
+
+def test_get_format_success():
+ assert DspaceDim.get_format() == "electronic resource"
diff --git a/transmogrifier/sources/xml/dspace_dim.py b/transmogrifier/sources/xml/dspace_dim.py
index 0e4714c..fcd943d 100644
--- a/transmogrifier/sources/xml/dspace_dim.py
+++ b/transmogrifier/sources/xml/dspace_dim.py
@@ -1,4 +1,5 @@
import logging
+from collections.abc import Iterator
from bs4 import Tag # type: ignore[import-untyped]
@@ -12,128 +13,46 @@
class DspaceDim(XMLTransformer):
"""DSpace DIM transformer."""
- def get_optional_fields(self, xml: Tag) -> dict | None:
+ def get_optional_fields(self, source_record: Tag) -> dict | None:
"""
Retrieve optional TIMDEX fields from a DSpace DIM XML record.
Overrides metaclass get_optional_fields() method.
Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
"""
fields: dict = {}
- source_record_id = self.get_source_record_id(xml)
-
# alternate_titles
- for alternate_title in [
- t
- for t in xml.find_all("dim:field", element="title")
- if "qualifier" in t.attrs and t.string
- ]:
- fields.setdefault("alternate_titles", []).append(
- timdex.AlternateTitle(
- value=alternate_title.string,
- kind=alternate_title["qualifier"] or None,
- )
- )
- # If the record has more than one main title, add extras to alternate_titles
- for index, title in enumerate(self.get_main_titles(xml)):
- if index > 0:
- fields.setdefault("alternate_titles", []).append(
- timdex.AlternateTitle(value=title)
- )
+ fields["alternate_titles"] = self.get_alternate_titles(source_record)
# citation
- citation = xml.find("dim:field", element="identifier", qualifier="citation")
- fields["citation"] = citation.string if citation and citation.string else None
+ fields["citation"] = self.get_citation(source_record)
# content_type
- if content_types := self.get_content_types(xml):
- if self.valid_content_types(content_types):
- fields["content_type"] = content_types
- else:
- return None
+ fields["content_type"] = self.get_content_type(source_record)
# contents
- fields["contents"] = [
- t.string
- for t in xml.find_all(
- "dim:field", element="description", qualifier="tableofcontents"
- )
- if t.string
- ] or None
+ fields["contents"] = self.get_contents(source_record)
# contributors
- for creator in [
- c for c in xml.find_all("dim:field", element="creator") if c.string
- ]:
- fields.setdefault("contributors", []).append(
- timdex.Contributor(
- value=creator.string,
- kind="Creator",
- )
- )
-
- for contributor in [
- c for c in xml.find_all("dim:field", element="contributor") if c.string
- ]:
- fields.setdefault("contributors", []).append(
- timdex.Contributor(
- value=contributor.string,
- kind=contributor.get("qualifier") or "Not specified",
- )
- )
+ fields["contributors"] = self.get_contributors(source_record)
# dates
- for date in xml.find_all("dim:field", element="date", string=True):
- date_value = str(date.string.strip())
- if validate_date(date_value, source_record_id):
- if date.get("qualifier") == "issued":
- d = timdex.Date(value=date_value, kind="Publication date")
- else:
- d = timdex.Date(value=date_value, kind=date.get("qualifier") or None)
- fields.setdefault("dates", []).append(d)
-
- for coverage in [
- c.string
- for c in xml.find_all("dim:field", element="coverage", qualifier="temporal")
- if c.string
- ]:
- if "/" in coverage:
- split = coverage.index("/")
- gte_date = coverage[:split]
- lte_date = coverage[split + 1 :]
- if validate_date_range(
- gte_date,
- lte_date,
- source_record_id,
- ):
- d = timdex.Date(
- range=timdex.DateRange(
- gte=gte_date,
- lte=lte_date,
- ),
- kind="coverage",
- )
- else:
- d = timdex.Date(note=coverage.string, kind="coverage")
- fields.setdefault("dates", []).append(d)
+ fields["dates"] = self.get_dates(source_record)
# file_formats
- fields["file_formats"] = [
- f.string
- for f in xml.find_all("dim:field", element="format")
- if f.get("qualifier") == "mimetype" and f.string
- ] or None
+ fields["file_formats"] = self.get_file_formats(source_record)
# format
- fields["format"] = "electronic resource"
+ fields["format"] = self.get_format()
# funding_information
for funding_reference in [
f
- for f in xml.find_all(
+ for f in source_record.find_all(
"dim:field", element="description", qualifier="sponsorship"
)
if f.string
@@ -145,7 +64,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
# identifiers
- identifiers = xml.find_all("dim:field", element="identifier")
+ identifiers = source_record.find_all("dim:field", element="identifier")
for identifier in [
i for i in identifiers if i.get("qualifier") != "citation" and i.string
]:
@@ -158,7 +77,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# language
fields["languages"] = [
- la.string for la in xml.find_all("dim:field", element="language") if la.string
+ la.string
+ for la in source_record.find_all("dim:field", element="language")
+ if la.string
] or None
# links, uses identifiers list retrieved for identifiers field
@@ -176,12 +97,14 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# locations
fields["locations"] = [
timdex.Location(value=lo.string)
- for lo in xml.find_all("dim:field", element="coverage", qualifier="spatial")
+ for lo in source_record.find_all(
+ "dim:field", element="coverage", qualifier="spatial"
+ )
if lo.string
] or None
# notes
- descriptions = xml.find_all("dim:field", element="description")
+ descriptions = source_record.find_all("dim:field", element="description")
for description in [
d
for d in descriptions
@@ -204,13 +127,13 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# publishers
fields["publishers"] = [
timdex.Publisher(name=p.string)
- for p in xml.find_all("dim:field", element="publisher")
+ for p in source_record.find_all("dim:field", element="publisher")
if p.string
] or None
# related_items
for related_item in [
- r for r in xml.find_all("dim:field", element="relation") if r.string
+ r for r in source_record.find_all("dim:field", element="relation") if r.string
]:
if related_item.get("qualifier") == "uri":
ri = timdex.RelatedItem(
@@ -225,7 +148,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# rights
for rights in [
- r for r in xml.find_all("dim:field", element="rights") if r.string
+ r for r in source_record.find_all("dim:field", element="rights") if r.string
]:
if rights.get("qualifier") == "uri":
rg = timdex.Rights(uri=rights.string)
@@ -238,7 +161,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# subjects
subjects_dict: dict[str, list[str]] = {}
for subject in [
- s for s in xml.find_all("dim:field", element="subject") if s.string
+ s for s in source_record.find_all("dim:field", element="subject") if s.string
]:
if not subject.get("qualifier"):
subjects_dict.setdefault("Subject scheme not provided", []).append(
@@ -260,47 +183,172 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
return fields
@classmethod
- def get_content_types(cls, xml: Tag) -> list[str] | None:
- """
- Retrieve content types from a DSpace DIM XML record.
+    def get_alternate_titles(
+        cls, source_record: Tag
+    ) -> list[timdex.AlternateTitle] | None:
+        """
+        Retrieve alternate titles from a DSpace DIM XML record.
+
+        Title fields that carry a "qualifier" attribute become alternate titles,
+        with the qualifier as the kind (or no kind when the attribute is empty).
+        Any main title after the first is appended without a kind.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+        """
+        alternate_titles = [
+            timdex.AlternateTitle(
+                value=str(alternate_title.string),
+                kind=alternate_title["qualifier"] or None,
+            )
+            for alternate_title in source_record.find_all(
+                "dim:field", element="title", string=True
+            )
+            # Test for the attribute's presence rather than its truthiness:
+            # get_main_titles() skips every title that has a qualifier attribute,
+            # so a title with an empty qualifier must be kept here or it would be
+            # dropped from the record altogether.
+            if "qualifier" in alternate_title.attrs
+        ]
+        # If the record has more than one main title, add extras to alternate_titles
+        alternate_titles.extend(
+            timdex.AlternateTitle(value=title)
+            for title in cls.get_main_titles(source_record)[1:]
+        )
+        return alternate_titles or None
- May be overridden by source subclasses that retrieve content type values
- differently.
+    @classmethod
+    def get_citation(cls, source_record: Tag) -> str | None:
+        """
+        Retrieve the citation from a DSpace DIM XML record.
+
+        Returns the text of the first identifier field qualified as "citation",
+        or None when no such field with text exists.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+        """
+        if citation := source_record.find(
+            "dim:field", element="identifier", qualifier="citation", string=True
+        ):
+            # Convert the NavigableString to a plain str, as the sibling field
+            # methods do, so the value does not keep a reference to the parse tree.
+            return str(citation.string)
+        return None
- Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
- """
+ @classmethod
+ def get_content_type(cls, source_record: Tag) -> list[str] | None:
+ """
+ Retrieve content types from a DSpace DIM XML record.
+
+ Returns the text of every type field that has text, or None when there are none.
+ May be overridden by source subclasses that retrieve or validate content type
+ values differently.
+
+ Args:
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
+ """
return [
- t.string for t in xml.find_all("dim:field", element="type", string=True)
+ str(content_type.string)
+ for content_type in source_record.find_all(
+ "dim:field", element="type", string=True
+ )
] or None
@classmethod
- def get_main_titles(cls, xml: Tag) -> list[str]:
+    def get_contents(cls, source_record: Tag) -> list[str] | None:
+        """
+        Retrieve contents from a DSpace DIM XML record.
+
+        Returns the text of every description field qualified as
+        "tableofcontents" that has text, or None when there are none.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+        """
+        return [
+            # Plain str, matching the other field methods, rather than a
+            # NavigableString that stays attached to the parse tree.
+            str(contents.string)
+            for contents in source_record.find_all(
+                "dim:field",
+                element="description",
+                qualifier="tableofcontents",
+                string=True,
+            )
+        ] or None
+
+    @classmethod
+    def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None:
+        """
+        Retrieve contributors from a DSpace DIM XML record.
+
+        Contributors built from creator fields come first, followed by those built
+        from contributor fields. Returns None when neither yields anything.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+        """
+        creators = cls._get_creators(source_record)
+        others = cls._get_contributors_by_contributor_element(source_record)
+        return [*creators, *others] or None
+
+    @classmethod
+    def _get_creators(cls, source_record: Tag) -> Iterator[timdex.Contributor]:
+        """Yield a "Creator" contributor for each creator field that has text."""
+        creator_fields = source_record.find_all(
+            "dim:field", element="creator", string=True
+        )
+        yield from (
+            timdex.Contributor(value=str(field.string), kind="Creator")
+            for field in creator_fields
+        )
+
+    @classmethod
+    def _get_contributors_by_contributor_element(
+        cls, source_record: Tag
+    ) -> Iterator[timdex.Contributor]:
+        """
+        Yield a contributor for each contributor field that has text.
+
+        The field's qualifier is used as the kind, or "Not specified" when the
+        qualifier is missing or empty.
+        """
+        contributor_fields = source_record.find_all(
+            "dim:field", element="contributor", string=True
+        )
+        for field in contributor_fields:
+            name = str(field.string)
+            kind = field.get("qualifier") or "Not specified"
+            yield timdex.Contributor(value=name, kind=kind)
+
+    @classmethod
+    def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None:
+        """
+        Retrieve dates from a DSpace DIM XML record.
+
+        Each date field whose stripped text passes validate_date() becomes a Date.
+        The "issued" qualifier is reported as kind "Publication date"; any other
+        qualifier is used as the kind unchanged. Dates from temporal coverage
+        fields are appended afterwards. Returns None when no dates are found.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+        """
+        dates: list[timdex.Date] = []
+        date_fields = source_record.find_all("dim:field", element="date", string=True)
+        if date_fields:
+            # The record ID is the same for every date field, so look it up once
+            # per record rather than once per field (and not at all when the
+            # record has no date fields).
+            source_record_id = cls.get_source_record_id(source_record)
+            for date in date_fields:
+                date_value = str(date.string.strip())
+                if not validate_date(date_value, source_record_id):
+                    continue
+                qualifier = date.get("qualifier")
+                kind = (
+                    "Publication date" if qualifier == "issued" else qualifier or None
+                )
+                dates.append(timdex.Date(value=date_value, kind=kind))
+        dates.extend(cls._get_coverage_dates(source_record))
+        return dates or None
+
+    @classmethod
+    def _get_coverage_dates(cls, source_record: Tag) -> Iterator[timdex.Date]:
+        """
+        Yield "coverage" dates from the temporal coverage fields that have text.
+
+        A value containing "/" is split at the first "/" into a date range, which
+        is yielded only if validate_date_range() accepts it. A value without "/"
+        is yielded as a note.
+        """
+        for coverage in source_record.find_all(
+            "dim:field", element="coverage", qualifier="temporal", string=True
+        ):
+            coverage_value = str(coverage.string)
+            gte_date, separator, lte_date = coverage_value.partition("/")
+            if not separator:
+                yield timdex.Date(note=coverage_value, kind="coverage")
+                continue
+            if validate_date_range(
+                gte_date,
+                lte_date,
+                cls.get_source_record_id(source_record),
+            ):
+                date_range = timdex.DateRange(gte=gte_date, lte=lte_date)
+                yield timdex.Date(range=date_range, kind="coverage")
+
+    @classmethod
+    def get_file_formats(cls, source_record: Tag) -> list[str] | None:
+        """
+        Retrieve file formats from a DSpace DIM XML record.
+
+        Returns the text of every format field qualified as "mimetype" that has
+        text, or None when there are none.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+                record.
+        """
+        mimetype_fields = (
+            field
+            for field in source_record.find_all(
+                "dim:field", element="format", string=True
+            )
+            if field.get("qualifier") == "mimetype"
+        )
+        file_formats = [str(field.string) for field in mimetype_fields]
+        return file_formats or None
+
+ @classmethod
+ def get_format(cls) -> str:
+ """Return the fixed format value assigned to DSpace DIM records."""
+ return "electronic resource"
+
+ @classmethod
+ def get_main_titles(cls, source_record: Tag) -> list[str]:
"""
Retrieve main title(s) from a DSpace DIM XML record.
Overrides metaclass get_main_titles() method.
Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
"""
return [
t.string
- for t in xml.find_all("dim:field", element="title", string=True)
+ for t in source_record.find_all("dim:field", element="title", string=True)
+ # Only title fields without a qualifier attribute count as main titles.
if "qualifier" not in t.attrs
]
@classmethod
- def get_source_record_id(cls, xml: Tag) -> str:
+ def get_source_record_id(cls, source_record: Tag) -> str:
"""
Get the source record ID from a DSpace DIM XML record.
Overrides metaclass get_source_record_id() method.
Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
"""
- return xml.header.identifier.string.split(":")[2]
+ # Keep the third colon-separated segment of the header identifier, e.g.
+ # "oai:darchive.mblwhoilibrary.org:1912/2641" -> "1912/2641".
+ return source_record.header.identifier.string.split(":")[2]
@classmethod
def valid_content_types(cls, _content_type_list: list[str]) -> bool:
diff --git a/transmogrifier/sources/xml/whoas.py b/transmogrifier/sources/xml/whoas.py
index 4736626..668b509 100644
--- a/transmogrifier/sources/xml/whoas.py
+++ b/transmogrifier/sources/xml/whoas.py
@@ -1,5 +1,6 @@
from bs4 import Tag # type: ignore[import-untyped]
+from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.sources.xml.dspace_dim import DspaceDim
INVALID_CONTENT_TYPES = [
@@ -23,18 +24,17 @@ class Whoas(DspaceDim):
"""Whoas transformer class."""
@classmethod
- def get_content_types(cls, xml: Tag) -> list[str]:
- """
- Retrieve content types from a DSpace DIM XML record.
-
- Overrides the base DspaceDim.get_content_types() method.
-
- Args:
- xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
- """
- return [
- t.string for t in xml.find_all("dim:field", element="type", string=True)
+ def get_content_type(cls, source_record: Tag) -> list[str] | None:
+ """
+ Retrieve content types from a DSpace DIM XML record.
+
+ Overrides the base DspaceDim.get_content_type() method. Falls back to
+ ["no content type in source record"] when the record has no type field
+ with text, then checks the result with valid_content_types().
+
+ Args:
+ source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
+ record.
+
+ Raises:
+ SkippedRecordEvent: If valid_content_types() rejects the content types.
+ """
+ content_types = [
+ str(content_type.string)
+ for content_type in source_record.find_all(
+ "dim:field", element="type", string=True
+ )
] or ["no content type in source record"]
+ if cls.valid_content_types(content_types):
+ return content_types
+ message = f'Record skipped based on content type: "{content_types}"'
+ raise SkippedRecordEvent(message, cls.get_source_record_id(source_record))
@classmethod
def valid_content_types(cls, content_type_list: list[str]) -> bool: