From 74a2b44950aac345adde826d9b3e7c41f744d333 Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Tue, 21 May 2024 10:36:39 -0400 Subject: [PATCH 1/4] Add 1st set of Datacite field methods Why these changes are being introduced: * Refactor Datacite to use field methods How this addresses that need: * Add comments to organize conftest.py by source * Add additional Datacite fixtures to conftest.py * Update datacite_record_all_fields.xml to align with other source fixtures * Add create_datacite_source_record_stub function to Datacite test module * Rename param xml > source_record * Add field methods and associated private methods for alternate_titles, content_type, and contributors * Add unit tests for new field methods * Shift note-related code block from content_type to notes Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-284 --- tests/conftest.py | 35 ++- .../datacite/datacite_record_all_fields.xml | 251 +++++++++--------- tests/sources/xml/test_datacite.py | 151 +++++++++++ transmogrifier/sources/xml/datacite.py | 229 +++++++++------- 4 files changed, 445 insertions(+), 221 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4e28fb5..1c2413d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -43,6 +43,14 @@ def runner(): return CliRunner() +# aardvark ########################## + + +@pytest.fixture +def aardvark_records(): + return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl") + + @pytest.fixture def aardvark_record_all_fields(): return JSONTransformer.parse_source_file( @@ -50,6 +58,9 @@ def aardvark_record_all_fields(): ) +# datacite ########################## + + @pytest.fixture def datacite_records(): return XMLTransformer.parse_source_file( @@ -66,8 +77,22 @@ def datacite_record_all_fields(): @pytest.fixture -def aardvark_records(): - return JSONTransformer.parse_source_file("tests/fixtures/aardvark_records.jsonl") +def datacite_record_optional_fields_blank(): + source_records = XMLTransformer.parse_source_file( + "tests/fixtures/datacite/datacite_record_optional_fields_blank.xml" + ) + return next(source_records) + + +@pytest.fixture +def datacite_record_optional_fields_missing(): + source_records = XMLTransformer.parse_source_file( + "tests/fixtures/datacite/datacite_record_optional_fields_missing.xml" + ) + return next(source_records) + + +# marc ########################## @pytest.fixture @@ -80,11 +105,17 @@ def marc_content_type_crosswalk(): return load_external_config("config/marc_content_type_crosswalk.json", "json") +# oaidc ########################## + + @pytest.fixture def oai_pmh_records(): return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml") +# timdex ########################## + + @pytest.fixture def timdex_record_required_fields(): return timdex.TimdexRecord( diff --git a/tests/fixtures/datacite/datacite_record_all_fields.xml b/tests/fixtures/datacite/datacite_record_all_fields.xml index 07538f7..f929a46 100644 --- a/tests/fixtures/datacite/datacite_record_all_fields.xml +++ b/tests/fixtures/datacite/datacite_record_all_fields.xml @@ -1,125 +1,128 @@ - -
- doi:10.7910/DVN/19PPE7 - 2022-03-26T06:04:55Z - Jameel_Poverty_Action_Lab - IQSS -
- - - 10.7910/DVN/19PPE7 - - - Banerji, Rukmini - Rukmini - Banerji - Pratham and ASER Centre - 0000-0000-0000-0000 - - - Berry, James - James - Berry - University of Delaware - 0000-0000-0000-0001 - - - Shotland, Marc - Marc - Shotland - Abdul Latif Jameel Poverty Action Lab - 0000-0000-0000-0002 - - - - The Impact of Maternal Literacy and Participation Programs - An Alternative Title - Baseline Data - - Harvard Dataverse - 2017 - - Social Sciences - Educational materials - Adult education, education inputs, field experiments - Education - - - https://zenodo.org/record/5524465 - - - - Banerji, Rukmini - Rukmini - Banerji - 0000-0000-0000-0000 - Pratham and ASER Centre - - - - 2017-02-27 - 2019-06-24 - 2007-01-01/2007-02-28 - - Survey Data - - 10.1257/app.20150390 - 10.5281/zenodo.5524464 - 1234567.5524464 - 1234567.5524464 - https://zenodo.org/communities/astronomy-general - - en_US - - 124903 - 48958 - 199070 - 186674 - 139605 - 97304 - 9907 - 178534602 - 4032103 - 43589 - 15697 - - - application/vnd.openxmlformats-officedocument.spreadsheetml.sheet - application/pdf - application/pdf - application/vnd.openxmlformats-officedocument.spreadsheetml.sheet - application/pdf - application/x-stata-syntax - application/x-stata - application/x-stata - application/zip - application/pdf - application/pdf - - 1.2 - - - CC0 1.0 - - - Using a randomized field experiment in India, we evaluate the effectiveness of adult literacy and parental involvement interventions in improving children's learning. Households were assigned to receive either adult literacy (language and math) classes for mothers, training for mothers on how to enhance their children's learning at home, or a combination of the two programs. All three interventions had significant but modest impacts on childrens math scores. The interventions also increased mothers' test scores in both language and math, as well as a range of other outcomes reflecting greater involvement of mothers in their children's education. - Stata, 13 - - - - A point on the globe - - - - - 3ie, Nike Foundation - 0987 - OW1/1012 (3ie) - - - - -
\ No newline at end of file + + +
+ doi:10.7910/DVN/19PPE7 + 2022-03-26T06:04:55Z + Jameel_Poverty_Action_Lab + IQSS +
+ + + 10.7910/DVN/19PPE7 + + + Banerji, Rukmini + Rukmini + Banerji + Pratham and ASER Centre + 0000-0000-0000-0000 + + + Berry, James + James + Berry + University of Delaware + 0000-0000-0000-0001 + + + Shotland, Marc + Marc + Shotland + Abdul Latif Jameel Poverty Action Lab + 0000-0000-0000-0002 + + + + The Impact of Maternal Literacy and Participation Programs + An Alternative Title + Baseline Data + + Harvard Dataverse + 2017 + + Social Sciences + Educational materials + Adult education, education inputs, field experiments + Education + + + https://zenodo.org/record/5524465 + + + + Banerji, Rukmini + Rukmini + Banerji + 0000-0000-0000-0000 + Pratham and ASER Centre + + + + 2017-02-27 + 2019-06-24 + 2007-01-01/2007-02-28 + + Survey Data + + 10.1257/app.20150390 + 10.5281/zenodo.5524464 + 1234567.5524464 + 1234567.5524464 + https://zenodo.org/communities/astronomy-general + + en_US + + 124903 + 48958 + 199070 + 186674 + 139605 + 97304 + 9907 + 178534602 + 4032103 + 43589 + 15697 + + + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet + application/pdf + application/pdf + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet + application/pdf + application/x-stata-syntax + application/x-stata + application/x-stata + application/zip + application/pdf + application/pdf + + 1.2 + + + CC0 1.0 + + + Using a randomized field experiment in India, we evaluate the effectiveness of adult literacy and parental involvement interventions in improving children's learning. Households were assigned to receive either adult literacy (language and math) classes for mothers, training for mothers on how to enhance their children's learning at home, or a combination of the two programs. All three interventions had significant but modest impacts on childrens math scores. The interventions also increased mothers' test scores in both language and math, as well as a range of other outcomes reflecting greater involvement of mothers in their children's education. + Stata, 13 + + + + A point on the globe + + + + + 3ie, Nike Foundation + 0987 + OW1/1012 (3ie) + + + + +
+
\ No newline at end of file diff --git a/tests/sources/xml/test_datacite.py b/tests/sources/xml/test_datacite.py index f8e7c01..26a5710 100644 --- a/tests/sources/xml/test_datacite.py +++ b/tests/sources/xml/test_datacite.py @@ -1,3 +1,5 @@ +from bs4 import BeautifulSoup + from transmogrifier.models import ( AlternateTitle, Contributor, @@ -17,6 +19,25 @@ from transmogrifier.sources.xml.datacite import Datacite +def create_datacite_source_record_stub(xml_insert: str) -> BeautifulSoup: + xml_string = f""" + + + + + {xml_insert} + + + + + """ + return BeautifulSoup(xml_string, "xml") + + def test_datacite_transform_with_all_fields_transforms_correctly( datacite_record_all_fields, ): @@ -359,6 +380,136 @@ def test_datacite_with_attribute_and_subfield_variations_transforms_correctly(): ) +def test_get_alternate_titles_success(): + source_record = create_datacite_source_record_stub( + """ + + The Impact of Maternal Literacy and Participation Programs + An Alternative Title + Baseline Data + + """ + ) + assert Datacite.get_alternate_titles(source_record) == [ + AlternateTitle(value="An Alternative Title", kind="AlternativeTitle"), + AlternateTitle(value="Baseline Data", kind="Subtitle"), + ] + + +def test_get_alternate_titles_transforms_correctly_if_fields_blank( + datacite_record_optional_fields_blank, +): + assert Datacite.get_alternate_titles(datacite_record_optional_fields_blank) == [] + + +def test_get_alternate_titles_transforms_correctly_if_fields_missing( + datacite_record_optional_fields_missing, +): + assert Datacite.get_alternate_titles(datacite_record_optional_fields_missing) == [] + + +def test_get_content_type_success(): + source_record = create_datacite_source_record_stub( + """ + Survey Data + """ + ) + assert Datacite.get_content_type(source_record, "abc123") == ["Dataset"] + + +def test_get_content_type_transforms_correctly_if_fields_blank( + datacite_record_optional_fields_blank, +): + assert ( + Datacite.get_content_type(datacite_record_optional_fields_blank, "abc123") == [] + ) + + +def test_get_content_type_transforms_correctly_if_fields_missing( + datacite_record_optional_fields_missing, +): + assert ( + Datacite.get_content_type(datacite_record_optional_fields_missing, "abc123") == [] + ) + + +def test_get_contributors_success(): + source_record = create_datacite_source_record_stub( + """ + + + Banerji, Rukmini + Rukmini + Banerji + Pratham and ASER Centre + 0000-0000-0000-0000 + + + Berry, James + James + Berry + University of Delaware + 0000-0000-0000-0001 + + + Shotland, Marc + Marc + Shotland + Abdul Latif Jameel Poverty Action Lab + 0000-0000-0000-0002 + + + + + Banerji, Rukmini + Rukmini + Banerji + 0000-0000-0000-0000 + Pratham and ASER Centre + + + """ + ) + assert Datacite.get_contributors(source_record) == [ + Contributor( + value="Banerji, Rukmini", + affiliation=["Pratham and ASER Centre"], + identifier=["https://orcid.org/0000-0000-0000-0000"], + kind="Creator", + ), + Contributor( + value="Berry, James", + affiliation=["University of Delaware"], + identifier=["0000-0000-0000-0001"], + kind="Creator", + ), + Contributor( + value="Shotland, Marc", + affiliation=["Abdul Latif Jameel Poverty Action Lab"], + identifier=["0000-0000-0000-0002"], + kind="Creator", + ), + Contributor( + value="Banerji, Rukmini", + affiliation=["Pratham and ASER Centre"], + identifier=["https://orcid.org/0000-0000-0000-0000"], + kind="ContactPerson", + ), + ] + + +def test_get_contributors_transforms_correctly_if_fields_blank( + datacite_record_optional_fields_blank, +): + assert Datacite.get_contributors(datacite_record_optional_fields_blank) == [] + + +def test_get_contributors_transforms_correctly_if_fields_missing( + datacite_record_optional_fields_missing, +): + assert Datacite.get_contributors(datacite_record_optional_fields_missing) == [] + + def test_generate_name_identifier_url_orcid_scheme(datacite_record_all_fields): assert next(datacite_record_all_fields).contributors[0].identifier == [ "https://orcid.org/0000-0000-0000-0000" diff --git a/transmogrifier/sources/xml/datacite.py b/transmogrifier/sources/xml/datacite.py index dd82147..7a58df9 100644 --- a/transmogrifier/sources/xml/datacite.py +++ b/transmogrifier/sources/xml/datacite.py @@ -1,4 +1,5 @@ import logging +from collections.abc import Iterator from bs4 import Tag # type: ignore[import-untyped] @@ -13,100 +14,34 @@ class Datacite(XMLTransformer): """Datacite transformer.""" - def get_optional_fields(self, xml: Tag) -> dict | None: + def get_optional_fields(self, source_record: Tag) -> dict | None: """ Retrieve optional TIMDEX fields from a Datacite XML record. Overrides metaclass get_optional_fields() method. Args: - xml: A BeautifulSoup Tag representing a single Datacite record in + source_record: A BeautifulSoup Tag representing a single Datacite record in oai_datacite XML. """ fields: dict = {} - source_record_id = self.get_source_record_id(xml) + source_record_id = self.get_source_record_id(source_record) # alternate_titles - for alternate_title in [ - t for t in xml.find_all("title", string=True) if t.get("titleType") - ]: - fields.setdefault("alternate_titles", []).append( - timdex.AlternateTitle( - value=alternate_title.string, - kind=alternate_title["titleType"], - ) - ) - # If the record has more than one main title, add extras to alternate_titles - for index, title in enumerate(self.get_main_titles(xml)): - if index > 0: - fields.setdefault("alternate_titles", []).append( - timdex.AlternateTitle(value=title) - ) + fields["alternate_titles"] = self.get_alternate_titles(source_record) or None # content_type - if resource_type := xml.metadata.find("resourceType"): - if resource_type.string: - fields["notes"] = [ - timdex.Note( - value=[resource_type.string], kind="Datacite resource type" - ) - ] - if content_type := resource_type.get("resourceTypeGeneral"): - if self.valid_content_types([content_type]): - fields["content_type"] = [content_type] - else: - message = f'Record skipped based on content type: "{content_type}"' - raise SkippedRecordEvent(message, source_record_id) - else: - logger.warning( - "Datacite record %s missing required Datacite field resourceType", - source_record_id, - ) + fields["content_type"] = ( + self.get_content_type(source_record, source_record_id) or None + ) # contributors - for creator in xml.metadata.find_all("creator"): - if creator_name := creator.find("creatorName", string=True): - fields.setdefault("contributors", []).append( - timdex.Contributor( - value=creator_name.string, - affiliation=[ - a.string for a in creator.find_all("affiliation", string=True) - ] - or None, - identifier=[ - self.generate_name_identifier_url(name_identifier) - for name_identifier in creator.find_all( - "nameIdentifier", string=True - ) - ] - or None, - kind="Creator", - ) - ) - - for contributor in xml.metadata.find_all("contributor"): - if contributor_name := contributor.find("contributorName", string=True): - fields.setdefault("contributors", []).append( - timdex.Contributor( - value=contributor_name.string, - affiliation=[ - a.string - for a in contributor.find_all("affiliation", string=True) - ] - or None, - identifier=[ - self.generate_name_identifier_url(name_identifier) - for name_identifier in contributor.find_all( - "nameIdentifier", string=True - ) - ] - or None, - kind=contributor.get("contributorType") or "Not specified", - ) - ) + fields["contributors"] = self.get_contributors(source_record) or None # dates - if publication_year := xml.metadata.find("publicationYear", string=True): + if publication_year := source_record.metadata.find( + "publicationYear", string=True + ): publication_year = str(publication_year.string.strip()) if validate_date( publication_year, @@ -121,7 +56,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: source_record_id, ) - for date in xml.metadata.find_all("date"): + for date in source_record.metadata.find_all("date"): d = timdex.Date() if date_value := date.string: date_value = str(date_value) @@ -153,19 +88,19 @@ def get_optional_fields(self, xml: Tag) -> dict | None: fields.setdefault("dates", []).append(d) # edition - if edition := xml.metadata.find("version", string=True): + if edition := source_record.metadata.find("version", string=True): fields["edition"] = edition.string # file_formats fields["file_formats"] = [ - f.string for f in xml.metadata.find_all("format", string=True) + f.string for f in source_record.metadata.find_all("format", string=True) ] or None # format fields["format"] = "electronic resource" # funding_information - for funding_reference in xml.metadata.find_all("fundingReference"): + for funding_reference in source_record.metadata.find_all("fundingReference"): f = timdex.Funder() if funder_name := funding_reference.find("funderName", string=True): f.funder_name = funder_name.string @@ -183,14 +118,14 @@ def get_optional_fields(self, xml: Tag) -> dict | None: fields.setdefault("funding_information", []).append(f) # identifiers - if identifier_xml := xml.metadata.find("identifier", string=True): + if identifier_xml := source_record.metadata.find("identifier", string=True): fields.setdefault("identifiers", []).append( timdex.Identifier( value=identifier_xml.string, kind=identifier_xml.get("identifierType") or "Not specified", ) ) - for alternate_identifier in xml.metadata.find_all( + for alternate_identifier in source_record.metadata.find_all( "alternateIdentifier", string=True ): fields.setdefault("identifiers", []).append( @@ -201,7 +136,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) ) - related_identifiers = xml.metadata.find_all("relatedIdentifier", string=True) + related_identifiers = source_record.metadata.find_all( + "relatedIdentifier", string=True + ) for related_identifier in [ ri for ri in related_identifiers if ri.get("relationType") == "IsIdenticalTo" ]: @@ -213,7 +150,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) # language - if language := xml.metadata.find("language", string=True): + if language := source_record.metadata.find("language", string=True): fields["languages"] = [language.string] # links @@ -226,13 +163,20 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ] # locations - for location in xml.metadata.find_all("geoLocationPlace", string=True): + for location in source_record.metadata.find_all("geoLocationPlace", string=True): fields.setdefault("locations", []).append( timdex.Location(value=location.string) ) # notes - descriptions = xml.metadata.find_all("description", string=True) + if resource_type := source_record.metadata.find("resourceType", string=True): + fields.setdefault("notes", []).append( + timdex.Note( + value=[str(resource_type.string)], + kind="Datacite resource type", + ) + ) + descriptions = source_record.metadata.find_all("description", string=True) for description in descriptions: if "descriptionType" not in description.attrs: logger.warning( @@ -249,7 +193,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: ) # publishers - if publisher := xml.metadata.find("publisher", string=True): + if publisher := source_record.metadata.find("publisher", string=True): fields["publishers"] = [timdex.Publisher(name=publisher.string)] else: logger.warning( @@ -271,7 +215,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # rights for right in [ - r for r in xml.metadata.find_all("rights") if r.string or r.get("rightsURI") + r + for r in source_record.metadata.find_all("rights") + if r.string or r.get("rightsURI") ]: fields.setdefault("rights", []).append( timdex.Rights( @@ -281,7 +227,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # subjects subjects_dict: dict[str, list[str]] = {} - for subject in xml.metadata.find_all("subject", string=True): + for subject in source_record.metadata.find_all("subject", string=True): if not subject.get("subjectScheme"): subjects_dict.setdefault("Subject scheme not provided", []).append( subject.string @@ -302,20 +248,113 @@ def get_optional_fields(self, xml: Tag) -> dict | None: return fields @classmethod - def get_main_titles(cls, xml: Tag) -> list[str]: + def get_alternate_titles(cls, source_record: Tag) -> list[timdex.AlternateTitle]: + alternate_titles = [] + alternate_titles.extend( + [ + timdex.AlternateTitle( + value=str(title.string.strip()), + kind=title["titleType"], + ) + for title in source_record.find_all("title", string=True) + if title.get("titleType") + ] + ) + alternate_titles.extend(list(cls._get_additional_titles(source_record))) + return alternate_titles + + @classmethod + def _get_additional_titles( + cls, source_record: Tag + ) -> Iterator[timdex.AlternateTitle]: + """Get additional titles from get_main_titles method.""" + for index, title in enumerate(cls.get_main_titles(source_record)): + if index > 0: + yield timdex.AlternateTitle(value=title) + + @classmethod + def get_content_type(cls, source_record: Tag, source_record_id: str) -> list[str]: + content_types = [] + if resource_type := source_record.metadata.find("resourceType"): + if content_type := resource_type.get("resourceTypeGeneral"): + if cls.valid_content_types([content_type]): + content_types.append(str(content_type)) + else: + message = f'Record skipped based on content type: "{content_type}"' + raise SkippedRecordEvent(message, source_record_id) + else: + logger.warning( + "Datacite record %s missing required Datacite field resourceType", + source_record_id, + ) + return content_types + + @classmethod + def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor]: + contributors = [] + contributors.extend(list(cls._get_creators(source_record))) + contributors.extend(list(cls._get_contributors(source_record))) + return contributors + + @classmethod + def _get_creators(cls, source_record: Tag) -> Iterator[timdex.Contributor]: + for creator in source_record.metadata.find_all("creator"): + if creator_name := creator.find("creatorName", string=True): + yield timdex.Contributor( + value=str(creator_name.string), + affiliation=[ + str(affiliation.string) + for affiliation in creator.find_all("affiliation", string=True) + ] + or None, + identifier=[ + cls.generate_name_identifier_url(name_identifier) + for name_identifier in creator.find_all( + "nameIdentifier", string=True + ) + ] + or None, + kind="Creator", + ) + + @classmethod + def _get_contributors(cls, source_record: Tag) -> Iterator[timdex.Contributor]: + for contributor in source_record.metadata.find_all("contributor"): + if contributor_name := contributor.find("contributorName", string=True): + yield timdex.Contributor( + value=contributor_name.string, + affiliation=[ + str(affiliation.string) + for affiliation in contributor.find_all( + "affiliation", string=True + ) + ] + or None, + identifier=[ + cls.generate_name_identifier_url(name_identifier) + for name_identifier in contributor.find_all( + "nameIdentifier", string=True + ) + ] + or None, + kind=contributor.get("contributorType") or "Not specified", + ) + + @classmethod + def get_main_titles(cls, source_record: Tag) -> list[str]: """ Retrieve main title(s) from a Datacite XML record. Overrides metaclass get_main_titles() method. Args: - xml: A BeautifulSoup Tag representing a single Datacite record in + source_record: A BeautifulSoup Tag representing a single Datacite record in oai_datacite XML. """ return [ - t.string - for t in xml.metadata.find_all("title", string=True) - if not t.get("titleType") + str(title.string) + for title in source_record.metadata.find_all("title", string=True) + if not title.get("titleType") ] @classmethod From 6849eae5518a6961ea3df5af43c52a868bc7105c Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Thu, 23 May 2024 08:57:11 -0400 Subject: [PATCH 2/4] Updates based on discussion in PR #178 * Shift "or None" logic to field methods and update type hinting * Update unit tests to all use stub records * Remove unnecessary fixtures from conftest.py --- tests/conftest.py | 16 ------- tests/sources/xml/test_datacite.py | 58 ++++++++++++++------------ transmogrifier/sources/xml/datacite.py | 24 ++++++----- 3 files changed, 44 insertions(+), 54 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 1c2413d..2785419 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -76,22 +76,6 @@ def datacite_record_all_fields(): return Datacite("cool-repo", source_records) -@pytest.fixture -def datacite_record_optional_fields_blank(): - source_records = XMLTransformer.parse_source_file( - "tests/fixtures/datacite/datacite_record_optional_fields_blank.xml" - ) - return next(source_records) - - -@pytest.fixture -def datacite_record_optional_fields_missing(): - source_records = XMLTransformer.parse_source_file( - "tests/fixtures/datacite/datacite_record_optional_fields_missing.xml" - ) - return next(source_records) - - # marc ########################## diff --git a/tests/sources/xml/test_datacite.py b/tests/sources/xml/test_datacite.py index 26a5710..27cec60 100644 --- a/tests/sources/xml/test_datacite.py +++ b/tests/sources/xml/test_datacite.py @@ -396,16 +396,16 @@ def test_get_alternate_titles_success(): ] -def test_get_alternate_titles_transforms_correctly_if_fields_blank( - datacite_record_optional_fields_blank, -): - assert Datacite.get_alternate_titles(datacite_record_optional_fields_blank) == [] +def test_get_alternate_titles_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub( + '' + ) + assert Datacite.get_alternate_titles(source_record) is None -def test_get_alternate_titles_transforms_correctly_if_fields_missing( - datacite_record_optional_fields_missing, -): - assert Datacite.get_alternate_titles(datacite_record_optional_fields_missing) == [] +def test_get_alternate_titles_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub("") + assert Datacite.get_alternate_titles(source_record) is None def test_get_content_type_success(): @@ -417,20 +417,18 @@ def test_get_content_type_success(): assert Datacite.get_content_type(source_record, "abc123") == ["Dataset"] -def test_get_content_type_transforms_correctly_if_fields_blank( - datacite_record_optional_fields_blank, -): - assert ( - Datacite.get_content_type(datacite_record_optional_fields_blank, "abc123") == [] +def test_get_content_type_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub( + """ + + """ ) + assert Datacite.get_content_type(source_record, "abc123") is None -def test_get_content_type_transforms_correctly_if_fields_missing( - datacite_record_optional_fields_missing, -): - assert ( - Datacite.get_content_type(datacite_record_optional_fields_missing, "abc123") == [] - ) +def test_get_content_type_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub("") + assert Datacite.get_content_type(source_record, "abc123") is None def test_get_contributors_success(): @@ -498,16 +496,22 @@ def test_get_contributors_success(): ] -def test_get_contributors_transforms_correctly_if_fields_blank( - datacite_record_optional_fields_blank, -): - assert Datacite.get_contributors(datacite_record_optional_fields_blank) == [] +def test_get_contributors_transforms_correctly_if_fields_blank(): + source_record = create_datacite_source_record_stub( + """ + + + + + + """ + ) + assert Datacite.get_contributors(source_record) is None -def test_get_contributors_transforms_correctly_if_fields_missing( - datacite_record_optional_fields_missing, -): - assert Datacite.get_contributors(datacite_record_optional_fields_missing) == [] +def test_get_contributors_transforms_correctly_if_fields_missing(): + source_record = create_datacite_source_record_stub("") + assert Datacite.get_contributors(source_record) is None def test_generate_name_identifier_url_orcid_scheme(datacite_record_all_fields): diff --git a/transmogrifier/sources/xml/datacite.py b/transmogrifier/sources/xml/datacite.py index 7a58df9..8daf56e 100644 --- a/transmogrifier/sources/xml/datacite.py +++ b/transmogrifier/sources/xml/datacite.py @@ -28,15 +28,13 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: source_record_id = self.get_source_record_id(source_record) # alternate_titles - fields["alternate_titles"] = self.get_alternate_titles(source_record) or None + fields["alternate_titles"] = self.get_alternate_titles(source_record) # content_type - fields["content_type"] = ( - self.get_content_type(source_record, source_record_id) or None - ) + fields["content_type"] = self.get_content_type(source_record, source_record_id) # contributors - fields["contributors"] = self.get_contributors(source_record) or None + fields["contributors"] = self.get_contributors(source_record) # dates if publication_year := source_record.metadata.find( @@ -248,7 +246,9 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: return fields @classmethod - def get_alternate_titles(cls, source_record: Tag) -> list[timdex.AlternateTitle]: + def get_alternate_titles( + cls, source_record: Tag + ) -> list[timdex.AlternateTitle] | None: alternate_titles = [] alternate_titles.extend( [ @@ -261,7 +261,7 @@ def get_alternate_titles(cls, source_record: Tag) -> list[timdex.AlternateTitle] ] ) alternate_titles.extend(list(cls._get_additional_titles(source_record))) - return alternate_titles + return alternate_titles or None @classmethod def _get_additional_titles( @@ -273,7 +273,9 @@ def _get_additional_titles( yield timdex.AlternateTitle(value=title) @classmethod - def get_content_type(cls, source_record: Tag, source_record_id: str) -> list[str]: + def get_content_type( + cls, source_record: Tag, source_record_id: str + ) -> list[str] | None: content_types = [] if resource_type := source_record.metadata.find("resourceType"): if content_type := resource_type.get("resourceTypeGeneral"): @@ -287,14 +289,14 @@ def get_content_type(cls, source_record: Tag, source_record_id: str) -> list[str "Datacite record %s missing required Datacite field resourceType", source_record_id, ) - return content_types + return content_types or None @classmethod - def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor]: + def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None: contributors = [] contributors.extend(list(cls._get_creators(source_record))) contributors.extend(list(cls._get_contributors(source_record))) - return contributors + return contributors or None @classmethod def _get_creators(cls, source_record: Tag) -> Iterator[timdex.Contributor]: From bdb2c4bd95c930683f7a5b6c2499fc88fe03eebb Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Thu, 23 May 2024 12:04:14 -0400 Subject: [PATCH 3/4] Further updates based on discussion in PR #178 * Add default value to create_datacite_source_record_stub's xml_insert param * Replace calls of source_record_id variable with calls of get_source_record_id method inside field methods --- tests/sources/xml/test_datacite.py | 17 ++++++++++------- transmogrifier/sources/xml/datacite.py | 12 ++++++------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/sources/xml/test_datacite.py b/tests/sources/xml/test_datacite.py index 27cec60..a5d4971 100644 --- a/tests/sources/xml/test_datacite.py +++ b/tests/sources/xml/test_datacite.py @@ -19,11 +19,14 @@ from transmogrifier.sources.xml.datacite import Datacite -def create_datacite_source_record_stub(xml_insert: str) -> BeautifulSoup: +def create_datacite_source_record_stub(xml_insert: str = "") -> BeautifulSoup: xml_string = f""" +
+ abc123 +
Survey Data """ ) - assert Datacite.get_content_type(source_record, "abc123") == ["Dataset"] + assert Datacite.get_content_type(source_record) == ["Dataset"] def test_get_content_type_transforms_correctly_if_fields_blank(): @@ -423,12 +426,12 @@ def test_get_content_type_transforms_correctly_if_fields_blank(): """ ) - assert Datacite.get_content_type(source_record, "abc123") is None + assert Datacite.get_content_type(source_record) is None def test_get_content_type_transforms_correctly_if_fields_missing(): - source_record = create_datacite_source_record_stub("") - assert Datacite.get_content_type(source_record, "abc123") is None + source_record = create_datacite_source_record_stub() + assert Datacite.get_content_type(source_record) is None def test_get_contributors_success(): @@ -510,7 +513,7 @@ def test_get_contributors_transforms_correctly_if_fields_blank(): def test_get_contributors_transforms_correctly_if_fields_missing(): - source_record = create_datacite_source_record_stub("") + source_record = create_datacite_source_record_stub() assert Datacite.get_contributors(source_record) is None diff --git a/transmogrifier/sources/xml/datacite.py b/transmogrifier/sources/xml/datacite.py index 8daf56e..89abb48 100644 --- a/transmogrifier/sources/xml/datacite.py +++ b/transmogrifier/sources/xml/datacite.py @@ -31,7 +31,7 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: fields["alternate_titles"] = self.get_alternate_titles(source_record) # content_type - fields["content_type"] = self.get_content_type(source_record, source_record_id) + fields["content_type"] = self.get_content_type(source_record) # contributors fields["contributors"] = self.get_contributors(source_record) @@ -273,9 +273,7 @@ def _get_additional_titles( yield timdex.AlternateTitle(value=title) @classmethod - def get_content_type( - cls, source_record: Tag, source_record_id: str - ) -> list[str] | None: + def get_content_type(cls, source_record: Tag) -> list[str] | None: content_types = [] if resource_type := source_record.metadata.find("resourceType"): if content_type := resource_type.get("resourceTypeGeneral"): @@ -283,11 +281,13 @@ def get_content_type( content_types.append(str(content_type)) else: message = f'Record skipped based on content type: "{content_type}"' - raise SkippedRecordEvent(message, source_record_id) + raise SkippedRecordEvent( + message, cls.get_source_record_id(source_record) + ) else: logger.warning( "Datacite record %s missing required Datacite field resourceType", - source_record_id, + cls.get_source_record_id(source_record), ) return content_types or None From 861b3d03f1418dbcd27e0874cfd0cdf1ab3b6fdd Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Fri, 24 May 2024 10:52:39 -0400 Subject: [PATCH 4/4] Update get_alternate_titles method --- transmogrifier/sources/xml/datacite.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/transmogrifier/sources/xml/datacite.py b/transmogrifier/sources/xml/datacite.py index 89abb48..ec99f6b 100644 --- a/transmogrifier/sources/xml/datacite.py +++ b/transmogrifier/sources/xml/datacite.py @@ -249,17 +249,14 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: def get_alternate_titles( cls, source_record: Tag ) -> list[timdex.AlternateTitle] | None: - alternate_titles = [] - alternate_titles.extend( - [ - timdex.AlternateTitle( - value=str(title.string.strip()), - kind=title["titleType"], - ) - for title in source_record.find_all("title", string=True) - if title.get("titleType") - ] - ) + alternate_titles = [ + timdex.AlternateTitle( + value=str(title.string.strip()), + kind=title["titleType"], + ) + for title in source_record.find_all("title", string=True) + if title.get("titleType") + ] alternate_titles.extend(list(cls._get_additional_titles(source_record))) return alternate_titles or None