Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Timx 291 orchestration #205

Merged
merged 15 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ def oai_pmh_records():
return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml")


# timdex ##########################


@pytest.fixture
def timdex_record_required_fields():
return timdex.TimdexRecord(
Expand Down
60 changes: 23 additions & 37 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ def test_mitaardvark_transform_and_write_output_files_writes_output_files(
tmp_path, aardvark_records
):
output_file = str(tmp_path / "output_file.json")
transformer = MITAardvark("cool-repo", aardvark_records)
aardvark = MITAardvark("cool-repo", aardvark_records)
assert not Path(tmp_path / "output_file.json").exists()
assert not Path(tmp_path / "output_file.txt").exists()
transformer.transform_and_write_output_files(output_file)
aardvark.transform_and_write_output_files(output_file)
assert Path(tmp_path / "output_file.json").exists()
assert Path(tmp_path / "output_file.txt").exists()

Expand All @@ -27,25 +27,15 @@ def test_mitaardvark_transform_and_write_output_files_no_txt_file_if_not_needed(
tmp_path, aardvark_record_all_fields
):
output_file = str(tmp_path / "output_file.json")
transformer = MITAardvark("cool-repo", aardvark_record_all_fields)
transformer.transform_and_write_output_files(output_file)
aardvark = MITAardvark("cool-repo", aardvark_record_all_fields)
aardvark.transform_and_write_output_files(output_file)
assert len(list(tmp_path.iterdir())) == 1
assert next(tmp_path.iterdir()).name == "output_file.json"


def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert transformer.get_required_fields(next(aardvark_records)) == {
"source": "A Cool Repository",
"source_link": "https://geodata.libraries.mit.edu/record/abc:123",
"timdex_record_id": "cool-repo:123",
"title": "Test title 1",
}


def test_aardvark_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
aardvark = MITAardvark("cool-repo", aardvark_records)
assert next(aardvark) == timdex.TimdexRecord(
source="A Cool Repository",
source_link="https://geodata.libraries.mit.edu/record/abc:123",
timdex_record_id="cool-repo:123",
Expand All @@ -72,40 +62,31 @@ def test_aardvark_get_main_titles_success():
def test_aardvark_record_get_source_link_success():
source_record = create_aardvark_source_record_stub()
url_from_source_record = "https://geodata.libraries.mit.edu/record/abc:123"
source_record["gbl_suppressed_b"] = False
source_record["dct_references_s"] = json.dumps(
{"http://schema.org/url": url_from_source_record}
)
assert (
MITAardvark.get_source_link(
"None",
"abc:123",
source_record,
)
== url_from_source_record
)
aardvark = MITAardvark("cool-repo", iter([source_record]))
assert aardvark.get_source_link(source_record) == url_from_source_record


def test_aardvark_record_get_source_link_bad_dct_references_s_raises_error():
source_record = create_aardvark_source_record_stub()
source_record["dct_references_s"] = json.dumps(
{"missing data": "from aardvark from geoharvester"}
)
aardvark = MITAardvark("cool-repo", iter([source_record]))
with pytest.raises(
ValueError,
match="Could not locate a kind=Website link to pull the source link from.",
):
MITAardvark.get_source_link(
"None",
"abc:123",
source_record,
)
aardvark.get_source_link(source_record)


def test_aardvark_record_get_timdex_record_id_success():
source_record = create_aardvark_source_record_stub()
assert (
MITAardvark.get_timdex_record_id("source", "123", source_record) == "source:123"
)
aardvark = MITAardvark("cool-repo", iter([source_record]))
assert aardvark.get_timdex_record_id(source_record) == "cool-repo:123"


def test_aardvark_get_source_record_id_success():
Expand Down Expand Up @@ -532,7 +513,8 @@ def test_aardvark_get_rights_success():
"The person with the rights",
"Another person with the rights",
]
assert MITAardvark.get_rights(source_record, "source") == [
aardvark = MITAardvark("cool-repo", iter([source_record]))
assert aardvark.get_rights(source_record) == [
timdex.Rights(description="Access note", kind="Access rights"),
timdex.Rights(uri="http://license.license"),
timdex.Rights(uri="http://another_license.another_license"),
Expand All @@ -546,7 +528,8 @@ def test_aardvark_get_rights_success():
def test_aardvark_get_rights_mit_restricted_success():
source_record = create_aardvark_source_record_stub()
source_record["dct_accessRights_s"] = "Restricted"
assert MITAardvark.get_rights(source_record, "gismit") == [
aardvark = MITAardvark("gismit", iter([source_record]))
assert aardvark.get_rights(source_record) == [
timdex.Rights(description="Restricted", kind="Access rights"),
timdex.Rights(description="MIT authentication required", kind="Access to files"),
]
Expand All @@ -555,7 +538,8 @@ def test_aardvark_get_rights_mit_restricted_success():
def test_aardvark_get_rights_mit_public_success():
source_record = create_aardvark_source_record_stub()
source_record["dct_accessRights_s"] = "Public"
assert MITAardvark.get_rights(source_record, "gismit") == [
aardvark = MITAardvark("gismit", iter([source_record]))
assert aardvark.get_rights(source_record) == [
timdex.Rights(description="Public", kind="Access rights"),
timdex.Rights(description="no authentication required", kind="Access to files"),
]
Expand All @@ -564,7 +548,8 @@ def test_aardvark_get_rights_mit_public_success():
def test_aardvark_get_rights_external_restricted_success():
source_record = create_aardvark_source_record_stub()
source_record["dct_accessRights_s"] = "Restricted"
assert MITAardvark.get_rights(source_record, "gisogm") == [
aardvark = MITAardvark("gisogm", iter([source_record]))
assert aardvark.get_rights(source_record) == [
timdex.Rights(description="Restricted", kind="Access rights"),
timdex.Rights(
description="unknown: check with owning institution", kind="Access to files"
Expand All @@ -575,7 +560,8 @@ def test_aardvark_get_rights_external_restricted_success():
def test_aardvark_get_rights_external_public_success():
source_record = create_aardvark_source_record_stub()
source_record["dct_accessRights_s"] = "Public"
assert MITAardvark.get_rights(source_record, "gisogm") == [
aardvark = MITAardvark("gisogm", iter([source_record]))
assert aardvark.get_rights(source_record) == [
timdex.Rights(description="Public", kind="Access rights"),
timdex.Rights(
description="unknown: check with owning institution", kind="Access to files"
Expand Down
146 changes: 46 additions & 100 deletions tests/sources/test_transformer.py
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -28,104 +28,50 @@ def test_transformer_get_transformer_source_wrong_module_path_raises_error():
Transformer.get_transformer("bad-module-path")


def test_create_dates_and_locations_from_publishers_success():
fields = {
"timdex_record_id": "abc123",
"publishers": [
timdex.Publisher(name="Publisher", date="2018", location="Location")
],
}
assert Transformer.create_dates_and_locations_from_publishers(fields) == {
"timdex_record_id": "abc123",
"publishers": [
timdex.Publisher(name="Publisher", date="2018", location="Location")
],
"dates": [timdex.Date(kind="Publication date", value="2018")],
"locations": [timdex.Location(value="Location", kind="Place of Publication")],
}


def test_create_dates_and_locations_from_publishers_drops_unparseable_dates(caplog):
def test_create_dates_from_publishers_success(timdex_record_required_fields):
timdex_record_required_fields.publishers = [
timdex.Publisher(name="Publisher", date="2018", location="Location")
]
assert list(
Transformer.create_dates_from_publishers(timdex_record_required_fields)
) == [timdex.Date(kind="Publication date", value="2018")]


def test_create_dates_from_publishers_drops_unparseable_dates(
caplog, timdex_record_required_fields
):
caplog.set_level("DEBUG")
fields = {
"timdex_record_id": "abc123",
"publishers": [
timdex.Publisher(name="Publisher", date="Date", location="Location")
],
}
assert Transformer.create_dates_and_locations_from_publishers(fields) == {
"timdex_record_id": "abc123",
"publishers": [
timdex.Publisher(name="Publisher", date="Date", location="Location")
],
"locations": [timdex.Location(value="Location", kind="Place of Publication")],
}
assert "Record ID 'abc123' has a date that couldn't be parsed: 'Date'" in caplog.text


def test_create_dates_and_locations_from_publishers_when_fields_are_none_success():
fields = {
"timdex_record_id": "abc123",
"publishers": [
timdex.Publisher(name="Publisher", date="2018", location="Location")
],
"dates": None,
"locations": None,
}
assert Transformer.create_dates_and_locations_from_publishers(fields) == {
"timdex_record_id": "abc123",
"publishers": [
timdex.Publisher(name="Publisher", date="2018", location="Location")
],
"dates": [timdex.Date(kind="Publication date", value="2018")],
"locations": [timdex.Location(value="Location", kind="Place of Publication")],
}


def test_create_locations_from_spatial_subjects_success():
fields = {
"subjects": [
timdex.Subject(
value=["Some city, Some country"], kind="Dublin Core; Spatial"
),
timdex.Subject(value=["City 1", "City 2"], kind="Dublin Core; Spatial"),
]
}
assert Transformer.create_locations_from_spatial_subjects(fields) == {
"subjects": [
timdex.Subject(
value=["Some city, Some country"], kind="Dublin Core; Spatial"
),
timdex.Subject(value=["City 1", "City 2"], kind="Dublin Core; Spatial"),
],
"locations": [
timdex.Location(value="Some city, Some country", kind="Place Name"),
timdex.Location(value="City 1", kind="Place Name"),
timdex.Location(value="City 2", kind="Place Name"),
],
}


def test_create_locations_from_spatial_subjects_when_field_is_none_success():
fields = {
"subjects": [
timdex.Subject(
value=["Some city, Some country"], kind="Dublin Core; Spatial"
),
timdex.Subject(value=["City 1", "City 2"], kind="Dublin Core; Spatial"),
],
"locations": None,
}
assert Transformer.create_locations_from_spatial_subjects(fields) == {
"subjects": [
timdex.Subject(
value=["Some city, Some country"], kind="Dublin Core; Spatial"
),
timdex.Subject(value=["City 1", "City 2"], kind="Dublin Core; Spatial"),
],
"locations": [
timdex.Location(value="Some city, Some country", kind="Place Name"),
timdex.Location(value="City 1", kind="Place Name"),
timdex.Location(value="City 2", kind="Place Name"),
],
}
timdex_record_required_fields.publishers = [
timdex.Publisher(name="Publisher", date="Date", location="Location")
]
assert (
list(Transformer.create_dates_from_publishers(timdex_record_required_fields))
== []
)
assert (
"Record ID 'cool-repo:123' has a date that couldn't be parsed: 'Date'"
in caplog.text
)


def test_create_locations_from_publishers_success(timdex_record_required_fields):
timdex_record_required_fields.publishers = [
timdex.Publisher(name="Publisher", date="2018", location="Location")
]
assert list(
Transformer.create_locations_from_publishers(timdex_record_required_fields)
) == [timdex.Location(value="Location", kind="Place of Publication")]


def test_create_locations_from_spatial_subjects_success(timdex_record_required_fields):
timdex_record_required_fields.subjects = [
timdex.Subject(value=["Some city, Some country"], kind="Dublin Core; Spatial"),
timdex.Subject(value=["City 1", "City 2"], kind="Dublin Core; Spatial"),
]
assert list(
Transformer.create_locations_from_spatial_subjects(timdex_record_required_fields)
) == [
timdex.Location(value="Some city, Some country", kind="Place Name"),
timdex.Location(value="City 1", kind="Place Name"),
timdex.Location(value="City 2", kind="Place Name"),
]
26 changes: 0 additions & 26 deletions tests/sources/test_xmltransformer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# ruff: noqa: PLR2004
from pathlib import Path
from unittest.mock import patch

import transmogrifier.models as timdex
from transmogrifier.sources.xml.datacite import Datacite
Expand All @@ -23,21 +22,6 @@ def test_xmltransformer_iterates_through_all_records(oai_pmh_records):
assert len(output_records.deleted_records) == 1


def test_xmltransformer_iterates_successfully_if_get_optional_fields_returns_none(
oai_pmh_records,
):
with patch(
"transmogrifier.sources.xmltransformer.XMLTransformer.get_optional_fields"
) as m:
m.return_value = None
output_records = XMLTransformer("cool-repo", oai_pmh_records)
assert len(list(output_records)) == 0
assert output_records.processed_record_count == 3
assert output_records.skipped_record_count == 2
assert output_records.transformed_record_count == 0
assert len(output_records.deleted_records) == 1


def test_xmltransformer_transform_and_write_output_files_writes_output_files(
tmp_path, oai_pmh_records
):
Expand Down Expand Up @@ -82,16 +66,6 @@ def test_xmltransformer_record_is_deleted_returns_false_if_not_deleted(caplog):
assert XMLTransformer.record_is_deleted(next(source_records)) is False


def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_records):
transformer = XMLTransformer("cool-repo", oai_pmh_records)
assert transformer.get_required_fields(next(oai_pmh_records)) == {
"source": "A Cool Repository",
"source_link": "https://example.com/12345",
"timdex_record_id": "cool-repo:12345",
"title": "Title not provided",
}


def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records):
transformer = XMLTransformer("cool-repo", oai_pmh_records)
assert next(transformer) == timdex.TimdexRecord(
Expand Down
2 changes: 1 addition & 1 deletion tests/sources/xml/test_dspace_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ def test_get_identifiers_transforms_correctly_if_fields_missing():
assert DspaceDim.get_identifiers(source_record) is None


def test_languages_success():
def test_get_languages_success():
source_record = create_dspace_dim_source_record_stub(
"""
<dim:field mdschema="dc" element="language" qualifier="iso">en_US</dim:field>
Expand Down
3 changes: 3 additions & 0 deletions tests/sources/xml/test_marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ def test_marc_record_all_fields_transform_correctly():
),
timdex.Location(value="Austria - Vienna", kind="Hierarchical Place Name"),
timdex.Location(value="New York", kind="Place of Publication"),
timdex.Location(value="New York", kind="Place of Publication"),
timdex.Location(value="France", kind="Place of Publication"),
],
notes=[
timdex.Note(
Expand Down Expand Up @@ -675,6 +677,7 @@ def test_marc_record_attribute_and_subfield_variations_transforms_correctly():
kind="Hierarchical Place Name",
),
timdex.Location(value="a", kind="Place of Publication"),
timdex.Location(value="a", kind="Place of Publication"),
],
notes=[
timdex.Note(value=["c"], kind="Title Statement of Responsibility"),
Expand Down
Loading
Loading