Skip to content

Commit

Permalink
Merge pull request #112 from MITLibraries/GDT-54-workarounds-and-bug-…
Browse files Browse the repository at this point in the history
…fixes

GDT-54 Address mismatches with OpenSearch mapping
  • Loading branch information
ghukill authored Jan 8, 2024
2 parents a941359 + 6a97912 commit cbddb32
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 22 deletions.
4 changes: 2 additions & 2 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
13 changes: 7 additions & 6 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields):
timdex.Date(kind="Coverage", value="1945"),
timdex.Date(kind="Coverage", value="1946"),
timdex.Date(
kind="Coverage",
range=timdex.Date_Range(gte="1943", lte="1946"),
),
]
Expand All @@ -97,7 +98,7 @@ def test_parse_solr_date_range_invalid_date_range_string_raises_error():

def test_aardvark_get_identifiers_success(aardvark_record_all_fields):
assert MITAardvark.get_identifiers(next(aardvark_record_all_fields)) == [
timdex.Identifier(value="abc123")
timdex.Identifier(value="abc123", kind="Not specified")
]


Expand Down Expand Up @@ -129,11 +130,11 @@ def test_aardvark_get_links_logs_warning_for_invalid_json(caplog):
)


def test_aardvark_get_locations_success(aardvark_record_all_fields):
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [
timdex.Location(kind="Bounding Box", geodata=[-111.1, -104.0, 45.0, 40.9]),
timdex.Location(kind="Geometry", geodata=[-111.1, -104.0, 45.0, 40.9]),
]
def test_aardvark_get_locations_success(caplog, aardvark_record_all_fields):
    """Check that get_locations() returns no locations and logs unmapped fields.

    While geometry mapping is a work in progress, MITAardvark.get_locations()
    is expected to return an empty list and emit a DEBUG message for each
    geometry field it finds in the source record.
    """
    caplog.set_level("DEBUG")

    # Call the method first: the log capture is only populated once it has run.
    assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == []

    # A bare string literal in an assert is always truthy, so the messages must
    # be checked for membership in the captured log output instead.
    # NOTE(review): assumes the fixture record carries both 'dcat_bbox' and
    # 'locn_geometry' (the prior version of this test expected a Location for
    # each) -- confirm against the fixture.
    assert (
        "Geometry field 'dcat_bbox' found, but currently not mapped."
        in caplog.text
    )
    assert (
        "Geometry field 'locn_geometry' found, but currently not mapped."
        in caplog.text
    )


def test_aardvark_get_notes_success(aardvark_record_all_fields):
Expand Down
29 changes: 15 additions & 14 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import re

import transmogrifier.models as timdex
from transmogrifier.helpers import parse_geodata_string
from transmogrifier.sources.transformer import JSON, JSONTransformer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -228,9 +227,10 @@ def _range_dates(
)
range_dates.append(
timdex.Date(
kind="Coverage",
range=timdex.Date_Range(
gte=date_range_values[0], lte=date_range_values[1]
)
),
)
)
return range_dates
Expand Down Expand Up @@ -262,7 +262,7 @@ def parse_solr_date_range_string(
def get_identifiers(source_record: dict) -> list[timdex.Identifier]:
"""Get values from source record for TIMDEX identifiers field."""
return [
timdex.Identifier(value=identifier_value)
timdex.Identifier(value=identifier_value, kind="Not specified")
for identifier_value in source_record.get("dct_identifier_sm", [])
]

Expand Down Expand Up @@ -292,8 +292,13 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
def get_locations(
source_record: dict, source_record_id: str
) -> list[timdex.Location]:
"""Get values from source record for TIMDEX locations field."""
locations = []
"""Get values from source record for TIMDEX locations field.
WIP: Currently in the process of determining our approach for storing geographic
geometry data in the TIMDEX record and how this dovetails with the OpenSearch
mapping. At this time, this method returns an empty list of Locations.
"""
locations: list[timdex.Location] = []

aardvark_location_fields = {
"dcat_bbox": "Bounding Box",
Expand All @@ -303,15 +308,11 @@ def get_locations(
if aardvark_location_field not in source_record:
continue
try:
if geodata_points := parse_geodata_string(
source_record[aardvark_location_field], source_record_id
):
locations.append(
timdex.Location(
geodata=geodata_points,
kind=kind_value,
)
)
message = (
f"Geometry field '{aardvark_location_field}' found, but "
f"currently not mapped."
)
logger.debug(message)
except ValueError as exception:
logger.warning(exception)
return locations
Expand Down

0 comments on commit cbddb32

Please sign in to comment.