Updates based on discussion in PR #110
* Update aardvark_records fixture to reflect required fields and update the corresponding unit test
* Add gismit and gisogm sources to config.py
* Refactor parse_source_file methods to use smart_open
* Add get_timdex_record_id method for more specific ID processing
* Refactor get_dates to use new private methods for each date type
* Refactor get_locations to improve readability and error handling
* Move the parse_solr_date_range_string function to an MITAardvark class method, along with its corresponding unit tests
ehanson8 committed Jan 3, 2024
1 parent 181bdc5 commit f4126c2
Showing 8 changed files with 124 additions and 77 deletions.
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
@@ -1 +1 @@
{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
4 changes: 2 additions & 2 deletions tests/fixtures/aardvark_records.jsonl
@@ -1,2 +1,2 @@
{"id": "123", "dct_title_s": "Test title 1"}
{"id": "456", "dct_title_s": "Test title 2"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": ""}
18 changes: 18 additions & 0 deletions tests/sources/json/test_aardvark.py
@@ -1,3 +1,5 @@
import pytest

import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import MITAardvark

@@ -21,6 +23,7 @@ def test_aardvark_transform_returns_timdex_record(aardvark_records):
title="Test title 1",
citation="Test title 1. Geospatial data. https://example.com/123",
content_type=["Geospatial data"],
rights=[timdex.Rights(description="Access rights", kind="Access")],
)


@@ -77,6 +80,21 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields):
]


def test_aardvark_parse_solr_date_range_string_success():
assert MITAardvark.parse_solr_date_range_string("[1932 TO 1937]", "123") == (
"1932",
"1937",
)


def test_parse_solr_date_range_invalid_date_range_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse date range string 'Invalid'",
):
MITAardvark.parse_solr_date_range_string("Invalid", "123")


def test_aardvark_get_identifiers_success(aardvark_record_all_fields):
assert MITAardvark.get_identifiers(next(aardvark_record_all_fields)) == [
timdex.Identifier(value="abc123")
13 changes: 0 additions & 13 deletions tests/test_helpers.py
@@ -7,7 +7,6 @@
generate_citation,
parse_date_from_string,
parse_geodata_string,
parse_solr_date_range_string,
validate_date,
validate_date_range,
)
@@ -277,18 +276,6 @@ def test_parse_geodata_string_invalid_geodata_string_raises_error():
parse_geodata_string("Invalid", "123")


def test_parse_solr_date_range_string_success():
assert parse_solr_date_range_string("[[1932 TO 1937]]", "123") == ["1932", "1937"]


def test_parse_solr_date_range_invalid_date_range_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse date range string 'Invalid'",
):
parse_solr_date_range_string("Invalid", "123")


def test_validate_date_success():
assert validate_date("1930", "1234") is True

10 changes: 10 additions & 0 deletions transmogrifier/config.py
@@ -103,6 +103,16 @@
"base-url": "https://libguides.mit.edu/",
"transform-class": "transmogrifier.sources.xml.springshare.SpringshareOaiDc",
},
"gismit": {
"name": "MIT GIS Resources",
"base-url": "XXXX",
"transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
},
"gisogm": {
"name": "OpenGeoMetadata GIS Resources",
"base-url": "XXXX",
"transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark",
},
"researchdatabases": {
"name": "Research Databases",
"base-url": "https://libguides.mit.edu/",
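
For context, the new gismit and gisogm entries follow the same shape as the existing sources: a display name, a base URL (placeholder "XXXX" here), and a dotted path to the transform class. A minimal sketch of how such a dotted "transform-class" string could be resolved into a class at runtime is shown below; the SOURCES dict name and the load_transform_class helper are illustrative assumptions, not the repository's actual loader.

import importlib

# Illustrative config shape mirroring the entries added above.
SOURCES = {
    "gismit": {
        "name": "MIT GIS Resources",
        "transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
    },
    "gisogm": {
        "name": "OpenGeoMetadata GIS Resources",
        "transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark",
    },
}


def load_transform_class(source: str) -> type:
    """Resolve a dotted 'transform-class' string into the class object."""
    module_path, _, class_name = SOURCES[source]["transform-class"].rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)


# load_transform_class("gismit") would return the MITAardvark class.
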
31 changes: 0 additions & 31 deletions transmogrifier/helpers.py
@@ -80,7 +80,6 @@ def parse_geodata_string(geodata_string: str, source_record_id: str) -> list[flo
Example:
- "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"
- "POLYGON((-80 25, -65 18, -64 33, -80 25))"
Args:
geodata_string: Formatted geodata string to parse.
@@ -101,36 +100,6 @@
return geodata_points


def parse_solr_date_range_string(
date_range_string: str, source_record_id: str
) -> list[str]:
"""Get a list of values from a Solr-formatted date range string.
Example:
- "[1943 TO 1946]"
Args:
date_range_string: Formatted date range string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
date_ranges = []
if (
date_range_string.startswith("[")
and date_range_string.endswith("]")
and " TO " in date_range_string
):
date_range_values = date_range_string.split("[")[-1].split("]")[0].split(" TO ")
if [date_range_string] != date_range_values:
date_ranges.extend(date_range_values)
else:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse date range string '{date_range_string}'"
)
raise ValueError(message)
return date_ranges


def validate_date(
date_string: str,
source_record_id: str,
113 changes: 87 additions & 26 deletions transmogrifier/sources/json/aardvark.py
@@ -1,9 +1,10 @@
import json
import logging
import re

import transmogrifier.models as timdex
from transmogrifier.helpers import parse_geodata_string, parse_solr_date_range_string
from transmogrifier.sources.transformer import JSONTransformer
from transmogrifier.helpers import parse_geodata_string
from transmogrifier.sources.transformer import JSON, JSONTransformer

logger = logging.getLogger(__name__)

@@ -29,6 +30,22 @@ def get_main_titles(cls, source_record: dict) -> list[str]:
"""
return [source_record["dct_title_s"]]

@classmethod
def get_timdex_record_id(
cls, source: str, source_record_id: str, source_record: dict[str, JSON]
) -> str:
"""
Class method to set the TIMDEX record id.
Args:
source: Source name.
source_record_id: Record identifier for the source record.
source_record: A JSON object representing a source record.
- not used by default implementation, but could be useful for subclass
overrides
"""
return f"{source}:{source_record_id.replace('/', '-')}"

@classmethod
def get_source_record_id(cls, source_record: dict) -> str:
"""
@@ -37,7 +54,7 @@ def get_source_record_id(cls, source_record: dict) -> str:
Args:
source_record: A JSON object representing a source record.
"""
return source_record["id"]
return source_record["id"].replace("mit:", "").replace("ogm:", "")

@classmethod
def record_is_deleted(cls, source_record: dict) -> bool:
@@ -134,17 +151,29 @@ def get_contributors(source_record: dict) -> list[timdex.Contributor]:
for contributor_value in source_record.get("dct_creator_sm", [])
]

@staticmethod
def get_dates(source_record: dict, source_record_id: str) -> list[timdex.Date]:
@classmethod
def get_dates(cls, source_record: dict, source_record_id: str) -> list[timdex.Date]:
"""Get values from source record for TIMDEX dates field."""
dates = []
return (
cls._issued_dates(source_record)
+ cls._coverage_dates(source_record)
+ cls._range_dates(source_record, source_record_id)
)

@classmethod
def _issued_dates(cls, source_record: dict) -> list[timdex.Date]:
"""Get values for issued dates."""
issued_dates = []
if "dct_issued_s" in source_record:
dates.append(
issued_dates.append(
timdex.Date(value=source_record["dct_issued_s"], kind="Issued")
)
return issued_dates

# logic to remove duplicate entries from the 2 fields that record coverage dates
@classmethod
def _coverage_dates(cls, source_record: dict) -> list[timdex.Date]:
"""Get values for coverage dates."""
coverage_dates = []
coverage_date_values = []
coverage_date_values.extend(source_record.get("dct_temporal_sm", []))
for date_value in [
@@ -154,24 +183,55 @@ def get_dates(source_record: dict, source_record_id: str) -> list[timdex.Date]:
]:
coverage_date_values.append(date_value)
for coverage_date_value in coverage_date_values:
dates.append(timdex.Date(value=coverage_date_value, kind="Coverage"))
coverage_dates.append(
timdex.Date(value=coverage_date_value, kind="Coverage")
)
return coverage_dates

@classmethod
def _range_dates(
cls, source_record: dict, source_record_id: str
) -> list[timdex.Date]:
"""Get values for issued dates."""
range_dates = []
for date_range_string in [
date_range_strings
for date_range_strings in source_record.get("gbl_dateRange_drsim", [])
]:
date_range_values = parse_solr_date_range_string(
date_range_values = cls.parse_solr_date_range_string(
date_range_string, source_record_id
)
dates.append(
range_dates.append(
timdex.Date(
range=timdex.Date_Range(
gte=date_range_values[0], lte=date_range_values[1]
)
)
)
return range_dates

@classmethod
def parse_solr_date_range_string(
cls, date_range_string: str, source_record_id: str
) -> tuple:
"""Get a list of values from a Solr-formatted date range string.
return dates
Example:
- "[1943 TO 1946]"
Args:
date_range_string: Formatted date range string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
try:
matches = re.match(r"\[([0-9]{4}) TO ([0-9]{4})\]", date_range_string)
return matches.groups() # type: ignore[union-attr]
except AttributeError:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse date range string '{date_range_string}'"
)
raise ValueError(message)

@staticmethod
def get_identifiers(source_record: dict) -> list[timdex.Identifier]:
@@ -185,8 +245,8 @@ def get_identifiers(source_record: dict) -> list[timdex.Identifier]:
def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
"""Get values from source record for TIMDEX links field."""
links = []
links_string = source_record["dct_references_s"]
try:
links_string = source_record.get("dct_references_s", "")
links_object = json.loads(links_string)
links.extend(
[
@@ -214,20 +274,21 @@ def get_locations(
"dcat_bbox": "Bounding Box",
"locn_geometry": "Geometry",
}
for aardvark_location_field, kind_value in {
key: value
for key, value in aardvark_location_fields.items()
if key in source_record
}.items():
if geodata_points := parse_geodata_string(
source_record[aardvark_location_field], source_record_id
):
locations.append(
timdex.Location(
geodata=geodata_points,
kind=kind_value,
for aardvark_location_field, kind_value in aardvark_location_fields.items():
if aardvark_location_field not in source_record:
continue
try:
if geodata_points := parse_geodata_string(
source_record[aardvark_location_field], source_record_id
):
locations.append(
timdex.Location(
geodata=geodata_points,
kind=kind_value,
)
)
)
except ValueError as exception:
logger.warning(exception)
return locations

@staticmethod
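
Taken together, the ID handling and date-range changes above behave roughly as follows. This is a usage sketch against the class methods shown in the diff; the record IDs and values are invented for illustration.

from transmogrifier.sources.json.aardvark import MITAardvark

# Prefixed source IDs ("mit:", "ogm:") are stripped for the source record ID...
assert MITAardvark.get_source_record_id({"id": "mit:abc/123"}) == "abc/123"

# ...and the TIMDEX record ID re-adds the source name and replaces slashes.
assert MITAardvark.get_timdex_record_id("gismit", "abc/123", {}) == "gismit:abc-123"

# Solr date ranges now parse into a (gte, lte) tuple via the new class method;
# strings that do not match "[YYYY TO YYYY]" raise ValueError.
assert MITAardvark.parse_solr_date_range_string("[1943 TO 1946]", "abc/123") == (
    "1943",
    "1946",
)
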
10 changes: 6 additions & 4 deletions transmogrifier/sources/transformer.py
@@ -9,6 +9,7 @@
from typing import Iterator, Optional, TypeAlias, final

import jsonlines
import smart_open
from attrs import asdict
from bs4 import BeautifulSoup, Tag

@@ -390,9 +391,10 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON]]:
Args:
source_file: A file containing source records to be transformed.
"""
with jsonlines.open(source_file) as records:
for record in records.iter(type=dict):
yield record
with smart_open.open(source_file, "r") as source_file_object:
with jsonlines.open(source_file_object.name) as records:
for record in records.iter(type=dict):
yield record

@final
def transform(self, source_record: dict[str, JSON]) -> Optional[TimdexRecord]:
@@ -539,7 +541,7 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
Args:
source_file: A file containing source records to be transformed.
"""
with open(source_file, "rb") as file:
with smart_open.open(source_file, "rb") as file:
for _, element in etree.iterparse(
file,
tag="{*}record",
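
For reference, smart_open.open is a drop-in replacement for the built-in open that also understands remote URIs (for example s3://), which is presumably what motivates the parse_source_file refactor: the same call can read source files locally or from object storage. A minimal sketch of reading a JSONL source directly, with a hypothetical bucket and key:

import json

import smart_open

# smart_open reads local paths and remote URIs through the same interface;
# the bucket and key below are hypothetical.
with smart_open.open("s3://example-bucket/gismit/aardvark_records.jsonl", "r") as records_file:
    for line in records_file:
        record = json.loads(line)
        print(record["id"])
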
