Updates based on discussion in PR #110
* Update aardvark_records fixture to reflect required fields and update the corresponding unit test
* Add gismit and gisogm sources to config.py
* Refactor parse_source_file methods to use smart_open
* Add get_timdex_record_id method for more specific ID processing
* Refactor get_dates to use new private methods for each date type
* Refactor get_locations to improve readability and error handling
* Move the parse_solr_date_range_string function to an MITAardvark class method, along with its corresponding unit tests
ehanson8 committed Jan 3, 2024
1 parent 181bdc5 commit f4126c2
Showing 8 changed files with 124 additions and 77 deletions.
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
@@ -1 +1 @@
{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
4 changes: 2 additions & 2 deletions tests/fixtures/aardvark_records.jsonl
@@ -1,2 +1,2 @@
{"id": "123", "dct_title_s": "Test title 1"}
{"id": "456", "dct_title_s": "Test title 2"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": ""}
18 changes: 18 additions & 0 deletions tests/sources/json/test_aardvark.py
@@ -1,3 +1,5 @@
import pytest

import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import MITAardvark

@@ -21,6 +23,7 @@ def test_aardvark_transform_returns_timdex_record(aardvark_records):
title="Test title 1",
citation="Test title 1. Geospatial data. https://example.com/123",
content_type=["Geospatial data"],
rights=[timdex.Rights(description="Access rights", kind="Access")],
)


@@ -77,6 +80,21 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields):
]


def test_aardvark_parse_solr_date_range_string_success():
assert MITAardvark.parse_solr_date_range_string("[1932 TO 1937]", "123") == (
"1932",
"1937",
)


def test_parse_solr_date_range_invalid_date_range_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse date range string 'Invalid'",
):
MITAardvark.parse_solr_date_range_string("Invalid", "123")


def test_aardvark_get_identifiers_success(aardvark_record_all_fields):
assert MITAardvark.get_identifiers(next(aardvark_record_all_fields)) == [
timdex.Identifier(value="abc123")
13 changes: 0 additions & 13 deletions tests/test_helpers.py
@@ -7,7 +7,6 @@
generate_citation,
parse_date_from_string,
parse_geodata_string,
parse_solr_date_range_string,
validate_date,
validate_date_range,
)
@@ -277,18 +276,6 @@ def test_parse_geodata_string_invalid_geodata_string_raises_error():
parse_geodata_string("Invalid", "123")


def test_parse_solr_date_range_string_success():
assert parse_solr_date_range_string("[[1932 TO 1937]]", "123") == ["1932", "1937"]


def test_parse_solr_date_range_invalid_date_range_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse date range string 'Invalid'",
):
parse_solr_date_range_string("Invalid", "123")


def test_validate_date_success():
assert validate_date("1930", "1234") is True

10 changes: 10 additions & 0 deletions transmogrifier/config.py
@@ -103,6 +103,16 @@
"base-url": "https://libguides.mit.edu/",
"transform-class": "transmogrifier.sources.xml.springshare.SpringshareOaiDc",
},
"gismit": {
"name": "MIT GIS Resources",
"base-url": "XXXX",
"transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
},
"gisogm": {
"name": "OpenGeoMetadata GIS Resources",
"base-url": "XXXX",
"transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark",
},
"researchdatabases": {
"name": "Research Databases",
"base-url": "https://libguides.mit.edu/",
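
For context, the new gismit and gisogm entries follow the same shape as the existing sources: a display name, a base URL (placeholder "XXXX" here), and a dotted path to the transform class. A minimal sketch of how such a dotted "transform-class" string could be resolved into a class at runtime is shown below; the SOURCES dict name and the load_transform_class helper are illustrative assumptions, not the repository's actual loader.

import importlib

# Illustrative config shape mirroring the entries added above.
SOURCES = {
    "gismit": {
        "name": "MIT GIS Resources",
        "transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
    },
    "gisogm": {
        "name": "OpenGeoMetadata GIS Resources",
        "transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark",
    },
}


def load_transform_class(source: str) -> type:
    """Resolve a dotted 'transform-class' string into the class object."""
    module_path, _, class_name = SOURCES[source]["transform-class"].rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)


# load_transform_class("gismit") would return the MITAardvark class.
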
31 changes: 0 additions & 31 deletions transmogrifier/helpers.py
@@ -80,7 +80,6 @@ def parse_geodata_string(geodata_string: str, source_record_id: str) -> list[flo
Example:
- "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"
- "POLYGON((-80 25, -65 18, -64 33, -80 25))"
Args:
geodata_string: Formatted geodata string to parse.
@@ -101,36 +100,6 @@
return geodata_points


def parse_solr_date_range_string(
date_range_string: str, source_record_id: str
) -> list[str]:
"""Get a list of values from a Solr-formatted date range string.
Example:
- "[1943 TO 1946]"
Args:
date_range_string: Formatted date range string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
date_ranges = []
if (
date_range_string.startswith("[")
and date_range_string.endswith("]")
and " TO " in date_range_string
):
date_range_values = date_range_string.split("[")[-1].split("]")[0].split(" TO ")
if [date_range_string] != date_range_values:
date_ranges.extend(date_range_values)
else:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse date range string '{date_range_string}'"
)
raise ValueError(message)
return date_ranges


def validate_date(
date_string: str,
source_record_id: str,
113 changes: 87 additions & 26 deletions transmogrifier/sources/json/aardvark.py
@@ -1,9 +1,10 @@
import json
import logging
import re

import transmogrifier.models as timdex
from transmogrifier.helpers import parse_geodata_string, parse_solr_date_range_string
from transmogrifier.sources.transformer import JSONTransformer
from transmogrifier.helpers import parse_geodata_string
from transmogrifier.sources.transformer import JSON, JSONTransformer

logger = logging.getLogger(__name__)

@@ -29,6 +30,22 @@ def get_main_titles(cls, source_record: dict) -> list[str]:
"""
return [source_record["dct_title_s"]]

@classmethod
def get_timdex_record_id(
cls, source: str, source_record_id: str, source_record: dict[str, JSON]
) -> str:
"""
Class method to set the TIMDEX record id.
Args:
source: Source name.
source_record_id: Record identifier for the source record.
source_record: A JSON object representing a source record.
- not used by default implementation, but could be useful for subclass
overrides
"""
return f"{source}:{source_record_id.replace('/', '-')}"

@classmethod
def get_source_record_id(cls, source_record: dict) -> str:
"""
@@ -37,7 +54,7 @@ def get_source_record_id(cls, source_record: dict) -> str:
Args:
source_record: A JSON object representing a source record.
"""
return source_record["id"]
return source_record["id"].replace("mit:", "").replace("ogm:", "")

@classmethod
def record_is_deleted(cls, source_record: dict) -> bool:
@@ -134,17 +151,29 @@ def get_contributors(source_record: dict) -> list[timdex.Contributor]:
for contributor_value in source_record.get("dct_creator_sm", [])
]

@staticmethod
def get_dates(source_record: dict, source_record_id: str) -> list[timdex.Date]:
@classmethod
def get_dates(cls, source_record: dict, source_record_id: str) -> list[timdex.Date]:
"""Get values from source record for TIMDEX dates field."""
dates = []
return (
cls._issued_dates(source_record)
+ cls._coverage_dates(source_record)
+ cls._range_dates(source_record, source_record_id)
)

@classmethod
def _issued_dates(cls, source_record: dict) -> list[timdex.Date]:
"""Get values for issued dates."""
issued_dates = []
if "dct_issued_s" in source_record:
dates.append(
issued_dates.append(
timdex.Date(value=source_record["dct_issued_s"], kind="Issued")
)
return issued_dates

# logic to remove duplicate entries from the 2 fields that record coverage dates
@classmethod
def _coverage_dates(cls, source_record: dict) -> list[timdex.Date]:
"""Get values for coverage dates."""
coverage_dates = []
coverage_date_values = []
coverage_date_values.extend(source_record.get("dct_temporal_sm", []))
for date_value in [
@@ -154,24 +183,55 @@ def get_dates(source_record: dict, source_record_id: str) -> list[timdex.Date]:
]:
coverage_date_values.append(date_value)
for coverage_date_value in coverage_date_values:
dates.append(timdex.Date(value=coverage_date_value, kind="Coverage"))
coverage_dates.append(
timdex.Date(value=coverage_date_value, kind="Coverage")
)
return coverage_dates

@classmethod
def _range_dates(
cls, source_record: dict, source_record_id: str
) -> list[timdex.Date]:
"""Get values for issued dates."""
range_dates = []
for date_range_string in [
date_range_strings
for date_range_strings in source_record.get("gbl_dateRange_drsim", [])
]:
date_range_values = parse_solr_date_range_string(
date_range_values = cls.parse_solr_date_range_string(
date_range_string, source_record_id
)
dates.append(
range_dates.append(
timdex.Date(
range=timdex.Date_Range(
gte=date_range_values[0], lte=date_range_values[1]
)
)
)
return range_dates

@classmethod
def parse_solr_date_range_string(
cls, date_range_string: str, source_record_id: str
) -> tuple:
"""Get a list of values from a Solr-formatted date range string.
return dates
Example:
- "[1943 TO 1946]"
Args:
date_range_string: Formatted date range string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
try:
matches = re.match(r"\[([0-9]{4}) TO ([0-9]{4})\]", date_range_string)
return matches.groups() # type: ignore[union-attr]
except AttributeError:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse date range string '{date_range_string}'"
)
raise ValueError(message)

@staticmethod
def get_identifiers(source_record: dict) -> list[timdex.Identifier]:
@@ -185,8 +245,8 @@ def get_identifiers(source_record: dict) -> list[timdex.Identifier]:
def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
"""Get values from source record for TIMDEX links field."""
links = []
links_string = source_record["dct_references_s"]
try:
links_string = source_record.get("dct_references_s", "")
links_object = json.loads(links_string)
links.extend(
[
@@ -214,20 +274,21 @@ def get_locations(
"dcat_bbox": "Bounding Box",
"locn_geometry": "Geometry",
}
for aardvark_location_field, kind_value in {
key: value
for key, value in aardvark_location_fields.items()
if key in source_record
}.items():
if geodata_points := parse_geodata_string(
source_record[aardvark_location_field], source_record_id
):
locations.append(
timdex.Location(
geodata=geodata_points,
kind=kind_value,
for aardvark_location_field, kind_value in aardvark_location_fields.items():
if aardvark_location_field not in source_record:
continue
try:
if geodata_points := parse_geodata_string(
source_record[aardvark_location_field], source_record_id
):
locations.append(
timdex.Location(
geodata=geodata_points,
kind=kind_value,
)
)
)
except ValueError as exception:
logger.warning(exception)
return locations

@staticmethod
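
Taken together, the ID handling and date-range changes above behave roughly as follows. This is a usage sketch against the class methods shown in the diff; the record IDs and values are invented for illustration.

from transmogrifier.sources.json.aardvark import MITAardvark

# Prefixed source IDs ("mit:", "ogm:") are stripped for the source record ID...
assert MITAardvark.get_source_record_id({"id": "mit:abc/123"}) == "abc/123"

# ...and the TIMDEX record ID re-adds the source name and replaces slashes.
assert MITAardvark.get_timdex_record_id("gismit", "abc/123", {}) == "gismit:abc-123"

# Solr date ranges now parse into a (gte, lte) tuple via the new class method;
# strings that do not match "[YYYY TO YYYY]" raise ValueError.
assert MITAardvark.parse_solr_date_range_string("[1943 TO 1946]", "abc/123") == (
    "1943",
    "1946",
)
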
10 changes: 6 additions & 4 deletions transmogrifier/sources/transformer.py
@@ -9,6 +9,7 @@
from typing import Iterator, Optional, TypeAlias, final

import jsonlines
import smart_open
from attrs import asdict
from bs4 import BeautifulSoup, Tag

@@ -390,9 +391,10 @@ def parse_source_file(cls, source_file: str) -> Iterator[dict[str, JSON]]:
Args:
source_file: A file containing source records to be transformed.
"""
with jsonlines.open(source_file) as records:
for record in records.iter(type=dict):
yield record
with smart_open.open(source_file, "r") as source_file_object:
with jsonlines.open(source_file_object.name) as records:
for record in records.iter(type=dict):
yield record

@final
def transform(self, source_record: dict[str, JSON]) -> Optional[TimdexRecord]:
@@ -539,7 +541,7 @@ def parse_source_file(cls, source_file: str) -> Iterator[Tag]:
Args:
source_file: A file containing source records to be transformed.
"""
with open(source_file, "rb") as file:
with smart_open.open(source_file, "rb") as file:
for _, element in etree.iterparse(
file,
tag="{*}record",
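
For reference, smart_open.open is a drop-in replacement for the built-in open that also understands remote URIs (for example s3://), which is presumably what motivates the parse_source_file refactor: the same call can read source files locally or from object storage. A minimal sketch of reading a JSONL source directly, with a hypothetical bucket and key:

import json

import smart_open

# smart_open reads local paths and remote URIs through the same interface;
# the bucket and key below are hypothetical.
with smart_open.open("s3://example-bucket/gismit/aardvark_records.jsonl", "r") as records_file:
    for line in records_file:
        record = json.loads(line)
        print(record["id"])
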
