Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdt 116 update locations #113

Merged
merged 5 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def timdex_record_all_fields_and_subfields():
timdex.Location(
value="A point on the globe",
kind="Data was gathered here",
geodata=[-77.025955, 38.942142],
geoshape="BBOX(-77.025955, 38.942142)",
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved
)
],
notes=[timdex.Note(value=["This book is awesome"], kind="opinion")],
Expand Down
29 changes: 24 additions & 5 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,30 @@ def test_aardvark_get_links_logs_warning_for_invalid_json(caplog):
)


def test_aardvark_get_locations_success(caplog, aardvark_record_all_fields):
caplog.set_level("DEBUG")
assert "Geometry field 'dcat_bbox' found, but currently not mapped."
assert "Geometry field 'locn_geometry' found, but currently not mapped."
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == []
def test_aardvark_get_locations_success(aardvark_record_all_fields):
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [
timdex.Location(
kind="Bounding Box", geoshape="BBOX(-111.1, -104.0, 45.0, 40.9)"
),
timdex.Location(kind="Geometry", geoshape="BBOX(-111.1, -104.0, 45.0, 40.9)"),
]


def test_parse_get_locations_string_invalid_geostring_logs_warning(
aardvark_record_all_fields, caplog
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["dcat_bbox"] = "Invalid"
aardvark_record["locn_geometry"] = "Invalid"
assert MITAardvark.get_locations(aardvark_record, "123") == []
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'dcat_bbox'"
in caplog.text
)
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'locn_geometry'"
in caplog.text
)


def test_aardvark_get_notes_success(aardvark_record_all_fields):
Expand Down
20 changes: 0 additions & 20 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from datetime import datetime

import pytest

import transmogrifier.models as timdex
from transmogrifier.helpers import (
generate_citation,
parse_date_from_string,
parse_geodata_string,
validate_date,
validate_date_range,
)
Expand Down Expand Up @@ -259,23 +256,6 @@ def test_parse_date_from_string_invalid_date_returns_none():
assert parse_date_from_string("circa 1930s") is None


def test_parse_geodata_string_success():
assert parse_geodata_string("ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "123") == [
-111.1,
-104.0,
45.0,
40.9,
]


def test_parse_geodata_string_invalid_geodata_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse geodata string 'Invalid'",
):
parse_geodata_string("Invalid", "123")


def test_validate_date_success():
assert validate_date("1930", "1234") is True

Expand Down
10 changes: 5 additions & 5 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ def test_timdex_record_all_fields_and_subfields(timdex_record_all_fields_and_sub
timdex_record_all_fields_and_subfields.locations[0].kind
== "Data was gathered here"
)
assert timdex_record_all_fields_and_subfields.locations[0].geodata == [
-77.025955,
38.942142,
]
assert (
timdex_record_all_fields_and_subfields.locations[0].geoshape
== "BBOX(-77.025955, 38.942142)"
jonavellecuerdo marked this conversation as resolved.
Show resolved Hide resolved
)
assert timdex_record_all_fields_and_subfields.notes[0].value == [
"This book is awesome"
]
Expand Down Expand Up @@ -305,7 +305,7 @@ def test_record_asdict_includes_all_fields(timdex_record_all_fields_and_subfield
"literary_form": "nonfiction",
"locations": [
{
"geodata": [-77.025955, 38.942142],
"geoshape": "BBOX(-77.025955, 38.942142)",
jonavellecuerdo marked this conversation as resolved.
Show resolved Hide resolved
"kind": "Data was gathered here",
"value": "A point on the globe",
}
Expand Down
25 changes: 0 additions & 25 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,31 +75,6 @@ def parse_date_from_string(
return None


def parse_geodata_string(geodata_string: str, source_record_id: str) -> list[float]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the reason for deleting this helper function? Just want to understand. 🤓

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're just taking the WKT string as is given than OpenSearch can handle it, we don't need this function to do the parsing that we originally though was needed. Does that make sense?

"""Get list of values from a formatted geodata string.

Example:
- "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"

Args:
geodata_string: Formatted geodata string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
geodata_points = []
try:
raw_geodata_points = geodata_string.split("(")[-1].split(")")[0].split(",")
stripped_geodata_points = map(str.strip, raw_geodata_points)
geodata_floats = list(map(float, stripped_geodata_points))
geodata_points.extend(geodata_floats)
except ValueError:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}'"
)
raise ValueError(message)
return geodata_points


def validate_date(
date_string: str,
source_record_id: str,
Expand Down
4 changes: 1 addition & 3 deletions transmogrifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,7 @@ class Link:
class Location:
value: Optional[str] = field(default=None, validator=optional(instance_of(str)))
kind: Optional[str] = field(default=None, validator=optional(instance_of(str)))
geodata: Optional[list[float]] = field(
default=None, validator=optional(list_of(float))
)
geoshape: Optional[str] = field(default=None, validator=optional(instance_of(str)))


@define
Expand Down
29 changes: 16 additions & 13 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,13 +294,8 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
def get_locations(
source_record: dict, source_record_id: str
) -> list[timdex.Location]:
"""Get values from source record for TIMDEX locations field.

WIP: Currently in the process of determining our approach for storing geographic
geometry data in the TIMDEX record and how this dovetails with the OpenSearch
mapping. At this time, this method returns an empty list of Locations.
"""
locations: list[timdex.Location] = []
"""Get values from source record for TIMDEX locations field."""
locations = []

aardvark_location_fields = {
"dcat_bbox": "Bounding Box",
Expand All @@ -309,14 +304,22 @@ def get_locations(
for aardvark_location_field, kind_value in aardvark_location_fields.items():
if aardvark_location_field not in source_record:
continue
try:
if (
geodata_string := source_record[aardvark_location_field]
) and "ENVELOPE" in source_record[aardvark_location_field]:
locations.append(
timdex.Location(
geoshape=geodata_string.replace("ENVELOPE", "BBOX"),
kind=kind_value,
)
)
else:
message = (
f"Geometry field '{aardvark_location_field}' found, but "
f"currently not mapped."
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}' "
f"in '{aardvark_location_field}'"
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved
)
logger.debug(message)
except ValueError as exception:
logger.warning(exception)
logger.warning(message)
return locations

@staticmethod
Expand Down
Loading