Skip to content

Commit

Permalink
date-whitespace-bug-fix (#56)
Browse files Browse the repository at this point in the history
* date-whitespace-bug-fix

Why these changes are being introduced:
* There was a bug related to whitespace in date values. strip() was performed before the validation but not on the original value thus allowing values with whitespace to be passed to TIM where they caused errors. This update ensures that all date values are stripped before being passed to validate_date and removes the strip() call from that function.

How this addresses that need:
* Remove strip() call from validate_date function and update log message
* Update source transforms with strip() calls on all date values
* Update unit tests and add new test for whitespace

Side effects of this change:
* None

Relevant ticket(s):
* NA

* Update date parsing

* Remove 2 character year values as an accepted date format
* Update corresponding unit tests

* Updates based on discussion in PR #56

* Convert NavgiableStrings to str in datacite, dspace_dim, and dspace_mets transforms
* Add date validation to dspace_dim transform

* Updates based on further discussion in PR #56

* Update datacite transform to fix errors and add validation to publicationYear
* Update dspace_dim transform for code consistency
  • Loading branch information
ehanson8 authored Mar 15, 2023
1 parent b5d1b21 commit 0949a82
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 24 deletions.
12 changes: 9 additions & 3 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,6 @@ def test_parse_date_from_string_success():
for date in [
"1930",
"1930-12",
"30",
"30-12",
"30-12-31",
"1930-12-31",
Expand Down Expand Up @@ -264,13 +263,20 @@ def test_parse_date_from_string_invalid_date_returns_none():


def test_validate_date_success():
assert validate_date("1930 ", "1234") is True
assert validate_date("1930", "1234") is True


def test_validate_date_invalid_date_logs_error(caplog):
assert validate_date("circa 1930s", "1234") is False
assert (
"Record # '1234' has a date that couldn't be parsed: circa 1930s"
"Record # '1234' has a date that couldn't be parsed: 'circa 1930s'"
) in caplog.text


def test_validate_date_whitespace_date_logs_error(caplog):
assert validate_date("1930 ", "1234") is False
assert (
"Record # '1234' has a date that couldn't be parsed: '1930 '"
) in caplog.text


Expand Down
1 change: 0 additions & 1 deletion transmogrifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
"%Y", # strict_year
"%Y-%m", # strict_year_month
# date variations
"%y",
"%y-%m",
"%y-%m-%d",
"%Y-%m-%d",
Expand Down
3 changes: 1 addition & 2 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,11 @@ def validate_date(
date_string: A date string.
source_record_id: The ID of the record being transformed.
"""
date_string = date_string.strip()
if parse_date_from_string(date_string):
return True
else:
logger.debug(
"Record # '%s' has a date that couldn't be parsed: %s",
"Record # '%s' has a date that couldn't be parsed: '%s'",
source_record_id,
date_string,
)
Expand Down
18 changes: 12 additions & 6 deletions transmogrifier/sources/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,14 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:

# dates
if publication_year := xml.metadata.find("publicationYear", string=True):
fields["dates"] = [
timdex.Date(kind="Publication date", value=publication_year.string)
]
publication_year = str(publication_year.string.strip())
if validate_date(
publication_year,
source_record_id,
):
fields["dates"] = [
timdex.Date(kind="Publication date", value=publication_year)
]
else:
logger.warning(
"Datacite record %s missing required Datacite field publicationYear",
Expand All @@ -119,10 +124,11 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
for date in xml.metadata.find_all("date"):
d = timdex.Date()
if date_value := date.string:
date_value = str(date_value)
if "/" in date_value:
split = date_value.index("/")
gte_date = date_value[:split]
lte_date = date_value[split + 1 :]
gte_date = date_value[:split].strip()
lte_date = date_value[split + 1 :].strip()
if validate_date_range(
gte_date,
lte_date,
Expand All @@ -134,7 +140,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
)
else:
d.value = (
date_value
date_value.strip()
if validate_date(
date_value,
source_record_id,
Expand Down
18 changes: 11 additions & 7 deletions transmogrifier/sources/dspace_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from bs4 import Tag

import transmogrifier.models as timdex
from transmogrifier.helpers import validate_date_range
from transmogrifier.helpers import validate_date, validate_date_range
from transmogrifier.sources.transformer import Transformer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -87,12 +87,16 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
)

# dates
for date in [d for d in xml.find_all("dim:field", element="date") if d.string]:
if date.get("qualifier") == "issued":
d = timdex.Date(value=date.string, kind="Publication date")
else:
d = timdex.Date(value=date.string, kind=date.get("qualifier") or None)
fields.setdefault("dates", []).append(d)
for date in xml.find_all("dim:field", element="date", string=True):
date_value = str(date.string.strip())
if validate_date(date_value, source_record_id):
if date.get("qualifier") == "issued":
d = timdex.Date(value=date_value, kind="Publication date")
else:
d = timdex.Date(
value=date_value, kind=date.get("qualifier") or None
)
fields.setdefault("dates", []).append(d)

for coverage in [
c.string
Expand Down
2 changes: 1 addition & 1 deletion transmogrifier/sources/dspace_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def get_optional_fields(self, xml: Tag) -> dict:
# Only publication date is mapped from DSpace, other relevant date field (dc.
# coverage.temporal) is not mapped to the OAI-PMH METS output.
if publication_date := xml.find("mods:dateIssued", string=True):
publication_date_value = publication_date.string
publication_date_value = str(publication_date.string.strip())
if validate_date(publication_date_value, source_record_id):
fields["dates"] = [
timdex.Date(kind="Publication date", value=publication_date_value)
Expand Down
6 changes: 3 additions & 3 deletions transmogrifier/sources/ead.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
date_instance = timdex.Date()
if "-" in date_value:
split = date_value.index("-")
gte_date = date_value[:split]
lte_date = date_value[split + 1 :]
gte_date = date_value[:split].strip()
lte_date = date_value[split + 1 :].strip()
if validate_date_range(
gte_date,
lte_date,
Expand All @@ -125,7 +125,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
)
else:
date_instance.value = (
date_value
date_value.strip()
if validate_date(
date_value,
source_record_id,
Expand Down
2 changes: 1 addition & 1 deletion transmogrifier/sources/marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
fields["contributors"] = contributor_values or None

# dates
publication_year = str(fixed_length_data.string)[7:11]
publication_year = str(fixed_length_data.string)[7:11].strip()
if validate_date(publication_year, source_record_id):
fields["dates"] = [
timdex.Date(kind="Publication date", value=publication_year)
Expand Down

0 comments on commit 0949a82

Please sign in to comment.