Skip to content

Commit

Permalink
Adding "Missing Value" issue to the log when Geometry is empty.
Browse files Browse the repository at this point in the history
  • Loading branch information
ssadhu-sl committed Jan 29, 2024
1 parent 41c8dbd commit 9fb17db
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 20 deletions.
12 changes: 12 additions & 0 deletions digital_land/phase/harmonise.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,17 @@ def process(self, stream):
if value and ":" not in value:
o[typology] = "%s:%s" % (block["dataset"], value)

# ensure geometry field is not empty
for typology in ["geography"]:
# log an issue when both geometry and point are empty
# TODO: replace this check once the list of mandatory fields is available from the Specification
for field in row:
if field in ["geometry", "point"]:
if (
row.get("geometry") == "" or row.get("geometry") is None
) and (row.get("point") == "" or row.get("point") is None):
self.issues.log_issue(field, "missing value", "")

# migrate wikipedia URLs to a reference compatible with dbpedia CURIEs with a wikipedia-en prefix
if row.get("wikipedia", "").startswith("http"):
self.issues.log_issue(
Expand All @@ -68,4 +79,5 @@ def process(self, stream):
"https://en.wikipedia.org/wiki/", ""
)
block["row"] = o

yield block
17 changes: 5 additions & 12 deletions digital_land/phase/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,17 @@ def headers(self, fieldnames):
matched = []
for header in sorted(fieldnames):
fieldname = normalise(header)

for pattern, value in self.columns.items():
if fieldname == pattern:
matched.append(value)
headers[header] = value

# stop if we found a match

for header in sorted(fieldnames):
if header in headers:
continue

if fieldname in self.normalised_fieldnames:
fieldname = normalise(header)
if fieldname not in matched and fieldname in self.normalised_fieldnames:
headers[header] = self.normalised_fieldnames[fieldname]
continue

Expand Down Expand Up @@ -74,14 +74,7 @@ def process(self, stream):
if headers[header] == "IGNORE":
continue

value = row.get(header)

if value is not None and value != "":
o[headers[header]] = value

for header in self.normalised_fieldnames.values():
if header not in o:
o[header] = ""
o[headers[header]] = row.get(header)

block["row"] = o

Expand Down
1 change: 1 addition & 0 deletions tests/data/specification/field.csv
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ entity,,string,1,,,,
reference-entity,,string,1,,,,
field,,string,1,,,,
value,,string,1,,,,
geometry,Geometry,wkt,1,,,,
12 changes: 4 additions & 8 deletions tests/unit/phase/test_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,9 @@ def test_map_column_names_with_underscores_when_column_not_in_specification():

m = MapPhase(fieldnames, columns)
output = TestPipeline(
m, "Organisation_Label,PermissionDate\r\ncol-1-val,col-2-val\r\n"
)
assert (
output
== "Organisation_Label,PermissionDate,SiteNameAddress\r\ncol-1-val,col-2-val,\r\n"
m, "Organisation_Label,PermissionDate,test\r\ncol-1-val,col-2-val,\r\n"
)
assert output == "Organisation_Label,PermissionDate\r\ncol-1-val,col-2-val\r\n"


def test_map_column_names_with_underscores_when_column_in_specification():
Expand All @@ -128,10 +125,9 @@ def test_map_column_names_with_underscores_when_column_in_specification():

m = MapPhase(fieldnames, columns)
output = TestPipeline(
m, "Organisation_Label,end_date,SiteNameAddress\r\ncol-1-val,col-2-val\r\n"
m, "Organisation_Label,end_date,SiteNameAddress\r\ncol-1-val,col-2-val,\r\n"
)

assert (
output
== "Organisation-Label,Organisation_Label,SiteNameAddress,end-date,end_date\r\ncol-1-val,,,col-2-val,\r\n"
== "Organisation-Label,SiteNameAddress,end-date\r\ncol-1-val,,col-2-val\r\n"
)
21 changes: 21 additions & 0 deletions tests/unit/test_harmonise.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,24 @@ def test_harmonise():
assert output[0]["row"] == {"field-integer": "123"}, "pass through valid data"
assert output[1]["row"] == {"field-integer": "321"}, "whitespace trimmed"
assert output[2]["row"] == {"field-integer": ""}, "remove bad data"


def test_harmonise_geometry():
    """Check that an empty geometry with no point logs a "missing value" issue.

    The row carries an empty ``geometry`` field and no ``point`` field at all.
    HarmonisePhase only inspects ``geometry``/``point`` fields that are present
    in the row, so the field has to exist (empty) for the check to run.
    """
    specification = Specification("tests/data/specification")
    issues = IssueLog()

    h = HarmonisePhase(specification=specification, issues=issues)
    reader = FakeDictReader(
        [
            {"organisation": "test_org", "geometry": ""},
        ]
    )
    output = list(h.process(reader))

    # The row is still passed through; only an issue is recorded.
    assert len(output) == 1

    # Guard against a vacuous pass: with nothing logged, the loop below would
    # never execute and the test would succeed without checking anything.
    logged = list(issues.rows)
    assert logged, "expected a 'missing value' issue for the empty geometry"

    # Every logged issue should be the missing-value issue on geometry/point.
    for issue in logged:
        assert issue["field"] in ["geometry", "point"]
        assert issue["issue-type"] == "missing value"
        assert issue["value"] == ""
2 changes: 2 additions & 0 deletions tests/unit/test_specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def test_field_names():
"field",
"reference-entity",
"fact",
"geometry",
]
)

Expand Down Expand Up @@ -133,6 +134,7 @@ def test_current_fieldnames():
"line-number",
"resource",
"value",
"geometry",
]
)

Expand Down

0 comments on commit 9fb17db

Please sign in to comment.