Skip to content

Commit

Permalink
Check for illegal characters in codelists (#418)
Browse files Browse the repository at this point in the history
* Allow checking illegal characters defined in nomenclature.yaml

* Update testing

* Add defaults and check_illegal_characters field

* Add `ErrorCollector` and pydantic `model_dump`

* Delete common-definitions stray index

* Add newlines

* Allow single quote, disallow double-quote

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>

* Fix typing

---------

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>
  • Loading branch information
dc-almeida and danielhuppmann authored Dec 16, 2024
1 parent c16ee58 commit 4e059f8
Show file tree
Hide file tree
Showing 12 changed files with 89 additions and 34 deletions.
63 changes: 34 additions & 29 deletions nomenclature/codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,35 +43,6 @@ class CodeList(BaseModel):
def __eq__(self, other):
return self.name == other.name and self.mapping == other.mapping

@field_validator("mapping")
@classmethod
def check_stray_tag(
cls, v: dict[str, Code], info: ValidationInfo
) -> dict[str, Code]:
"""Check that no stray tags are left in codes after tag replacement"""
forbidden = ["{", "}"]

def _check_string(value):
if isinstance(value, str):
if any(char in value for char in forbidden):
raise ValueError(
f"Unexpected bracket in {info.data['name']}: '{code.name}'."
" Check if tags were spelled correctly."
)
elif isinstance(value, dict):
for v in value.values():
_check_string(v)
elif isinstance(value, list):
for item in value:
_check_string(item)

for code in v.values():
for attr in code.model_fields:
if attr not in ["file", "repository"]:
value = getattr(code, attr)
_check_string(value)
return v

@field_validator("mapping")
@classmethod
def check_end_whitespace(
Expand Down Expand Up @@ -363,6 +334,40 @@ def read_excel(cls, name, source, sheet_name, col, attrs=None):

return cls(name=name, mapping=mapp)

def check_illegal_characters(self, config: NomenclatureConfig) -> dict[str, Code]:
"""Check that no illegal characters are left in codes after tag replacement"""
illegal = (
["{", "}"] + config.illegal_characters
if config.check_illegal_characters
else ["{", "}"]
)
errors = ErrorCollector()

def _check_string(value):
if isinstance(value, str):
if any(char in value for char in illegal):
errors.append(
ValueError(
f"Unexpected character in {self.name}: '{code.name}'."
" Check for illegal characters and/or if tags were spelled correctly."
)
)
elif isinstance(value, dict):
for k in value.keys():
_check_string(k)
for v in value.values():
_check_string(v)
elif isinstance(value, list):
for item in value:
_check_string(item)

for code in self.mapping.values():
if not code.repository:
for value in code.model_dump(exclude="file").values():
_check_string(value)
if errors:
raise ValueError(errors)

def to_yaml(self, path=None):
"""Write mapping to yaml file or return as stream
Expand Down
7 changes: 7 additions & 0 deletions nomenclature/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,16 @@ class NomenclatureConfig(BaseModel):
repositories: dict[str, Repository] = Field(default_factory=dict)
definitions: DataStructureConfig = Field(default_factory=DataStructureConfig)
mappings: RegionMappingConfig = Field(default_factory=RegionMappingConfig)
check_illegal_characters: bool = True
illegal_characters: list[str] = [":", ";", '"']

model_config = ConfigDict(use_enum_values=True)

@field_validator("illegal_characters", mode="before")
@classmethod
def check_illegal_chars(cls, v: str | list[str]) -> list[str]:
return v if isinstance(v, list) else [v]

@model_validator(mode="after")
@classmethod
def check_definitions_repository(
Expand Down
1 change: 1 addition & 0 deletions nomenclature/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(self, path, dimensions=None):
self.__setattr__(
dim, codelist_cls.from_directory(dim, path / dim, self.config)
)
getattr(self, dim).check_illegal_characters(self.config)

if empty := [d for d in self.dimensions if not getattr(self, d)]:
raise ValueError(f"Empty codelist: {', '.join(empty)}")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
dimensions:
- variable
repositories:
common-definitions:
url: https://github.com/IAMconsortium/common-definitions.git/
hash: 3b75bb9
definitions:
variable:
repository:
- common-definitions
check_illegal_characters: true
illegal_characters: ['"' , ";"] # these are known to be present in common-definitions variables
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- Primary Energy:
definition: Total primary energy consumption
unit: EJ/yr
- Primary Energy|Coal:
definition: Primary energy consumption of coal
unit: EJ/yr
info: Additional info with illegal " character
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
dimensions:
- variable
illegal_characters: ['"' , ";"]
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,3 @@
- Valid information.
- Invalid bracket { information.
- Another valid information.

Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ dimensions:
definitions:
region:
country: true
illegal_characters: [";", ":"]
28 changes: 24 additions & 4 deletions tests/test_codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
MetaCodeList,
)
from nomenclature.config import NomenclatureConfig
from nomenclature.definition import DataStructureDefinition

from conftest import TEST_DATA_DIR, clean_up_external_repos

Expand Down Expand Up @@ -256,17 +257,36 @@ def test_to_csv():
@pytest.mark.parametrize(
"subfolder, match",
[
("char_in_str", r"Unexpected bracket in variable: 'Primary Energy\|{Feul}'"),
("char_in_list", r"Unexpected bracket in variable: 'Share\|Coal'"),
("char_in_dict", r"Unexpected bracket in variable: 'Primary Energy'"),
("tag_in_str", r"Unexpected character in variable: 'Primary Energy\|{Feul}'"),
("tag_in_list", r"Unexpected character in variable: 'Share\|Coal'"),
("tag_in_dict", r"Unexpected character in variable: 'Primary Energy'"),
],
)
def test_stray_tag_fails(subfolder, match):
"""Check that stray brackets from, e.g. typos in a tag, raises expected error"""
with raises(ValueError, match=match):
VariableCodeList.from_directory(
code_list = VariableCodeList.from_directory(
"variable", MODULE_TEST_DATA_DIR / "stray_tag" / subfolder
)
code_list.check_illegal_characters(NomenclatureConfig(dimensions=["variable"]))


def test_illegal_char_fails():
"""Check that illegal character raises expected error."""
match = r"Unexpected character in variable: 'Primary Energy\|Coal'"
with raises(ValueError, match=match):
DataStructureDefinition(
MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_str" / "definitions"
)


def test_illegal_char_ignores_external_repo():
"""Check that external repos are excluded from this check."""
# the config includes illegal characters known to be in common-definitions
# the test will not raise errors as the check is skipped for external repos
DataStructureDefinition(
MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_external_repo" / "definitions"
)


def test_end_whitespace_fails():
Expand Down

0 comments on commit 4e059f8

Please sign in to comment.