Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check for illegal characters in codelists #418

Merged
merged 10 commits into from
Dec 16, 2024
60 changes: 31 additions & 29 deletions nomenclature/codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,35 +43,6 @@ class CodeList(BaseModel):
def __eq__(self, other):
return self.name == other.name and self.mapping == other.mapping

@field_validator("mapping")
@classmethod
def check_stray_tag(
cls, v: Dict[str, Code], info: ValidationInfo
) -> Dict[str, Code]:
"""Check that no stray tags are left in codes after tag replacement"""
forbidden = ["{", "}"]

def _check_string(value):
if isinstance(value, str):
if any(char in value for char in forbidden):
raise ValueError(
f"Unexpected bracket in {info.data['name']}: '{code.name}'."
" Check if tags were spelled correctly."
)
elif isinstance(value, dict):
for v in value.values():
_check_string(v)
elif isinstance(value, list):
for item in value:
_check_string(item)

for code in v.values():
for attr in code.model_fields:
if attr not in ["file", "repository"]:
value = getattr(code, attr)
_check_string(value)
return v

@field_validator("mapping")
@classmethod
def check_end_whitespace(
Expand Down Expand Up @@ -364,6 +335,37 @@ def read_excel(cls, name, source, sheet_name, col, attrs=None):

return cls(name=name, mapping=mapp)

def check_illegal_characters(self, config: NomenclatureConfig) -> Dict[str, Code]:
"""Check that no illegal characters are left in codes after tag replacement"""
illegal = (
["{", "}"] + config.illegal_characters
if config.check_illegal_characters
else ["{", "}"]
)

def _check_string(value):
if isinstance(value, str):
if any(char in value for char in illegal):
raise ValueError(
dc-almeida marked this conversation as resolved.
Show resolved Hide resolved
f"Unexpected character in {self.name}: '{code.name}'."
" Check for illegal characters and/or if tags were spelled correctly."
)
elif isinstance(value, dict):
for k in value.keys():
_check_string(k)
for v in value.values():
dc-almeida marked this conversation as resolved.
Show resolved Hide resolved
_check_string(v)
elif isinstance(value, list):
for item in value:
_check_string(item)

for code in self.mapping.values():
if not code.repository:
for attr in code.model_fields:
dc-almeida marked this conversation as resolved.
Show resolved Hide resolved
if attr != "file":
value = getattr(code, attr)
_check_string(value)
dc-almeida marked this conversation as resolved.
Show resolved Hide resolved

def to_yaml(self, path=None):
"""Write mapping to yaml file or return as stream

Expand Down
7 changes: 7 additions & 0 deletions nomenclature/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,16 @@ class NomenclatureConfig(BaseModel):
repositories: dict[str, Repository] = Field(default_factory=dict)
definitions: DataStructureConfig = Field(default_factory=DataStructureConfig)
mappings: RegionMappingConfig = Field(default_factory=RegionMappingConfig)
check_illegal_characters: bool = True
illegal_characters: list[str] = ["'", ":", ";"]
dc-almeida marked this conversation as resolved.
Show resolved Hide resolved

model_config = ConfigDict(use_enum_values=True)

@field_validator("illegal_characters", mode="before")
@classmethod
def check_illegal_chars(cls, v: str | list[str]) -> list[str]:
return v if isinstance(v, list) else [v]

@model_validator(mode="after")
@classmethod
def check_definitions_repository(
Expand Down
1 change: 1 addition & 0 deletions nomenclature/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(self, path, dimensions=None):
self.__setattr__(
dim, codelist_cls.from_directory(dim, path / dim, self.config)
)
getattr(self, dim).check_illegal_characters(self.config)

if empty := [d for d in self.dimensions if not getattr(self, d)]:
raise ValueError(f"Empty codelist: {', '.join(empty)}")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
dimensions:
- variable
repositories:
common-definitions:
url: https://github.com/IAMconsortium/common-definitions.git/
hash: 3b75bb9
definitions:
variable:
repository:
- common-definitions
check_illegal_characters: true
illegal_characters: ['"' , ";"] # these are known to be present in common-definitions variables
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- Primary Energy:
definition: Total primary energy consumption
unit: EJ/yr
- Primary Energy|Coal:
definition: Primary energy consumption of coal
unit: EJ/yr
info: Additional info with illegal " character
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
dimensions:
- variable
illegal_characters: ['"' , ";"]
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ dimensions:
definitions:
region:
country: true
illegal_characters: [";", ":"]
28 changes: 24 additions & 4 deletions tests/test_codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
MetaCodeList,
)
from nomenclature.config import NomenclatureConfig
from nomenclature.definition import DataStructureDefinition

from conftest import TEST_DATA_DIR, clean_up_external_repos

Expand Down Expand Up @@ -224,17 +225,36 @@ def test_to_csv():
@pytest.mark.parametrize(
"subfolder, match",
[
("char_in_str", r"Unexpected bracket in variable: 'Primary Energy\|{Feul}'"),
("char_in_list", r"Unexpected bracket in variable: 'Share\|Coal'"),
("char_in_dict", r"Unexpected bracket in variable: 'Primary Energy'"),
("tag_in_str", r"Unexpected character in variable: 'Primary Energy\|{Feul}'"),
("tag_in_list", r"Unexpected character in variable: 'Share\|Coal'"),
("tag_in_dict", r"Unexpected character in variable: 'Primary Energy'"),
],
)
def test_stray_tag_fails(subfolder, match):
"""Check that stray brackets from, e.g. typos in a tag, raises expected error"""
with raises(ValueError, match=match):
VariableCodeList.from_directory(
code_list = VariableCodeList.from_directory(
"variable", MODULE_TEST_DATA_DIR / "stray_tag" / subfolder
)
code_list.check_illegal_characters(NomenclatureConfig(dimensions=["variable"]))


def test_illegal_char_fails():
"""Check that illegal character raises expected error."""
match = r"Unexpected character in variable: 'Primary Energy\|Coal'"
with raises(ValueError, match=match):
DataStructureDefinition(
MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_str" / "definitions"
)


def test_illegal_char_ignores_external_repo():
"""Check that external repos are excluded from this check."""
# the config includes illegal characters known to be in common-definitions
# the test will not raise errors as the check is skipped for external repos
DataStructureDefinition(
MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_external_repo" / "definitions"
)


def test_end_whitespace_fails():
Expand Down