diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 60b56e49..685bb483 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -43,35 +43,6 @@ class CodeList(BaseModel): def __eq__(self, other): return self.name == other.name and self.mapping == other.mapping - @field_validator("mapping") - @classmethod - def check_stray_tag( - cls, v: dict[str, Code], info: ValidationInfo - ) -> dict[str, Code]: - """Check that no stray tags are left in codes after tag replacement""" - forbidden = ["{", "}"] - - def _check_string(value): - if isinstance(value, str): - if any(char in value for char in forbidden): - raise ValueError( - f"Unexpected bracket in {info.data['name']}: '{code.name}'." - " Check if tags were spelled correctly." - ) - elif isinstance(value, dict): - for v in value.values(): - _check_string(v) - elif isinstance(value, list): - for item in value: - _check_string(item) - - for code in v.values(): - for attr in code.model_fields: - if attr not in ["file", "repository"]: - value = getattr(code, attr) - _check_string(value) - return v - @field_validator("mapping") @classmethod def check_end_whitespace( @@ -363,6 +334,40 @@ def read_excel(cls, name, source, sheet_name, col, attrs=None): return cls(name=name, mapping=mapp) + def check_illegal_characters(self, config: NomenclatureConfig) -> dict[str, Code]: + """Check that no illegal characters are left in codes after tag replacement""" + illegal = ( + ["{", "}"] + config.illegal_characters + if config.check_illegal_characters + else ["{", "}"] + ) + errors = ErrorCollector() + + def _check_string(value): + if isinstance(value, str): + if any(char in value for char in illegal): + errors.append( + ValueError( + f"Unexpected character in {self.name}: '{code.name}'." + " Check for illegal characters and/or if tags were spelled correctly." + ) + ) + elif isinstance(value, dict): + for k in value.keys(): + _check_string(k) + for v in value.values(): + _check_string(v) + elif isinstance(value, list): + for item in value: + _check_string(item) + + for code in self.mapping.values(): + if not code.repository: + for value in code.model_dump(exclude="file").values(): + _check_string(value) + if errors: + raise ValueError(errors) + def to_yaml(self, path=None): """Write mapping to yaml file or return as stream diff --git a/nomenclature/config.py b/nomenclature/config.py index 0e6bd5e5..6ccd5b79 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -245,9 +245,16 @@ class NomenclatureConfig(BaseModel): repositories: dict[str, Repository] = Field(default_factory=dict) definitions: DataStructureConfig = Field(default_factory=DataStructureConfig) mappings: RegionMappingConfig = Field(default_factory=RegionMappingConfig) + check_illegal_characters: bool = True + illegal_characters: list[str] = [":", ";", '"'] model_config = ConfigDict(use_enum_values=True) + @field_validator("illegal_characters", mode="before") + @classmethod + def check_illegal_chars(cls, v: str | list[str]) -> list[str]: + return v if isinstance(v, list) else [v] + @model_validator(mode="after") @classmethod def check_definitions_repository( diff --git a/nomenclature/definition.py b/nomenclature/definition.py index 66837302..9f3e639e 100644 --- a/nomenclature/definition.py +++ b/nomenclature/definition.py @@ -76,6 +76,7 @@ def __init__(self, path, dimensions=None): self.__setattr__( dim, codelist_cls.from_directory(dim, path / dim, self.config) ) + getattr(self, dim).check_illegal_characters(self.config) if empty := [d for d in self.dimensions if not getattr(self, d)]: raise ValueError(f"Empty codelist: {', '.join(empty)}") diff --git a/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml b/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml new file mode 100644 index 00000000..76c519bf --- /dev/null +++ b/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml @@ -0,0 +1,12 @@ +dimensions: + - variable +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + hash: 3b75bb9 +definitions: + variable: + repository: + - common-definitions +check_illegal_characters: true +illegal_characters: ['"' , ";"] # these are known to be present in common-definitions variables diff --git a/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml b/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml new file mode 100644 index 00000000..24e2d1ce --- /dev/null +++ b/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml @@ -0,0 +1,7 @@ +- Primary Energy: + definition: Total primary energy consumption + unit: EJ/yr +- Primary Energy|Coal: + definition: Primary energy consumption of coal + unit: EJ/yr + info: Additional info with illegal " character diff --git a/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml b/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml new file mode 100644 index 00000000..199d9411 --- /dev/null +++ b/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml @@ -0,0 +1,3 @@ +dimensions: + - variable +illegal_characters: ['"' , ";"] diff --git a/tests/data/codelist/stray_tag/char_in_dict/variables.yaml b/tests/data/codelist/stray_tag/tag_in_dict/variables.yaml similarity index 100% rename from tests/data/codelist/stray_tag/char_in_dict/variables.yaml rename to tests/data/codelist/stray_tag/tag_in_dict/variables.yaml diff --git a/tests/data/codelist/stray_tag/char_in_list/variables.yaml b/tests/data/codelist/stray_tag/tag_in_list/variables.yaml similarity index 99% rename from tests/data/codelist/stray_tag/char_in_list/variables.yaml rename to tests/data/codelist/stray_tag/tag_in_list/variables.yaml index 00672726..6f9e7d17 100644 --- a/tests/data/codelist/stray_tag/char_in_list/variables.yaml +++ b/tests/data/codelist/stray_tag/tag_in_list/variables.yaml @@ -11,4 +11,3 @@ - Valid information. - Invalid bracket { information. - Another valid information. - diff --git a/tests/data/codelist/stray_tag/char_in_str/tag_fuel.yaml b/tests/data/codelist/stray_tag/tag_in_str/tag_fuel.yaml similarity index 100% rename from tests/data/codelist/stray_tag/char_in_str/tag_fuel.yaml rename to tests/data/codelist/stray_tag/tag_in_str/tag_fuel.yaml diff --git a/tests/data/codelist/stray_tag/char_in_str/variables.yaml b/tests/data/codelist/stray_tag/tag_in_str/variables.yaml similarity index 100% rename from tests/data/codelist/stray_tag/char_in_str/variables.yaml rename to tests/data/codelist/stray_tag/tag_in_str/variables.yaml diff --git a/tests/data/config/general-config-only-country/nomenclature.yaml b/tests/data/config/general-config-only-country/nomenclature.yaml index 7616a4d1..a0794834 100644 --- a/tests/data/config/general-config-only-country/nomenclature.yaml +++ b/tests/data/config/general-config-only-country/nomenclature.yaml @@ -3,3 +3,4 @@ dimensions: definitions: region: country: true +illegal_characters: [";", ":"] diff --git a/tests/test_codelist.py b/tests/test_codelist.py index 82d8564d..f0b56955 100644 --- a/tests/test_codelist.py +++ b/tests/test_codelist.py @@ -12,6 +12,7 @@ MetaCodeList, ) from nomenclature.config import NomenclatureConfig +from nomenclature.definition import DataStructureDefinition from conftest import TEST_DATA_DIR, clean_up_external_repos @@ -256,17 +257,36 @@ def test_to_csv(): @pytest.mark.parametrize( "subfolder, match", [ - ("char_in_str", r"Unexpected bracket in variable: 'Primary Energy\|{Feul}'"), - ("char_in_list", r"Unexpected bracket in variable: 'Share\|Coal'"), - ("char_in_dict", r"Unexpected bracket in variable: 'Primary Energy'"), + ("tag_in_str", r"Unexpected character in variable: 'Primary Energy\|{Feul}'"), + ("tag_in_list", r"Unexpected character in variable: 'Share\|Coal'"), + ("tag_in_dict", r"Unexpected character in variable: 'Primary Energy'"), ], ) def test_stray_tag_fails(subfolder, match): """Check that stray brackets from, e.g. typos in a tag, raises expected error""" with raises(ValueError, match=match): - VariableCodeList.from_directory( + code_list = VariableCodeList.from_directory( "variable", MODULE_TEST_DATA_DIR / "stray_tag" / subfolder ) + code_list.check_illegal_characters(NomenclatureConfig(dimensions=["variable"])) + + +def test_illegal_char_fails(): + """Check that illegal character raises expected error.""" + match = r"Unexpected character in variable: 'Primary Energy\|Coal'" + with raises(ValueError, match=match): + DataStructureDefinition( + MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_str" / "definitions" + ) + + +def test_illegal_char_ignores_external_repo(): + """Check that external repos are excluded from this check.""" + # the config includes illegal characters known to be in common-definitions + # the test will not raise errors as the check is skipped for external repos + DataStructureDefinition( + MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_external_repo" / "definitions" + ) def test_end_whitespace_fails():