From ca3a1c511ee6879e7af5aeece2d44492b7e06071 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Wed, 16 Oct 2024 19:08:32 +0200 Subject: [PATCH 1/8] Allow checking illegal characters defined in nomenclature.yaml --- nomenclature/codelist.py | 58 +++++++++++++++++++------------------- nomenclature/config.py | 6 ++++ nomenclature/definition.py | 1 + 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 1233a2a4..391eefe8 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -43,35 +43,6 @@ class CodeList(BaseModel): def __eq__(self, other): return self.name == other.name and self.mapping == other.mapping - @field_validator("mapping") - @classmethod - def check_stray_tag( - cls, v: Dict[str, Code], info: ValidationInfo - ) -> Dict[str, Code]: - """Check that no stray tags are left in codes after tag replacement""" - forbidden = ["{", "}"] - - def _check_string(value): - if isinstance(value, str): - if any(char in value for char in forbidden): - raise ValueError( - f"Unexpected bracket in {info.data['name']}: '{code.name}'." - " Check if tags were spelled correctly." - ) - elif isinstance(value, dict): - for v in value.values(): - _check_string(v) - elif isinstance(value, list): - for item in value: - _check_string(item) - - for code in v.values(): - for attr in code.model_fields: - if attr not in ["file", "repository"]: - value = getattr(code, attr) - _check_string(value) - return v - @field_validator("mapping") @classmethod def check_end_whitespace( @@ -364,6 +335,35 @@ def read_excel(cls, name, source, sheet_name, col, attrs=None): return cls(name=name, mapping=mapp) + def check_illegal_characters( + self, config: NomenclatureConfig = None + ) -> Dict[str, Code]: + """Check that no illegal characters are left in codes after tag replacement""" + forbidden = ["{", "}"] + if config and config.illegal_characters: + forbidden += config.illegal_characters + + def _check_string(value): + if isinstance(value, str): + if any(char in value for char in forbidden): + raise ValueError( + f"Unexpected character in {self.name}: '{code.name}'." + " Check for illegal characters and/or if tags were spelled correctly." + ) + elif isinstance(value, dict): + for v in value.values(): + _check_string(v) + elif isinstance(value, list): + for item in value: + _check_string(item) + + for code in self.mapping.values(): + if not code.repository: + for attr in code.model_fields: + if attr not in ["file", "repository"]: + value = getattr(code, attr) + _check_string(value) + def to_yaml(self, path=None): """Write mapping to yaml file or return as stream diff --git a/nomenclature/config.py b/nomenclature/config.py index 4d20d32e..5801034e 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -159,9 +159,15 @@ class NomenclatureConfig(BaseModel): repositories: dict[str, Repository] = Field(default_factory=dict) definitions: DataStructureConfig = Field(default_factory=DataStructureConfig) mappings: RegionMappingConfig = Field(default_factory=RegionMappingConfig) + illegal_characters: None | list[str] = None model_config = ConfigDict(use_enum_values=True) + @field_validator("illegal_characters", mode="before") + @classmethod + def check_illegal_characters(cls, v: str | list[str]) -> list[str]: + return v if isinstance(v, list) else [v] + @model_validator(mode="after") @classmethod def check_definitions_repository( diff --git a/nomenclature/definition.py b/nomenclature/definition.py index 8a93d1ea..394e88fd 100644 --- a/nomenclature/definition.py +++ b/nomenclature/definition.py @@ -76,6 +76,7 @@ def __init__(self, path, dimensions=None): self.__setattr__( dim, codelist_cls.from_directory(dim, path / dim, self.config) ) + getattr(self, dim).check_illegal_characters(self.config) if empty := [d for d in self.dimensions if not getattr(self, d)]: raise ValueError(f"Empty codelist: {', '.join(empty)}") From 08165617f55ed78bdd3e1fefa19d09c34509133d Mon Sep 17 00:00:00 2001 From: David Almeida Date: Wed, 16 Oct 2024 19:08:43 +0200 Subject: [PATCH 2/8] Update testing --- .../char_in_external_repo/nomenclature.yaml | 11 ++++++++ .../definitions/variable/variables.yaml | 7 +++++ .../char_in_str/nomenclature.yaml | 3 ++ .../variables.yaml | 0 .../variables.yaml | 0 .../{char_in_str => tag_in_str}/tag_fuel.yaml | 0 .../variables.yaml | 0 tests/test_codelist.py | 28 ++++++++++++++++--- 8 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml create mode 100644 tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml create mode 100644 tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml rename tests/data/codelist/stray_tag/{char_in_dict => tag_in_dict}/variables.yaml (100%) rename tests/data/codelist/stray_tag/{char_in_list => tag_in_list}/variables.yaml (100%) rename tests/data/codelist/stray_tag/{char_in_str => tag_in_str}/tag_fuel.yaml (100%) rename tests/data/codelist/stray_tag/{char_in_str => tag_in_str}/variables.yaml (100%) diff --git a/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml b/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml new file mode 100644 index 00000000..416164fe --- /dev/null +++ b/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml @@ -0,0 +1,11 @@ +dimensions: + - variable +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + hash: 3b75bb9 +definitions: + variable: + repository: + - common-definitions +illegal_characters: ['"' , ";"] # these are known to be present in common-definitions variables \ No newline at end of file diff --git a/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml b/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml new file mode 100644 index 00000000..19da6e1f --- /dev/null +++ b/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml @@ -0,0 +1,7 @@ +- Primary Energy: + definition: Total primary energy consumption + unit: EJ/yr +- Primary Energy|Coal: + definition: Primary energy consumption of coal + unit: EJ/yr + info: Additional info with illegal " character \ No newline at end of file diff --git a/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml b/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml new file mode 100644 index 00000000..a0e942bc --- /dev/null +++ b/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml @@ -0,0 +1,3 @@ +dimensions: + - variable +illegal_characters: ['"' , ";"] \ No newline at end of file diff --git a/tests/data/codelist/stray_tag/char_in_dict/variables.yaml b/tests/data/codelist/stray_tag/tag_in_dict/variables.yaml similarity index 100% rename from tests/data/codelist/stray_tag/char_in_dict/variables.yaml rename to tests/data/codelist/stray_tag/tag_in_dict/variables.yaml diff --git a/tests/data/codelist/stray_tag/char_in_list/variables.yaml b/tests/data/codelist/stray_tag/tag_in_list/variables.yaml similarity index 100% rename from tests/data/codelist/stray_tag/char_in_list/variables.yaml rename to tests/data/codelist/stray_tag/tag_in_list/variables.yaml diff --git a/tests/data/codelist/stray_tag/char_in_str/tag_fuel.yaml b/tests/data/codelist/stray_tag/tag_in_str/tag_fuel.yaml similarity index 100% rename from tests/data/codelist/stray_tag/char_in_str/tag_fuel.yaml rename to tests/data/codelist/stray_tag/tag_in_str/tag_fuel.yaml diff --git a/tests/data/codelist/stray_tag/char_in_str/variables.yaml b/tests/data/codelist/stray_tag/tag_in_str/variables.yaml similarity index 100% rename from tests/data/codelist/stray_tag/char_in_str/variables.yaml rename to tests/data/codelist/stray_tag/tag_in_str/variables.yaml diff --git a/tests/test_codelist.py b/tests/test_codelist.py index a5c36428..f49c3c56 100644 --- a/tests/test_codelist.py +++ b/tests/test_codelist.py @@ -12,6 +12,7 @@ MetaCodeList, ) from nomenclature.config import NomenclatureConfig +from nomenclature.definition import DataStructureDefinition from conftest import TEST_DATA_DIR, clean_up_external_repos @@ -224,17 +225,36 @@ def test_to_csv(): @pytest.mark.parametrize( "subfolder, match", [ - ("char_in_str", r"Unexpected bracket in variable: 'Primary Energy\|{Feul}'"), - ("char_in_list", r"Unexpected bracket in variable: 'Share\|Coal'"), - ("char_in_dict", r"Unexpected bracket in variable: 'Primary Energy'"), + ("tag_in_str", r"Unexpected character in variable: 'Primary Energy\|{Feul}'"), + ("tag_in_list", r"Unexpected character in variable: 'Share\|Coal'"), + ("tag_in_dict", r"Unexpected character in variable: 'Primary Energy'"), ], ) def test_stray_tag_fails(subfolder, match): """Check that stray brackets from, e.g. typos in a tag, raises expected error""" with raises(ValueError, match=match): - VariableCodeList.from_directory( + code_list = VariableCodeList.from_directory( "variable", MODULE_TEST_DATA_DIR / "stray_tag" / subfolder ) + code_list.check_illegal_characters() + + +def test_illegal_char_fails(): + """Check that illegal character raises expected error.""" + match = r"Unexpected character in variable: 'Primary Energy\|Coal'" + with raises(ValueError, match=match): + DataStructureDefinition( + MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_str" / "definitions" + ) + + +def test_illegal_char_ignores_external_repo(): + """Check that external repos are excluded from this check.""" + # the config includes illegal characters known to be in common-definitions + # the test will not raise errors as the check is skipped for external repos + DataStructureDefinition( + MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_external_repo" / "definitions" + ) def test_end_whitespace_fails(): From 61a71d9793f3f77ffebba1833fd05d5f8299ec11 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Wed, 13 Nov 2024 14:06:28 +0100 Subject: [PATCH 3/8] Add defaults and check_illegal_characters field --- nomenclature/codelist.py | 18 ++++++++++-------- nomenclature/config.py | 5 +++-- .../char_in_external_repo/nomenclature.yaml | 3 ++- .../nomenclature.yaml | 1 + tests/test_codelist.py | 2 +- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 391eefe8..2824dffe 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -335,22 +335,24 @@ def read_excel(cls, name, source, sheet_name, col, attrs=None): return cls(name=name, mapping=mapp) - def check_illegal_characters( - self, config: NomenclatureConfig = None - ) -> Dict[str, Code]: + def check_illegal_characters(self, config: NomenclatureConfig) -> Dict[str, Code]: """Check that no illegal characters are left in codes after tag replacement""" - forbidden = ["{", "}"] - if config and config.illegal_characters: - forbidden += config.illegal_characters + illegal = ( + ["{", "}"] + config.illegal_characters + if config.check_illegal_characters + else ["{", "}"] + ) def _check_string(value): if isinstance(value, str): - if any(char in value for char in forbidden): + if any(char in value for char in illegal): raise ValueError( f"Unexpected character in {self.name}: '{code.name}'." " Check for illegal characters and/or if tags were spelled correctly." ) elif isinstance(value, dict): + for k in value.keys(): + _check_string(k) for v in value.values(): _check_string(v) elif isinstance(value, list): @@ -360,7 +362,7 @@ def _check_string(value): for code in self.mapping.values(): if not code.repository: for attr in code.model_fields: - if attr not in ["file", "repository"]: + if attr != "file": value = getattr(code, attr) _check_string(value) diff --git a/nomenclature/config.py b/nomenclature/config.py index 5801034e..e6c0c298 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -159,13 +159,14 @@ class NomenclatureConfig(BaseModel): repositories: dict[str, Repository] = Field(default_factory=dict) definitions: DataStructureConfig = Field(default_factory=DataStructureConfig) mappings: RegionMappingConfig = Field(default_factory=RegionMappingConfig) - illegal_characters: None | list[str] = None + check_illegal_characters: bool = True + illegal_characters: list[str] = ["'", ":", ";"] model_config = ConfigDict(use_enum_values=True) @field_validator("illegal_characters", mode="before") @classmethod - def check_illegal_characters(cls, v: str | list[str]) -> list[str]: + def check_illegal_chars(cls, v: str | list[str]) -> list[str]: return v if isinstance(v, list) else [v] @model_validator(mode="after") diff --git a/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml b/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml index 416164fe..76c519bf 100644 --- a/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml +++ b/tests/data/codelist/illegal_chars/char_in_external_repo/nomenclature.yaml @@ -8,4 +8,5 @@ definitions: variable: repository: - common-definitions -illegal_characters: ['"' , ";"] # these are known to be present in common-definitions variables \ No newline at end of file +check_illegal_characters: true +illegal_characters: ['"' , ";"] # these are known to be present in common-definitions variables diff --git a/tests/data/config/general-config-only-country/nomenclature.yaml b/tests/data/config/general-config-only-country/nomenclature.yaml index 7616a4d1..a0794834 100644 --- a/tests/data/config/general-config-only-country/nomenclature.yaml +++ b/tests/data/config/general-config-only-country/nomenclature.yaml @@ -3,3 +3,4 @@ dimensions: definitions: region: country: true +illegal_characters: [";", ":"] diff --git a/tests/test_codelist.py b/tests/test_codelist.py index f49c3c56..ada18690 100644 --- a/tests/test_codelist.py +++ b/tests/test_codelist.py @@ -236,7 +236,7 @@ def test_stray_tag_fails(subfolder, match): code_list = VariableCodeList.from_directory( "variable", MODULE_TEST_DATA_DIR / "stray_tag" / subfolder ) - code_list.check_illegal_characters() + code_list.check_illegal_characters(NomenclatureConfig(dimensions=["variable"])) def test_illegal_char_fails(): From 2ea110e2e537860f74091254e241c1ec866a5dd5 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Thu, 14 Nov 2024 12:34:49 +0100 Subject: [PATCH 4/8] Add `ErrorCollector` and pydantic `model_dump` --- nomenclature/codelist.py | 17 ++++++++++------- .../char_in_external_repo/common-definitions | 1 + 2 files changed, 11 insertions(+), 7 deletions(-) create mode 160000 tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 2824dffe..f929b67f 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -342,13 +342,16 @@ def check_illegal_characters(self, config: NomenclatureConfig) -> Dict[str, Code if config.check_illegal_characters else ["{", "}"] ) + errors = ErrorCollector() def _check_string(value): if isinstance(value, str): if any(char in value for char in illegal): - raise ValueError( - f"Unexpected character in {self.name}: '{code.name}'." - " Check for illegal characters and/or if tags were spelled correctly." + errors.append( + ValueError( + f"Unexpected character in {self.name}: '{code.name}'." + " Check for illegal characters and/or if tags were spelled correctly." + ) ) elif isinstance(value, dict): for k in value.keys(): @@ -361,10 +364,10 @@ def _check_string(value): for code in self.mapping.values(): if not code.repository: - for attr in code.model_fields: - if attr != "file": - value = getattr(code, attr) - _check_string(value) + for value in code.model_dump(exclude="file").values(): + _check_string(value) + if errors: + raise ValueError(errors) def to_yaml(self, path=None): """Write mapping to yaml file or return as stream diff --git a/tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions b/tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions new file mode 160000 index 00000000..3b75bb94 --- /dev/null +++ b/tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions @@ -0,0 +1 @@ +Subproject commit 3b75bb946658e55affe4108ddcaac281058832cd From 1e3b8bb1812afe71603bd1c64c0d3fbfeaabe7c5 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Thu, 14 Nov 2024 12:40:27 +0100 Subject: [PATCH 5/8] Delete common-definitions stray index --- .../illegal_chars/char_in_external_repo/common-definitions | 1 - 1 file changed, 1 deletion(-) delete mode 160000 tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions diff --git a/tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions b/tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions deleted file mode 160000 index 3b75bb94..00000000 --- a/tests/data/codelist/illegal_chars/char_in_external_repo/common-definitions +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3b75bb946658e55affe4108ddcaac281058832cd From f06ae3c1fb209d1e84a08073c2525bd97553b66e Mon Sep 17 00:00:00 2001 From: David Almeida Date: Wed, 11 Dec 2024 10:39:05 +0100 Subject: [PATCH 6/8] Add newlines --- .../char_in_str/definitions/variable/variables.yaml | 2 +- tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml | 2 +- tests/data/codelist/stray_tag/tag_in_list/variables.yaml | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml b/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml index 19da6e1f..24e2d1ce 100644 --- a/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml +++ b/tests/data/codelist/illegal_chars/char_in_str/definitions/variable/variables.yaml @@ -4,4 +4,4 @@ - Primary Energy|Coal: definition: Primary energy consumption of coal unit: EJ/yr - info: Additional info with illegal " character \ No newline at end of file + info: Additional info with illegal " character diff --git a/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml b/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml index a0e942bc..199d9411 100644 --- a/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml +++ b/tests/data/codelist/illegal_chars/char_in_str/nomenclature.yaml @@ -1,3 +1,3 @@ dimensions: - variable -illegal_characters: ['"' , ";"] \ No newline at end of file +illegal_characters: ['"' , ";"] diff --git a/tests/data/codelist/stray_tag/tag_in_list/variables.yaml b/tests/data/codelist/stray_tag/tag_in_list/variables.yaml index 00672726..6f9e7d17 100644 --- a/tests/data/codelist/stray_tag/tag_in_list/variables.yaml +++ b/tests/data/codelist/stray_tag/tag_in_list/variables.yaml @@ -11,4 +11,3 @@ - Valid information. - Invalid bracket { information. - Another valid information. - From bd8fc17dff07e03de55864b7509d49640c229a9d Mon Sep 17 00:00:00 2001 From: David Almeida <58078834+dc-almeida@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:22:57 +0100 Subject: [PATCH 7/8] Allow single quote, disallow double-quote Co-authored-by: Daniel Huppmann --- nomenclature/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomenclature/config.py b/nomenclature/config.py index cf11eb89..3fef01b2 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -162,7 +162,7 @@ class NomenclatureConfig(BaseModel): definitions: DataStructureConfig = Field(default_factory=DataStructureConfig) mappings: RegionMappingConfig = Field(default_factory=RegionMappingConfig) check_illegal_characters: bool = True - illegal_characters: list[str] = ["'", ":", ";"] + illegal_characters: list[str] = [":", ";", '"'] model_config = ConfigDict(use_enum_values=True) From 493a6bd43d630abb90e796b3385bc59dd83adffc Mon Sep 17 00:00:00 2001 From: David Almeida Date: Fri, 13 Dec 2024 13:32:17 +0100 Subject: [PATCH 8/8] Fix typing --- nomenclature/codelist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 8e9ae114..685bb483 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -334,7 +334,7 @@ def read_excel(cls, name, source, sheet_name, col, attrs=None): return cls(name=name, mapping=mapp) - def check_illegal_characters(self, config: NomenclatureConfig) -> Dict[str, Code]: + def check_illegal_characters(self, config: NomenclatureConfig) -> dict[str, Code]: """Check that no illegal characters are left in codes after tag replacement""" illegal = ( ["{", "}"] + config.illegal_characters