From 0131b88a3b4f1f4ee16297f1b36fd1d7a257f565 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 31 Jan 2025 16:42:54 +0000 Subject: [PATCH] adds 'type' field to the parser to allow for a single field to return a list of specified values --- docs/specification.md | 16 ++++ schemas/dev.schema.json | 6 ++ src/adtl/get_value.py | 138 +++++++++++++++++------------- tests/test_adtl/test_get_value.py | 34 ++++++++ 4 files changed, 133 insertions(+), 61 deletions(-) diff --git a/docs/specification.md b/docs/specification.md index c69953b..bd5cf03 100644 --- a/docs/specification.md +++ b/docs/specification.md @@ -310,6 +310,22 @@ When the parser encounters e.g. `Homme` or `FEMME` in the data it will still mat `Home` will return `null`, but strips and leading or trailing whitespace so `" FEMME "` will also match to `female`. +#### Field with lists of values + +If a field requires a list of values, a `type` of `enum_list` can be added to the rule: +```toml +[table.symptoms] +field = "ReportedSymptoms" +type = "enum_list" +values = { "high temp" = "fever", headache = "cephalalgia", "muscle aches"="myalgia" } +ignoreMissingKey = true +``` + +When the parser is given a list either in square brackets, e.g. `'[high temp, headache]'` +or as a comma-separated string e.g. `"muscle aches, high temp"` it will attempt to +convert the string into a list of values and find matches for the listed values. As with +a standard value mapping field, it can be tagged to be case insensitive and to return +all values it cannot match. ### Combined type diff --git a/schemas/dev.schema.json b/schemas/dev.schema.json index 3624c12..4afe25a 100644 --- a/schemas/dev.schema.json +++ b/schemas/dev.schema.json @@ -88,6 +88,12 @@ "can_skip": { "const": true, "description": "Indicates to the parser whether the field can be skipped without throwing an error if missing in the data." 
+ }, + "type": { + "enum": [ + "enum_list" + ], + "description": "If the field is a single field list" } } } diff --git a/src/adtl/get_value.py b/src/adtl/get_value.py index f2a4f74..575473c 100644 --- a/src/adtl/get_value.py +++ b/src/adtl/get_value.py @@ -54,6 +54,79 @@ def skip_field(row: StrDict, rule: StrDict, ctx: Context = None): return False +def apply_fuction(value, row: StrDict, rule: StrDict, ctx: Context): + # apply data transformations. + transformation = rule["apply"]["function"] + params = None + if "params" in rule["apply"]: + params = [] + for i in range(len(rule["apply"]["params"])): + if isinstance(rule["apply"]["params"][i], str) and rule["apply"]["params"][ + i + ].startswith("$"): + params.append(row[rule["apply"]["params"][i][1:]]) + elif isinstance(rule["apply"]["params"][i], list): + param = [ + ( + row[rule["apply"]["params"][i][j][1:]] + if ( + isinstance(rule["apply"]["params"][i][j], str) + and rule["apply"]["params"][i][j].startswith("$") + ) + else rule["apply"]["params"][i][j] + ) + for j in range(len(rule["apply"]["params"][i])) + ] + params.append(param) + else: + params.append(rule["apply"]["params"][i]) + + try: + with warnings.catch_warnings(): + warnings.simplefilter("error", category=AdtlTransformationWarning) + if params: + value = getattr(tf, transformation)(value, *params) + else: + value = getattr(tf, transformation)(value) + except AttributeError: + raise AttributeError( + f"Error using a data transformation: Function {transformation} " + "has not been defined." 
+ ) + except AdtlTransformationWarning as e: + if ctx and ctx.get("returnUnmatched"): + warnings.warn(str(e), AdtlTransformationWarning) + return value + else: + logging.error(str(e)) + return None + return value + + +def convert_values(value, rule: StrDict, ctx: Context) -> str | list[str | None] | None: + if rule.get("type") == "enum_list": + try: + value = [v.lstrip(" ").rstrip(" ") for v in value.strip("[]").split(",")] + new_rule = {k: v for k, v in rule.items() if k != "type"} + value = [convert_values(v, new_rule, ctx) for v in value] + return value + except Exception as e: + logging.debug(f"Error converting {value} to a list: {e}") + return value + + if rule.get("caseInsensitive") and isinstance(value, str): + value = value.lower().lstrip(" ").rstrip(" ") + rule["values"] = {k.lower(): v for k, v in rule["values"].items()} + + if rule.get("ignoreMissingKey") or (ctx and ctx.get("returnUnmatched")): + value = rule["values"].get(value, value) + else: + value = rule["values"].get(value) + + # recheck if value is empty after mapping (use to map values to None) + return None if value == "" else value + + # main functions @@ -85,9 +158,8 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: and should not be called directly, except for debugging. Use get_value() instead. """ - if not isinstance(rule, dict) or isinstance( - rule, list - ): # not a container, is constant + if not isinstance(rule, dict) or isinstance(rule, list): + # not a container, is constant return rule # Check whether field is present if it's allowed to be passed over if "field" in rule: @@ -99,67 +171,11 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: return None value = row[rule["field"]] if "apply" in rule: - # apply data transformations. 
- transformation = rule["apply"]["function"] - params = None - if "params" in rule["apply"]: - params = [] - for i in range(len(rule["apply"]["params"])): - if isinstance(rule["apply"]["params"][i], str) and rule["apply"][ - "params" - ][i].startswith("$"): - params.append(row[rule["apply"]["params"][i][1:]]) - elif isinstance(rule["apply"]["params"][i], list): - param = [ - ( - row[rule["apply"]["params"][i][j][1:]] - if ( - isinstance(rule["apply"]["params"][i][j], str) - and rule["apply"]["params"][i][j].startswith("$") - ) - else rule["apply"]["params"][i][j] - ) - for j in range(len(rule["apply"]["params"][i])) - ] - params.append(param) - else: - params.append(rule["apply"]["params"][i]) - - try: - with warnings.catch_warnings(): - warnings.simplefilter("error", category=AdtlTransformationWarning) - if params: - value = getattr(tf, transformation)(value, *params) - else: - value = getattr(tf, transformation)(value) - except AttributeError: - raise AttributeError( - f"Error using a data transformation: Function {transformation} " - "has not been defined." 
- ) - except AdtlTransformationWarning as e: - if ctx and ctx.get("returnUnmatched"): - warnings.warn(str(e), AdtlTransformationWarning) - return value - else: - logging.error(str(e)) - return None - return value + value = apply_fuction(value, row, rule, ctx) if value == "": return None if "values" in rule: - if rule.get("caseInsensitive") and isinstance(value, str): - value = value.lower().lstrip(" ").rstrip(" ") - rule["values"] = {k.lower(): v for k, v in rule["values"].items()} - - if rule.get("ignoreMissingKey") or (ctx and ctx.get("returnUnmatched")): - value = rule["values"].get(value, value) - else: - value = rule["values"].get(value) - - # recheck if value is empty after mapping (use to map values to None) - if value == "": - return None + value = convert_values(value, rule, ctx) # Either source_unit / unit OR source_date / date triggers conversion # do not parse units if value is empty if "source_unit" in rule and "unit" in rule: diff --git a/tests/test_adtl/test_get_value.py b/tests/test_adtl/test_get_value.py index 824dc01..cf66fe5 100644 --- a/tests/test_adtl/test_get_value.py +++ b/tests/test_adtl/test_get_value.py @@ -67,6 +67,12 @@ "can_skip": True, } +RULE_ENUM_LIST = { + "field": "symptoms", + "type": "enum_list", + "values": {"high temp": "fever", "head pain": "headache"}, +} + @pytest.mark.parametrize( "row_rule,expected", @@ -97,6 +103,34 @@ (({"aidshiv": "1"}, RULE_FIELD_OPTION_SKIP), None), (({"aidshiv_mhyn": "1"}, RULE_FIELD_OPTION_SKIP), True), (({"aidshiv_mhyn": "2"}, RULE_FIELD_OPTION_SKIP), None), + ( + ( + {"symptoms": "[high temp, head pain]"}, + RULE_ENUM_LIST, + ), + ["fever", "headache"], + ), + ( + ( + {"symptoms": "[high temp, fatigue]"}, + RULE_ENUM_LIST, + ), + ["fever", None], + ), + ( + ( + {"symptoms": "[high temp, fatigue]"}, + RULE_ENUM_LIST | {"ignoreMissingKey": True}, + ), + ["fever", "fatigue"], + ), + ( + ( + {"symptoms": "[high temp; fatigue]"}, + RULE_ENUM_LIST | {"ignoreMissingKey": True}, + ), + ["fever", 
"fatigue"], + ), ], ) def test_get_value_single_field(row_rule, expected):