Skip to content

Commit

Permalink
adds 'type' field to the parser to allow for a single field to return…
Browse files Browse the repository at this point in the history
… a list of specified values
  • Loading branch information
pipliggins committed Jan 31, 2025
1 parent 853f3a4 commit 0131b88
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 61 deletions.
16 changes: 16 additions & 0 deletions docs/specification.md
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,22 @@ When the parser encounters e.g. `Homme` or `FEMME` in the data it will still mat
`Home` will return `null`, but strips any leading or trailing whitespace so `" FEMME "`
will also match to `female`.

#### Field with lists of values

If a field requires a list of values, a `type` of `enum_list` can be added to the rule:
```toml
[table.symptoms]
field = "ReportedSymptoms"
type = "enum_list"
values = { "high temp" = "fever", headache = "cephalalgia", "muscle aches"="myalgia" }
ignoreMissingKey = true
```

When the parser is given a list, either in square brackets, e.g. `'[high temp, headache]'`,
or as a comma-separated string, e.g. `"muscle aches, high temp"`, it will attempt to
convert the string into a list of values and find matches for the listed values. As with
a standard value mapping field, it can be tagged to be case insensitive and to return
all values it cannot match.

### Combined type

Expand Down
6 changes: 6 additions & 0 deletions schemas/dev.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@
"can_skip": {
"const": true,
"description": "Indicates to the parser whether the field can be skipped without throwing an error if missing in the data."
},
"type": {
"enum": [
"enum_list"
],
"description": "If the field is a single field list"
}
}
}
Expand Down
138 changes: 77 additions & 61 deletions src/adtl/get_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,79 @@ def skip_field(row: StrDict, rule: StrDict, ctx: Context = None):
return False


def _resolve_apply_param(param, row: StrDict):
    """Return ``row[name]`` for a ``"$name"`` reference, else ``param`` unchanged."""
    if isinstance(param, str) and param.startswith("$"):
        return row[param[1:]]
    return param


def apply_fuction(value, row: StrDict, rule: StrDict, ctx: Context):
    """Apply the data transformation named in ``rule["apply"]`` to ``value``.

    ``rule["apply"]["function"]`` names an attribute of the ``tf`` module.
    ``rule["apply"]["params"]`` (optional) lists extra positional arguments:
    a string starting with ``$`` is replaced by the value of that field in
    ``row``; a list has each of its ``$``-prefixed string elements replaced
    the same way (one level deep); anything else is passed through as-is.

    Returns the transformed value. If the transformation emits an
    ``AdtlTransformationWarning``, the untransformed ``value`` is returned
    (and the warning re-issued) when ``ctx["returnUnmatched"]`` is truthy;
    otherwise the message is logged as an error and ``None`` is returned.

    Raises:
        AttributeError: if the named function does not exist on ``tf``.
        KeyError: if a ``$``-reference names a field missing from ``row``.
    """
    apply_rule = rule["apply"]
    transformation = apply_rule["function"]

    params = []
    for param in apply_rule.get("params", []):
        if isinstance(param, list):
            params.append([_resolve_apply_param(item, row) for item in param])
        else:
            params.append(_resolve_apply_param(param, row))

    # Look the function up on its own so that an AttributeError raised *inside*
    # a transformation is not misreported as "function not defined".
    try:
        transform = getattr(tf, transformation)
    except AttributeError as err:
        raise AttributeError(
            f"Error using a data transformation: Function {transformation} "
            "has not been defined."
        ) from err

    try:
        with warnings.catch_warnings():
            # Promote transformation warnings to exceptions so they can be
            # handled below instead of being emitted directly.
            warnings.simplefilter("error", category=AdtlTransformationWarning)
            return transform(value, *params)
    except AdtlTransformationWarning as e:
        if ctx and ctx.get("returnUnmatched"):
            # Re-issue outside catch_warnings() so normal filters apply.
            warnings.warn(str(e), AdtlTransformationWarning)
            return value
        logging.error(str(e))
        return None


def convert_values(value, rule: StrDict, ctx: Context) -> str | list[str | None] | None:
if rule.get("type") == "enum_list":
try:
value = [v.lstrip(" ").rstrip(" ") for v in value.strip("[]").split(",")]
new_rule = {k: v for k, v in rule.items() if k != "type"}
value = [convert_values(v, new_rule, ctx) for v in value]
return value
except Exception as e:
logging.debug(f"Error converting {value} to a list: {e}")
return value

if rule.get("caseInsensitive") and isinstance(value, str):
value = value.lower().lstrip(" ").rstrip(" ")
rule["values"] = {k.lower(): v for k, v in rule["values"].items()}

if rule.get("ignoreMissingKey") or (ctx and ctx.get("returnUnmatched")):
value = rule["values"].get(value, value)
else:
value = rule["values"].get(value)

# recheck if value is empty after mapping (use to map values to None)
return None if value == "" else value


# main functions


Expand Down Expand Up @@ -85,9 +158,8 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
and should not be called directly, except for debugging. Use
get_value() instead.
"""
if not isinstance(rule, dict) or isinstance(
rule, list
): # not a container, is constant
if not isinstance(rule, dict) or isinstance(rule, list):
# not a container, is constant
return rule
# Check whether field is present if it's allowed to be passed over
if "field" in rule:
Expand All @@ -99,67 +171,11 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
return None
value = row[rule["field"]]
if "apply" in rule:
# apply data transformations.
transformation = rule["apply"]["function"]
params = None
if "params" in rule["apply"]:
params = []
for i in range(len(rule["apply"]["params"])):
if isinstance(rule["apply"]["params"][i], str) and rule["apply"][
"params"
][i].startswith("$"):
params.append(row[rule["apply"]["params"][i][1:]])
elif isinstance(rule["apply"]["params"][i], list):
param = [
(
row[rule["apply"]["params"][i][j][1:]]
if (
isinstance(rule["apply"]["params"][i][j], str)
and rule["apply"]["params"][i][j].startswith("$")
)
else rule["apply"]["params"][i][j]
)
for j in range(len(rule["apply"]["params"][i]))
]
params.append(param)
else:
params.append(rule["apply"]["params"][i])

try:
with warnings.catch_warnings():
warnings.simplefilter("error", category=AdtlTransformationWarning)
if params:
value = getattr(tf, transformation)(value, *params)
else:
value = getattr(tf, transformation)(value)
except AttributeError:
raise AttributeError(
f"Error using a data transformation: Function {transformation} "
"has not been defined."
)
except AdtlTransformationWarning as e:
if ctx and ctx.get("returnUnmatched"):
warnings.warn(str(e), AdtlTransformationWarning)
return value
else:
logging.error(str(e))
return None
return value
value = apply_fuction(value, row, rule, ctx)
if value == "":
return None
if "values" in rule:
if rule.get("caseInsensitive") and isinstance(value, str):
value = value.lower().lstrip(" ").rstrip(" ")
rule["values"] = {k.lower(): v for k, v in rule["values"].items()}

if rule.get("ignoreMissingKey") or (ctx and ctx.get("returnUnmatched")):
value = rule["values"].get(value, value)
else:
value = rule["values"].get(value)

# recheck if value is empty after mapping (use to map values to None)
if value == "":
return None
value = convert_values(value, rule, ctx)
# Either source_unit / unit OR source_date / date triggers conversion
# do not parse units if value is empty
if "source_unit" in rule and "unit" in rule:
Expand Down
34 changes: 34 additions & 0 deletions tests/test_adtl/test_get_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@
"can_skip": True,
}

# Rule for a single field holding a list of values ("type": "enum_list");
# each listed item is looked up individually in "values".
RULE_ENUM_LIST = {
"field": "symptoms",
"type": "enum_list",
"values": {"high temp": "fever", "head pain": "headache"},
}


@pytest.mark.parametrize(
"row_rule,expected",
Expand Down Expand Up @@ -97,6 +103,34 @@
(({"aidshiv": "1"}, RULE_FIELD_OPTION_SKIP), None),
(({"aidshiv_mhyn": "1"}, RULE_FIELD_OPTION_SKIP), True),
(({"aidshiv_mhyn": "2"}, RULE_FIELD_OPTION_SKIP), None),
(
(
{"symptoms": "[high temp, head pain]"},
RULE_ENUM_LIST,
),
["fever", "headache"],
),
(
(
{"symptoms": "[high temp, fatigue]"},
RULE_ENUM_LIST,
),
["fever", None],
),
(
(
{"symptoms": "[high temp, fatigue]"},
RULE_ENUM_LIST | {"ignoreMissingKey": True},
),
["fever", "fatigue"],
),
(
(
{"symptoms": "[high temp; fatigue]"},
RULE_ENUM_LIST | {"ignoreMissingKey": True},
),
["fever", "fatigue"],
),
],
)
def test_get_value_single_field(row_rule, expected):
Expand Down

0 comments on commit 0131b88

Please sign in to comment.