Misc edits from generating paper data (#113)

globaldothealth · Dec 10, 2024 · 9854464 · 9854464
1 parent 96c49a1
commit 9854464
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 24 deletions.
diff --git a/src/adtl/autoparser/config/autoparser.toml b/src/adtl/autoparser/config/autoparser.toml
@@ -2,6 +2,7 @@
 name = "autoparser"
 description = "Autoparser config for generating CSV mappings and TOML from autoparser-generated dictionaries"
 
+# NOTE: currently unused as only autoparser-generated data dicts are supported ---
 # Used by parse_choices() to generate values mapping
 # Using the delimiters below, we can parse this string:
 #   oui=True, non=False
@@ -10,6 +11,7 @@ description = "Autoparser config for generating CSV mappings and TOML from autop
 
 choice_delimiter = ","
 choice_delimiter_map = "="
+# -------------------------------------------------------------------------------
 
 # max number of references to use in the parser file
 num_refs = 3

diff --git a/src/adtl/autoparser/language_models/data_structures.py b/src/adtl/autoparser/language_models/data_structures.py
@@ -9,7 +9,7 @@
 
 class SingleField(BaseModel):
     field_name: str
-    translation: str | None
+    translation: str
 
 
 class ColumnDescriptionRequest(BaseModel):

diff --git a/src/adtl/autoparser/make_toml.py b/src/adtl/autoparser/make_toml.py
@@ -108,15 +108,11 @@ def __init__(
 
     @property
     def parsed_choices(self) -> pd.Series:
-        """Returns the mapped values for each taget field"""
+        """Returns the mapped values for each target field"""
         try:
             return self._parsed_choices
         except AttributeError:
-
-            def _parse_choices(s: str):
-                return parse_choices(self.config, s)
-
-            self._parsed_choices = self.mappings.value_mapping.map(_parse_choices)
+            self._parsed_choices = self.mappings.value_mapping.map(parse_choices)
             self._parsed_choices.index = self.mappings.target_field
             return self._parsed_choices
 
@@ -307,7 +303,7 @@ def main():
     )
     args = parser.parse_args()
 
-    schema_path = Path(args.schema)
+    schema_path = Path(args.schema_path)
 
     ParserGenerator(
         args.mappings,

diff --git a/src/adtl/autoparser/util.py b/src/adtl/autoparser/util.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import json
+import re
 from pathlib import Path
 from typing import Any, Dict
 
@@ -59,35 +60,42 @@ def read_data(file: str | Path | pd.DataFrame, file_type: str):
         )
 
 
-def parse_choices(config, s: str) -> Dict[str, Any]:
-    delimiter = config["choice_delimiter"]
-    delimiter_map = config["choice_delimiter_map"]
+def parse_choices(s: str) -> Dict[str, Any] | None:
+    """
+    Takes the choices string produced by the LLM and turns it into key-value pairs.
+
+    "oui=True, non=False, blah=None" -> {"oui": True, "non": False, "blah": ""}
+    "vivant=alive, décédé=dead, " "=None" -> {"vivant": "alive", "décédé": "dead"}
+    {2: True} -> None
+    "" " = " ", poisson=fish" -> {"poisson": "fish"}
+    "ecouvillon+croûte=[swab, crust], ecouvillon=[swab]" ->
+            {"ecouvillon+croûte": ["swab", "crust"], "ecouvillon": ["swab"]}
+
+    """
 
-    lower_string = lambda s: s.strip().lower()  # NOQA
     if not isinstance(s, str):
         return None
 
-    choices_list = [
-        tuple(map(lower_string, x.split(delimiter_map)[:2])) for x in s.split(delimiter)
-    ]
+    split_str = re.split(r",(?!(?:[^\[]*\])|(?:[^\[]*\[[^\]]*$))", s)
+    choices_list = [tuple(x.strip().split("=")) for x in split_str]
     if any(len(c) != 2 for c in choices_list):
         raise ValueError(f"Invalid choices list {choices_list!r}")
-    choices = dict(
-        tuple(map(lower_string, x.split(delimiter_map)[:2])) for x in s.split(delimiter)
-    )
+    choices = dict(choices_list)
 
     for k, v in choices.copy().items():
-        if v == "true":
+        if v.lower() == "true":
             choices[k] = True
-        if v == "false":
+        if v.lower() == "false":
             choices[k] = False
-        if v == "none":
+        if v.lower() == "none":
             if k == "":
                 choices.pop(k)
             else:
                 choices[k] = ""
         if v == "" and k == "":
             choices.pop(k)
+        if "[" and "]" in v:
+            choices[k] = [i for i in v.strip("[]").replace(" ", "").split(",")]
     return choices
 
 

diff --git a/tests/test_autoparser/test_utils.py b/tests/test_autoparser/test_utils.py
@@ -53,21 +53,26 @@ def test_read_config_schema():
         ("vivant=alive, décédé=dead, " "=None", {"vivant": "alive", "décédé": "dead"}),
         ({2: True}, None),
         ("" " = " ", poisson=fish", {"poisson": "fish"}),
+        (
+            "" "=None, ecouvillon+croûte=[swab, crust], ecouvillon=[swab]",
+            {"ecouvillon+croûte": ["swab", "crust"], "ecouvillon": ["swab"]},
+        ),
+        ("pos=Y, neg=N", {"pos": "Y", "neg": "N"}),
     ],
 )
 def test_parse_choices(s, expected):
-    choices = parse_choices(CONFIG, s)
+    choices = parse_choices(s)
     assert choices == expected
 
 
 def test_parse_choices_error():
     # dictionary printed without stringification
     with pytest.raises(ValueError, match="Invalid choices list"):
-        parse_choices(CONFIG, '{"oui":"True", "non":"False", "blah":"None"}')
+        parse_choices('{"oui":"True", "non":"False", "blah":"None"}')
 
     # different choice_delimiter_map (":" instead of "=")
     with pytest.raises(ValueError, match="Invalid choices list"):
-        parse_choices(CONFIG, "oui:True, non:False, blah:None")
+        parse_choices("oui:True, non:False, blah:None")
 
 
 def test_load_data_dict():