Skip to content

Commit

Permalink
Misc edits from generating paper data (#113)
Browse files Browse the repository at this point in the history
  • Loading branch information
pipliggins authored Dec 10, 2024
1 parent 96c49a1 commit 9854464
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 24 deletions.
2 changes: 2 additions & 0 deletions src/adtl/autoparser/config/autoparser.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
name = "autoparser"
description = "Autoparser config for generating CSV mappings and TOML from autoparser-generated dictionaries"

# NOTE: currently unused as only autoparser-generated data dicts are supported ---
# Used by parse_choices() to generate values mapping
# Using the delimiters below, we can parse this string:
# oui=True, non=False
Expand All @@ -10,6 +11,7 @@ description = "Autoparser config for generating CSV mappings and TOML from autop

choice_delimiter = ","
choice_delimiter_map = "="
# -------------------------------------------------------------------------------

# max number of references to use in the parser file
num_refs = 3
Expand Down
2 changes: 1 addition & 1 deletion src/adtl/autoparser/language_models/data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class SingleField(BaseModel):
field_name: str
translation: str | None
translation: str


class ColumnDescriptionRequest(BaseModel):
Expand Down
10 changes: 3 additions & 7 deletions src/adtl/autoparser/make_toml.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,11 @@ def __init__(

@property
def parsed_choices(self) -> pd.Series:
"""Returns the mapped values for each taget field"""
"""Returns the mapped values for each target field"""
try:
return self._parsed_choices
except AttributeError:

def _parse_choices(s: str):
return parse_choices(self.config, s)

self._parsed_choices = self.mappings.value_mapping.map(_parse_choices)
self._parsed_choices = self.mappings.value_mapping.map(parse_choices)
self._parsed_choices.index = self.mappings.target_field
return self._parsed_choices

Expand Down Expand Up @@ -307,7 +303,7 @@ def main():
)
args = parser.parse_args()

schema_path = Path(args.schema)
schema_path = Path(args.schema_path)

ParserGenerator(
args.mappings,
Expand Down
34 changes: 21 additions & 13 deletions src/adtl/autoparser/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any, Dict

Expand Down Expand Up @@ -59,35 +60,42 @@ def read_data(file: str | Path | pd.DataFrame, file_type: str):
)


def parse_choices(config, s: str) -> Dict[str, Any]:
delimiter = config["choice_delimiter"]
delimiter_map = config["choice_delimiter_map"]
def parse_choices(s: str) -> Dict[str, Any] | None:
"""
Takes the choices from llm as a string and turns into pairs.
"oui=True, non=False, blah=None" -> {"oui": True, "non": False, "blah": ""}
"vivant=alive, décédé=dead, " "=None" -> {"vivant": "alive", "décédé": "dead"}
{2: True} -> None
"" " = " ", poisson=fish" -> {"poisson": "fish"}
ecouvillon+croûte=[swab, crust], ecouvillon=[swab]" ->
{"ecouvillon+croûte": ["swab", "crust"], "ecouvillon": ["swab"]}
"""

lower_string = lambda s: s.strip().lower() # NOQA
if not isinstance(s, str):
return None

choices_list = [
tuple(map(lower_string, x.split(delimiter_map)[:2])) for x in s.split(delimiter)
]
split_str = re.split(r",(?!(?:[^\[]*\])|(?:[^\[]*\[[^\]]*$))", s)
choices_list = [tuple(x.strip().split("=")) for x in split_str]
if any(len(c) != 2 for c in choices_list):
raise ValueError(f"Invalid choices list {choices_list!r}")
choices = dict(
tuple(map(lower_string, x.split(delimiter_map)[:2])) for x in s.split(delimiter)
)
choices = dict(choices_list)

for k, v in choices.copy().items():
if v == "true":
if v.lower() == "true":
choices[k] = True
if v == "false":
if v.lower() == "false":
choices[k] = False
if v == "none":
if v.lower() == "none":
if k == "":
choices.pop(k)
else:
choices[k] = ""
if v == "" and k == "":
choices.pop(k)
if "[" and "]" in v:
choices[k] = [i for i in v.strip("[]").replace(" ", "").split(",")]
return choices


Expand Down
11 changes: 8 additions & 3 deletions tests/test_autoparser/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,26 @@ def test_read_config_schema():
("vivant=alive, décédé=dead, " "=None", {"vivant": "alive", "décédé": "dead"}),
({2: True}, None),
("" " = " ", poisson=fish", {"poisson": "fish"}),
(
"" "=None, ecouvillon+croûte=[swab, crust], ecouvillon=[swab]",
{"ecouvillon+croûte": ["swab", "crust"], "ecouvillon": ["swab"]},
),
("pos=Y, neg=N", {"pos": "Y", "neg": "N"}),
],
)
def test_parse_choices(s, expected):
choices = parse_choices(CONFIG, s)
choices = parse_choices(s)
assert choices == expected


def test_parse_choices_error():
# dictionary printed without stringification
with pytest.raises(ValueError, match="Invalid choices list"):
parse_choices(CONFIG, '{"oui":"True", "non":"False", "blah":"None"}')
parse_choices('{"oui":"True", "non":"False", "blah":"None"}')

# different choice_delimiter_map
with pytest.raises(ValueError, match="Invalid choices list"):
parse_choices(CONFIG, "oui:True, non:False, blah:None")
parse_choices("oui:True, non:False, blah:None")


def test_load_data_dict():
Expand Down

0 comments on commit 9854464

Please sign in to comment.