Skip to content

Commit

Permalink
Add safety check for showing common values (#124)
Browse files Browse the repository at this point in the history
* Adds a frequency limit, and a check for how large the max_count is compared to the dataset size
  • Loading branch information
pipliggins authored Jan 31, 2025
1 parent d28d65f commit 4d96229
Show file tree
Hide file tree
Showing 11 changed files with 114 additions and 31 deletions.
13 changes: 10 additions & 3 deletions src/adtl/autoparser/config/autoparser.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,18 @@ choice_delimiter_map = "="
# max number of references to use in the parser file
num_refs = 3

# number of unique values below which a column is considered having 'common values'
# which might need to be mapped in the parser. e.g. a column with only
# maximum number of unique values a column can contain for them to be considered 'common'
# and which might need to be mapped in the parser. e.g. a column with only
# 'oui, non, inconnu' as unique values would be considered to have common values,
# while a column with 50 unique values (perhaps because they are dates, or IDs) would not.
num_choices = 25
# The value is capped at 30% of the number of rows in the dataset (relevant for small
# datasets, to limit data leakage).
max_common_count = 25

# Optional:
# Minimum frequency, as a fraction of the dataset's rows, that a value must exceed to
# be considered common. max_common_count is then applied to the filtered list of
# 'common' values.
# Set to 0.05 (5%) whenever max_common_count is > 30% of the dataset's row count.
# min_common_freq = 0.002

# Path to the target schemas, one per table
[schemas]
Expand Down
13 changes: 10 additions & 3 deletions src/adtl/autoparser/config/redcap-en.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,18 @@ choice_delimiter_map = ","
# max number of references to use in the parser file
num_refs = 3

# number of unique values below which a column is considered having 'common values'
# which might need to be mapped in the parser. e.g. a column with only
# maximum number of unique values a column can contain for them to be considered 'common'
# and which might need to be mapped in the parser. e.g. a column with only
# 'oui, non, inconnu' as unique values would be considered to have common values,
# while a column with 50 unique values (perhaps because they are dates, or IDs) would not.
num_choices = 25
# The value is capped at 30% of the number of rows in the dataset (relevant for small
# datasets, to limit data leakage).
max_common_count = 25

# Optional:
# Minimum frequency, as a fraction of the dataset's rows, that a value must exceed to
# be considered common. max_common_count is then applied to the filtered list of
# 'common' values.
# Set to 0.05 (5%) whenever max_common_count is > 30% of the dataset's row count.
# min_common_freq = 0.002

# Path to the target schemas, one per table
[schemas]
Expand Down
46 changes: 37 additions & 9 deletions src/adtl/autoparser/dict_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import annotations

import argparse
import warnings
from pathlib import Path
from typing import Any

Expand Down Expand Up @@ -50,6 +51,11 @@ def __init__(
config or Path(Path(__file__).parent, DEFAULT_CONFIG)
)

try:
self.config["max_common_count"]
except KeyError:
raise ValueError("'max_common_count' not found in config file.")

if llm and api_key:
self.model = setup_llm(llm, api_key)
else:
Expand Down Expand Up @@ -102,17 +108,39 @@ def create_dict(self, data: pd.DataFrame | str) -> pd.DataFrame:
types = [str(t) for t in df.dtypes]
value_opts = {}

# Get common value thresholds
max_common_count = self.config["max_common_count"]
min_common_freq = self.config.get("min_common_freq")

# check the max count isn't > than 30% of the dataset
calced_max_common_count = min(max_common_count, len(df) * 0.3)
if calced_max_common_count < max_common_count:
warnings.warn(
f"Small Dataset: max_common_count of '{max_common_count}' is"
f" too high for a dataset with {len(df)} rows.\n"
f"Reducing to {calced_max_common_count} to avoid data "
"identification issues.\n"
"Setting the minimum frequency to 5% of the dataset."
)
max_common_count = calced_max_common_count
min_common_freq = 0.05

for i in df.columns:
values = df[i].value_counts()
if len(values) <= self.config["num_choices"]:
try:
value_opts[i] = f"{self.config['choice_delimiter']} ".join(
list(values.index.values)
)
except TypeError:
value_opts[i] = np.nan
else:
value_opts[i] = np.nan
if min_common_freq:
values = values[values > max(1, len(df) * min_common_freq)]
value_opts[i] = np.nan
if not values.empty and len(values) <= max_common_count:
# drop any values with a frequency of 1
values = values[values > 1]
if not values.empty:
try:
value_opts[i] = f"{self.config['choice_delimiter']} ".join(
list(values.index.values)
)
except TypeError:
# This stops float values being given as 'common values'.
continue

dd = pd.DataFrame(
{
Expand Down
4 changes: 3 additions & 1 deletion src/adtl/autoparser/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ def parse_choices(s: str) -> Dict[str, Any] | None:
split_str = re.split(r",(?!(?:[^\[]*\])|(?:[^\[]*\[[^\]]*$))", s)
choices_list = [tuple(x.strip().split("=")) for x in split_str]
if any(len(c) != 2 for c in choices_list):
raise ValueError(f"Invalid choices list {choices_list!r}")
raise ValueError(
f"autoparser: Invalid choices list for value mapping {choices_list!r}"
)
choices = dict(choices_list)

for k, v in choices.copy().items():
Expand Down
8 changes: 4 additions & 4 deletions tests/test_autoparser/sources/animals_dd.csv
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
Field Name,Description,Field Type,Common Values
Identité,,string,
Province,,choice,"Equateur, Orientale, Katanga, Kinshasa"
Province,,choice,"Equateur, Orientale, Katanga"
DateNotification,,string,
Classicfication ,,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT, OISEAU"
Classicfication ,,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT"
Nom complet ,,string,
Date de naissance,,string,
AgeAns,,number,
AgeMois ,,number,
Sexe,,choice,"F, M, f, m, f, m , inconnu"
Sexe,,choice,"F, M, f, m, f, m "
StatusCas,,choice,"Vivant, Décédé"
DateDec,,string,
ContSoins ,,choice,"Oui, Non"
ContHumain Autre,,choice,"Non, Oui"
AutreContHumain,,choice,"Non, Voyage, Autres, Voyage , Oui"
AutreContHumain,,choice,"Non, Voyage, Autres, Voyage "
ContactAnimal,,choice,"Oui, Non"
Micropucé,,choice,"Oui, NON, OUI, oui"
AnimalDeCompagnie,,choice,"Oui, Non, non"
8 changes: 4 additions & 4 deletions tests/test_autoparser/sources/animals_dd_described.csv
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
Field Name,Description,Field Type,Common Values
Identité,Identity,string,
Province,Province,choice,"Equateur, Orientale, Katanga, Kinshasa"
Province,Province,choice,"Equateur, Orientale, Katanga"
DateNotification,Notification Date,string,
Classicfication ,Classification,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT, OISEAU"
Classicfication ,Classification,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT"
Nom complet ,Full Name,string,
Date de naissance,Date of Birth,string,
AgeAns,Age in Years,number,
AgeMois ,Age in Months,number,
Sexe,Gender,choice,"F, M, f, m, f, m , inconnu"
Sexe,Gender,choice,"F, M, f, m, f, m "
StatusCas,Case Status,choice,"Vivant, Décédé"
DateDec,Date of Death,string,
ContSoins ,Care Contact,choice,"Oui, Non"
ContHumain Autre,Other Human Contact,choice,"Non, Oui"
AutreContHumain,Other Human Contact,choice,"Non, Voyage, Autres, Voyage , Oui"
AutreContHumain,Other Human Contact,choice,"Non, Voyage, Autres, Voyage "
ContactAnimal,Animal Contact,choice,"Oui, Non"
Micropucé,Microchipped,choice,"Oui, NON, OUI, oui"
AnimalDeCompagnie,Pet Animal,choice,"Oui, Non, non"
26 changes: 26 additions & 0 deletions tests/test_autoparser/sources/config_missing_common_count.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

name = "config file for testing"
description = "Autoparser config for generating CSV mappings and TOML"

# Used by parse_choices() to generate values mapping
# Using the delimiters below, we can parse this string:
# oui=True, non=False
# to this TOML:
# { "oui" = True, "non" = False}

choice_delimiter = ","
choice_delimiter_map = "="

# max number of references to use in the parser file
num_refs = 3

# Path to the target schemas, one per table
[schemas]
animals = "animals.schema.json"

# Column mappings to standardise column names across data dictionaries
[column_mappings]
source_field = "Field Name"
source_type = "Field Type"
source_description = "Description"
common_values = "Common Values"
13 changes: 10 additions & 3 deletions tests/test_autoparser/test_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,18 @@ choice_delimiter_map = "="
# max number of references to use in the parser file
num_refs = 3

# number of unique values below which a column is considered having 'common values'
# which might need to be mapped in the parser. e.g. a column with only
# maximum number of unique values a column can contain for them to be considered 'common'
# and which might need to be mapped in the parser. e.g. a column with only
# 'oui, non, inconnu' as unique values would be considered to have common values,
# while a column with 50 unique values (perhaps because they are dates, or IDs) would not.
num_choices = 10
# The value is capped at 30% of the number of rows in the dataset (relevant for small
# datasets, to limit data leakage).
max_common_count = 8

# Optional:
# Minimum frequency, as a fraction of the dataset's rows, that a value must exceed to
# be considered common. max_common_count is then applied to the filtered list of
# 'common' values.
# Set to 0.05 (5%) whenever max_common_count is > 30% of the dataset's row count.
# min_common_freq = 0.002

# Path to the target schemas, one per table
[schemas]
Expand Down
6 changes: 6 additions & 0 deletions tests/test_autoparser/test_dict_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ def test_data_not_df_or_path():
writer.create_dict(None)


def test_error_config_missing_max_common_count():
    """Constructing a DictWriter from a config lacking 'max_common_count' must fail.

    The fixture config omits the 'max_common_count' key; the constructor is
    expected to raise a ValueError whose message names the missing key.
    """
    config_path = SOURCES + "config_missing_common_count.toml"
    expected_message = "'max_common_count' not found in config file"

    with pytest.raises(ValueError, match=expected_message):
        DictWriter(config=config_path)


def test_dictionary_creation_no_descrip():
writer = DictWriter(config=CONFIG_PATH)

Expand All @@ -48,6 +53,7 @@ def test_create_dict_no_descrip():
pd.testing.assert_frame_equal(df, df_desired)


@pytest.mark.filterwarnings("ignore:Small Dataset")
def test_dictionary_creation_no_descrip_excel_dataframe():
writer = DictWriter(config=CONFIG_PATH)

Expand Down
6 changes: 3 additions & 3 deletions tests/test_autoparser/test_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,19 @@ def test_common_values():
common_vals = pd.Series(
data=[
None,
{"orientale", "kinshasa", "katanga", "equateur"},
{"orientale", "katanga", "equateur"},
None,
{"poisson", "fish", "rept", "oiseau", "mammifère", "amphibie"},
None,
None,
None,
None,
{"m", "f", "inconnu"},
{"m", "f"},
{"vivant", "décédé"},
None,
{"non", "oui"},
{"non", "oui"},
{"non", "oui", "autres", "voyage"},
{"non", "autres", "voyage"},
{"non", "oui"},
{"non", "oui"},
{"non", "oui"},
Expand Down
2 changes: 1 addition & 1 deletion tests/test_autoparser/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_read_config_schema():
"choice_delimiter",
"choice_delimiter_map",
"num_refs",
"num_choices",
"max_common_count",
"schemas",
"column_mappings",
],
Expand Down

0 comments on commit 4d96229

Please sign in to comment.