Add safety check for showing common values (#124)

* Adds a frequency limit, and a check for how large the max_count is compared to the dataset size
globaldothealth · Jan 31, 2025 · 4d96229 · 4d96229
1 parent d28d65f
commit 4d96229
Show file tree

Hide file tree

Showing 11 changed files with 114 additions and 31 deletions.
diff --git a/src/adtl/autoparser/config/autoparser.toml b/src/adtl/autoparser/config/autoparser.toml
@@ -16,11 +16,18 @@ choice_delimiter_map = "="
 # max number of references to use in the parser file
 num_refs = 3
 
-# number of unique values below which a column is considered having 'common values'
-# which might need to be mapped in the parser. e.g. a column with only
+# maximum number of unique values a column can contain for them to be considered 'common'
+# and which might need to be mapped in the parser. e.g. a column with only
 # 'oui, non, inconnu' as unique values would be considered to have common values,
 # while a column with 50 unique values (perhaps because they are dates, or IDs) would not.
-num_choices = 25
+# Maximum is 30% of the dataset length (relevant for small datasets to limit data leakage).
+max_common_count = 25
+
+# Optional:
+# Fraction of rows (0-1) that a value's count must exceed to be considered common.
+# max_common_count is then applied to the filtered list of 'common' values.
+# Forced to 5% (overriding any value set here) if max_common_count is > 30% of the dataset.
+# min_common_freq = 0.002
 
 # Path to the target schemas, one per table
 [schemas]

diff --git a/src/adtl/autoparser/config/redcap-en.toml b/src/adtl/autoparser/config/redcap-en.toml
@@ -14,11 +14,18 @@ choice_delimiter_map = ","
 # max number of references to use in the parser file
 num_refs = 3
 
-# number of unique values below which a column is considered having 'common values'
-# which might need to be mapped in the parser. e.g. a column with only
+# maximum number of unique values a column can contain for them to be considered 'common'
+# and which might need to be mapped in the parser. e.g. a column with only
 # 'oui, non, inconnu' as unique values would be considered to have common values,
 # while a column with 50 unique values (perhaps because they are dates, or IDs) would not.
-num_choices = 25
+# Maximum is 30% of the dataset length (relevant for small datasets to limit data leakage).
+max_common_count = 25
+
+# Optional:
+# Fraction of rows (0-1) that a value's count must exceed to be considered common.
+# max_common_count is then applied to the filtered list of 'common' values.
+# Forced to 5% (overriding any value set here) if max_common_count is > 30% of the dataset.
+# min_common_freq = 0.002
 
 # Path to the target schemas, one per table
 [schemas]

diff --git a/src/adtl/autoparser/dict_writer.py b/src/adtl/autoparser/dict_writer.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import argparse
+import warnings
 from pathlib import Path
 from typing import Any
 
@@ -50,6 +51,11 @@ def __init__(
             config or Path(Path(__file__).parent, DEFAULT_CONFIG)
         )
 
+        try:
+            self.config["max_common_count"]
+        except KeyError:
+            raise ValueError("'max_common_count' not found in config file.")
+
         if llm and api_key:
             self.model = setup_llm(llm, api_key)
         else:
@@ -102,17 +108,39 @@ def create_dict(self, data: pd.DataFrame | str) -> pd.DataFrame:
         types = [str(t) for t in df.dtypes]
         value_opts = {}
 
+        # Get common value thresholds
+        max_common_count = self.config["max_common_count"]
+        min_common_freq = self.config.get("min_common_freq")
+
+        # check the max count isn't greater than 30% of the dataset
+        calced_max_common_count = min(max_common_count, len(df) * 0.3)
+        if calced_max_common_count < max_common_count:
+            warnings.warn(
+                f"Small Dataset: max_common_count of '{max_common_count}' is"
+                f" too high for a dataset with {len(df)} rows.\n"
+                f"Reducing to {calced_max_common_count} to avoid data "
+                "identification issues.\n"
+                "Setting the minimum frequency to 5% of the dataset."
+            )
+            max_common_count = calced_max_common_count
+            min_common_freq = 0.05
+
         for i in df.columns:
             values = df[i].value_counts()
-            if len(values) <= self.config["num_choices"]:
-                try:
-                    value_opts[i] = f"{self.config['choice_delimiter']} ".join(
-                        list(values.index.values)
-                    )
-                except TypeError:
-                    value_opts[i] = np.nan
-            else:
-                value_opts[i] = np.nan
+            if min_common_freq:
+                values = values[values > max(1, len(df) * min_common_freq)]
+            value_opts[i] = np.nan
+            if not values.empty and len(values) <= max_common_count:
+                # drop any values with a frequency of 1
+                values = values[values > 1]
+                if not values.empty:
+                    try:
+                        value_opts[i] = f"{self.config['choice_delimiter']} ".join(
+                            list(values.index.values)
+                        )
+                    except TypeError:
+                        # This stops float values being given as 'common values'.
+                        continue
 
         dd = pd.DataFrame(
             {

diff --git a/src/adtl/autoparser/util.py b/src/adtl/autoparser/util.py
@@ -80,7 +80,9 @@ def parse_choices(s: str) -> Dict[str, Any] | None:
     split_str = re.split(r",(?!(?:[^\[]*\])|(?:[^\[]*\[[^\]]*$))", s)
     choices_list = [tuple(x.strip().split("=")) for x in split_str]
     if any(len(c) != 2 for c in choices_list):
-        raise ValueError(f"Invalid choices list {choices_list!r}")
+        raise ValueError(
+            f"autoparser: Invalid choices list for value mapping {choices_list!r}"
+        )
     choices = dict(choices_list)
 
     for k, v in choices.copy().items():

diff --git a/tests/test_autoparser/sources/animals_dd.csv b/tests/test_autoparser/sources/animals_dd.csv
@@ -1,18 +1,18 @@
 Field Name,Description,Field Type,Common Values
 Identité,,string,
-Province,,choice,"Equateur, Orientale, Katanga, Kinshasa"
+Province,,choice,"Equateur, Orientale, Katanga"
 DateNotification,,string,
-Classicfication ,,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT, OISEAU"
+Classicfication ,,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT"
 Nom complet ,,string,
 Date de naissance,,string,
 AgeAns,,number,
 AgeMois         ,,number,
-Sexe,,choice,"F, M,   f, m, f, m     , inconnu"
+Sexe,,choice,"F, M,   f, m, f, m     "
 StatusCas,,choice,"Vivant, Décédé"
 DateDec,,string,
 ContSoins ,,choice,"Oui, Non"
 ContHumain Autre,,choice,"Non, Oui"
-AutreContHumain,,choice,"Non, Voyage, Autres, Voyage , Oui"
+AutreContHumain,,choice,"Non, Voyage, Autres, Voyage "
 ContactAnimal,,choice,"Oui, Non"
 Micropucé,,choice,"Oui, NON, OUI, oui"
 AnimalDeCompagnie,,choice,"Oui, Non, non"
diff --git a/tests/test_autoparser/sources/animals_dd_described.csv b/tests/test_autoparser/sources/animals_dd_described.csv
@@ -1,18 +1,18 @@
 Field Name,Description,Field Type,Common Values
 Identité,Identity,string,
-Province,Province,choice,"Equateur, Orientale, Katanga, Kinshasa"
+Province,Province,choice,"Equateur, Orientale, Katanga"
 DateNotification,Notification Date,string,
-Classicfication ,Classification,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT, OISEAU"
+Classicfication ,Classification,choice,"FISH, amphibie, oiseau, Mammifère, poisson, REPT"
 Nom complet ,Full Name,string,
 Date de naissance,Date of Birth,string,
 AgeAns,Age in Years,number,
 AgeMois         ,Age in Months,number,
-Sexe,Gender,choice,"F, M,   f, m, f, m     , inconnu"
+Sexe,Gender,choice,"F, M,   f, m, f, m     "
 StatusCas,Case Status,choice,"Vivant, Décédé"
 DateDec,Date of Death,string,
 ContSoins ,Care Contact,choice,"Oui, Non"
 ContHumain Autre,Other Human Contact,choice,"Non, Oui"
-AutreContHumain,Other Human Contact,choice,"Non, Voyage, Autres, Voyage , Oui"
+AutreContHumain,Other Human Contact,choice,"Non, Voyage, Autres, Voyage "
 ContactAnimal,Animal Contact,choice,"Oui, Non"
 Micropucé,Microchipped,choice,"Oui, NON, OUI, oui"
 AnimalDeCompagnie,Pet Animal,choice,"Oui, Non, non"
diff --git a/tests/test_autoparser/sources/config_missing_common_count.toml b/tests/test_autoparser/sources/config_missing_common_count.toml
@@ -0,0 +1,26 @@
+
+name = "config file for testing"
+description = "Autoparser config for generating CSV mappings and TOML"
+
+# Used by parse_choices() to generate values mapping
+# Using the delimiters below, we can parse this string:
+#   oui=True, non=False
+# to this TOML:
+#  { "oui" = True, "non" = False}
+
+choice_delimiter = ","
+choice_delimiter_map = "="
+
+# max number of references to use in the parser file
+num_refs = 3
+
+# Path to the target schemas, one per table
+[schemas]
+  animals = "animals.schema.json"
+
+# Column mappings to standardise column names across data dictionaries
+[column_mappings]
+  source_field = "Field Name"
+  source_type = "Field Type"
+  source_description = "Description"
+  common_values = "Common Values"
diff --git a/tests/test_autoparser/test_config.toml b/tests/test_autoparser/test_config.toml
@@ -14,11 +14,18 @@ choice_delimiter_map = "="
 # max number of references to use in the parser file
 num_refs = 3
 
-# number of unique values below which a column is considered having 'common values'
-# which might need to be mapped in the parser. e.g. a column with only
+# maximum number of unique values a column can contain for them to be considered 'common'
+# and which might need to be mapped in the parser. e.g. a column with only
 # 'oui, non, inconnu' as unique values would be considered to have common values,
 # while a column with 50 unique values (perhaps because they are dates, or IDs) would not.
-num_choices = 10
+# Maximum is 30% of the dataset length (relevant for small datasets to limit data leakage).
+max_common_count = 8
+
+# Optional:
+# Fraction of rows (0-1) that a value's count must exceed to be considered common.
+# max_common_count is then applied to the filtered list of 'common' values.
+# Forced to 5% (overriding any value set here) if max_common_count is > 30% of the dataset.
+# min_common_freq = 0.002
 
 # Path to the target schemas, one per table
 [schemas]

diff --git a/tests/test_autoparser/test_dict_writer.py b/tests/test_autoparser/test_dict_writer.py
@@ -30,6 +30,11 @@ def test_data_not_df_or_path():
         writer.create_dict(None)
 
 
+def test_error_config_missing_max_common_count():
+    with pytest.raises(ValueError, match="'max_common_count' not found in config file"):
+        DictWriter(config=SOURCES + "config_missing_common_count.toml")
+
+
 def test_dictionary_creation_no_descrip():
     writer = DictWriter(config=CONFIG_PATH)
 
@@ -48,6 +53,7 @@ def test_create_dict_no_descrip():
     pd.testing.assert_frame_equal(df, df_desired)
 
 
+@pytest.mark.filterwarnings("ignore:Small Dataset")
 def test_dictionary_creation_no_descrip_excel_dataframe():
     writer = DictWriter(config=CONFIG_PATH)
 

diff --git a/tests/test_autoparser/test_mapper.py b/tests/test_autoparser/test_mapper.py
@@ -128,19 +128,19 @@ def test_common_values():
     common_vals = pd.Series(
         data=[
             None,
-            {"orientale", "kinshasa", "katanga", "equateur"},
+            {"orientale", "katanga", "equateur"},
             None,
             {"poisson", "fish", "rept", "oiseau", "mammifère", "amphibie"},
             None,
             None,
             None,
             None,
-            {"m", "f", "inconnu"},
+            {"m", "f"},
             {"vivant", "décédé"},
             None,
             {"non", "oui"},
             {"non", "oui"},
-            {"non", "oui", "autres", "voyage"},
+            {"non", "autres", "voyage"},
             {"non", "oui"},
             {"non", "oui"},
             {"non", "oui"},

diff --git a/tests/test_autoparser/test_utils.py b/tests/test_autoparser/test_utils.py
@@ -27,7 +27,7 @@ def test_read_config_schema():
             "choice_delimiter",
             "choice_delimiter_map",
             "num_refs",
-            "num_choices",
+            "max_common_count",
             "schemas",
             "column_mappings",
         ],