Sage-Bionetworks · lakikowolfe · May 10, 2024 · May 2, 2024 · May 2, 2024 · May 2, 2024
@@ -23,6 +23,7 @@
     np_array_to_str_list,
     iterable_to_str_list,
     rule_in_rule_list,
+    get_list_robustness,
 )
 
 from synapseclient.core.exceptions import SynapseNoCredentialsError
@@ -58,6 +59,7 @@ def generate_schema_error(
             error_col=attribute_name,
             error_message=error_message,
             error_val=invalid_entry,
+            message_level="error",
         )
 
         return error_list, warning_list
@@ -475,7 +477,12 @@ def _get_rule_attributes(
 
         is_schema_error = rule_name == "schema"
         col_is_recommended = rule_name == "recommended"
-        col_is_required = dmge.get_node_required(node_display_name=error_col_name)
+
+        if not is_schema_error:
+            col_is_required = dmge.get_node_required(node_display_name=error_col_name)
+        else:
+            col_is_required = False
+
         return (
             rule_parts,
             rule_name,
@@ -823,16 +830,17 @@ def list_validation(
         # white spaces removed.
         errors = []
         warnings = []
+        replace_null = True
 
         csv_re = comma_separated_list_regex()
 
-        rule_parts = val_rule.lower().split(" ")
-        if len(rule_parts) > 1:
-            list_robustness = rule_parts[1]
-        else:
-            list_robustness = "strict"
+        # Check if lists -must- be a list, or can be a single value.
+        list_robustness = get_list_robustness(val_rule=val_rule)
+
+        if list_robustness == "like":
+            replace_null = False
 
-        if list_robustness == "strict":
+        elif list_robustness == "strict":
             manifest_col = manifest_col.astype(str)
 
             # This will capture any if an entry is not formatted properly. Only for strict lists
@@ -864,7 +872,7 @@ def list_validation(
                         warnings.append(vr_warnings)
 
         # Convert string to list.
-        manifest_col = parse_str_series_to_list(manifest_col)
+        manifest_col = parse_str_series_to_list(manifest_col, replace_null=replace_null)
 
         return errors, warnings, manifest_col
 

@@ -25,7 +25,10 @@
 from schematic.store.synapse import SynapseStorage
 from schematic.models.GE_Helpers import GreatExpectationsHelpers
 from schematic.utils.validate_rules_utils import validation_rule_info
-from schematic.utils.validate_utils import rule_in_rule_list
+from schematic.utils.validate_utils import (
+    rule_in_rule_list,
+    convert_nan_entries_to_empty_strings,
+)
 from schematic.utils.schema_utils import extract_component_validation_rules
 
 logger = logging.getLogger(__name__)
@@ -103,9 +106,9 @@ def validate_manifest_rules(
         manifest: pd.core.frame.DataFrame,
         dmge: DataModelGraphExplorer,
         restrict_rules: bool,
-        project_scope: List,
+        project_scope: list[str],
         access_token: Optional[str] = None,
-    ) -> (pd.core.frame.DataFrame, List[List[str]]):
+    ) -> (pd.core.frame.DataFrame, list[list[str]]):
         """
         Purpose:
             Take validation rules set for a particular attribute
@@ -295,8 +298,7 @@ def validate_manifest_values(
         warnings = []
         col_attr = {}  # save the mapping between column index and attribute name
 
-        # Replace nans with empty strings so jsonschema
-        manifest = manifest.replace({np.nan: ""})
+        manifest = convert_nan_entries_to_empty_strings(manifest=manifest)
 
         # numerical values need to be type string for the jsonValidator
         for col in manifest.select_dtypes(
@@ -347,15 +349,18 @@ def validate_all(
     project_scope: List,
     access_token: str,
 ):
+    # Run Validation Rules
     vm = ValidateManifest(errors, manifest, manifestPath, dmge, jsonSchema)
     manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules(
         manifest, dmge, restrict_rules, project_scope, access_token
     )
+
     if vmr_errors:
         errors.extend(vmr_errors)
     if vmr_warnings:
         warnings.extend(vmr_warnings)
 
+    # Run JSON Schema Validation
     vmv_errors, vmv_warnings = vm.validate_manifest_values(manifest, jsonSchema, dmge)
     if vmv_errors:
         errors.extend(vmv_errors)

@@ -54,6 +54,33 @@ def comma_separated_list_regex() -> Pattern[str]:
     return csv_list_regex
 
 
+def convert_nan_entries_to_empty_strings(
+    manifest: pd.core.frame.DataFrame,
+) -> pd.core.frame.DataFrame:
+    """
+    Nans need to be converted to empty strings for JSON Schema Validation. This helper
+    converts an a list with a single '<NA>' string or a single np.nan to empty strings.
+    These types of expected NANs come from different stages of conversion during import
+    and validation.
+
+    Args:
+        manifest: pd.core.frame.DataFrame, manifest prior to removing nans and
+            replacing with empty strings.
+    Returns:
+        manifest: pd.core.frame.DataFrame, manifest post removing nans and
+            replacing with empty strings.
+    """
+    # Replace nans with empty strings so jsonschema, address replace type infering depreciation.
+    with pd.option_context("future.no_silent_downcasting", True):
+        manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False)  # type: ignore
+
+    for col in manifest.columns:
+        for index, value in manifest[col].items():
+            if value == ["<NA>"]:
+                manifest.loc[index, col] = [""]  # type: ignore
+    return manifest
+
+
 def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]]:
     """
     Function to standardize
@@ -70,18 +97,62 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]
     return re.search(rule_type, rule_list_str, flags=re.IGNORECASE)
 
 
-def parse_str_series_to_list(col: pd.Series) -> pd.Series:
+def get_list_robustness(val_rule: str) -> str:
+    """Helper function to extract list robustness from the validation rule.
+    List robustness defines if the input -must- be a list (several values
+    or a single value with a trailing comma),
+    or if a user is allowed to submit a single value.
+    List rules default to `strict` if not defined to be `like`
+    Args:
+        val_rule: str, validation rule string.
+    Returns:
+        list_robutness: str, list robustness extracted from validation rule.
+    """
+    list_robustness_options = ["like", "strict"]
+    list_robustness = None
+    default_robustness = list_robustness_options[1]
+
+    # Get the parts of a single rule, list is assumed to be in the first position, based on
+    # requirements that can be found in documentation.
+    rule_parts = val_rule.lower().split(" ")
+
+    if len(rule_parts) > 1:
+        # Check if list_robustness is defined in the rule, if not give them the default.
+        list_robustness_list = [
+            part for part in rule_parts if part in list_robustness_options
+        ]
+        if list_robustness_list:
+            list_robustness = list_robustness_list[0]
+
+    if not list_robustness:
+        # If no robustness has been defined by the user, set to the default.
+        list_robustness = default_robustness
+    return list_robustness
+
+
+def parse_str_series_to_list(col: pd.Series, replace_null: bool = True) -> pd.Series:
     """
     Parse a pandas series of comma delimited strings
-    into a series with values that are lists of strings
+    into a series with values that are lists of strings. If replace_null, fill null values
+    with nan. If the type of the value needs to be an array, fill with empty list.
     ex.
         Input:  'a,b,c'
         Output: ['a','b','c']
 
     """
-    col = col.apply(
-        lambda x: [s.strip() for s in str(x).split(",")] if not pd.isnull(x) else pd.NA
-    )
+    if replace_null:
+        col = col.apply(
+            lambda x: [s.strip() for s in str(x).split(",")]
+            if not pd.isnull(x)
+            else pd.NA
+        )
+    else:
+        col = col.apply(
+            lambda x: [s.strip() for s in str(x).split(",")]
+            if (isinstance(x, np.ndarray) and not x.any()) or not pd.isnull(x)
+            else []
+        )
+
     return col
 
 

@@ -19,9 +19,16 @@ CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
 CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
 Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
 Genome FASTA,,,,,TRUE,DataProperty,,,
-MockComponent,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
-Check List,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict 
-Check Regex List,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
+MockComponent,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
+Check List,,,,,TRUE,DataProperty,,,list
+Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list
+Check List Like,,,,,TRUE,DataProperty,,,list like
+Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like
+Check List Strict,,,,,TRUE,DataProperty,,,list strict 
+Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict 
+Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f]
+Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
+Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f]
 Check Regex Single,,,,,TRUE,DataProperty,,,regex search [a-f]
 Check Regex Format,,,,,TRUE,DataProperty,,,regex match [a-f]
 Check Regex Integer,,,,,TRUE,DataProperty,,,regex search ^\d+$