Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v24.5.1 fix #1429

Merged
merged 24 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
70a0045
handle blank list like, add to testing st cases
mialy-defelice May 2, 2024
5aab710
update example.model.jsonld
mialy-defelice May 2, 2024
d387d04
fix key error for wrong schema
mialy-defelice May 2, 2024
c5c2d62
add test_missing_column test
mialy-defelice May 2, 2024
616e2d1
add docstring to get_list_robustness helper funct
mialy-defelice May 2, 2024
b92ea07
run poetry
mialy-defelice May 2, 2024
20b966c
add a helper function to convert nans
mialy-defelice May 3, 2024
7de939d
run black
mialy-defelice May 3, 2024
9435ab6
update typing, update docstrings
mialy-defelice May 3, 2024
77a4963
lint validate_utils
mialy-defelice May 3, 2024
e541e62
add type ignore for mypy error
mialy-defelice May 3, 2024
f189fa7
run black
mialy-defelice May 3, 2024
661d5f4
add type ignore to validate_utils
mialy-defelice May 3, 2024
d062df4
Fix test to work with newly updated data model
mialy-defelice May 3, 2024
ef38545
add extra line to invalid manifest csv
mialy-defelice May 3, 2024
41782de
add docstring about list robustness'
mialy-defelice May 6, 2024
fdc68a4
add docstrings and update typing
mialy-defelice May 6, 2024
5056de1
rewrite get_list_robutness to be clearer and add doc strings to parse…
mialy-defelice May 6, 2024
a898893
add test_convert_nan_entries_to_empty_strings and stubs for additiona…
mialy-defelice May 6, 2024
0864e49
Merge pull request #1420 from Sage-Bionetworks/develop-FDS-2015-js-va…
mialy-defelice May 8, 2024
7534d46
remove release md
linglp May 9, 2024
d380524
Merge pull request #1426 from Sage-Bionetworks/develop-remove-releasemd
linglp May 9, 2024
9df0794
fix failing test to conform to data model updates
mialy-defelice May 9, 2024
d3668c0
Merge pull request #1428 from Sage-Bionetworks/develop-FDS-2056
mialy-defelice May 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 0 additions & 51 deletions RELEASE.md

This file was deleted.

24 changes: 16 additions & 8 deletions schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
np_array_to_str_list,
iterable_to_str_list,
rule_in_rule_list,
get_list_robustness,
)

from synapseclient.core.exceptions import SynapseNoCredentialsError
Expand Down Expand Up @@ -58,6 +59,7 @@ def generate_schema_error(
error_col=attribute_name,
error_message=error_message,
error_val=invalid_entry,
message_level="error",
)

return error_list, warning_list
Expand Down Expand Up @@ -475,7 +477,12 @@ def _get_rule_attributes(

is_schema_error = rule_name == "schema"
col_is_recommended = rule_name == "recommended"
col_is_required = dmge.get_node_required(node_display_name=error_col_name)

if not is_schema_error:
col_is_required = dmge.get_node_required(node_display_name=error_col_name)
else:
col_is_required = False

return (
rule_parts,
rule_name,
Expand Down Expand Up @@ -823,16 +830,17 @@ def list_validation(
# white spaces removed.
errors = []
warnings = []
replace_null = True

csv_re = comma_separated_list_regex()

rule_parts = val_rule.lower().split(" ")
if len(rule_parts) > 1:
list_robustness = rule_parts[1]
else:
list_robustness = "strict"
# Check if lists -must- be a list, or can be a single value.
list_robustness = get_list_robustness(val_rule=val_rule)

if list_robustness == "like":
replace_null = False

if list_robustness == "strict":
elif list_robustness == "strict":
manifest_col = manifest_col.astype(str)

# This will capture any entry that is not formatted properly. Only for strict lists
Expand Down Expand Up @@ -864,7 +872,7 @@ def list_validation(
warnings.append(vr_warnings)

# Convert string to list.
manifest_col = parse_str_series_to_list(manifest_col)
manifest_col = parse_str_series_to_list(manifest_col, replace_null=replace_null)

return errors, warnings, manifest_col

Expand Down
15 changes: 10 additions & 5 deletions schematic/models/validate_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
from schematic.store.synapse import SynapseStorage
from schematic.models.GE_Helpers import GreatExpectationsHelpers
from schematic.utils.validate_rules_utils import validation_rule_info
from schematic.utils.validate_utils import rule_in_rule_list
from schematic.utils.validate_utils import (
rule_in_rule_list,
convert_nan_entries_to_empty_strings,
)
from schematic.utils.schema_utils import extract_component_validation_rules

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -103,9 +106,9 @@ def validate_manifest_rules(
manifest: pd.core.frame.DataFrame,
dmge: DataModelGraphExplorer,
restrict_rules: bool,
project_scope: List,
project_scope: list[str],
access_token: Optional[str] = None,
) -> (pd.core.frame.DataFrame, List[List[str]]):
) -> (pd.core.frame.DataFrame, list[list[str]]):
"""
Purpose:
Take validation rules set for a particular attribute
Expand Down Expand Up @@ -295,8 +298,7 @@ def validate_manifest_values(
warnings = []
col_attr = {} # save the mapping between column index and attribute name

# Replace nans with empty strings so jsonschema
manifest = manifest.replace({np.nan: ""})
manifest = convert_nan_entries_to_empty_strings(manifest=manifest)

# numerical values need to be type string for the jsonValidator
for col in manifest.select_dtypes(
Expand Down Expand Up @@ -347,15 +349,18 @@ def validate_all(
project_scope: List,
access_token: str,
):
# Run Validation Rules
vm = ValidateManifest(errors, manifest, manifestPath, dmge, jsonSchema)
manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules(
manifest, dmge, restrict_rules, project_scope, access_token
)

if vmr_errors:
errors.extend(vmr_errors)
if vmr_warnings:
warnings.extend(vmr_warnings)

# Run JSON Schema Validation
vmv_errors, vmv_warnings = vm.validate_manifest_values(manifest, jsonSchema, dmge)
if vmv_errors:
errors.extend(vmv_errors)
Expand Down
81 changes: 76 additions & 5 deletions schematic/utils/validate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,33 @@ def comma_separated_list_regex() -> Pattern[str]:
return csv_list_regex


def convert_nan_entries_to_empty_strings(
manifest: pd.core.frame.DataFrame,
) -> pd.core.frame.DataFrame:
"""
Nans need to be converted to empty strings for JSON Schema Validation. This helper
converts an a list with a single '<NA>' string or a single np.nan to empty strings.
These types of expected NANs come from different stages of conversion during import
and validation.

Args:
manifest: pd.core.frame.DataFrame, manifest prior to removing nans and
replacing with empty strings.
Returns:
manifest: pd.core.frame.DataFrame, manifest post removing nans and
replacing with empty strings.
"""
# Replace nans with empty strings so jsonschema, address replace type infering depreciation.
with pd.option_context("future.no_silent_downcasting", True):
manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) # type: ignore

for col in manifest.columns:
for index, value in manifest[col].items():
if value == ["<NA>"]:
manifest.loc[index, col] = [""] # type: ignore
return manifest


def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]]:
"""
Function to standardize
Expand All @@ -70,18 +97,62 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]
return re.search(rule_type, rule_list_str, flags=re.IGNORECASE)


def parse_str_series_to_list(col: pd.Series) -> pd.Series:
def get_list_robustness(val_rule: str) -> str:
    """Helper function to extract list robustness from the validation rule.
    List robustness defines if the input -must- be a list (several values
    or a single value with a trailing comma),
    or if a user is allowed to submit a single value.
    List rules default to `strict` if not defined to be `like`
    Args:
        val_rule: str, validation rule string.
    Returns:
        list_robustness: str, list robustness extracted from validation rule.
    """
    list_robustness_options = ("like", "strict")
    default_robustness = "strict"

    # Get the parts of a single rule, list is assumed to be in the first position, based on
    # requirements that can be found in documentation. Split on any run of whitespace
    # (not only a literal space) so that tab- or newline-separated rules are not
    # silently treated as having no robustness defined.
    rule_parts = re.split(r"\s+", val_rule.lower())

    # A rule consisting of a single part cannot carry a robustness qualifier.
    if len(rule_parts) <= 1:
        return default_robustness

    # Use the first robustness keyword found in the rule; if the user has not
    # defined one, fall back to the default.
    return next(
        (part for part in rule_parts if part in list_robustness_options),
        default_robustness,
    )


def parse_str_series_to_list(col: pd.Series, replace_null: bool = True) -> pd.Series:
"""
Parse a pandas series of comma delimited strings
into a series with values that are lists of strings
into a series with values that are lists of strings. If replace_null, fill null values
with nan. If the type of the value needs to be an array, fill with empty list.
ex.
Input: 'a,b,c'
Output: ['a','b','c']

"""
col = col.apply(
lambda x: [s.strip() for s in str(x).split(",")] if not pd.isnull(x) else pd.NA
)
if replace_null:
col = col.apply(
lambda x: [s.strip() for s in str(x).split(",")]
if not pd.isnull(x)
else pd.NA
)
else:
col = col.apply(
lambda x: [s.strip() for s in str(x).split(",")]
if (isinstance(x, np.ndarray) and not x.any()) or not pd.isnull(x)
else []
)

return col


Expand Down
13 changes: 10 additions & 3 deletions tests/data/example.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,16 @@ CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
Genome FASTA,,,,,TRUE,DataProperty,,,
MockComponent,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
Check List,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check Regex List,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
MockComponent,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
Check List,,,,,TRUE,DataProperty,,,list
Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list
Check List Like,,,,,TRUE,DataProperty,,,list like
Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like
Check List Strict,,,,,TRUE,DataProperty,,,list strict
Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f]
Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f]
Check Regex Single,,,,,TRUE,DataProperty,,,regex search [a-f]
Check Regex Format,,,,,TRUE,DataProperty,,,regex match [a-f]
Check Regex Integer,,,,,TRUE,DataProperty,,,regex search ^\d+$
Expand Down
Loading
Loading