Skip to content

Commit

Permalink
Merge pull request #1368 from Sage-Bionetworks/develop-vr-set-per-com…
Browse files Browse the repository at this point in the history
…ponent-FDS-1443

Feature: Component Based Rule Settings
  • Loading branch information
mialy-defelice authored Feb 15, 2024
2 parents 38cda95 + ed31d91 commit 8e8cf18
Show file tree
Hide file tree
Showing 13 changed files with 522 additions and 36 deletions.
7 changes: 6 additions & 1 deletion schematic/manifest/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pathlib import Path
import pygsheets as ps
from tempfile import NamedTemporaryFile
from typing import Dict, List, Optional, Tuple, Union, BinaryIO, Literal
from typing import Any, Dict, List, Optional, Tuple, Union, BinaryIO, Literal

from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
Expand All @@ -21,6 +21,7 @@
build_service_account_creds,
)
from schematic.utils.df_utils import update_df, load_df
from schematic.utils.schema_utils import extract_component_validation_rules
from schematic.utils.validate_utils import rule_in_rule_list
from schematic.utils.schema_utils import DisplayLabelType

Expand Down Expand Up @@ -1137,6 +1138,10 @@ def _create_requests_body(
validation_rules = self.dmge.get_node_validation_rules(
node_display_name=req
)
if isinstance(validation_rules, dict):
validation_rules = extract_component_validation_rules(
validation_rules=validation_rules, manifest_component=self.root
)

# Add regex match validation rule to Google Sheets.
if validation_rules and sheet_url:
Expand Down
9 changes: 9 additions & 0 deletions schematic/models/GE_Helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
from schematic.models.validate_attribute import GenerateError
from schematic.schemas.data_model_graph import DataModelGraphExplorer

from schematic.utils.schema_utils import extract_component_validation_rules

from schematic.utils.validate_utils import (
rule_in_rule_list,
np_array_to_str_list,
Expand Down Expand Up @@ -164,12 +166,19 @@ def build_expectation_suite(

# remove trailing/leading whitespaces from manifest
self.manifest.map(lambda x: x.strip() if isinstance(x, str) else x)

validation_rules = self.dmge.get_node_validation_rules(
node_display_name=col
)

# check if attribute has any rules associated with it
if validation_rules:
# Check if the validation rule applies to this manifest
if isinstance(validation_rules, dict):
validation_rules = extract_component_validation_rules(
manifest_component=self.manifest["Component"][0],
validation_rules=validation_rules,
)
# iterate through all validation rules for an attribute
for rule in validation_rules:
base_rule = rule.split(" ")[0]
Expand Down
4 changes: 0 additions & 4 deletions schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,10 +254,6 @@ def generate_type_error(
node_display_name=attribute_name
)

# TODO: Can remove when handling updated so split within graph
if validation_rules and "::" in validation_rules[0]:
validation_rules = validation_rules[0].split("::")

# If IsNA rule is being used to allow `Not Applicable` entries, do not log a message
if error_val.lower() == "not applicable" and rule_in_rule_list(
"IsNA", validation_rules
Expand Down
10 changes: 7 additions & 3 deletions schematic/models/validate_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from schematic.models.GE_Helpers import GreatExpectationsHelpers
from schematic.utils.validate_rules_utils import validation_rule_info
from schematic.utils.validate_utils import rule_in_rule_list
from schematic.utils.schema_utils import extract_component_validation_rules

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -184,9 +185,12 @@ def validate_manifest_rules(
manifest.map(lambda x: x.strip() if isinstance(x, str) else x)
validation_rules = dmge.get_node_validation_rules(node_display_name=col)

# TODO: Can remove when handling updated so split within graph
if validation_rules and "::" in validation_rules[0]:
validation_rules = validation_rules[0].split("::")
# Parse the validation rules
if validation_rules and isinstance(validation_rules, dict):
validation_rules = extract_component_validation_rules(
manifest_component=manifest["Component"][0],
validation_rules=validation_rules,
)

# Check that attribute rules conform to limits:
# no more than two rules for an attribute.
Expand Down
213 changes: 205 additions & 8 deletions schematic/utils/schema_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@
import json
import logging
import networkx as nx
import re
import string
from typing import Dict, List, Literal
from typing import List, Literal, Dict, Tuple, Union


logger = logging.getLogger(__name__)

DisplayLabelType = Literal["class_label", "display_label"]
BLACKLISTED_CHARS = ["(", ")", ".", " ", "-"]
COMPONENT_NAME_DELIMITER = "#"
COMPONENT_RULES_DELIMITER = "^^"
RULE_DELIMITER = "::"


def attr_dict_template(key_name: str) -> Dict[str, dict[str, dict]]:
Expand Down Expand Up @@ -143,7 +148,7 @@ def get_schema_label(
Returns:
label, str: class label of display name
Raises:
ValueError if entry_type.lower(), is not either 'class' or 'property'
Error Logged if entry_type.lower(), is not either 'class' or 'property'
"""
if entry_type.lower() == "class":
label = get_class_label_from_display_name(
Expand All @@ -155,7 +160,7 @@ def get_schema_label(
display_name=display_name, strict_camel_case=strict_camel_case
)
else:
raise ValueError(
logger.error(
f"The entry type submitted: {entry_type}, is not one of the permitted types: 'class' or 'property'"
)
return label
Expand Down Expand Up @@ -207,15 +212,207 @@ def convert_bool_to_str(provided_bool: bool) -> str:
return str(provided_bool)


def parse_validation_rules(validation_rules: List[str]) -> List[str]:
def get_individual_rules(
    rule: str, validation_rules: list[Union[str, None]]
) -> Union[str, list]:
    """Parse the rule text of one component and append it to the running list of rules.

    Args:
        rule, str: validation rule that has been parsed from a component rule.
        validation_rules, list: rules collected so far; empty the first time it is added to.
            The list is appended to in place.
    Returns:
        validation_rules, list: the list that was passed in, with one new entry appended:
            a list of individual rules when `rule` contains RULE_DELIMITER, otherwise `rule` itself.
    """
    # A rule containing the rule delimiter is really several rules; they are kept
    # together as a single nested entry so one entry is added per call.
    has_multiple_rules = RULE_DELIMITER in rule
    parsed_entry = (
        parse_single_set_validation_rules(rule) if has_multiple_rules else rule
    )
    validation_rules.append(parsed_entry)
    return validation_rules


def get_component_name_rules(
    component_names: list[Union[str, None]], component_rule: str
) -> Tuple[list, str]:
    """Split one component rule into its component name and its rule text.

    Args:
        component_names, list[Union[str,None]]: component names collected so far,
            empty if being added to for the first time. Appended to in place.
        component_rule, str: component rule string that has only been split by the COMPONENT_RULES_DELIMITER
    Returns:
        Tuple[list,str]: component_names with either the new component name or
            'all_other_components' appended, and the rule with the component name token stripped off.
    Raises:
        Error Logged if the rule starts with COMPONENT_NAME_DELIMITER but no component name follows it.
    """
    # If a component name is not attached to the rule, have it apply to all other components.
    # startswith (rather than indexing [0]) keeps an empty rule string from raising IndexError.
    if not component_rule.startswith(COMPONENT_NAME_DELIMITER):
        component_names.append("all_other_components")
        return component_names, component_rule

    # The component name is the first space-delimited token, e.g. "#Patient".
    # partition removes only that leading token, so the same text appearing
    # later in the rule (e.g. inside a regex) is left untouched.
    name_token, _, remaining_rule = component_rule.partition(" ")
    component_name = name_token.replace(COMPONENT_NAME_DELIMITER, "")
    component_names.append(component_name)

    # An empty name means the delimiter was followed directly by whitespace (e.g. "# Patient").
    if not component_name.strip():
        logger.error(
            f"There was an error capturing at least one of the component names "
            f"in the following rule: {component_rule}, "
            f"please ensure there is not extra whitespace or non-allowed characters."
        )

    return component_names, remaining_rule.strip()


def check_for_duplicate_components(
    component_names: list[str], validation_rule_string: str
) -> None:
    """Log an error when a component name appears more than once in a validation rule.

    Args:
        component_names, list[str]: list of components identified in the validation rule
        validation_rule_string, str: validation rule, only used to build the error message.
    Returns:
        None
    Raises: Error Logged if a component name is duplicated.
    """
    has_repeated_name = any(
        component_names.count(name) > 1 for name in component_names
    )
    # Nothing to report when every component name is unique.
    if not has_repeated_name:
        return

    logger.error(
        f"Oops, it looks like the following rule {validation_rule_string}, contains the same component "
        f"name more than once. An attribute can only have a single rule applied per manifest/component."
    )
    return


def parse_component_validation_rules(validation_rule_string: str) -> Dict:
    """Parse a component-formatted validation rule into a dictionary of components:rules.

    Args:
        validation_rule_string, str: validation rule provided by user.
    Returns:
        validation_rules_dict, dict: validation rules parsed to a dictionary where
            the key is the component name (or 'all_other_components') and the value is the
            parsed validation rule for the given component.
    """
    names = []
    rules = []

    # Each piece between COMPONENT_RULES_DELIMITERs holds the rule(s) for one component.
    for raw_segment in validation_rule_string.split(COMPONENT_RULES_DELIMITER):
        segment = raw_segment.strip()
        # Skip pieces that are empty once surrounding whitespace is removed.
        if not segment:
            continue

        # Peel the component name off the front of the rule first...
        names, segment = get_component_name_rules(
            component_names=names, component_rule=segment
        )
        # ...then parse whatever rule text is left.
        rules = get_individual_rules(rule=segment, validation_rules=rules)

    # Ensure we collected the component names and validation rules like expected
    if len(names) != len(rules):
        logger.error(
            f"The number of components names and validation rules does not match "
            f"for validation rule: {validation_rule_string}."
        )

    # A repeated component name is only logged; the dictionary is still built below.
    check_for_duplicate_components(names, validation_rule_string)

    return dict(zip(names, rules))


def parse_single_set_validation_rules(validation_rule_string: str) -> list:
    """Parse a single set of validation rules (one that applies to all components).

    Args:
        validation_rule_string, str: validation rule provided by user.
    Returns:
        list, the validation rule string split by the rule delimiter.
            An empty string yields a list holding one empty string.
    Raises:
        Error Logged (no exception is raised) if the string begins with the component name delimiter.
            This would indicate that a user was trying to set component rules, but did so improperly.
    """
    # Try to catch an improperly formatted rule.
    # startswith (rather than indexing [0]) keeps an empty rule string from raising IndexError.
    if validation_rule_string.startswith(COMPONENT_NAME_DELIMITER):
        logger.error(
            f"The provided validation rule {validation_rule_string}, looks to be formatted as a component "
            f"based rule, but is missing the necessary formatting, "
            f"please refer to the SchemaHub documentation for more details."
        )

    return validation_rule_string.split(RULE_DELIMITER)


def parse_validation_rules(validation_rules: Union[list, dict]) -> Union[list, dict]:
    """Parse validation rules into a list (one set for all components) or a dictionary (per-component rules).

    Args:
        validation_rules, Union[list, dict]: List or Dictionary of validation rules,
            if list, contains either a single unparsed validation rule string or rules that were already split;
            if dictionary, key is the component the rule (value) is applied to
    Returns:
        validation_rules, Union[list,dict]: Parsed validation rules, component rules are output as a dictionary,
            single sets are a list. An empty list is returned unchanged.
            None is returned when the input is neither a list nor a dictionary.
    Raises:
        Error Logged if Rule is not formatted properly
    """
    # Rules pulled in as a dict can be used directly. This check must come before
    # any indexing, since validation_rules[0] is a key lookup on a dict.
    if isinstance(validation_rules, dict):
        return validation_rules

    if not isinstance(validation_rules, list):
        return None

    # Nothing to parse; also avoids indexing into an empty list below.
    if not validation_rules:
        return validation_rules

    # If rules are already parsed from the JSONLD
    if len(validation_rules) > 1 and isinstance(validation_rules[-1], str):
        return validation_rules

    unparsed_rule = validation_rules[0]

    # Parse rules set for a subset of components/manifests. Component rules are
    # split on the component delimiter first; any rule delimiter inside them is
    # handled per component by the component parser.
    if COMPONENT_RULES_DELIMITER in unparsed_rule:
        return parse_component_validation_rules(validation_rule_string=unparsed_rule)

    # Parse rules that are set across *all* components/manifests
    return parse_single_set_validation_rules(validation_rule_string=unparsed_rule)


def extract_component_validation_rules(
    manifest_component: str, validation_rules: dict[str, list]
) -> list:
    """Pull the rule (if any) for a given manifest out of a component validation rule dictionary.

    Args:
        manifest_component, str: Component label, pulled from the manifest directly
        validation_rules, dict[str, Union[list,str]]: Validation rules dictionary, where keys are the
            manifest component label, and the value is a parsed set of validation rules.
    Returns:
        list: rule(s) for the provided manifest component if one is available;
            if no entry exists for the component but "all_other_components" is specified (as a key),
            that entry is used instead; otherwise an empty list.
            A string rule is wrapped in a one-element list, a list is returned as is, and an
            empty string or a value that is neither a string nor a list yields an empty list.
    """
    manifest_component_rule = validation_rules.get(manifest_component)

    # An entry set explicitly for this component wins, even when it is an empty
    # string: that means "no rule for this component" and must not fall back
    # to the rule for all other components.
    if manifest_component_rule is not None:
        selected_rule = manifest_component_rule
    else:
        selected_rule = validation_rules.get("all_other_components")

    if isinstance(selected_rule, list):
        return selected_rule
    if isinstance(selected_rule, str) and selected_rule:
        return [selected_rule]
    # Always hand back a list so callers never end up iterating over the
    # component dictionary itself.
    return []


Expand Down
2 changes: 1 addition & 1 deletion schematic/utils/validate_rules_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""validate rules utils"""

from typing import Union
import logging
from typing import Union
from jsonschema import ValidationError


Expand Down
4 changes: 2 additions & 2 deletions tests/data/example.model.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
Component,,,,,TRUE,,,,
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType,,,
Patient ID,,,,,TRUE,DataProperty,,,
Patient ID,,,,,TRUE,DataProperty,,,#Patient unique warning^^#Biospecimen unique error
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
Year of Birth,,,,,FALSE,DataProperty,,,
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
Expand Down Expand Up @@ -42,4 +42,4 @@ Check Date,,,,,TRUE,DataProperty,,,date
Check NA,,,,,TRUE,DataProperty,,,int::IsNA
MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,,
MockRDB_id,,,,,TRUE,DataProperty,,,int
SourceManifest,,,,,TRUE,DataProperty,,,
SourceManifest,,,,,TRUE,DataProperty,,,
5 changes: 4 additions & 1 deletion tests/data/example.model.jsonld
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@
},
"sms:displayName": "Patient ID",
"sms:required": "sms:true",
"sms:validationRules": []
"sms:validationRules": {
"Biospecimen": "unique error",
"Patient": "unique warning"
}
},
{
"@id": "bts:Sex",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Sample ID,Patient ID,Tissue Status,Component
123,123,Healthy,Biospecimen
456,123,Healthy,Biospecimen
789,syn465,Healthy,Biospecimen
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Patient ID,Sex,Year of Birth,Diagnosis,Component,Cancer Type,Family History
123,Female,,Cancer,Patient,Breast,"Breast, Colorectal"
123,Male,,Healthy,Patient,,"Breast, Colorectal"
789,Other,,Healthy,Patient,,"Breast, Colorectal"
Loading

0 comments on commit 8e8cf18

Please sign in to comment.