Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Component Based Rule Settings #1368

Merged
merged 25 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c4665df
add component based rule feature
mialy-defelice Feb 9, 2024
b834032
run black
mialy-defelice Feb 9, 2024
2986880
clean up and refactor some functions
mialy-defelice Feb 9, 2024
659e4a4
continue cleanup, and refactor functions to break down more
mialy-defelice Feb 9, 2024
4ac1f98
add new ManifestValidation Test
mialy-defelice Feb 13, 2024
0f652aa
add new tests to test_utils and run black formatter
mialy-defelice Feb 13, 2024
ac94b2d
update test_run_rel_functions to work better with new rule handling
mialy-defelice Feb 13, 2024
aab7682
update syntax, clean and except errors
mialy-defelice Feb 13, 2024
ef22e8e
run black on test_validation
mialy-defelice Feb 14, 2024
a9c42bd
run black on schema_utils
mialy-defelice Feb 14, 2024
170a975
run black on test_schemas
mialy-defelice Feb 14, 2024
2cb4475
add updated data model with PatientID validation rule
mialy-defelice Feb 14, 2024
8c65b57
update path to new model and dmge
mialy-defelice Feb 14, 2024
c91f5a1
add additional mock manifests for component rule testing
mialy-defelice Feb 14, 2024
0b9127d
merge with develop, resolve merge conflicts
mialy-defelice Feb 14, 2024
8a2809b
run black on schema_utils
mialy-defelice Feb 14, 2024
1ea120e
clean up validate_rules_utils imports
mialy-defelice Feb 14, 2024
9c25d9c
tests/test_schemas.py
mialy-defelice Feb 14, 2024
b2e436c
revert some test relationships
mialy-defelice Feb 14, 2024
ffc9140
fix indent issue in test_validation
mialy-defelice Feb 14, 2024
41640e5
update typing in docstring
mialy-defelice Feb 15, 2024
c7580c2
add additional function to check for duplicate components and additio…
mialy-defelice Feb 15, 2024
47028fa
run black
mialy-defelice Feb 15, 2024
a0b3b73
add docstrings and clean up, address andrews comments
mialy-defelice Feb 15, 2024
ed31d91
switch from try:except to logger
mialy-defelice Feb 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion schematic/manifest/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pathlib import Path
import pygsheets as ps
from tempfile import NamedTemporaryFile
from typing import Dict, List, Optional, Tuple, Union, BinaryIO, Literal
from typing import Any, Dict, List, Optional, Tuple, Union, BinaryIO, Literal

from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
Expand All @@ -21,6 +21,7 @@
build_service_account_creds,
)
from schematic.utils.df_utils import update_df, load_df
from schematic.utils.schema_utils import extract_component_validation_rules
from schematic.utils.validate_utils import rule_in_rule_list
from schematic.utils.schema_utils import DisplayLabelType

Expand Down Expand Up @@ -1137,6 +1138,10 @@ def _create_requests_body(
validation_rules = self.dmge.get_node_validation_rules(
node_display_name=req
)
if isinstance(validation_rules, dict):
validation_rules = extract_component_validation_rules(
validation_rules=validation_rules, manifest_component=self.root
)

# Add regex match validaiton rule to Google Sheets.
if validation_rules and sheet_url:
Expand Down
9 changes: 9 additions & 0 deletions schematic/models/GE_Helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
from schematic.models.validate_attribute import GenerateError
from schematic.schemas.data_model_graph import DataModelGraphExplorer

from schematic.utils.schema_utils import extract_component_validation_rules

from schematic.utils.validate_utils import (
rule_in_rule_list,
np_array_to_str_list,
Expand Down Expand Up @@ -164,12 +166,19 @@ def build_expectation_suite(

# remove trailing/leading whitespaces from manifest
self.manifest.map(lambda x: x.strip() if isinstance(x, str) else x)

validation_rules = self.dmge.get_node_validation_rules(
node_display_name=col
)

# check if attribute has any rules associated with it
if validation_rules:
# Check if the validation rule applies to this manifest
if isinstance(validation_rules, dict):
validation_rules = extract_component_validation_rules(
manifest_component=self.manifest["Component"][0],
validation_rules=validation_rules,
)
# iterate through all validation rules for an attribute
for rule in validation_rules:
base_rule = rule.split(" ")[0]
Expand Down
4 changes: 0 additions & 4 deletions schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,10 +254,6 @@ def generate_type_error(
node_display_name=attribute_name
)

# TODO: Can remove when handling updated so split within graph
if validation_rules and "::" in validation_rules[0]:
validation_rules = validation_rules[0].split("::")

# If IsNA rule is being used to allow `Not Applicable` entries, do not log a message
if error_val.lower() == "not applicable" and rule_in_rule_list(
"IsNA", validation_rules
Expand Down
10 changes: 7 additions & 3 deletions schematic/models/validate_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from schematic.models.GE_Helpers import GreatExpectationsHelpers
from schematic.utils.validate_rules_utils import validation_rule_info
from schematic.utils.validate_utils import rule_in_rule_list
from schematic.utils.schema_utils import extract_component_validation_rules

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -184,9 +185,12 @@ def validate_manifest_rules(
manifest.map(lambda x: x.strip() if isinstance(x, str) else x)
validation_rules = dmge.get_node_validation_rules(node_display_name=col)

# TODO: Can remove when handling updated so split within graph
if validation_rules and "::" in validation_rules[0]:
validation_rules = validation_rules[0].split("::")
# Parse the validation rules
if validation_rules and isinstance(validation_rules, dict):
validation_rules = extract_component_validation_rules(
manifest_component=manifest["Component"][0],
validation_rules=validation_rules,
)

# Check that attribute rules conform to limits:
# no more than two rules for an attribute.
Expand Down
213 changes: 205 additions & 8 deletions schematic/utils/schema_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@
import json
import logging
import networkx as nx
import re
import string
from typing import Dict, List, Literal
from typing import List, Literal, Dict, Tuple, Union


logger = logging.getLogger(__name__)

DisplayLabelType = Literal["class_label", "display_label"]
BLACKLISTED_CHARS = ["(", ")", ".", " ", "-"]
COMPONENT_NAME_DELIMITER = "#"
COMPONENT_RULES_DELIMITER = "^^"
RULE_DELIMITER = "::"


def attr_dict_template(key_name: str) -> Dict[str, dict[str, dict]]:
Expand Down Expand Up @@ -143,7 +148,7 @@ def get_schema_label(
Returns:
label, str: class label of display name
Raises:
ValueError if entry_type.lower(), is not either 'class' or 'property'
Error Logged if entry_type.lower(), is not either 'class' or 'property'
"""
if entry_type.lower() == "class":
label = get_class_label_from_display_name(
Expand All @@ -155,7 +160,7 @@ def get_schema_label(
display_name=display_name, strict_camel_case=strict_camel_case
)
else:
raise ValueError(
logger.error(
f"The entry type submitted: {entry_type}, is not one of the permitted types: 'class' or 'property'"
)
return label
Expand Down Expand Up @@ -207,15 +212,207 @@ def convert_bool_to_str(provided_bool: bool) -> str:
return str(provided_bool)


def parse_validation_rules(validation_rules: List[str]) -> List[str]:
def get_individual_rules(
    rule: str, validation_rules: list[Union[str, None]]
) -> Union[str, list]:
    """Parse one component's rule string and append the result to the running list
    Args:
        rule, str: validation rule that has already had its component name stripped off.
        validation_rules, list: rules collected so far; empty on the first call.
            The list is appended to in place.
    Returns:
        validation_rules, list: the same list with one new entry appended. The entry is
            a list of rules when the rule string contains the rule delimiter,
            otherwise the rule string itself.
    """
    # A rule string holding the rule delimiter is really several rules chained
    # together; those are stored as a nested list so that each component still
    # maps to exactly one entry.
    parsed_entry = (
        parse_single_set_validation_rules(rule) if RULE_DELIMITER in rule else rule
    )
    validation_rules.append(parsed_entry)
    return validation_rules


def get_component_name_rules(
    component_names: list[Union[str, None]], component_rule: str
) -> Tuple[list, str]:
    """Get the component name and rule from a string that was initially split by the COMPONENT_RULES_DELIMITER
    Args:
        component_names, list[Union[str,None]]: list of components, will be empty if being added to
            for the first time. The list is appended to in place.
        component_rule, str: component rule string that has only been split by the COMPONENT_RULES_DELIMITER
    Returns:
        Tuple[list,str]: list with a new component name or 'all_other_components' appended,
            rule with the component name stripped off.
    Raises:
        Error Logged if a component name delimiter was found but no component name followed it.
    """
    # If a component name is not attached to the rule, have it apply to all other
    # components. startswith (rather than indexing [0]) keeps an empty string from
    # raising an IndexError.
    if not component_rule.startswith(COMPONENT_NAME_DELIMITER):
        component_names.append("all_other_components")
        return component_names, component_rule

    # The component name is the first space-separated token; everything after the
    # first space is the rule. Partitioning removes only that leading token, so a
    # later occurrence of the same text inside the rule is left intact.
    name_token, _, remaining_rule = component_rule.partition(" ")
    component_name = name_token.replace(COMPONENT_NAME_DELIMITER, "")
    component_names.append(component_name)

    # A delimiter followed directly by whitespace (e.g. "# Patient unique") yields
    # an empty name, which means the rule was not formatted correctly.
    if not component_name.strip():
        logger.error(
            f"There was an error capturing at least one of the component names "
            f"in the following rule: {component_rule}, "
            f"please ensure there is not extra whitespace or non-allowed characters."
        )

    return component_names, remaining_rule.strip()


def check_for_duplicate_components(
    component_names: list[str], validation_rule_string: str
) -> None:
    """Log an error when a component name appears more than once in a validation rule
    Args:
        component_names, list[str]: list of components identified in the validation rule
        validation_rule_string, str: validation rule, only used to build the error message.
    Returns:
        None
    Raises: Error Logged if a component name is duplicated.
    """
    # Collapsing the names into a set drops repeats, so a size mismatch means at
    # least one component name was listed more than once.
    has_duplicates = len(set(component_names)) != len(component_names)
    if has_duplicates:
        logger.error(
            f"Oops, it looks like the following rule {validation_rule_string}, contains the same component "
            f"name more than once. An attribute can only have a single rule applied per manifest/component."
        )


def parse_component_validation_rules(validation_rule_string: str) -> Dict:
    """Parse a component-formatted validation rule string into a dictionary of components:rules
    Args:
        validation_rule_string, str: validation rule provided by user.
    Returns:
        validation_rules_dict, dict: validation rules parsed to a dictionary where
            the key is the component name (or 'all_other_components') and the value is
            the parsed validation rule for the given component.
    Raises:
        Error Logged if the number of component names and rules collected differ,
            or if a component name is repeated.
    """
    collected_names = []
    collected_rules = []

    # Each chunk between component-rule delimiters describes one component's rules
    for chunk in validation_rule_string.split(COMPONENT_RULES_DELIMITER):
        chunk = chunk.strip()
        # Skip empty chunks (e.g. produced by a leading or trailing delimiter)
        if not chunk:
            continue

        # Peel the component name off the front of the chunk
        collected_names, chunk = get_component_name_rules(
            component_names=collected_names, component_rule=chunk
        )

        # What is left is the rule (or rules) for that component
        collected_rules = get_individual_rules(
            rule=chunk, validation_rules=collected_rules
        )

    # Ensure we collected the component names and validation rules like expected
    if len(collected_names) != len(collected_rules):
        logger.error(
            f"The number of components names and validation rules does not match "
            f"for validation rule: {validation_rule_string}."
        )

    # A repeated component name is logged as an error.
    check_for_duplicate_components(collected_names, validation_rule_string)

    return dict(zip(collected_names, collected_rules))


def parse_single_set_validation_rules(validation_rule_string: str) -> list:
    """Parse a single set of validation rules.
    Args:
        validation_rule_string, str: validation rule provided by user.
    Returns:
        list, the validation rule string split by the rule delimiter
    Raises:
        Error Logged if the string starts with the component name delimiter.
            This would indicate that a user was trying to set component rules, but did so improperly.
            The string is still split and returned.
    """
    # Try to catch an improperly formatted rule. startswith (rather than
    # indexing [0]) keeps an empty rule string from raising an IndexError.
    if validation_rule_string.startswith(COMPONENT_NAME_DELIMITER):
        logger.error(
            f"The provided validation rule {validation_rule_string}, looks to be formatted as a component "
            f"based rule, but is missing the necessary formatting, "
            f"please refer to the SchemaHub documentation for more details."
        )

    return validation_rule_string.split(RULE_DELIMITER)


def parse_validation_rules(validation_rules: Union[list, dict]) -> Union[list, dict]:
    """Parse validation rules into a list of rules or a component:rules dictionary
    Args:
        validation_rules, Union[list, dict]: List or Dictionary of validation rules,
            if list, contains a string validation rule; if dictionary, key is the component the
            rule (value) is applied to
    Returns:
        validation_rules, Union[list,dict]: Parsed validation rules, component rules are output as a dictionary,
            single sets are a list. An empty list is returned unchanged; any input that is
            neither a list nor a dictionary yields None.
    Raises:
        Error Logged if Rule is not formatted properly
    """
    # Rules pulled in as a dict can be used directly. This check has to come before
    # any positional indexing, since validation_rules[0] on a dict is a key lookup.
    if isinstance(validation_rules, dict):
        return validation_rules

    if not isinstance(validation_rules, list):
        return None

    # Nothing to parse; also avoids indexing into an empty list below.
    if not validation_rules:
        return validation_rules

    # More than one entry means the rules were already split apart
    # (e.g. when they are read back from the JSONLD).
    if len(validation_rules) > 1 and isinstance(validation_rules[-1], str):
        return validation_rules

    # Parse rules set for a subset of components/manifests. This must happen before
    # any split on the rule delimiter, because a component rule may itself contain it.
    if COMPONENT_RULES_DELIMITER in validation_rules[0]:
        return parse_component_validation_rules(
            validation_rule_string=validation_rules[0]
        )

    # Parse rules that are set across *all* components/manifests
    return parse_single_set_validation_rules(
        validation_rule_string=validation_rules[0]
    )


def extract_component_validation_rules(
    manifest_component: str, validation_rules: dict[str, list]
) -> list:
    """Pull the rule (if any) for a given manifest out of a component validation rule dictionary
    Args:
        manifest_component, str: Component label, pulled from the manifest directly
        validation_rules, dict[str, Union[list,str]]: Validation rules dictionary, where keys are the
            manifest component label, and the value is a parsed set of validation rules.
    Returns:
        list: rule(s) for the provided manifest component if an entry is available.
            If there is no entry for the component but "all_other_components" is a key, that entry is used.
            A single string rule is wrapped in a list; an empty string, a missing entry,
            or a value that is neither a string nor a list gives an empty list.
    """
    selected_rule = validation_rules.get(manifest_component)

    # An explicit entry for this component always wins, even when it is empty
    # (an empty entry means "no rule for this component"). The catch-all entry
    # is only consulted when the component has no entry at all.
    if selected_rule is None:
        selected_rule = validation_rules.get("all_other_components")

    if isinstance(selected_rule, str):
        # A lone rule is stored as a bare string; callers iterate over a list.
        return [selected_rule] if selected_rule else []
    if isinstance(selected_rule, list):
        return selected_rule

    # Missing entry or unsupported value type: always hand back a list so callers
    # never end up iterating over the input dictionary's keys.
    return []


Expand Down
2 changes: 1 addition & 1 deletion schematic/utils/validate_rules_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""validate rules utils"""

from typing import Union
import logging
from typing import Union
from jsonschema import ValidationError


Expand Down
4 changes: 2 additions & 2 deletions tests/data/example.model.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
Component,,,,,TRUE,,,,
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType,,,
Patient ID,,,,,TRUE,DataProperty,,,
Patient ID,,,,,TRUE,DataProperty,,,#Patient unique warning^^#Biospecimen unique error
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
Year of Birth,,,,,FALSE,DataProperty,,,
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
Expand Down Expand Up @@ -42,4 +42,4 @@ Check Date,,,,,TRUE,DataProperty,,,date
Check NA,,,,,TRUE,DataProperty,,,int::IsNA
MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,,
MockRDB_id,,,,,TRUE,DataProperty,,,int
SourceManifest,,,,,TRUE,DataProperty,,,
SourceManifest,,,,,TRUE,DataProperty,,,
5 changes: 4 additions & 1 deletion tests/data/example.model.jsonld
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@
},
"sms:displayName": "Patient ID",
"sms:required": "sms:true",
"sms:validationRules": []
"sms:validationRules": {
"Biospecimen": "unique error",
"Patient": "unique warning"
}
},
{
"@id": "bts:Sex",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Sample ID,Patient ID,Tissue Status,Component
123,123,Healthy,Biospecimen
456,123,Healthy,Biospecimen
789,syn465,Healthy,Biospecimen
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Patient ID,Sex,Year of Birth,Diagnosis,Component,Cancer Type,Family History
123,Female,,Cancer,Patient,Breast,"Breast, Colorectal"
123,Male,,Healthy,Patient,,"Breast, Colorectal"
789,Other,,Healthy,Patient,,"Breast, Colorectal"
Loading
Loading