Skip to content

Commit

Permalink
Merge pull request #1401 from Sage-Bionetworks/develop-required-valid…
Browse files Browse the repository at this point in the history
…ation-rule-FDS-1626

add functionality to set requirements based per component
  • Loading branch information
mialy-defelice authored Apr 19, 2024
2 parents 0b2d7b3 + 7d18e94 commit c51af2d
Show file tree
Hide file tree
Showing 15 changed files with 1,442 additions and 59 deletions.
58 changes: 36 additions & 22 deletions schematic/models/GE_Helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,37 @@
rule_in_rule_list,
np_array_to_str_list,
iterable_to_str_list,
required_is_only_rule,
)

logger = logging.getLogger(__name__)


# Modifiers that users can attach to a validation rule; they are not rules themselves.
# Update this list whenever a new modifier is introduced.

RULE_MODIFIERS = ["error", "warning", "strict", "like", "set", "value"]
# Maps the base name of a validation rule (the first space-separated token of the rule
# string) to the name of the Great Expectations expectation type used when the
# expectation for that rule is created.
VALIDATION_EXPECTATION = {
"int": "expect_column_values_to_be_in_type_list",
"float": "expect_column_values_to_be_in_type_list",
"str": "expect_column_values_to_be_of_type",
"num": "expect_column_values_to_be_in_type_list",
"date": "expect_column_values_to_be_dateutil_parseable",
"recommended": "expect_column_values_to_not_be_null",
"protectAges": "expect_column_values_to_be_between",
"unique": "expect_column_values_to_be_unique",
"inRange": "expect_column_values_to_be_between",
"IsNA": "expect_column_values_to_match_regex_list",
# Rules that are not implemented yet, listed with the expectations they could map to:
# "list": "expect_column_values_to_not_match_regex_list",
# "regex": "expect_column_values_to_match_regex",
# "url": "expect_column_values_to_be_valid_urls",
# "matchAtLeastOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
# "matchExactlyOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
# "matchNone": "expect_compound_columns_to_be_unique",
}


class GreatExpectationsHelpers(object):
"""
Great Expectations helper class
Expand Down Expand Up @@ -134,25 +160,6 @@ def build_expectation_suite(
saves expectation suite and identifier to self
"""
validation_expectation = {
"int": "expect_column_values_to_be_in_type_list",
"float": "expect_column_values_to_be_in_type_list",
"str": "expect_column_values_to_be_of_type",
"num": "expect_column_values_to_be_in_type_list",
"date": "expect_column_values_to_be_dateutil_parseable",
"recommended": "expect_column_values_to_not_be_null",
"protectAges": "expect_column_values_to_be_between",
"unique": "expect_column_values_to_be_unique",
"inRange": "expect_column_values_to_be_between",
"IsNA": "expect_column_values_to_match_regex_list",
# To be implemented rules with possible expectations
# "list": "expect_column_values_to_not_match_regex_list",
# "regex": "expect_column_values_to_match_regex",
# "url": "expect_column_values_to_be_valid_urls",
# "matchAtLeastOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
# "matchExactlyOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
# "matchNone": "expect_compound_columns_to_be_unique",
}

# create blank expectation suite
self.expectation_suite_name = "Manifest_test_suite"
Expand Down Expand Up @@ -185,7 +192,14 @@ def build_expectation_suite(
base_rule = rule.split(" ")[0]

# check if rule has an implemented expectation
if rule_in_rule_list(rule, self.unimplemented_expectations):
if rule_in_rule_list(
rule, self.unimplemented_expectations
) or required_is_only_rule(
rule=rule,
attribute=col,
rule_modifiers=RULE_MODIFIERS,
validation_expectation=VALIDATION_EXPECTATION,
):
continue

args["column"] = col
Expand Down Expand Up @@ -328,7 +342,7 @@ def build_expectation_suite(
rule=rule,
args=args,
meta=meta,
validation_expectation=validation_expectation,
validation_expectation=VALIDATION_EXPECTATION,
)

self.context.update_expectation_suite(
Expand Down Expand Up @@ -368,7 +382,7 @@ def add_expectation(
# Create an Expectation
expectation_configuration = ExpectationConfiguration(
# Name of expectation type being added
expectation_type=validation_expectation[rule.split(" ")[0]],
expectation_type=VALIDATION_EXPECTATION[rule.split(" ")[0]],
# add arguments and meta message
kwargs={**args},
meta={**meta},
Expand Down
5 changes: 4 additions & 1 deletion schematic/models/validate_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,14 @@ def check_max_rule_num(
errors, list[list[str]]: list of errors being compiled, with additional error list being appended if appropriate
"""
# Check that attribute rules conform to limits:
# IsNa is operates differently than most rules, do not consider it as a rule for evaluating
# IsNa and required operate differently than most rules, so do not count them as rules when evaluating
# if the number of rule pairs has been exceeded.
if "IsNa" in validation_rules:
validation_rules.remove("IsNa")

if "required" in validation_rules:
validation_rules.remove("required")

# no more than two rules for an attribute.
# As more combinations get added, may want to bring out into its own function / or use validate_rules_utils?
if len(validation_rules) > 2:
Expand Down
94 changes: 93 additions & 1 deletion schematic/schemas/data_model_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
get_property_label_from_display_name,
get_class_label_from_display_name,
DisplayLabelType,
extract_component_validation_rules,
)
from schematic.utils.general import unlist
from schematic.utils.viz_utils import visualize
from schematic.utils.validate_utils import rule_in_rule_list


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -226,6 +229,95 @@ def get_adjacent_nodes_by_relationship(

return list(nodes)

def get_component_node_required(
    self,
    manifest_component: str,
    node_validation_rules: Optional[list[str]] = None,
    node_label: Optional[str] = None,
    node_display_name: Optional[str] = None,
) -> bool:
    """Determine whether a node is required within a specific manifest component.

    A requirement can come from two places: a 'required' validation rule that applies to
    the given component, or the Required field of the node in the data model.

    Args:
        manifest_component: str, display name of the manifest component the node belongs to.
        node_validation_rules: list[str], validation rules for the node and component. When
            empty or omitted, they are looked up for the given component.
        node_label: str, label of the node being queried.
        node_display_name: str, display name of the node being queried.
    Returns:
        bool, True if the node is required, False if it is not.
    """
    # Use the rules handed in by the caller, otherwise look them up for this component.
    component_rules = node_validation_rules or self.get_component_node_validation_rules(
        manifest_component=manifest_component,
        node_label=node_label,
        node_display_name=node_display_name,
    )

    if not rule_in_rule_list("required", component_rules):
        # No requirement set through the validation rules: fall back to the standard
        # Required field recorded in the model.
        return self.get_node_required(
            node_label=node_label, node_display_name=node_display_name
        )

    # The validation rule marks the node as required for this component. The Required
    # field is expected to be False in that case; if it is not, report the conflict.
    required_in_model = self.get_node_required(
        node_label=node_label, node_display_name=node_display_name
    )
    if required_in_model:
        if not node_display_name:
            # The display name is only needed for the message, so resolve it from the
            # graph when the caller identified the node by label.
            assert node_label is not None
            node_attributes = self.graph.nodes[node_label]
            node_display_name = node_attributes[
                self.rel_dict["displayName"]["node_label"]
            ]

        message_parts = [
            f"For component: {manifest_component} and attribute: {node_display_name}",
            "requirements are being specified in both the Required field and in the",
            "Validation Rules. If you desire to use validation rules to set component",
            "specific requirements for this attribute",
            "then the Required field needs to be set to False, or the validation may",
            "not work as intended, for other components where the attribute",
            "that should not be required.",
        ]
        logger.error(" ".join(message_parts))

    return True

def get_component_node_validation_rules(
    self,
    manifest_component: str,
    node_label: Optional[str] = None,
    node_display_name: Optional[str] = None,
) -> list[str]:
    """Return the validation rules of a node as they apply to one manifest component.

    Args:
        manifest_component: str, display name of the manifest component the node belongs to.
        node_label: str, label of the node being queried.
        node_display_name: str, display name of the node being queried.
    Returns:
        validation_rules: list[str], validation rules for the given node and component.
    """
    rules = self.get_node_validation_rules(
        node_label=node_label, node_display_name=node_display_name
    )

    # Rules that are empty, or not stored as a dict, are returned exactly as retrieved.
    if not rules or not isinstance(rules, dict):
        return rules

    # A non-empty dict is handed to the helper that extracts the rules for this component.
    return extract_component_validation_rules(
        manifest_component=manifest_component,
        validation_rules_dict=rules,
    )

def get_component_requirements(
self,
source_component: str,
Expand Down Expand Up @@ -671,7 +763,7 @@ def get_node_required(

def get_node_validation_rules(
self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
) -> list:
) -> Union[list, dict[str, str]]:
"""Get validation rules associated with a node,
Args:
Expand Down
20 changes: 10 additions & 10 deletions schematic/schemas/data_model_json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,16 @@ def get_json_validation_schema(
range_domain_map[node] = []
range_domain_map[node].append(node_display_name)

# can this node be map to the empty set (if required no; if not required yes)
# TODO: change "required" to different term, required may be a bit misleading
# (i.e. is the node required in the schema)
node_required = self.dmge.get_node_required(node_label=process_node)

# get any additional validation rules associated with this node (e.g. can this node
# be mapped to a list of other nodes)
node_validation_rules = self.dmge.get_node_validation_rules(
node_display_name=node_display_name
# Get node validation rules for the current node, and the given component
node_validation_rules = self.dmge.get_component_node_validation_rules(
manifest_component=source_node, node_display_name=node_display_name
)

# Get if the node is required for the given component
node_required = self.dmge.get_component_node_required(
manifest_component=source_node,
node_validation_rules=node_validation_rules,
node_display_name=node_display_name,
)

if node_display_name in reverse_dependencies:
Expand Down Expand Up @@ -231,7 +232,6 @@ def get_json_validation_schema(
# set schema conditional dependencies
for node in reverse_dependencies[node_display_name]:
# set all of the conditional nodes that require this process node

# get node domain if any
# ow this node is a conditional requirement
if node in range_domain_map:
Expand Down
73 changes: 73 additions & 0 deletions schematic/utils/validate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import re
from collections.abc import Mapping
import logging
from typing import Pattern, Union, Iterable, Any, Optional
from numbers import Number
from jsonschema import validate
Expand All @@ -12,6 +13,8 @@
from schematic.utils.io_utils import load_json
from schematic import LOADER

logger = logging.getLogger(__name__)


def validate_schema(schema: Union[Mapping, bool]) -> None:
"""Validate schema against schema.org standard"""
Expand Down Expand Up @@ -104,3 +107,73 @@ def iterable_to_str_list(obj: Union[str, Number, Iterable]) -> list[str]:
# If the object is iterable and not a string, convert every element
# to string and wrap as a list
return [str(item) for item in obj]


def required_is_only_rule(
    rule: str,
    attribute: str,
    rule_modifiers: list[str],
    validation_expectation: dict[str, str],
) -> bool:
    """Determine whether 'required' is the only rule set in a validation rule string.

    Checking this way means no position has to be enforced for 'required' within the rule
    string. It also ensures 'required' is not treated like a real rule when it is
    accidentally combined with rule modifiers.

    Matching is case-insensitive and ignores leading, trailing and repeated whitespace.

    Args:
        rule: str, the validation rule string
        attribute: str, attribute the validation rule is set to
        rule_modifiers: list[str], list of rule modifiers available to add to rules
        validation_expectation: dict[str, str], currently implemented expectations.
    Returns:
        bool, True if required is the only rule, False if it is not.
    """
    # Lowercase so capitalization does not throw off the determination, and split on any
    # run of whitespace so stray spaces do not produce empty tokens.
    rule_parts = rule.lower().split()

    # If required is not one of the rule parts, it cannot be the only rule.
    if "required" not in rule_parts:
        return False

    # The entire rule is just 'required'.
    if rule_parts == ["required"]:
        return True

    # If an implemented expectation rule is part of the rule, required is not alone.
    # Keys are lowercased because the rule parts were lowercased above; otherwise
    # mixed-case keys could never match.
    # This function is called as part of a check that already looks for in-house rules,
    # so those are not searched for here.
    implemented_rules = {name.lower() for name in validation_expectation}
    if any(part in implemented_rules for part in rule_parts):
        return False

    # Drop every rule modifier; if only 'required' is left, someone attached standard
    # modifiers to required.
    known_modifiers = {modifier.lower() for modifier in rule_modifiers}
    remaining_parts = [part for part in rule_parts if part not in known_modifiers]

    if remaining_parts == ["required"]:
        # Modifiers are not the expected use with required: warn, but let the user proceed.
        logger.warning(
            "For Attribute: %s, it looks like required was set as a single rule, "
            "with modifiers attached. "
            "Rule modifiers do not work in conjunction with the required validation rule. "
            "Please reformat your rule.",
            attribute,
        )
        return True

    # No other condition has been met. If the rule is not a real rule, an error will be
    # raised from the containing function.
    return False
21 changes: 21 additions & 0 deletions tests/data/example_required_vr_test.model.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
Component,,,,,TRUE,,,,
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType,,,
Patient ID,,,,,FALSE,DataProperty,,,#Patient unique warning^^#Biospecimen unique required error
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
Year of Birth,,,,,FALSE,DataProperty,,,
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,,
Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict
Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,,
Sample ID,,,,,TRUE,DataProperty,,,
Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,,
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,,
Filename,,,,,TRUE,DataProperty,,,
File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,FALSE,DataProperty,,,^^#BulkRNA-seqAssay required
BAM,,,Genome Build,,FALSE,ValidValue,,,
CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
Genome FASTA,,,,,TRUE,DataProperty,,,
Loading

0 comments on commit c51af2d

Please sign in to comment.