Skip to content

Commit

Permalink
Merge pull request #1306 from Sage-Bionetworks/develop-generate-edge-…
Browse files Browse the repository at this point in the history
…test-FDS-1113

Schema Refactor: Create tests for `DataModelEdges`
  • Loading branch information
mialy-defelice authored Oct 19, 2023
2 parents 283ff95 + 531d476 commit e52b967
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 41 deletions.
2 changes: 1 addition & 1 deletion tests/data/example.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ Check Date,,,,,TRUE,DataProperty,,,date
Check NA,,,,,TRUE,DataProperty,,,int::IsNA
MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,,
MockRDB_id,,,,,TRUE,DataProperty,,,int
SourceManifest,,,,,TRUE,DataProperty,,,
SourceManifest,,,,,TRUE,DataProperty,,,
6 changes: 6 additions & 0 deletions tests/data/properties.test.model.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
cohorts,,,"Component, dataset_id, cohort_tag_id, id","name, dataset_id, cohort_tag_id, id",FALSE,,,,
cohort_tag_id,,,,,FALSE,,,,matchAtLeastOne tags.id set error
name,,,,,FALSE,,,,
dataset_id,,,,,FALSE,,,,
id,,,,,FALSE,,,,
4 changes: 2 additions & 2 deletions tests/data/validator_dag_test.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType
Patient ID,,,Patient,,TRUE,DataProperty,,,
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
Year of Birth,,,,,FALSE,DataProperty,,,
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
Diagnosis,,"Healthy, Cancer, Diagnosis",,,TRUE,DataProperty,,,
Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,,
Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",Cancer Type,,TRUE,DataProperty,,,list strict
Expand Down Expand Up @@ -41,4 +41,4 @@ Check Date,,,,,TRUE,DataProperty,,,date
Check NA,,,,,TRUE,DataProperty,,,int::IsNA
MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,,
MockRDB_id,,,,,TRUE,DataProperty,,,int
SourceManifest,,,,,TRUE,DataProperty,,,
SourceManifest,,,,,TRUE,DataProperty,,,
255 changes: 218 additions & 37 deletions tests/test_schemas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import copy
from copy import deepcopy
import json
import logging
import networkx as nx
Expand All @@ -8,6 +8,12 @@
import pytest
import random

from schematic.schemas.data_model_edges import DataModelEdges
from schematic.schemas.data_model_nodes import DataModelNodes
from schematic.schemas.data_model_relationships import (
DataModelRelationships
)

from schematic.utils.df_utils import load_df
from schematic.utils.schema_utils import get_label_from_display_name, get_attribute_display_name_from_label, convert_bool_to_str, parse_validation_rules
from schematic.utils.io_utils import load_json
Expand Down Expand Up @@ -47,16 +53,26 @@ def test_fake_func():
NODE_DISPLAY_NAME_DICT = {'Patient':False,
'Sex': True}

def get_data_model_parser(helpers, data_model_name: str = None):
# Get path to data model
fullpath = helpers.get_data_path(path=data_model_name)

# Instantiate DataModelParser
data_model_parser = DataModelParser(path_to_data_model=fullpath)
return data_model_parser

def generate_graph_data_model(helpers, data_model_name):

def generate_graph_data_model(helpers, data_model_name: str) -> nx.MultiDiGraph:
"""
Simple helper function to generate a networkx graph data model from a CSV or JSONLD data model
"""

# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model_name)
data_model_parser = get_data_model_parser(
helpers=helpers, data_model_name=data_model_name
)

#Parse Model
# Parse Model
parsed_data_model = data_model_parser.parse_model()

# Convert parsed model to graph
Expand All @@ -68,45 +84,60 @@ def generate_graph_data_model(helpers, data_model_name):

return graph_data_model

def generate_data_model_nodes(helpers, data_model_name):

def generate_data_model_nodes(helpers, data_model_name: str) -> DataModelNodes:
# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model_name)
data_model_parser = get_data_model_parser(
helpers=helpers, data_model_name=data_model_name
)
# Parse Model
parsed_data_model = data_model_parser.parse_model()
# Instantiate DataModelNodes
data_model_nodes = DataModelNodes(attribute_relationships_dict=parsed_data_model)
return data_model_nodes


@pytest.fixture(name='dmjsonldp')
def fixture_dm_jsonld_parser():
yield DataModelJSONLDParser()
def get_data_model_json_schema(helpers, data_model_name: str = None):
# Get path to data model
fullpath = helpers.get_data_path(path=data_model_name)

@pytest.fixture
def DME(helpers, data_model_name='example.model.csv'):
'''
In future could pull using helpers.
'''
graph_data_model = generate_graph_data_model(helpers, data_model_name=data_model_name)
DME = DataModelGraphExplorer(graph_data_model)
yield DME

@pytest.fixture(name='dmcsvp')
def fixture_dm_csv_parser():
yield DataModelCSVParser()
# Get Data Model Graph
graph_data_model = generate_graph_data_model(
helpers, data_model_name=data_model_name
)

@pytest.fixture(name='relationships')

@pytest.fixture(name="relationships")
def get_relationships(helpers):
DMR = DataModelRelationships()
relationships_dict = DMR.relationships_dictionary
relationships = list(relationships_dict.keys())
yield relationships

@pytest.fixture(name="dmr")

@pytest.fixture(name="DMR")
def fixture_dmr():
"""Yields a data model relationships object for testing"""
yield DataModelRelationships()


@pytest.fixture(name="csv_parser")
def fixture_dm_csv_parser():
yield DataModelCSVParser()


@pytest.fixture(name="jsonld_parser")
def fixture_dm_jsonld_parser():
yield DataModelJSONLDParser()

@pytest.fixture
def data_model_edges():
"""
Yields a Data Model Edges object for testing
TODO: Update naming for DataModelGraphExplorer and fixture to avoid overlapping namespace
"""
yield DataModelEdges()

class TestDataModelParser:
def test_get_base_schema_path(self, helpers):
'''Test that base schema path is returned properly.
Expand Down Expand Up @@ -242,7 +273,7 @@ def test_parse_jsonld_model(self, helpers, data_model, dmjsonldp):

class TestDataModelRelationships:
"""Tests for DataModelRelationships class"""
def test_define_data_model_relationships(self, dmr: DataModelRelationships):
def test_define_data_model_relationships(self, DMR: DataModelRelationships):
"""Tests relationships_dictionary created has correct keys"""
required_keys = [
'jsonld_key',
Expand All @@ -254,7 +285,7 @@ def test_define_data_model_relationships(self, dmr: DataModelRelationships):
required_edge_keys = ['edge_key', 'edge_dir']
required_node_keys = ['node_label', 'node_attr_dict']

relationships = dmr.relationships_dictionary
relationships = DMR.relationships_dictionary

for relationship in relationships.values():
for key in required_keys:
Expand All @@ -266,9 +297,9 @@ def test_define_data_model_relationships(self, dmr: DataModelRelationships):
for key in required_node_keys:
assert key in relationship.keys()

def test_define_required_csv_headers(self, dmr: DataModelRelationships):
def test_define_required_csv_headers(self, DMR: DataModelRelationships):
"""Tests method returns correct values"""
assert dmr.define_required_csv_headers() == [
assert DMR.define_required_csv_headers() == [
'Attribute',
'Description',
'Valid Values',
Expand All @@ -280,6 +311,7 @@ def test_define_required_csv_headers(self, dmr: DataModelRelationships):
'Source'
]


@pytest.mark.parametrize("edge", [True, False], ids=["True", "False"])
def test_retreive_rel_headers_dict(self, dmr: DataModelRelationships, edge:bool):
"""Tests method returns correct values"""
Expand Down Expand Up @@ -691,8 +723,164 @@ def test_generate_node(self, helpers, data_model):
assert True == (node_dict['label'] in graph_data_model.nodes)

class TestDataModelEdges:
def test_generate_edge(self,helpers):
return
"""
Cases to test
Where node == attribute_display_name
Weights
domain includes weights
list weights
single element weights
Edges
subClassOf/domainIncludes relationship edge
any other relationship edge
rangeIncludes relationship edge
"""
def test_skip_edge(self, helpers, DMR, data_model_edges):
# Instantiate graph object and set node
G = nx.MultiDiGraph()
node = "Diagnosis"

# Instantiate Parser
data_model_parser = helpers.get_data_model_parser("validator_dag_test.model.csv")

# Parse Model
parsed_data_model = data_model_parser.parse_model()

# Instantiate data model Nodes object
DMN = DataModelNodes(parsed_data_model)

# Get edge relationships and all nodes from the parsed model
edge_relationships = DMR.define_edge_relationships()
all_nodes = DMN.gather_all_nodes(attr_rel_dict=parsed_data_model)

# Sanity check to ensure that the node we intend to test exists in the data model
assert node in all_nodes

# Add a single node to the graph
node_dict = {}
node_dict = DMN.generate_node_dict(node, parsed_data_model)
node_dict[node] = node_dict
G = DMN.generate_node(G, node_dict)

# Check the edges in the graph, there should be none
before_edges = deepcopy(G.edges)

# Generate an edge in the graph with one node and a subset of the parsed data model
# We're attempting to add an edge for a node that is the only one in the graph,
# so `generate_edge` should skip adding edges and return the same graph
G = data_model_edges.generate_edge(G, node, node_dict, {node:parsed_data_model[node]}, edge_relationships)

# Assert that no edges were added and that the current graph edges are the same as before the call to `generate_edge`
assert before_edges == G.edges

@pytest.mark.parametrize("node_to_add, edge_relationship",
[("DataType", "parentOf"),
("Female", "parentOf"),
("Sex","requiresDependency")],
ids=["subClassOf",
"Valid Value",
"all others"
])
def test_generate_edge(self, helpers, DMR, data_model_edges, node_to_add, edge_relationship):
# Instantiate graph object
G = nx.MultiDiGraph()

# Instantiate Parser
data_model_parser = helpers.get_data_model_parser("validator_dag_test.model.csv")

#Parse Model
parsed_data_model = data_model_parser.parse_model()

# Instantiate data model Nodes object
DMN = DataModelNodes(parsed_data_model)

# Get edge relationships and all nodes from the parsed model
edge_relationships = DMR.define_edge_relationships()
all_nodes = DMN.gather_all_nodes(attr_rel_dict=parsed_data_model)

# Sanity check to ensure that the node we intend to test exists in the data model
assert node_to_add in all_nodes

# Add all nodes to the graph
all_node_dict = {}
for node in all_nodes:
node_dict = DMN.generate_node_dict(node, parsed_data_model)
all_node_dict[node] = node_dict
G = DMN.generate_node(G, node_dict)

# Check the edges in the graph, there should be none
before_edges = deepcopy(G.edges)

# Generate edges for whichever node we are testing
G = data_model_edges.generate_edge(G, node_to_add, all_node_dict, parsed_data_model, edge_relationships)

# Assert that the current edges are different from the edges of the graph before
assert G.edges > before_edges

# Assert that somewhere in the current edges for the node we added, that the correct relationship exists
relationship_df = pd.DataFrame(G.edges, columns= ['node1', 'node2', 'edge'])
assert (relationship_df['edge'] == edge_relationship).any()

@pytest.mark.parametrize("node_to_add, other_node, expected_weight, data_model_path",
[("Patient ID", "Biospecimen", 1, "validator_dag_test.model.csv"),
("dataset_id", "cohorts", -1, "properties.test.model.csv")],
ids=["list", "domainIncludes"])
def test_generate_weights(self, helpers, DMR, data_model_edges, node_to_add, other_node, expected_weight, data_model_path):
# Instantiate graph object
G = nx.MultiDiGraph()

# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_path)

#Parse Model
parsed_data_model = data_model_parser.parse_model()

# Instantiate data model Nodes object
DMN = DataModelNodes(parsed_data_model)

# Get edge relationships and all nodes from the parsed model
edge_relationships = DMR.define_edge_relationships()
all_nodes = DMN.gather_all_nodes(attr_rel_dict=parsed_data_model)

# Sanity check to ensure that the node we intend to test exists in the data model
assert node_to_add in all_nodes

# Add all nodes to the graph
all_node_dict = {}
for node in all_nodes:
node_dict = DMN.generate_node_dict(node, parsed_data_model)
all_node_dict[node] = node_dict
G = DMN.generate_node(G, node_dict)

# Check the edges in the graph, there should be none
before_edges = deepcopy(G.edges)

# Generate edges for whichever node we are testing
G = data_model_edges.generate_edge(G, node_to_add, all_node_dict, parsed_data_model, edge_relationships)

# Assert that the current edges are different from the edges of the graph before
assert G.edges > before_edges

# Cast the edges and weights to a DataFrame for easier indexing
edges_and_weights = pd.DataFrame(G.edges.data(), columns= ['node1', 'node2', 'weights']).set_index('node1')

# Weights are set to a negative nubmer to indicate that the weight cannot be known reliably beforehand and must be determined by reading the schema
# Get the index of the property in the schema
# Weights for properties are determined by their order in the schema.
# This would allow the tests to continue to function correctly in the case were other attributes were added to the schema
if expected_weight < 0:
schema = helpers.get_data_frame(path=helpers.get_data_path(data_model_path), data_model=True)
expected_weight = schema.index[schema['Attribute']==other_node][0]
logger.debug(f"Expected weight for the edge of nodes {node_to_add} and {other_node} is {expected_weight}.")

# Assert that the weight added is what is expected
if node_to_add in ['Patient ID']:
assert edges_and_weights.loc[other_node, 'weights']['weight'] == expected_weight
elif node_to_add in ['cohorts']:
assert edges_and_weights.loc[node_to_add, 'weights']['weight'] == expected_weight




class TestDataModelJsonSchema:
Expand Down Expand Up @@ -994,10 +1182,3 @@ def test_convert_graph_to_jsonld(self, helpers, data_model):
jsonld_dm = convert_graph_to_jsonld(Graph=graph_data_model)
assert list(jsonld_dm.keys()) == ['@context', '@graph', '@id']
assert len(jsonld_dm['@graph']) > 1

class TestSchemas:
def test_convert_csv_to_graph(self, helpers):
return
def test_convert_jsonld_to_graph(self, helpers):
return

7 changes: 6 additions & 1 deletion tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ def test_check_graph_has_required_node_fields(self, helpers):
assert expected_error == validator_errors

def test_dag(self, helpers):
# TODO: The schema validator currently doesn't catch the Diagnosis-Diagnosis self loop.
# It is an expected error but it will need to be decided if the validator should prevent or allow such self loops

# Get graph data model
graph_data_model = graph_data_model_func(helpers, data_model_name='validator_dag_test.model.csv')

Expand All @@ -101,6 +104,8 @@ def test_dag(self, helpers):

# nodes could be in different order so need to account for that
expected_errors = ['Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: Patient and PatientID, please remove this loop from your model and submit again.',
'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: PatientID and Patient, please remove this loop from your model and submit again.']
'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: PatientID and Patient, please remove this loop from your model and submit again.',
'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: Diagnosis and Diagnosis, please remove this loop from your model and submit again.']

assert validator_errors[0] in expected_errors

0 comments on commit e52b967

Please sign in to comment.