Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Schema Refactor: Create tests for DataModelEdges #1306

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
0036db0
add docstring for cases to test
GiaJordan Sep 25, 2023
a374f99
add self loop to validation test model
GiaJordan Sep 28, 2023
9c7bd06
add slots for other tests
GiaJordan Sep 28, 2023
247b63a
add test to ensure self loop edges are not added
GiaJordan Sep 28, 2023
269ca0f
change edge checking
GiaJordan Sep 28, 2023
dcf4872
make a deepcopy
GiaJordan Sep 28, 2023
bedda93
add test for adding edges
GiaJordan Sep 29, 2023
1be618d
WIP add test for edge weights
GiaJordan Sep 29, 2023
02b1c61
add comments
GiaJordan Oct 10, 2023
8e21b6c
add weight checks in test
GiaJordan Oct 10, 2023
368834a
change test case
GiaJordan Oct 10, 2023
5cc5a2c
update expected schema validator errors
GiaJordan Oct 10, 2023
50344b4
update example and test model with new component
GiaJordan Oct 10, 2023
6af0009
add test for property weights
GiaJordan Oct 10, 2023
1044c39
remove `cohorts` component from test models
GiaJordan Oct 11, 2023
026862e
create new model for component w/ properties
GiaJordan Oct 11, 2023
1c4b880
parametrize data model path
GiaJordan Oct 11, 2023
5c5991c
remove print statement
GiaJordan Oct 11, 2023
dfc5ca3
use DMR fixture, change name cap
GiaJordan Oct 11, 2023
2ab025e
add fixture for DataModelEdges object
GiaJordan Oct 11, 2023
af51f09
use dataModelParser helper fxn
GiaJordan Oct 11, 2023
a7a9e7c
change df column names
GiaJordan Oct 11, 2023
47a49fb
change assertion logic to explicitly check that nodes were added
GiaJordan Oct 11, 2023
f7f7e58
update comments and log message
GiaJordan Oct 11, 2023
641748f
clean spacing / update comments
GiaJordan Oct 11, 2023
ec06372
Revert "update expected schema validator errors"
GiaJordan Oct 11, 2023
0271f7f
Revert "Revert "update expected schema validator errors""
GiaJordan Oct 11, 2023
27cb86b
change fixture name
GiaJordan Oct 18, 2023
b58a185
remove returns
GiaJordan Oct 18, 2023
5321ff3
add comment on negative weight
GiaJordan Oct 18, 2023
d0287ef
change var cap
GiaJordan Oct 18, 2023
8b843ee
add note on self loop error
GiaJordan Oct 19, 2023
531d476
Merge branch 'develop-refactor-schemas' into develop-generate-edge-te…
mialy-defelice Oct 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/data/example.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ Check Date,,,,,TRUE,DataProperty,,,date
Check NA,,,,,TRUE,DataProperty,,,int::IsNA
MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,,
MockRDB_id,,,,,TRUE,DataProperty,,,int
SourceManifest,,,,,TRUE,DataProperty,,,
SourceManifest,,,,,TRUE,DataProperty,,,
6 changes: 6 additions & 0 deletions tests/data/properties.test.model.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
cohorts,,,"Component, dataset_id, cohort_tag_id, id","name, dataset_id, cohort_tag_id, id",FALSE,,,,
cohort_tag_id,,,,,FALSE,,,,matchAtLeastOne tags.id set error
name,,,,,FALSE,,,,
dataset_id,,,,,FALSE,,,,
id,,,,,FALSE,,,,
4 changes: 2 additions & 2 deletions tests/data/validator_dag_test.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType
Patient ID,,,Patient,,TRUE,DataProperty,,,
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
Year of Birth,,,,,FALSE,DataProperty,,,
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
Diagnosis,,"Healthy, Cancer, Diagnosis",,,TRUE,DataProperty,,,
Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,,
Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",Cancer Type,,TRUE,DataProperty,,,list strict
Expand Down Expand Up @@ -41,4 +41,4 @@ Check Date,,,,,TRUE,DataProperty,,,date
Check NA,,,,,TRUE,DataProperty,,,int::IsNA
MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,,
MockRDB_id,,,,,TRUE,DataProperty,,,int
SourceManifest,,,,,TRUE,DataProperty,,,
SourceManifest,,,,,TRUE,DataProperty,,,
255 changes: 218 additions & 37 deletions tests/test_schemas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import copy
from copy import deepcopy
import json
import logging
import networkx as nx
Expand All @@ -8,6 +8,12 @@
import pytest
import random

from schematic.schemas.data_model_edges import DataModelEdges
from schematic.schemas.data_model_nodes import DataModelNodes
from schematic.schemas.data_model_relationships import (
DataModelRelationships
)

from schematic.utils.df_utils import load_df
from schematic.utils.schema_utils import get_label_from_display_name, get_attribute_display_name_from_label, convert_bool_to_str, parse_validation_rules
from schematic.utils.io_utils import load_json
Expand Down Expand Up @@ -47,16 +53,26 @@ def test_fake_func():
NODE_DISPLAY_NAME_DICT = {'Patient':False,
'Sex': True}

def get_data_model_parser(helpers, data_model_name: str = None):
    """Build a DataModelParser for the named data model.

    Args:
        helpers: Test helper object; only its ``get_data_path`` method is used.
        data_model_name: Name of the data model file, forwarded to
            ``helpers.get_data_path`` as ``path``.

    Returns:
        A ``DataModelParser`` created with the path returned by
        ``helpers.get_data_path``.
    """
    # Resolve the model location first, then hand it straight to the parser
    model_path = helpers.get_data_path(path=data_model_name)
    return DataModelParser(path_to_data_model=model_path)

def generate_graph_data_model(helpers, data_model_name):

def generate_graph_data_model(helpers, data_model_name: str) -> nx.MultiDiGraph:
"""
Simple helper function to generate a networkx graph data model from a CSV or JSONLD data model
"""

# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model_name)
data_model_parser = get_data_model_parser(
helpers=helpers, data_model_name=data_model_name
)

#Parse Model
# Parse Model
parsed_data_model = data_model_parser.parse_model()

# Convert parsed model to graph
Expand All @@ -68,45 +84,60 @@ def generate_graph_data_model(helpers, data_model_name):

return graph_data_model

def generate_data_model_nodes(helpers, data_model_name):

def generate_data_model_nodes(helpers, data_model_name: str) -> DataModelNodes:
# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model_name)
data_model_parser = get_data_model_parser(
helpers=helpers, data_model_name=data_model_name
)
# Parse Model
parsed_data_model = data_model_parser.parse_model()
# Instantiate DataModelNodes
data_model_nodes = DataModelNodes(attribute_relationships_dict=parsed_data_model)
return data_model_nodes


@pytest.fixture(name='dmjsonldp')
def fixture_dm_jsonld_parser():
yield DataModelJSONLDParser()
def get_data_model_json_schema(helpers, data_model_name: str = None):
# Get path to data model
fullpath = helpers.get_data_path(path=data_model_name)

@pytest.fixture
def DME(helpers, data_model_name='example.model.csv'):
'''
In future could pull using helpers.
'''
graph_data_model = generate_graph_data_model(helpers, data_model_name=data_model_name)
DME = DataModelGraphExplorer(graph_data_model)
yield DME

@pytest.fixture(name='dmcsvp')
def fixture_dm_csv_parser():
yield DataModelCSVParser()
# Get Data Model Graph
graph_data_model = generate_graph_data_model(
helpers, data_model_name=data_model_name
)

@pytest.fixture(name='relationships')

@pytest.fixture(name="relationships")
def get_relationships(helpers):
    """Yield the keys of a new DataModelRelationships' relationships dictionary as a list."""
    relationship_names = list(DataModelRelationships().relationships_dictionary.keys())
    yield relationship_names

@pytest.fixture(name="dmr")

@pytest.fixture(name="DMR")
def fixture_dmr():
    """Provide a newly constructed DataModelRelationships object to tests requesting ``DMR``."""
    relationships_obj = DataModelRelationships()
    yield relationships_obj


@pytest.fixture(name="csv_parser")
def fixture_dm_csv_parser():
    """Provide a newly constructed DataModelCSVParser to tests requesting ``csv_parser``."""
    parser = DataModelCSVParser()
    yield parser


@pytest.fixture(name="jsonld_parser")
def fixture_dm_jsonld_parser():
    """Provide a newly constructed DataModelJSONLDParser to tests requesting ``jsonld_parser``."""
    parser = DataModelJSONLDParser()
    yield parser

@pytest.fixture
def data_model_edges():
    """
    Provide a newly constructed DataModelEdges object for testing.

    TODO: Update naming for DataModelGraphExplorer and fixture to avoid overlapping namespace
    """
    edges_obj = DataModelEdges()
    yield edges_obj

class TestDataModelParser:
def test_get_base_schema_path(self, helpers):
'''Test that base schema path is returned properly.
Expand Down Expand Up @@ -242,7 +273,7 @@ def test_parse_jsonld_model(self, helpers, data_model, dmjsonldp):

class TestDataModelRelationships:
"""Tests for DataModelRelationships class"""
def test_define_data_model_relationships(self, dmr: DataModelRelationships):
def test_define_data_model_relationships(self, DMR: DataModelRelationships):
"""Tests relationships_dictionary created has correct keys"""
required_keys = [
'jsonld_key',
Expand All @@ -254,7 +285,7 @@ def test_define_data_model_relationships(self, dmr: DataModelRelationships):
required_edge_keys = ['edge_key', 'edge_dir']
required_node_keys = ['node_label', 'node_attr_dict']

relationships = dmr.relationships_dictionary
relationships = DMR.relationships_dictionary

for relationship in relationships.values():
for key in required_keys:
Expand All @@ -266,9 +297,9 @@ def test_define_data_model_relationships(self, dmr: DataModelRelationships):
for key in required_node_keys:
assert key in relationship.keys()

def test_define_required_csv_headers(self, dmr: DataModelRelationships):
def test_define_required_csv_headers(self, DMR: DataModelRelationships):
"""Tests method returns correct values"""
assert dmr.define_required_csv_headers() == [
assert DMR.define_required_csv_headers() == [
'Attribute',
'Description',
'Valid Values',
Expand All @@ -280,6 +311,7 @@ def test_define_required_csv_headers(self, dmr: DataModelRelationships):
'Source'
]


@pytest.mark.parametrize("edge", [True, False], ids=["True", "False"])
def test_retreive_rel_headers_dict(self, dmr: DataModelRelationships, edge:bool):
"""Tests method returns correct values"""
Expand Down Expand Up @@ -691,8 +723,164 @@ def test_generate_node(self, helpers, data_model):
assert True == (node_dict['label'] in graph_data_model.nodes)

class TestDataModelEdges:
def test_generate_edge(self,helpers):
return
"""
Cases to test
Where node == attribute_display_name
Weights
domain includes weights
list weights
single element weights
Edges
subClassOf/domainIncludes relationship edge
any other relationship edge
rangeIncludes relationship edge

"""
def test_skip_edge(self, helpers, DMR, data_model_edges):
    """Check that ``generate_edge`` adds no edge when the graph holds a single node.

    The graph contains only the ``Diagnosis`` node and only that node's entry
    of the parsed ``validator_dag_test.model.csv`` model is passed to
    ``generate_edge``, so the set of edges must be the same before and after
    the call.
    """
    # Instantiate graph object and set node
    G = nx.MultiDiGraph()
    node = "Diagnosis"

    # Instantiate Parser
    data_model_parser = helpers.get_data_model_parser("validator_dag_test.model.csv")

    # Parse Model
    parsed_data_model = data_model_parser.parse_model()

    # Instantiate data model Nodes object
    DMN = DataModelNodes(parsed_data_model)

    # Get edge relationships and all nodes from the parsed model
    edge_relationships = DMR.define_edge_relationships()
    all_nodes = DMN.gather_all_nodes(attr_rel_dict=parsed_data_model)

    # Sanity check to ensure that the node we intend to test exists in the data model
    assert node in all_nodes

    # Add a single node to the graph.
    # The node's attribute dict and the name -> attribute-dict lookup are kept as two
    # separate objects (as in the other edge tests), so the attribute dict is never
    # stored inside itself and the node is added with only its own attributes.
    node_dict = DMN.generate_node_dict(node, parsed_data_model)
    all_node_dict = {node: node_dict}
    G = DMN.generate_node(G, node_dict)

    # Snapshot the edges in the graph, there should be none
    before_edges = deepcopy(G.edges)

    # Generate an edge in the graph with one node and a subset of the parsed data model.
    # We're attempting to add an edge for a node that is the only one in the graph,
    # so `generate_edge` should skip adding edges and return the same graph
    G = data_model_edges.generate_edge(
        G,
        node,
        all_node_dict,
        {node: parsed_data_model[node]},
        edge_relationships,
    )

    # Assert that no edges were added and that the current graph edges are the same
    # as before the call to `generate_edge`
    assert before_edges == G.edges

@pytest.mark.parametrize(
    "node_to_add, edge_relationship",
    [
        ("DataType", "parentOf"),
        ("Female", "parentOf"),
        ("Sex", "requiresDependency"),
    ],
    ids=["subClassOf", "Valid Value", "all others"],
)
def test_generate_edge(self, helpers, DMR, data_model_edges, node_to_add, edge_relationship):
    """Check that ``generate_edge`` adds edges carrying the expected relationship key.

    Every node of ``validator_dag_test.model.csv`` is added to an empty graph,
    edges are generated for ``node_to_add`` only, and the resulting edge keys
    must include ``edge_relationship``.
    """
    # Parse the model and build the helper that creates node dictionaries
    parsed_model = helpers.get_data_model_parser("validator_dag_test.model.csv").parse_model()
    nodes_builder = DataModelNodes(parsed_model)

    # Collect the edge relationships and every node named in the parsed model
    edge_rels = DMR.define_edge_relationships()
    model_nodes = nodes_builder.gather_all_nodes(attr_rel_dict=parsed_model)

    # The node under test has to exist in the model for the test to be meaningful
    assert node_to_add in model_nodes

    # Populate a fresh graph with all nodes, remembering each node's dictionary by name
    graph = nx.MultiDiGraph()
    node_lookup = {}
    for name in model_nodes:
        node_lookup[name] = nodes_builder.generate_node_dict(name, parsed_model)
        graph = nodes_builder.generate_node(graph, node_lookup[name])

    # Snapshot the edges before any edge generation; only nodes have been added so far
    edges_before = deepcopy(graph.edges)

    # Generate edges for the node under test only
    graph = data_model_edges.generate_edge(
        graph, node_to_add, node_lookup, parsed_model, edge_rels
    )

    # The new edge set must be strictly larger than the snapshot
    assert graph.edges > edges_before

    # At least one edge in the graph must carry the expected relationship key
    edge_keys = pd.DataFrame(graph.edges, columns=['node1', 'node2', 'edge'])['edge']
    assert (edge_keys == edge_relationship).any()

@pytest.mark.parametrize(
    "node_to_add, other_node, expected_weight, data_model_path",
    [
        ("Patient ID", "Biospecimen", 1, "validator_dag_test.model.csv"),
        ("dataset_id", "cohorts", -1, "properties.test.model.csv"),
    ],
    ids=["list", "domainIncludes"],
)
def test_generate_weights(self, helpers, DMR, data_model_edges, node_to_add, other_node, expected_weight, data_model_path):
    """Check the ``weight`` attribute of edges created by ``generate_edge``.

    All nodes of the model at ``data_model_path`` are added to an empty graph
    and edges are generated for ``node_to_add``. A negative ``expected_weight``
    is a placeholder: the value is then looked up as the index of
    ``other_node`` in the schema data frame.

    NOTE(review): the final ``elif`` compares ``node_to_add`` with ``cohorts``,
    but the parametrized ``node_to_add`` values are ``Patient ID`` and
    ``dataset_id``, so the "domainIncludes" case currently makes no weight
    assertion. Confirm which edge row should be checked for that case.
    """
    # Parse the requested model and build the helper that creates node dictionaries
    parsed_model = helpers.get_data_model_parser(data_model_path).parse_model()
    nodes_builder = DataModelNodes(parsed_model)

    # Collect the edge relationships and every node named in the parsed model
    edge_rels = DMR.define_edge_relationships()
    model_nodes = nodes_builder.gather_all_nodes(attr_rel_dict=parsed_model)

    # The node under test has to exist in the model for the test to be meaningful
    assert node_to_add in model_nodes

    # Populate a fresh graph with all nodes, remembering each node's dictionary by name
    graph = nx.MultiDiGraph()
    node_lookup = {}
    for name in model_nodes:
        node_lookup[name] = nodes_builder.generate_node_dict(name, parsed_model)
        graph = nodes_builder.generate_node(graph, node_lookup[name])

    # Snapshot the edges before any edge generation; only nodes have been added so far
    edges_before = deepcopy(graph.edges)

    # Generate edges for the node under test only
    graph = data_model_edges.generate_edge(
        graph, node_to_add, node_lookup, parsed_model, edge_rels
    )

    # The new edge set must be strictly larger than the snapshot
    assert graph.edges > edges_before

    # Put the edges and their data dicts in a DataFrame indexed by the first node
    raw_edges = pd.DataFrame(graph.edges.data(), columns=['node1', 'node2', 'weights'])
    edges_and_weights = raw_edges.set_index('node1')

    # A negative expected weight means the weight cannot be known reliably beforehand
    # and must be determined by reading the schema: weights for properties are
    # determined by their order in the schema. Looking the index up here lets the test
    # keep working in the case where other attributes are added to the schema.
    if expected_weight < 0:
        schema = helpers.get_data_frame(path=helpers.get_data_path(data_model_path), data_model=True)
        expected_weight = schema.index[schema['Attribute']==other_node][0]
        logger.debug(f"Expected weight for the edge of nodes {node_to_add} and {other_node} is {expected_weight}.")

    # Assert that the weight added is what is expected
    if node_to_add == 'Patient ID':
        assert edges_and_weights.loc[other_node, 'weights']['weight'] == expected_weight
    elif node_to_add == 'cohorts':
        assert edges_and_weights.loc[node_to_add, 'weights']['weight'] == expected_weight




class TestDataModelJsonSchema:
Expand Down Expand Up @@ -994,10 +1182,3 @@ def test_convert_graph_to_jsonld(self, helpers, data_model):
jsonld_dm = convert_graph_to_jsonld(Graph=graph_data_model)
assert list(jsonld_dm.keys()) == ['@context', '@graph', '@id']
assert len(jsonld_dm['@graph']) > 1

class TestSchemas:
def test_convert_csv_to_graph(self, helpers):
return
def test_convert_jsonld_to_graph(self, helpers):
return

7 changes: 6 additions & 1 deletion tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ def test_check_graph_has_required_node_fields(self, helpers):
assert expected_error == validator_errors

def test_dag(self, helpers):
# TODO: The schema validator currently doesn't catch the Diagnosis-Diagnosis self loop.
# It is an expected error but it will need to be decided if the validator should prevent or allow such self loops

# Get graph data model
graph_data_model = graph_data_model_func(helpers, data_model_name='validator_dag_test.model.csv')

Expand All @@ -101,6 +104,8 @@ def test_dag(self, helpers):

# nodes could be in different order so need to account for that
expected_errors = ['Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: Patient and PatientID, please remove this loop from your model and submit again.',
'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: PatientID and Patient, please remove this loop from your model and submit again.']
'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: PatientID and Patient, please remove this loop from your model and submit again.',
'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: Diagnosis and Diagnosis, please remove this loop from your model and submit again.']

assert validator_errors[0] in expected_errors