Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Schemas: unit tests FDS-1061 TestDataModelNodes #1310

Merged
2 changes: 1 addition & 1 deletion schematic/schemas/data_model_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def get_rel_node_dict_info(self, relationship: str) -> Optional[tuple[str, dict]
Returns:
rel_key, str: relationship node label
rel_node_dict, dict: node_attr_dict, from relationships dictionary for a given relationship

TODO: Move to data_model_relationships.
"""
for k,v in self.data_model_relationships.relationships_dictionary.items():
if k == relationship:
Expand Down
2 changes: 2 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from schematic.schemas.data_model_parser import DataModelParser
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_nodes import DataModelNodes
from schematic.schemas.data_model_json_schema import DataModelJSONSchema

from schematic.configuration.configuration import CONFIG
Expand Down Expand Up @@ -87,6 +88,7 @@ def get_data_model_parser(data_model_name:str=None, *paths):
# Instantiate DataModelParser
data_model_parser = DataModelParser(path_to_data_model=fullpath)
return data_model_parser

@staticmethod
def get_data_model_json_schema(data_model_name:str=None, *paths):
# Get path to data model
Expand Down
320 changes: 305 additions & 15 deletions tests/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
import os
import pandas as pd
import pytest
import random

from schematic.utils.df_utils import load_df
from schematic.utils.schema_utils import get_label_from_display_name, get_attribute_display_name_from_label, convert_bool_to_str, parse_validation_rules
from schematic.utils.io_utils import load_json

from schematic.schemas.data_model_graph import DataModelGraph
Expand All @@ -28,6 +30,25 @@
'example.model.jsonld': "JSONLD"
}

def test_fake_func():
    """Placeholder callable standing in for an unsupported relationship function.

    It is registered in REL_FUNC_DICT and handed to
    DataModelNodes.run_rel_functions by test_run_rel_functions, which
    expects that call to raise.
    """
    return None

# Test id -> function passed as `rel_func` to DataModelNodes.run_rel_functions.
# The last entry, test_fake_func, is the deliberately unsupported case.
REL_FUNC_DICT = {
    'get_attribute_display_name_from_label': get_attribute_display_name_from_label,
    'parse_validation_rules': parse_validation_rules,
    'get_label_from_display_name': get_label_from_display_name,
    'convert_bool_to_str': convert_bool_to_str,
    'test_fake_func': test_fake_func,
}
# Display name -> expected label for each entry type ('class' / 'property').
TEST_DN_DICT = {
    'Bio Things': {'class': 'BioThings', 'property': 'bioThings'},
    'bio things': {'class': 'Biothings', 'property': 'biothings'},
}
# Node display name -> whether test_generate_node_dict expects the generated
# node dict to have 'required' set to True.
NODE_DISPLAY_NAME_DICT = {
    'Patient': False,
    'Sex': True,
}


def generate_graph_data_model(helpers, data_model_name):
"""
Simple helper function to generate a networkx graph data model from a CSV or JSONLD data model
Expand All @@ -48,6 +69,16 @@ def generate_graph_data_model(helpers, data_model_name):

return graph_data_model

def generate_data_model_nodes(helpers, data_model_name):
    """
    Simple helper function to build a DataModelNodes object from a data model file.

    Args:
        helpers: test helpers object providing get_data_model_parser.
        data_model_name: name of the data model, forwarded to get_data_model_parser.

    Returns:
        DataModelNodes instantiated with the parsed model as its
        attribute_relationships_dict.
    """
    # Parse the model, then hand the parsed result straight to DataModelNodes.
    parser = helpers.get_data_model_parser(data_model_name=data_model_name)
    attribute_relationships = parser.parse_model()
    return DataModelNodes(attribute_relationships_dict=attribute_relationships)


@pytest.fixture(name='dmjsonldp')
def fixture_dm_jsonld_parser():
    """Yields a DataModelJSONLDParser object for testing"""
    jsonld_parser = DataModelJSONLDParser()
    yield jsonld_parser
Expand All @@ -61,6 +92,13 @@ def DME(helpers, data_model_name='example.model.csv'):
DME = DataModelGraphExplorer(graph_data_model)
yield DME

@pytest.fixture(name='relationships')
def get_relationships(helpers):
    """Yields the list of keys of DataModelRelationships.relationships_dictionary"""
    relationship_keys = list(DataModelRelationships().relationships_dictionary.keys())
    yield relationship_keys

@pytest.fixture(name="dmr")
def fixture_dmr():
"""Yields a data model relationships object for testing"""
Expand Down Expand Up @@ -339,23 +377,275 @@ def test_sub_schema_graph(self):
return

class TestDataModelNodes:
def test_gather_nodes(self):
return
def test_gather_all_nodes(self):
return
def test_get_rel_node_dict_info(self):
return
def test_get_data_model_properties(self):
return
def test_get_entry_type(self):
return
def test_run_rel_functions(self):
return
def test_generate_node_dict(self):
return
def test_generate_node(self):
@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
def test_gather_nodes(self, helpers, data_model):
# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model)

# Parse Model
attr_rel_dictionary = data_model_parser.parse_model()

# Instantiate DataModelNodes
data_model_nodes = generate_data_model_nodes(helpers, data_model_name=data_model)

attr_info = ('Patient', attr_rel_dictionary['Patient'])
nodes = data_model_nodes.gather_nodes(attr_info=attr_info)

# Make sure there are no repeat nodes
assert len(nodes) == len(set(nodes))

# Make sure the nodes returned conform to expectations (values and order)
## The parsing records display names for relationships for CSV and labels for JSONLD, so the expectations are different between the two.
if DATA_MODEL_DICT[data_model]=='CSV':
expected_nodes = ['Patient', 'Patient ID', 'Sex', 'Year of Birth', 'Diagnosis', 'Component', 'DataType']
elif DATA_MODEL_DICT[data_model] == 'JSONLD':
expected_nodes = ['Patient', 'PatientID', 'Sex', 'YearofBirth', 'Diagnosis', 'Component', 'DataType']

assert nodes == expected_nodes

# Ensure order is tested.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit confused here. Here it seems like we are ensuring "Patient" does not become the last item in a list?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an additional, possibly unnecessary, check to ensure that the previous assertion tests the order and not just the contents. It is a bit of future-proofing.

reordered_nodes = nodes.copy()
reordered_nodes.remove('Patient')
reordered_nodes.append('Patient')
assert reordered_nodes != expected_nodes

@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could be re-written like:

@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
class TestDataModelNodes:

def test_gather_all_nodes(self, helpers, data_model):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand what you are trying to do here. Since gather_all_nodes calls the gather_nodes function, would it make sense to mock gather_nodes and then test whether gather_all_nodes returns the correct output?

# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model)

# Parse Model
attr_rel_dictionary = data_model_parser.parse_model()

# Instantiate DataModelNodes
data_model_nodes = generate_data_model_nodes(helpers, data_model_name=data_model)

all_nodes = data_model_nodes.gather_all_nodes(attr_rel_dict=attr_rel_dictionary)

# Make sure there are no repeat nodes
assert len(all_nodes) == len(set(all_nodes))

# Check that nodes from the first entry are recorded in order in all_nodes.
# Only check the first entry, because subsequent ones might not be in the same order as would be gathered with gather_nodes if they contained a node that was already recorded.
first_attribute = list(attr_rel_dictionary.keys())[0]
attr_info = (first_attribute, attr_rel_dictionary[first_attribute])
expected_starter_nodes = data_model_nodes.gather_nodes(attr_info=attr_info)
actual_starter_nodes = all_nodes[0:len(expected_starter_nodes)]

assert actual_starter_nodes == expected_starter_nodes

def test_get_rel_node_dict_info(self, helpers, relationships):
    """Check what get_rel_node_dict_info returns for every known relationship.

    Args:
        helpers: test helpers fixture, used to build the DataModelNodes object.
        relationships: relationship keys supplied by the `relationships` fixture.

    For each relationship with a truthy result, the result must be a
    (str, dict) pair whose dict contains a 'default' key.
    """
    # Instantiate DataModelNodes (the helper creates its own parser, so no
    # separate parser is needed here).
    data_model_nodes = generate_data_model_nodes(helpers, data_model_name='example.model.csv')

    for relationship in relationships:
        rel_dict_info = data_model_nodes.get_rel_node_dict_info(relationship)
        # A falsy result carries nothing to verify, so move on.
        if not rel_dict_info:
            continue
        rel_key, rel_node_dict = rel_dict_info
        assert isinstance(rel_key, str)
        assert isinstance(rel_node_dict, dict)
        assert 'default' in rel_node_dict

@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
def test_get_data_model_properties(self, helpers, data_model):
# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model)

# Parse Model
attr_rel_dictionary = data_model_parser.parse_model()

# Instantiate DataModelNodes
data_model_nodes = generate_data_model_nodes(helpers, data_model_name=data_model)

# Get properties in the data model
data_model_properties = data_model_nodes.get_data_model_properties(attr_rel_dictionary)

# In the current example model, there are no properties, would need to update this section if properties are added.
assert data_model_properties == []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is a good point. I created a ticket here to track: https://sagebionetworks.jira.com/jira/software/c/projects/FDS/issues/FDS-1226


# Update the attr_rel_dictionary to add a property, then see if its found.
# Get a random relationship key from the attr_rel_dictionary:
all_keys = list(attr_rel_dictionary.keys())
random_index = len(all_keys)-1
rel_key = all_keys[random.randint(0, random_index)]

# Modify the contents of that relationship
attr_rel_dictionary[rel_key]['Relationships']['Properties'] = ['TestProperty']

# Get properties in the modified data model
data_model_properties = data_model_nodes.get_data_model_properties(attr_rel_dictionary)

assert data_model_properties == ['TestProperty']

@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
def test_get_entry_type(self, helpers, data_model):

# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model)

# Parse Model
attr_rel_dictionary = data_model_parser.parse_model()

# Update the attr_rel_dictionary to add a property, then see if it is assigned the correct entry type.
# Get a random relationship key from the attr_rel_dictionary:
all_keys = list(attr_rel_dictionary.keys())
random_index = len(all_keys)-1
rel_key = all_keys[random.randint(0, random_index)]

# Modify the contents of that relationship
attr_rel_dictionary[rel_key]['Relationships']['Properties'] = ['TestProperty']

# Instantiate DataModelNodes
# Note: Get entry type uses self, so I will have to instantiate DataModelNodes outside of the generate_data_model_nodes function
data_model_nodes = DataModelNodes(attribute_relationships_dict=attr_rel_dictionary)

# In the example data model all attributes should be classes.
for attr in attr_rel_dictionary.keys():
entry_type = data_model_nodes.get_entry_type(attr)
assert entry_type == 'class'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: after adding example data dictionaries that have properties, we could change this part of the test.


# Check that the added property is properly loaded as a property
assert data_model_nodes.get_entry_type('TestProperty') == 'property'

@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
@pytest.mark.parametrize("rel_func", list(REL_FUNC_DICT.values()), ids=list(REL_FUNC_DICT.keys()))
@pytest.mark.parametrize("test_dn", list(TEST_DN_DICT.keys()), ids=list(TEST_DN_DICT.keys()))
@pytest.mark.parametrize("test_bool", ['True', 'False', True, False, 'kldjk'], ids=['True_str', 'False_str', 'True_bool', 'False_bool', 'Random_str'])
def test_run_rel_functions(self, helpers, data_model, rel_func, test_dn, test_bool):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have unit tests for get_attribute_display_name_from_label, parse_validation_rules, get_label_from_display_name, and convert_bool_to_str? If so, I think we could simply mock the responses from these four functions here and test run_rel_functions. What you are currently doing makes sense, but at the same time I feel this test becomes difficult to read because it essentially gathers four tests of four different functions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have tests for those, but the trick here is actually making sure the parameters are being passed properly through this function, which has at times felt a bit chaotic, so it's nice to call the functions this way.

# Call each relationship function to ensure that it is returning the desired result.
# Note all the called functions will also be tested in other unit tests.
# Instantiate Parser
data_model_parser = helpers.get_data_model_parser(data_model_name=data_model)

# Parse Model
attr_rel_dictionary = data_model_parser.parse_model()

# Instantiate DataModelNodes
data_model_nodes = generate_data_model_nodes(helpers, data_model_name=data_model)

# Run functions the same way they are called in run_rel_functions:
if rel_func == get_attribute_display_name_from_label:
expected_display_names = list(attr_rel_dictionary.keys())
returned_display_names = [data_model_nodes.run_rel_functions(
rel_func=get_attribute_display_name_from_label,
node_display_name=ndn,
attr_relationships=attr_rel_dictionary)
for ndn in expected_display_names]

assert expected_display_names == returned_display_names

elif rel_func == parse_validation_rules:
# Find attributes with validation rules
# Gather Validation Rules
vrs = []
for k, v in attr_rel_dictionary.items():
if 'Validation Rules' in v['Relationships'].keys():
vrs.append(v['Relationships']['Validation Rules'])
parsed_vrs= []
for attr in attr_rel_dictionary.keys():
attr_relationships = attr_rel_dictionary[attr]['Relationships']
if 'Validation Rules' in attr_relationships:
parsed_vrs.append(data_model_nodes.run_rel_functions(
rel_func=parse_validation_rules,
attr_relationships=attr_relationships,
csv_header='Validation Rules'))

assert len(vrs) == len(parsed_vrs)
if DATA_MODEL_DICT[data_model]=='CSV':
assert vrs != parsed_vrs
elif DATA_MODEL_DICT[data_model]=='JSONLD':
# JSONLDs already contain parsed validation rules, so the raw vrs will match the parsed_vrs
assert vrs == parsed_vrs

# For all validation rules where there are multiple rules, make sure they have been split as expected.
for i, pvr in enumerate(parsed_vrs):
delim_count = vrs[i][0].count('::')
if delim_count:
assert len(pvr) == delim_count+1

elif rel_func == get_label_from_display_name:
# For a limited set check label is returned as expected.
for entry_type, expected_value in TEST_DN_DICT[test_dn].items():
actual_value = data_model_nodes.run_rel_functions(
rel_func=get_label_from_display_name,
node_display_name=test_dn,
entry_type=entry_type,
)
assert actual_value == expected_value
elif rel_func == convert_bool_to_str:
# return nothing if random string provided.
csv_header='Required'
attr_relationships = {csv_header:test_bool}
actual_conversion = data_model_nodes.run_rel_functions(
rel_func=convert_bool_to_str,
csv_header=csv_header,
attr_relationships=attr_relationships,
)
if 'true' in str(test_bool).lower():
assert actual_conversion==True
elif 'false' in str(test_bool).lower():
assert actual_conversion==False
else:
assert actual_conversion==None
else:
# If the function passed is not currently supported, should hit an error.
try:
data_model_nodes.run_rel_functions(rel_func=test_fake_func)
convert_worked = False
except:
convert_worked = True
assert convert_worked==True
return

@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
@pytest.mark.parametrize("node_display_name", list(NODE_DISPLAY_NAME_DICT.keys()), ids=[str(v) for v in NODE_DISPLAY_NAME_DICT.values()])
def test_generate_node_dict(self, helpers, data_model, node_display_name):
    """Check the 'required' entry of the node dict generated for a display name.

    Args:
        helpers: test helpers fixture.
        data_model: data model file name (a key of DATA_MODEL_DICT).
        node_display_name: display name to generate a node dict for
            (a key of NODE_DISPLAY_NAME_DICT).
    """
    # Parse the model to get the attribute relationships dictionary.
    parser = helpers.get_data_model_parser(data_model_name=data_model)
    attr_rel_dictionary = parser.parse_model()

    # Build the DataModelNodes object under test.
    data_model_nodes = generate_data_model_nodes(helpers, data_model_name=data_model)

    node_dict = data_model_nodes.generate_node_dict(
        attr_rel_dict=attr_rel_dictionary,
        node_display_name=node_display_name,
    )

    # Nodes flagged True in NODE_DISPLAY_NAME_DICT must come back as required.
    expect_required = NODE_DISPLAY_NAME_DICT[node_display_name]
    if expect_required:
        assert node_dict['required'] == True
        return

    # Otherwise 'required' should equal False; any other value is only
    # tolerated for the JSONLD model, never for CSV.
    if not node_dict['required'] == False:
        assert DATA_MODEL_DICT[data_model] == 'JSONLD'

@pytest.mark.parametrize("data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()))
def test_generate_node(self, helpers, data_model):
# Test adding a dummy node
node_dict = {'label': 'test_label'}

path_to_data_model = helpers.get_data_path(data_model)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually probably don't need path_to_data_model here.


# Get Graph
graph_data_model = generate_graph_data_model(helpers, data_model_name=path_to_data_model)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just pass data_model here.


# Instantiate DataModelNodes
data_model_nodes = generate_data_model_nodes(helpers, data_model_name=data_model)

# Assert the test node is not already in the graph
assert False == (node_dict['label'] in graph_data_model.nodes)

# Add test node
data_model_nodes.generate_node(graph_data_model, node_dict)

# Check that the test node has been added
assert True == (node_dict['label'] in graph_data_model.nodes)

class TestDataModelEdges:
def test_generate_edge(self,helpers):
return
Expand Down