From 2d6ae20ab56dab28c2d31f1958da9e9bd4d02710 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Mon, 23 Oct 2023 20:39:48 -0700 Subject: [PATCH 01/13] update data model edges to update a list instead of the graph directly --- schematic/schemas/data_model_edges.py | 37 ++++++++++++++------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/schematic/schemas/data_model_edges.py b/schematic/schemas/data_model_edges.py index 5acbb5e7b..6cec66c85 100644 --- a/schematic/schemas/data_model_edges.py +++ b/schematic/schemas/data_model_edges.py @@ -10,12 +10,12 @@ def __init__(self): def generate_edge( self, - G: nx.MultiDiGraph, node: str, all_node_dict: dict, attr_rel_dict: dict, edge_relationships: dict, - ) -> nx.MultiDiGraph: + edge_list:list, + ) -> list(tuple(str, str, dict{str:str, str:int})): """Generate an edge between a target node and relevant other nodes the data model. In short, does this current node belong to a recorded relationship in the attribute, relationshps dictionary. Go through each attribute and relationship to find where the node may be. Args: G, nx.MultiDiGraph: networkx graph representation of the data model, that is in the process of being fully built. At this point, all the nodes would have been added, and edges are being added per target node. @@ -28,9 +28,11 @@ def generate_edge( Relationships: { CSV Header: Value}}} edge_relationships: dict, rel_key: csv_header if the key represents a value relationship. - + edge_list: list(tuple), list of tuples describing the edges and the edge attributes, organized as (node_1, node_2, {key:edge_relationship_key, weight:int}) + At this point, the edge list will be in the process of being built. Adding edges from list so they will be added properly to the graph without being overwritten in the loop, and passing the Graph around more. Returns: - G, nx.MultiDiGraph: networkx graph representation of the data model, that has had new edges attached. + edge_list: list(tuple), list of tuples describing the edges and the edge attributes, organized as (node_1, node_2, {key:edge_relationship_key, weight:int}) + At this point, the edge list will have additional edges added related to the current node. """ # For each attribute in the model. for attribute_display_name, relationship in attr_rel_dict.items(): @@ -65,26 +67,25 @@ def generate_edge( # Add edges, in a manner that preserves directionality # TODO: rewrite to use edge_dir if rel_key in ["subClassOf", "domainIncludes"]: - G.add_edge( + edge_list.append(( all_node_dict[node]["label"], all_node_dict[attribute_display_name]["label"], - key=edge_key, - weight=weight, - ) + {'key':edge_key, + 'weight':weight,}) + ) else: - G.add_edge( + edge_list.append(( all_node_dict[attribute_display_name]["label"], all_node_dict[node]["label"], - key=edge_key, - weight=weight, - ) + {'key':edge_key, + 'weight':weight},) + ) # Add add rangeIncludes/valid value relationships in reverse as well, making the attribute the parent of the valid value. if rel_key == "rangeIncludes": - G.add_edge( + edge_list.append(( all_node_dict[attribute_display_name]["label"], all_node_dict[node]["label"], - key="parentOf", - weight=weight, - ) - - return G + {'key':"parentOf", + 'weight':weight},) + ) + return edge_list From 85ae4fad6733f6dd18f2c4f1ee443eccc84be6d3 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Mon, 23 Oct 2023 22:35:28 -0700 Subject: [PATCH 02/13] add timing to cycle finding so that it will move on if too many cycles are found --- schematic/schemas/data_model_validator.py | 39 ++++++++++++++++------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/schematic/schemas/data_model_validator.py b/schematic/schemas/data_model_validator.py index 40911e6a9..e3d626882 100644 --- a/schematic/schemas/data_model_validator.py +++ b/schematic/schemas/data_model_validator.py @@ -1,8 +1,11 @@ +import logging +import multiprocessing import networkx as nx +import time from typing import Any, Dict, Optional, Text, List, Tuple from schematic.schemas.data_model_relationships import DataModelRelationships - +logger = logging.getLogger(__name__) class DataModelValidator: """ @@ -69,6 +72,14 @@ def check_graph_has_required_node_fields(self) -> List[str]: ) return error + def run_cycles(self, graph): + cycles = nx.simple_cycles(self.graph) + if cycles: + for cycle in cycles: + logger.warning( + f"Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: {cycle[0]} and {cycle[1]}, please remove this loop from your model and submit again." + ) + def check_is_dag(self) -> List[str]: """Check that generated graph is a directed acyclic graph Returns: @@ -76,17 +87,23 @@ def check_is_dag(self) -> List[str]: """ error = [] if not nx.is_directed_acyclic_graph(self.graph): - # Attempt to find any cycles: - cycles = nx.simple_cycles(self.graph) - if cycles: - for cycle in cycles: - error.append( - f"Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: {cycle[0]} and {cycle[1]}, please remove this loop from your model and submit again." - ) - else: - error.append( - f"Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we could not locate the sorce of the error, please inspect your model." + cycles = multiprocessing.Process(target=self.run_cycles, name="Get Cycles", args=(self.graph,)) + cycles.start() + + # Give up to 5 seconds to find cycles, if not exit and issue standard error + time.sleep(5) + + # If thread is active + if cycles.is_alive(): + # Terminate foo + cycles.terminate() + # Cleanup + cycles.join() + + error.append( + f"Schematic requires models be a directed acyclic graph (DAG). Please inspect your model." ) + return error def check_blacklisted_characters(self) -> List[str]: From f57b31aa818e1ca81b2340a03b44bb62185aaf43 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Mon, 23 Oct 2023 22:36:09 -0700 Subject: [PATCH 03/13] fix typing error in data_model_edges --- schematic/schemas/data_model_edges.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/schemas/data_model_edges.py b/schematic/schemas/data_model_edges.py index 6cec66c85..7abbc26a8 100644 --- a/schematic/schemas/data_model_edges.py +++ b/schematic/schemas/data_model_edges.py @@ -15,7 +15,7 @@ def generate_edge( attr_rel_dict: dict, edge_relationships: dict, edge_list:list, - ) -> list(tuple(str, str, dict{str:str, str:int})): + ) -> list[tuple[str, str, dict[str:str, str:int]]]: """Generate an edge between a target node and relevant other nodes the data model. In short, does this current node belong to a recorded relationship in the attribute, relationshps dictionary. Go through each attribute and relationship to find where the node may be. Args: G, nx.MultiDiGraph: networkx graph representation of the data model, that is in the process of being fully built. At this point, all the nodes would have been added, and edges are being added per target node. From 9b77be90a7f71e6a12e4ed03db2243404189d7e2 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Mon, 23 Oct 2023 22:42:04 -0700 Subject: [PATCH 04/13] fix issue with get_ordered_entry --- schematic/schemas/data_model_graph.py | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/schematic/schemas/data_model_graph.py b/schematic/schemas/data_model_graph.py index e63cd4137..f5946d35c 100644 --- a/schematic/schemas/data_model_graph.py +++ b/schematic/schemas/data_model_graph.py @@ -1,3 +1,4 @@ +from copy import deepcopy import graphviz import logging from typing import Any, Dict, Optional, Text @@ -94,16 +95,22 @@ def generate_data_model_graph(self) -> nx.MultiDiGraph: # Generate node and attach information (attributes) to each node G = self.dmn.generate_node(G, node_dict) + edge_list = [] ## Connect nodes via edges for node in all_nodes: # Generate edges - G = self.dme.generate_edge( - G, + edge_list_2 = self.dme.generate_edge( node, all_node_dict, self.attribute_relationships_dict, edge_relationships, + edge_list, ) + edge_list = edge_list_2.copy() + + # Add edges to the Graph + for node_1, node_2, edge_dict in edge_list: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) return G @@ -357,7 +364,24 @@ def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]: ) edge_key = self.rel_dict[key]["edge_key"] - if self.rel_dict[key]["jsonld_direction"] == "out": + + # Get edge keys for domain includes and subclassOf + domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key'] + subclassOf_edge_key = self.rel_dict['subClassOf']['edge_key'] + + # Order lists when they are part of subclassOf or domainIncludes + if edge_key in [domainIncludes_edge_key, subclassOf_edge_key]: + original_edge_weights_dict = { + attached_node: self.graph[source_node][attached_node][edge_key][ + "weight" + ] + for source_node, attached_node in self.graph.out_edges( + source_node_label + ) + if edge_key in self.graph[source_node][attached_node] + } + # Handle out edges + elif self.rel_dict[key]["jsonld_direction"] == "out": # use outedges original_edge_weights_dict = { @@ -369,6 +393,7 @@ def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]: ) if edge_key in self.graph[source_node][attached_node] } + # Handle in edges else: # use inedges original_edge_weights_dict = { From 459120ad1637aa26570d8650b6ea11d92371c434 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Mon, 23 Oct 2023 23:13:28 -0700 Subject: [PATCH 05/13] WIP: fix domain includes handling, contains subclass off issue --- schematic/schemas/data_model_jsonld.py | 60 +++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/schematic/schemas/data_model_jsonld.py b/schematic/schemas/data_model_jsonld.py index d4e9c98c8..5363e7ffc 100644 --- a/schematic/schemas/data_model_jsonld.py +++ b/schematic/schemas/data_model_jsonld.py @@ -134,6 +134,28 @@ def get_edges_associated_with_node( node_edges.extend(list(self.graph.out_edges(node, data=True))) return node_edges + def get_edges_associated_with_property_nodes( + self, node:str + ) -> List[tuple[str, str, dict[str, int]]]: + """Get edges associated with property nodes to make sure we add that relationship. + Args: + node, str: Label of node property in the graph to look for assiciated edges + Returns: + node_edges, list: List of Tuples of edges associated with the given node, tuple contains the two nodes, plus the weight dict associated with the edge connection. + """ + # Get edge keys for domainIncludes and subclassOf + domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key'] + node_edges = [] + # Get dict of edges for the current property node + node_edges_dict = self.graph[node] + for node_2, edge_dict in node_edges_dict.items(): + # Look through relationships in the edge dictionary + for edge_key in edge_dict: + # If the edge is a property or subclass then add the edges to the list + if edge_key in [domainIncludes_edge_key]: + node_edges.append((node, node_2, edge_dict[edge_key])) + return node_edges + def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): """ Args: @@ -145,6 +167,12 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): # Get all edges associated with the current node node_edges = self.get_edges_associated_with_node(node=node) + + # For properties look for reverse relationships too + if node in self.dmge.find_properties(): + property_node_edges = self.get_edges_associated_with_property_nodes(node=node) + node_edges.extend(property_node_edges) + # Get node pairs and weights for each edge for node_1, node_2, weight in node_edges: # Retrieve the relationship(s) and related info between the two nodes @@ -153,6 +181,7 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): # Get the relationship edge key edge_key = rel_vals["edge_key"] + # Check if edge_key is even one of the relationships for this node pair. if edge_key in node_edge_relationships: # for each relationship between the given nodes @@ -160,7 +189,10 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): # If the relationship defined and edge_key if relationship == edge_key: # TODO: rewrite to use edge_dir - if edge_key in ["domainIncludes", "parentOf"]: + + domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key'] + subclassOf_edge_key = self.rel_dict['subClassOf']['edge_key'] + if edge_key in [domainIncludes_edge_key, subclassOf_edge_key]: if node_2 == node: # Make sure the key is in the template (differs between properties and classes) if rel_vals["jsonld_key"] in template.keys(): @@ -178,6 +210,23 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): ) else: template[rel_vals["jsonld_key"]] == node_1 + elif node_1 == node: + # Make sure the key is in the template (differs between properties and classes) + if rel_vals["jsonld_key"] in template.keys(): + node_2_id = {"@id": "bts:" + node_2} + # TODO Move this to a helper function to clear up. + if ( + isinstance( + template[rel_vals["jsonld_key"]], list + ) + and node_2_id + not in template[rel_vals["jsonld_key"]] + ): + template[rel_vals["jsonld_key"]].append( + node_2_id + ) + else: + template[rel_vals["jsonld_key"]] == node_2 else: if node_1 == node: # Make sure the key is in the template (differs between properties and classes) @@ -238,6 +287,7 @@ def fill_entry_template(self, template: dict, node: str) -> dict: template = self.add_edge_rels_to_template( template=template, rel_vals=rel_vals, node=node ) + # Fill in node value information else: template = self.add_node_info_to_template( @@ -249,11 +299,11 @@ def fill_entry_template(self, template: dict, node: str) -> dict: template=template, data_model_relationships=data_model_relationships, ) + # Reorder lists based on weights: template = self.reorder_template_entries( template=template, ) - # Add contexts to certain values template = self.add_contexts_to_entries( template=template, @@ -364,6 +414,12 @@ def reorder_template_entries(self, template: dict) -> dict: sorted_edges = self.dmge.get_ordered_entry( key=key, source_node_label=template_label ) + try: + len(entry) == len(sorted_edges) + except: + breakpoint() + #raise ValueError("There is an error with sorting values in the JSONLD, please issue a bug report.") + edge_weights_dict = {edge: i for i, edge in enumerate(sorted_edges)} ordered_edges = [0] * len(edge_weights_dict.keys()) for edge, normalized_weight in edge_weights_dict.items(): From 44cc0e88c68959f43e1c5f6a1222002db5887e77 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Tue, 24 Oct 2023 06:06:02 -0700 Subject: [PATCH 06/13] change jsonld direction for domainIncludes and update jsonld to remove subclassOf specific handling --- schematic/schemas/data_model_graph.py | 17 +---------------- schematic/schemas/data_model_relationships.py | 2 +- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/schematic/schemas/data_model_graph.py b/schematic/schemas/data_model_graph.py index f5946d35c..917d1eb71 100644 --- a/schematic/schemas/data_model_graph.py +++ b/schematic/schemas/data_model_graph.py @@ -365,23 +365,8 @@ def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]: edge_key = self.rel_dict[key]["edge_key"] - # Get edge keys for domain includes and subclassOf - domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key'] - subclassOf_edge_key = self.rel_dict['subClassOf']['edge_key'] - - # Order lists when they are part of subclassOf or domainIncludes - if edge_key in [domainIncludes_edge_key, subclassOf_edge_key]: - original_edge_weights_dict = { - attached_node: self.graph[source_node][attached_node][edge_key][ - "weight" - ] - for source_node, attached_node in self.graph.out_edges( - source_node_label - ) - if edge_key in self.graph[source_node][attached_node] - } # Handle out edges - elif self.rel_dict[key]["jsonld_direction"] == "out": + if self.rel_dict[key]["jsonld_direction"] == "out": # use outedges original_edge_weights_dict = { diff --git a/schematic/schemas/data_model_relationships.py b/schematic/schemas/data_model_relationships.py index 6cf85d899..9be7952dc 100644 --- a/schematic/schemas/data_model_relationships.py +++ b/schematic/schemas/data_model_relationships.py @@ -153,7 +153,7 @@ def define_data_model_relationships(self) -> Dict: "jsonld_key": "schema:domainIncludes", "csv_header": "Properties", "edge_key": "domainValue", - "jsonld_direction": "in", + "jsonld_direction": "out", "edge_dir": "in", "type": list, "edge_rel": True, From a7def06856daaf440b12fa00a1966c57570ec2c9 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Tue, 24 Oct 2023 07:00:10 -0700 Subject: [PATCH 07/13] handle subclassOf and domainIncludes separately when handling edge relationships --- schematic/schemas/data_model_jsonld.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/schematic/schemas/data_model_jsonld.py b/schematic/schemas/data_model_jsonld.py index 5363e7ffc..41b5a05e3 100644 --- a/schematic/schemas/data_model_jsonld.py +++ b/schematic/schemas/data_model_jsonld.py @@ -181,7 +181,6 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): # Get the relationship edge key edge_key = rel_vals["edge_key"] - # Check if edge_key is even one of the relationships for this node pair. if edge_key in node_edge_relationships: # for each relationship between the given nodes @@ -189,10 +188,9 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): # If the relationship defined and edge_key if relationship == edge_key: # TODO: rewrite to use edge_dir - domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key'] subclassOf_edge_key = self.rel_dict['subClassOf']['edge_key'] - if edge_key in [domainIncludes_edge_key, subclassOf_edge_key]: + if edge_key in [subclassOf_edge_key]: if node_2 == node: # Make sure the key is in the template (differs between properties and classes) if rel_vals["jsonld_key"] in template.keys(): @@ -210,7 +208,8 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): ) else: template[rel_vals["jsonld_key"]] == node_1 - elif node_1 == node: + elif edge_key in [domainIncludes_edge_key]: + if node_1 == node: # Make sure the key is in the template (differs between properties and classes) if rel_vals["jsonld_key"] in template.keys(): node_2_id = {"@id": "bts:" + node_2} @@ -414,9 +413,7 @@ def reorder_template_entries(self, template: dict) -> dict: sorted_edges = self.dmge.get_ordered_entry( key=key, source_node_label=template_label ) - try: - len(entry) == len(sorted_edges) - except: + if not len(entry) == len(sorted_edges): breakpoint() #raise ValueError("There is an error with sorting values in the JSONLD, please issue a bug report.") From e2f540e30fc84bae3e535d6488236cc910cd9387 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Tue, 24 Oct 2023 13:54:00 -0700 Subject: [PATCH 08/13] update JSONLD parsing to better handle properties --- schematic/schemas/data_model_parser.py | 51 +++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py index 74068c180..c7423eb67 100644 --- a/schematic/schemas/data_model_parser.py +++ b/schematic/schemas/data_model_parser.py @@ -274,6 +274,21 @@ def parse_entry(self, rel_entry: any, id_jsonld_key: str) -> Any: parsed_rel_entry = rel_entry return parsed_rel_entry + def get_display_name_from_label(self, label, model_jsonld): + jsonld_keys_to_extract = ["label", "displayName"] + label_jsonld_key, dn_jsonld_key = [ + self.rel_dict[key]["jsonld_key"] for key in jsonld_keys_to_extract + ] + for entry in model_jsonld: + # Get the attr key for the dictionary + if dn_jsonld_key in entry: + # The attr_key is the entry display name if one was recorded + attr_key = entry[dn_jsonld_key] + else: + # If not we wil use the get the label. + attr_key = entry[label_jsonld_key] + return attr_key + def gather_jsonld_attributes_relationships(self, model_jsonld: List[dict]) -> Dict: """ Args: @@ -327,15 +342,41 @@ def gather_jsonld_attributes_relationships(self, model_jsonld: List[dict]) -> Di ): # Retrieve entry value associated with the given relationship rel_entry = entry[rel_vals["jsonld_key"]] - # If there is an entry parset it by type and add to the attr:relationships dictionary. + # If there is an entry parse it by type and add to the attr:relationships dictionary. if rel_entry: parsed_rel_entry = self.parse_entry( rel_entry=rel_entry, id_jsonld_key=id_jsonld_key ) - # Add relationships for each attribute and relationship to the dictionary - attr_rel_dictionary[attr_key]["Relationships"].update( - {self.rel_dict[rel_key]["csv_header"]: parsed_rel_entry} - ) + rel_csv_header = self.rel_dict[rel_key]["csv_header"] + if rel_key == 'domainIncludes': + # In the JSONLD the domain includes field contains the ids of attributes that the current attribute is the property of. + # Because of this we need to handle these values differently. + # We will get the values in the field (parsed_val), then add the current attribute as to the property key in the attr_rel_dictionary[property_attr_key]. + for parsed_val in parsed_rel_entry: + attr_in_dict = False + property_attr_key='' + # Check if the parsed value is already a part of the attr_rel_dictionary + for attr_dn, rels in attr_rel_dictionary.items(): + if parsed_val == rels["Relationships"].get('label'): + property_attr_key = attr_dn + attr_in_dict = True + # If it is part of the dictionary update add current attribute as a property of the parsed value + if attr_in_dict == True: + if not rel_csv_header in attr_rel_dictionary[property_attr_key]["Relationships"]: + attr_rel_dictionary[property_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]}) + else: + attr_rel_dictionary[property_attr_key]["Relationships"][rel_csv_header].append(entry[label_jsonld_key]) + # If the parsed_val is not already recorded in the dictionary, add it + elif attr_in_dict == False: + # Get the display name for the parsed value + property_attr_key = self.get_display_name_from_label(parsed_val, model_jsonld) + + attr_rel_dictionary.update(attr_dict_template(property_attr_key)) + attr_rel_dictionary[property_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]}) + else: + attr_rel_dictionary[attr_key]["Relationships"].update( + {rel_csv_header: parsed_rel_entry} + ) elif ( rel_vals["jsonld_key"] in entry.keys() and not rel_vals["csv_header"] From 7181728d6c664b3b7a4dce52f93f192e44cc34aa Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Tue, 24 Oct 2023 15:00:25 -0700 Subject: [PATCH 09/13] update test_skip_edge to work with new edge_list handling --- tests/test_schemas.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index fbf8faf47..532e00b1a 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -900,13 +900,17 @@ def test_skip_edge(self, helpers, DMR, data_model_edges): # Check the edges in the graph, there should be none before_edges = deepcopy(G.edges) + edge_list = [] # Generate an edge in the graph with one node and a subset of the parsed data model # We're attempting to add an edge for a node that is the only one in the graph, # so `generate_edge` should skip adding edges and return the same graph - G = data_model_edges.generate_edge( - G, node, node_dict, {node: parsed_data_model[node]}, edge_relationships + edge_list_2 = data_model_edges.generate_edge( + node, node_dict, {node: parsed_data_model[node]}, edge_relationships, edge_list, ) + for node_1, node_2, edge_dict in edge_list_2: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) + breakpoint() # Assert that no edges were added and that the current graph edges are the same as before the call to `generate_edge` assert before_edges == G.edges From c9737b37b2d91abfe1ddaa8dbb031148d2deb121 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Wed, 25 Oct 2023 09:27:25 -0700 Subject: [PATCH 10/13] add parentOf in the JSONLD parsing update, also change some naming to be clearer --- schematic/schemas/data_model_parser.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py index c7423eb67..f1684da1c 100644 --- a/schematic/schemas/data_model_parser.py +++ b/schematic/schemas/data_model_parser.py @@ -348,31 +348,32 @@ def gather_jsonld_attributes_relationships(self, model_jsonld: List[dict]) -> Di rel_entry=rel_entry, id_jsonld_key=id_jsonld_key ) rel_csv_header = self.rel_dict[rel_key]["csv_header"] - if rel_key == 'domainIncludes': - # In the JSONLD the domain includes field contains the ids of attributes that the current attribute is the property of. + if rel_key == 'domainIncludes' or rel_key == 'parentOf': + # In the JSONLD the domain includes field contains the ids of attributes that the current attribute is the property/parent of. # Because of this we need to handle these values differently. - # We will get the values in the field (parsed_val), then add the current attribute as to the property key in the attr_rel_dictionary[property_attr_key]. + # We will get the values in the field (parsed_val), then add the current attribute as to the property key in the attr_rel_dictionary[p_attr_key]. for parsed_val in parsed_rel_entry: attr_in_dict = False - property_attr_key='' + #Get propert/parent key (displayName) + p_attr_key='' # Check if the parsed value is already a part of the attr_rel_dictionary for attr_dn, rels in attr_rel_dictionary.items(): if parsed_val == rels["Relationships"].get('label'): - property_attr_key = attr_dn + p_attr_key = attr_dn attr_in_dict = True # If it is part of the dictionary update add current attribute as a property of the parsed value if attr_in_dict == True: - if not rel_csv_header in attr_rel_dictionary[property_attr_key]["Relationships"]: - attr_rel_dictionary[property_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]}) + if not rel_csv_header in attr_rel_dictionary[p_attr_key]["Relationships"]: + attr_rel_dictionary[p_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]}) else: - attr_rel_dictionary[property_attr_key]["Relationships"][rel_csv_header].append(entry[label_jsonld_key]) + attr_rel_dictionary[p_attr_key]["Relationships"][rel_csv_header].append(entry[label_jsonld_key]) # If the parsed_val is not already recorded in the dictionary, add it elif attr_in_dict == False: # Get the display name for the parsed value - property_attr_key = self.get_display_name_from_label(parsed_val, model_jsonld) + p_attr_key = self.get_display_name_from_label(parsed_val, model_jsonld) - attr_rel_dictionary.update(attr_dict_template(property_attr_key)) - attr_rel_dictionary[property_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]}) + attr_rel_dictionary.update(attr_dict_template(p_attr_key)) + attr_rel_dictionary[p_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]}) else: attr_rel_dictionary[attr_key]["Relationships"].update( {rel_csv_header: parsed_rel_entry} From 2cd4f85bd1a8112125768735422b53ab5506ec1c Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Wed, 25 Oct 2023 09:28:07 -0700 Subject: [PATCH 11/13] update graph tests to conform to new list expectations --- tests/test_schemas.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 532e00b1a..7436712d1 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -910,7 +910,7 @@ def test_skip_edge(self, helpers, DMR, data_model_edges): for node_1, node_2, edge_dict in edge_list_2: G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) - breakpoint() + # Assert that no edges were added and that the current graph edges are the same as before the call to `generate_edge` assert before_edges == G.edges @@ -957,11 +957,16 @@ def test_generate_edge( # Check the edges in the graph, there should be none before_edges = deepcopy(G.edges) + edge_list = [] + # Generate edges for whichever node we are testing - G = data_model_edges.generate_edge( - G, node_to_add, all_node_dict, parsed_data_model, edge_relationships + edge_list_2 = data_model_edges.generate_edge( + node_to_add, all_node_dict, parsed_data_model, edge_relationships, edge_list, ) + for node_1, node_2, edge_dict in edge_list_2: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) + # Assert that the current edges are different from the edges of the graph before assert G.edges > before_edges @@ -1018,11 +1023,16 @@ def test_generate_weights( # Check the edges in the graph, there should be none before_edges = deepcopy(G.edges) + edge_list = [] + # Generate edges for whichever node we are testing - G = data_model_edges.generate_edge( - G, node_to_add, all_node_dict, parsed_data_model, edge_relationships + edge_list_2 = data_model_edges.generate_edge( + node_to_add, all_node_dict, parsed_data_model, edge_relationships, edge_list, ) + for node_1, node_2, edge_dict in edge_list_2: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) + # Assert that the current edges are different from the edges of the graph before assert G.edges > before_edges From a62530ec22646fb8bdd01edc3e754fb7bdd96073 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Wed, 25 Oct 2023 09:28:59 -0700 Subject: [PATCH 12/13] update validation tests to use dmge vs DME --- tests/test_validation.py | 105 +++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 60 deletions(-) diff --git a/tests/test_validation.py b/tests/test_validation.py index 923669f84..1b447190d 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -20,25 +20,10 @@ logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) -@pytest.fixture -def DME(helpers): - - inputModelLocation = helpers.get_data_path('example.model.jsonld') - #sg = SchemaGenerator(inputModelLocation) - data_model_parser = DataModelParser(path_to_data_model = inputModelLocation) - #Parse Model - parsed_data_model = data_model_parser.parse_model() - - # Instantiate DataModelGraph - data_model_grapher = DataModelGraph(parsed_data_model) - - # Generate graph - graph_data_model = data_model_grapher.generate_data_model_graph() - - # Instantiate DataModelGraphExplorer - DME = DataModelGraphExplorer(graph_data_model) - - yield DME +@pytest.fixture(name="dmge") +def DMGE(helpers): + dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld") + yield dmge @pytest.fixture def metadataModel(helpers): @@ -73,7 +58,7 @@ def test_valid_manifest(self,helpers,metadataModel): assert warnings == [] - def test_invalid_manifest(self,helpers, DME,metadataModel): + def test_invalid_manifest(self,helpers, dmge,metadataModel): manifestPath = helpers.get_data_path("mock_manifests/Invalid_Test_Manifest.csv") rootNode = 'MockComponent' @@ -88,7 +73,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): row_num = '3', attribute_name = 'Check Num', invalid_entry = 'c', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -96,7 +81,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): row_num = '3', attribute_name = 'Check Int', invalid_entry = '5.63', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -104,7 +89,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): row_num = '3', attribute_name = 'Check String', invalid_entry = '94', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_list_error( @@ -114,7 +99,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): attribute_name = 'Check List', list_error = "not_comma_delimited", invalid_entry = 'invalid list values', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_list_error( @@ -124,7 +109,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): attribute_name = 'Check Regex List', list_error = "not_comma_delimited", invalid_entry = 'ab cd ef', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -134,7 +119,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): attribute_name = 'Check Regex Format', module_to_call = 'match', invalid_entry = 'm', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -144,7 +129,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): attribute_name = 'Check Regex Single', module_to_call = 'search', invalid_entry = 'q', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -154,7 +139,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): attribute_name = 'Check Regex Integer', module_to_call = 'search', invalid_entry = '5.4', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_url_error( @@ -165,14 +150,14 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): attribute_name = 'Check URL', argument = None, invalid_entry = 'http://googlef.com/', - DME = DME, + dmge = dmge, )[0] in errors date_err = GenerateError.generate_content_error( val_rule = 'date', attribute_name = 'Check Date', - DME = DME, + dmge = dmge, row_num = ['2','3','4'], error_val = ['84-43-094', '32-984', 'notADate'], )[0] @@ -182,7 +167,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): assert GenerateError.generate_content_error( val_rule = 'unique error', attribute_name = 'Check Unique', - DME = DME, + dmge = dmge, row_num = ['2','3','4'], error_val = ['str1'], )[0] in errors @@ -190,7 +175,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): assert GenerateError.generate_content_error( val_rule = 'inRange 50 100 error', attribute_name = 'Check Range', - DME = DME, + dmge = dmge, row_num = ['3'], error_val = ['30'], )[0] in errors @@ -199,13 +184,13 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): assert GenerateError.generate_content_error( val_rule = 'recommended', attribute_name = 'Check Recommended', - DME = DME, + dmge = dmge, )[1] in warnings assert GenerateError.generate_content_error( val_rule = 'protectAges', attribute_name = 'Check Ages', - DME = DME, + dmge = dmge, row_num = ['2','3'], error_val = ['6549','32851'], )[1] in warnings @@ -216,7 +201,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): attribute_name='Check Match at Least', invalid_entry = ['7163'], missing_manifest_ID = ['syn27600110', 'syn29381803'], - DME = DME, + dmge = dmge, )[1] in warnings assert GenerateError.generate_cross_warning( @@ -224,7 +209,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): row_num = ['3'], attribute_name = 'Check Match at Least values', invalid_entry = ['51100'], - DME = DME, + dmge = dmge, )[1] in warnings assert \ @@ -232,14 +217,14 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', matching_manifests = ['syn29862078', 'syn27648165'], - DME = DME, + dmge = dmge, )[1] in warnings \ or \ GenerateError.generate_cross_warning( val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', matching_manifests = ['syn29862066', 'syn27648165'], - DME = DME, + dmge = dmge, )[1] in warnings @@ -248,7 +233,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): row_num = ['2', '3', '4'], attribute_name='Check Match Exactly values', invalid_entry = ['71738', '98085', '210065'], - DME = DME, + dmge = dmge, )[1] warning_in_list = [cross_warning[1] in warning for warning in warnings] assert any(warning_in_list) @@ -256,7 +241,7 @@ def test_invalid_manifest(self,helpers, DME,metadataModel): - def test_in_house_validation(self,helpers,DME,metadataModel): + def test_in_house_validation(self,helpers,dmge,metadataModel): manifestPath = helpers.get_data_path("mock_manifests/Invalid_Test_Manifest.csv") rootNode = 'MockComponent' @@ -272,7 +257,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): row_num = '3', attribute_name = 'Check Num', invalid_entry = 'c', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -280,7 +265,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): row_num = '3', attribute_name = 'Check Int', invalid_entry = '5.63', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -288,7 +273,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): row_num = '3', attribute_name = 'Check String', invalid_entry = '94', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -296,7 +281,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): row_num = '3', attribute_name = 'Check NA', invalid_entry = '9.5', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_list_error( @@ -306,7 +291,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): attribute_name = 'Check List', list_error = "not_comma_delimited", invalid_entry = 'invalid list values', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_list_error( @@ -316,7 +301,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): attribute_name = 'Check Regex List', list_error = "not_comma_delimited", invalid_entry = 'ab cd ef', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -326,7 +311,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): attribute_name = 'Check Regex Single', module_to_call = 'search', invalid_entry = 'q', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -336,7 +321,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): attribute_name = 'Check Regex Format', module_to_call = 'match', invalid_entry = 'm', - DME = DME, + dmge = dmge, )[0] in errors assert GenerateError.generate_url_error( @@ -347,7 +332,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): attribute_name = 'Check URL', argument = None, invalid_entry = 'http://googlef.com/', - DME = DME, + dmge = dmge, )[0] in errors @@ -358,7 +343,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): attribute_name='Check Match at Least', invalid_entry = ['7163'], missing_manifest_ID = ['syn27600110', 'syn29381803'], - DME = DME, + dmge = dmge, )[1] in warnings assert GenerateError.generate_cross_warning( @@ -366,7 +351,7 @@ def test_in_house_validation(self,helpers,DME,metadataModel): row_num = ['3'], attribute_name = 'Check Match at Least values', invalid_entry = ['51100'], - DME = DME, + dmge = dmge, )[1] in warnings assert \ @@ -374,14 +359,14 @@ def test_in_house_validation(self,helpers,DME,metadataModel): val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', matching_manifests = ['syn29862078', 'syn27648165'], - DME = DME, + dmge = dmge, )[1] in warnings \ or \ GenerateError.generate_cross_warning( val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', matching_manifests = ['syn29862066', 'syn27648165'], - DME = DME, + dmge = dmge, )[1] in warnings assert GenerateError.generate_cross_warning( @@ -389,13 +374,13 @@ def test_in_house_validation(self,helpers,DME,metadataModel): row_num = ['2', '3', '4'], attribute_name='Check Match Exactly values', invalid_entry = ['71738', '98085', '210065'], - DME = DME, + dmge = dmge, )[1] in warnings @pytest.mark.rule_combos(reason = 'This introduces a great number of tests covering every possible rule combination that are only necessary on occasion.') @pytest.mark.parametrize("base_rule, second_rule", get_rule_combinations()) - def test_rule_combinations(self, helpers, DME, base_rule, second_rule, metadataModel): + def test_rule_combinations(self, helpers, dmge, base_rule, second_rule, metadataModel): """ TODO: Describe what this test is doing. Updating the data model graph to allow testing of allowable rule combinations. @@ -408,12 +393,12 @@ def test_rule_combinations(self, helpers, DME, base_rule, second_rule, metadataM manifest = helpers.get_data_frame(manifestPath) # Get a view of the node data - all_node_data = DME.graph.nodes.data() + all_node_data = dmge.graph.nodes.data() # Update select validation rules in the data model graph for columns in the manifest for attribute in manifest.columns: # Get the node label - node_label = DME.get_node_label(attribute) + node_label = dmge.get_node_label(attribute) # Get a view of the recorded info for current node node_info = all_node_data[node_label] @@ -440,21 +425,21 @@ def test_rule_combinations(self, helpers, DME, base_rule, second_rule, metadataM # Update the manifest to only contain the Component and attribute column where the rule was changed. manifest = manifest[['Component', attribute]] - data_model_js = DataModelJSONSchema(jsonld_path=helpers.get_data_path('example.model.jsonld'), graph=DME.graph) + data_model_js = DataModelJSONSchema(jsonld_path=helpers.get_data_path('example.model.jsonld'), graph=dmge.graph) json_schema = data_model_js.get_json_validation_schema(source_node=rootNode, schema_name=rootNode + "_validation") validateManifest = ValidateManifest( errors = [], manifest = manifest, manifestPath = manifestPath, - DME = DME, + dmge = dmge, jsonSchema = json_schema ) #perform validation with no exceptions raised _, errors, warnings = validateManifest.validate_manifest_rules( manifest = manifest, - DME = DME, + dmge = dmge, restrict_rules = False, project_scope = None, ) From 1511eb3d793be4788305ceca28ded5b029b27a21 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice Date: Wed, 25 Oct 2023 09:29:41 -0700 Subject: [PATCH 13/13] update the expected errors since the old errors are just printed as warnings to the screen --- tests/test_validator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index 0f24f39fe..641fcccea 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -103,9 +103,7 @@ def test_dag(self, helpers): validator_errors = DMV.check_is_dag() # nodes could be in different order so need to account for that - expected_errors = ['Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: Patient and PatientID, please remove this loop from your model and submit again.', - 'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: PatientID and Patient, please remove this loop from your model and submit again.', - 'Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: Diagnosis and Diagnosis, please remove this loop from your model and submit again.'] - + expected_errors = ['Schematic requires models be a directed acyclic graph (DAG). Please inspect your model.'] + assert validator_errors[0] in expected_errors