Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to address JSONLD processing #1321

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 19 additions & 18 deletions schematic/schemas/data_model_edges.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ def __init__(self):

def generate_edge(
self,
G: nx.MultiDiGraph,
node: str,
all_node_dict: dict,
attr_rel_dict: dict,
edge_relationships: dict,
) -> nx.MultiDiGraph:
edge_list:list,
) -> list[tuple[str, str, dict[str:str, str:int]]]:
"""Generate an edge between a target node and relevant other nodes the data model. In short, does this current node belong to a recorded relationship in the attribute, relationshps dictionary. Go through each attribute and relationship to find where the node may be.
Args:
G, nx.MultiDiGraph: networkx graph representation of the data model, that is in the process of being fully built. At this point, all the nodes would have been added, and edges are being added per target node.
Expand All @@ -28,9 +28,11 @@ def generate_edge(
Relationships: {
CSV Header: Value}}}
edge_relationships: dict, rel_key: csv_header if the key represents a value relationship.

edge_list: list(tuple), list of tuples describing the edges and the edge attributes, organized as (node_1, node_2, {key:edge_relationship_key, weight:int})
At this point, the edge list will be in the process of being built. Adding edges from list so they will be added properly to the graph without being overwritten in the loop, and passing the Graph around more.
Returns:
G, nx.MultiDiGraph: networkx graph representation of the data model, that has had new edges attached.
edge_list: list(tuple), list of tuples describing the edges and the edge attributes, organized as (node_1, node_2, {key:edge_relationship_key, weight:int})
At this point, the edge list will have additional edges added related to the current node.
"""
# For each attribute in the model.
for attribute_display_name, relationship in attr_rel_dict.items():
Expand Down Expand Up @@ -65,26 +67,25 @@ def generate_edge(
# Add edges, in a manner that preserves directionality
# TODO: rewrite to use edge_dir
if rel_key in ["subClassOf", "domainIncludes"]:
G.add_edge(
edge_list.append((
all_node_dict[node]["label"],
all_node_dict[attribute_display_name]["label"],
key=edge_key,
weight=weight,
)
{'key':edge_key,
'weight':weight,})
)
else:
G.add_edge(
edge_list.append((
all_node_dict[attribute_display_name]["label"],
all_node_dict[node]["label"],
key=edge_key,
weight=weight,
)
{'key':edge_key,
'weight':weight},)
)
# Add rangeIncludes/valid value relationships in reverse as well, making the attribute the parent of the valid value.
if rel_key == "rangeIncludes":
G.add_edge(
edge_list.append((
all_node_dict[attribute_display_name]["label"],
all_node_dict[node]["label"],
key="parentOf",
weight=weight,
)

return G
{'key':"parentOf",
'weight':weight},)
)
return edge_list
14 changes: 12 additions & 2 deletions schematic/schemas/data_model_graph.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from copy import deepcopy
import graphviz
import logging
from typing import Any, Dict, Optional, Text
Expand Down Expand Up @@ -94,16 +95,22 @@ def generate_data_model_graph(self) -> nx.MultiDiGraph:
# Generate node and attach information (attributes) to each node
G = self.dmn.generate_node(G, node_dict)

edge_list = []
## Connect nodes via edges
for node in all_nodes:
# Generate edges
G = self.dme.generate_edge(
G,
edge_list_2 = self.dme.generate_edge(
node,
all_node_dict,
self.attribute_relationships_dict,
edge_relationships,
edge_list,
)
edge_list = edge_list_2.copy()

# Add edges to the Graph
for node_1, node_2, edge_dict in edge_list:
G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight'])
return G


Expand Down Expand Up @@ -357,6 +364,8 @@ def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]:
)

edge_key = self.rel_dict[key]["edge_key"]

# Handle out edges
if self.rel_dict[key]["jsonld_direction"] == "out":
# use outedges

Expand All @@ -369,6 +378,7 @@ def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]:
)
if edge_key in self.graph[source_node][attached_node]
}
# Handle in edges
else:
# use inedges
original_edge_weights_dict = {
Expand Down
57 changes: 55 additions & 2 deletions schematic/schemas/data_model_jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,28 @@ def get_edges_associated_with_node(
node_edges.extend(list(self.graph.out_edges(node, data=True)))
return node_edges

def get_edges_associated_with_property_nodes(
    self, node: str
) -> List[tuple[str, str, dict[str, int]]]:
    """Collect the domainIncludes edges that leave a property node.

    Args:
        node, str: Label of the property node in the graph to look for associated edges.
    Returns:
        node_edges, list: List of tuples, one per neighbour that is connected to `node`
            through the domainIncludes edge key. Each tuple holds the given node, the
            neighbouring node, and the attribute dict stored on that edge.
    """
    # Only the domainIncludes relationship is relevant here; its edge key is
    # looked up in the relationships dictionary rather than hard-coded.
    domain_includes_edge_key = self.rel_dict["domainIncludes"]["edge_key"]

    # self.graph[node] maps each neighbour to a {edge_key: edge_attributes} dict,
    # so a direct membership test replaces scanning every key.
    return [
        (node, node_2, edge_dict[domain_includes_edge_key])
        for node_2, edge_dict in self.graph[node].items()
        if domain_includes_edge_key in edge_dict
    ]

def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str):
"""
Args:
Expand All @@ -145,6 +167,12 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str):
# Get all edges associated with the current node
node_edges = self.get_edges_associated_with_node(node=node)


# For properties look for reverse relationships too
if node in self.dmge.find_properties():
property_node_edges = self.get_edges_associated_with_property_nodes(node=node)
node_edges.extend(property_node_edges)

# Get node pairs and weights for each edge
for node_1, node_2, weight in node_edges:
# Retrieve the relationship(s) and related info between the two nodes
Expand All @@ -160,7 +188,9 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str):
# If the relationship defined and edge_key
if relationship == edge_key:
# TODO: rewrite to use edge_dir
if edge_key in ["domainIncludes", "parentOf"]:
domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key']
subclassOf_edge_key = self.rel_dict['subClassOf']['edge_key']
if edge_key in [subclassOf_edge_key]:
if node_2 == node:
# Make sure the key is in the template (differs between properties and classes)
if rel_vals["jsonld_key"] in template.keys():
Expand All @@ -178,6 +208,24 @@ def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str):
)
else:
template[rel_vals["jsonld_key"]] == node_1
elif edge_key in [domainIncludes_edge_key]:
if node_1 == node:
# Make sure the key is in the template (differs between properties and classes)
if rel_vals["jsonld_key"] in template.keys():
node_2_id = {"@id": "bts:" + node_2}
# TODO Move this to a helper function to clear up.
if (
isinstance(
template[rel_vals["jsonld_key"]], list
)
and node_2_id
not in template[rel_vals["jsonld_key"]]
):
template[rel_vals["jsonld_key"]].append(
node_2_id
)
else:
template[rel_vals["jsonld_key"]] == node_2
else:
if node_1 == node:
# Make sure the key is in the template (differs between properties and classes)
Expand Down Expand Up @@ -238,6 +286,7 @@ def fill_entry_template(self, template: dict, node: str) -> dict:
template = self.add_edge_rels_to_template(
template=template, rel_vals=rel_vals, node=node
)

# Fill in node value information
else:
template = self.add_node_info_to_template(
Expand All @@ -249,11 +298,11 @@ def fill_entry_template(self, template: dict, node: str) -> dict:
template=template,
data_model_relationships=data_model_relationships,
)

# Reorder lists based on weights:
template = self.reorder_template_entries(
template=template,
)

# Add contexts to certain values
template = self.add_contexts_to_entries(
template=template,
Expand Down Expand Up @@ -364,6 +413,10 @@ def reorder_template_entries(self, template: dict) -> dict:
sorted_edges = self.dmge.get_ordered_entry(
key=key, source_node_label=template_label
)
if not len(entry) == len(sorted_edges):
breakpoint()
#raise ValueError("There is an error with sorting values in the JSONLD, please issue a bug report.")

edge_weights_dict = {edge: i for i, edge in enumerate(sorted_edges)}
ordered_edges = [0] * len(edge_weights_dict.keys())
for edge, normalized_weight in edge_weights_dict.items():
Expand Down
52 changes: 47 additions & 5 deletions schematic/schemas/data_model_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,21 @@ def parse_entry(self, rel_entry: any, id_jsonld_key: str) -> Any:
parsed_rel_entry = rel_entry
return parsed_rel_entry

def get_display_name_from_label(self, label: str, model_jsonld: List[dict]) -> str:
    """Look up the display name recorded for a label in the JSONLD model.

    Args:
        label, str: label of the attribute whose display name is wanted.
        model_jsonld, list(dict): entries of the JSONLD data model.
    Returns:
        str: the display name of the entry whose label equals `label`. If the matching
            entry has no display name, or no entry matches, `label` itself is returned
            so the caller always gets a usable key.
    """
    label_jsonld_key = self.rel_dict["label"]["jsonld_key"]
    dn_jsonld_key = self.rel_dict["displayName"]["jsonld_key"]

    for entry in model_jsonld:
        # Only the entry describing the requested label is of interest;
        # entries without a label are skipped rather than raising.
        if entry.get(label_jsonld_key) == label:
            # Use the display name if one was recorded, otherwise the label.
            return entry.get(dn_jsonld_key, label)

    # No entry in the model carries this label.
    return label

def gather_jsonld_attributes_relationships(self, model_jsonld: List[dict]) -> Dict:
"""
Args:
Expand Down Expand Up @@ -327,15 +342,42 @@ def gather_jsonld_attributes_relationships(self, model_jsonld: List[dict]) -> Di
):
# Retrieve entry value associated with the given relationship
rel_entry = entry[rel_vals["jsonld_key"]]
# If there is an entry parset it by type and add to the attr:relationships dictionary.
# If there is an entry parse it by type and add to the attr:relationships dictionary.
if rel_entry:
parsed_rel_entry = self.parse_entry(
rel_entry=rel_entry, id_jsonld_key=id_jsonld_key
)
# Add relationships for each attribute and relationship to the dictionary
attr_rel_dictionary[attr_key]["Relationships"].update(
{self.rel_dict[rel_key]["csv_header"]: parsed_rel_entry}
)
rel_csv_header = self.rel_dict[rel_key]["csv_header"]
if rel_key == 'domainIncludes' or rel_key == 'parentOf':
# In the JSONLD the domain includes field contains the ids of attributes that the current attribute is the property/parent of.
# Because of this we need to handle these values differently.
# We will get the values in the field (parsed_val), then add the current attribute as to the property key in the attr_rel_dictionary[p_attr_key].
for parsed_val in parsed_rel_entry:
attr_in_dict = False
# Get property/parent key (displayName)
p_attr_key=''
# Check if the parsed value is already a part of the attr_rel_dictionary
for attr_dn, rels in attr_rel_dictionary.items():
if parsed_val == rels["Relationships"].get('label'):
p_attr_key = attr_dn
attr_in_dict = True
# If it is part of the dictionary update add current attribute as a property of the parsed value
if attr_in_dict == True:
if not rel_csv_header in attr_rel_dictionary[p_attr_key]["Relationships"]:
attr_rel_dictionary[p_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]})
else:
attr_rel_dictionary[p_attr_key]["Relationships"][rel_csv_header].append(entry[label_jsonld_key])
# If the parsed_val is not already recorded in the dictionary, add it
elif attr_in_dict == False:
# Get the display name for the parsed value
p_attr_key = self.get_display_name_from_label(parsed_val, model_jsonld)

attr_rel_dictionary.update(attr_dict_template(p_attr_key))
attr_rel_dictionary[p_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]})
else:
attr_rel_dictionary[attr_key]["Relationships"].update(
{rel_csv_header: parsed_rel_entry}
)
elif (
rel_vals["jsonld_key"] in entry.keys()
and not rel_vals["csv_header"]
Expand Down
2 changes: 1 addition & 1 deletion schematic/schemas/data_model_relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def define_data_model_relationships(self) -> Dict:
"jsonld_key": "schema:domainIncludes",
"csv_header": "Properties",
"edge_key": "domainValue",
"jsonld_direction": "in",
"jsonld_direction": "out",
"edge_dir": "in",
"type": list,
"edge_rel": True,
Expand Down
39 changes: 28 additions & 11 deletions schematic/schemas/data_model_validator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import logging
import multiprocessing
import networkx as nx
import time
from typing import Any, Dict, Optional, Text, List, Tuple

from schematic.schemas.data_model_relationships import DataModelRelationships

logger = logging.getLogger(__name__)

class DataModelValidator:
"""
Expand Down Expand Up @@ -69,24 +72,38 @@ def check_graph_has_required_node_fields(self) -> List[str]:
)
return error

def run_cycles(self, graph: nx.MultiDiGraph) -> None:
    """Log a warning for every simple cycle found in the graph.

    check_is_dag runs this as the target of a separate process and terminates it
    after a fixed wait, so results are reported through the logger, not returned.

    Args:
        graph, nx.MultiDiGraph: graph to search for cycles.
    """
    # nx.simple_cycles returns a generator, which is always truthy, so it is
    # iterated directly; an acyclic graph simply yields nothing.
    for cycle in nx.simple_cycles(graph):
        first_node = cycle[0]
        # A self-loop yields a single-node cycle; report the node against itself
        # instead of failing on a missing second element.
        second_node = cycle[1] if len(cycle) > 1 else cycle[0]
        logger.warning(
            f"Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: {first_node} and {second_node}, please remove this loop from your model and submit again."
        )

def check_is_dag(self) -> List[str]:
"""Check that generated graph is a directed acyclic graph
Returns:
error, list: List of error messages if graph is not a DAG. List will include a message for each cycle found, if not there is a more generic message for the graph as a whole.
"""
error = []
if not nx.is_directed_acyclic_graph(self.graph):
# Attempt to find any cycles:
cycles = nx.simple_cycles(self.graph)
if cycles:
for cycle in cycles:
error.append(
f"Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: {cycle[0]} and {cycle[1]}, please remove this loop from your model and submit again."
)
else:
error.append(
f"Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we could not locate the sorce of the error, please inspect your model."
cycles = multiprocessing.Process(target=self.run_cycles, name="Get Cycles", args=(self.graph,))
cycles.start()

# Give up to 5 seconds to find cycles, if not exit and issue standard error
time.sleep(5)

# If the cycle-finding process is still running
if cycles.is_alive():
# Terminate the cycle-finding process
cycles.terminate()
# Cleanup
cycles.join()

error.append(
f"Schematic requires models be a directed acyclic graph (DAG). Please inspect your model."
)

return error

def check_blacklisted_characters(self) -> List[str]:
Expand Down
Loading