Commit 65bbbf6

Author: Matthias Ley, committed Feb 29, 2024
Add Node class and improve tree structure building
Refactored the tree-building mechanism in obo_utils.py to handle duplicated and invalid entries more efficiently. Added a new Node class to encapsulate node-related logic. Tree building now proceeds in four steps: initializing the tree structure, sorting the nodes topologically, constructing the complete tree, and validating the longest chain. This improves both functionality and code readability.
1 parent 500978d commit 65bbbf6
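The public entry point build_non_separator_based_tree keeps its signature, so callers are unaffected by the internal refactor. Below is a minimal usage sketch (not part of the commit): the file name and sample rows are hypothetical, following the six-column tab-separated layout described in the function's docstring, and it assumes the package is importable as ontoloviz.

# Usage sketch with made-up data; columns: id, parent, label, description, count, color
from ontoloviz.obo_utils import build_non_separator_based_tree

rows = [
    "id\tparent\tlabel\tdescription\tcount\tcolor",   # header line is skipped
    "root\t\tRoot\tTop-level term\t0\t#FFFFFF",       # empty parent -> becomes a root sub-tree
    "child\troot\tChild\tA child term\t3\t#FF0000",   # attached under "root" at level 1
]
with open("example_ontology.tsv", "w", encoding="utf-8") as f_out:
    f_out.write("\n".join(rows) + "\n")

tree = build_non_separator_based_tree(file_name="example_ontology.tsv")
print(list(tree))           # ['root'] - one sub-tree per root id
print(list(tree["root"]))   # ['root', 'child'] - nodes inserted into that sub-tree

Ids joined with "|" in the first column expand into multiple nodes, and repeated ids receive _2, _3, ... suffixes via duplicate_map, as implemented in initialize_tree_structure in the diff below.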

File tree: 1 file changed, +227 -67 lines

src/ontoloviz/obo_utils.py (+227, -67)
@@ -1,30 +1,60 @@
 import requests
 import re
 from typing import Union
+from collections import deque
+from typing import List
+from copy import deepcopy
 
 zero = 0.000001337
 fake_one = 1.000001337
 white = "#FFFFFF"
 
 
+class Node:
+    def __init__(self, original_id: str, updated_id: str, line_data: List[str], float_sep: str):
+        self.original_id = original_id
+        self.updated_id = updated_id
+        self.original_parent_id = line_data[0]
+        self.updated_parent_ids = {line_data[0]}
+        self.level = 0
+        self.label = line_data[1]
+        self.description = line_data[2]
+        self.count = safe_convert_count(line_data[3], float_sep)
+        self.color = line_data[4]
+
+    def update_parents(self, updated_parent_ids: List[str]):
+        self.updated_parent_ids.update(set(updated_parent_ids))
+
+    def as_dict(self) -> dict:
+        return {
+            "id": self.updated_id,
+            "parent": list(self.updated_parent_ids)[0],
+            "level": self.level,
+            "label": self.label,
+            "description": self.description,
+            "counts": self.count if self.count != 0 else zero,
+            "imported_counts": self.count if self.count != 0 else fake_one,
+            "color": self.color if self.color else white,
+            "node_object": self
+        }
+
+
 def build_non_separator_based_tree(file_name: str = None, float_sep: str = None) -> dict:
     """Parse an ontology with child- and parent-ids from a file and build tree structure
 
     :param file_name: tab separated file with 6 columns:
         id, parent, label, description, count, color
     :param float_sep: if given, counts are considered as floating point values and converted based on given separator
     """
-    tree, to_process = parse_file_to_extract_root_nodes_and_processable_lines(file_name, float_sep)
+    tree, to_process = initialize_tree_structure(file_name, float_sep)
+    sort_to_process_topologically(to_process)
 
-    while True:
-        drop_idxs = handle_and_assign_nodes(to_process, tree)
-        for idx in sorted(list(set(drop_idxs)), reverse=True):
-            del to_process[idx]
+    t = [_ for _ in to_process if _.original_id == "x99902827_x99902971" and _.original_parent_id == "x99903372_x99903533"]
+    for _ in t:
+        print("Original ID:", _.original_id, "Updated ID:", _.updated_id, "Original Parent ID: ", _.original_parent_id, "Updated Parent IDs: ", _.updated_parent_ids)
 
-        if not to_process:
-            break
-
-        print(f"Dropped: {len(drop_idxs)}, Left to process: {len(to_process)}")
+    construct_complete_tree(to_process, tree)
+    validate_longest_chain(tree)
 
     if float_sep:
         print("Normalizing float counts to int")
@@ -33,71 +63,61 @@ def build_non_separator_based_tree(file_name: str = None, float_sep: str = None)
     return tree
 
 
-def handle_and_assign_nodes(to_process: list = None, tree: dict = None) -> list:
-    drop_idxs = []
-    for idx, (attempts, node) in enumerate(to_process):
-        if attempts >= 20:
-            print(f"Dropping node because no suitable parent was found after "
-                  f"20 attempts: {node['id']}")
-            drop_idxs.append(idx)
-            continue
-
-        for sub_tree_id, sub_tree in tree.items():
-            parent = node["parent"]
-            if parent in sub_tree.keys():
-                node["level"] = tree[sub_tree_id][parent]["level"] + 1
-                tree[sub_tree_id][node["id"]] = node
-                drop_idxs.append(idx)
-
-        to_process[idx][0] += 1
-    return drop_idxs
-
-
-def parse_file_to_extract_root_nodes_and_processable_lines(input_file: str = None, float_sep: str = None) -> tuple:
+def initialize_tree_structure(input_file: str = None, float_sep: str = None) -> tuple:
     tree = dict()
-    to_process = list()
-    duplicate_check = list()
-    with open(file=input_file, mode="r", encoding="utf-8") as f_in:
+    duplicate_map = dict()  # This now serves for both identifying duplicates and mapping them to unique ids
+
+    nodes_to_process = list()
+    with open(input_file, "r", encoding="utf-8") as f_in:
         for line_idx, line in enumerate(f_in):
-            if line_idx == 0:
+            if line_idx == 0:  # Skip header
                 continue
            node_ids_unformatted, *line_data = line.rstrip("\n").split("\t")
            node_ids = node_ids_unformatted.split("|")
+
            for node_id in node_ids:
-                original_node_id = node_id
-                duplicate_count = duplicate_check.count(node_id) + 1
-                if duplicate_count > 1:
-                    node_id = f"{node_id}_{duplicate_count}"
-                duplicate_check.append(original_node_id)
-                handle_and_assign_root_nodes(node_id, tree, to_process, line_data, float_sep)
-
-    return tree, to_process
-
-
-def handle_and_assign_root_nodes(node_id: str = None, tree: dict = None, to_process: list = None,
-                                 line_data: list = None, float_sep: str = None):
-    parent = line_data[0]
-    count = safe_convert_count(line_data[3], float_sep)
-    color = line_data[4]
-
-    node = {
-        "id": node_id,
-        "parent": parent,
-        "level": 0,
-        "label": line_data[1],
-        "description": line_data[2],
-        "counts": count if count != 0 else zero,
-        "imported_counts": count if count != 0 else fake_one,
-        "color": color if color else white
-    }
+                # Store unique identifiers for duplicate nodes
+                if node_id in duplicate_map:
+                    duplicate_count = len(duplicate_map[node_id]) + 1
+                    node_id_updated = f"{node_id}_{duplicate_count}"
+                    duplicate_map[node_id].append(node_id_updated)
+                else:
+                    node_id_updated = node_id
+                    duplicate_map[node_id] = [node_id]
+
+                node = Node(original_id=node_id, updated_id=node_id_updated, line_data=line_data, float_sep=float_sep)
+
+                # add root nodes to tree
+                if not node.original_parent_id:
+                    tree[node.original_id] = {
+                        node.original_id: node.as_dict()
+                    }
+
+                nodes_to_process.append(Node(original_id=node_id, updated_id=node_id_updated,
+                                             line_data=line_data, float_sep=float_sep))
+
+    for node in nodes_to_process:
+        node.update_parents(duplicate_map.get(node.original_parent_id, []))
+
+    # if a parent is duplicated, all respective child nodes have to be duplicated as well
+    nodes_to_process_deduped = []  # A new list to store the updated Nodes
+
+    for node in nodes_to_process:
+        parent_ids = node.updated_parent_ids
+
+        if len(parent_ids) > 1:
+            for parent_id in parent_ids:
+
+                # Duplicate the Node and update the parent id according to the duplicate
+                new_node = deepcopy(node)
+                new_node.updated_parent_ids = {parent_id}
+                nodes_to_process_deduped.append(new_node)
+        else:
+            nodes_to_process_deduped.append(node)
 
-    # populate first level of tree structure
-    if not parent:
-        tree[node_id] = {
-            node_id: node
-        }
-    else:
-        to_process.append([0, node])
+    nodes_to_process = nodes_to_process_deduped
+
+    return tree, nodes_to_process
 
 
 def safe_convert_count(count_as_str: str = None, float_sep: str = None) -> Union[int, float]:
@@ -111,6 +131,146 @@ def safe_convert_count(count_as_str: str = None, float_sep: str = None) -> Union
     return def_value
 
 
+def sort_to_process_topologically(nodes_to_process):
+    """Sorts nodes_to_process to have parents appear before their children"""
+    id_to_node = {node.updated_id: node for node in nodes_to_process}
+
+    graph_dict = {node.updated_id: [] for node in nodes_to_process}
+
+    for node in nodes_to_process:
+        if node.original_parent_id:
+            for parent_id in node.updated_parent_ids:  # Each node has set of parent IDs.
+                if parent_id is None:  # Skip if parent_id is None.
+                    continue
+                graph_dict[parent_id].append(node.updated_id)
+
+    result = []  # this list will store the result.
+    Q = deque()  # create an empty deque.
+
+    # calculate in-degrees for all nodes.
+    in_degree = {k: 0 for k in graph_dict}
+
+    for node_id in graph_dict:
+        for child_id in graph_dict[node_id]:
+            in_degree[child_id] += 1
+
+    # identify all nodes without parents.
+    for id, degree in in_degree.items():
+        if degree == 0:
+            Q.appendleft(id)
+
+    # remove nodes without parents.
+    while Q:
+        id = Q.pop()
+        result.append(id)
+
+        if id not in graph_dict:  # Handle cases for nodes that were parents but not a child themselves.
+            continue
+
+        for child_id in graph_dict[id]:
+            in_degree[child_id] -= 1
+            if in_degree[child_id] == 0:
+                Q.appendleft(child_id)
+
+    if len(result) != len(graph_dict):
+        raise RuntimeError("Graph contains a cycle or a disconnected segment.")
+
+    sorted_nodes = [id_to_node[id] for id in result]
+
+    return sorted_nodes
+
+
+def construct_complete_tree(nodes_to_process: list, tree: dict) -> None:
+    last_size = len(nodes_to_process)
+
+    while nodes_to_process:
+        uninserted_nodes = []
+
+        for node in nodes_to_process:
+            inserted = False
+            for parent_id_updated in node.updated_parent_ids:
+                for sub_tree_id, sub_tree in tree.items():
+                    if parent_id_updated in sub_tree.keys():
+                        node.level = tree[sub_tree_id][parent_id_updated]["level"] + 1
+                        tree[sub_tree_id][node.updated_id] = node.as_dict()
+                        inserted = True
+                        break
+                if inserted:
+                    break
+
+            if not inserted:
+                uninserted_nodes.append(node)
+
+        clear_uninserted_nodes_if_only_root_nodes_remain(uninserted_nodes)
+
+        nodes_to_process = uninserted_nodes
+        if len(nodes_to_process) == last_size:
+            print(f"Can't insert any more nodes, they will be dropped: {nodes_to_process}")
+            return
+
+        last_size = len(nodes_to_process)
+    print("Constructed tree")
+
+
+def clear_uninserted_nodes_if_only_root_nodes_remain(uninserted_nodes: list):
+    only_root_nodes = all([False if _.original_parent_id else True for _ in uninserted_nodes])
+    if only_root_nodes:
+        uninserted_nodes.clear()
+
+
+def validate_longest_chain(tree: dict):
+    longest_chain = {
+        "x99902827_x99902971": False,
+        "x99903372_x99903533": False,
+        "x99904082_x99904236": False,
+        "x99905136_x99905259_x99905369": False,
+        "x99905705_x99905844": False,
+        "x99906758_x99906904": False,
+        "x99910659_x99910771_x99910863_x99910953": False,
+        "x99911101_x99911213_x99911325_x99911452_x99911576": False,
+        "x99911641_x99911753_x99911865": False,
+        "x99913118_x99913207_x99913277": False,
+        "x99913394_x99913503_x99913595": False,
+        "x99913733_x99913822": False,
+        "x99914757_x99914847": False,
+        "x99915197": False,
+        "x99915523_x99915607": False,
+        "x99917162": False,
+        "x99917946_x99918006": False,
+        "x99919131": False,
+        "Collagen": False,
+    }
+    for sub_tree_id, sub_tree in tree.items():
+        for updated_node_id, node in sub_tree.items():
+            original_node_id = node["node_object"].original_id
+            if original_node_id in longest_chain.keys():
+                longest_chain[original_node_id] = True
+
+    # these are original ids, and not the updated ids generated by duplication
+    longest_chain_child_to_parent = {
+        "x99902827_x99902971": "x99903372_x99903533",
+        "x99903372_x99903533": "x99904082_x99904236",
+        "x99904082_x99904236": "x99905136_x99905259_x99905369",
+        "x99905136_x99905259_x99905369": "x99905705_x99905844",
+        "x99905705_x99905844": "x99906758_x99906904",
+        "x99906758_x99906904": "x99910659_x99910771_x99910863_x99910953",
+        "x99910659_x99910771_x99910863_x99910953": "x99911101_x99911213_x99911325_x99911452_x99911576",
+        "x99911101_x99911213_x99911325_x99911452_x99911576": "x99911641_x99911753_x99911865",
+        "x99911641_x99911753_x99911865": "x99913118_x99913207_x99913277",
+        "x99913118_x99913207_x99913277": "x99913394_x99913503_x99913595",
+        "x99913394_x99913503_x99913595": "x99913733_x99913822",
+        "x99913733_x99913822": "x99914757_x99914847",
+        "x99914757_x99914847": "x99915197",
+        "x99915197": "x99915523_x99915607",
+        "x99915523_x99915607": "x99917162",
+        "x99917162": "x99917946_x99918006",
+        "x99917946_x99918006": "x99919131",
+        "x99919131": "Collagen",
+        "Collagen": "",
+    }
+
+
+
 def normalize_tree_counts_from_float_to_int(tree: dict = None):
     max_val = find_max(tree)
     for main_node in tree:

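For background on the sorting step introduced above (not part of the commit): sort_to_process_topologically is a Kahn-style topological sort over the graph of updated ids, so every parent is emitted before its children, and leftover nodes trigger the cycle/disconnected-segment error. A self-contained sketch of the same idea on a plain parent-to-children dict, with purely illustrative names:

from collections import deque


def topo_sort(children_by_parent: dict) -> list:
    """Kahn's algorithm: emit every parent before any of its children."""
    # in-degree = number of edges pointing at a node (here: how many parents it has)
    in_degree = {node: 0 for node in children_by_parent}
    for children in children_by_parent.values():
        for child in children:
            in_degree[child] = in_degree.get(child, 0) + 1

    queue = deque(node for node, degree in in_degree.items() if degree == 0)
    order = []
    while queue:
        node = queue.popleft()
        order.append(node)
        for child in children_by_parent.get(node, []):
            in_degree[child] -= 1
            if in_degree[child] == 0:
                queue.append(child)

    if len(order) != len(in_degree):
        raise RuntimeError("cycle detected")
    return order


# parents always appear before their children, regardless of input order
print(topo_sort({"leaf": [], "a": ["leaf"], "b": [], "root": ["a", "b"]}))
# -> ['root', 'a', 'b', 'leaf']

In the commit, the same bookkeeping runs over graph_dict keyed by updated_id, using appendleft/pop on the deque instead of append/popleft, and the final length check corresponds to the RuntimeError raised in sort_to_process_topologically.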
0 commit comments
