import requests
import re
from typing import Union
+ from collections import deque
+ from typing import List
+ from copy import deepcopy


zero = 0.000001337
fake_one = 1.000001337
white = "#FFFFFF"

+ class Node:
+     def __init__(self, original_id: str, updated_id: str, line_data: List[str], float_sep: str):
+         self.original_id = original_id
+         self.updated_id = updated_id
+         self.original_parent_id = line_data[0]
+         self.updated_parent_ids = {line_data[0]}
+         self.level = 0
+         self.label = line_data[1]
+         self.description = line_data[2]
+         self.count = safe_convert_count(line_data[3], float_sep)
+         self.color = line_data[4]
+
+     def update_parents(self, updated_parent_ids: List[str]):
+         self.updated_parent_ids.update(set(updated_parent_ids))
+
+     def as_dict(self) -> dict:
+         return {
+             "id": self.updated_id,
+             # after deduplication every node carries exactly one parent id
+             "parent": list(self.updated_parent_ids)[0],
+             "level": self.level,
+             "label": self.label,
+             "description": self.description,
+             "counts": self.count if self.count != 0 else zero,
+             "imported_counts": self.count if self.count != 0 else fake_one,
+             "color": self.color if self.color else white,
+             "node_object": self
+         }
+
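+ # Usage sketch (editor's illustration, not from the source): for an input row
+ # giving id "c1" with line_data ["p1", "Child", "desc", "3", "#AABBCC"],
+ # Node("c1", "c1", line_data, None).as_dict() reports parent "p1", level 0
+ # and color "#AABBCC"; a count of 0 maps to the zero/fake_one sentinels above.
+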


def build_non_separator_based_tree(file_name: str = None, float_sep: str = None) -> dict:
    """Parse an ontology with child and parent ids from a file and build a tree structure

    :param file_name: tab-separated file with 6 columns:
        id, parent, label, description, count, color
    :param float_sep: if given, counts are treated as floating-point values and converted
        based on the given decimal separator
    """
-     tree, to_process = parse_file_to_extract_root_nodes_and_processable_lines(file_name, float_sep)
+     tree, to_process = initialize_tree_structure(file_name, float_sep)
+     to_process = sort_to_process_topologically(to_process)

-     while True:
-         drop_idxs = handle_and_assign_nodes(to_process, tree)
-         for idx in sorted(list(set(drop_idxs)), reverse=True):
-             del to_process[idx]
+     # Debug trace for one known child/parent pair from the longest chain
+     t = [_ for _ in to_process if _.original_id == "x99902827_x99902971" and _.original_parent_id == "x99903372_x99903533"]
+     for _ in t:
+         print("Original ID:", _.original_id, "Updated ID:", _.updated_id, "Original Parent ID:", _.original_parent_id, "Updated Parent IDs:", _.updated_parent_ids)

-         if not to_process:
-             break
-
-         print(f"Dropped: {len(drop_idxs)}, Left to process: {len(to_process)}")
+     construct_complete_tree(to_process, tree)
+     validate_longest_chain(tree)

    if float_sep:
        print("Normalizing float counts to int")
@@ -33,71 +63,61 @@ def build_non_separator_based_tree(file_name: str = None, float_sep: str = None)
    return tree

- def handle_and_assign_nodes(to_process: list = None, tree: dict = None) -> list:
-     drop_idxs = []
-     for idx, (attempts, node) in enumerate(to_process):
-         if attempts >= 20:
-             print(f"Dropping node because no suitable parent was found after "
-                   f"20 attempts: {node['id']}")
-             drop_idxs.append(idx)
-             continue
-
-         for sub_tree_id, sub_tree in tree.items():
-             parent = node["parent"]
-             if parent in sub_tree.keys():
-                 node["level"] = tree[sub_tree_id][parent]["level"] + 1
-                 tree[sub_tree_id][node["id"]] = node
-                 drop_idxs.append(idx)
-
-         to_process[idx][0] += 1
-     return drop_idxs
-
-
- def parse_file_to_extract_root_nodes_and_processable_lines(input_file: str = None, float_sep: str = None) -> tuple:
+ def initialize_tree_structure(input_file: str = None, float_sep: str = None) -> tuple:
    tree = dict()
-     to_process = list()
-     duplicate_check = list()
-     with open(file=input_file, mode="r", encoding="utf-8") as f_in:
+     duplicate_map = dict()  # serves both to detect duplicates and to map them to unique ids
+
+     nodes_to_process = list()
+     with open(input_file, "r", encoding="utf-8") as f_in:
        for line_idx, line in enumerate(f_in):
-             if line_idx == 0:
+             if line_idx == 0:  # Skip header
                continue
            node_ids_unformatted, *line_data = line.rstrip("\n").split("\t")
            node_ids = node_ids_unformatted.split("|")
+
            for node_id in node_ids:
-                 original_node_id = node_id
-                 duplicate_count = duplicate_check.count(node_id) + 1
-                 if duplicate_count > 1:
-                     node_id = f"{node_id}_{duplicate_count}"
-                 duplicate_check.append(original_node_id)
-                 handle_and_assign_root_nodes(node_id, tree, to_process, line_data, float_sep)
-
-     return tree, to_process
-
-
- def handle_and_assign_root_nodes(node_id: str = None, tree: dict = None, to_process: list = None,
-                                  line_data: list = None, float_sep: str = None):
-     parent = line_data[0]
-     count = safe_convert_count(line_data[3], float_sep)
-     color = line_data[4]
-
-     node = {
-         "id": node_id,
-         "parent": parent,
-         "level": 0,
-         "label": line_data[1],
-         "description": line_data[2],
-         "counts": count if count != 0 else zero,
-         "imported_counts": count if count != 0 else fake_one,
-         "color": color if color else white
-     }
+                 # Store unique identifiers for duplicate nodes
+                 if node_id in duplicate_map:
+                     duplicate_count = len(duplicate_map[node_id]) + 1
+                     node_id_updated = f"{node_id}_{duplicate_count}"
+                     duplicate_map[node_id].append(node_id_updated)
+                 else:
+                     node_id_updated = node_id
+                     duplicate_map[node_id] = [node_id]
+
+                 node = Node(original_id=node_id, updated_id=node_id_updated, line_data=line_data, float_sep=float_sep)
+
+                 # add root nodes to the tree right away
+                 if not node.original_parent_id:
+                     tree[node.original_id] = {
+                         node.original_id: node.as_dict()
+                     }
+
+                 # the same Node object is queued for parent resolution
+                 nodes_to_process.append(node)
98
+
99
+ for node in nodes_to_process :
100
+ node .update_parents (duplicate_map .get (node .original_parent_id , []))
101
+
102
+ # if a parent is duplicated, all respective child nodes have to be duplicated as well
103
+ nodes_to_process_deduped = [] # A new list to store the updated Nodes
104
+
105
+ for node in nodes_to_process :
106
+ parent_ids = node .updated_parent_ids
107
+
108
+ if len (parent_ids ) > 1 :
109
+ for parent_id in parent_ids :
110
+
111
+ # Duplicate the Node and update the parent id according to the duplicate
112
+ new_node = deepcopy (node )
113
+ new_node .updated_parent_ids = {parent_id }
114
+ nodes_to_process_deduped .append (new_node )
115
+ else :
116
+ nodes_to_process_deduped .append (node )

-     # populate first level of tree structure
-     if not parent:
-         tree[node_id] = {
-             node_id: node
-         }
-     else:
-         to_process.append([0, node])
+     nodes_to_process = nodes_to_process_deduped
+
+     return tree, nodes_to_process
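+
+ # Behavior sketch (editor's illustration): if id "A" occurs on three input lines,
+ # duplicate_map ends up as {"A": ["A", "A_2", "A_3"]}; a child whose parent
+ # column reads "A" collects all three as updated parent ids and is then split
+ # into three copies, one per parent.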


def safe_convert_count(count_as_str: str = None, float_sep: str = None) -> Union[int, float]:
@@ -111,6 +131,146 @@ def safe_convert_count(count_as_str: str = None, float_sep: str = None) -> Union
    return def_value


+ def sort_to_process_topologically(nodes_to_process):
+     """Sorts nodes_to_process so that parents appear before their children (Kahn's algorithm)."""
+     # Deduplicated copies can share an updated_id, so map each id to a list of nodes.
+     id_to_nodes = {}
+     for node in nodes_to_process:
+         id_to_nodes.setdefault(node.updated_id, []).append(node)
+
+     graph_dict = {node.updated_id: [] for node in nodes_to_process}
+
+     for node in nodes_to_process:
+         if node.original_parent_id:
+             for parent_id in node.updated_parent_ids:  # Each node carries a set of parent ids.
+                 if parent_id not in graph_dict:  # Skip parents that never appear as nodes.
+                     continue
+                 graph_dict[parent_id].append(node.updated_id)
+
+     result = []  # this list will store the sorted ids.
+     Q = deque()  # create an empty deque.
+
+     # calculate in-degrees for all nodes.
+     in_degree = {k: 0 for k in graph_dict}
+
+     for node_id in graph_dict:
+         for child_id in graph_dict[node_id]:
+             in_degree[child_id] += 1
+
+     # identify all nodes without parents.
+     for node_id, degree in in_degree.items():
+         if degree == 0:
+             Q.appendleft(node_id)
+
+     # repeatedly emit nodes whose parents have all been emitted.
+     while Q:
+         node_id = Q.pop()
+         result.append(node_id)
+
+         for child_id in graph_dict[node_id]:
+             in_degree[child_id] -= 1
+             if in_degree[child_id] == 0:
+                 Q.appendleft(child_id)
+
+     if len(result) != len(graph_dict):
+         raise RuntimeError("Graph contains a cycle.")
+
+     # expand ids back to Node objects, keeping duplicated copies together.
+     sorted_nodes = [n for node_id in result for n in id_to_nodes[node_id]]
+
+     return sorted_nodes
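+
+ # Worked example (editor's sketch): with edges A -> B, A -> C and B -> C the
+ # in-degrees are {A: 0, B: 1, C: 2}; A is queued first, emitting A unlocks B,
+ # and emitting B unlocks C, so the result is [A, B, C].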
+
+
+ def construct_complete_tree(nodes_to_process: list, tree: dict) -> None:
+     last_size = len(nodes_to_process)
+
+     while nodes_to_process:
+         uninserted_nodes = []
+
+         for node in nodes_to_process:
+             inserted = False
+             for parent_id_updated in node.updated_parent_ids:
+                 for sub_tree_id, sub_tree in tree.items():
+                     if parent_id_updated in sub_tree.keys():
+                         node.level = tree[sub_tree_id][parent_id_updated]["level"] + 1
+                         tree[sub_tree_id][node.updated_id] = node.as_dict()
+                         inserted = True
+                         break
+                 if inserted:
+                     break
+
+             if not inserted:
+                 uninserted_nodes.append(node)
+
+         clear_uninserted_nodes_if_only_root_nodes_remain(uninserted_nodes)
+
+         nodes_to_process = uninserted_nodes
+         if len(nodes_to_process) == last_size:
+             print(f"Can't insert any more nodes, they will be dropped: {nodes_to_process}")
+             return
+
+         last_size = len(nodes_to_process)
+     print("Constructed tree")
+
+
+ def clear_uninserted_nodes_if_only_root_nodes_remain(uninserted_nodes: list):
+     # Root nodes were already placed in the tree during parsing, so they can be dropped here.
+     only_root_nodes = all(not _.original_parent_id for _ in uninserted_nodes)
+     if only_root_nodes:
+         uninserted_nodes.clear()
+
+
+ def validate_longest_chain(tree: dict):
+     longest_chain = {
+         "x99902827_x99902971": False,
+         "x99903372_x99903533": False,
+         "x99904082_x99904236": False,
+         "x99905136_x99905259_x99905369": False,
+         "x99905705_x99905844": False,
+         "x99906758_x99906904": False,
+         "x99910659_x99910771_x99910863_x99910953": False,
+         "x99911101_x99911213_x99911325_x99911452_x99911576": False,
+         "x99911641_x99911753_x99911865": False,
+         "x99913118_x99913207_x99913277": False,
+         "x99913394_x99913503_x99913595": False,
+         "x99913733_x99913822": False,
+         "x99914757_x99914847": False,
+         "x99915197": False,
+         "x99915523_x99915607": False,
+         "x99917162": False,
+         "x99917946_x99918006": False,
+         "x99919131": False,
+         "Collagen": False,
+     }
+     for sub_tree_id, sub_tree in tree.items():
+         for updated_node_id, node in sub_tree.items():
+             original_node_id = node["node_object"].original_id
+             if original_node_id in longest_chain:
+                 longest_chain[original_node_id] = True
+
+     # these are original ids, not the updated ids generated by duplication
+     longest_chain_child_to_parent = {
+         "x99902827_x99902971": "x99903372_x99903533",
+         "x99903372_x99903533": "x99904082_x99904236",
+         "x99904082_x99904236": "x99905136_x99905259_x99905369",
+         "x99905136_x99905259_x99905369": "x99905705_x99905844",
+         "x99905705_x99905844": "x99906758_x99906904",
+         "x99906758_x99906904": "x99910659_x99910771_x99910863_x99910953",
+         "x99910659_x99910771_x99910863_x99910953": "x99911101_x99911213_x99911325_x99911452_x99911576",
+         "x99911101_x99911213_x99911325_x99911452_x99911576": "x99911641_x99911753_x99911865",
+         "x99911641_x99911753_x99911865": "x99913118_x99913207_x99913277",
+         "x99913118_x99913207_x99913277": "x99913394_x99913503_x99913595",
+         "x99913394_x99913503_x99913595": "x99913733_x99913822",
+         "x99913733_x99913822": "x99914757_x99914847",
+         "x99914757_x99914847": "x99915197",
+         "x99915197": "x99915523_x99915607",
+         "x99915523_x99915607": "x99917162",
+         "x99917162": "x99917946_x99918006",
+         "x99917946_x99918006": "x99919131",
+         "x99919131": "Collagen",
+         "Collagen": "",
+     }
+
+     # report chain nodes that never made it into the tree
+     missing = [node_id for node_id, found in longest_chain.items() if not found]
+     if missing:
+         print(f"Longest chain nodes missing from tree: {missing}")
+
+     # report inserted chain nodes whose parent differs from the expected chain parent
+     for sub_tree in tree.values():
+         for node in sub_tree.values():
+             node_object = node["node_object"]
+             expected_parent = longest_chain_child_to_parent.get(node_object.original_id)
+             if expected_parent is not None and node_object.original_parent_id != expected_parent:
+                 print(f"Unexpected parent for {node_object.original_id}: {node_object.original_parent_id}")
+
+
def normalize_tree_counts_from_float_to_int(tree: dict = None):
    max_val = find_max(tree)
    for main_node in tree: