perform_compound_hierarchical_analysis.py
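'''
Step 8 of the pipeline: propagate each bin's fold-change and significance
matrices up the ClassyFire compound hierarchy, writing one pickled matrix
per node and per matrix type.

Usage, inferred from the sys.argv parsing in the main block (the argument
names are descriptive labels, not the author's):

    python perform_compound_hierarchical_analysis.py <min_fold_change> <cores_available>
'''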
import sys
import os

import networkx as nx
import numpy as np
import pandas as pd

def write_each_compound_fold_change_matrix_to_file(temp_nx,temp_address_base, temp_matrix):
    '''
    Write the matrix stored under the node attribute `temp_matrix` to one
    pickle file per node of the compound network.
    '''
    #if you get here then trash the directory that existed previously and recreate it
    #(relies on the trash-cli `trash` command being available)
    total_address=temp_address_base+temp_matrix+'/'
    os.system('trash '+total_address)
    os.system('mkdir '+total_address)
    #traverse the entire compound network
    for temp_node in temp_nx.nodes:
        temp_nx.nodes[temp_node][temp_matrix].to_pickle(total_address+str(temp_node)+'.bin')

def write_each_bin_to_file(binvestigate_panda,temp_address_base):
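    '''
    Write each statistic DataFrame of every bin (row) in `binvestigate_panda`
    to its own pickle file, named after the bin's `id`, in the matching
    per-matrix directory under `temp_address_base`.
    '''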
    matrices_to_compute=[
        'fold_change_matrix_average',
        'fold_change_matrix_median',
        'signifigance_matrix_mannwhitney',
        'signifigance_matrix_welch'
    ]
    #note: the 'signifigance' spelling is kept as-is because it matches the
    #directory and column names used throughout the rest of the pipeline
    binvestigate_panda_column_names=[
        'fold_change_total_intensity',
        'fold_change_median_intensity',
        'signifigance_mannwhitney',
        'signifigance_welch'
    ]
    for index,series in binvestigate_panda.iterrows():
        print('printing bin '+str(series['id'])+' iteration number '+str(index))
        #each column of the row holds a DataFrame; pickle it into the matching directory
        for i in range(len(matrices_to_compute)):
            total_address=temp_address_base+matrices_to_compute[i]+'/'
            series[binvestigate_panda_column_names[i]].to_pickle(total_address+str(series['id'])+'.bin')

def bfs_layers(G, sources):
"""Returns an iterator of all the layers in breadth-first search traversal.
Parameters
----------
G : NetworkX graph
A graph over which to find the layers using breadth-first search.
sources : node in `G` or list of nodes in `G`
Specify starting nodes for single source or multiple sources breadth-first search
Yields
------
layer: list of nodes
Yields list of nodes at the same distance from sources
Examples
--------
>>> G = nx.path_graph(5)
>>> dict(enumerate(nx.bfs_layers(G, [0, 4])))
{0: [0, 4], 1: [1, 3], 2: [2]}
>>> H = nx.Graph()
>>> H.add_edges_from([(0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (2, 6)])
>>> dict(enumerate(nx.bfs_layers(H, [1])))
{0: [1], 1: [0, 3, 4], 2: [2], 3: [5, 6]}
>>> dict(enumerate(nx.bfs_layers(H, [1, 6])))
{0: [1, 6], 1: [0, 3, 4, 2], 2: [5]}
"""
#this is in here like this because its in a newer version of nx
if sources in G:
sources = [sources]
current_layer = list(sources)
visited = set(sources)
for source in current_layer:
if source not in G:
raise nx.NetworkXError(f"The node {source} is not in the graph.")
# this is basically BFS, except that the current layer only stores the nodes at
# same distance from sources at each iteration
while current_layer:
yield current_layer
next_layer = list()
for node in current_layer:
for child in G[node]:
if child not in visited:
visited.add(child)
next_layer.append(child)
current_layer = next_layer
def compute_output_matrix_fold(temp_pandas_list):
    #stack the children's matrices, then reduce them group-wise to a single
    #consensus matrix per (organ, species, disease) index
    total_panda=pd.concat(temp_pandas_list,axis='index')
    my_groupby_min=total_panda.groupby(
        level=('organ','species','disease'),axis='index'
    ).min()
    my_groupby_max=total_panda.groupby(
        level=('organ','species','disease'),axis='index'
    ).max()
    #keep the group minimum only where every child is positive...
    intermediate_min_df=my_groupby_min.where(
        np.sign(my_groupby_min)>0,
        other=0
    )
    #...and the group maximum only where every child is negative
    intermediate_max_df=my_groupby_max.where(
        np.sign(my_groupby_max)<0,
        other=0
    )
    #combine: smallest positive if all children agree positive, largest negative
    #if all agree negative, 0 where the children's signs disagree
    temp_output=intermediate_min_df.where(
        intermediate_min_df !=0,
        other=intermediate_max_df
    )
    #zero the lower triangle, then rebuild it as the antisymmetric mirror of the
    #upper triangle so that output[i,j] == -output[j,i]
    temp_output.values[np.tril_indices(len(temp_output.index), -1)]=0
    temp_output=temp_output-temp_output.T-np.diag(np.diag(temp_output))
    np.fill_diagonal(temp_output.values, np.nan)
    return temp_output

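#a minimal sketch of the consensus rule above, with hypothetical values for one
#matrix cell across two children in the same (organ,species,disease) group:
#   children report ( 2.0,  5.0) -> parent gets  2.0 (all positive: take the min)
#   children report (-3.0, -1.0) -> parent gets -1.0 (all negative: take the max)
#   children report (-2.0,  3.0) -> parent gets  0.0 (signs disagree)
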
def compute_output_matrix_significance(temp_pandas_list):
    #stack the children's matrices and, per (organ,species,disease) group,
    #keep the group-wise maximum as the parent's value
    total_panda=pd.concat(temp_pandas_list,axis='index')
    temp_output=total_panda.groupby(
        level=('organ','species','disease'),axis='index'
    ).max()
    np.fill_diagonal(temp_output.values, np.nan)
    return temp_output

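#e.g. with hypothetical values (0.01, 0.20) from two children for the same cell,
#the parent gets 0.20: if these are p-values, a parent class is only as
#significant as its least significant child
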
if __name__ == "__main__":
    min_fold_change=sys.argv[1]
    #cores_available is accepted on the command line but not used in this script
    cores_available=int(sys.argv[2])
    os.system('mkdir -p ../results/'+str(min_fold_change)+'/step_8_perform_compound_hierarchical_analysis/all_matrices/')
    os.system('touch ../results/'+str(min_fold_change)+'/step_8_perform_compound_hierarchical_analysis/dummy.txt')
    individual_fold_matrix_directory_base='../results/'+str(min_fold_change)+'/step_8_perform_compound_hierarchical_analysis/all_matrices/'
    matrices_to_compute=[
        'fold_change_matrix_average',
        'fold_change_matrix_median',
        'signifigance_matrix_mannwhitney',
        'signifigance_matrix_welch'
    ]
    #recreate one fresh output directory per matrix type
    for temp_matrix in matrices_to_compute:
        total_address=individual_fold_matrix_directory_base+temp_matrix+'/'
        os.system('trash '+total_address)
        os.system('mkdir '+total_address)
    #write each compound's matrices to individual files
    pipeline_input_panda_directory='../results/'+str(min_fold_change)+'/step_6_b_generate_signifigance_test_matrices/'
    file_list=os.listdir(pipeline_input_panda_directory)
    file_list.remove('dummy.txt')
    for temp_file in file_list:
        temporary_input_panda=pd.read_pickle(pipeline_input_panda_directory+temp_file)
        write_each_bin_to_file(temporary_input_panda,individual_fold_matrix_directory_base)
    input_graph_address='../results/'+str(min_fold_change)+'/step_7_prepare_compound_hierarchy/classyfire_ont_with_bins_added.bin'
    #read in the compound network (read_gpickle was removed in networkx 3.0)
    compound_network=nx.readwrite.gpickle.read_gpickle(input_graph_address)
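    #a hedged fallback sketch for networkx >= 3.0, where read_gpickle no longer
    #exists; gpickle files are plain pickles, so this should load the same graph:
    #   import pickle
    #   with open(input_graph_address,'rb') as f:
    #       compound_network=pickle.load(f)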
    layered_bfs_list=list(bfs_layers(compound_network,'CHEMONTID:0000000'))
    layered_bfs_list.reverse()
    #the algorithm: go through the bottom-most layer, then the 2nd bottom-most, etc.
    #for each node in a layer, if it is a compound, skip it;
    #if it is not a compound, get only the direct descendants' names,
    #open the corresponding files in the directory set that we made above,
    #crunch the "combined" matrix, and write it out.
    #because we concern ourselves only with min/max, propagation has a "linear"
    #property: we never need the full set of descendant compounds, only the
    #direct descendants.
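    #a tiny hypothetical sketch of the traversal order: if the hierarchy were
    #   CHEMONTID:0000000 -> CHEMONTID:A -> {bin 17, bin 42}
    #the reversed BFS layers are [[bin 17, bin 42], [CHEMONTID:A], [CHEMONTID:0000000]],
    #so the bin-level matrices (written above) already exist on disk by the time
    #CHEMONTID:A combines its direct children, and so on up to the root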
    for layer in layered_bfs_list:
        print(layer)
        for node in layer:
            print(node)
            #leaf bins already have their matrices on disk; nothing to combine
            if compound_network.nodes[node]['type_of_node']=='from_binvestigate':
                continue
            else:
                temp_successor_list=list(compound_network.successors(node))
                for temp_matrix in matrices_to_compute:
                    #gather the direct children's matrices from disk
                    temp_pandas_list=list()
                    for temp_successor in temp_successor_list:
                        temp_pandas_list.append(
                            pd.read_pickle(individual_fold_matrix_directory_base+temp_matrix+'/'+str(temp_successor)+'.bin')
                        )
                    #combine them with the reduction that matches the matrix type
                    if 'fold' in temp_matrix:
                        output_panda=compute_output_matrix_fold(temp_pandas_list)
                    elif 'signifi' in temp_matrix:
                        output_panda=compute_output_matrix_significance(temp_pandas_list)
                    output_panda.to_pickle(
                        individual_fold_matrix_directory_base+temp_matrix+'/'+str(node)+'.bin'
                    )