# form_carrot_data_to_pipeline_input.py
#this program takes the output from carrot and transforms it into something pretty close to what we got from the original
#binvestigate pull.
#the basic idea is to obtain the total list of compounds and the total list of (species, organ) pairs
#then we check the soc annotation-only files for each (species, organ, compound) combination
#if the file is found we impute what's missing and add to the input panda
#if the file isn't found we impute the entire list using the noise level
#somewhat unfortunately, in retrospect, the information is stored in "parallel lists" per column for each bin
#so the lengths of species, organ, count, median_intensity, annotation_distribution, etc. must all be equal
import sys
import pandas as pd
import numpy as np
import os
import random
def get_compound_properties(compound_properties_panda):
    #keep only CONFIRMED targets and collect the unique (bin id, compound name) pairs
    unique_bin_and_name = set(zip(
        compound_properties_panda.loc[compound_properties_panda.target_type == 'CONFIRMED'].target_id.astype(int),
        compound_properties_panda.loc[compound_properties_panda.target_type == 'CONFIRMED'].name
    ))
    return list(unique_bin_and_name)
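#illustrative only (rows invented): given a compound_properties_panda of
#   target_type    target_id   name
#   'CONFIRMED'    4           'valine'
#   'CONFIRMED'    4           'valine'
#   'UNCONFIRMED'  9           'urea'
#get_compound_properties returns [(4, 'valine')]: duplicates collapse via the set
#and non-CONFIRMED rows are filtered out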
def get_all_species_organs():
    #placeholder; not called anywhere in this script
    pass
def make_empty_input_panda(pipeline_input_panda_columns, bin_and_name_list):
    #start every column as NaN, then seed id/name and give the per-(species, organ)
    #columns an empty list per bin so they can be appended to in parallel
    temp_dict_of_list = {
        temp_column: [np.nan for i in range(len(bin_and_name_list))] for temp_column in pipeline_input_panda_columns
    }
    temp_dict_of_list['id'] = [temp[0] for temp in bin_and_name_list]
    temp_dict_of_list['name'] = [temp[1] for temp in bin_and_name_list]
    temp_dict_of_list['species'] = [list() for temp in bin_and_name_list]
    temp_dict_of_list['organ'] = [list() for temp in bin_and_name_list]
    temp_dict_of_list['median_intensity'] = [list() for temp in bin_and_name_list]
    temp_dict_of_list['annotation_distribution'] = [list() for temp in bin_and_name_list]
    temp_dict_of_list['count'] = [list() for temp in bin_and_name_list]
    temp_dict_of_list['total_intensity'] = [list() for temp in bin_and_name_list]
    temp_dict_of_list['percent_present'] = [list() for temp in bin_and_name_list]
    pipeline_input_panda = pd.DataFrame.from_dict(temp_dict_of_list)
    return pipeline_input_panda
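#illustrative only: with bin_and_name_list=[(4, 'valine')] this yields a single-row
#frame with id=4, name='valine', empty lists in the per-(species, organ) columns,
#and NaN in the remaining columns ('group', 'inchikey'), which this script never fills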
def insert_combination(species, organ, compound):
    #placeholder; the insertion logic lives in insert_combination_wrapper instead
    pass
def impute_annotations_at_least_one_found(temp_annotations_list, temp_so_count, temp_average_fame_intensity):
    #note: temp_average_fame_intensity is accepted but not used in this branch
    percent_present = len(temp_annotations_list) / temp_so_count
    imputed_value_base = percent_present * min(temp_annotations_list)
    imputed_value_sigma = 0.05 * imputed_value_base
    return percent_present, temp_annotations_list + [
        (imputed_value_base + random.gauss(0, imputed_value_sigma)) for i in range(temp_so_count - len(temp_annotations_list))
    ]
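#illustrative only (numbers invented): with temp_annotations_list=[2.0, 4.0, 6.0]
#and temp_so_count=10, percent_present=3/10=0.3, imputed_value_base=0.3*2.0=0.6,
#imputed_value_sigma=0.03, and the 7 missing samples are drawn from N(0.6, 0.03)
#and appended to the 3 observed values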
def impute_annotations_zero_found(temp_average_fame_intensity, temp_so_count):
    #what we impute results from normalizing the imputed value by the average fame intensity:
    #   (noise_intensity / mid_fame_intensity) = (imputed / avg fame intensity)
    #note that temp_average_fame_intensity cancels out and goes unused, so the result
    #is invariant to the fame, which is what we want: all 0%-present annotations should
    #have basically the same normalized intensities
    #noise_intensity, gauss_sigma, and mid_fame_intensity are set in the __main__ block below
    return [((noise_intensity + random.gauss(0, gauss_sigma)) / mid_fame_intensity) for i in range(temp_so_count)]
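#illustrative only: with noise_intensity=1, gauss_sigma=0.05, and mid_fame_intensity=1e6
#(the values set below), every imputed entry is (1 + N(0, 0.05)) / 1e6, i.e. roughly 1e-6,
#regardless of whether the (species, organ)'s real average fame intensity is 5e5 or 2e6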
def insert_combination_wrapper(pipeline_input_panda, bin_and_name_list, species_organ_pair_list, data_base_address, species_organ_properties_panda, so_to_skip_set):
    #note: so_to_skip_set is passed in but not consulted in this function as written
    bins_to_remove = list()
    print('inside combination wrapper')
    for temp_counter, temp_bin_and_name in enumerate(bin_and_name_list):
        print(f'we are on directory {temp_counter}')
        print('some row in the input panda')
        print(temp_bin_and_name)
        #we add this to avoid checking groups (empty directories mark groups without
        #an associated bin; see the note in __main__ below)
        if len(os.listdir(data_base_address + str(temp_bin_and_name[0]))) == 0:
            print(data_base_address + str(temp_bin_and_name[0]))
            print('the length was zero')
            bins_to_remove.append(temp_bin_and_name[0])
            continue
        for temp_species_organ_pair in species_organ_pair_list:
            this_species_organ_count = species_organ_properties_panda.loc[
                (species_organ_properties_panda.species == temp_species_organ_pair[0]) &
                (species_organ_properties_panda.organ == temp_species_organ_pair[1])
            ]['count'].item()
            #looked up once here rather than repeated in each branch below
            this_species_organ_average_fame_intensity = species_organ_properties_panda.loc[
                (species_organ_properties_panda.species == temp_species_organ_pair[0]) &
                (species_organ_properties_panda.organ == temp_species_organ_pair[1])
            ]['average_fame_intensity'].item()
            try:
                temp_panda = pd.read_pickle(data_base_address + str(temp_bin_and_name[0]) + '/' + temp_species_organ_pair[0] + '¬' + temp_species_organ_pair[1] + '¬' + str(temp_bin_and_name[0]) + '.bin')
                temp_panda = temp_panda.to_frame()
                #there is a special case where the annotation is made but the value is still zero
                #why this occurs is a binbase issue
                #if this is the case, then we resort to the noise imputation strategy
                if temp_panda.normalized_intensity.sum() == 0:
                    temp_imputed_annotations_list = impute_annotations_zero_found(this_species_organ_average_fame_intensity, this_species_organ_count)
                    temp_percent_present = 0
                else:
                    #in general, the lists can contain some annotations with a magnitude of zero
                    #the "all zero" strategy is different, so we keep it by itself above
                    temp_panda.drop(labels=temp_panda.index[temp_panda.normalized_intensity == 0], axis='index', inplace=True)
                    temp_panda.reset_index(inplace=True, drop=True)
                    temp_percent_present, temp_imputed_annotations_list = impute_annotations_at_least_one_found(temp_panda['normalized_intensity'].to_list(), this_species_organ_count, this_species_organ_average_fame_intensity)
            except FileNotFoundError:
                #no file at all for this (species, organ, bin): impute everything from noise
                temp_imputed_annotations_list = impute_annotations_zero_found(this_species_organ_average_fame_intensity, this_species_organ_count)
                temp_percent_present = 0
            pipeline_input_panda.at[temp_bin_and_name[0], 'species'].append(temp_species_organ_pair[0])
            pipeline_input_panda.at[temp_bin_and_name[0], 'organ'].append(temp_species_organ_pair[1])
            pipeline_input_panda.at[temp_bin_and_name[0], 'count'].append(this_species_organ_count)
            pipeline_input_panda.at[temp_bin_and_name[0], 'median_intensity'].append(np.median(temp_imputed_annotations_list))
            pipeline_input_panda.at[temp_bin_and_name[0], 'annotation_distribution'].append(temp_imputed_annotations_list)
            pipeline_input_panda.at[temp_bin_and_name[0], 'total_intensity'].append(sum(temp_imputed_annotations_list))
            pipeline_input_panda.at[temp_bin_and_name[0], 'percent_present'].append(temp_percent_present)
    return pipeline_input_panda, bins_to_remove
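#illustrative only (values invented): after insert_combination_wrapper, one row of
#pipeline_input_panda looks roughly like
#   id=4, name='valine', species=['species_a', 'species_b'], organ=['organ_a', 'organ_b'],
#   count=[10, 25], median_intensity=[0.6, 1e-06], annotation_distribution=[[...], [...]],
#   total_intensity=[...], percent_present=[0.3, 0]
#where index k in every list refers to the same (species, organ) pair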
if __name__ == "__main__":
    min_fold_change = sys.argv[1]
    #output_pickle_address='../results/'+str(min_fold_change)+'/step_0_b_shape_aws_pull_to_pipeline_input/binvestigate_species_transformed.bin'
    os.system('mkdir -p ../results/'+str(min_fold_change)+'/step_0_b_shape_aws_pull_to_pipeline_input/')
    os.system('touch ../results/'+str(min_fold_change)+'/step_0_b_shape_aws_pull_to_pipeline_input/dummy.txt')
    ##when looking at the fames it was discovered that a number of metadata had "bad fame appearance frequency"
    #that is, for whatever reason, the fames only appeared about 50 percent of the time
    #these are the things in the "elbow plot"
    #this file tells us how to skip that systematic bias
    so_to_skip_csv_address = '../resources/pull_from_carrot/intermediates/so_to_skip.csv'
    noise_intensity = 1
    gauss_sigma = 0.05
    #if we don't normalize, we will create artifacts based on the variations in fame intensity
    #histogram the average fame intensities to see how we got 1e6
    #so the imputed value for none found satisfies (noise_intensity/1e6)=(imputed/avg fame intensity)
    #the values of 1 and 1e6 were chosen by looking at intensities less than 1e6 for bin 4 (valine?)
    #200/1e6, or 1e4, was too big of a signal
    #when we plotted the median intensities of urea, biphenyl, and hexose, 1e6 looked too big,
    #so we instead went with 1e13 for a while
    #then we saw that the total intensity divided by the count didn't have this bimodality, and that 1e6
    #was appropriate, so we are going to get rid of median
    #(before, we were erroneously dividing the already-normalized imputed value by the fame intensity, but this was unnecessary)
    #everything seems good now
    mid_fame_intensity = 1e6
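    #worked instance of the relation above (avg fame intensity invented for illustration):
    #for a (species, organ) with average fame intensity 2e6, the raw imputed value would be
    #imputed = (noise_intensity / mid_fame_intensity) * 2e6 = (1 / 1e6) * 2e6 = 2,
    #and after normalizing by the fame it is back to 1 / 1e6 = 1e-6, the same for every organ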
    ##data_base_address='../results/'+str(min_fold_change)+'/step_0_a_pull_distributions_from_aws/soc_data/'
    ##we are not making the pull from carrot part of the chain anymore
    data_base_address = '../resources/pull_from_carrot/intermediates/soc_data/'
    pipeline_input_panda_columns = ['id','name','species','organ','count','total_intensity','median_intensity','group','inchikey','annotation_distribution','percent_present']
    compound_properties_panda_address = '../resources/pull_from_carrot/intermediates/all_species_organs_compounds_panda_2.tsv'
    species_organ_properties_panda_address = '../resources/pull_from_carrot/original_from_carrot_input/species_organ_sample_count_and_average_fame.tsv'
    output_address = '../results/'+str(min_fold_change)+'/step_0_b_shape_aws_pull_to_pipeline_input/pipeline_input'
    so_to_skip_panda = pd.read_csv(so_to_skip_csv_address, sep='¬')
    so_to_skip_set = set(zip(so_to_skip_panda['species'], so_to_skip_panda['organ']))
    compound_properties_panda = pd.read_csv(compound_properties_panda_address, sep='\t')
    species_organ_properties_panda = pd.read_csv(species_organ_properties_panda_address, sep='\t')
    bin_and_name_list_full = get_compound_properties(compound_properties_panda)
    bin_and_name_list_full = sorted(bin_and_name_list_full)
    print(bin_and_name_list_full)
    print('---------------------')
    #split into 500-bin chunks
    bin_and_name_list_list = [bin_and_name_list_full[x:x+500] for x in range(0, len(bin_and_name_list_full), 500)]
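    #e.g. 1234 confirmed bins become chunks of 500, 500, and 234, written out by the
    #loop below as pipeline_input_0.bin, pipeline_input_1.bin, and pipeline_input_2.bin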
    for bin_and_name_list_counter, bin_and_name_list in enumerate(bin_and_name_list_list):
        species_organ_pair_list = list(zip(species_organ_properties_panda.species, species_organ_properties_panda.organ))
        pipeline_input_panda = make_empty_input_panda(pipeline_input_panda_columns, bin_and_name_list)
        #basically, we discovered (see notebook step 3) that some groups don't have a bin associated with the group name
        #therefore, we made some directories
        pipeline_input_panda.set_index(keys='id', drop=False, inplace=True)
        pipeline_input_panda, bins_to_remove = insert_combination_wrapper(pipeline_input_panda, bin_and_name_list, species_organ_pair_list, data_base_address, species_organ_properties_panda, so_to_skip_set)
        pipeline_input_panda.reset_index(inplace=True, drop=True)
        print(pipeline_input_panda)
        pipeline_input_panda = pipeline_input_panda.loc[~pipeline_input_panda.id.isin(bins_to_remove), :]
        print(pipeline_input_panda)
        pipeline_input_panda.to_pickle(output_address+'_'+str(bin_and_name_list_counter)+'.bin')