import numpy as np
import pandas
import os
import sys
import itertools
import re
#the general thought process for the bin transformation is that we:
#add an inchikey
#remove entries from the species/organ/intensity/count hierarchy based on count (we thought about
# leaving count in and doing all analyses at the end, but a low count paired with an oddly high value does
# the opposite of a zero value for that analyte)
#later on, if desired, add a class based on an ML technique
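#for reference, each row ("bin") in the incoming pickle is expected to carry, besides
#identifier columns (id, _id, group, name, inchikey), parallel per-(species, organ) lists
#of equal length: species, organ, count, total_intensity, median_intensity,
#annotation_distribution, special_property_list, and percent_present
#(this schema is inferred from the accesses below; treat it as a sketch, not a guarantee)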
def drop_repeats_in_group(temp_panda):
    '''
    there is a column called 'group' which basically accounts for
    when the same compound is represented by multiple spectra (such as coeluting peaks)
    this function drops those duplicate rows
    it was noticed that the sunburst diagram info was the same for all bins with the same group
    so first we double-check this on our own
    update 2-5-2022 plb
    we basically don't get info from binvestigate anymore, now we get info directly from carrot
    carrot does not support groups at the moment
    we will have to ignore them for now and perhaps revisit later
    '''
    #count how many times each group value appears
    group_value_counts=temp_panda['group'].value_counts()
    #check that all rows in a group are exact duplicates (except for inchikey and other "names")
    #so basically across all rows, the intensity, organ, and species are the same
    for index, value in group_value_counts.items():
        #we can't remove dupes if the group name appears only once
        if value == 1:
            continue
        #we don't want to remove things that don't have a group
        if pandas.isnull(index):
            continue
        #in this block we confirm that for each group, all [organ, species, intensity] are the same
        #which we expect because the sunburst diagrams are all the same
        zeroth_index=temp_panda.loc[temp_panda['group'] == index].index[0]
        non_zeroth_indices=temp_panda.loc[temp_panda['group'] == index].index[1:]
        for subset_index, subset_series in temp_panda.loc[temp_panda['group'] == index].iterrows():
            if subset_series['organ'] != temp_panda.loc[temp_panda['group']==index].loc[zeroth_index,'organ']:
                hold=input('there is a difference amongst the organs')
            elif subset_series['species'] != temp_panda.loc[temp_panda['group']==index].loc[zeroth_index,'species']:
                hold=input('there is a difference amongst the species')
            elif subset_series['median_intensity'] != temp_panda.loc[temp_panda['group']==index].loc[zeroth_index,'median_intensity']:
                print(index)
                print(value)
                print(subset_series)
                hold=input('there is a difference amongst the median intensities')
            elif subset_series['total_intensity'] != temp_panda.loc[temp_panda['group']==index].loc[zeroth_index,'total_intensity']:
                hold=input('there is a difference amongst the total intensities')
        #for each group, after confirming the rows are the same, we drop all but the first
        temp_panda.drop(labels=non_zeroth_indices,axis='index',inplace=True)
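#a minimal sketch of the intended effect, with hypothetical rows: if rows 10 and 11
#share group 'A' and identical species/organ/intensity lists, row 11 is dropped and
#only row 10 survives; rows with a null group are always left alone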
def print_all_bin_identifiers(temp_panda):
    '''
    for each bin there are at least 5 types of identifiers:
    id, _id, group, name, and inchikey
    any of those 5 can contribute to a new column, inchikey_final (people think about them when mapping)
    this function prints a row if at least one of those fields contains alphabetic characters
    that is, the 'group' or '_id' or 'inchikey' might have content
    but not the others
    '''
    temp_panda.fillna(value='@@@@@@@',inplace=True)
    '''
    note:
    using the following code, we see that id 126322 (and 126323 if desired) is basically a large space
    therefore, in the curation .tsv, we set the curated inchikey to @@@@@@@ (although we may update that)
    print(temp_panda.loc[temp_panda['id']==126322][['id','inchikey']])
    print('probing test----'+temp_panda.at[5390,'inchikey']+'!!!!')
    hold=input('check na on 126322 row')
    '''
    my_mask=list()
    for temp_index, temp_series in temp_panda.iterrows():
        my_mask.append(any([
            any(c.isalpha() for c in str(temp_series['_id'])),
            any(c.isalpha() for c in str(temp_series['group'])),
            any(c.isalpha() for c in str(temp_series['name'])),
            any(c.isalpha() for c in str(temp_series['inchikey']))
        ]))
    print('id¬_id¬group¬name¬inchikey')
    for index,series in temp_panda.loc[my_mask].iterrows():
        print(str(series['id'])+'¬'+str(series['_id'])+'¬'+str(series['group'])+'¬'+str(series['name'])+'¬'+str(series['inchikey']))
def update_inchikey_from_mapping(temp_panda, temp_inchikey_mapping_address):
    '''
    here we add a column for curated inchikey results because some bins have names but no inchikeys
    we match the incoming inchikeys to the binvestigate panda using the field "id"
    '''
    temp_panda['inchikey_curated']='pre_curation_file'
    inchikey_mapping_panda=pandas.read_csv(temp_inchikey_mapping_address,sep='\t')
    for index, series in inchikey_mapping_panda.iterrows():
        temp_main_panda_index=temp_panda.index[temp_panda['id']==series['id']]
        #.loc rather than .at because the matching index is an Index object, not a scalar label
        temp_panda.loc[temp_main_panda_index,'inchikey_curated']=series['inchikey_curated']
def divide_total_intensities_by_count(temp_panda):
    '''
    when calling the binvestigate rest api, the intensities are aggregates, whereas the GUI renderings are averages
    so we must make this conversion manually
    '''
    for index,series in temp_panda.iterrows():
        new_intensities=list(np.divide(np.array(series['total_intensity']),np.array(series['count'])))
        temp_panda.at[index,'total_intensity']=new_intensities
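#a minimal illustration with hypothetical values: a bin with total_intensity [10.0, 9.0]
#and count [2, 3] ends up with total_intensity [5.0, 3.0], i.e. per-sample averages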
def transform_count_column(temp_bin_panda,temp_count_cutoff):
    '''
    like the species and the organs, we remove entries from the quadruplet (species, organ, intensity, count)
    note that there are no transforms in this one, only removals
    the threshold is the hyperparameter count_cutoff
    '''
    for bin_index, bin_series in temp_bin_panda.iterrows():
        print(bin_index)
        indices_to_drop=[i for i in range(0,len(bin_series['count'])) if bin_series['count'][i] < temp_count_cutoff]
        species_list_with_indices_removed=list(np.delete(bin_series['species'],indices_to_drop))
        organ_list_with_indices_removed=list(np.delete(bin_series['organ'],indices_to_drop))
        count_list_with_indices_removed=list(np.delete(bin_series['count'],indices_to_drop))
        special_property_list_with_indices_removed=list(np.delete(bin_series['special_property_list'],indices_to_drop))
        total_intensity_list_with_indices_removed=list(np.delete(bin_series['total_intensity'],indices_to_drop))
        median_intensity_list_with_indices_removed=list(np.delete(bin_series['median_intensity'],indices_to_drop))
        #without the typecast to dtype=object we get:
        #VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated.
        annotation_distribution_list_with_indices_removed=list(np.delete(np.array(bin_series['annotation_distribution'],dtype=object),indices_to_drop))
        percent_present_list_with_indices_removed=list(np.delete(bin_series['percent_present'],indices_to_drop))
        temp_bin_panda.at[bin_index,'species']=species_list_with_indices_removed
        temp_bin_panda.at[bin_index,'organ']=organ_list_with_indices_removed
        temp_bin_panda.at[bin_index,'count']=count_list_with_indices_removed
        temp_bin_panda.at[bin_index,'special_property_list']=special_property_list_with_indices_removed
        temp_bin_panda.at[bin_index,'total_intensity']=total_intensity_list_with_indices_removed
        temp_bin_panda.at[bin_index,'median_intensity']=median_intensity_list_with_indices_removed
        temp_bin_panda.at[bin_index,'annotation_distribution']=annotation_distribution_list_with_indices_removed
        temp_bin_panda.at[bin_index,'percent_present']=percent_present_list_with_indices_removed
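#a minimal illustration with hypothetical values: with temp_count_cutoff=3, a bin whose
#count list is [1, 5, 2] keeps only index 1, so species ['a','b','c'] becomes ['b']
#and every other parallel list shrinks the same way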
def transform_intensity_column(temp_bin_panda):
    '''
    sometimes the binvestigate rest calls provided species/organs with intensities of zero
    this causes nonsense runtime/divide-by-zero errors
    '''
    for bin_index, bin_series in temp_bin_panda.iterrows():
        indices_to_drop=[i for i in range(0,len(bin_series['intensity'])) if bin_series['intensity'][i] == 0.0]
        species_list_with_indices_removed=list(np.delete(bin_series['species'],indices_to_drop))
        organ_list_with_indices_removed=list(np.delete(bin_series['organ'],indices_to_drop))
        intensity_list_with_indices_removed=list(np.delete(bin_series['intensity'],indices_to_drop))
        count_list_with_indices_removed=list(np.delete(bin_series['count'],indices_to_drop))
        special_property_list_with_indices_removed=list(np.delete(bin_series['special_property_list'],indices_to_drop))
        temp_bin_panda.at[bin_index,'species']=species_list_with_indices_removed
        temp_bin_panda.at[bin_index,'organ']=organ_list_with_indices_removed
        temp_bin_panda.at[bin_index,'intensity']=intensity_list_with_indices_removed
        temp_bin_panda.at[bin_index,'count']=count_list_with_indices_removed
        temp_bin_panda.at[bin_index,'special_property_list']=special_property_list_with_indices_removed
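#a minimal illustration with hypothetical values: a bin with intensity [0.0, 7.5] keeps
#only index 1 across species, organ, intensity, count, and special_property_list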
def get_list_of_redundancies(temp_organ_list,temp_species_list,temp_properties_list):
    '''
    zip the organs, species, and special properties together
    make a dict where the keys are the set of (organ, species, property) triplets
    values are lists of indices
    every time we encounter a triplet, we record its index
    then, the triplets with dupes are those whose value list has len > 1, and we have their indices
    '''
    key_set=set(zip(temp_organ_list,temp_species_list,temp_properties_list))
    redundancy_dict={i:[] for i in key_set}
    for i,temp_triplet in enumerate(zip(temp_organ_list,temp_species_list,temp_properties_list)):
        redundancy_dict[temp_triplet].append(i)
    return redundancy_dict
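#a minimal illustration with hypothetical values:
#get_list_of_redundancies(['liver','liver','leaf'],['rattus','rattus','zea'],['x','x','y'])
#returns {('liver','rattus','x'): [0, 1], ('leaf','zea','y'): [2]}
#any key whose index list is longer than 1 marks a redundancy to aggregate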
def aggregate_redundancies(temp_panda):
    '''
    the purpose of this function is to aggregate redundancies after the transforms
    redundancies are multiple intensity values that have the same organ and same species (and the same special property)
    this can occur, for example, after rattus+rattisimus,liver and rattus,liver both become rattus,liver
    we search for all of these and aggregate them
    '''
    for index, series in temp_panda.iterrows():
        print(index)
        redundancy_dict=get_list_of_redundancies(series['organ'],series['species'],series['special_property_list'])
        temp_species_list=list()
        temp_organ_list=list()
        temp_count_list=list()
        temp_median_intensity_list=list()
        temp_total_intensity_list=list()
        temp_annotation_distribution_list=list()
        temp_special_property_list=list()
        temp_percent_present_list=list()
        for temp_triplet in redundancy_dict.keys():
            #if there is only one occurrence of a species,organ,property triplet, keep it as-is
            if len(redundancy_dict[temp_triplet])==1:
                temp_species_list.append(series['species'][redundancy_dict[temp_triplet][0]])
                temp_organ_list.append(series['organ'][redundancy_dict[temp_triplet][0]])
                temp_count_list.append(series['count'][redundancy_dict[temp_triplet][0]])
                temp_median_intensity_list.append(series['median_intensity'][redundancy_dict[temp_triplet][0]])
                temp_total_intensity_list.append(series['total_intensity'][redundancy_dict[temp_triplet][0]])
                temp_annotation_distribution_list.append(series['annotation_distribution'][redundancy_dict[temp_triplet][0]])
                temp_special_property_list.append(series['special_property_list'][redundancy_dict[temp_triplet][0]])
                temp_percent_present_list.append(series['percent_present'][redundancy_dict[temp_triplet][0]])
                continue
            elif len(redundancy_dict[temp_triplet])>1:
                #we can directly use the aggregate sum because this script later divides
                #the total intensity by the count, so we don't need to do a weighted average
                #plb 2-5-2022
                #the above doesn't fully make sense, but the key is that we do what we
                #should for multiple occurrences of the same species/organ/property triplet:
                #sum the total intensity
                #recalculate the median intensity from the combined annotation distribution
                #append the annotation distributions' "sub"lists
                temp_counts=np.array([series['count'][i] for i in redundancy_dict[temp_triplet]])
                aggregate_count=sum(temp_counts)
                aggregate_intensity=sum([series['total_intensity'][i] for i in redundancy_dict[temp_triplet]])
                #for the annotation distribution, first make a list of the sublists, then "flatten" it
                aggregate_annotation_distribution=[series['annotation_distribution'][i] for i in redundancy_dict[temp_triplet]]
                aggregate_annotation_distribution=list(itertools.chain.from_iterable(aggregate_annotation_distribution))
                #for the median, we take the numpy median of the new aggregate distribution
                aggregate_median_intensity=np.median(aggregate_annotation_distribution)
                #percent_present is aggregated as a count-weighted average
                temp_percents_present=np.array([series['percent_present'][i] for i in redundancy_dict[temp_triplet]])
                new_percent_present=sum((temp_percents_present*temp_counts))/aggregate_count
                temp_species_list.append(temp_triplet[1])
                temp_organ_list.append(temp_triplet[0])
                temp_special_property_list.append(temp_triplet[2])
                temp_count_list.append(aggregate_count)
                temp_total_intensity_list.append(aggregate_intensity)
                temp_median_intensity_list.append(aggregate_median_intensity)
                temp_annotation_distribution_list.append(aggregate_annotation_distribution)
                temp_percent_present_list.append(new_percent_present)
        temp_panda.at[index,'species']=temp_species_list
        temp_panda.at[index,'organ']=temp_organ_list
        temp_panda.at[index,'count']=temp_count_list
        temp_panda.at[index,'total_intensity']=temp_total_intensity_list
        temp_panda.at[index,'special_property_list']=temp_special_property_list
        temp_panda.at[index,'median_intensity']=temp_median_intensity_list
        temp_panda.at[index,'annotation_distribution']=temp_annotation_distribution_list
        temp_panda.at[index,'percent_present']=temp_percent_present_list
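#a minimal illustration with hypothetical values: two redundant entries with counts [2, 3],
#total intensities [10., 20.], and percents present [50., 100.] collapse to count 5,
#total intensity 30., and a count-weighted percent present of (2*50.+3*100.)/5 = 80.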
if __name__ == "__main__":
    #usage: python transform_bins.py <min_fold_change> <count_cutoff>
    min_fold_change=sys.argv[1]
    count_cutoff=int(sys.argv[2])
    inchikey_mapping_address='../resources/species_organ_maps/inchikey_mapping.txt'
    os.system('mkdir -p ../results/'+str(min_fold_change)+'/step_3_bins_transformed/')
    os.system('touch ../results/'+str(min_fold_change)+'/step_3_bins_transformed/dummy.txt')
    pipeline_input_panda_directory='../results/'+str(min_fold_change)+'/step_2b_organ_transformed/'
    pipeline_output_directory='../results/'+str(min_fold_change)+'/step_3_bins_transformed/'
    file_list=os.listdir(pipeline_input_panda_directory)
    file_list.remove('dummy.txt')
    for temp_file in file_list:
        #read in the initial panda
        initial_panda=pandas.read_pickle(pipeline_input_panda_directory+temp_file)
        #there is a column called 'group' which basically accounts for
        #when the same compound is represented by multiple spectra (such as coeluting peaks)
        #this drops those duplicates
        drop_repeats_in_group(initial_panda)
        #just like the species and organs, we need to create a list of things that we transform
        #in this case, the transformation is of the inchikeys
        #update 2-5-2022 plb
        #we aren't going to get the inchikeys from identifiers; we are going to get a custom map from oliver
        #that goes into the beginning input panda that we build from carrot
        #we instead update with this single line from that method
        ##print_all_bin_identifiers(initial_panda)
        initial_panda.fillna(value='@@@@@@@',inplace=True)
        #one of the provided inchikeys is a bunch of spaces ' '
        initial_panda.replace(
            to_replace={
                ' ':'@@@@@@@',
                'HUYHGEVLJRTFIB-CRSCSNDXSA-N':'@@@@@@@',
                'CLDHNWUVRZNQBD-GSVOUGTGSA-N':'@@@@@@@',
                'ADGFNXYXROZUOG-VKHMYHEASA-N':'@@@@@@@'
            },
            inplace=True
        )
        #just like the species and organs, we update the inchikeys based on some external txt
        #update 2-5-2022 plb
        #we simplified the formatting of the input file to just 3 columns: id, inchikey, and inchikey_curated
        ##update 7-5-2022 plb
        ##the inchikey mapping does literally nothing.
        ##update_inchikey_from_mapping(initial_panda,inchikey_mapping_address)
        ##print(initial_panda)
        ##hold=input('hold')
        #sometimes the binvestigate rest calls provided species/organs with intensities of zero
        #this causes nonsense runtime/divide-by-zero errors
        #update 2-5-2022 plb
        #since we get data from carrot, at this point there should be no intensities of zero
        #so we do not do this, as a way to avoid surprising results
        #transform_intensity_column(initial_panda)
        #the purpose of this function is to aggregate redundancies after the transforms
        #redundancies are multiple intensity values that have the same organ and same species
        #this can occur, for example, after rattus+rattisimus,liver and rattus,liver both become rattus,liver
        #we search for all of these and aggregate them
        #update 2-5-2022 plb
        #redid this function
        aggregate_redundancies(initial_panda)
        #like the species and the organs, we remove entries from the quadruplet (species, organ, intensity, count)
        #if there are fewer samples than the cutoff that we specify (we can't trust numbers from 1 sample)
        #note that there are no transforms in this one, only removals
        #we assign count_cutoff to be zero to make it so that all triplets are kept
        #update 2-5-2022 plb
        #since we get data from carrot, at this point there should be no counts of zero
        #so we do not do this, as a way to avoid surprising results
        #update 2-7-2022 plb
        #we actually re-added this because the statistical tests need a minimum number of samples
        #to be valid
        #in order to preserve what we can, we do the drop after we aggregate, so this was moved to below
        #aggregate_redundancies
        transform_count_column(initial_panda,count_cutoff)
        #when calling the binvestigate rest api, the intensities are aggregates, whereas the GUI renderings are averages
        #so we must make this conversion manually
        #update 2-5-2022 plb
        #we only divide the total intensities
        #leave the median intensity alone
        divide_total_intensities_by_count(initial_panda)
        #########################
        #later, we may add a class from a ML algorithm
        #no. we won't. - plb 2-5-2022
        #########################
        temporary_file_integer=re.findall(r'\d+', temp_file)[0]
        initial_panda.to_pickle(pipeline_output_directory+'binvestigate_bins_transformed_'+str(temporary_file_integer)+'.bin',protocol=0)