#!/usr/bin/env python3
#-*- coding: utf-8 -*-
import argparse
import sys
import os
import requests
from ete3 import NCBITaxa
ncbi = NCBITaxa()
cognames = os.path.expanduser(os.path.join("~", "chordomics", "cognames2003-2014.tab"))
cogcats = os.path.expanduser(os.path.join("~", "chordomics", "fun2003-2014.tab"))
urlpre = "ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/"
if not os.path.exists(cognames):
print("Downloading COG Names from NCBI")
os.system("curl --url %s --output %s" % (
urlpre + "cognames2003-2014.tab",
cognames )
)
if not os.path.exists(cogcats):
print("Downloading COG Categories from NCBI")
os.system("curl --url %s --output %s" % (
urlpre + "fun2003-2014.tab",
cogcats)
)
cogcats_dict = {}
cognames_dict = {}
with open(cogcats, "r") as catin:
header = True
for line in catin:
if header:
            header = False
continue
cat, catdesc = line.strip().split("\t")
cogcats_dict[cat] = catdesc
with open(cognames, "r", encoding = "ISO-8859-1") as cogin:
""" note from README ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data
Functional classes (categories) are described in the file
fun2003-2014.tab (see 2.3). Some COGs belong to more than one
functional class; in these cases the class listed first is considered
to be primary.
"""
header = True
for line in cogin:
if header:
            header = False
continue
cog, cat, desc = line.strip().split("\t")
primary_cat = list(cat)[0]
cognames_dict[cog]={"desc":desc,
"catdesc":cogcats_dict[primary_cat]}
def get_args(): # pragma: no cover
parser = argparse.ArgumentParser(
description="Join MG-RAST datasets for use with chordomics",
add_help=False)
# contigs not needed if just re-analyzing the results
    parser.add_argument(
        "--ontology", action="store",
        help="MG-RAST ontology (COG) annotation table", required=True)
    parser.add_argument(
        "--organism", action="store",
        help="MG-RAST organism annotation table", required=True)
# parser.add_argument(
# "--nseqs", action="store",
# type=int,
# help="number of total seqs, used to calculate overall percentages; " +
# "required if API is responding slowly for getting sequence count ",
# required=False)
# parser.add_argument("--input_data", action="store",
# help="input_data file ", required=False)
parser.add_argument("--mgm_id", action="store",
help="MG-RAST ID ", required=False)
parser.add_argument("-o", "--output", action="store",
help="destination dir", required=True)
args = parser.parse_args()
return args
def read_from_url(url, nrows=10000):
    # Currently unused helper: fetch a remote text file and return at most
    # the first nrows lines of its content.
    r = requests.get(url)
    return r.text.splitlines()[:nrows]
def make_short_name(longname):
return(" ".join(longname.replace("]", "").replace("[", "").split(" ")[0:2]))
def make_org_dict(path):
org_dict = {}
header = True
with open(path, "r") as inf:
for line in skip_last(inf):
if header:
header = False
continue
if len(line.strip().split("\t")) != 4:
continue
thisid, m5nr, sequence, annotations = line.strip().split("\t")
            # we don't have a good way to perform an LCA across multiple
            # annotations, so only the first annotation for each read is used
            for annotation in annotations.split(";")[0:1]:
                annotation = make_short_name(annotation)
thisid = thisid.replace("|RefSeq", "")
if annotation not in org_dict.keys():
org_dict[annotation] = [thisid]
else:
org_dict[annotation].append(thisid)
return(org_dict)
def make_ont_dict(path):
""" return a dictionary of read: cog1;cog2 pairs
"""
ont_dict = {}
header = True
with open(path, "r") as inf:
for line in skip_last(inf):
if header:
header = False
continue
if len(line.strip().split("\t")) != 4:
continue
thisid, m5nr, sequence, annotations = line.strip().split("\t")
thisid = thisid.replace("|COG", "")
ont_dict[thisid] = [x.replace("a", "").replace("ccession=[", "").split("],")[0] for x in annotations.split("];a")]
return(ont_dict)
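# Sketch of the annotation format assumed by the parsing above (inferred from
# the string handling, not from the MG-RAST API docs): an annotations field
# such as
#   "accession=[COG0001],function=[...];accession=[COG0002],function=[...]"
# is reduced to ont_dict[read_id] == ["COG0001", "COG0002"].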
def tally_org_ont(org, ont):
n_nomatch = 0
org_ont_dict = {thisorg: {"onts": [], "counts":[]} for thisorg in org.keys()}
for i, (thisorg, id_list) in enumerate(org.items()):
for id in id_list:
try:
for thisont in ont[id]:
if thisont in org_ont_dict[thisorg]["onts"]:
org_ont_dict[thisorg]["counts"][org_ont_dict[thisorg]["onts"].index(thisont)] += 1
else:
org_ont_dict[thisorg]["counts"].append(1)
org_ont_dict[thisorg]["onts"].append(thisont)
except KeyError:
n_nomatch += 1
print(str(n_nomatch) + " sequences lacked hits in both the ontology and organism sets")
return(org_ont_dict)
def get_seq_count(thisid):
r = requests.get("https://api-ui.mg-rast.org/metagenome/" + thisid + "?verbosity=stats&detail=sequence_stats")
return(r.json()["sequence_count_raw"])
def skip_last(iterator):
    # Adapted from Stack Overflow; skips the final line, which marks the
    # completion of the download
# https://stackoverflow.com/questions/16846460
prev = next(iterator)
for item in iterator:
yield prev
prev = item
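# Minimal illustration of skip_last(): every item except the final one is
# yielded, which is how the trailing completion-marker line gets dropped.
#   >>> list(skip_last(iter(["a", "b", "c"])))
#   ['a', 'b']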
def make_taxa_dict(shortname):
taxid = ncbi.get_name_translator([shortname])
if taxid == {}:
        return None
this_lineage = {"Superkingdom": None,
"Kingdom":None,
"Phylum": None,
"Class":None,
"Order": None,
"Family": None,
"Genus": None,
"Species": None }
this_lin_ids = ncbi.get_lineage(taxid[shortname][0])
this_lin = ncbi.get_rank(this_lin_ids)
this_names = ncbi.get_taxid_translator(this_lin_ids)
for rank in this_lineage.keys():
thisrank = rank.lower()
try:
this_lineage[rank] = {"id": [k for k, v in this_lin.items() if v == thisrank][0]}
except IndexError:
pass
for rank, id_dict in this_lineage.items():
if id_dict is None:
continue
try:
this_lineage[rank]['name'] = [v for k, v in this_names.items() if k == id_dict["id"]][0]
except IndexError:
pass
return(this_lineage)
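# Rough sketch of the structure make_taxa_dict() returns (the taxon ids shown
# are illustrative assumptions; ete3 resolves the real values):
#   make_taxa_dict("Escherichia coli") -> {
#       "Superkingdom": {"id": 2, "name": "Bacteria"},
#       ...
#       "Genus": {"id": 561, "name": "Escherichia"},
#       "Species": {"id": 562, "name": "Escherichia coli"}}
# Ranks absent from the NCBI lineage stay None; a name NCBI cannot resolve
# yields no lineage, which get_name() later reports as "NA".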
def get_name(d, level):
try:
name = d['lineage'][level]['name']
    except (TypeError, KeyError):
        # lineage may be None (unresolved taxon) or lack a name for this rank
        name = "NA"
if name == "NA" and level == "Kingdom":
name = get_name(d, "Superkingdom")
return name
def main():
#print("setting up NCBI Taxa database; this takes a while the first time")
args = get_args()
# if args.nseqs is None and False:
# print("Getting number of seqs")
# args.nseqs = get_seq_count(args.mgm_id)
print("Tallying ontology file")
ont_dict = make_ont_dict(args.ontology)
print("Processed %i ontology hits" % len(ont_dict))
print("Tallying organism file")
org_dict = make_org_dict(args.organism)
print("Processed %i organism hits" % len(org_dict))
print("combining the datasets")
combined = tally_org_ont(org=org_dict, ont=ont_dict)
print("Assigning taxonomy")
for name in combined.keys():
combined[name]["lineage"] = make_taxa_dict(name)
print("Writing out joined results")
missing_cogs = []
counter = 0
with open(args.output, "w") as outf:
outf.write('"X","Superkingdom","Kingdom","Phylum","Class","Order","Family","Genus","Species","COG_Name","COG_Category"\n')
for org, onts_counts_lineage in combined.items():
for i, ont in enumerate(onts_counts_lineage["onts"]):
# if True:
# print(cognames_dict[ont])
# continue
for count in onts_counts_lineage["counts"]:
try:
outf.write(
'"{counter}","{Skingdom}","{Kingdom}","{Phylum}","{Class}","{Order}","{Family}","{Genus}","{Species}","{COG_Name}","{COG_Category}"\n'.format(
counter=counter,
Skingdom=get_name(onts_counts_lineage, "Superkingdom"),
Kingdom=get_name(onts_counts_lineage, "Kingdom"),
Phylum=get_name(onts_counts_lineage, "Phylum"),
Class=get_name(onts_counts_lineage, "Class"),
Order=get_name(onts_counts_lineage, "Order"),
Family=get_name(onts_counts_lineage, "Family"),
Genus=get_name(onts_counts_lineage, "Genus"),
Species=get_name(onts_counts_lineage, "Species"),
COG_Name=cognames_dict[ont]["desc"],
COG_Category=cognames_dict[ont]["catdesc"]
)
)
counter = counter + 1
except KeyError:
if ont not in cognames_dict.keys():
missing_cogs.append(ont)
outf.write(
'"{counter}","{Skingdom}","{Kingdom}","{Phylum}","{Class}","{Order}","{Family}","{Genus}","{Species}","{COG_Name}","{COG_Category}"\n'.format(
counter=counter,
Skingdom=get_name(onts_counts_lineage, "Superkingdom"),
Kingdom=get_name(onts_counts_lineage, "Kingdom"),
Phylum=get_name(onts_counts_lineage, "Phylum"),
Class=get_name(onts_counts_lineage, "Class"),
Order=get_name(onts_counts_lineage, "Order"),
Family=get_name(onts_counts_lineage, "Family"),
Genus=get_name(onts_counts_lineage, "Genus"),
Species=get_name(onts_counts_lineage, "Species"),
COG_Name=ont,
COG_Category="UNKNOWN"
)
)
counter = counter + 1
else:
print("Error with the following ontology -- organism: {} -- {}".format(ont, org))
print("The following COGs were not found; \n" +
"see the old database found at ftp://ftp.ncbi.nih.gov/pub/COG/COG/")
print(set(missing_cogs))
print("Done!")
if __name__ == "__main__":
main()
def test_make_org_dict():
test_dict = {
'Natronomonas pharaonis': ['mgm4762935.3|contig_121_1069145_length_20328_multi_3_in_1_out_0_12718_14127_-',
'mgm4762935.3|contig_121_1069145_length_20328_multi_3_in_1_out_0_12718_14188_-'],
'Lactobacillus salivarius': ['mgm4762935.3|contig_51_826956_length_45026_multi_4_in_0_out_0_9855_10283_-',
'mgm4762935.3|contig_51_826956_length_45026_multi_4_in_0_out_0_9855_10283_-'],
'Ferroglobus placidus': ['mgm4762935.3|contig_81_2449664_length_20179_multi_2_in_0_out_0_4654_6346_+'],
'Clostridium botulinum': ['mgm4762935.3|contig_121_1103569_length_8205_multi_2_in_2_out_2_4061_4312_+'],
'Aciduliprofundum boonei': ['mgm4762935.3|contig_111_1628766_length_37341_multi_2_in_0_out_0_11642_15072_-'],
'Clostridium thermocellum': ['mgm4762935.3|contig_71_7197930_length_16664_multi_2_in_0_out_0_9360_11931_+',
'mgm4762935.3|contig_71_7197930_length_16664_multi_2_in_0_out_0_9360_11931_+',
'mgm4762935.3|contig_71_7197930_length_16664_multi_2_in_0_out_0_9360_11931_+'],
'Ruminococcus albus': ['mgm4762935.3|contig_101_47814_length_32684_multi_2_in_0_out_0_3909_5352_-'], 'Methanosphaera stadtmanae': ['mgm4762935.3|contig_121_282635_length_6565_multi_2_in_0_out_2_2839_4642_-'],
'Ammonifex degensii': ['mgm4762935.3|contig_111_1628766_length_37341_multi_2_in_0_out_0_27395_28315_+'], 'Methanosaeta thermophila': ['mgm4762935.3|contig_111_1628766_length_37341_multi_2_in_0_out_0_27395_28315_+'],
'Paenibacillus sp.': ['mgm4762935.3|contig_121_394099_length_3154_multi_2_in_2_out_0_1_1139_-'],
'Methanothermobacter thermautotrophicus': ['mgm4762935.3|contig_121_570908_length_16058_multi_2_in_2_out_2_10587_14405_-']}
org_dict = make_org_dict("./tests/sample/organism")
print(org_dict)
for i in test_dict.keys():
print(test_dict[i])
print(org_dict[i])
assert test_dict[i] == org_dict[i], "mismatched org dict"
def test_get_seq_count():
seq_count = get_seq_count("mgm4762935.3")
print(seq_count)
assert seq_count == 109, "problem getting seqeunce count"