-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmake-acc-taxid-mapping.py
executable file
·65 lines (49 loc) · 1.77 KB
/
make-acc-taxid-mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#! /usr/bin/env python
"""
Take a file full of accessions and one or more NCBI 'accession2taxid' files
and create a CSV named 'acc_file.taxid' that contains the accessions
and their associated taxids.
"""
from __future__ import print_function
import argparse
import gzip
def main():
p = argparse.ArgumentParser()
p.add_argument('acc_file')
p.add_argument('acc2taxid_files', nargs='+')
args = p.parse_args()
with open(args.acc_file) as fp:
acc_set = set([ x.strip().split('.')[0] for x in fp ])
outfp = open(args.acc_file + '.taxid', 'w')
m = 0
for filename in args.acc2taxid_files:
if not acc_set: break
xopen = open
if filename.endswith('.gz'):
xopen = gzip.open
with xopen(filename, 'rt') as fp:
next(fp) # skip headers
for n, line in enumerate(fp):
if not acc_set: break
if n and n % 1000000 == 0:
print(u'\r\033[K', end=u'')
print('... read {} lines of {}; found {} of {}'.format(n, filename, m, m + len(acc_set)), end='\r')
try:
acc, _, taxid, _ = line.split()
except ValueError:
print('ignoring line', (line,))
continue
if acc in acc_set:
m += 1
outfp.write('{},{}\n'.format(acc, taxid))
acc_set.remove(acc)
if not acc_set:
break
print("\n")
if acc_set:
print('failed to find {} acc'.format(len(acc_set)))
else:
print('found all {} accessions!'.format(m))
print('output taxid file is in', args.acc_file + '.taxid')
if __name__ == '__main__':
main()