-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathadd_taxa_to_align.py
executable file
·602 lines (552 loc) · 31.2 KB
/
add_taxa_to_align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
#! /usr/bin/env python
# add_taxa_to_align.py v1.0 created 2016-12-08
# v1.5 2018-08-27
# v1.6 2023-01-20 python3 update and some changes to log output
'''
add_taxa_to_align.py v1.6 2023-02-01
add new taxa to an existing untrimmed alignment
requires Bio Python library
get hmmbuild and hmmscan (from hmmer package at http://hmmer.org/)
IT IS ADVISED TO COLLECT ALL STDERR AS A LOG FILE WITH 2>
to add proteins from species1 and species2 to alignments prot1 and prot2:
add_taxa_to_align.py -a prot1.aln prot2.aln -t species1.fasta species2.fasta
more generally as:
add_taxa_to_align.py -a *.aln -t new_transcriptomes/ ...
for very long lists of new taxa, instead use -X (--tabular-taxa)
where options of fasta files (-t) and taxon names (-T) are both omitted
and instead the information is put into a tab-separated file, as:
species1.fasta Homo_sapiens
species2.fasta Mus_musculus
comment lines within the file are allowed with #
to use a supermatrix with partitions as the input instead of alignments:
add_taxa_to_align.py -i partitions.txt -a big_alignment.aln ...
for partitioned alignments, formats include:
clustal, fasta, nexus, phylip, phylip-relaxed, stockholm
partition file is comma-delimited text, as single or multiple lines
1:136,137:301,...
add taxon names with -T, in the same order as -t
add_taxa_to_align.py -t species1.fasta species2.fasta -T Homo_sapiens Mus_musculus
specify the new super matrix name with -U
add_taxa_to_align.py -U big_alignment_w_Hs_Mm.aln ...
no e-value threshold is set for hmmsearch, as results are filtered later
e-value cutoff will be determined automatically (default)
this can be statically assigned with --ev-threshold,
though that is not recommended
for highly trimmed alignments:
meaning each individual gene alignment is many discontinuous pieces
use the option --fragmented to set an alternate gap penalty for hmmbuild
it may also be advisable to relax the --ev-allowance to 1e20 or 1e25
'''
import sys
import os
import argparse
import time
import subprocess
from collections import defaultdict
from glob import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import AlignIO
def get_partitions(partitionfile, errorlog):
    '''Parse a comma-delimited partition file and return the intervals as a list of tuples.

    Lines look like "1:136,137:301,..."; blank lines are skipped.
    Each "start:end" chunk becomes an integer tuple (start, end).
    '''
    intervals = [] # list of tuples of 1-based inclusive bounds
    with open(partitionfile, 'r') as partfile:
        for rawline in partfile:
            stripped = rawline.strip()
            if not stripped:
                continue
            # split "1:136,137:301,..." into chunks, then each chunk into an int tuple
            for chunk in stripped.split(","):
                intervals.append( tuple( int(bound) for bound in chunk.split(":") ) )
    print( "# read {} partitions from {} {}".format(len(intervals), partitionfile, time.asctime() ), file=errorlog )
    return intervals
def tabular_taxa_to_lists(tabulartaxafile):
    '''Read a tab-separated file of "fasta_file<TAB>taxon_name" pairs.

    Returns two parallel lists: fasta filenames (with ~ expanded) and taxon names.
    Blank lines and lines starting with # are ignored; any other line that does
    not have exactly two tab-separated columns terminates the program.
    '''
    fasta_list = []
    name_list = []
    with open(tabulartaxafile, 'r') as taxafile:
        for rawline in taxafile:
            entry = rawline.strip()
            if not entry or entry.startswith("#"): # ignore blank and comment lines
                continue
            columns = entry.split("\t")
            if len(columns) != 2:
                sys.exit("ERROR: INCORRECT FORMAT IN LINE:\n{}".format(entry))
            fasta_list.append(os.path.expanduser(columns[0]))
            name_list.append(columns[1])
    return fasta_list, name_list
def check_redundant_names(fullalignment, alignformat, new_taxa):
    '''Return the list of taxon names appearing more than once across the
    new taxa and the sequence IDs already present in the alignment.'''
    name_tally = defaultdict(int)
    for name in new_taxa:
        name_tally[name] += 1
    # any seq ID in the existing alignment that matches a new name is redundant
    for record in AlignIO.read(fullalignment, alignformat):
        name_tally[record.id] += 1
    return [ name for name, tally in name_tally.items() if tally > 1 ]
def make_alignments(fullalignment, alignformat, partitions, partitiondir, errorlog):
    '''Slice the supermatrix into one fasta alignment per partition; return the filenames.'''
    partfiles = []
    stemname = os.path.splitext(os.path.basename(fullalignment))[0]
    wholealignment = AlignIO.read(fullalignment, alignformat)
    for part in partitions:
        start, end = part[0], part[1]
        partfilename = os.path.join(partitiondir, "{}_{}_{}_part.aln".format(stemname, start, end) )
        # partition bounds are 1-based inclusive; python slices are 0-based half-open
        subalignment = wholealignment[:, start-1:end ]
        with open(partfilename, 'w') as outhandle:
            AlignIO.write(subalignment, outhandle, "fasta")
        partfiles.append(partfilename)
    print( "# split alignment by partitions {}".format( time.asctime() ), file=errorlog )
    return partfiles
def run_hmmbuild(HMMBUILD, alignmentfile, errorlog, fragmented=False):
    '''Build an HMM profile from a multiple sequence alignment; return the .hmm filename.

    fragmented  use a relaxed gap-extend probability for heavily-trimmed alignments
    raises OSError when hmmbuild did not produce the expected output file
    '''
    # output path keeps the alignment's relative path, so it lands in the same folder
    hmm_output = "{}.hmm".format(os.path.splitext(alignmentfile)[0] )
    command = [HMMBUILD]
    if fragmented:
        command.extend(["--pextend", "0.8"])
    command.extend([hmm_output, alignmentfile])
    print( "###TIME {}\n{}".format(time.asctime(), " ".join(command) ), file=errorlog )
    subprocess.call(command, stdout=errorlog)
    print( "# hmmbuild completed {}".format( time.asctime() ), file=errorlog )
    if not os.path.isfile(hmm_output):
        raise OSError("Cannot find expected output file {}".format(hmm_output) )
    return hmm_output
def run_hmmsearch(HMMSEARCH, hmmprofile, fastafile, threadcount, hmmlog, hmmdir, errorlog):
    '''search fasta format proteins with HMM profile and return formatted-table filename

    hmmlog  optional filename that collects hmmsearch stdout; None discards it
    raises OSError when hmmsearch did not produce the expected table file
    '''
    hmmtbl_output = os.path.join(hmmdir, os.path.basename("{}_{}.tab".format(os.path.splitext(fastafile)[0], os.path.splitext(os.path.basename(hmmprofile))[0] ) ) )
    hmmsearch_args = [HMMSEARCH,"--cpu", str(threadcount), "--domE", "0.0001", "--domtblout", hmmtbl_output, hmmprofile, fastafile]
    print( "###TIME {}\n{}".format(time.asctime(), " ".join(hmmsearch_args) ), file=errorlog )
    if hmmlog:
        # append mode so successive searches accumulate in one log file;
        # the previous version opened this file in read mode, which fails for
        # a not-yet-existing log and cannot receive the child's stdout
        with open(hmmlog, 'a') as hmmstdout:
            subprocess.call(hmmsearch_args, stdout=hmmstdout)
    else:
        # subprocess.DEVNULL avoids the unclosed os.devnull handle of the original
        subprocess.call(hmmsearch_args, stdout=subprocess.DEVNULL)
    print( "# hmmsearch completed {}".format( time.asctime() ), file=errorlog )
    if os.path.isfile(hmmtbl_output):
        return hmmtbl_output
    raise OSError("Cannot find expected output file {}".format(hmmtbl_output) )
def hmmtable_to_seqids(hmmtable, evaluecutoff, bitlencutoff, seqdict, verbose ):
    '''parse hits from hmm tblout and return a list of kept protein IDs

    hmmtable      filename of hmmsearch --domtblout output
    evaluecutoff  maximum E-value (applied jointly with bitlencutoff)
    bitlencutoff  minimum bitscore-per-alignment-length
    seqdict       dict of SeqRecords for the searched proteome
                  NOTE(review): currently unused inside this function
    verbose       if True, print the reason each hit was removed to stderr
    returns       protein IDs sorted by descending bitscore
    '''
    # here domain-wise table output is processed, so that length of the hit can be directly checked
    # such information is not kept in the regular table output
    seqids_to_keep = {} # key is seqid, value is the best bitscore seen for that seq
    maxscore = 0 # highest raw bitscore seen so far
    maxbpl = 0 # highest bits-per-length seen so far
    lastevalue = 1e-300 # E-value of the previous accepted candidate
    lastspan = 1 # span of HMM
    lastbits = 0 # bitscore of the previous accepted candidate
    hitcounter = 0
    # NOTE: the sequential filters below assume hmmsearch reports hits best-first
    for line in open(hmmtable, 'r'):
        line = line.strip()
        if not line or line[0]=="#": # skip comment lines
            continue # also catch for empty line, which would cause IndexError
        # domtblout columns: 0 target, 5 qlen, 11 domain i-Evalue, 13 domain bitscore,
        # 15-16 hmm coords (from,to), 17-18 ali coords (from,to)
        lsplits = line.split(None,22)
        hitcounter += 1
        targetname = lsplits[0]
        querylen = int(lsplits[5])
        evalue = float(lsplits[11])
        bitscore = float(lsplits[13])
        alignlength = int(lsplits[18]) - int(lsplits[17]) + 1
        hmmspan = int(lsplits[16]) - int(lsplits[15]) + 1
        bitsperlen = bitscore/alignlength
        if bitscore > maxscore:
            maxscore = bitscore
        if bitsperlen > maxbpl:
            maxbpl = bitsperlen
        # drop hits clearly below the best bits-per-length seen so far
        if bitsperlen + 0.1 < maxbpl:
            if verbose:
                print( "# REMOVING {}: BpL {} + 0.1 < MAX {}".format(targetname, bitsperlen, maxbpl ), file=sys.stderr )
            continue
        # a hit must fail BOTH thresholds to be removed here
        if evalue > evaluecutoff and bitsperlen < bitlencutoff:
            if verbose:
                print( "# REMOVING {}: EVALUE {} > {} AND BpL {} < {}".format( targetname, evalue, evaluecutoff, bitsperlen, bitlencutoff ), file=sys.stderr )
            continue
        if bitscore < maxscore*0.5: # discard any seqs that score less than half max
            if verbose:
                print( "# REMOVING {}: BITS {} < {} * 0.5".format( targetname, bitscore, maxscore ), file=sys.stderr )
            continue
        if evalue==0: # change values of 0 to 1e-300 for comparisons
            evalue = 1e-300
        # should better resolve paralogs
        if len(seqids_to_keep) >= 1 and evalue > lastevalue * 1e50: # if second best seq is 1e50 worse, skip
            if verbose:
                print( "# REMOVING {}: EVALUE {} > LAST {} * 1e50".format( targetname, evalue, lastevalue ), file=sys.stderr )
            continue
        # should correctly choose a longer splice variant if seqs are this close in evalue and bitscore
        if len(seqids_to_keep) >= 1 and hmmspan < lastspan and bitscore < lastbits - 100:
            if verbose:
                print( "# REMOVING {}: SPAN {} < LAST {} AND BITS {} < LAST {} - 100".format( targetname, hmmspan, lastspan, bitscore, lastbits ), file=sys.stderr )
            continue
        # hit accepted: remember its stats for the relative filters above
        lastevalue = evalue
        lastspan = hmmspan
        lastbits = bitscore
        if verbose:
            print( "# CANDIDATE {}: EVALUE {}, BITS {}, BpL {}".format( targetname, evalue, bitscore, bitsperlen ), file=sys.stderr )
        if targetname in seqids_to_keep: # since multiple domains are allowed, keep highest scoring one
            if seqids_to_keep.get(targetname,0) < bitscore:
                seqids_to_keep[targetname] = bitscore
        else:
            seqids_to_keep[targetname] = bitscore
    # sort IDs by value, in this case bitscore
    sorted_ids = [ k for k,v in sorted(seqids_to_keep.items(), key=lambda x: x[1], reverse=True ) ]
    if seqids_to_keep: # meaning if any hits
        print( "# retaining {}/{} seqs from {}, highest bits is {:.4f} for {}".format( len(seqids_to_keep) , hitcounter, os.path.basename(hmmtable), maxbpl, sorted_ids[0] ), file=sys.stderr )
        # NOTE(review): maxbpl is bits-per-length while the dict stores raw bitscore,
        # so this comparison mixes units -- confirm the intended check
        if maxbpl != seqids_to_keep[sorted_ids[0]]: # in case best reasonable seq is not the max
            print( "# WARNING: MAX {} DOES NOT MATCH TOP SEQ {} WITH {}".format( maxbpl, sorted_ids[0] , seqids_to_keep[sorted_ids[0]] ), file=sys.stderr )
    else: # meaning no hits
        print( "# keeping NONE of {} hits, highest score from {} was {:.4f} W/ cutoff {:.4f}".format( hitcounter, os.path.basename(hmmtable), maxbpl, bitlencutoff ), file=sys.stderr )
    return sorted_ids
def make_seq_length_dict(sequencefile):
    '''Return a dict mapping each fasta sequence ID to its sequence length.'''
    print( "# Parsing target sequences from {} {}".format( os.path.basename(sequencefile), time.asctime() ) , file=sys.stderr )
    lengthdict = { seqrec.id : len(seqrec.seq) for seqrec in SeqIO.parse(sequencefile,'fasta') }
    print( "# Found {} sequences for self-hmm {}".format( len(lengthdict), time.asctime() ), file=sys.stderr )
    return lengthdict
def get_evalue_from_hmm(HMMSEARCH, hmmprofile, alignment, threadcount, hmmevaluedir, errorlog, evallowance, bplallowance):
    '''get dynamic evalue and bitscore/length cutoff from alignment and hmm

    Searches the HMM against its own (degapped) source sequences, then returns
    (max self-hit E-value * evallowance, min self-hit bits-per-length - bplallowance)
    with a floor of 0.9 on the bits-per-length cutoff.

    raises OSError if hmmsearch did not produce the expected table,
    ValueError if the self-search returned no hits at all
    '''
    # tblout columns used below: 0 target name, 4 full-seq E-value, 5 full-seq bitscore
    # DEGAP FASTA FILE
    fasta_unaligned = os.path.join( hmmevaluedir, "{}_no_gaps.fasta".format(os.path.splitext(os.path.basename(alignment))[0] ) )
    unalign_sequences(fasta_unaligned, alignment, notrim=True, calculatemedian=False, removeempty=True)
    seqlendict = make_seq_length_dict(fasta_unaligned)
    # RUN HMMSEARCH
    # regular table output is used here, as all seqs are assumed to be correct
    hmmtbl_output = os.path.join( hmmevaluedir, os.path.basename( "{}_self_hmm.tab".format( os.path.splitext(alignment)[0] ) ) )
    hmmsearch_args = [HMMSEARCH,"--cpu", str(threadcount), "-E", "0.0001", "--tblout", hmmtbl_output, hmmprofile, fasta_unaligned]
    print( "###TIME {}\n{}".format(time.asctime(), " ".join(hmmsearch_args) ), file=errorlog )
    # subprocess.DEVNULL replaces a manually opened (and never closed) os.devnull handle
    subprocess.call(hmmsearch_args, stdout=subprocess.DEVNULL)
    print( "# self-hmmsearch completed {}".format( time.asctime() ), file=errorlog )
    # PROCESS RESULTS
    if not os.path.isfile(hmmtbl_output):
        raise OSError("Cannot find expected output file {}".format(hmmtbl_output) )
    bitslenlist = [] # list of bits per length for long sequences using hmm results
    maxevalue = 0.0
    with open(hmmtbl_output, 'r') as hmmtable:
        for line in hmmtable:
            line = line.strip()
            if not line or line[0]=="#": # skip comments and blank lines
                continue
            lsplits = line.split(None,18)
            targetname = lsplits[0]
            evalue = float(lsplits[4])
            bitscore = float(lsplits[5])
            bitslenlist.append( bitscore / seqlendict[targetname] )
            if evalue > maxevalue: # larger number, so smaller -log(x)
                maxevalue = evalue
    if not bitslenlist: # min() below would raise an uninformative ValueError
        raise ValueError("No self-hits in {}, cannot determine cutoffs for {}".format(hmmtbl_output, alignment) )
    if maxevalue==0.0: # if all evalues are 0.0, then set to 1e-300
        maxevalue = 1e-300
    else:
        maxevalue = maxevalue * evallowance # correct by constant margin of error, should be 1e15
    print( "# calculated e-value for {} as {:.3e}".format(alignment, maxevalue), file=errorlog )
    bplcutoff = min(bitslenlist) - bplallowance # should be 0.1 by default
    if bplcutoff < 0.9:
        bplcutoff = 0.9 # enforce a floor so weak self-hits do not admit everything
        print( "# using minimum bits-per-length cutoff for {} of {:.2f}".format(alignment, bplcutoff ), file=errorlog )
    else:
        print( "# calculated bits-per-length cutoff for {} as {:.3f}".format(alignment, bplcutoff), file=errorlog )
    return maxevalue, bplcutoff
def unalign_sequences(unalignedtaxa, alignment, notrim, calculatemedian, removeempty):
    '''remove gaps from alignments, and possibly return median ungapped length

    unalignedtaxa    filename to write degapped fasta sequences to
    alignment        fasta-format alignment to read
    notrim           if True, write each degapped sequence to the output file
    calculatemedian  if True, return the median ungapped length, otherwise None
    removeempty      if True, skip sequences shorter than GAPCUTOFF of the alignment
    '''
    GAPCUTOFF = 0.75 # below this fraction of the alignment length, a seq counts as partial
    sizelist = []
    with open(unalignedtaxa,'w') as notaln:
        for seqrec in SeqIO.parse(alignment,"fasta"):
            gappedseq = str(seqrec.seq)
            # strip gap characters and masked X residues
            degappedseq = Seq(gappedseq.replace("-","").replace("X",""))
            seqrec.seq = degappedseq
            if calculatemedian:
                sizelist.append(len(degappedseq))
            if removeempty and len(degappedseq) < GAPCUTOFF * len(gappedseq):
                print( "# PARTIAL SEQUENCE {} REMOVED FROM HMM EVALUE TEST {}".format(seqrec.id, os.path.basename(alignment) ), file=sys.stderr )
                continue
            if notrim:
                notaln.write( seqrec.format("fasta") )
    if calculatemedian:
        # the original used len(sizelist)/2, a float under python3, which raises
        # TypeError as a list index; integer division fixes the median lookup
        median = sorted(sizelist)[len(sizelist)//2]
        return median
    else: # essentially void, just generates unaligned fasta file
        return None
def collect_sequences(unalignednewtaxa, alignment, hitlistolists, lengthcutoff, speciesnames, maxhits, keep_old_ids, notrim ):
    '''write sequences from old alignment and new hits to file

    unalignednewtaxa  output fasta filename; receives the degapped old alignment
                      followed by the selected new sequences
    alignment         fasta alignment of the existing taxa
    hitlistolists     one entry per new species (same order as speciesnames),
                      each a list of candidate SeqRecords, best first
    lengthcutoff      fraction of the median old-sequence length a hit must reach
    speciesnames      names assigned to the new sequences (one per species)
    maxhits           maximum number of hits written per species
    keep_old_ids      if False, rename each kept hit to the species name
    notrim            passed through to unalign_sequences
    '''
    speciescounts = defaultdict(int) # key is species, value is number of written seqs per species
    # NOTE(review): speciescounts is never updated below; writeout tracks the count instead
    # first write the degapped old alignment, keeping its median ungapped length
    median = unalign_sequences(unalignednewtaxa, alignment, notrim, calculatemedian=True, removeempty=False)
    with open(unalignednewtaxa,'a') as notaln:
        print( "###COLLECT:{}".format(os.path.basename(alignment)), file=sys.stderr )
        # hitlistolists is a list of lists, so that order of species is preserved
        for i,hitlist in enumerate(hitlistolists):
            writeout = 0
            if not hitlist:
                print( "# NO HITS FOR {} IN {}".format(speciesnames[i], os.path.basename(alignment) ), file=sys.stderr )
                # empty placeholder entry so the species still appears in the supermatrix
                print( ">{}".format(speciesnames[i]), file=notaln)
                continue # no hits so skip to next taxon
            for seqrec in hitlist: # sublist, each item is a SeqRecord object
                old_id = str(seqrec.id)
                if writeout==maxhits: # if already have enough candidates
                    break
                if len(seqrec.seq) >= median*lengthcutoff: # remove short sequences
                    if keep_old_ids is False: # should be False by default
                        if seqrec.id==speciesnames[i]: # check if seq was already used, so dict entry was renamed
                            print( "WARNING: REDUNDANT SEQ {} FOR {}".format(seqrec.name, os.path.basename(alignment) ), file=sys.stderr )
                        # keep the original ID in the description, rename record to the species
                        seqrec.description = old_id
                        seqrec.id = str(speciesnames[i])
                        print( "# using seq {} for {}".format( old_id, speciesnames[i] ), file=sys.stderr )
                    notaln.write( seqrec.format("fasta") )
                    writeout += 1
                else: # meaning too short
                    print( "# SEQ {} TOO SHORT FOR {} IN {}".format(seqrec.name, speciesnames[i], os.path.basename(alignment) ), file=sys.stderr )
            if writeout==0: # all hits missed the cut or had no hits, give a dummy entry
                print( ">{}".format(speciesnames[i]), file=notaln)
                print( "# ALL HITS TOO SHORT FOR {} IN {}".format(speciesnames[i], os.path.basename(alignment) ), file=sys.stderr )
    # no return
def run_mafft(MAFFT, rawseqsfile, errorlog):
    '''Align a fasta file with mafft L-INS-i; return the alignment filename.

    raises OSError when the expected alignment file was not produced
    '''
    aln_output = "{}.aln".format(os.path.splitext(rawseqsfile)[0] )
    aligner_args = [MAFFT, "--maxiterate", "1000", "--localpair", "--quiet", rawseqsfile]
    print( "###TIME {}\n{} > {}".format(time.asctime(), " ".join(aligner_args), aln_output ), file=errorlog )
    with open(aln_output, 'w') as msa:
        subprocess.call(aligner_args, stdout=msa)
    print( "# alignment of {} completed {}".format(aln_output, time.asctime() ), file=errorlog )
    if not os.path.isfile(aln_output):
        raise OSError("Cannot find expected output file {}".format(aln_output) )
    return aln_output
def run_mafft_addlong(MAFFT, oldalignment, rawseqsfile, errorlog):
    '''Align new sequences onto an existing MSA with mafft --addlong; return the new MSA filename.

    raises OSError when the expected alignment file was not produced
    '''
    aln_output = "{}.aln".format(os.path.splitext(rawseqsfile)[0] )
    aligner_args = [MAFFT, "--maxiterate", "1000", "--localpair", "--quiet", "--addlong", rawseqsfile, oldalignment]
    print( "###TIME {}\n{} > {}".format(time.asctime(), " ".join(aligner_args), aln_output ), file=errorlog )
    with open(aln_output, 'w') as msa:
        subprocess.call(aligner_args, stdout=msa)
    print( "# alignment of {} completed {}".format(aln_output, time.asctime() ), file=errorlog )
    if not os.path.isfile(aln_output):
        raise OSError("Cannot find expected output file {}".format(aln_output) )
    return aln_output
def run_tree(FASTTREEMP, alignfile, errorlog):
    '''Infer a gene tree from an alignment with FastTree; return the tree filename.

    raises OSError when the expected tree file was not produced
    '''
    tree_output = "{}.tree".format(os.path.splitext(alignfile)[0] )
    fasttree_args = [FASTTREEMP, "-quiet", alignfile]
    print( "###TIME {}\n{}".format(time.asctime(), " ".join(fasttree_args) ), file=errorlog )
    with open(tree_output, 'w') as treehandle:
        subprocess.call(fasttree_args, stdout=treehandle)
    if not os.path.isfile(tree_output):
        raise OSError("Cannot find expected output file {}".format(tree_output) )
    return tree_output
def main(argv, wayout, errorlog):
    '''Command-line entry point: build an HMM per alignment, search each new taxon,
    add the best hits to every alignment, and optionally rebuild a supermatrix.'''
    if not len(argv):
        argv.append('-h')
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
    parser.add_argument('-a','--alignments', nargs="*", help="alignment files or directory")
    parser.add_argument('-d','--directory', default="new_taxa", help="directory for new alignments [autonamed]")
    parser.add_argument('--ev-threshold', help="static E-value cutoff for hmmsearch, otherwise determined automatically")
    parser.add_argument('--ev-allowance', type=float, default=1e15, help="E-value offset for automatic detection [1e15], increase to allow more hits (e.g. 1e20)")
    parser.add_argument('--bpl-threshold', type=float, default=0.9, help="bits-per-length minimum threshold, otherwise determined automatically")
    parser.add_argument('--bpl-allowance', type=float, default=0.1, help="bits-per-length offset for automatic detection [0.1], increase to allow more hits (e.g. 0.2)")
    parser.add_argument('-E','--hmm-evalue-dir', default="hmm_vs_self", help="temporary directory for hmm self-search to dynamically determine E-value [./hmm_vs_self]")
    parser.add_argument('-f','--format', default="fasta", help="alignment format [fasta]")
    parser.add_argument('-i','--partition', help="optional partition file for splitting large alignments")
    parser.add_argument('-I','--partition-dir', default="partitions", help="temporary directory for partitioned alignments [./partitions]")
    parser.add_argument('-l','--length', type=float, default=0.4, help="minimum length cutoff compared to median [0.4]")
    parser.add_argument('-m','--max-hits', type=int, default=1, help="max number of allowed protein hits [1]")
    parser.add_argument('-p','--processors', type=int, default=1, help="number of processors [1]")
    parser.add_argument('-r','--no-trim', action="store_true", help="do not trim to original alignment in mafft")
    parser.add_argument('-s','--hmm-results', default=None, help="optional filename for hmmsearch output")
    parser.add_argument('-S','--hmm-dir', default="hmm_hits", help="temporary directory for hmm hits [./hmm_hits]")
    parser.add_argument('-t','--taxa', nargs="*", help="new taxa as fasta files of proteins (such as multiple translated transcriptomes) or directory")
    parser.add_argument('-T','--taxa-names', nargs="*", help="optional species names for supermatrix (can contain underscores, no spaces)")
    parser.add_argument('-X','--tabular-taxa', help="optional tab-separated file for fasta files and taxon names (must not use -t or -T)")
    parser.add_argument('-U','--supermatrix', help="name for optional supermatrix output")
    parser.add_argument('-v','--verbose', action="store_true", help="verbose output of HMM hit removal")
    parser.add_argument('--mafft', default="mafft", help="path to mafft binary [default is in PATH]")
    parser.add_argument('--hmmbin', default="", help="path to hmm binaries, should be a directory containing hmmbuild and hmmsearch [default is ./]")
    parser.add_argument('--fasttree', default="FastTreeMP", help="path to fasttree binary [default is in PATH]")
    parser.add_argument('--fragmented', action="store_true", help="alignments are highly-trimmed fragments, use relaxed gap properties")
    parser.add_argument('--no-gene-trees', action="store_true", help="skip the FastTree step")
    parser.add_argument('--preserve-names', action="store_true", help="keep the fasta headers for each hit, instead of renaming by taxa (only use if not making a supermatrix)")
    args = parser.parse_args(argv)

    # resolve binary paths, allowing ~ expansion
    ALIGNER = os.path.expanduser(args.mafft)
    HMMBUILD = os.path.expanduser(os.path.join(args.hmmbin, "hmmbuild"))
    HMMSEARCH = os.path.expanduser(os.path.join(args.hmmbin, "hmmsearch"))
    FASTTREEMP = os.path.expanduser(args.fasttree)
    starttime = time.time()
    print( "# Script called as:\n{}".format( ' '.join(sys.argv) ), file=errorlog )

    ### PROTEIN FILES FOR NEW TAXA
    if args.taxa is not None:
        if os.path.isdir(args.taxa[0]):
            print( "# Finding protein files from directory {} {}".format( args.taxa[0], time.asctime() ), file=errorlog )
            globstring = "{}*".format(args.taxa[0])
            newtaxafiles = glob(globstring)
        elif os.path.isfile(args.taxa[0]):
            newtaxafiles = args.taxa
        else: # previously fell through leaving newtaxafiles unbound, causing a NameError later
            raise OSError("ERROR: Cannot find new taxa {}, exiting".format(args.taxa[0]) )
    ### IN CASE TABULAR TAXA FILE IS USED
    elif args.tabular_taxa is not None:
        newtaxafiles, args.taxa_names = tabular_taxa_to_lists(args.tabular_taxa)
    else: # otherwise no taxa are given
        raise OSError("ERROR: Unknown new protein files, exiting")

    ### CHECK IF ALL FILES EXIST
    missingfiles = []
    for protfile in newtaxafiles:
        if not os.path.isfile(protfile):
            missingfiles.append(protfile)
    if missingfiles: # if any are missing
        raise OSError("ERROR: Cannot find new protein files:\n{}\nexiting".format("\n".join(missingfiles)))

    ### APPLY NAMES OF TAXA
    if args.taxa_names:
        if len(args.taxa_names)!=len(newtaxafiles):
            # print the pairing so the user can see where it went out of register
            for tfile, tname in zip(newtaxafiles, args.taxa_names):
                print( "{}\t{}".format(tfile, tname), file=errorlog )
            raise ValueError("ERROR: number of taxa names ({}) does not match number of files ({}), exiting".format(len(args.taxa_names),len(newtaxafiles) ) )
        else:
            print( "# Using {} pairs of files and taxon names".format(len(newtaxafiles)), file=errorlog )
            for tfile, tname in zip(newtaxafiles, args.taxa_names):
                print( "{}\t{}".format(tfile, tname), file=errorlog )
        if len(args.taxa_names) != len(set(args.taxa_names)):
            raise ValueError("ERROR: duplicate taxa names, check -T, exiting")

    ### TIME STRING FOR ALL NEW DIRECTORIES
    timestring = time.strftime("%Y%m%d-%H%M%S")

    ### SINGLE PARTITIONED ALIGNMENT TO BE EXTENDED
    if args.partition: # if partitioning, do this first
        if len(args.alignments) > 1:
            raise OSError("ERROR: Expecting one alignment to partition, found {}, exiting".format( len(args.alignments) ) )
        elif not os.path.isfile(args.alignments[0]):
            raise OSError("ERROR: Cannot find {} alignment for partitions, exiting".format(args.alignments[0]) )
        else:
            # check if any names are redundant between old and new sets
            redundant_taxa = check_redundant_names(args.alignments[0], args.format, args.taxa_names)
            if redundant_taxa:
                raise ValueError("ERROR: duplicate taxa names:\n{}".format( "\n".join(redundant_taxa) ) )
            # get partitions from the file
            partitions = get_partitions(args.partition, errorlog)
            part_dir = os.path.abspath( "{}_{}".format( timestring, args.partition_dir ) )
            if not os.path.isdir(part_dir):
                if os.path.isfile(part_dir):
                    raise OSError("ERROR: Directory {} exists as a file, cannot create, exiting".format(part_dir) )
                else:
                    os.mkdir(part_dir)
                    print( "# Creating directory {} {}".format(part_dir, time.asctime() ), file=errorlog )
            # if all names are unique, then continue to split the supermatrix
            alignfiles = make_alignments(args.alignments[0], args.format, partitions, part_dir, errorlog)
    ### ALIGNMENTS TO BE EXTENDED AS MULTIPLE FILES
    else: # otherwise treat alignments as normal, either directory or single file
        if os.path.isdir(args.alignments[0]):
            print( "# Reading alignments from directory {} {}".format(args.alignments[0], time.asctime() ), file=errorlog )
            globstring = "{}*".format(args.alignments[0])
            alignfiles = glob(globstring)
        elif os.path.isfile(args.alignments[0]):
            alignfiles = args.alignments
        else:
            raise OSError("ERROR: Unknown alignment files, exiting")

    ### DIRECTORY FOR NEW OUTPUT
    new_aln_dir = os.path.abspath( "{}_{}".format( timestring, args.directory ) )
    if not os.path.exists(new_aln_dir):
        os.mkdir(new_aln_dir)
        print( "# Creating directory {} {}".format(new_aln_dir, time.asctime() ), file=errorlog )
    elif os.path.isdir(new_aln_dir):
        print( "# Using directory {} {}".format(new_aln_dir, time.asctime() ), file=errorlog )

    ### DIRECTORY FOR HMM RESULTS ###
    new_hmm_dir = os.path.abspath( "{}_{}".format( timestring, args.hmm_dir ) )
    if not os.path.isdir(new_hmm_dir):
        if os.path.isfile(new_hmm_dir):
            raise OSError("ERROR: Directory {} exists as a file, cannot create, exiting".format(new_hmm_dir) )
        else:
            print( "# Creating directory {} {}".format(new_hmm_dir, time.asctime() ), file=errorlog )
            os.mkdir(new_hmm_dir)

    ### DIRECTORY FOR HMM SELF SEARCH ###
    new_self_dir = os.path.abspath( "{}_{}".format( timestring, args.hmm_evalue_dir ) )
    if not os.path.isdir(new_self_dir):
        if os.path.isfile(new_self_dir):
            raise OSError("ERROR: Directory {} exists as a file, cannot create, exiting".format(new_self_dir) )
        else:
            print( "# Creating directory {} {}".format(new_self_dir, time.asctime() ), file=errorlog )
            os.mkdir(new_self_dir)

    ### MAIN LOOP
    supermatrix = None
    partitionlist = [] # empty list for new partition file
    runningsum = 0 # running length of the growing supermatrix
    hmmprofilelist = [] # store hmms as list of files to run for each taxa
    aligntohmm = {} # key is alignment file, value is name of hmm profile
    evalueforhmmdict = {} # evalue is calculated once, stored as value where hmm name is key
    bitlenforhmmdict = {} # bitscore per length is calculated once, hmm name is key
    print( "# Building HMM profiles {}".format( time.asctime() ), file=errorlog )
    if args.ev_threshold is None:
        print( "# Determining evalue for each HMM, using offset of {:.1e}".format(args.ev_allowance), file=errorlog )
    else:
        try:
            args.ev_threshold = float(args.ev_threshold)
        except ValueError:
            print( "# WARNING: {} CANNOT BE CONVERTED TO FLOAT, LEAVE --ev-threshold BLANK TO ENABLE AUTO-DETERMINATION OF EVALUE".format(args.ev_threshold), file=errorlog )
            args.ev_threshold = None

    # GET EVALUE AND BPL FOR EACH ALIGNMENT
    for alignment in alignfiles:
        # make hmm profile from alignment, test against original set to determine evalue cutoff for new seqs
        hmmprofile = run_hmmbuild(HMMBUILD, alignment, errorlog, args.fragmented)
        aligntohmm[alignment] = hmmprofile
        hmmprofilelist.append(hmmprofile)
        # check if threshold was assigned, otherwise determine for each hmm profile
        # by running hmmsearch against the original proteins
        if args.ev_threshold is None:
            filtered_evalue, filtered_bitlen = get_evalue_from_hmm(HMMSEARCH, hmmprofile, alignment, args.processors, new_self_dir, errorlog, args.ev_allowance, args.bpl_allowance)
        else: # not advised to use static cutoff, but possible
            filtered_evalue = args.ev_threshold
            filtered_bitlen = args.bpl_threshold
        # either use the static values, or the dynamic values determined above
        evalueforhmmdict[hmmprofile] = filtered_evalue
        bitlenforhmmdict[hmmprofile] = filtered_bitlen

    # RUN HMMSEARCH FOR EACH SPECIES
    # search each species sequentially, keep matches as SeqRecords in list of lists
    # to keep memory profile low, seq dict is remade for each species, then all HMMs are run
    seqrecs_to_add = defaultdict( list ) # key is hmm, value is list of lists of SeqRecords by species
    print( "# Beginning search for orthologs in new taxa {}".format( time.asctime() ), file=errorlog )
    # fall back to the fasta filename when no taxon names were given
    # (was os.path.basename(newspeciesfile), a NameError since that variable is bound below)
    speciesnames = args.taxa_names if args.taxa_names else [os.path.basename(f) for f in newtaxafiles]
    for newspeciesfile in newtaxafiles:
        seqids_to_add = [] # build list of lists by species
        print( "# Reading proteins from {} into memory {}".format(newspeciesfile, time.asctime() ), file=errorlog )
        seqdict = SeqIO.to_dict( SeqIO.parse(newspeciesfile, "fasta") )
        for hmmprof in hmmprofilelist:
            hmmtableout = run_hmmsearch(HMMSEARCH, hmmprof, newspeciesfile, args.processors, args.hmm_results, new_hmm_dir, errorlog)
            # append seqrecord to list for each hmm
            seqids_to_add = [ seqdict[hitseqid] for hitseqid in hmmtable_to_seqids(hmmtableout, evalueforhmmdict[hmmprof], bitlenforhmmdict[hmmprof] , seqdict, args.verbose) ] # is list of seqrecords
            # add this list for each hmm in the dict 'seqrecs_to_add'
            seqrecs_to_add[hmmprof].append(seqids_to_add)

    # GET BEST HITS AND MAKE NEW ALIGNMENT
    # then reiterate over each alignment, make the new fasta file, alignment, tree, and add to supermatrix
    for alignment in alignfiles:
        nt_unaligned = os.path.join(new_aln_dir, "{}.fasta".format(os.path.splitext(os.path.basename(alignment))[0] ) )
        collect_sequences(nt_unaligned, alignment, seqrecs_to_add[aligntohmm[alignment]], args.length, speciesnames, args.max_hits, args.preserve_names, args.no_trim)
        if args.no_trim: # use original method, which allows gaps from new sequences
            nt_aligned = run_mafft(ALIGNER, nt_unaligned, errorlog)
        else: # add new seqs onto the old alignment
            nt_aligned = run_mafft_addlong(ALIGNER, alignment, nt_unaligned, errorlog)
        # BUILD SUPERMATRIX
        # generate supermatrix from alignments
        newaligned = AlignIO.read(nt_aligned, "fasta")
        alignment_length = newaligned.get_alignment_length()
        partitionlist.append("{}:{}".format(runningsum+1, runningsum+alignment_length) )
        runningsum += alignment_length
        if supermatrix is None:
            supermatrix = newaligned
        else:
            supermatrix += newaligned
        if not args.no_gene_trees:
            nt_tree = run_tree(FASTTREEMP, nt_aligned, errorlog)

    ### WRITE SUPERMATRIX AND PARTITIONS
    if args.supermatrix:
        AlignIO.write(supermatrix, args.supermatrix, "fasta")
        print( "# Supermatrix written to {} {}".format(args.supermatrix, time.asctime() ), file=errorlog )
        with open("{}.partition.txt".format(args.supermatrix),'w') as pf:
            print( ",".join(partitionlist), file=pf )
    print( "# Process completed in {:.1f} minutes".format( (time.time()-starttime)/60 ), file=errorlog )
if __name__ == "__main__":
    # stdout receives program output, stderr collects the run log
    # (header recommends capturing it with 2> logfile)
    main(sys.argv[1:], sys.stdout, sys.stderr)