forked from crmumm/bulkPlasmidSeq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmedaka_wrap.py
112 lines (87 loc) · 4.72 KB
/
medaka_wrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import sys
import subprocess
import bulkPlasmidSeq
from Bio import SeqIO
from Bio.Emboss.Applications import NeedleCommandline
def run(reads, reference, outputDir, args):
'''
Takes as input fastq reads or directory to reads,
a fasta file with all the plasmid or directory pointing to the plasmid files.
'''
if None not in (reads, reference, outputDir):
runMedaka(reads, reference, outputDir, args.threads, args.model, args.igv)
else:
sys.exit('medaka needs input reads (-i), output directory (-o), and reference sequences (-r)')
def runMedaka(reads, reference, outputDir, threads, model, igv):
'''
Takes the input reads, reference, and output directory (after being processed through loadReads) and runs Medaka.
Please see Dependencies.txt or https://github.com/nanoporetech/medaka for more information about installing medaka.
Only medaka_consensus used in this pipeline.
'''
print('----------------------------------\n')
print('Running medaka with the following arguments, more info in medaka_log.txt')
print('----------------------------------\n')
print('Input files: %s \nReference Files: %s \nOutput directory: %s \nThreads: %s \nModel: %s\n'
% (reads, reference, outputDir, str(threads), model))
if os.path.isfile(outputDir):
sys.exit('Ouput directory exists')
elif not os.path.exists(outputDir):
subprocess.run(['mkdir %s' % outputDir], shell = True)
with open('%s/medaka_log.txt' % outputDir, 'wt') as log:
# Quietly run medaka, make a medaka_log
#20210512 Added back -g '-no gap fill'
subprocess.run(['medaka_consensus',
'-g',
'-i', str(reads),
'-d', str(reference),
'-o', str(outputDir),
'-t', str(threads),
'-m', str(model)],
stdout = log, stderr = log, check = True)
processMedakaOutput(outputDir, reference, igv)
return
def filterMedakaBam(outputDir):
'''
Filter out secondary and low quality mapping from medaka's 'calls_to_draft.bam'
'''
medaka_alignment = outputDir+'/calls_to_draft.bam'
filtered_bam = outputDir +'/filtered_alignment.bam'
subprocess.run('samtools view -bq 10 -F 2048 %s | samtools sort -o %s' %
(medaka_alignment, filtered_bam),
shell = True)
subprocess.run('samtools index %s' % filtered_bam, shell = True)
return filtered_bam
def processMedakaOutput(outputDir, reference, igv = None):
'''
Splits the consensus.fasta file that was generated by Medaka into individual plasmid seqs.
Generates Emboss Needle global alignments
'''
#Filter 'calls_to_draft.bam file'
output_alignmet = filterMedakaBam(outputDir)
if os.path.isdir(outputDir):
consensusFile = outputDir+'/consensus.fasta'
needle_commands = []
subprocess.run(['mkdir', '%s/consensus_sequences' % outputDir])
destination = outputDir+'/consensus_sequences/'
for record_consensus in SeqIO.parse(consensusFile, 'fasta'):
for record_reference in SeqIO.parse(reference, 'fasta'):
if record_reference.name in record_consensus.name: #Match the consensus to the reference
#print('Consensus source = ' + str(consensusFile))
#print('A seq, reference = ' + record_reference.name)
#print('B seq, consensus = ' + record_consensus)
SeqIO.write(record_consensus, destination + record_consensus.name+'_consensus.fasta', 'fasta')
SeqIO.write(record_reference, outputDir + '/' +record_reference.name+'_reference.fasta', 'fasta')
needle_commands.append(NeedleCommandline(asequence = outputDir
+ '/'+ record_reference.name+'_reference.fasta',
bsequence = destination + record_consensus.name+'_consensus.fasta',
gapopen = 10,
gapextend = 0.5,
outfile = destination + record_consensus.name + '.txt'))
else:
sys.exit('Medaka output directory not found')
for command in needle_commands:
subprocess.run(str(command) + ' -scircular1=True', shell = True)
#Run take screenshots based on final_processed.bam
if igv != None:
bulkPlasmidSeq.takeScreenshots(reference, outputDir, igv)