forked from crmumm/bulkPlasmidSeq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbiobinning.py
281 lines (224 loc) · 11.3 KB
/
biobinning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
from Bio import Align,SeqIO
from collections import defaultdict, Counter
import sys
from operator import itemgetter
import medaka_wrap
import bulkPlasmidSeq
def run(fastq_reads, reference, output_directory, args):
k = int(args.kmer_length)
run_medaka_binned(fastq_reads, reference, k, output_directory, args,
args.match, args.mismatch, args.gap_open, args.gap_extend,
args.context_map, args.fine_map, int(args.max_regions))
return
def define_aligner(match = 3, mismatch = -6, open_gap = -10, extend = -5):
'''
Nanopore read scoring
- match = 3
- mismatch = -6
- open_gap = -10
- extend_gap = -5
Returns Biopython PairwiseAligner
- mode = local
'''
aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.match_score = int(match)
aligner.mismatch_score = int(mismatch)
aligner.open_gap_score = int(open_gap)
aligner.extend_gap_score = int(extend)
return aligner
def getFasta(file):
plasmids = defaultdict(list)
for plasmid_ref in SeqIO.parse(file, "fasta"):
plasmids[plasmid_ref.id] = plasmid_ref.seq
return plasmids
def reverse_complement(seq):
"""
Swapping adenine with thymine and guanine with cytosine.
Reversing newly generated string
"""
mapping = str.maketrans('ATCG', 'TAGC')
return seq.translate(mapping)[::-1]
def build_kmers(file, k):
'''
Called by unique_kmers.
Returns dictionary of all kmers in plasmids.
'''
seqs = getFasta(file)
plasmids = defaultdict(list)
for x in seqs:
plasmids[x] = [(seqs[x][i : i + k],i) for i in range(0, len(seqs[x]) - k + 1)]
return plasmids
def unique_kmers(file, k):
'''
Called by define markers, calls build_kmers.
Returns a default dict of unique kmers
'''
plasmids = build_kmers(file, k)
unique_kmers = defaultdict(list)
for x in plasmids:
# for other in plasmids:
# for a in plasmids[other]:
# print(a[0])
op = [bc[0] for other in plasmids for bc in plasmids[other] if other != x]
for marker in plasmids[x]:
if marker[0] not in op and reverse_complement(str(marker[0])) not in op:
unique_kmers[x].append(marker)
return unique_kmers
def define_markers(file, k, max_regions):
'''
Called by align_reads. Calls unique_kmers to return all the unique regions for plasmids.
Processes the unique regions to get the start, end, length, sequence, and context sequence for the
regions to align to. These region are then sorted by length and the top [max_regions] are selected.
If the regions are long enough (> ~50bp) would be better to use medaka to distinguish plasmids.
Due to high identity scores required for binning, longer regions return fewer reads.
- Lower score context and fine map score
- Use medaka
Number reads * max_regions
Returns default dict (list) of best [max_regions, default 3] regions.
Format {'plasmid_name': [(start, end), len, unique_seq, context_seq]}
'''
kmers = unique_kmers(file, k)
seqs = getFasta(file)
intervals = defaultdict(list)
#print('Found %d plasmids' % len(unique_kmers))
for plasmid in kmers:
save_start = kmers[plasmid][0][1]
save_end = kmers[plasmid][-1][1]
moving = save_start
for i in range(len(kmers[plasmid])):
#print(save_start, moving, save_end, kmers[plasmid][i][1])
if moving == kmers[plasmid][i][1]:
moving +=1
if moving == save_end:
intervals[plasmid].append((save_start, moving))
else:
intervals[plasmid].append((save_start, moving))
moving = kmers[plasmid][i+1][1]
save_start = moving
#Get some marker definitions
best_markers = defaultdict(list)
for x in intervals:
#Keep the longest unique interval
#Format = {'plasmid_name': [(start, end), len, unique_seq, context_seq]}
#best_markers[x]= [(0, 1), 1, 'None', 'None']
for y in intervals[x]:
length_marker = y[1]-y[0]
# print(length_marker)
# print(y)
#print(k)
#Calculate unique length based on k
uniq_length = length_marker - k + 4
if uniq_length >= 3:
#print(uniq_length)
marker_start = round(y[0]+k-4)
marker_end = round(y[1])
marker_seq = seqs[x][marker_start:marker_end]
if marker_start-10 < 0:
print('Warning! Limited context available, markers near end of provided reference')
context_seq = seqs[x][0:marker_end+10]
elif marker_end+10 > len(seqs[x]):
print('Warning! Limited context available, markers near end of provided reference')
context_seq = seqs[x][marker_start-10:]
else:
context_seq = seqs[x][marker_start-10:marker_end+10]
#print('Adding ' + context_seq + 'here')
best_markers[x].append([(marker_start, marker_end), uniq_length, marker_seq, context_seq])
for item in best_markers:
unsorted_markers_list = best_markers[item]
length_sorted_markers = sorted(unsorted_markers_list, key = itemgetter(2))
if len(length_sorted_markers) >= max_regions:
best_markers[item] = length_sorted_markers[:max_regions]
else:
best_markers[item] = length_sorted_markers
return best_markers
def align_reads(fastq_reads, fasta_ref, k, match, mismatch, gap_open, gap_extend,
context_score, fine_alignment_score, max_regions):
'''
Called by write bins. Calls the define_aligner and define_markers. Gets the regions to score across and
uses those for Biopython PairwiseAligner() alignment to all of the provided reads.
Returns dictionary with plasmids (references) as keys and reads as values
If there is only one plasmid, (or if the length of unique sequence is too long), return None to run medaka.
'''
reads_dict = defaultdict(list)
aligner = define_aligner(match, mismatch, gap_open, gap_extend)
#Get the context and unique sequence from define makers
markers_dict = define_markers(fasta_ref, k, max_regions)
#Leave the biobin steps here if there is only 1 plasmid
if len(markers_dict) == 1:
print('Single plasmid found, passing all reads and reference to medaka')
return None
for plasmid in markers_dict:
for marker_region in markers_dict[plasmid]:
context_fwd_marker = marker_region[3]
context_rev_marker = reverse_complement(str(context_fwd_marker))
#Match with context
best_score = aligner.score(context_fwd_marker, context_fwd_marker)
#Iterate through each plasmid, then each read
for record in SeqIO.parse(fastq_reads, 'fastq'):
#Determines if forward or backward is better without doing alignment
scores = [aligner.score(record.seq, context_fwd_marker),
aligner.score(record.seq, context_rev_marker)]
top_score = (max(scores), scores.index(max(scores)))
if top_score[0] >= float(context_score)*best_score:
#Define top possible score for unique vs unique
best_fine_score = aligner.score(marker_region[2], marker_region[2])
#If the forward marker had the best score
if top_score[1] == 0:
top_fwd_alignment = sorted(aligner.align(record.seq, context_fwd_marker))[0]
#Get the sequence that the aligned to the larger region
subseq = record.seq[top_fwd_alignment.aligned[0][0][0]:top_fwd_alignment.aligned[0][-1][-1]]
#Possible add length subseq check
if aligner.score(subseq, marker_region[2]) >= float(fine_alignment_score)*best_fine_score:
reads_dict[plasmid].append(record)
#If the reverse marker has the best score
if top_score[1] == 1:
#Get the sequence that the aligned to the larger region
top_rev_alignment = sorted(aligner.align(record.seq, context_fwd_marker))[0]
subseq = record.seq[top_rev_alignment.aligned[0][0][0]:top_rev_alignment.aligned[0][-1][-1]]
if aligner.score(subseq, reverse_complement(str(marker_region[2]))) >= float(fine_alignment_score)*best_fine_score:
reads_dict[plasmid].append(record)
if not bool(reads_dict):
sys.exit("No reads were assigned to any plasmid!")
return reads_dict
def write_bins(fastq_reads, reference, k, output_directory, match = 3,
mismatch = -6, gap_open = -10, gap_extend = -5,
context_map = 0.70, fine_map = 0.80, max_regions = 3):
reads_dict = align_reads(fastq_reads, reference, k, match, mismatch, gap_open, gap_extend, context_map, fine_map, max_regions)
if reads_dict == None:
return None
# output_directory = output_directory.strip('/')
output_bins = []
for plasmid in reads_dict:
output_bins.append(output_directory+'/%s' % plasmid)
with open(output_directory+'/%s.fastq' % plasmid, 'w') as out_bin:
SeqIO.write(reads_dict[plasmid], out_bin, "fastq")
out_bin.close()
return output_bins
def run_medaka_binned(fastq_reads, reference, k, output_directory, args,
match = 3, mismatch = -6, gap_open = -10, gap_extend = -5,
context_map = 0.80, fine_map = 0.95, max_regions = 3):
output_bins = write_bins(fastq_reads, reference, k, output_directory, match, mismatch, gap_open, gap_extend,
context_map, fine_map, max_regions)
if output_bins == None:
#If the alignment step was broken for single plasmid or long unique seq, just pass the reads and referense to medaka
medaka_wrap.runMedaka(fastq_reads,
reference,
output_directory,
args.threads,
args.model,
args.igv)
return
with open(reference) as ref_list:
#for record, plasmid_bin in zip(SeqIO.parse(ref_list, 'fasta'), output_bins):
for record in SeqIO.parse(ref_list, 'fasta'):
for plasmid_bin in output_bins:
plasmid_name = plasmid_bin.split('/')[-1]
if plasmid_name == record.name:
#for plasmid_bin in output_bins:
single_reference = SeqIO.write(record, '%s.fasta' % plasmid_bin, 'fasta')
medaka_wrap.runMedaka(plasmid_bin + '.fastq', '%s.fasta'% plasmid_bin,
plasmid_bin+'_consensus',
args.threads,
args.model, args.igv)
return