from Bio.Seq import Seq
from Bio.SeqUtils.lcc import lcc_simp
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from subprocess import Popen, PIPE
import os

try:
    from Bio.SeqUtils import GC
except ImportError:
    # fallback for newer Biopython versions where GC() is no longer available
    from Bio.SeqUtils import gc_fraction

    def GC(x):
        return gc_fraction(x, 'ignore') * 100

import queue
import logging

def complex_enough(seq):
    complexity = lcc_simp(seq.upper())
    if complexity < 1.25:
        return False
    gc = GC(seq.upper())
    if gc < 15 or gc > 95:
        return False
    return True
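
# Rough, illustrative behaviour of complex_enough() (lcc_simp returns ~0 for a
# homopolymer and ~2 when all four bases are equally frequent; values shown are
# approximate):
#   complex_enough("AAAAAAAAAAAAAAAA")  -> False  (low complexity)
#   complex_enough("GCGCGCGCGCGCGCGC")  -> False  (only two bases, GC = 100%)
#   complex_enough("ACGTTGCAACGTTGCA")  -> True   (all four bases, GC = 50%)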

def makelog(stri, do_print=True):
    if do_print:
        print(stri)
    logging.debug(stri)

def findIR(q, args, l_lock, candidates, perc_seq, last_perc_seq):
    while True:
        try:
            seq, seq_fs, split_index, record_id, seq_len, count = q.get(timeout=15)
        except queue.Empty:
            break
        splited_len = len(seq)
        seq_rc = str(Seq(seq).reverse_complement())
        if not complex_enough(seq):
            q.task_done()
            continue
        record_q = SeqRecord(Seq(seq), id=record_id)
        record_s = SeqRecord(Seq(seq_rc), id=record_id + "_rc")
        query_filename = "results/" + args.jobname + "/tmp/query" + str(record_id + "_" + str(split_index)) + ".tmp"
        subject_filename = "results/" + args.jobname + "/tmp/subject" + str(record_id + "_" + str(split_index)) + ".tmp"
        SeqIO.write(record_q, query_filename, "fasta")
        SeqIO.write(record_s, subject_filename, "fasta")
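        # The window (query) is aligned against its own reverse complement
        # (subject); with '-strand plus' every reported hit therefore marks a
        # candidate inverted-repeat pair inside this window.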
        cmd_list = [
            'blastn',
            '-query', query_filename,
            '-subject', subject_filename,
            '-reward', '2',
            #'-max_target_seqs','1',
            '-penalty', '-4',
            '-word_size', '4',
            #'-ungapped',
            #'-evalue','140',
            '-strand', 'plus',
            #'-soft_masking','false',
            #'-dust','no',
            '-outfmt', "6 sstart send qstart qend score length mismatch gaps gapopen nident"]
        p = Popen(cmd_list, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
        if err:
            print(split_index, record_id, seq_len)
            makelog("BLASTN error: %s" % (err, ))
        os.remove(query_filename)
        os.remove(subject_filename)
        lines = out.splitlines()
        for row in lines:
            row = row.split()
            sstart = int(row[0])
            send = int(row[1])
            qstart = int(row[2])
            qend = int(row[3])
            score = int(row[4])
            length = int(row[5])
            mismatch = int(row[6])
            gaps = int(row[7])
            # keep only valid IRs
            if length < args.align_min_len:
                continue
            # transform subject coordinates because the subject was reverse complemented
            sstart = splited_len - sstart
            send = splited_len - send
            # obtain IR sequences
            seq_q = seq[qstart:qend]
            seq_q_prime = seq[send:sstart]
            qstart, qend = min(qstart, qend), max(qstart, qend)
            sstart, send = min(sstart, send), max(sstart, send)
            # organize positions
            ir_start = min(qstart, qend, sstart, send)
            ir_end = max(qstart, qend, sstart, send)
            # calculate length
            ir_len = ir_end - ir_start
            # length constraints
            if ir_len > args.mite_max_len:
                continue
            if ir_len < args.mite_min_len:
                continue
            #ir_seq = seq[ir_start:ir_end]
            # validate complexity again; a value of 1 means only two different nucleotides are present
            if not complex_enough(seq_q):
                continue
            if not complex_enough(seq_q_prime):
                continue
            # validate TSD outside the TIRs
            i = args.tsd_max_len
            valid_tsd = False
            while i >= args.tsd_min_len:
                tsd_one = seq_fs[ir_start - i + args.FSL:ir_start + args.FSL]
                tsd_two = seq_fs[ir_end + args.FSL:ir_end + i + args.FSL]
                if tsd_one.lower() == tsd_two.lower():
                    valid_tsd = True
                    mite_pos_one = ir_start + args.FSL
                    mite_pos_two = ir_end + args.FSL
                    tsd_in = 'no'
                    break
                i -= 1
            # validate TSD inside the TIRs
            # TSDs cannot be a large part of the TIRs
            if not valid_tsd:
                i = args.tsd_max_len
                while i >= args.tsd_min_len:
                    tsd_one = seq_fs[ir_start + args.FSL:ir_start + i + args.FSL]
                    tsd_two = seq_fs[ir_end - i + args.FSL:ir_end + args.FSL]
                    if tsd_one.lower() == tsd_two.lower():
                        valid_tsd = True
                        mite_pos_one = ir_start + args.FSL + i
                        mite_pos_two = ir_end + args.FSL - i
                        tsd_in = 'yes'
                        break
                    i -= 1
            # no TSD found: discard this hit
            if not valid_tsd:
                continue
            ir_seq = seq_fs[mite_pos_one:mite_pos_two]
            if not complex_enough(ir_seq):
                continue
            #ir_seq = seq_fs[mite_pos_one - args.FSL:mite_pos_two + args.FSL]
            ir_len = mite_pos_two - mite_pos_one
            fs_start = max(0, mite_pos_one - args.FSL)
            flanking_seq_left = seq_fs[fs_start:mite_pos_one]
            flanking_seq_right = seq_fs[mite_pos_two:mite_pos_two + args.FSL]
            if len(flanking_seq_right) < args.FSL or len(flanking_seq_left) < args.FSL:
                continue
            # calculate positions in the full sequence
            mite_start_full = mite_pos_one + split_index - args.FSL
            mite_end_full = mite_pos_two + split_index - args.FSL
            new_element = {
                'start': mite_start_full,
                'end': mite_end_full,
                'seq': ir_seq,
                'record': record_id,
                'mite_len': ir_len,
                'tir1_start': mite_start_full,
                'tir1_end': mite_start_full + length,
                'tir2_start': mite_end_full - length,
                'tir2_end': mite_end_full,
                'tir1_seq': seq_q,
                'tir2_seq': seq_q_prime,
                'tsd': tsd_one,
                'tsd_in': tsd_in,
                'fs_left': flanking_seq_left,
                'fs_right': flanking_seq_right,
                'tir_len': length,
            }
            with l_lock:
                index = "%s_%i" % (record_id, count)
                # we don't want overlapping TIRs; keep the broader element
                is_nested = False
                has_nested = False
                for curr_count in range(count - 1, count + 2):
                    curr_index = "%s_%i" % (record_id, curr_count)
                    if curr_index in candidates:
                        # iterate over a copy so nested candidates can be removed safely
                        for candidate in list(candidates[curr_index]):
                            # the new element is nested inside another candidate's TIRs
                            if (new_element['start'] >= candidate['tir1_start'] and
                                    new_element['start'] <= candidate['tir1_end'] and
                                    new_element['end'] >= candidate['tir2_start'] and
                                    new_element['end'] <= candidate['tir2_end'] and
                                    new_element['record'] == candidate['record']):
                                is_nested = True
                            # another candidate is nested inside the new element's TIRs
                            if (candidate['start'] >= new_element['tir1_start'] and
                                    candidate['start'] <= new_element['tir1_end'] and
                                    candidate['end'] >= new_element['tir2_start'] and
                                    candidate['end'] <= new_element['tir2_end'] and
                                    candidate['record'] == new_element['record']):
                                has_nested = True
                                candidates[curr_index].remove(candidate)
                if has_nested or not is_nested:
                    if index not in candidates:
                        candidates[index] = []
                    candidates[index].append(new_element)
        curr_perc = int(split_index * 100 / seq_len)
        # progress reporting: log the first time a record is seen, then every 10%
        if record_id not in perc_seq or record_id not in last_perc_seq:
            perc_seq[record_id] = curr_perc
            last_perc_seq[record_id] = curr_perc
            makelog(record_id + " " + str(curr_perc) + "%")
        if perc_seq[record_id] - last_perc_seq[record_id] >= 10:
            makelog(record_id + " " + str(curr_perc) + "%")
            last_perc_seq[record_id] = curr_perc
        perc_seq[record_id] = curr_perc
        q.task_done()
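
# ---------------------------------------------------------------------------
# Hypothetical driver sketch (not part of the original pipeline). It shows one
# way findIR() could be fed: the genome is cut into overlapping windows, each
# window is pushed onto a shared queue, and a few worker threads consume it.
# The window size, overlap, argparse defaults and the "results/<jobname>/tmp"
# bootstrap below are illustrative assumptions; it also assumes the blastn
# binary is on PATH. Edge windows are handled naively (the first window lacks
# the left flank), so treat this strictly as a usage sketch.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse
    import threading

    parser = argparse.ArgumentParser(description="Toy driver for findIR()")
    parser.add_argument("genome", help="FASTA file to scan")
    parser.add_argument("--jobname", default="demo")
    parser.add_argument("--workers", type=int, default=2)
    parser.add_argument("--align_min_len", type=int, default=10)
    parser.add_argument("--mite_min_len", type=int, default=50)
    parser.add_argument("--mite_max_len", type=int, default=800)
    parser.add_argument("--tsd_min_len", type=int, default=2)
    parser.add_argument("--tsd_max_len", type=int, default=10)
    parser.add_argument("--FSL", type=int, default=50, help="flanking sequence length")
    args = parser.parse_args()

    os.makedirs("results/%s/tmp" % args.jobname, exist_ok=True)
    logging.basicConfig(filename="results/%s/out.log" % args.jobname, level=logging.DEBUG)

    q = queue.Queue()
    l_lock = threading.Lock()
    candidates, perc_seq, last_perc_seq = {}, {}, {}

    # Split every record into overlapping windows; seq_fs carries FSL extra
    # bases on each side so TSDs and flanks can be read outside the window.
    window, overlap = 5000, 1000  # assumed values
    for rec in SeqIO.parse(args.genome, "fasta"):
        full = str(rec.seq)
        count = 0
        for start in range(0, len(full), window - overlap):
            seq = full[start:start + window]
            seq_fs = full[max(0, start - args.FSL):start + window + args.FSL]
            q.put((seq, seq_fs, start, rec.id, len(full), count))
            count += 1

    workers = [threading.Thread(target=findIR,
                                args=(q, args, l_lock, candidates, perc_seq, last_perc_seq))
               for _ in range(args.workers)]
    for w in workers:
        w.start()
    q.join()      # wait until every window has been processed
    for w in workers:
        w.join()  # workers exit once q.get() times out on the empty queue
    makelog("candidate elements found: %i" % sum(len(v) for v in candidates.values()))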