-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgene_finder.py
304 lines (242 loc) · 9.48 KB
/
gene_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
"""
Library for finding potential genes in a strand of DNA.
"""
from tqdm import tqdm_notebook as tqdm
from helpers import amino_acid, shuffle, load_fasta_file
tqdm().pandas()
def get_complement(nucleotide):
"""
Takes a string nucleotide consisting of a single character
A, T, C, or G, each representing the DNA nucleotides
adenine, thymine, cytosine, and guanine,
and return a string (also consisting of a single character)
representing the complementary nucleotide.
In DNA, A and T are complementary to each other,
as are C and G
Args:
- nucleotide: a string that represents the DNA nucleotides
in a certain sequence with A, T, C, or G.
Returns:
- a string (consisting of a single character)
representing the complementary nucleotide.
"""
complement = ""
if nucleotide in 'A':
complement = 'T'
elif nucleotide in 'T':
complement = 'A'
elif nucleotide in 'G':
complement = 'C'
else:
complement = 'G'
return complement
def get_reverse_complement(strand):
"""
Takes a string strand representing a single strand of DNA
(consisting entirely of the characters A, T, C, and G)
and returns a string representing a complementary strand of DNA
Args:
- strand: a string that represents a string with multiple
characters that are a single strand of DNA
Returns:
- a string representing a complementary strand of DNA
"""
reverse_sequence = ''
for base in reversed(strand):
reverse_sequence += get_complement(base)
return reverse_sequence
def rest_of_orf(strand):
"""
Takes a string strand representing a strand of DNA
that begins with a start codon and returns the sequence
of nucleotides representing the rest of the ORF,
up to but not including the next stop codon
which include 'TAA', 'TAG', 'TGA.'
If there is no stop codon, the function
returns the entire strand.
Args:
- strand: a string representing a strand of DNA
Returns:
- a string that begins with a start codon
and returns the rest of the opening reading frame
but not including the next stop codon.
"""
# Makes the strand list which is the strand
# into a list for each triplet
strand_list = [strand[nucleotide:nucleotide+3]
for nucleotide in range(0, len(strand), 3)]
# For each value in the strand list
for codon_index, codon in enumerate(strand_list):
# if the return value of function "amino_acid"
# is found in stop_codons list, then stop loop
if amino_acid(codon) == "*":
strand_list = strand_list[0:codon_index]
break
# Return the list as one big string
return ''.join(strand_list)
def find_all_orfs_one_frame(strand):
"""
Takes a string strand representing a strand of DNA
and returns a list of strings representing
all in-frame ORFs found in that strand
Args:
- strand: a string representing a strand of DNA
Returns:
- a list of strings representing
all in-frame ORFs found in that strand.
"""
# Make a list that will contain all orfs
frame = ""
in_frame_orfs = []
# while the strand still can return an orf
while len(strand) >= 3:
current_codon = strand[:3]
if current_codon != "ATG":
strand = strand[3:]
continue
# run frame of orf onto strand and get
frame = rest_of_orf(strand)
strand = strand[len(frame):]
# append the strand to the string when completed
in_frame_orfs.append(frame)
# return string when completed
return in_frame_orfs
def find_all_orfs(strand):
"""
Takes a string strand representing a strand of DNA
and returns a list of strings representing
not only in-frame ORFs, but also ORFs
found one or two nucleotides from the start of strand
Args: a string "strand" representing a strand of DNA
Returns: a list of strings representing
all ORFs found in that strand
"""
all_orfs = []
# go through every codon from the start
# and find all the orfs in that frame
# when completed, shift the strand by one
for start in range(3):
all_orfs += find_all_orfs_one_frame(strand[start:])
return all_orfs
def find_all_orfs_both_strands(strand):
"""
Takes a string strand representing a strand of DNA
and returns a list of strings representing all ORFs
found in strand or its reverse complement.
Args: a string strand representing a strand of DNA
Returns: a list of strings representing all ORFs
found in strand or its reverse complement.
"""
# create an empty list that will contain all the ORFs
all_frame_orfs_both_strands = []
# run the function twice for the original and reverse strand
for _ in range(2):
# add the function output into the list
all_frame_orfs_both_strands.append(find_all_orfs(strand))
strand = get_reverse_complement(strand)
# combine the nested list into a single list
all_frame_orfs_both_strands = [
orf for orf_list in all_frame_orfs_both_strands for orf in orf_list]
return all_frame_orfs_both_strands
def find_longest_orf(strand):
"""
Finds the longest ORF found in either the strand
or its reverse complement.
Args: a string strand representing a strand of DNA
Returns: the length of the longest ORF found in either
that strand or its reverse complement.
"""
# run find_all_orfs_both_strands into the list_of_strings
list_of_orfs = find_all_orfs_both_strands(strand)
longest_orf_length = 0
longest_orf = ""
# Go through the whole list
for orf in list_of_orfs:
# Checking for the longest word(string)
if len(orf) > longest_orf_length:
longest_orf_length = len(orf)
longest_orf = orf
return longest_orf
def noncoding_orf_threshold(strand, num_trials):
"""
Takes a string strand representing a strand of DNA
and a positive integer num_trials representing a number
of trials to run. For each of these trials, it randomly
shuffles the nucleotides in strand and finds the longest ORF
in either the shuffled strand or its reverse complement.
It then keeps track of the minimum length of this value over
all of the trials and returns an integer representing
this minimum length.
Args:
- a string strand representing a strand of DNA
- a positive integer num trials representing a number
of trials to run
Returns: an integer representing the minimum length
of the value over all of the trials
"""
orfs_lengths = []
for _ in tqdm(range(num_trials)):
# shuffles the strand
shuffled_strand = shuffle(strand)
# finds the longest ORF in either the shuffled strand or its reverse complement.
longest_orf = find_longest_orf(shuffled_strand)
# keeps track of the minimum length of this value over all of the trials
length_of_longest_orf = len(longest_orf)
# returns an integer representing this minimum length
orfs_lengths.append(length_of_longest_orf)
# returns an integer representing this minimum length
return min(orfs_lengths)
def encode_amino_acids(orf):
"""
Determines the sequence of amino acids these ORFs encode
Each of these sequences is potentially a protein of interest
in the genome, and will be a candidate for further analysis.
It is possible that an ORF's length will not be an exact multiple of 3,
due to reading an ORF from an offset.
In this case, there will be 1 or 2 nucleotides left at the end
your implementation should simply ignore these.
Args: a string orf representing a strand of DNA that is an ORF
Returns: a string representing the sequence of amino acids
with each amino acid written as its one-letter symbol.
"""
# empty string for the amino acids
amino_acids = ""
# run the amino acid function for each codon
# and add it to the string
for triple in range(0, len(orf), 3):
codon = orf[triple:triple + 3]
amino_acids += amino_acid(codon)
# return the string when completed
return amino_acids
def find_genes(path):
"""
Finds potential protein-coding genes by
1. Loading the string of nucleotides from the file.
2. Determining the threshold length to use as a cutoff
for coding ORFs, using 1,500 trials of shuffling
the nucleotide sequence.
3. Find all ORFs in both strands of the sequence.
4. For any ORF longer than the cutoff length,
translate the ORF to a sequence of amino acids.
Args: the location of a file in a FASTA format
Returns: list of all such amino acid sequences.
"""
# to be returned
amino_acid_sequences = []
# Loading the string of nucleotides from the file.
genes = load_fasta_file(path)
# Determining the threshold length to use as a cutoff
# for coding ORFs, using 1,500 trials of shuffling
# the nucleotide sequence.
threshold_length = noncoding_orf_threshold(genes, 1500)
# Find all ORFs in both strands of the sequence.
all_orfs_list = find_all_orfs_both_strands(genes)
# For any ORF longer than the cutoff length,
# translate the ORF to a sequence of amino acids.
# tqdm is used for the progress bar in the
# 'sars-cov-2' file
for orf in tqdm(all_orfs_list):
if len(orf) > threshold_length:
protein_orf = encode_amino_acids(orf)
amino_acid_sequences.append(protein_orf)
return amino_acid_sequences