# test_gene_finder.py
"""
Test library functions to find and identify protein-coding genes in DNA.
"""
from collections import Counter
import pytest
from gene_finder import (
get_complement,
get_reverse_complement,
rest_of_orf,
find_all_orfs_one_frame,
find_all_orfs,
find_all_orfs_both_strands,
find_longest_orf,
encode_amino_acids,
)
# Define sets of test cases.
# Every nucleotide paired with its expected complement. Zipping the two strings
# yields ("A", "T"), ("C", "G"), ("T", "A"), ("G", "C"), in that order.
get_complement_cases = list(zip("ACTG", "TGAC"))
get_reverse_complement_cases = [
    # A one-letter strand reads the same in both directions, so its reverse
    # complement is just its complement: A->T, C->G, T->A, G->C.
    *zip("ACTG", "TGAC"),
    # Two nucleotides: each one is complemented and their order is swapped.
    ("TG", "CA"),
    # A longer strand: complemented nucleotide by nucleotide, then reversed.
    ("ATGCCCGCTTT", "AAAGCGGGCAT"),
]
rest_of_orf_cases = [
    # ATG TGA: the stop codon comes right after the start codon.
    ("ATGTGA", "ATG"),
    # ATG AGG TGA: one codon sits between the start and the stop codon.
    ("ATGAGGTGA", "ATGAGG"),
    # ATG ATA GAA TGA: a longer strand that ends with a stop codon.
    ("ATGATAGAATGA", "ATGATAGAA"),
    # ATG AAA: no stop codon, so the whole strand is expected back.
    ("ATGAAA", "ATGAAA"),
    # ATG ATA G: no stop codon and a length that is not a multiple of 3; the
    # whole strand, trailing nucleotide included, is expected back.
    ("ATGATAG", "ATGATAG"),
]
find_all_orfs_one_frame_cases = [
    # ATG TGA: exactly one ORF.
    ("ATGTGA", ["ATG"]),
    # ATG TAA | ATG AAA TAA: two ORFs, each closed by a stop codon.
    ("ATGTAAATGAAATAA", ["ATG", "ATGAAA"]),
    # ATG TAA | ATG AAA TAA | ATG AGG GGA TGA: three ORFs.
    ("ATGTAAATGAAATAAATGAGGGGATGA", ["ATG", "ATGAAA", "ATGAGGGGA"]),
    # ATG AAA ATG GCA TGA: the second ATG lies inside the first ORF, so only
    # one ORF is expected.
    ("ATGAAAATGGCATGA", ["ATGAAAATGGCA"]),
]
find_all_orfs_cases = [
    # No other frame of this strand contains a start codon, so the result is
    # the same as in the find_all_orfs_one_frame case for the same strand.
    ("ATGTAAATGAAATAA", ["ATG", "ATGAAA"]),
    # Frame 0 holds one ORF with a nested start codon. Frame 2 reads
    # GAA AAT GGC ATG A: no stop codon follows that ATG, so its ORF runs to the
    # end of the strand and keeps the trailing "A" (the same rule that yields
    # "ATGGCAT" in the next case).
    ("ATGAAAATGGCATGA", ["ATGAAAATGGCA", "ATGA"]),
    # A slice of the strand above, shifted by one nucleotide. Its only ORF is
    # in frame 2 and runs to the end of the strand.
    ("TGAAAATGGCAT", ["ATGGCAT"]),
    # The same slice shifted by one more nucleotide: one ORF in frame 0 and one
    # in frame 1, both running to the end of the strand.
    ("GAAAATGGCATG", ["ATG", "ATGGCATG"]),
]
find_all_orfs_both_strands_cases = [
    # A short strand starting with a start codon whose reverse complement is
    # itself, so the same ORF is expected twice.
    ("ATGCAT", ["ATGCAT", "ATGCAT"]),
    # A strand that starts with a start codon and ends with a stop codon; its
    # reverse complement (TCACCCTTTTTTCAT) contains no start codon.
    ("ATGAAAAAAGGGTGA", ["ATGAAAAAAGGG"]),
    # A strand with a nested start codon. Forward frame 0 gives ATGAAAATGGCA.
    # Forward frame 2 reads GAA AAT GGC ATG A, so that ORF runs to the end of
    # the strand and keeps the trailing "A". The reverse complement
    # (TCATGCCATTTTCAT) gives ATGCCATTTTCAT in its frame 2.
    ("ATGAAAATGGCATGA", ["ATGAAAATGGCA", "ATGA", "ATGCCATTTTCAT"]),
    # A strand with no start or stop codon on either strand (its reverse
    # complement is TTTTTT), so no ORFs are expected.
    ("AAAAAA", []),
]
find_longest_orf_cases = [
    # The whole strand is a single ORF, so it is also the longest one.
    ("ATGAAAAAAAAA", "ATGAAAAAAAAA"),
    # One ORF that starts at the first ATG, contains a nested ATG, and ends at
    # the TAA stop codon.
    ("ATGAAAAAAAAAATGAAATAA", "ATGAAAAAAAAAATGAAA"),
    # Two ORFs in the same frame (ATG and ATGAAA); the second is the longer.
    ("ATGTAAATGAAATAA", "ATGAAA"),
    # The strand does not begin with a start codon; the longest ORF is
    # preceded by shorter ones.
    ("TGTAAATGAAATAAATGAGGGGATGA", "ATGAGGGGA"),
]
encode_amino_acids_cases = [
    # The start codon on its own.
    ("ATG", "M"),
    # Eleven nucleotides: three whole codons (ATG CCC GCT) plus a dangling
    # "TT".
    # NOTE(review): the expected value assumes the incomplete "TT" is still
    # translated to "F" rather than dropped - confirm against
    # encode_amino_acids.
    ("ATGCCCGCTTT", "MPAF"),
    # A strand with a second ATG inside it and a trailing stop codon (TGA),
    # which is expected to show up as "*".
    ("ATGAAAATGGCATGA", "MKMA*"),
]
# Define additional testing lists and functions that check other properties of
# functions in gene_finder.py.
@pytest.mark.parametrize("nucleotide", ["A", "T", "C", "G"])
def test_double_complement(nucleotide):
    """
    Check that complementing a nucleotide twice gives back the original.

    Args:
        nucleotide: A single-character string, one of "A", "T", "C", or "G",
            representing a DNA nucleotide.
    """
    complemented_once = get_complement(nucleotide)
    complemented_twice = get_complement(complemented_once)
    assert complemented_twice == nucleotide
################################################################################
# Don't change anything below these lines.
################################################################################
# Define standard testing functions to check functions' outputs given certain
# inputs defined above.
@pytest.mark.parametrize("nucleotide,complement", get_complement_cases)
def test_get_complement(nucleotide, complement):
    """
    Test that each nucleotide is mapped to its correct complement.

    Given a single-character string representing a nucleotide that is "A", "T",
    "G", or "C", check that the get_complement function correctly maps the
    string to a single-character string representing the nucleotide's complement
    (also "A", "T", "G", or "C"). The test runs once for every pair in
    get_complement_cases.

    Args:
        nucleotide: A single-character string equal to "A", "C", "T", or "G"
            representing a nucleotide.
        complement: A single-character string equal to "A", "C", "T", or "G"
            representing the expected complement of nucleotide.
    """
    assert get_complement(nucleotide) == complement
@pytest.mark.parametrize("strand,reverse_complement",
                         get_reverse_complement_cases)
def test_get_reverse_complement(strand, reverse_complement):
    """
    Test that a string of nucleotides gets mapped to its reverse complement.

    Check that, given a string consisting of "A", "C", "T", and "G" that
    represents a strand of DNA, the get_reverse_complement function correctly
    returns the reverse complement of the string, defined as the complement of
    each nucleotide in the strand in reverse order. The test runs once for every
    pair in get_reverse_complement_cases.

    Args:
        strand: A string consisting only of the characters "A", "C", "T", and
            "G" representing a strand of DNA.
        reverse_complement: A string representing the expected reverse
            complement of strand.
    """
    assert get_reverse_complement(strand) == reverse_complement
@pytest.mark.parametrize("strand,rest", rest_of_orf_cases)
def test_rest_of_orf(strand, rest):
    """
    Test that a string representing a strand of DNA gets mapped to the rest of
    its open reading frame.

    Check that, given a string representing a strand of DNA as defined above,
    the rest_of_orf function returns a string representing a strand of DNA for
    the rest of the given strand's open reading frame. This is the original
    strand up to the point where reading sets of three nucleotides results in a
    STOP codon, or the entire strand if no such codon appears when reading the
    strand. The test runs once for every pair in rest_of_orf_cases.

    Args:
        strand: A string representing a strand of DNA.
        rest: A string representing the expected rest of the open reading frame
            of strand, or the entirety of strand if reading it does not result
            in a STOP codon at any point.
    """
    assert rest_of_orf(strand) == rest
@pytest.mark.parametrize("strand,orfs", find_all_orfs_one_frame_cases)
def test_find_all_orfs_oneframe(strand, orfs):
    """
    Test that a string representing a strand of DNA gets mapped to a list of all
    non-overlapping open reading frames (ORFs) aligned to its frame.

    Check that, given a string representing a strand of DNA as defined above,
    the find_all_orfs_one_frame function returns a list of strings representing
    all non-overlapping ORFs in the strand that are aligned to the strand's
    frame (i.e., starting a multiple of 3 nucleotides from the start of the
    strand). Each ORF is a strand of DNA from a START codon to a STOP codon (or,
    in the case of the last ORF in the strand, to the end of the strand if no
    STOP codon is encountered during reading).

    Args:
        strand: A string representing a strand of DNA.
        orfs: A list of strings representing the expected strands of DNA that
            are ORFs within strand's frame.
    """
    # Compare as multisets: the order of the ORFs does not matter, but the
    # number of times each ORF appears does.
    assert Counter(find_all_orfs_one_frame(strand)) == Counter(orfs)
@pytest.mark.parametrize("strand,orfs", find_all_orfs_cases)
def test_find_all_orfs(strand, orfs):
    """
    Test that a string representing a strand of DNA gets mapped to a list of all
    open reading frames within the strand, with no overlapping ORFs within any
    given frame of the strand.

    Check that, given a string representing a strand of DNA as defined above,
    the find_all_orfs function returns a list of strings representing all ORFs
    in the strand as defined above. Overlapping ORFs are allowed as long as they
    occur in different frames (i.e., each ORF is only required to be
    non-overlapping with the other ORFs in its own frame).

    Args:
        strand: A string representing a strand of DNA.
        orfs: A list of strings representing the expected strands of DNA that
            are ORFs within strand, with no overlapping ORFs within one frame of
            strand.
    """
    # Compare as multisets: the order of the ORFs does not matter, but the
    # number of times each ORF appears does.
    assert Counter(find_all_orfs(strand)) == Counter(orfs)
@pytest.mark.parametrize("strand,orfs", find_all_orfs_both_strands_cases)
def test_find_all_orfs_both_strands(strand, orfs):
    """
    Test that a string representing a strand of DNA gets mapped to a list of
    all open reading frames within the strand or its reverse complement, with no
    overlapping ORFs within a given frame.

    Check that, given a string representing a strand of DNA as defined above,
    the find_all_orfs_both_strands function returns a list of strings
    representing all ORFs in the strand or its reverse complement as defined
    above.

    Args:
        strand: A string representing a strand of DNA.
        orfs: A list of strings representing the expected strands of DNA that
            are ORFs within strand or its reverse complement, with no
            overlapping ORFs within one frame of either.
    """
    # Compare as multisets: the order of the ORFs does not matter, but an ORF
    # expected twice (e.g. once per strand) must be returned twice.
    assert Counter(find_all_orfs_both_strands(strand)) == Counter(orfs)
@pytest.mark.parametrize("strand,orf", find_longest_orf_cases)
def test_find_longest_orf(strand, orf):
    """
    Test that a string representing a strand of DNA gets mapped to a string
    representing the longest ORF within the strand or its reverse complement.

    Check that, given a string representing a strand of DNA as defined above,
    the find_longest_orf function returns a string representing a strand of DNA
    equal to the longest ORF within the strand or its reverse complement. The
    test runs once for every pair in find_longest_orf_cases.

    Args:
        strand: A string representing a strand of DNA.
        orf: A string representing a strand of DNA equal to the expected longest
            ORF in strand or its reverse complement.
    """
    assert find_longest_orf(strand) == orf
@pytest.mark.parametrize("strand,protein", encode_amino_acids_cases)
def test_encode_amino_acids(strand, protein):
    """
    Test that a string representing a strand of DNA gets mapped to a string
    representing the amino acids encoded by the strand.

    Check that, given a string representing a strand of DNA as defined above,
    the encode_amino_acids function returns a string consisting of one-letter
    IUPAC amino acid codes corresponding to the sequence of amino acids encoded
    by the strand. The test runs once for every pair in
    encode_amino_acids_cases.

    Args:
        strand: A string representing a strand of DNA.
        protein: A string representing the expected sequence of one-letter IUPAC
            amino acid codes encoded by strand.
    """
    assert encode_amino_acids(strand) == protein