Skip to content

Commit

Permalink
Merge pull request #68 from mbhall88/master
Browse files Browse the repository at this point in the history
Added a function to calculate GC content of a sequence
  • Loading branch information
martinghunt authored Feb 19, 2018
2 parents d432535 + abeb7e0 commit 892f4b9
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 3 deletions.
44 changes: 42 additions & 2 deletions pyfastaq/sequences.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import copy
import re
import string
import random
import itertools

from collections import Counter
from pyfastaq import utils, intervals, genetic_codes

class Error (Exception): pass
Expand Down Expand Up @@ -465,6 +464,47 @@ def translate(self, frame=0):
'''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))

def gc_content(self, as_decimal=True):
"""Returns the GC content for the sequence.
Notes:
This method ignores N when calculating the length of the sequence.
It does not, however ignore other ambiguous bases. It also only
includes the ambiguous base S (G or C). In this sense the method is
conservative with its calculation.
Args:
as_decimal (bool): Return the result as a decimal. Setting to False
will return as a percentage. i.e for the sequence GCAT it will
return 0.5 by default and 50.00 if set to False.
Returns:
float: GC content calculated as the number of G, C, and S divided
by the number of (non-N) bases (length).
"""
gc_total = 0.0
num_bases = 0.0
n_tuple = tuple('nN')
accepted_bases = tuple('cCgGsS')

# counter sums all unique characters in sequence. Case insensitive.
for base, count in Counter(self.seq).items():

# dont count N in the number of bases
if base not in n_tuple:
num_bases += count

if base in accepted_bases: # S is a G or C
gc_total += count

gc_content = gc_total / num_bases

if not as_decimal: # return as percentage
gc_content *= 100

return gc_content



class Embl(Fasta):
'''Exactly the same as Fasta, but reading seqs from a file works differently'''
Expand Down
13 changes: 13 additions & 0 deletions pyfastaq/tests/sequences_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,19 @@ def test_split_capillary_id(self):
fa = sequences.Fasta('name', 'A')
fa.split_capillary_id()

def test_gc_content(self):
"""Test GC content calculation works as expected"""
tests = [
(sequences.Fasta('ID', 'cgCG'), 1.0),
(sequences.Fasta('ID', 'tTaA'), 0.0),
(sequences.Fasta('ID', 'GCAT'), 0.5),
(sequences.Fasta('ID', 'GCATNN'), 0.5),
(sequences.Fasta('ID', 'GCATNNS'), 0.6),
(sequences.Fasta('ID', 'GCATNNSK'), 0.5)
]
for test, answer in tests:
self.assertAlmostEqual(test.gc_content(), answer)
self.assertAlmostEqual(test.gc_content(as_decimal=False), answer * 100)

class TestEmbl(unittest.TestCase):
def test_get_id_from_header_line(self):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name='pyfastaq',
version='3.16.0',
version='3.17.0',
description='Script to manipulate FASTA and FASTQ files, plus API for developers',
packages = find_packages(),
author='Martin Hunt',
Expand Down

0 comments on commit 892f4b9

Please sign in to comment.