diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py index ee20b7c..d7e2c07 100644 --- a/pyfastaq/sequences.py +++ b/pyfastaq/sequences.py @@ -1,9 +1,8 @@ import copy import re -import string import random import itertools - +from collections import Counter from pyfastaq import utils, intervals, genetic_codes class Error (Exception): pass @@ -465,6 +464,47 @@ def translate(self, frame=0): '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2''' return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)])) + def gc_content(self, as_decimal=True): + """Returns the GC content for the sequence. + Notes: + This method ignores N when calculating the length of the sequence. + It does not, however ignore other ambiguous bases. It also only + includes the ambiguous base S (G or C). In this sense the method is + conservative with its calculation. + + Args: + as_decimal (bool): Return the result as a decimal. Setting to False + will return as a percentage. i.e for the sequence GCAT it will + return 0.5 by default and 50.00 if set to False. + + Returns: + float: GC content calculated as the number of G, C, and S divided + by the number of (non-N) bases (length). + + """ + gc_total = 0.0 + num_bases = 0.0 + n_tuple = tuple('nN') + accepted_bases = tuple('cCgGsS') + + # counter sums all unique characters in sequence. Case insensitive. + for base, count in Counter(self.seq).items(): + + # dont count N in the number of bases + if base not in n_tuple: + num_bases += count + + if base in accepted_bases: # S is a G or C + gc_total += count + + gc_content = gc_total / num_bases + + if not as_decimal: # return as percentage + gc_content *= 100 + + return gc_content + + class Embl(Fasta): '''Exactly the same as Fasta, but reading seqs from a file works differently''' diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py index 8e4e18b..2663df7 100644 --- a/pyfastaq/tests/sequences_test.py +++ b/pyfastaq/tests/sequences_test.py @@ -520,6 +520,19 @@ def test_split_capillary_id(self): fa = sequences.Fasta('name', 'A') fa.split_capillary_id() + def test_gc_content(self): + """Test GC content calculation works as expected""" + tests = [ + (sequences.Fasta('ID', 'cgCG'), 1.0), + (sequences.Fasta('ID', 'tTaA'), 0.0), + (sequences.Fasta('ID', 'GCAT'), 0.5), + (sequences.Fasta('ID', 'GCATNN'), 0.5), + (sequences.Fasta('ID', 'GCATNNS'), 0.6), + (sequences.Fasta('ID', 'GCATNNSK'), 0.5) + ] + for test, answer in tests: + self.assertAlmostEqual(test.gc_content(), answer) + self.assertAlmostEqual(test.gc_content(as_decimal=False), answer * 100) class TestEmbl(unittest.TestCase): def test_get_id_from_header_line(self): diff --git a/setup.py b/setup.py index 7fb94e3..1c8fca5 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='pyfastaq', - version='3.16.0', + version='3.17.0', description='Script to manipulate FASTA and FASTQ files, plus API for developers', packages = find_packages(), author='Martin Hunt',