From edd743e5dd4af978101c2f116bbe455ee54d4395 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Fri, 16 Feb 2018 14:24:59 +0000 Subject: [PATCH 1/3] add gc content function to fasta class --- pyfastaq/sequences.py | 42 ++++++++++++++++++++++++++++++-- pyfastaq/tests/sequences_test.py | 13 ++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py index ee20b7c..8fb729b 100644 --- a/pyfastaq/sequences.py +++ b/pyfastaq/sequences.py @@ -1,9 +1,8 @@ import copy import re -import string import random import itertools - +from collections import Counter from pyfastaq import utils, intervals, genetic_codes class Error (Exception): pass @@ -465,6 +464,45 @@ def translate(self, frame=0): '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2''' return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)])) + def gc_content(self, as_decimal=True): + """Returns the GC content for the sequence. + Notes: + This method ignores N when calculating the length of the sequence. + It does not, however ignore other ambiguous bases. It also only + includes the ambiguous base S (G or C). In this sense the method is + conservative with it's calculation. + + Args: + as_decimal (bool): Return the result as a decimal. Setting to False + will return as a percentage. i.e for the sequence GCAT it will + return 0.5 by default and 50.00 if set to False. + + Returns: + float: GC content calculated as the number of G, C, and S divided + by the number of (non-N) bases (length). + + """ + gc_total = 0.0 + num_bases = 0.0 + + # counter sums all unique characters in sequence. Case insensitive. + for base, count in Counter(self.seq).items(): + + # dont count N in the number of bases + if base not in tuple('nN'): + num_bases += count + + if base in tuple('cCgGsS'): # S is a G or C + gc_total += count + + gc_content = gc_total / num_bases + + if not as_decimal: # return as percentage + gc_content *= 100 + + return gc_content + + class Embl(Fasta): '''Exactly the same as Fasta, but reading seqs from a file works differently''' diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py index 8e4e18b..2663df7 100644 --- a/pyfastaq/tests/sequences_test.py +++ b/pyfastaq/tests/sequences_test.py @@ -520,6 +520,19 @@ def test_split_capillary_id(self): fa = sequences.Fasta('name', 'A') fa.split_capillary_id() + def test_gc_content(self): + """Test GC content calculation works as expected""" + tests = [ + (sequences.Fasta('ID', 'cgCG'), 1.0), + (sequences.Fasta('ID', 'tTaA'), 0.0), + (sequences.Fasta('ID', 'GCAT'), 0.5), + (sequences.Fasta('ID', 'GCATNN'), 0.5), + (sequences.Fasta('ID', 'GCATNNS'), 0.6), + (sequences.Fasta('ID', 'GCATNNSK'), 0.5) + ] + for test, answer in tests: + self.assertAlmostEqual(test.gc_content(), answer) + self.assertAlmostEqual(test.gc_content(as_decimal=False), answer * 100) class TestEmbl(unittest.TestCase): def test_get_id_from_header_line(self): From 55a32cc76167856a2a135c934013709e90c9adf5 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Fri, 16 Feb 2018 14:29:04 +0000 Subject: [PATCH 2/3] update version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7fb94e3..1c8fca5 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='pyfastaq', - version='3.16.0', + version='3.17.0', description='Script to manipulate FASTA and FASTQ files, plus API for developers', packages = find_packages(), author='Martin Hunt', From abeb7e0afe1c5a26696282b7b2e5ba41ec7a2116 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Mon, 19 Feb 2018 09:30:52 +0000 Subject: [PATCH 3/3] make loop more efficient. Fix Martin's crazy grammar requirements. --- pyfastaq/sequences.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py index 8fb729b..d7e2c07 100644 --- a/pyfastaq/sequences.py +++ b/pyfastaq/sequences.py @@ -470,7 +470,7 @@ def gc_content(self, as_decimal=True): This method ignores N when calculating the length of the sequence. It does not, however ignore other ambiguous bases. It also only includes the ambiguous base S (G or C). In this sense the method is - conservative with it's calculation. + conservative with its calculation. Args: as_decimal (bool): Return the result as a decimal. Setting to False @@ -484,15 +484,17 @@ def gc_content(self, as_decimal=True): """ gc_total = 0.0 num_bases = 0.0 + n_tuple = tuple('nN') + accepted_bases = tuple('cCgGsS') # counter sums all unique characters in sequence. Case insensitive. for base, count in Counter(self.seq).items(): # dont count N in the number of bases - if base not in tuple('nN'): + if base not in n_tuple: num_bases += count - if base in tuple('cCgGsS'): # S is a G or C + if base in accepted_bases: # S is a G or C gc_total += count gc_content = gc_total / num_bases