From edd743e5dd4af978101c2f116bbe455ee54d4395 Mon Sep 17 00:00:00 2001
From: Michael Hall <mbhall88@gmail.com>
Date: Fri, 16 Feb 2018 14:24:59 +0000
Subject: [PATCH 1/3] add gc content function to fasta class

---
 pyfastaq/sequences.py            | 42 ++++++++++++++++++++++++++++++--
 pyfastaq/tests/sequences_test.py | 13 ++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py
index ee20b7c..8fb729b 100644
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -1,9 +1,8 @@
 import copy
 import re
-import string
 import random
 import itertools
-
+from collections import Counter
 from pyfastaq import utils, intervals, genetic_codes
 
 class Error (Exception): pass
@@ -465,6 +464,45 @@ def translate(self, frame=0):
         '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
         return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
 
+    def gc_content(self, as_decimal=True):
+        """Returns the GC content for the sequence.
+        Notes:
+            This method ignores N when calculating the length of the sequence.
+            It does not, however ignore other ambiguous bases. It also only
+            includes the ambiguous base S (G or C). In this sense the method is
+            conservative with it's calculation.
+
+        Args:
+            as_decimal (bool): Return the result as a decimal. Setting to False
+            will return as a percentage. i.e for the sequence GCAT it will
+            return 0.5 by default and 50.00 if set to False.
+
+        Returns:
+            float: GC content calculated as the number of G, C, and S divided
+            by the number of (non-N) bases (length).
+
+        """
+        gc_total = 0.0
+        num_bases = 0.0
+
+        # counter sums all unique characters in sequence. Case insensitive.
+        for base, count in Counter(self.seq).items():
+
+            # dont count N in the number of bases
+            if base not in tuple('nN'):
+                num_bases += count
+
+                if base in tuple('cCgGsS'):  # S is a G or C
+                    gc_total += count
+
+        gc_content = gc_total / num_bases
+
+        if not as_decimal:  # return as percentage
+            gc_content *= 100
+
+        return gc_content
+
+
 
 class Embl(Fasta):
     '''Exactly the same as Fasta, but reading seqs from a file works differently'''
diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py
index 8e4e18b..2663df7 100644
--- a/pyfastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -520,6 +520,19 @@ def test_split_capillary_id(self):
             fa = sequences.Fasta('name', 'A')
             fa.split_capillary_id()
 
+    def test_gc_content(self):
+        """Test GC content calculation works as expected"""
+        tests = [
+            (sequences.Fasta('ID', 'cgCG'), 1.0),
+            (sequences.Fasta('ID', 'tTaA'), 0.0),
+            (sequences.Fasta('ID', 'GCAT'), 0.5),
+            (sequences.Fasta('ID', 'GCATNN'), 0.5),
+            (sequences.Fasta('ID', 'GCATNNS'), 0.6),
+            (sequences.Fasta('ID', 'GCATNNSK'), 0.5)
+        ]
+        for test, answer in tests:
+            self.assertAlmostEqual(test.gc_content(), answer)
+            self.assertAlmostEqual(test.gc_content(as_decimal=False), answer * 100)
 
 class TestEmbl(unittest.TestCase):
     def test_get_id_from_header_line(self):

From 55a32cc76167856a2a135c934013709e90c9adf5 Mon Sep 17 00:00:00 2001
From: Michael Hall <mbhall88@gmail.com>
Date: Fri, 16 Feb 2018 14:29:04 +0000
Subject: [PATCH 2/3] update version number

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7fb94e3..1c8fca5 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name='pyfastaq',
-    version='3.16.0',
+    version='3.17.0',
     description='Script to manipulate FASTA and FASTQ files, plus API for developers',
     packages = find_packages(),
     author='Martin Hunt',

From abeb7e0afe1c5a26696282b7b2e5ba41ec7a2116 Mon Sep 17 00:00:00 2001
From: Michael Hall <mbhall88@gmail.com>
Date: Mon, 19 Feb 2018 09:30:52 +0000
Subject: [PATCH 3/3] make loop more efficient. Fix Martin's crazy grammar
 requirements.

---
 pyfastaq/sequences.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py
index 8fb729b..d7e2c07 100644
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -470,7 +470,7 @@ def gc_content(self, as_decimal=True):
             This method ignores N when calculating the length of the sequence.
             It does not, however ignore other ambiguous bases. It also only
             includes the ambiguous base S (G or C). In this sense the method is
-            conservative with it's calculation.
+            conservative with its calculation.
 
         Args:
             as_decimal (bool): Return the result as a decimal. Setting to False
@@ -484,15 +484,17 @@ def gc_content(self, as_decimal=True):
         """
         gc_total = 0.0
         num_bases = 0.0
+        n_tuple = tuple('nN')
+        accepted_bases = tuple('cCgGsS')
 
         # counter sums all unique characters in sequence. Case insensitive.
         for base, count in Counter(self.seq).items():
 
             # dont count N in the number of bases
-            if base not in tuple('nN'):
+            if base not in n_tuple:
                 num_bases += count
 
-                if base in tuple('cCgGsS'):  # S is a G or C
+                if base in accepted_bases:  # S is a G or C
                     gc_total += count
 
         gc_content = gc_total / num_bases