Make PFAM_domains compatible with antiSMASH 5, avoid crashing on sequ…

…ence with no detected proteins. * make PFAM_domain feature annotations compatible with antiSMASH 5 * avoid crashing on sequence with no detected proteins
Merck · Mar 21, 2019 · 5a201ec · 5a201ec
2 parents e607dca + e6afa5f
commit 5a201ec
Show file tree

Hide file tree

Showing 9 changed files with 49 additions and 45 deletions.
diff --git a/deepbgc/__version__.py b/deepbgc/__version__.py
@@ -1 +1 @@
-__version__ = '0.1.4'
+__version__ = '0.1.5dev'
diff --git a/deepbgc/command/pipeline.py b/deepbgc/command/pipeline.py
@@ -46,12 +46,15 @@ class PipelineCommand(BaseCommand):
   deepbgc pipeline --continue --output sequence/ --label deepbgc_90_score --score 0.9 sequence/sequence.full.gbk
   """
 
+    LOG_FILENAME = 'LOG.txt'
+    PLOT_DIRNAME = 'evaluation'
+    TMP_DIRNAME = 'tmp'
+
     def add_arguments(self, parser):
 
         parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV).")
 
         parser.add_argument('-o', '--output', required=False, help="Custom output directory path.")
-        parser.add_argument('--continue', dest='is_continue', default=False, action='store_true', help="Continue previous pipeline and save results to the existing output directory.")
         parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times.")
         parser.add_argument('--minimal-output', dest='is_minimal_output', action='store_true', default=False,
                             help="Produce minimal output with just the GenBank sequence file.")
@@ -83,7 +86,7 @@ def add_arguments(self, parser):
                             help="DeepBGC classification score threshold for assigning classes to BGCs (inclusive).")
 
     def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_classifier,
-            is_continue, is_minimal_output, limit_to_record, score, classifier_score, merge_max_protein_gap, merge_max_nucl_gap, min_nucl,
+            is_minimal_output, limit_to_record, score, classifier_score, merge_max_protein_gap, merge_max_nucl_gap, min_nucl,
             min_proteins, min_domains, min_bio_domains):
         if not detectors:
             detectors = ['deepbgc']
@@ -93,24 +96,16 @@ def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_cl
             # if not specified, set output path to name of first input file without extension
             output, _ = os.path.splitext(os.path.basename(os.path.normpath(inputs[0])))
 
-        if os.path.exists(output):
-            if not os.path.isdir(output):
-                raise ValueError('Output path already exists and is not a directory: {}'.format(output))
-            num_outputs_in_dir = len(set(os.listdir(output)).difference(['tmp', 'evaluation']))
-            if not is_continue and num_outputs_in_dir:
-                raise ValueError('Output directory already exists and is not empty: {}'.format(output),
-                                 'Use --output my/dir/ to specify a different directory',
-                                 'Or use --continue to write in the existing directory')
-        else:
+        if not os.path.exists(output):
             os.mkdir(output)
 
         # Save log to LOG.txt file
         logger = logging.getLogger('')
-        logger.addHandler(logging.FileHandler(os.path.join(output, 'LOG.txt')))
+        logger.addHandler(logging.FileHandler(os.path.join(output, self.LOG_FILENAME)))
 
         # Define report dir paths
-        tmp_path = os.path.join(output, 'tmp')
-        evaluation_path = os.path.join(output, 'evaluation')
+        tmp_path = os.path.join(output, self.TMP_DIRNAME)
+        evaluation_path = os.path.join(output, self.PLOT_DIRNAME)
         output_file_name = os.path.basename(os.path.normpath(output))
 
         steps = []

diff --git a/deepbgc/main.py b/deepbgc/main.py
@@ -19,8 +19,8 @@
     from deepbgc import __version__
 
 import argparse
-from deepbgc import util
-util.choose_matplotlib_backend()
+import matplotlib
+matplotlib.use('Agg')
 from deepbgc.command.prepare import PrepareCommand
 from deepbgc.command.download import DownloadCommand
 from deepbgc.command.pipeline import PipelineCommand

diff --git a/deepbgc/output/evaluation/bgc_region_plot.py b/deepbgc/output/evaluation/bgc_region_plot.py
@@ -8,7 +8,7 @@
 from matplotlib import pyplot as plt
 import numpy as np
 import logging
-
+import warnings
 
 class BGCRegionPlotWriter(OutputWriter):
 
@@ -91,7 +91,7 @@ def save_plot(self):
 
     def write(self, record):
         if len(self.sequence_titles) > self.max_sequences:
-            logging.warning('Reached maximum number of %s sequences for plotting, skipping sequence %s', self.max_sequences, record.id)
+            warnings.warn('Reached maximum number of {} sequences for plotting, some sequences will not be plotted.'.format(self.max_sequences))
             return
         clusters = util.create_cluster_dataframe(record, add_pfams=False, add_proteins=False)
         title = record.id

diff --git a/deepbgc/output/evaluation/pfam_score_plot.py b/deepbgc/output/evaluation/pfam_score_plot.py
@@ -4,7 +4,7 @@
 from deepbgc import util
 from matplotlib import pyplot as plt
 import numpy as np
-
+import warnings
 
 class PfamScorePlotWriter(OutputWriter):
 
@@ -70,9 +70,12 @@ def save_plot(self):
 
     def write(self, record):
         if len(self.sequence_titles) > self.max_sequences:
-            logging.warning('Reached maximum number of %s sequences for plotting, skipping sequence %s', self.max_sequences, record.id)
+            warnings.warn('Reached maximum number of {} sequences for plotting, some sequences will not be plotted.'.format(self.max_sequences))
             return
         scores = util.create_pfam_dataframe(record, add_in_cluster=True)
+        if scores.empty:
+            logging.debug('Skipping score plot for empty record %s', record.id)
+            return
         detector_meta = util.get_record_detector_meta(record)
         detector_names = np.unique([meta['name'] for meta in detector_meta.values()])
         score_columns = [util.format_bgc_score_column(name) for name in detector_names]

diff --git a/deepbgc/pipeline/classifier.py b/deepbgc/pipeline/classifier.py
@@ -20,6 +20,9 @@ def __init__(self, classifier, score_threshold=0.5):
 
     def run(self, record):
         cluster_features = util.get_cluster_features(record)
+        if not len(cluster_features):
+            return
+
         logging.info('Classifying %s BGCs using %s model in %s', len(cluster_features), self.classifier_name, record.id)
 
         # Create list of DataFrames with Pfam sequences (one for each cluster)
@@ -45,7 +48,8 @@ def run(self, record):
             if feature.qualifiers.get(class_column):
                 prev_classes = feature.qualifiers.get(class_column)[0].split('-')
                 all_classes = sorted(list(set(all_classes + prev_classes)))
-            feature.qualifiers[class_column] = ['-'.join(all_classes)]
+            if all_classes:
+                feature.qualifiers[class_column] = ['-'.join(all_classes)]
             predicted_classes += new_classes or ['no confident class']
 
         # Add detector metadata to the record as a structured comment

diff --git a/deepbgc/pipeline/pfam.py b/deepbgc/pipeline/pfam.py
@@ -8,7 +8,7 @@
 
 import pandas as pd
 
-from deepbgc.data import PFAM_DB_FILE_NAME, PFAM_CLANS_FILE_NAME
+from deepbgc.data import PFAM_DB_FILE_NAME, PFAM_DB_VERSION, PFAM_CLANS_FILE_NAME
 from Bio import SeqIO, SearchIO
 from Bio.SeqRecord import SeqRecord
 from Bio.SeqFeature import SeqFeature, FeatureLocation
@@ -69,6 +69,10 @@ def annotate(self):
         proteins_by_id = util.get_proteins_by_id(proteins)
         domtbl_path = self.tmp_path_prefix + '.pfam.domtbl.txt'
 
+        if not proteins:
+            logging.warning('No	proteins in sequence %s, skipping protein domain detection', self.record.id)
+            return
+
         if util.is_valid_hmmscan_output(domtbl_path):
             cached = True
             logging.info('Reusing already existing HMMER hmmscan result: %s', domtbl_path)
@@ -102,16 +106,16 @@ def annotate(self):
             for hit in query.hits:
                 best_index = np.argmin([hsp.evalue for hsp in hit.hsps])
                 best_hsp = hit.hsps[best_index]
-                pfam_id = hit.accession.split('.')[0]
+                pfam_id = hit.accession
                 evalue = float(best_hsp.evalue)
                 if evalue > self.max_evalue:
                     continue
                 location = self._get_pfam_loc(best_hsp.query_start, best_hsp.query_end, protein)
                 qualifiers = {
-                    'PFAM_ID': [pfam_id],
+                    'db_xref': [pfam_id],
                     'evalue': evalue,
                     'locus_tag': [query.id],
-                    'database': [PFAM_DB_FILE_NAME],
+                    'database': [PFAM_DB_VERSION],
                 }
                 description = pfam_descriptions.get(pfam_id)
                 if description:

diff --git a/deepbgc/pipeline/protein.py b/deepbgc/pipeline/protein.py
@@ -41,9 +41,12 @@ def annotate(self):
             logging.warning(err.strip())
             logging.warning('== End Prodigal Error. ============')
 
-            if os.stat(protein_path).st_size == 0:
+            if 'Sequence must be' in err:
+                logging.warning('No proteins detected in short sequence, moving on.')
+            elif os.stat(protein_path).st_size == 0:
                 raise ValueError("Prodigal produced empty output, make sure to use a DNA sequence.")
-            raise ValueError("Unexpected error detecting genes using Prodigal")
+            else:
+                raise ValueError("Unexpected error detecting genes using Prodigal")
 
         proteins = SeqIO.parse(protein_path, 'fasta')
         for protein in proteins:

diff --git a/deepbgc/util.py b/deepbgc/util.py
@@ -62,17 +62,18 @@ def sort_record_features(record):
 
 def get_pfam_features(record):
     pfams = get_features_of_type(record, PFAM_FEATURE)
-    from deepbgc.data import PFAM_DB_FILE_NAME
-    filtered_pfams = [pfam for pfam in pfams if pfam.qualifiers.get('database') == [PFAM_DB_FILE_NAME]]
+    from deepbgc.data import PFAM_DB_VERSION
+    filtered_pfams = [pfam for pfam in pfams if pfam.qualifiers.get('database') == [PFAM_DB_VERSION]]
     num_incompatible = len(pfams) - len(filtered_pfams)
     if num_incompatible:
         # TODO: enable users to run model with older incompatible Pfam DB versions?
-        logging.warning('Ignoring {} incompatible Pfam features with database different than "{}"'.format(num_incompatible, PFAM_DB_FILE_NAME))
+        logging.warning('Ignoring {} incompatible Pfam features with database different than "{}"'.format(num_incompatible, PFAM_DB_VERSION))
     return filtered_pfams
 
+
 def get_pfam_feature_ids(record):
     pfam_features = get_pfam_features(record)
-    return [feature.qualifiers['PFAM_ID'][0] for feature in pfam_features if 'PFAM_ID' in feature.qualifiers]
+    return [get_pfam_id(feature) for feature in pfam_features if get_pfam_id(feature)]
 
 
 # Taken from antiSMASH ClusterFinder module
@@ -186,6 +187,12 @@ def get_pfam_protein_id(pfam_feature):
     return pfam_feature.qualifiers.get('locus_tag')[0]
 
 
+def get_pfam_id(feature):
+    if feature.qualifiers.get('db_xref'):
+        return feature.qualifiers.get('db_xref')[0].split('.')[0]
+    return None
+
+
 def create_pfam_dict(pfam, proteins_by_id, detector_names, cluster_locations):
     locus_tag = get_pfam_protein_id(pfam)
     if locus_tag not in proteins_by_id:
@@ -198,7 +205,7 @@ def create_pfam_dict(pfam, proteins_by_id, detector_names, cluster_locations):
         gene_start=protein.location.start,
         gene_end=protein.location.end,
         gene_strand=protein.strand,
-        pfam_id=pfam.qualifiers.get('PFAM_ID')[0]
+        pfam_id=get_pfam_id(pfam)
     )
 
     if cluster_locations:
@@ -592,18 +599,6 @@ def is_valid_hmmscan_output(domtbl_path):
     return True
 
 
-def choose_matplotlib_backend():
-    import matplotlib
-    gui_env = ['TKAgg', 'GTKAgg', 'Qt4Agg', 'WXAgg']
-    for gui in gui_env:
-        try:
-            matplotlib.use(gui, warn=False, force=True)
-            from matplotlib import pyplot as plt
-            break
-        except:
-            continue
-
-
 def print_elapsed_time(start_time):
     s = (datetime.now() - start_time).total_seconds()
     return '{:.0f}h{:.0f}m{:.0f}s'.format(s//3600, (s//60) % 60, s % 60)