Skip to content

Commit

Permalink
Make PFAM_domains compatible with antiSMASH 5, avoid crashing on sequ…
Browse files Browse the repository at this point in the history
…ence with no detected proteins.

* make PFAM_domain feature annotations compatible with antiSMASH 5
* avoid crashing on sequence with no detected proteins
  • Loading branch information
prihoda authored Mar 21, 2019
2 parents e607dca + e6afa5f commit 5a201ec
Show file tree
Hide file tree
Showing 9 changed files with 49 additions and 45 deletions.
2 changes: 1 addition & 1 deletion deepbgc/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.1.4'
__version__ = '0.1.5dev'
23 changes: 9 additions & 14 deletions deepbgc/command/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,15 @@ class PipelineCommand(BaseCommand):
deepbgc pipeline --continue --output sequence/ --label deepbgc_90_score --score 0.9 sequence/sequence.full.gbk
"""

LOG_FILENAME = 'LOG.txt'
PLOT_DIRNAME = 'evaluation'
TMP_DIRNAME = 'tmp'

def add_arguments(self, parser):

parser.add_argument(dest='inputs', nargs='+', help="Input sequence file path (FASTA, GenBank, Pfam CSV).")

parser.add_argument('-o', '--output', required=False, help="Custom output directory path.")
parser.add_argument('--continue', dest='is_continue', default=False, action='store_true', help="Continue previous pipeline and save results to the existing output directory.")
parser.add_argument('--limit-to-record', action='append', help="Process only specific record ID. Can be provided multiple times.")
parser.add_argument('--minimal-output', dest='is_minimal_output', action='store_true', default=False,
help="Produce minimal output with just the GenBank sequence file.")
Expand Down Expand Up @@ -83,7 +86,7 @@ def add_arguments(self, parser):
help="DeepBGC classification score threshold for assigning classes to BGCs (inclusive).")

def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_classifier,
is_continue, is_minimal_output, limit_to_record, score, classifier_score, merge_max_protein_gap, merge_max_nucl_gap, min_nucl,
is_minimal_output, limit_to_record, score, classifier_score, merge_max_protein_gap, merge_max_nucl_gap, min_nucl,
min_proteins, min_domains, min_bio_domains):
if not detectors:
detectors = ['deepbgc']
Expand All @@ -93,24 +96,16 @@ def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_cl
# if not specified, set output path to name of first input file without extension
output, _ = os.path.splitext(os.path.basename(os.path.normpath(inputs[0])))

if os.path.exists(output):
if not os.path.isdir(output):
raise ValueError('Output path already exists and is not a directory: {}'.format(output))
num_outputs_in_dir = len(set(os.listdir(output)).difference(['tmp', 'evaluation']))
if not is_continue and num_outputs_in_dir:
raise ValueError('Output directory already exists and is not empty: {}'.format(output),
'Use --output my/dir/ to specify a different directory',
'Or use --continue to write in the existing directory')
else:
if not os.path.exists(output):
os.mkdir(output)

# Save log to LOG.txt file
logger = logging.getLogger('')
logger.addHandler(logging.FileHandler(os.path.join(output, 'LOG.txt')))
logger.addHandler(logging.FileHandler(os.path.join(output, self.LOG_FILENAME)))

# Define report dir paths
tmp_path = os.path.join(output, 'tmp')
evaluation_path = os.path.join(output, 'evaluation')
tmp_path = os.path.join(output, self.TMP_DIRNAME)
evaluation_path = os.path.join(output, self.PLOT_DIRNAME)
output_file_name = os.path.basename(os.path.normpath(output))

steps = []
Expand Down
4 changes: 2 additions & 2 deletions deepbgc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
from deepbgc import __version__

import argparse
from deepbgc import util
util.choose_matplotlib_backend()
import matplotlib
matplotlib.use('Agg')
from deepbgc.command.prepare import PrepareCommand
from deepbgc.command.download import DownloadCommand
from deepbgc.command.pipeline import PipelineCommand
Expand Down
4 changes: 2 additions & 2 deletions deepbgc/output/evaluation/bgc_region_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from matplotlib import pyplot as plt
import numpy as np
import logging

import warnings

class BGCRegionPlotWriter(OutputWriter):

Expand Down Expand Up @@ -91,7 +91,7 @@ def save_plot(self):

def write(self, record):
if len(self.sequence_titles) > self.max_sequences:
logging.warning('Reached maximum number of %s sequences for plotting, skipping sequence %s', self.max_sequences, record.id)
warnings.warn('Reached maximum number of {} sequences for plotting, some sequences will not be plotted.'.format(self.max_sequences))
return
clusters = util.create_cluster_dataframe(record, add_pfams=False, add_proteins=False)
title = record.id
Expand Down
7 changes: 5 additions & 2 deletions deepbgc/output/evaluation/pfam_score_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from deepbgc import util
from matplotlib import pyplot as plt
import numpy as np

import warnings

class PfamScorePlotWriter(OutputWriter):

Expand Down Expand Up @@ -70,9 +70,12 @@ def save_plot(self):

def write(self, record):
if len(self.sequence_titles) > self.max_sequences:
logging.warning('Reached maximum number of %s sequences for plotting, skipping sequence %s', self.max_sequences, record.id)
warnings.warn('Reached maximum number of {} sequences for plotting, some sequences will not be plotted.'.format(self.max_sequences))
return
scores = util.create_pfam_dataframe(record, add_in_cluster=True)
if scores.empty:
logging.debug('Skipping score plot for empty record %s', record.id)
return
detector_meta = util.get_record_detector_meta(record)
detector_names = np.unique([meta['name'] for meta in detector_meta.values()])
score_columns = [util.format_bgc_score_column(name) for name in detector_names]
Expand Down
6 changes: 5 additions & 1 deletion deepbgc/pipeline/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def __init__(self, classifier, score_threshold=0.5):

def run(self, record):
cluster_features = util.get_cluster_features(record)
if not len(cluster_features):
return

logging.info('Classifying %s BGCs using %s model in %s', len(cluster_features), self.classifier_name, record.id)

# Create list of DataFrames with Pfam sequences (one for each cluster)
Expand All @@ -45,7 +48,8 @@ def run(self, record):
if feature.qualifiers.get(class_column):
prev_classes = feature.qualifiers.get(class_column)[0].split('-')
all_classes = sorted(list(set(all_classes + prev_classes)))
feature.qualifiers[class_column] = ['-'.join(all_classes)]
if all_classes:
feature.qualifiers[class_column] = ['-'.join(all_classes)]
predicted_classes += new_classes or ['no confident class']

# Add detector metadata to the record as a structured comment
Expand Down
12 changes: 8 additions & 4 deletions deepbgc/pipeline/pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pandas as pd

from deepbgc.data import PFAM_DB_FILE_NAME, PFAM_CLANS_FILE_NAME
from deepbgc.data import PFAM_DB_FILE_NAME, PFAM_DB_VERSION, PFAM_CLANS_FILE_NAME
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
Expand Down Expand Up @@ -69,6 +69,10 @@ def annotate(self):
proteins_by_id = util.get_proteins_by_id(proteins)
domtbl_path = self.tmp_path_prefix + '.pfam.domtbl.txt'

if not proteins:
logging.warning('No proteins in sequence %s, skipping protein domain detection', self.record.id)
return

if util.is_valid_hmmscan_output(domtbl_path):
cached = True
logging.info('Reusing already existing HMMER hmmscan result: %s', domtbl_path)
Expand Down Expand Up @@ -102,16 +106,16 @@ def annotate(self):
for hit in query.hits:
best_index = np.argmin([hsp.evalue for hsp in hit.hsps])
best_hsp = hit.hsps[best_index]
pfam_id = hit.accession.split('.')[0]
pfam_id = hit.accession
evalue = float(best_hsp.evalue)
if evalue > self.max_evalue:
continue
location = self._get_pfam_loc(best_hsp.query_start, best_hsp.query_end, protein)
qualifiers = {
'PFAM_ID': [pfam_id],
'db_xref': [pfam_id],
'evalue': evalue,
'locus_tag': [query.id],
'database': [PFAM_DB_FILE_NAME],
'database': [PFAM_DB_VERSION],
}
description = pfam_descriptions.get(pfam_id)
if description:
Expand Down
7 changes: 5 additions & 2 deletions deepbgc/pipeline/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ def annotate(self):
logging.warning(err.strip())
logging.warning('== End Prodigal Error. ============')

if os.stat(protein_path).st_size == 0:
if 'Sequence must be' in err:
logging.warning('No proteins detected in short sequence, moving on.')
elif os.stat(protein_path).st_size == 0:
raise ValueError("Prodigal produced empty output, make sure to use a DNA sequence.")
raise ValueError("Unexpected error detecting genes using Prodigal")
else:
raise ValueError("Unexpected error detecting genes using Prodigal")

proteins = SeqIO.parse(protein_path, 'fasta')
for protein in proteins:
Expand Down
29 changes: 12 additions & 17 deletions deepbgc/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,17 +62,18 @@ def sort_record_features(record):

def get_pfam_features(record):
pfams = get_features_of_type(record, PFAM_FEATURE)
from deepbgc.data import PFAM_DB_FILE_NAME
filtered_pfams = [pfam for pfam in pfams if pfam.qualifiers.get('database') == [PFAM_DB_FILE_NAME]]
from deepbgc.data import PFAM_DB_VERSION
filtered_pfams = [pfam for pfam in pfams if pfam.qualifiers.get('database') == [PFAM_DB_VERSION]]
num_incompatible = len(pfams) - len(filtered_pfams)
if num_incompatible:
# TODO: enable users to run model with older incompatible Pfam DB versions?
logging.warning('Ignoring {} incompatible Pfam features with database different than "{}"'.format(num_incompatible, PFAM_DB_FILE_NAME))
logging.warning('Ignoring {} incompatible Pfam features with database different than "{}"'.format(num_incompatible, PFAM_DB_VERSION))
return filtered_pfams


def get_pfam_feature_ids(record):
pfam_features = get_pfam_features(record)
return [feature.qualifiers['PFAM_ID'][0] for feature in pfam_features if 'PFAM_ID' in feature.qualifiers]
return [get_pfam_id(feature) for feature in pfam_features if get_pfam_id(feature)]


# Taken from antiSMASH ClusterFinder module
Expand Down Expand Up @@ -186,6 +187,12 @@ def get_pfam_protein_id(pfam_feature):
return pfam_feature.qualifiers.get('locus_tag')[0]


def get_pfam_id(feature):
if feature.qualifiers.get('db_xref'):
return feature.qualifiers.get('db_xref')[0].split('.')[0]
return None


def create_pfam_dict(pfam, proteins_by_id, detector_names, cluster_locations):
locus_tag = get_pfam_protein_id(pfam)
if locus_tag not in proteins_by_id:
Expand All @@ -198,7 +205,7 @@ def create_pfam_dict(pfam, proteins_by_id, detector_names, cluster_locations):
gene_start=protein.location.start,
gene_end=protein.location.end,
gene_strand=protein.strand,
pfam_id=pfam.qualifiers.get('PFAM_ID')[0]
pfam_id=get_pfam_id(pfam)
)

if cluster_locations:
Expand Down Expand Up @@ -592,18 +599,6 @@ def is_valid_hmmscan_output(domtbl_path):
return True


def choose_matplotlib_backend():
import matplotlib
gui_env = ['TKAgg', 'GTKAgg', 'Qt4Agg', 'WXAgg']
for gui in gui_env:
try:
matplotlib.use(gui, warn=False, force=True)
from matplotlib import pyplot as plt
break
except:
continue


def print_elapsed_time(start_time):
s = (datetime.now() - start_time).total_seconds()
return '{:.0f}h{:.0f}m{:.0f}s'.format(s//3600, (s//60) % 60, s % 60)

0 comments on commit 5a201ec

Please sign in to comment.