diff --git a/augur/ancestral.py b/augur/ancestral.py index 2771948eb..ebf96c905 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -28,7 +28,8 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name +from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \ + genome_features_to_auspice_annotation from .io.file import open_file from .io.vcf import is_vcf as is_filename_vcf from treetime.vcf_utils import read_vcf, write_vcf @@ -455,14 +456,7 @@ def run(args): node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True) anc_seqs['reference'][gene] = aa_result['root_seq'] - # FIXME: Note that this is calculating the end of the CDS as 3*length of translation - # this is equivalent to the annotation for single segment CDS, but not for cds - # with splicing and slippage. But auspice can't handle the latter at the moment. - anc_seqs['annotations'][gene] = {'seqid':args.annotation, - 'type':feat.type, - 'start': int(feat.location.start)+1, - 'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]), - 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]} + anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation)) # Save ancestral amino acid sequences to FASTA. if args.output_translations: diff --git a/augur/translate.py b/augur/translate.py index 45ab1c4e0..d7ce30ef6 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -17,7 +17,8 @@ import numpy as np from Bio import SeqIO, Seq, SeqRecord, Phylo from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf -from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name +from .utils import parse_genes_argument, read_node_data, load_features, \ + write_json, get_json_name, genome_features_to_auspice_annotation from treetime.vcf_utils import read_vcf from augur.errors import AugurError from textwrap import dedent @@ -446,24 +447,7 @@ def run(args): continue reference_translations[fname] = safe_translate(str(feat.extract(reference))) - ## glob the annotations for later auspice export - # - # Note that BioPython FeatureLocations use - # "Pythonic" coordinates: [zero-origin, half-open) - # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive] - annotations = { - 'nuc': {'start': features['nuc'].location.start+1, - 'end': features['nuc'].location.end, - 'strand': '+', - 'type': features['nuc'].type, # (unused by auspice) - 'seqid': args.reference_sequence} # (unused by auspice) - } - for fname, feat in features.items(): - annotations[fname] = {'seqid':args.reference_sequence, - 'type':feat.type, - 'start':int(feat.location.start)+1, - 'end':int(feat.location.end), - 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]} + annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True) ## determine amino acid mutations for each node try: diff --git a/augur/utils.py b/augur/utils.py index bd1a9ccd7..5441aed7f 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -832,3 +832,56 @@ def _get_genes_from_file(fname): print("Read in {} specified genes to translate.".format(len(unique_genes))) return unique_genes + + + +def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False): + """ + Parameters + ---------- + features : dict, keys: feature names, values: Bio.SeqFeature.SeqFeature objects + ref_seq_name : str (optional) + Exported as the `seqid` for each feature. Note this is unused by Auspice + assert_nuc : bool (optional) + If true, one of the feature key names must be "nuc" + + Returns + ------- + dict : annotations + See schema-annotations.json for the schema this conforms to + + """ + from Bio.SeqFeature import SimpleLocation, CompoundLocation + + if assert_nuc and 'nuc' not in features: + raise AugurError("Genome features must include a feature for 'nuc'") + + def _parse(feat): + a = {} + # Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open) + # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive] + if type(feat.location)==SimpleLocation: + a['start'] = int(feat.location.start)+1 + a['end'] = int(feat.location.end) + elif type(feat.location)==CompoundLocation: + a['segments'] = [ + {'start':int(segment.start)+1, 'end':int(segment.end)} + for segment in feat.location.parts # segment: SimpleLocation + ] + else: + raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}") + a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand] + a['type'] = feat.type # (unused by auspice) + if ref_seq_name: + a['seqid'] = ref_seq_name # (unused by auspice) + return a + + annotations = {} + for fname, feat in features.items(): + annotations[fname] = _parse(feat) + if fname=='nuc': + assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand" + elif annotations[fname]['strand'] not in ['+', '-']: + print("WARNING: Feature {fname:q} uses a strand which auspice cannot display") + + return annotations