From 6fa5ed517ae37c20df39b1eb0f5096e659126479 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sat, 4 Nov 2023 03:14:49 +0100 Subject: [PATCH] feat: output genome annotations for complex CDS Auspice now supports multi-part CDS annotations This PR makes `augur translate` output the correct annotation for complex CDSs Complex CDSs are detected through `type(feat.location)` being `SeqFeature.CompountLocation` Currently, this only works with genbank files, as our load_features utility function does not deal with compound CDS in GFF files correctly I tested this with MERSs complex ORF1ab Partially resolves #1280 --- augur/ancestral.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/augur/ancestral.py b/augur/ancestral.py index 64b25d6fe..b8f9dbe93 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -28,6 +28,7 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import CompoundLocation from .utils import read_tree, InvalidTreeError, write_json, get_json_name from treetime.vcf_utils import read_vcf, write_vcf from collections import defaultdict @@ -332,14 +333,19 @@ def run(args): node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True) anc_seqs['reference'][gene] = aa_result['root_seq'] - # FIXME: Note that this is calculating the end of the CDS as 3*length of translation - # this is equivalent to the annotation for single segment CDS, but not for cds - # with splicing and slippage. But auspice can't handle the latter at the moment. anc_seqs['annotations'][gene] = {'seqid':args.annotation, 'type':feat.type, - 'start': int(feat.location.start)+1, - 'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]), 'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]} + if feat.type == "CDS" and type(feat.location) == CompoundLocation: + location = [] + for seg in feat.location.parts: + location.append({"start": int(seg.start)+1, "end": int(seg.end)}) + anc_seqs['annotations'][gene]["segments"] = location + else: + anc_seqs['annotations'][gene].update({ + 'start': int(feat.location.start)+1, + 'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]), + }) # Save ancestral amino acid sequences to FASTA. if args.output_translations: