From 6fa5ed517ae37c20df39b1eb0f5096e659126479 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sat, 4 Nov 2023 03:14:49 +0100
Subject: [PATCH] feat: output genome annotations for complex CDS

Auspice now supports multi-part CDS annotations
This PR makes `augur translate` output the correct annotation for complex CDSs

Complex CDSs are detected through `type(feat.location)` being `SeqFeature.CompountLocation`

Currently, this only works with genbank files, as our load_features utility function does not
deal with compound CDS in GFF files correctly

I tested this with MERSs complex ORF1ab

Partially resolves #1280
---
 augur/ancestral.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/augur/ancestral.py b/augur/ancestral.py
index 64b25d6fe..b8f9dbe93 100644
--- a/augur/ancestral.py
+++ b/augur/ancestral.py
@@ -28,6 +28,7 @@
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import CompoundLocation
 from .utils import read_tree, InvalidTreeError, write_json, get_json_name
 from treetime.vcf_utils import read_vcf, write_vcf
 from collections import defaultdict
@@ -332,14 +333,19 @@ def run(args):
                     node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)
 
             anc_seqs['reference'][gene] = aa_result['root_seq']
-            # FIXME: Note that this is calculating the end of the CDS as 3*length of translation
-            # this is equivalent to the annotation for single segment CDS, but not for cds
-            # with splicing and slippage. But auspice can't handle the latter at the moment.
             anc_seqs['annotations'][gene] = {'seqid':args.annotation,
                                              'type':feat.type,
-                                             'start': int(feat.location.start)+1,
-                                             'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
                                              'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+            if feat.type == "CDS" and type(feat.location) == CompoundLocation:
+                location = []
+                for seg in feat.location.parts:
+                    location.append({"start": int(seg.start)+1, "end": int(seg.end)})
+                anc_seqs['annotations'][gene]["segments"] = location
+            else:
+                anc_seqs['annotations'][gene].update({
+                    'start': int(feat.location.start)+1,
+                    'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
+                })
 
             # Save ancestral amino acid sequences to FASTA.
             if args.output_translations: