Skip to content

Commit

Permalink
fix: add stop codon deletion condition to delins
Browse files Browse the repository at this point in the history
  • Loading branch information
nokara26 committed Oct 31, 2023
1 parent d6c9a3c commit acfbd2e
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 17 deletions.
51 changes: 36 additions & 15 deletions src/varity/vcf_to_hgvs/protein.clj
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,21 @@
(and (seq ref-only) (empty? alt-only)) [ref-only :del])]
(common/repeat-info seq* pos alt type)))

(defn- get-first-diff-aa-info
[pos ref-seq alt-seq]
(->> (map vector ref-seq alt-seq)
(drop (dec pos))
(map-indexed vector)
(drop-while (fn [[_ [r a]]] (= r a)))
first
((fn [[i [r a]]]
{:ppos (+ pos i)
:pref (str r)
:palt (str a)}))))

(defn- ->protein-variant
[{:keys [strand] :as rg} pos ref alt
{:keys [ref-exon-seq ref-prot-seq alt-exon-seq alt-rg] :as seq-info}
{:keys [ref-exon-seq ref-prot-seq alt-exon-seq alt-rg stop-codon-include] :as seq-info}
{:keys [prefer-deletion? prefer-insertion?]}]
(cond
(= ref-exon-seq alt-exon-seq)
Expand Down Expand Up @@ -267,14 +279,18 @@
(= (+ base-ppos offset) (count ref-prot-seq)) :extension
(= (+ base-ppos offset) 1) (if (= ref-prot-rest alt-prot-rest)
:extension
:frame-shift)
(if stop-codon-include
:indel
:frame-shift))
(and (pos? nprefo) (= (first palt-only) \*)) :substitution
(not= ref-prot-rest alt-prot-rest) (if (or (and (= (first alt-prot-rest) \*)
(>= nprefo npalto)
(= palt (subs pref 0 (count palt))))
(= (first palt-only) \*))
:fs-ter-substitution
:frame-shift)
(if stop-codon-include
:indel
:frame-shift))
(or (and (zero? nprefo) (zero? npalto))
(and (= nprefo 1) (= npalto 1))) :substitution
(and prefer-deletion? (pos? nprefo) (zero? npalto)) :deletion
Expand Down Expand Up @@ -344,8 +360,19 @@
(->> (seq ins) (map mut/->long-amino-acid)))))

(defn- protein-indel
[ppos pref palt]
(let [[del ins offset _] (diff-bases pref palt)
[ppos pref palt {:keys [ref-prot-seq c-ter-adjusted-alt-prot-seq stop-codon-include]}]
(let [[pref palt ppos] (if stop-codon-include
(let [{:keys [ppos]} (get-first-diff-aa-info ppos ref-prot-seq c-ter-adjusted-alt-prot-seq)
get-seq-between-pos-stop-codon (fn [seq pos]
(-> (pstring/split-at seq (dec pos))
last
(string/split #"\*")
first))
pref (get-seq-between-pos-stop-codon ref-prot-seq ppos)
palt (get-seq-between-pos-stop-codon c-ter-adjusted-alt-prot-seq ppos)]
[pref palt ppos])
[pref palt ppos])
[del ins offset _] (diff-bases pref palt)
ndel (count del)]
(mut/protein-indel (mut/->long-amino-acid (first del))
(coord/protein-coordinate (+ ppos offset))
Expand All @@ -372,17 +399,11 @@
alt-repeat)))

(defn- protein-frame-shift
[ppos seq-info]
(let [[ppos pref palt] (->> (map vector (:ref-prot-seq seq-info) (:alt-prot-seq seq-info))
(drop (dec ppos))
(map-indexed vector)
(drop-while (fn [[_ [r a]]] (= r a)))
first
((fn [[i [r a]]]
[(+ ppos i) (str r) (str a)])))
[ppos {:keys [ref-prot-seq alt-prot-seq] :as seq-info}]
(let [{:keys [ppos pref palt]} (get-first-diff-aa-info ppos ref-prot-seq alt-prot-seq)
[_ _ offset _] (diff-bases pref palt)
alt-prot-seq (format-alt-prot-seq seq-info)
ref (nth (:ref-prot-seq seq-info) (dec (+ ppos offset)))
ref (nth ref-prot-seq (dec (+ ppos offset)))
alt (nth alt-prot-seq (dec (+ ppos offset)))
ter-site (-> seq-info
format-alt-prot-seq
Expand Down Expand Up @@ -430,7 +451,7 @@
:deletion (protein-deletion ppos pref palt)
:duplication (protein-duplication ppos pref palt)
:insertion (protein-insertion ppos pref palt seq-info)
:indel (protein-indel ppos pref palt)
:indel (protein-indel ppos pref palt seq-info)
:repeated-seqs (protein-repeated-seqs ppos pref palt seq-info)
:frame-shift (protein-frame-shift ppos seq-info)
:extension (protein-extension ppos pref palt seq-info)
Expand Down
9 changes: 9 additions & 0 deletions test/varity/vcf_to_hgvs/protein_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@
[15 20] 15
[5 10] 5)))

(deftest get-first-diff-aa-info-test
(let [ref-seq "ABCDEFGHIJKLMN"]
(are [p alt-seq pos] (= (#'prot/get-first-diff-aa-info pos
ref-seq
alt-seq)
p)
{:ppos 6 :pref "F" :palt "G"} "ABCDEG" 4
{:ppos 13 :pref "M" :palt "K"} "ACBDEFGHIJKLK" 10)))

(def ref-gene-EGFR
{:bin 125
:name "NM_005228"
Expand Down
7 changes: 5 additions & 2 deletions test/varity/vcf_to_hgvs_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@
(cavia-testing "returns protein HGVS strings"
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
(are [chr pos ref alt e]
(= (vcf-variant->protein-hgvs-texts {:chr chr, :pos pos, :ref ref, :alt alt}
test-ref-seq-file rgidx) e)
(= (vcf-variant->protein-hgvs-texts {:chr chr, :pos pos, :ref ref, :alt alt}
test-ref-seq-file rgidx) e)
;; Substitution
"chr7" 55191822 "T" "G" '("p.L858R") ; cf. rs121434568
"chr1" 11796321 "G" "A" '("p.A222V") ; cf. rs1801133
Expand Down Expand Up @@ -220,6 +220,9 @@
"chr2" 47445589 "CTTACTGAT" "CCC" '("p.L440_D442delinsP" "p.L374_D376delinsP") ; cf. rs63749931 (+)
"chr1" 152111364 "TGC" "TCG" '("p.E617_Q618delinsDE") ; cf. rs35444647 (-)

;; indel include stop codon deletion
"chr8" 116847497 "TCCTTATATAATATGGAACCTTGGTCCAGGTGTTGCGATGATGTCACTGTA" "T" '("p.Y617_I631delinsS")

;; repeated sequences
"chr1" 47438996 "T" "TCCGCAC" '("p.P286_H287[5]") ; cf. rs3046924 (+)
"chr1" 11796319 "C" "CGGCGGC" '("p.A222[3]") ; not actual example (-)
Expand Down

0 comments on commit acfbd2e

Please sign in to comment.