diff --git a/src/varity/vcf_to_hgvs/common.clj b/src/varity/vcf_to_hgvs/common.clj index 1017f60..67d3f26 100644 --- a/src/varity/vcf_to_hgvs/common.clj +++ b/src/varity/vcf_to_hgvs/common.clj @@ -69,15 +69,15 @@ 0 (let [rbases (string/reverse bases) n (count bases) - step (count (first (repeat-units rbases))) - tweak (if (= step 1) #(+ (dec n) %) #(* step %))] + step (count (first (repeat-units rbases)))] (->> seq* - (take (dec pos)) + (take (+ pos (dec n))) reverse (partition (count rbases) step) (take-while #(= (apply str %) rbases)) count - tweak))) + dec + (* step)))) (throw (ex-info "The bases is not found on the position." {:type ::invalid-bases :sequence seq* @@ -148,10 +148,10 @@ ([variant seq*] (apply-3'-rule variant seq* :forward)) ([{:keys [pos ref alt] :as variant} seq* direction] - (let [[ref-only alt-only offset _] (diff-bases ref alt) + (let [[ref-only alt-only offset right-offset] (diff-bases ref alt) type* (cond - (and (zero? (count ref-only)) (pos? (count alt-only))) :ins - (and (pos? (count ref-only)) (zero? (count alt-only))) :del + (and (zero? (count ref-only)) (pos? (count alt-only)) (zero? right-offset)) :ins + (and (pos? (count ref-only)) (zero? (count alt-only)) (zero? right-offset)) :del :else :other)] (if (#{:ins :del} type*) (let [bases (if (string/blank? ref-only) alt-only ref-only) diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj index 81d9ff7..52ae369 100644 --- a/src/varity/vcf_to_hgvs/protein.clj +++ b/src/varity/vcf_to_hgvs/protein.clj @@ -91,7 +91,7 @@ ref-down-exon-seq1 (subs ref-down-exon-seq1 0 (- nref-down-exon-seq1 (mod nref-down-exon-seq1 3))) alt-exon-seq1 (exon-sequence alt-seq cds-start alt-exon-ranges*) apply-offset #(or (ffirst (alt-exon-ranges [[% %]] pos ref alt)) - (some (fn [[_ e]] (when (<= e %) e)) alt-exon-ranges*))] + (some (fn [[_ e]] (when (<= e %) e)) (reverse alt-exon-ranges*)))] {:ref-exon-seq ref-exon-seq1 :ref-prot-seq (codon/amino-acid-sequence (cond-> ref-exon-seq1 (= strand :reverse) util-seq/revcomp)) @@ -308,12 +308,14 @@ format-alt-prot-seq (subs (dec (+ ppos offset))) (string/index-of "*"))] - (mut/protein-frame-shift (mut/->long-amino-acid ref) - (coord/protein-coordinate (+ ppos offset)) - (mut/->long-amino-acid alt) - (if (and ter-site (pos? ter-site)) - (coord/protein-coordinate (inc ter-site)) - (coord/unknown-coordinate))))) + (if (= alt \*) + (protein-substitution (+ ppos offset) (str ref) (str alt)) ; eventually fs-ter-substitution + (mut/protein-frame-shift (mut/->long-amino-acid ref) + (coord/protein-coordinate (+ ppos offset)) + (mut/->long-amino-acid alt) + (if (and ter-site (pos? ter-site)) + (coord/protein-coordinate (inc ter-site)) + (coord/unknown-coordinate)))))) (defn- protein-extension [ppos pref palt seq-info] diff --git a/test/varity/vcf_to_hgvs/common_test.clj b/test/varity/vcf_to_hgvs/common_test.clj index e0e68a6..62ee2ac 100644 --- a/test/varity/vcf_to_hgvs/common_test.clj +++ b/test/varity/vcf_to_hgvs/common_test.clj @@ -49,6 +49,9 @@ "AGGGGGGT" 7 "G" 5 "AGGGGGGT" 6 "GG" 4 "AGGGGGGT" 5 "GGG" 3 + "AGGGGGGT" 4 "GGGG" 2 + "AGGGGGGT" 3 "GGG" 1 + "AGGGGGGT" 2 "GGGGG" 0 "AAGTGTCC" 5 "GT" 2 "AAGTGTCC" 3 "GT" 0)) (testing "backward-shift throws exception when inputs are illegal" @@ -80,7 +83,12 @@ {:pos 10, :ref "T", :alt "TAGTCTT"} :forward {:pos 20, :ref "C", :alt "CTTAGTC"} {:pos 4, :ref "CAGTCTT", :alt "C"} :forward {:pos 14, :ref "CTTAGTC", :alt "C"} {:pos 13, :ref "T", :alt "TCTTAGT"} :backward {:pos 4, :ref "C", :alt "CAGTCTT"} - {:pos 13, :ref "TCTTAGT", :alt "T"} :backward {:pos 4, :ref "CAGTCTT", :alt "C"}))) + {:pos 13, :ref "TCTTAGT", :alt "T"} :backward {:pos 4, :ref "CAGTCTT", :alt "C"})) + (testing "protein sequence" + (are [v d ret] (= (common/apply-3'-rule v "MDDLWFTFTPGPDE" d) ret) + {:pos 5 :ref "WFT" :alt "W"} :forward {:pos 7 :ref "TFT" :alt "T"} + {:pos 5 :ref "WFT" :alt "F"} :forward {:pos 5 :ref "WFT" :alt "F"} + {:pos 5 :ref "WFT" :alt "T"} :forward {:pos 5 :ref "WFT" :alt "T"}))) (defslowtest normalize-variant-test (cavia-testing "normalize without error" diff --git a/test/varity/vcf_to_hgvs/protein_test.clj b/test/varity/vcf_to_hgvs/protein_test.clj index ed0afc5..85de624 100644 --- a/test/varity/vcf_to_hgvs/protein_test.clj +++ b/test/varity/vcf_to_hgvs/protein_test.clj @@ -1,7 +1,11 @@ (ns varity.vcf-to-hgvs.protein-test (:require [clojure.string :as string] [clojure.test :refer :all] - [varity.vcf-to-hgvs.protein :as prot])) + [cljam.io.sequence :as cseq] + [clj-hgvs.core :as hgvs] + [varity.vcf-to-hgvs.protein :as prot] + [varity.t-common :refer [test-ref-seq-file + defslowtest]])) (deftest alt-exon-ranges-test ;; 1 [2 3 4] 5 6 7 [8 9 10 11] 12 13 14 15 @@ -95,3 +99,23 @@ "YDIGGPDQEFGVDVGPVCFL*" "YDIGGPDQEFGVDVGPVCFLQ" " ^"]))) + +(defslowtest protein-seq-3'-rule-test + (let [tp53 {:name2 "TP53" + :name "NM_000546.6" + :chr "chr17" + :tx-start 7668421 + :tx-end 7687490 + :cds-start 7669609 + :cds-end 7676594 + :strand :reverse + :cds-start-stat :cmpl + :cds-end-stat :cmpl + :exon-ranges [[7668421 7669690] [7670609 7670715] [7673535 7673608] [7673701 7673837] [7674181 7674290] [7674859 7674971] [7675053 7675236] [7675994 7676272] [7676382 7676403] [7676521 7676622] [7687377 7687490]] + :bin 643 + :exon-frames [2 0 1 2 0 1 0 0 2 0 -1] + :exon-count 11}] + (are [pos ref alt res] + (= (with-open [seq-rdr (cseq/reader test-ref-seq-file)] (#'prot/mutation seq-rdr tp53 pos ref alt {})) res) + 7676197 "G" "GGTCTTGTCCCTTA" (:mutation (hgvs/parse "p.P58*")) + 7676202 "T" "TGTCCCTTAGTCTT" (:mutation (hgvs/parse "p.P58*"))))) diff --git a/test/varity/vcf_to_hgvs_test.clj b/test/varity/vcf_to_hgvs_test.clj index 91f0028..70d6488 100644 --- a/test/varity/vcf_to_hgvs_test.clj +++ b/test/varity/vcf_to_hgvs_test.clj @@ -72,6 +72,21 @@ "NM_001176:c.-147_-138delTCTCTGCCGG" "NM_006849:c.-2636_-2627delTCTCTGCCGG") ; cf. rs1460727826 (deletion in UTR) "chr6" 33086236 "TA" "T" '("NM_002121:c.776delA") ; cf. rs67523850 (deletion in border of UTR) + "chr17" 7675170 "CAA" "C" '("NM_001126115:c.44_45delTT" + "NM_001126116:c.44_45delTT" + "NM_001126117:c.44_45delTT" + "NM_001276697:c.-38_-37delTT" + "NM_001276698:c.-38_-37delTT" + "NM_001276699:c.-38_-37delTT" + "NM_000546:c.440_441delTT" + "NM_001126112:c.440_441delTT" + "NM_001126113:c.440_441delTT" + "NM_001126114:c.440_441delTT" + "NM_001126118:c.323_324delTT" + "NM_001276695:c.323_324delTT" + "NM_001276696:c.323_324delTT" + "NM_001276760:c.323_324delTT" + "NM_001276761:c.323_324delTT") ;; Duplication "chr2" 47806844 "T" "TGATT" '("NM_000179:c.4068_4071dupGATT" @@ -190,6 +205,7 @@ "chr1" 247815239 "AAGG" "A" '("p.S163del") ; cf. rs35979231 (-) "chr2" 29223408 "AAGCAGT" "A" '("p.Y1096_C1097del") ; cf. rs776101205 (-) "chr11" 108259071 "GT" "G" '("p.L822*") ; https://github.com/chrovis/varity/issues/49 + "chr17" 7676206 "TGAACCATTGTTCAATATCGTCCGGGGACAGCATCAAATCATCCATTGCTTGGGACGGCAAGGGG" "T" '("p.P34Lfs*68" "p.M1ext-78" "p.M1ext-39") ; https://mutalyzer.nl/normalizer/NC_000017.11:g.7676209_7676272del ;; Duplication "chr2" 26254257 "G" "GACT" '("p.T2dup") ; cf. rs3839049 (+)