Skip to content

Commit

Permalink
fix: fix upstream and downstream sequence and update sequence-info
Browse files Browse the repository at this point in the history
  • Loading branch information
nokara26 committed Oct 30, 2023
1 parent 493bb81 commit d6c9a3c
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 17 deletions.
112 changes: 95 additions & 17 deletions src/varity/vcf_to_hgvs/protein.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns varity.vcf-to-hgvs.protein
(:require [clojure.pprint :as pp]
[clojure.set :as s]
[clojure.string :as string]
[clojure.tools.logging :as log]
[clj-hgvs.coordinate :as coord]
Expand Down Expand Up @@ -79,38 +80,115 @@
(exon-sequence (cseq/read-sequence seq-rdr {:chr chr, :start start, :end end})
start end exon-ranges))

(defn- is-deletion-variant?
[ref alt]
(or (and (= 1 (count alt))
(= (first ref) (first alt)))
(and (not (= 1 (count ref) (count alt)))
(not= (first ref) (first alt)))))

(defn- cds-start-upstream-to-cds-variant?
[cds-start pos ref]
(and (< pos cds-start)
(<= cds-start (dec (+ pos (count ref))))))

(defn- cds-to-cds-end-downstream-variant?
[cds-end pos ref]
(and (<= pos cds-end)
(< cds-end (dec (+ pos (count ref))))))

(defn- make-alt-up-exon-seq
[ref-up-exon-seq cds-start pos ref alt]
(let [is-deletion (is-deletion-variant? ref alt)
alt-up-exon-seq (if (cds-start-upstream-to-cds-variant? cds-start pos ref)
(let [offset (if is-deletion
(- cds-start pos)
0)]
(string/join "" (drop-last offset ref-up-exon-seq)))
ref-up-exon-seq)]
(subs alt-up-exon-seq (mod (count alt-up-exon-seq) 3))))

(defn- make-alt-down-exon-seq
[ref-down-exon-seq cds-end pos ref alt]
(let [is-deletion (is-deletion-variant? ref alt)
ref-end (dec (+ pos (count ref)))
alt-down-exon-seq (if (cds-to-cds-end-downstream-variant? cds-end pos ref)
(let [offset (if is-deletion
(- ref-end cds-end)
0)]
(string/join "" (drop offset ref-down-exon-seq)))
ref-down-exon-seq)
nalt-down-exon-seq (count alt-down-exon-seq)]
(subs alt-down-exon-seq 0 (- nalt-down-exon-seq (mod nalt-down-exon-seq 3)))))

(defn- make-stop-codon-adjusted-alt-seq
[alt-exon-seq alt-up-exon-seq alt-down-exon-seq strand cds-start cds-end pos ref]
(cond
(and (= strand :reverse)
(cds-start-upstream-to-cds-variant? cds-start pos ref))
(str alt-up-exon-seq alt-exon-seq)

(and (= strand :forward)
(cds-to-cds-end-downstream-variant? cds-end pos ref))
(str alt-exon-seq alt-down-exon-seq)

:else
alt-exon-seq))

(defn- get-pos-exon-end-tuple
[pos exon-ranges]
(let [[_ exon-end] (first (filter (fn [[s e]] (<= s pos e)) exon-ranges))]
[pos exon-end]))

(defn- include-stop-codon?
[{:keys [strand cds-start cds-end]} pos ref]
(let [stop-codon-positions (set (cond
(= strand :forward) (range (- cds-end 2) (inc cds-end))
(= strand :reverse) (range cds-start (inc (+ cds-start 2)))))
alt-positions (set (range pos (+ pos (count ref))))]
(= 3 (count (s/intersection stop-codon-positions alt-positions)))))

(defn- read-sequence-info
[seq-rdr rg pos ref alt]
(let [{:keys [chr tx-start tx-end cds-start cds-end exon-ranges strand]} rg
ref-seq (cseq/read-sequence seq-rdr {:chr chr, :start cds-start, :end cds-end})
alt-seq (common/alt-sequence ref-seq cds-start pos ref alt)
alt-exon-ranges* (alt-exon-ranges exon-ranges pos ref alt)
ref-exon-seq1 (exon-sequence ref-seq cds-start exon-ranges)
ref-up-exon-seq1 (read-exon-sequence seq-rdr chr tx-start (dec cds-start) exon-ranges)
ref-up-exon-seq1 (subs ref-up-exon-seq1 (mod (count ref-up-exon-seq1) 3))
ref-down-exon-seq1 (read-exon-sequence seq-rdr chr (inc cds-end) tx-end exon-ranges)
nref-down-exon-seq1 (count ref-down-exon-seq1)
ref-down-exon-seq1 (subs ref-down-exon-seq1 0 (- nref-down-exon-seq1 (mod nref-down-exon-seq1 3)))
alt-exon-seq1 (exon-sequence alt-seq cds-start alt-exon-ranges*)
ref-exon-seq (exon-sequence ref-seq cds-start exon-ranges)
ref-up-exon-seq (read-exon-sequence seq-rdr chr tx-start (dec cds-start) exon-ranges)
alt-up-exon-seq (make-alt-up-exon-seq ref-up-exon-seq cds-start pos ref alt)
ref-down-exon-seq (read-exon-sequence seq-rdr chr (inc cds-end) tx-end exon-ranges)
alt-down-exon-seq (make-alt-down-exon-seq ref-down-exon-seq cds-start pos ref alt)
alt-exon-seq (exon-sequence alt-seq cds-start alt-exon-ranges*)
stop-codon-adjusted-alt-seq (make-stop-codon-adjusted-alt-seq alt-exon-seq alt-up-exon-seq alt-down-exon-seq
strand cds-start cds-end pos ref)
apply-offset #(or (ffirst (alt-exon-ranges [[% %]] pos ref alt))
(some (fn [[_ e]] (when (<= e %) e)) (reverse alt-exon-ranges*)))]
{:ref-exon-seq ref-exon-seq1
:ref-prot-seq (codon/amino-acid-sequence (cond-> ref-exon-seq1
(some (fn [[_ e]] (when (<= e %) e)) (reverse alt-exon-ranges*))
(-> [(get-pos-exon-end-tuple % alt-exon-ranges*)]
(alt-exon-ranges pos ref alt)
ffirst))]
{:ref-exon-seq ref-exon-seq
:ref-prot-seq (codon/amino-acid-sequence (cond-> ref-exon-seq
(= strand :reverse) util-seq/revcomp))
:alt-exon-seq alt-exon-seq1
:alt-prot-seq (codon/amino-acid-sequence (cond-> alt-exon-seq1
:alt-exon-seq alt-exon-seq
:alt-prot-seq (codon/amino-acid-sequence (cond-> alt-exon-seq
(= strand :reverse) util-seq/revcomp))
:alt-tx-prot-seq (codon/amino-acid-sequence
(cond-> (str ref-up-exon-seq1 alt-exon-seq1 ref-down-exon-seq1)
(cond-> (str alt-up-exon-seq alt-exon-seq alt-down-exon-seq)
(= strand :reverse) util-seq/revcomp))
:ini-offset (quot (:position (rg/cds-coord (case strand
:forward tx-start
:reverse tx-end) rg))
:ini-offset (quot (count (case strand
:forward alt-up-exon-seq
:reverse alt-down-exon-seq))
3)
:c-ter-adjusted-alt-prot-seq (codon/amino-acid-sequence
(cond-> stop-codon-adjusted-alt-seq
(= strand :reverse) util-seq/revcomp))
:alt-rg (-> rg
(assoc :exon-ranges alt-exon-ranges*)
(update :cds-start apply-offset)
(update :cds-end apply-offset))}))
(update :cds-end apply-offset)
(update :tx-end apply-offset))
:stop-codon-include (include-stop-codon? rg pos ref)}))

(defn- protein-position
"Converts genomic position to protein position. If pos is outside of CDS,
Expand Down
52 changes: 52 additions & 0 deletions test/varity/vcf_to_hgvs/protein_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,58 @@
;; 1 [2 3 4] 5 6 7 [8 9 10 11] 12 13 14 15
(is (= (#'prot/exon-sequence "ACGTACGTACGTACG" 1 [[2 4] [8 11]]) "CGTTACG")))

(deftest make-alt-up-exon-seq-test
(let [ref-up-exon-seq "AATGCTTCTAGCTCC"
cds-start 100]
(are [p pos ref alt] (= p (#'prot/make-alt-up-exon-seq ref-up-exon-seq
cds-start
pos
ref
alt))
"AATGCTTCTAGCTCC" 102 "ATGTC" "A"
"ATGCTTCTAGCT" 98 "CCTT" "C")))

(deftest make-alt-down-exon-seq-test
(let [ref-up-exon-seq "CTTATAATAATAA"
cds-end 1000]
(are [p pos ref alt] (= p (#'prot/make-alt-down-exon-seq ref-up-exon-seq
cds-end
pos
ref
alt))
"CTTATAATAATA" 1002 "TTATAA" "T"
"TATAATAAT" 998 "GGCCT" "G")))

(deftest make-stop-codon-adjusted-alt-seq-test
(let [alt-seq "XXXXXX"
upstream-seq "YYYYYY"
downstream-seq "ZZZZZZ"
[cds-start cds-end] [7 12]]
(are [p strand pos ref] (#'prot/make-stop-codon-adjusted-alt-seq alt-seq
upstream-seq
downstream-seq
strand
cds-start
cds-end
pos
ref)
"XXXXXX" :forward 8 "XX"
"XXXXXX" :forward 5 "YY"
"XXXXXX" :forward 5 "YYX"
"XXXXXX" :forward 13 "ZZ"
"XXXXXXZZZZZZ" :forward 12 "XZZ"
"XXXXXX" :reverse 8 "XX"
"XXXXXX" :reverse 5 "YY"
"YYYYYYXXXXXX" :reverse 5 "YYX"
"XXXXXX" :reverse 13 "ZZ"
"XXXXXX" :reverse 12 "XZZ")))

(deftest get-pos-exon-end-tuple-test
(let [exon-ranges [[1 10] [15 20] [25 40]]]
(are [p pos] (= (#'prot/get-pos-exon-end-tuple pos exon-ranges) p)
[15 20] 15
[5 10] 5)))

(def ref-gene-EGFR
{:bin 125
:name "NM_005228"
Expand Down

0 comments on commit d6c9a3c

Please sign in to comment.