Skip to content

Commit

Permalink
fix (callVariant): hybrid node not identified if the variant is in th…
Browse files Browse the repository at this point in the history
…e of an exon. #782
  • Loading branch information
zhuchcn committed Jul 25, 2023
1 parent 2071711 commit b5c260e
Show file tree
Hide file tree
Showing 10 changed files with 166 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

- Fixed that circRNA peptides that carry inidels incompatible with the orf start node should not be called. #780

- Fixed that hybrid node not identified if the variant is in the of an exon. #782

## [1.2.0] - 2023-07-04

### Fixed
Expand Down
45 changes: 41 additions & 4 deletions moPepGen/svgraph/PVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,8 +964,18 @@ def is_hybrid_node(self, tree:SubgraphTree) -> bool:
cur_var = next(var_iter, None)

start, end = 0, 0
start_hard, end_hard = 0, 0
cur_level = -1

def set_hard_start_and_end(loc, start, end):
start_hard = start
end_hard = end
if loc.start_offset != 0:
start_hard += 1
if loc.end_offset != 0:
end_hard -= 1
return start_hard, end_hard

while cur_seq or cur_var:
if cur_var and cur_var.variant.is_circ_rna():
cur_var = next(var_iter, None)
Expand All @@ -992,37 +1002,64 @@ def is_hybrid_node(self, tree:SubgraphTree) -> bool:
if cur_level == -1:
cur_level = seq_level
start, end = seq_start, seq_end
start_hard, end_hard = set_hard_start_and_end(cur_seq.query, start, end)

elif seq_level > cur_level:
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True
cur_level = seq_level
start, end = seq_start, seq_end
start_hard, end_hard = set_hard_start_and_end(cur_seq.query, start, end)
else:
end = seq_end
start_hard, end_hard = set_hard_start_and_end(cur_seq.query, start, end)

if cur_seq is self.seq.locations[-1] and not cur_var:
seg = self[start:seq_end]
return seg.is_missing_any_variant(variants)
end = seq_end
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True

cur_seq = next(seq_iter, None)

else:
if cur_level == -1:
cur_level = var_level
start, end = var_start, var_end
start_hard, end_hard = set_hard_start_and_end(cur_var.location, start, end)
elif var_level > cur_level:
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True
cur_level = var_level
start, end = var_start, var_end
start_hard, end_hard = set_hard_start_and_end(cur_var.location, start, end)
else:
end = var_end
start_hard, end_hard = set_hard_start_and_end(cur_var.location, start, end)

if cur_var is self.variants[-1] and not cur_seq:
seg = self[start:var_end]
return seg.is_missing_any_variant(variants)
end = var_end
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True

cur_var = next(var_iter, None)

Expand Down
4 changes: 4 additions & 0 deletions moPepGen/svgraph/PVGOrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ def has_any_incompatible_variant(self, node:PVGNode) -> bool:
return False

def is_compatible_with_variant(self, variant:seqvar.VariantRecordWithCoordinate) -> bool:
""" Checks if the ORF is compatible with a given variant. A variant is
incompatible if it overlaps with any of the locations of the ORF start
node but not carried by the start node or is not a start gain variant
already. """
orf_variants = {v.variant.id for v in self.start_node.variants}
orf_variants.update({v.id for v in self.start_gain})

Expand Down
7 changes: 7 additions & 0 deletions test/files/fuzz/40/annotation.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
chrF . gene 1 2052 . + . gene_id FAKEG00000594; transcript_id FAKET00000594; protein_id FAKEP00000594;
chrF . transcript 1 2052 . + . gene_id FAKEG00000594; transcript_id FAKET00000594; protein_id FAKEP00000594; is_protein_coding false;
chrF . exon 1 128 . + . gene_id FAKEG00000594; transcript_id FAKET00000594; protein_id FAKEP00000594;
chrF . exon 448 550 . + . gene_id FAKEG00000594; transcript_id FAKET00000594; protein_id FAKEP00000594;
chrF . exon 923 979 . + . gene_id FAKEG00000594; transcript_id FAKET00000594; protein_id FAKEP00000594;
chrF . exon 1387 1402 . + . gene_id FAKEG00000594; transcript_id FAKET00000594; protein_id FAKEP00000594;
chrF . exon 1829 2052 . + . gene_id FAKEG00000594; transcript_id FAKET00000594; protein_id FAKEP00000594;
14 changes: 14 additions & 0 deletions test/files/fuzz/40/brute_force.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
CRPSSNIECRTWSVGYYRSPLCR
KGFSRSTDNDWMSGAPTLMAGVVNK
MCRPSSNIECRTWSVGYYRSPLCR
MPEVPYAGSPLCR
MPEVPYAGSPLCRK
MPEVPYAR
MPEVPYARSPLCQK
PEVPYAGSPLCR
PEVPYAGSPLCRK
PEVPYAR
PEVPYARSPLCQK
SPLCRKGFSR
TWSVGYYRSPLCR
TWSVGYYRSPLCRK
25 changes: 25 additions & 0 deletions test/files/fuzz/40/fake_circ_rna.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
##fileformat=VCFv4.2
##mopepgen_version=1.2.1
##parser=parseCIRCexplorer
##reference_index=
##genome_fasta=
##annotation_gtf=
##source=
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
##INFO=<ID=OFFSET,Number=+,Type=Integer,Description="Offsets of fragments (exons or introns)">
##INFO=<ID=LENGTH,Number=+,Type=Integer,Description="Lengths of fragments (exons or introns)">
##INFO=<ID=INTRON,Number=+,Type=Integer,Description="Indices of fragments that are introns">
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
##INFO=<ID=ACCEPTER_GENE_ID,Number=1,Type=String,Description="3' Accepter Transcript's Gene ID">
##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
##POS=<Description="Gene coordinate of circRNA start">
#CHROM POS ID REF ALT QUAL FILTER INFO
FAKEG00000594 1386 CIRC-FAKET00000594-E4 . . . . OFFSET=0;LENGTH=16;INTRON=;TRANSCRIPT_ID=FAKET00000594;GENE_SYMBOL=;GENOMIC_POSITION=chrF:1386
24 changes: 24 additions & 0 deletions test/files/fuzz/40/fake_variants.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
##fileformat=VCFv4.2
##mopepgen_version=1.2.1
##parser=parseVEP
##reference_index=
##genome_fasta=
##annotation_gtf=
##source=
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
##INFO=<ID=OFFSET,Number=+,Type=Integer,Description="Offsets of fragments (exons or introns)">
##INFO=<ID=LENGTH,Number=+,Type=Integer,Description="Lengths of fragments (exons or introns)">
##INFO=<ID=INTRON,Number=+,Type=Integer,Description="Indices of fragments that are introns">
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
##INFO=<ID=ACCEPTER_GENE_ID,Number=1,Type=String,Description="3' Accepter Transcript's Gene ID">
##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
#CHROM POS ID REF ALT QUAL FILTER INFO
FAKEG00000594 1401 FAKEG00000594-1400-A-G A G . . TRANSCRIPT_ID=FAKET00000594;GENOMIC_POSITION=chrF-1400:1401;GENE_SYMBOL=
36 changes: 36 additions & 0 deletions test/files/fuzz/40/genome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
>chrF
TTTTCCGCTTGACGCAATGTCCCATTTTTTGATATGATAATTTGCGGATGTGAAAAACAA
GTCCATCCCCTGCAGCTCTGACTTCATCTTAAGATATGGGAACGCGACCGCAATCCTGCG
GCCTCTCGAGAGACGCGCCCGATAGTGGGCCATTTGCTAAGGTCGAGGCTGTCTCCAACT
GGGATGCTGGGGGGGTCAGCGATGCCTATAGTGGAATGAACGTCCTCGGATGGACAGTAA
GGCGTTAGTATAAAACCGCACATCGTACCTTTAGGCTTTCACGCCAATCCAGATCTCCTT
CTACGCCGGTGAATCTGAAGCCACCCTGGTCATGGTGCACCTTATTCCACGACCAGGTCT
CAACTATCCCAGCGATATAGCTATAGGAATCATCTATTTCCTATCCGCGGCTCGTTCCAA
CTCCTTTAAGCGCGGACAACTGTTTCTTAAAGAACAGAGGCGTCGAATTTACCGGGTGAA
TTTCAGAAAGTCTACAGTCAGAGACTTGGTATTCCTCACTGTGAAAGACCCAAGTTCCAT
GACGCTTTTCAGCATGCAAGCAGTTTGTGGCTCCCCAGGCCAGTATTGTGATTAATGCGC
GTTCCATGACACTTTACTTTGCCGGACTTAATGACCACGTGGTCTTTTGTCCACACCTTG
CGAAAGTCTAAAAGCACTCTGGTTAGAGCAGAAAAGCGGAGCCGACTATCAAGGTAACTT
TTGAGACATAGCGTAATCCGGATAGTTCAGCTACCAGTTACGCACCGCATCTGTTGTTCC
TCGACGCTCACTCCAGTCGGAGCAAAAAAATGGTCCCGTTGCCAACACGGACGAGAGCGT
TGTTGGAGATACCTGCTTAATCCCTTCACTACACACTATATAATAATCATATAGAGCCCT
TCGACAACGCGCTTATCTTTCTAATGTGCCGGCCTTCCTCTAACATTGAGTGCCGCACAT
GGTCTGTCGGATATTACCGCATGATCGCTCTAGCCTCTAGCTATCGAATGAGTCAGGAAG
CGGCGCCCCCGTCGTGCCCTGTGTTGAGGCTCCCGAATTTTCGTCGCCGATAACACACTT
GGACGGATACCAGATCTGAATAGTTTGGTTTTCATGCAACCTGTAAATCTATAGAGACGA
TCCATTACGTGTCCCCAGGCAGCCGTCATAGGCTGCCACTTGGGTACTCGCTAACGTGAA
GGTCCTTCTGTGCTCTGTATATTGACTAAGGTTTGTCGCACCTCACAAGGTTACGTCCTG
TCTAAAAATGTCCGCATTGATCAGCTTTTGGTCTTTCATCGGGACTCTCCACTATTCTGC
CACCCCGGCACTTTACGCGACCTGCAAGCTGCTGGGCATGCTAGGTATGCTGTCGGCATC
GTAGGGAAGTCCCTTATGCCAGGTCGTACCGTCGTTTGTCACCGGCAAATTCTTGAGGCA
AGTAATCTGACGAGTTAGCCCACAAGCATACACGACACGTCAAGGATCGGACGCCCACGA
GTGGAGTTTGATGTGGCCCAACTCTCATGTTTGCCCCGTGCTTGACGCTGCGCGCGATAT
TGCGATACGCAACGAACCTGCAAGATCCGAAAACGACTTATGAAGTGAACACCGAAAGAG
CGACATCAAGTCCTATCTCATCGGAGATCCTAGTTGTTTCCTCCATTGTGATCGGCTTAT
AGTCGTACTACTATTCTAACGCCTCTCAGCCTGAGGGTGTCGCGGTGACGACCCCAGGAG
ACTCCAAAATTGAGCCCTGATAGGCCAACGGATTGGAGATCGTTCAGCACCAGGAGCTGG
CGGCTCGACTACAGTGAATGACGACCGGAAAGGTTTCTCTCGATCAACCGACAATGACTG
GATGAGTGGAGCTCCTACCCTGATGGCCGGAGTAGTCAATAAAAAAGGGTAACCTCGTAC
CGTGTTTCCCTCGCAAAAGGTATATCACATAGCCTGTCTAGTGTAGTGACTCACTCCAAC
TTATCAAATTTGATCCGTCTCACTCATTAAACGCGGCTTTCGGCAGAGGGACCGTGATCT
CGTTTACTTCGC
Empty file.
13 changes: 13 additions & 0 deletions test/integration/test_call_variant_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -1035,3 +1035,16 @@ def test_call_variant_peptide_case68(self):
expected = self.data_dir/'fuzz/39/brute_force.txt'
reference = self.data_dir/'fuzz/39'
self.default_test_case(gvf, reference, expected)

def test_call_variant_peptide_case69(self):
""" In this test case, a peptide node that contains an indel, which is
incompatible with the the node of the orf start (meaning that the orf
start node locations overlap with the variant). #780
"""
gvf = [
self.data_dir/'fuzz/40/fake_variants.gvf',
self.data_dir/'fuzz/40/fake_circ_rna.gvf'
]
expected = self.data_dir/'fuzz/40/brute_force.txt'
reference = self.data_dir/'fuzz/40'
self.default_test_case(gvf, reference, expected)

0 comments on commit b5c260e

Please sign in to comment.