Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix callVariant #891

Merged
merged 8 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ disable=raw-checker-failed,
superfluous-parens,
unnecessary-lambda-assignment,
unnecessary-dunder-call,
unspecified-encoding
unspecified-encoding,
redefined-builtin

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.4.4] - 2025-02-11

### Fixed

- Fixed issue reported in #889 where variant bubbles alignment failed due to incorrect handling of in-bridge and out-bridge nodes with the same subgraph ID. The fix ensures that only in-bridge and out-bridge nodes connected to any of the nodes in the members of the variant bubble are considered.

- Fixed issue that `callVariant` fails on transcripts with SEC very close to the start codon.

## [1.4.3] - 2025-01-18

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from . import constant


__version__ = '1.4.3'
__version__ = '1.4.4'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
4 changes: 3 additions & 1 deletion moPepGen/svgraph/TVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import copy
from collections import deque
import math
import uuid
from Bio.Seq import Seq
from moPepGen.dna import DNASeqRecordWithCoordinates
from moPepGen import seqvar, aa
Expand All @@ -29,7 +30,7 @@ def __init__(self, seq:DNASeqRecordWithCoordinates,
variants:List[seqvar.VariantRecordWithCoordinate]=None,
branch:bool=False, orf:List[int]=None, reading_frame_index:int=None,
subgraph_id:str=None, global_variant:seqvar.VariantRecord=None,
level:int=0, was_bridge:bool=False):
level:int=0, was_bridge:bool=False, id:str=None):
""" Constructor for TVGNode.

Args:
Expand All @@ -48,6 +49,7 @@ def __init__(self, seq:DNASeqRecordWithCoordinates,
self.global_variant = global_variant
self.level = level
self.was_bridge = was_bridge
self.id = id or str(uuid.uuid4())

def create_node(self, seq:DNASeqRecordWithCoordinates,
variants:List[seqvar.VariantRecordWithCoordinate]=None,
Expand Down
12 changes: 12 additions & 0 deletions moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,6 +1227,16 @@ def find_bridge_nodes_between(self, start:TVGNode, end:TVGNode, members:Set[TVGN
subgraph_out:Set[TVGNode] = set()
visited = set()
queue:Deque[TVGNode] = deque([e.out_node for e in start.out_edges])

# This `end_nodes` is to set the boundray of the current variant bubble.
# Any "inbrige" and "outbrange" nodes to and from any node outside of
# the current variant bubble should not be considered.
end_nodes = {}
for n in members:
for out_node in n.get_out_nodes():
if out_node not in members:
end_nodes[out_node.id] = out_node

while queue:
cur = queue.pop()
len_before = len(visited)
Expand Down Expand Up @@ -1273,6 +1283,8 @@ def find_bridge_nodes_between(self, start:TVGNode, end:TVGNode, members:Set[TVGN
and cur.get_first_subgraph_id() != cur.get_last_subgraph_id():
subgraph_out.add(cur)
continue
if cur.id in end_nodes:
continue
elif cur is not end:
# This is when an altSplice indel/sub carries additional frameshift
# variant, and the indel frameshift is very closed to the end of
Expand Down
24 changes: 13 additions & 11 deletions moPepGen/svgraph/VariantPeptideDict.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,8 +465,7 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
cur_metadata.label = label
cur_metadata.has_variants = bool(cur_variants)

if is_valid:
cur_metadata_2 = copy.copy(cur_metadata)
if is_valid or is_valid_start:
cur_nodes = []
cut_offset = sec.location.start
for node in nodes:
Expand All @@ -481,15 +480,18 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
else:
cut_offset = max(0, cut_offset - len(node.seq.seq))
continue
cur_metadata_2.segments = self.create_peptide_segments(cur_nodes)
yield seq_mod, cur_metadata_2

if is_valid_start:
cur_seq=seq_mod[1:]
cur_nodes[0] = cur_nodes[0].copy()
cur_nodes[0].truncate_left(1)
cur_metadata.segments = self.create_peptide_segments(cur_nodes)
yield cur_seq, cur_metadata
if is_valid:
cur_metadata_2 = copy.copy(cur_metadata)
cur_metadata_2.segments = self.create_peptide_segments(cur_nodes)
yield seq_mod, cur_metadata_2

if is_valid_start:
cur_metadata_2 = copy.copy(cur_metadata)
cur_seq=seq_mod[1:]
cur_nodes[0] = cur_nodes[0].copy()
cur_nodes[0].truncate_left(1)
cur_metadata_2.segments = self.create_peptide_segments(cur_nodes)
yield cur_seq, cur_metadata_2

@staticmethod
def any_overlaps_and_all_missing_variant(nodes:Iterable[PVGNode], variant:VariantRecord):
Expand Down
6 changes: 4 additions & 2 deletions moPepGen/util/validate_variant_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def parse_args(subparsers:argparse._SubParsersAction):
nargs='*',
metavar='<values>'
)
common.add_args_debug_level(parser)
common.add_args_reference(parser, proteome=True, index=False)
parser.set_defaults(func=main)
common.print_help_if_missing_args(parser)
Expand Down Expand Up @@ -120,12 +121,13 @@ def call_variant(gvf_files:Path, ref_dir:Path, output_fasta:Path, graph_output_d
args.min_mw = 500.
args.min_length = 7
args.max_length = 25
args.max_variants_per_node = 9
args.additional_variants_per_misc = 2
args.max_variants_per_node = (7,)
args.additional_variants_per_misc = (2,)
args.min_nodes_to_collapse = 30
args.naa_to_collapse = 5
args.noncanonical_transcripts = False
args.threads = 1
args.timeout_seconds = 900
call_variant_peptide(args=args)

def call_brute_force(gvf_files:Path, ref_dir:Path, output_path:str, force:bool,
Expand Down
21 changes: 21 additions & 0 deletions test/files/fuzz/54/AltSplice.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.2
##mopepgen_version=1.4.3
##parser=parseRMATS
##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1
##genome_fasta=
##annotation_gtf=
##source=AltSplice
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
#CHROM POS ID REF ALT QUAL FILTER INFO
ENSG00000105976.16 52054 A5SS_52054-52497-52576 G <INS> . . TRANSCRIPT_ID=ENST00000495962.1;DONOR_GENE_ID=ENSG00000105976.16;DONOR_START=52498;DONOR_END=52576;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116724249:116724250
ENSG00000105976.16 52577 A5SS_52054-52497-52580 G <DEL> . . TRANSCRIPT_ID=ENST00000495962.1;START=52577;END=52580;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116724772:116724775
ENSG00000105976.16 67755 A5SS_59664-67754-67818 G <DEL> . . TRANSCRIPT_ID=ENST00000495962.1;START=67755;END=67818;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116739950:116740013
ENSG00000105976.16 68657 A5SS_67889-68656-68740 A <DEL> . . TRANSCRIPT_ID=ENST00000495962.1;START=68657;END=68740;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116740852:116740935
8 changes: 8 additions & 0 deletions test/files/fuzz/54/annotation.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
chr1 . gene 1 126182 . + . gene_id ENSG00000105976.16; gene_type protein_coding; gene_name MET;
chr1 . transcript 51951 83219 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
chr1 . exon 51951 52054 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
chr1 . exon 52577 52652 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
chr1 . exon 59473 59664 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
chr1 . exon 67755 67889 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
chr1 . exon 68657 68830 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
chr1 . exon 83160 83219 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
21 changes: 21 additions & 0 deletions test/files/fuzz/54/brute_force.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
ATHFLSLGRSVAGATTNVCDR
ATHFLSLGRSVAGATTNVCDRR
ATHWLSLGRSVAGATTNVCDR
ATHWLSLGRSVAGATTNVCDRR
CGFCHDK
CGFCHDKCVR
CGWCHDK
CGWCHDKCVR
FMQCLQK
GDLTIANLGTSEGRFMQCLQK
KCGFCHDK
KCGFCHDKCVR
KCGWCHDK
KCGWCHDKCVR
MATHFLSLGRSVAGATTNVCDR
MATHFLSLGRSVAGATTNVCDRR
MATHWLSLGRSVAGATTNVCDR
MATHWLSLGRSVAGATTNVCDRR
SVAGATTNVCDR
SVAGATTNVCDRR
SVAGATTNVCDRRNA
13 changes: 13 additions & 0 deletions test/files/fuzz/54/gSNP.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
##fileformat=VCFv4.2
##mopepgen_version=1.4.3
##parser=parseVEP
##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1
##genome_fasta=
##annotation_gtf=
##source=gSNP
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
#CHROM POS ID REF ALT QUAL FILTER INFO
ENSG00000105976.16 52574 MNV-52574-CA-C CA C . . TRANSCRIPT_ID=ENST00000495962.1;GENOMIC_POSITION=chr7:116724770;GENE_SYMBOL=MET
Loading
Loading