Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Peptides with variants that are incompetible with orf start #781

Merged
merged 8 commits into from
Jul 29, 2023
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

- Fixed issue that circRNA peptide nodes with and without a silent mutation were collapsed. #778

- Fixed that circRNA peptides that carry indels incompatible with the orf start node should not be called. #780

- Fixed that hybrid node was not identified if the variant is in the end of an exon. #782

- Fixed that in circRNA, cleavage gain from upstream node is added to the the wrong ORF. #783

## [1.2.0] - 2023-07-04

### Fixed
Expand Down
7 changes: 7 additions & 0 deletions moPepGen/SeqFeature.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,10 @@ def get_ref_dna_start(self) -> int:
def get_ref_dna_end(self) -> int:
""" Get the end position of the sequence on the gene/transcript """
return self.get_ref_codon_end() - self.query.end_offset

def get_ref_dna_location(self) -> FeatureLocation:
""" Get the reference location in dna coordinate """
return FeatureLocation(
start=self.get_ref_dna_start(),
end=self.get_ref_dna_end()
)
113 changes: 107 additions & 6 deletions moPepGen/svgraph/PVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,69 @@ def get_last_rf_index(self) -> int:
""" Get the last fragment's reading frame index """
return self._get_nth_rf_index(-1)

def _get_nth_subgraph_id(self, i) -> str:
""" Get the nth fragment's subgraph ID """
if (i > 0 or i < -1) and not self.has_multiple_segments():
raise ValueError('Node does not have multiple segments')

locations = [(loc.query, loc.ref.seqname) for loc in self.seq.locations]
locations += [(v.location, v.location.seqname) for v in self.variants
if not v.variant.is_circ_rna()]

locations = sorted(locations, key=lambda x: x[0])

return locations[i][1]

def get_first_subgraph_id(self) -> str:
""" Get the first fragment's subgraph ID """
return self._get_nth_subgraph_id(0)

def get_last_subgraph_id(self) -> str:
""" Get the last fragment's subgraph ID """
return self._get_nth_subgraph_id(-1)

def get_first_dna_ref_position(self) -> int:
""" Get the first DNA reference position """
if self.seq.locations:
first_seq_loc = self.seq.locations[0]
else:
first_seq_loc = None

non_circ_variants = [v for v in self.variants if not v.variant.is_circ_rna()]
if non_circ_variants:
first_var_loc = non_circ_variants[0].location
else:
first_var_loc = None

if not first_seq_loc and not first_var_loc:
raise ValueError(f"The node (seq={str(self.seq.seq)}) does not have loc")

if not first_var_loc or first_seq_loc.query < first_var_loc:
return first_seq_loc.get_ref_dna_start()

return non_circ_variants[0].variant.location.end

def get_last_dna_ref_position(self) -> int:
""" Get the first DNA reference position """
if self.seq.locations:
last_seq_loc = self.seq.locations[-1]
else:
last_seq_loc = None

non_circ_variants = [v for v in self.variants if not v.variant.is_circ_rna()]
if non_circ_variants:
last_var_loc = non_circ_variants[-1].location
else:
last_var_loc = None

if not last_seq_loc and not last_var_loc:
raise ValueError(f"The node (seq={str(self.seq.seq)}) does not have loc")

if not last_var_loc or last_seq_loc.query > last_var_loc:
return last_seq_loc.get_ref_dna_end()

return non_circ_variants[-1].variant.location.end

def remove_out_edges(self) -> None:
""" remove output nodes """
for node in copy.copy(self.out_nodes):
Expand Down Expand Up @@ -888,11 +951,11 @@ def has_variants_not_in(self, variants:Set[seqvar.VariantRecord]) -> bool:
return True
return False

def is_hybrid_node(self, tree:SubgraphTree) -> bool:
def is_hybrid_node(self, tree:SubgraphTree, check_hard:bool=True) -> bool:
""" Checks if the node is hybrid. A hybrid node is when two parts of
the node sequence are from different subgraphs, and have the different
states. """
variants = {v.variant for v in self.variants}
variants = {v.variant for v in self.variants if not v.variant.is_circ_rna()}

seq_iter = iter(self.seq.locations)
var_iter = iter(self.variants)
Expand All @@ -901,8 +964,19 @@ def is_hybrid_node(self, tree:SubgraphTree) -> bool:
cur_var = next(var_iter, None)

start, end = 0, 0
start_hard, end_hard = 0, 0
cur_level = -1

def set_hard_start_and_end(loc, start, end):
start_hard = start
end_hard = end
if check_hard:
if loc.start_offset != 0:
start_hard += 1
if loc.end_offset != 0:
end_hard -= 1
return start_hard, end_hard

while cur_seq or cur_var:
if cur_var and cur_var.variant.is_circ_rna():
cur_var = next(var_iter, None)
Expand All @@ -929,37 +1003,64 @@ def is_hybrid_node(self, tree:SubgraphTree) -> bool:
if cur_level == -1:
cur_level = seq_level
start, end = seq_start, seq_end
start_hard, end_hard = set_hard_start_and_end(cur_seq.query, start, end)

elif seq_level > cur_level:
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True
cur_level = seq_level
start, end = seq_start, seq_end
start_hard, end_hard = set_hard_start_and_end(cur_seq.query, start, end)
else:
end = seq_end
start_hard, end_hard = set_hard_start_and_end(cur_seq.query, start, end)

if cur_seq is self.seq.locations[-1] and not cur_var:
seg = self[start:seq_end]
return seg.is_missing_any_variant(variants)
end = seq_end
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True

cur_seq = next(seq_iter, None)

else:
if cur_level == -1:
cur_level = var_level
start, end = var_start, var_end
start_hard, end_hard = set_hard_start_and_end(cur_var.location, start, end)
elif var_level > cur_level:
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True
cur_level = var_level
start, end = var_start, var_end
start_hard, end_hard = set_hard_start_and_end(cur_var.location, start, end)
else:
end = var_end
start_hard, end_hard = set_hard_start_and_end(cur_var.location, start, end)

if cur_var is self.variants[-1] and not cur_seq:
seg = self[start:var_end]
return seg.is_missing_any_variant(variants)
end = var_end
seg = self[start:end]
if seg.is_missing_any_variant(variants):
return True
if start_hard < end_hard:
seg = self[start_hard:end_hard]
if seg.is_missing_any_variant(variants):
return True

cur_var = next(var_iter, None)

Expand Down
15 changes: 14 additions & 1 deletion moPepGen/svgraph/PVGNodeCollapser.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,20 @@ def __eq__(self, other:PVGPopCollapseNode):
and {v.variant for v in self.variants if v.variant.is_indel()} \
== {v.variant for v in other.variants if v.variant.is_indel()} \
and self.subgraph_id == other.subgraph_id \
and self.level == other.level
and self.level == other.level \
and (
(
not any(v.variant.is_circ_rna() for v in self.variants)
and not any(v.variant.is_circ_rna() for v in other.variants)
)
or {
(v.variant.id, v.location.seqname)
for v in self.variants if not v.variant.is_circ_rna()
} == {
(v.variant.id, v.location.seqname)
for v in other.variants if not v.variant.is_circ_rna()
}
)

if result and hasattr(other, 'match'):
other.match = self
Expand Down
53 changes: 50 additions & 3 deletions moPepGen/svgraph/PVGOrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Set, List
import copy
from moPepGen import circ, seqvar
from moPepGen.SeqFeature import FeatureLocation
from moPepGen.svgraph.PVGNode import PVGNode
from moPepGen.svgraph.SubgraphTree import SubgraphTree

Expand All @@ -11,21 +12,26 @@ class PVGOrf():
""" Helper class for an ORF and its corresponding start gain variants. """
def __init__(self, orf:List[int,int]=None,
start_gain:Set[seqvar.VariantRecord]=None, start_node:PVGNode=None,
subgraph_id:str=None, node_offset:int=None):
subgraph_id:str=None, node_offset:int=None,
cleavage_gain:Set[seqvar.VariantRecord]=None):
""" constructor """
self.orf = orf or None
self.start_gain = start_gain or set()
self.start_node = start_node
self.subgraph_id = subgraph_id
self.node_offset = node_offset
self._orf_node_locs = None
self.cleavage_gain = cleavage_gain or set()

def copy(self) -> PVGOrf:
""" copy """
orf = self.orf
start_gain = copy.copy(self.start_gain)
cleavage_gain = copy.copy(self.cleavage_gain)
return self.__class__(
orf=orf, start_gain=start_gain, start_node=self.start_node,
subgraph_id=self.subgraph_id, node_offset=self.node_offset
subgraph_id=self.subgraph_id, node_offset=self.node_offset,
cleavage_gain=cleavage_gain
)

def __eq__(self, other:PVGOrf) -> bool:
Expand Down Expand Up @@ -70,6 +76,20 @@ def __hash__(self):
""" hash """
return hash((tuple(self.orf), frozenset(self.start_gain)))

def get_orf_node_locs(self) -> List[FeatureLocation]:
""" Get locations of the orf node. """
if self._orf_node_locs:
return self._orf_node_locs
locs = {
loc.get_ref_dna_location() for loc in self.start_node.seq.locations
}
locs.update([
v.variant.location for v in self.start_node.variants
if not v.variant.is_circ_rna()
])
self._orf_node_locs = locs
return sorted(locs)

def location_is_at_least_one_loop_downstream(self, i:int, subgraph_id:str,
subgraphs:SubgraphTree, circ_rna:circ.CircRNAModel) -> bool:
""" Whether the genetic or transcriptional location i with the given
Expand Down Expand Up @@ -152,7 +172,7 @@ def is_valid_orf(self, node:PVGNode, subgraphs:SubgraphTree,
upstream_variants = upstream_variants or set()

if not self.node_is_at_least_one_loop_downstream(node, subgraphs, circ_rna):
return True
return not self.has_any_incompatible_variant(node)

start_gain = {x for x in self.start_gain if not x.is_circ_rna()}
start_gain.update(upstream_variants)
Expand Down Expand Up @@ -192,3 +212,30 @@ def is_valid_orf_to_misc_nodes(self, nodes:List[PVGNode], subgraphs:SubgraphTree
downstream_valid = True

return downstream_valid

def has_any_incompatible_variant(self, node:PVGNode) -> bool:
""" Checks if the orf is missing any frameshift variants before the
orf start node. """
for v in node.variants:
if v.variant.is_circ_rna():
continue

if not self.is_compatible_with_variant(v):
return True
return False

def is_compatible_with_variant(self, variant:seqvar.VariantRecordWithCoordinate) -> bool:
""" Checks if the ORF is compatible with a given variant. A variant is
incompatible if it overlaps with any of the locations of the ORF start
node but not carried by the start node or is not a start gain variant
already. """
orf_variants = {v.variant.id for v in self.start_node.variants}
orf_variants.update({v.id for v in self.start_gain})

orf_node_locs = self.get_orf_node_locs()

orf_subgraph = self.start_node.get_first_subgraph_id()

return variant.location.seqname == orf_subgraph \
or not any(variant.variant.location.overlaps(loc) for loc in orf_node_locs) \
or variant.variant.id in orf_variants
34 changes: 24 additions & 10 deletions moPepGen/svgraph/PeptideVariantGraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,8 +1053,12 @@ def call_and_stage_unknown_orf(self, cursor:PVGCursor,

if in_cds and not target_node.npop_collapsed:
cur_copy = target_node.copy(in_nodes=False)
additional_variants = cursor.cleavage_gain
cur_orfs = copy.copy(orfs)
if self.is_circ_rna():
additional_variants = []
else:
additional_variants = cursor.cleavage_gain

node_list.append((cur_copy, cur_orfs, False, additional_variants))
trash.add(cur_copy)

Expand Down Expand Up @@ -1085,7 +1089,11 @@ def call_and_stage_unknown_orf(self, cursor:PVGCursor,
subgraph_id=orf_subgraph_id, node_offset = start_index
)
orfs.append(orf)
additional_variants = copy.copy(cleavage_gain_down)
if self.is_circ_rna():
additional_variants = []
orf.cleavage_gain = set(cleavage_gain_down)
else:
additional_variants = copy.copy(cleavage_gain_down)
cur_orfs = [orfs[0]] if traversal.orf_assignment == 'min' else [orfs[-1]]
node_list.append((cur_copy, cur_orfs, True, additional_variants))
trash.add(cur_copy)
Expand Down Expand Up @@ -1141,21 +1149,27 @@ def call_and_stage_unknown_orf(self, cursor:PVGCursor,
for x in cur_orfs:
x.start_gain.add(variant.variant)

cur_cleavage_gain = copy.copy(cleavage_gain)
if self.is_circ_rna():
cur_cleavage_gain = []
else:
cur_cleavage_gain = copy.copy(cleavage_gain)

if self.is_circ_rna():
circ_rna = traversal.circ_rna
filtered_orfs = []
for orf_i in cur_orfs:
orf_i_2 = orf_i.copy()
if orf_i_2.node_is_at_least_one_loop_downstream(
if not orf_i_2.is_valid_orf(out_node, self.subgraphs, circ_rna):
continue

if not orf_i_2.node_is_at_least_one_loop_downstream(
out_node, self.subgraphs, circ_rna):
if orf_i_2.is_valid_orf(out_node, self.subgraphs, circ_rna):
filtered_orfs.append(orf_i_2)
else:
orf_i_2.start_gain.update({x.variant for x in out_node.variants
if x.not_cleavage_altering()})
filtered_orfs.append(orf_i_2)
for v in out_node.variants:
if v.not_cleavage_altering() \
and orf_i_2.is_compatible_with_variant(v):
orf_i_2.start_gain.add(v.variant)
orf_i_2.cleavage_gain = set(cleavage_gain)
filtered_orfs.append(orf_i_2)
if not filtered_orfs:
cur_in_cds = False
else:
Expand Down
3 changes: 3 additions & 0 deletions moPepGen/svgraph/VariantPeptideDict.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,9 @@ def join_miscleaved_peptides(self, pool:T, check_variants:bool,
for v in additional_variants:
if v.id not in variants:
variants[v.id] = v
for v in valid_orf.cleavage_gain:
if v.id not in variants:
variants[v.id] = v
for v in series.additional_variants:
if v.id not in variants:
variants[v.id] = v
Expand Down
Loading