Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Support for Fusion into the Graph Algorithm #4

Merged
merged 7 commits into from
Jun 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 0 additions & 58 deletions docs/file_format.md

This file was deleted.

7 changes: 0 additions & 7 deletions docs/mop_point_mutation.txt

This file was deleted.

2 changes: 2 additions & 0 deletions moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from datetime import datetime


__version__ = '0.0.1'

class _CaptureEq:
"""Object wrapper that remembers "other" for successful equality tests.
Adopted from https://code.activestate.com/recipes/499299/
Expand Down
43 changes: 38 additions & 5 deletions moPepGen/cli/call_variant_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,39 @@ def call_variant_peptide(args:argparse.Namespace) -> None:
seq=transcript_seq,
transcript_id=transcript_id
)
dgraph.create_variant_graph(variant_records)

## Create transcript variant graph
# dgraph.create_variant_graph(variant_records)
dgraph.add_null_root()
variant_iter = iter(variant_records)
variant = next(variant_iter, None)
cur = dgraph.root.get_reference_next()
while variant:
if cur.seq.locations[0].ref.start > variant.location.start:
variant = next(variant_iter, None)
continue

if cur.seq.locations[-1].ref.end <= variant.location.start:
cur = cur.get_reference_next()
continue

if variant.type == 'Fusion':
# TODO: inplement fusion
donor_transcript_id = variant.attrs['DONOR_TRANSCRIPT_ID']
donor_anno = annotation[donor_transcript_id]
donor_chrom = donor_anno.transcript.location.seqname
donor_seq = anno.get_transcript_sequence(genome[donor_chrom])
donor_variant_records = variants[donor_transcript_id] \
if donor_transcript_id in variants else []
cur = dgraph.apply_fusion(cur, variant, donor_seq, donor_variant_records)
variant = next(variant_iter, None)
continue

cur = dgraph.apply_variant(cur, variant)
if len(cur.in_edges) == 0:
dgraph.root = cur
variant = next(variant_iter, None)

dgraph.fit_into_codons()
pgraph = dgraph.translate()

Expand Down Expand Up @@ -118,11 +150,12 @@ def call_variant_peptide(args:argparse.Namespace) -> None:

if __name__ == '__main__':
test_args = argparse.Namespace()
test_args.input_variants = [
'test/files/CPCG0100_gencode_moPepGen.txt'
test_args.input_variant = [
'test/files/vep/vep.tvf',
'test/files/fusion/fusion.tvf'
]
test_args.index_dir = 'test/files/gencode_34_index'
test_args.output_fasta = 'test/files/CPCG0100_gencode_v34_moPepGen.fasta'
test_args.index_dir = 'test/files/index'
test_args.output_fasta = 'test/files/vep/vep_moPepGen.fasta'
test_args.verbose = True
test_args.cleavage_rule = 'trypsin'
test_args.miscleavage = 2
Expand Down
46 changes: 32 additions & 14 deletions moPepGen/cli/parse_vep.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
""" VEP2VariantPeptides module """
from typing import Dict, List
import pathlib
import argparse
import pickle
from moPepGen.vep import VepIO
from moPepGen.parser import VEPParser
from moPepGen import gtf, dna, seqvar, logger


Expand All @@ -12,8 +13,10 @@ def parse_vep(args:argparse.Namespace) -> None:
vep_files:List[str] = args.vep_txt
index_dir:str = args.index_dir
output_prefix:str = args.output_prefix
output_path = output_prefix + '_moPepGem.bed'
output_path = output_prefix + '.tvf'
verbose = args.verbose
genome_fasta = None
annotation_gtf = None

if verbose:
logger('moPepGen parseVEP started.')
Expand Down Expand Up @@ -45,7 +48,7 @@ def parse_vep(args:argparse.Namespace) -> None:
vep_records:Dict[str, List[seqvar.VariantRecord]] = {}

for vep_file in vep_files:
for record in VepIO.parse(vep_file):
for record in VEPParser.parse(vep_file):
transcript_id = record.feature

if transcript_id not in vep_records.keys():
Expand All @@ -69,12 +72,27 @@ def parse_vep(args:argparse.Namespace) -> None:
if verbose:
logger('VEP sorting done.')

with open(output_path, 'w') as handle:
mode = 'w'
for records in vep_records.values():
seqvar.io.write(records, handle, mode)
if mode == 'w':
mode = 'a'
if index_dir:
reference_index = pathlib.Path(index_dir).absolute()
genome_fasta = None
annotation_gtf = None
else:
reference_index = None
genome_fasta = pathlib.Path(genome_fasta).absolute()
annotation_gtf = pathlib.Path(annotation_gtf).absolute()

metadata = seqvar.TVFMetadata(
parser='parseVEP',
reference_index=reference_index,
genome_fasta=genome_fasta,
annotation_gtf=annotation_gtf
)

all_records = []
for records in vep_records.values():
all_records.extend(records)

seqvar.io.write(all_records, output_path, metadata)

if verbose:
logger('Variant info written to disk.')
Expand All @@ -83,12 +101,12 @@ def parse_vep(args:argparse.Namespace) -> None:
if __name__ == '__main__':
test_args = argparse.Namespace()
test_args.vep_txt = [
'test/files/vep_test_files/vep_snp.txt',
'test/files/vep_test_files/vep_indel.txt'
'test/files/vep/vep_snp.txt',
'test/files/vep/vep_indel.txt'
]
test_args.index_dir = None
test_args.genome_fasta = 'test/files/vep_test_files/genome.fasta'
test_args.annotation_gtf = 'test/files/vep_test_files/annotation.gtf'
test_args.output_prefix = 'test/files/vep_test_files/vep'
test_args.genome_fasta = 'test/files/genome.fasta'
test_args.annotation_gtf = 'test/files/annotation.gtf'
test_args.output_prefix = 'test/files/vep/vep'
test_args.verbose = True
parse_vep(args=test_args)
54 changes: 52 additions & 2 deletions moPepGen/vep/VepRecord.py → moPepGen/parser/VEPParser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,60 @@
""" This module defines the class for a single VEP record
"""
from typing import List, Tuple
from __future__ import annotations
from typing import List, Tuple, Iterable
import re
from moPepGen.SeqFeature import FeatureLocation
from moPepGen import seqvar, dna


def parse(path:str) -> Iterable[VEPRecord]:
""" Parse a VEP output text file and return as an iterator.

Args:
path (str): Path to the REDItools output table.

Return:
A iterable of VEPRecord.
"""
with open(path, 'r') as handle:
for line in handle:
if line.startswith('#'):
continue
line = line.rstrip()
fields = line.split('\t')

consequences = fields[6].split(',')

amino_acids = tuple(aa for aa in fields[10].split('/'))
if len(amino_acids) == 1:
amino_acids = (amino_acids[0], '')

codons = tuple(codon for codon in fields[11].split('/'))
if len(codons) == 1:
codons = (codons[0], '')

extra = {}
for field in fields[13].split(';'):
key, val = field.split('=')
extra[key] = val

yield VEPRecord(
uploaded_variation=fields[0],
location=fields[1],
allele=fields[2],
gene=fields[3],
feature=fields[4],
feature_type=fields[5],
consequences=consequences,
cdna_position=fields[7],
cds_position=fields[8],
protein_position=fields[9],
amino_acids=amino_acids,
codons=codons,
existing_variation='' if fields[12] == '-' else fields[12],
extra=extra
)

class VEPRecord():
""" A VEPRecord object holds the an entry from the VEP output. The VEP
output is defined at https://uswest.ensembl.org/info/docs/tools/vep/
Expand Down Expand Up @@ -119,7 +168,8 @@ def convert_to_variant_record(self, seq:dna.DNASeqRecord
ref=ref,
alt=alt,
_type=_type,
_id=_id
_id=_id,
attrs={'GENE_ID': self.gene}
)
except ValueError as e:
raise ValueError(e.args[0] + f' [{self.feature}]') from e
2 changes: 1 addition & 1 deletion moPepGen/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
""" Module for moPepGen parsers from different variant sources. """
from moPepGen.parser.REDItoolsParser import REDItoolsRecord
from moPepGen.parser import REDItoolsParser
from moPepGen.parser import REDItoolsParser, VEPParser
Loading