fix (moPepGen): --include-coding and --find-ass added to `callNon…

…coding` and `callVariant` to call novel ORF peptides from coding transcripts. #659
uclahs-cds · Mar 21, 2024 · 7b51502 · 7b51502
1 parent ae63236
commit 7b51502
Show file tree

Hide file tree

Showing 10 changed files with 79 additions and 284 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.3.2]
+
+- `--include-coding` and `--find-ass` added to `callNoncoding` and `callVariant` to call novel ORF peptides from coding transcripts. #659
+
 ## [1.3.1]
 
 ### Added:

diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
@@ -8,7 +8,7 @@
 from . import constant
 
 
-__version__ = '1.3.1'
+__version__ = '1.3.2'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'

diff --git a/moPepGen/aa/VariantPeptideIdentifier.py b/moPepGen/aa/VariantPeptideIdentifier.py
@@ -7,7 +7,6 @@
 
 if TYPE_CHECKING:
     from moPepGen.seqvar import VariantRecord
-    from moPepGen.gtf import GenomicAnnotation
 
 def create_variant_peptide_id(transcript_id:str, variants:List[VariantRecord],
         orf_id:str=None, index:int=None, gene_id:str=None) -> str:

diff --git a/moPepGen/cli/__init__.py b/moPepGen/cli/__init__.py
@@ -18,4 +18,3 @@
 from .decoy_fasta import add_subparser_decoy_fasta, decoy_fasta
 from .summarize_fasta import add_subparser_summarize_fasta, summarize_fasta
 from .update_index import add_subparser_update_index, update_index
-from .call_alt_start_site import add_subparser_call_alt_start, call_alt_start
diff --git a/moPepGen/cli/__main__.py b/moPepGen/cli/__main__.py
@@ -24,7 +24,6 @@
 -- Calling
    callVariant         Call non-canonical peptides from genomic variants.
    callNoncoding       Call non-canonical peptides from noncoding transcripts.
-   callAltStart        Call non-canonical peptides from alternative start sites.
    callAltTranslation  Call non-canonital peptides with alternative translation
                        from coding transcripts.
 
@@ -68,7 +67,6 @@ def main():
     cli.add_subparser_parse_circexplorer(subparsers)
     cli.add_subparser_call_variant(subparsers)
     cli.add_subparser_call_noncoding(subparsers)
-    cli.add_subparser_call_alt_start(subparsers)
     cli.add_subparser_call_alt_translation(subparsers)
     cli.add_subparser_split_fasta(subparsers)
     cli.add_subparser_filter_fasta(subparsers)

diff --git a/moPepGen/cli/call_alt_start_site.py b/moPepGen/cli/call_alt_start_site.py
diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py
@@ -60,6 +60,11 @@ def add_subparser_call_noncoding(subparsers:argparse._SubParsersAction):
         default='max',
         metavar='<choice>'
     )
+    p.add_argument(
+        '--include-coding',
+        action='store_true',
+        help='Include coding transcripts to find alternative ORFs.'
+    )
     p.add_argument(
         '--w2f-reassignment',
         action='store_true',
@@ -117,22 +122,26 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
 
     inclusion_biotypes, exclusion_biotypes = common.load_inclusion_exclusion_biotypes(args)
 
-    noncanonical_pool = aa.VariantPeptidePool()
+    noval_orf_peptide_pool = aa.VariantPeptidePool()
     orf_pool = []
 
     i = 0
     for tx_id in anno.transcripts:
         tx_model = anno.transcripts[tx_id]
-        if inclusion_biotypes and \
-                tx_model.transcript.biotype not in inclusion_biotypes:
-            continue
-        if exclusion_biotypes and \
-                tx_model.transcript.biotype in exclusion_biotypes:
-            continue
-        if tx_id in proteome:
-            continue
-        if tx_model.transcript_len() < args.min_tx_length:
-            continue
+        if tx_model.is_protein_coding:
+            if not args.include_coding:
+                continue  # coding transcripts are opt-in via --include-coding
+        else:
+            if inclusion_biotypes and \
+                    tx_model.transcript.biotype not in inclusion_biotypes:
+                continue
+            if exclusion_biotypes and \
+                    tx_model.transcript.biotype in exclusion_biotypes:
+                continue
+            if tx_id in proteome:
+                continue
+            if tx_model.transcript_len() < args.min_tx_length:
+                continue
 
         try:
             peptides, orfs = call_noncoding_peptide_main(
@@ -149,7 +158,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
             orf_pool.extend(orfs)
 
             for peptide in peptides:
-                noncanonical_pool.add_peptide(peptide, canonical_peptides,
+                noval_orf_peptide_pool.add_peptide(peptide, canonical_peptides,
                     cleavage_params)
         except ReferenceSeqnameNotFoundError as e:
             if not ReferenceSeqnameNotFoundError.raised:
@@ -163,7 +172,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
         if i % 5000 == 0:
             logger.info('%i transcripts processed.', i)
 
-    noncanonical_pool.write(args.output_path)
+    noval_orf_peptide_pool.write(args.output_path)
     if args.output_orf:
         with open(args.output_orf, 'w') as handle:
             write_orf(orf_pool, handle)

diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
@@ -48,6 +48,12 @@ def __init__(self, query:FeatureLocation, ref:FeatureLocation, feature_type:str,
         self.feature_id = feature_id
         self.variant_id = variant_id
 
+    def __repr__(self):
+        """ Debug string listing query, ref, feature_type, feature_id and variant_id. """
+        return f"<PeptideSegment query={self.query} ref={self.ref}" +\
+            f" feature_type={self.feature_type} feature_id={self.feature_id}" +\
+            f" variant_id={self.variant_id}>"
+
     def merge(self, other:PeptideSegment) -> PeptideSegment:
         """ merge """
         query = FeatureLocation(

diff --git a/test/integration/test_call_alt_start.py b/test/integration/test_call_alt_start.py