michauhl · michauhl · Sep 10, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 13, 2024
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-exclude test *
+exclude README.md
diff --git a/README.md b/README.md
@@ -90,7 +90,7 @@ More information will be added soon.
 
 ## Documentation
 
-This documentation provides further details on RBPBench (version 0.9).
+This documentation provides further details on RBPBench (version 0.9.1).
 
 ### Program modes
 

diff --git a/bin/batch_table_wrapper_rbpbench.py b/bin/batch_table_wrapper_rbpbench.py
@@ -68,25 +68,59 @@ def setup_argument_parser():
                    type=int,
                    default=1,
                    choices=[1, 2, 3],
-                   help="Built-in motif database to use. 1: human RBP motifs full (259 RBPs, 605 motifs, \"catrapid.omics.v2.1.human.6plus\"), 2: human RBP motifs full (low frequencies not rounded, \"catrapid.omics.v2.1.human.6plus.noround\"), 3: human RBP motifs eCLIP (107 RBPs, 316 motifs, \"s6_refined_ic010.human.rounded.encode_rbps\") (default: 1)")
-    p.add_argument("--fimo-nt-freqs",
-                   dest="fimo_nt_freqs",
+                   help="Built-in motif database to use (currently there are 3 human databases: 1,2,3; see details in rbpbench) (default: 1)")
+    p.add_argument("--fimo-ntf-file",
+                   dest="fimo_user_ntf_file",
                    type=str,
                    metavar='str',
-                   default=False,
-                   help="Provide FIMO nucleotide frequencies (FIMO option: --bifile) file (default: use internal frequencies file optimized for human transcripts)")
+                   default = False,
+                   help = "Provide FIMO nucleotide frequencies (FIMO option: --bfile) file (default: use internal frequencies file, define which with --fimo-ntf-mode)")
+    p.add_argument("--fimo-ntf-mode",
+                   dest="fimo_ntf_mode",
+                   type=int,
+                   default=1,
+                   choices=[1, 2, 3],
+                   help="Set which internal nucleotide frequencies to use for FIMO search. 1: use frequencies from human ENSEMBL transcripts (excluding introns, A most prominent) 2: use frequencies from human ENSEMBL transcripts (including introns, resulting in lower G+C and T most prominent) 3: use uniform frequencies (same for every nucleotide) (default: 1)")
     p.add_argument("--fimo-pval",
                    dest="fimo_pval",
                    type=float,
                    metavar='float',
                    default=0.001,
                    help="FIMO p-value threshold (FIMO option: --thresh) (default: 0.001)")
+    p.add_argument("--cmsearch-bs",
+                   dest="cmsearch_bs",
+                   type=float,
+                   metavar='float',
+                   default=1.0,
+                   help="CMSEARCH bit score threshold (CMSEARCH options: -T --incT). The higher the more strict (default: 1.0)")
+    p.add_argument("--cmsearch-mode",
+                   dest="cmsearch_mode",
+                   type=int,
+                   default=1,
+                   choices=[1, 2],
+                   help="Set CMSEARCH mode to control strictness of filtering. 1: default setting (CMSEARCH option: --default). 2: max setting (CMSEARCH option: --max), i.e., turn all heuristic filters off, slower and more sensitive / more hits) (default: 1)")
+    p.add_argument("--greatest-hits",
+                   dest="greatest_hits",
+                   default = False,
+                   action = "store_true",
+                   help = "Keep only best FIMO/CMSEARCH motif hits (i.e., hit with lowest p-value / highest bit score for each motif sequence/site combination). By default, report all hits (default: False)")
     p.add_argument("--bed-score-col",
                    dest="bed_score_col",
                    type=int,
                    metavar='int',
                    default=5,
                    help="--in BED score column used for p-value calculations. BED score can be e.g. log2 fold change or -log10 p-value of the region (default: 5)")
+    p.add_argument("--bed-sc-thr",
+                   dest="bed_sc_thr",
+                   type = float,
+                   metavar='float',
+                   default = None,
+                   help = "Minimum site score (by default: --in BED column 5, or set via --bed-score-col) for filtering (assuming higher score == better site) (default: None)")
+    p.add_argument("--bed-sc-thr-rev",
+                   dest="bed_sc_thr_rev_filter",
+                   default = False,
+                   action = "store_true",
+                   help = "Reverse --bed-sc-thr filtering (i.e. the lower the better, e.g. for p-values) (default: False)")
     p.add_argument("--unstranded",
                    dest="unstranded",
                    default=False,
@@ -121,11 +155,6 @@ def setup_argument_parser():
                    choices=[1, 2, 3],
                    help="Defines Fisher exact test alternative hypothesis for testing co-occurrences of RBP motifs. 1: greater, 2: two-sided, 3: less (default: 1)")
     # Report.
-    p.add_argument("--report",
-                   dest="report",
-                   default = False,
-                   action = "store_true",
-                   help = "Generate an .html report containing various plots to compare input datasets (default: False)")
     p.add_argument("--kmer-size",
                    dest="kmer_size",
                    type=int,
@@ -137,6 +166,11 @@ def setup_argument_parser():
                    default = False,
                    action = "store_true",
                    help = "Do not produce gene region occupancy heatmap plot in HTML report (default: False)")
+    p.add_argument("--disable-heatmap-cluster-olo",
+                   dest="disable_heatmap_cluster_olo",
+                   default = False,
+                   action = "store_true",
+                   help="Disable optimal leaf ordering (OLO) for clustering gene region occupancy heatmap. By default, OLO is enabled")
     p.add_argument("--report-header",
                    dest="report_header",
                    default = False,
@@ -184,7 +218,78 @@ def setup_argument_parser():
                    type=float,
                    metavar='float',
                    default=0.1,
-                   help="Minimum amount of overlap required for a region to be assigned to a GTF feature (if less or no overlap, region will be assigned to \"intergenic\") (default: 0.1)")
+                   help="Minimum amount of overlap required for a region to be assigned to a GTF feature (if less or no overlap, region will be assigned to \"intergenic\"). If there is overlap with several features, assign the one with highest overlap (default: 0.1)")
+    p.add_argument("--gtf-eib-min-overlap",
+                   dest="gtf_eib_min_overlap",
+                   type=float,
+                   metavar='float',
+                   default=0.9,
+                   help="Minimum amount input region has to overlap with exon (e), intron (i), i + ei borders to be counted as overlapping with these (note that the amount is reciprocal, i.e., one of the overlapping parts meeting the minimum amount is enough) (default: 0.9)")
+    p.add_argument("--gtf-intron-border-len",
+                   dest="gtf_intron_border_len",
+                   type=int,
+                   metavar='int',
+                   default=250,
+                   help="Set intron border region length (up- + downstream ends) for exon intron overlap statistics (default: 250)")
+    # GO enrichment analysis for batch mode.
+    p.add_argument("--goa",
+                   dest="run_goa",
+                   default = False,
+                   action = "store_true",
+                   help = "Run gene ontology (GO) enrichment analysis on genes occupied by sites in input datasets. Requires --gtf (default: False)")
+    p.add_argument("--goa-obo-mode",
+                   dest="goa_obo_mode",
+                   type=int,
+                   default=1,
+                   choices=[1, 2, 3],
+                   help = "Define how to obtain GO DAG (directed acyclic graph) obo file. 1: download most recent file from internet,  2: use local file,  3: provide file via --goa-obo-file (default: 1)")
+    p.add_argument("--goa-obo-file",
+                   dest="goa_obo_file",
+                   type=str,
+                   metavar='str',
+                   default = False,
+                   help = "Provide GO DAG obo file (default: False)")
+    p.add_argument("--goa-gene2go-file",
+                   dest="goa_gene2go_file",
+                   type=str,
+                   metavar='str',
+                   default = False,
+                   help = "Provide gene ID to GO IDs mapping table (row format: gene_id<tab>go_id1,go_id2). By default, a local file with ENSEMBL gene IDs is used. NOTE that gene IDs need to be compatible with --gtf (default: False)")
+    p.add_argument("--goa-pval",
+                   dest="goa_pval",
+                   type=float,
+                   metavar='float',
+                   default=0.05,
+                   help="GO enrichment analysis p-value threshold (applied on corrected p-value) (default: 0.05)")
+    p.add_argument("--goa-only-cooc",
+                   dest="goa_only_cooc",
+                   default = False,
+                   action = "store_true",
+                   help = "Only look at genes in GO enrichment analysis which contain motif hits for all input datasets. By default, GO enrichment analysis is performed on the genes covered by sites from all input datasets (default: False)")
+    p.add_argument("--goa-bg-gene-list",
+                   dest="goa_bg_gene_list",
+                   type=str,
+                   metavar='str',
+                   default = False,
+                   help = "Supply file with gene IDs (one ID per row) to use as background gene list for GOA. NOTE that gene IDs need to be compatible with --gtf (default: False)")
+    p.add_argument("--goa-max-child",
+                   dest="goa_max_child",
+                   type=int,
+                   metavar='int',
+                   default=None,
+                   help="Specify maximum number of children for a significant GO term to be reported in HTML table, e.g. --goa-max-child 100. This allows filtering out very broad terms (default: None)")
+    p.add_argument("--goa-min-depth",
+                   dest="goa_min_depth",
+                   type=int,
+                   metavar='int',
+                   default=None,
+                   help="Specify minimum depth number for a significant GO term to be reported in HTML table, e.g. --goa-min-depth 5 (default: None)")
+    p.add_argument("--goa-filter-purified",
+                   dest="goa_filter_purified",
+                   default = False,
+                   action = "store_true",
+                   help = "Filter out GOA results labeled as purified (i.e., GO terms with significantly lower concentration) in HTML table (default: False)")
+
     return p
 
 
@@ -342,10 +447,19 @@ def remove_special_chars_from_str(check_str,
     batch_call += " --genome %s" % (args.in_genome)
     batch_call += " --ext %s" % (args.ext_up_down)
     batch_call += " --motif-db %i" % (args.motif_db)
-    if args.fimo_nt_freqs:
-        batch_call += " --fimo-nt-freqs %s" % (args.fimo_nt_freqs)
+    if args.fimo_user_ntf_file:
+        batch_call += " --fimo-ntf-file %s" % (args.fimo_user_ntf_file)
+    batch_call += " --fimo-ntf-mode %i" % (args.fimo_ntf_mode)
     batch_call += " --fimo-pval %s" % (str(args.fimo_pval))
+    batch_call += " --cmsearch-bs %s" % (str(args.cmsearch_bs))
+    batch_call += " --cmsearch-mode %i" % (args.cmsearch_mode)
+    if args.greatest_hits:
+        batch_call += " --greatest-hits"
     batch_call += " --bed-score-col %i" % (args.bed_score_col)
+    if args.bed_sc_thr is not None:
+        batch_call += " --bed-sc-thr %s" % (str(args.bed_sc_thr))
+    if args.bed_sc_thr_rev_filter:
+        batch_call += " --bed-sc-thr-rev"
     if args.unstranded:
         batch_call += " --unstranded"
     if args.unstranded_ct:
@@ -354,24 +468,26 @@ def remove_special_chars_from_str(check_str,
         batch_call += " --meme-no-check"
     if args.meme_no_pgc:
         batch_call += " --meme-no-pgc"
-    if args.report:
-        batch_call += " --report"
+
     batch_call += " --kmer-size %i" % (args.kmer_size)
     if args.in_gtf:
         batch_call += " --gtf %s" % (args.in_gtf)
-        if not args.report:
-            batch_call += " --report"
         if args.tr_list:
             batch_call += " --tr-list %s" % (args.tr_list)
         if args.tr_types_list:
             tr_types = (" ").join(args.tr_types_list)
             batch_call += " --tr-types %s" % (tr_types)
 
     batch_call += " --gtf-feat-min-overlap %s" % (str(args.gtf_feat_min_overlap))
+    batch_call += " --gtf-eib-min-overlap %s" % (str(args.gtf_eib_min_overlap))
+    batch_call += " --gtf-intron-border-len %i" % (args.gtf_intron_border_len)
+
     if args.report_header:
         batch_call += " --report-header"
     if args.no_occ_heatmap:
         batch_call += " --no-occ-heatmap"
+    if args.disable_heatmap_cluster_olo:
+        batch_call += " --disable-heatmap-cluster-olo"
 
     batch_call += " --fisher-mode %i" % (args.fisher_mode)
     batch_call += " --wrs-mode %i" % (args.wrs_mode)
@@ -381,9 +497,9 @@ def remove_special_chars_from_str(check_str,
         assert is_valid_regex(args.regex), "given --regex \"%s\" is not a valid regular expression. Please provide valid expression" % (args.regex)
         # Remove , ; from given regex, to avoid motif_id format conflicts.
         regex = remove_special_chars_from_str(args.regex,
-                                              reg_ex="[ ,;]")
+                                              reg_ex="[ ;]")
 
-        assert regex, "empty string after removing special chars ( [ ,;] ) from --regex. Please provide a valid regex with DNA letters"
+        assert regex, "empty string after removing special chars ( [ ;] ) from --regex. Please provide a valid regex with DNA letters"
 
         batch_call += " --regex %s" % (regex)
         batch_call += " --regex-search-mode %i" % (args.regex_search_mode)
@@ -399,6 +515,26 @@ def remove_special_chars_from_str(check_str,
     batch_call += " --data-list %s" % (data_ids)
     batch_call += " --bed %s" % (paths)
 
+    # GO enrichment analysis.
+    if args.run_goa:
+        batch_call += " --goa"
+        batch_call += " --goa-obo-mode %i" % (args.goa_obo_mode)
+        if args.goa_obo_file:
+            batch_call += " --goa-obo-file %s" % (args.goa_obo_file)
+        if args.goa_gene2go_file:
+            batch_call += " --goa-gene2go-file %s" % (args.goa_gene2go_file)
+        batch_call += " --goa-pval %s" % (str(args.goa_pval))
+        if args.goa_only_cooc:
+            batch_call += " --goa-only-cooc"
+        if args.goa_bg_gene_list:
+            batch_call += " --goa-bg-gene-list %s" % (args.goa_bg_gene_list)
+        if args.goa_max_child is not None:
+            batch_call += " --goa-max-child %i" % (args.goa_max_child)
+        if args.goa_min_depth is not None:
+            batch_call += " --goa-min-depth %i" % (args.goa_min_depth)
+        if args.goa_filter_purified:
+            batch_call += " --goa-filter-purified"
+
     """
     Execute RBPBench batch call.
     """

diff --git a/bin/bed_print_first_n_pos.py b/bin/bed_print_first_n_pos.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+from rbpbench import benchlib
+
+
+###############################################################################
+
+def setup_argument_parser():
+    """Build the command line parser of bed_print_first_n_pos.py."""
+    description_text = """
+    Print first n (set via --ext) positions of each region from --in BED file. 
+    For minus strand regions, the last n positions are printed.
+
+    """
+    parser = argparse.ArgumentParser(
+        prog="bed_print_first_n_pos.py",
+        description=description_text,
+        formatter_class=argparse.MetavarTypeHelpFormatter,
+        add_help=False,
+    )
+    # Help is registered manually because add_help=False disables the default.
+    parser.add_argument("-h", "--help", action="help", help="Print help message")
+
+    # Both remaining options are mandatory.
+    parser.add_argument(
+        "--in",
+        dest="in_bed",
+        type=str,
+        metavar="str",
+        required=True,
+        help="Input BED file with regions to extract first n positions from",
+    )
+    parser.add_argument(
+        "--ext", dest="ext", type=int, metavar="int", required=True,
+        help="Print first --ext positions of --in BED regions",
+    )
+    return parser
+
+
+################################################################################
+
+if __name__ == '__main__':
+
+    parser = setup_argument_parser()
+    args = parser.parse_args()
+
+    assert os.path.exists(args.in_bed), "--in BED file \"%s\" not found" % (args.in_bed)
+    assert benchlib.boundary_check(args.ext, 1, 1000000), "set --ext expected to be >= 1 and <= 1000000"
+
+    # Expects at least 6 tab-separated columns per row (columns 1-6 are read).
+    with open(args.in_bed) as f:
+        for line in f:
+            row = line.strip()
+            # Skip blank rows (e.g. trailing empty line) instead of failing on cols[1].
+            if not row:
+                continue
+            cols = row.split("\t")
+            chr_id, reg_id, sc, strand = cols[0], cols[3], cols[4], cols[5]
+            reg_s = int(cols[1])
+            reg_e = int(cols[2])
+
+            if strand == "-":
+                # Minus strand: take the last --ext positions of the region,
+                # clamping at 0 so a negative start is never printed.
+                new_reg_s = max(0, reg_e - args.ext)
+                new_reg_e = reg_e
+            else:
+                new_reg_s = reg_s
+                new_reg_e = reg_s + args.ext
+
+            print("%s\t%i\t%i\t%s\t%s\t%s" % (chr_id, new_reg_s, new_reg_e, reg_id, sc, strand))
+
+
+################################################################################