Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge to v1.0 #8

Merged
merged 67 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
8ff5890
mo
michauhl Jun 12, 2024
91bfc63
mo
michauhl Jun 12, 2024
bafadb5
mo
michauhl Jun 13, 2024
2f1f5cf
mo
michauhl Jun 13, 2024
3f512ee
mo
michauhl Jun 16, 2024
0a210fe
mo
michauhl Jun 19, 2024
8b6f258
mo
michauhl Jun 19, 2024
5b629ea
mo
michauhl Jun 20, 2024
49dde2a
mo
michauhl Jun 28, 2024
b025862
mo
michauhl Jun 30, 2024
c0bd515
mo
michauhl Jul 4, 2024
04ed8c3
mo
michauhl Jul 11, 2024
dc847fb
mo
michauhl Jul 11, 2024
a1acbed
mo
michauhl Jul 15, 2024
db22ee4
mo
michauhl Jul 17, 2024
b5860c9
mo
michauhl Jul 18, 2024
49be33e
mo
michauhl Jul 18, 2024
7da4532
mo
michauhl Jul 19, 2024
b44d355
mo
michauhl Jul 19, 2024
a138629
mo
michauhl Jul 19, 2024
ad4a0d9
mo
michauhl Jul 19, 2024
48a5a44
mo
michauhl Jul 19, 2024
173e369
mo
michauhl Jul 19, 2024
df176ab
mo
michauhl Jul 23, 2024
c989ffd
mo
michauhl Jul 23, 2024
62039b9
mo
michauhl Aug 5, 2024
5faeaee
mo
michauhl Aug 5, 2024
39eec0b
mo
michauhl Aug 6, 2024
9941fe7
mo
michauhl Aug 8, 2024
8c2c127
mo
michauhl Aug 12, 2024
96eb8de
mo
michauhl Aug 12, 2024
b2aaa88
mo
michauhl Aug 13, 2024
c11ca95
mo
michauhl Aug 14, 2024
e5dfe63
mo
michauhl Aug 14, 2024
d1025d3
mo
michauhl Aug 15, 2024
cbdd3a7
mo
michauhl Aug 16, 2024
13e6578
mo
michauhl Aug 19, 2024
b095ea6
mo
michauhl Aug 19, 2024
4094922
mo
michauhl Aug 19, 2024
ad12d09
mo
michauhl Aug 19, 2024
b98a233
mo
michauhl Aug 20, 2024
d9a5c68
mo
michauhl Aug 20, 2024
788d236
mo
michauhl Aug 20, 2024
31ae70e
mo
michauhl Aug 21, 2024
abc476d
mo
michauhl Aug 21, 2024
d07f2ed
mo
michauhl Aug 22, 2024
2508734
mo
michauhl Aug 22, 2024
4dd36a2
mo
michauhl Aug 23, 2024
ae5bef1
mo
michauhl Aug 27, 2024
b930b84
mo
michauhl Aug 28, 2024
3c4e162
mo
michauhl Sep 1, 2024
ea4e447
mo
michauhl Sep 1, 2024
e20b349
mo
michauhl Sep 2, 2024
05fdb34
mo
michauhl Sep 2, 2024
810f225
mo
michauhl Sep 3, 2024
09d5a60
mo
michauhl Sep 3, 2024
852ef82
mo
michauhl Sep 3, 2024
919dbaa
mo
michauhl Sep 4, 2024
38f2e18
mo
michauhl Sep 4, 2024
831e4aa
mo
michauhl Sep 5, 2024
f12e471
mo
michauhl Sep 5, 2024
55cd3f3
mo
michauhl Sep 5, 2024
2f2863a
mo
michauhl Sep 8, 2024
11e4f7a
mo
michauhl Sep 8, 2024
eabc127
mo
michauhl Sep 8, 2024
e23a3a9
mo
michauhl Sep 9, 2024
37f389e
mo
michauhl Sep 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
recursive-exclude test *
exclude README.md
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ More information will be added soon.

## Documentation

This documentation provides further details on RBPBench (version 0.9).
This documentation provides further details on RBPBench (version 0.9.1).

### Program modes

Expand Down
174 changes: 155 additions & 19 deletions bin/batch_table_wrapper_rbpbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,25 +68,59 @@ def setup_argument_parser():
type=int,
default=1,
choices=[1, 2, 3],
help="Built-in motif database to use. 1: human RBP motifs full (259 RBPs, 605 motifs, \"catrapid.omics.v2.1.human.6plus\"), 2: human RBP motifs full (low frequencies not rounded, \"catrapid.omics.v2.1.human.6plus.noround\"), 3: human RBP motifs eCLIP (107 RBPs, 316 motifs, \"s6_refined_ic010.human.rounded.encode_rbps\") (default: 1)")
p.add_argument("--fimo-nt-freqs",
dest="fimo_nt_freqs",
help="Built-in motif database to use (currently there are 3 human databases: 1,2,3. See details in rbpbench (default: 1)")
p.add_argument("--fimo-ntf-file",
dest="fimo_user_ntf_file",
type=str,
metavar='str',
default=False,
help="Provide FIMO nucleotide frequencies (FIMO option: --bifile) file (default: use internal frequencies file optimized for human transcripts)")
default = False,
help = "Provide FIMO nucleotide frequencies (FIMO option: --bfile) file (default: use internal frequencies file, define which with --fimo-ntf-mode)")
p.add_argument("--fimo-ntf-mode",
dest="fimo_ntf_mode",
type=int,
default=1,
choices=[1, 2, 3],
help="Set which internal nucleotide frequencies to use for FIMO search. 1: use frequencies from human ENSEMBL transcripts (excluding introns, A most prominent) 2: use frequencies from human ENSEMBL transcripts (including introns, resulting in lower G+C and T most prominent) 3: use uniform frequencies (same for every nucleotide) (default: 1)")
p.add_argument("--fimo-pval",
dest="fimo_pval",
type=float,
metavar='float',
default=0.001,
help="FIMO p-value threshold (FIMO option: --thresh) (default: 0.001)")
p.add_argument("--cmsearch-bs",
dest="cmsearch_bs",
type=float,
metavar='float',
default=1.0,
help="CMSEARCH bit score threshold (CMSEARCH options: -T --incT). The higher the more strict (default: 1.0)")
p.add_argument("--cmsearch-mode",
dest="cmsearch_mode",
type=int,
default=1,
choices=[1, 2],
help="Set CMSEARCH mode to control strictness of filtering. 1: default setting (CMSEARCH option: --default). 2: max setting (CMSEARCH option: --max), i.e., turn all heuristic filters off, slower and more sensitive / more hits) (default: 1)")
p.add_argument("--greatest-hits",
dest="greatest_hits",
default = False,
action = "store_true",
help = "Keep only best FIMO/CMSEARCH motif hits (i.e., hit with lowest p-value / highest bit score for each motif sequence/site combination). By default, report all hits (default: False)")
p.add_argument("--bed-score-col",
dest="bed_score_col",
type=int,
metavar='int',
default=5,
help="--in BED score column used for p-value calculations. BED score can be e.g. log2 fold change or -log10 p-value of the region (default: 5)")
p.add_argument("--bed-sc-thr",
dest="bed_sc_thr",
type = float,
metavar='float',
default = None,
help = "Minimum site score (by default: --in BED column 5, or set via --bed-score-col) for filtering (assuming higher score == better site) (default: None)")
p.add_argument("--bed-sc-thr-rev",
dest="bed_sc_thr_rev_filter",
default = False,
action = "store_true",
help = "Reverse --bed-sc-thr filtering (i.e. the lower the better, e.g. for p-values) (default: False)")
p.add_argument("--unstranded",
dest="unstranded",
default=False,
Expand Down Expand Up @@ -121,11 +155,6 @@ def setup_argument_parser():
choices=[1, 2, 3],
help="Defines Fisher exact test alternative hypothesis for testing co-occurrences of RBP motifs. 1: greater, 2: two-sided, 3: less (default: 1)")
# Report.
p.add_argument("--report",
dest="report",
default = False,
action = "store_true",
help = "Generate an .html report containing various plots to compare input datasets (default: False)")
p.add_argument("--kmer-size",
dest="kmer_size",
type=int,
Expand All @@ -137,6 +166,11 @@ def setup_argument_parser():
default = False,
action = "store_true",
help = "Do not produce gene region occupancy heatmap plot in HTML report (default: False)")
p.add_argument("--disable-heatmap-cluster-olo",
dest="disable_heatmap_cluster_olo",
default = False,
action = "store_true",
help="Disable optimal leave ordering (OLO) for clustering gene region occupancy heatmap. By default, OLO is enabled")
p.add_argument("--report-header",
dest="report_header",
default = False,
Expand Down Expand Up @@ -184,7 +218,78 @@ def setup_argument_parser():
type=float,
metavar='float',
default=0.1,
help="Minimum amount of overlap required for a region to be assigned to a GTF feature (if less or no overlap, region will be assigned to \"intergenic\") (default: 0.1)")
help="Minimum amount of overlap required for a region to be assigned to a GTF feature (if less or no overlap, region will be assigned to \"intergenic\"). If there is overlap with several features, assign the one with highest overlap (default: 0.1)")
p.add_argument("--gtf-eib-min-overlap",
dest="gtf_eib_min_overlap",
type=float,
metavar='float',
default=0.9,
help="Minimum amount input region has to overlap with exon (e), intron (i), i + ei borders to be counted as overlapping with these (note that the amount is reciprocal, i.e., one of the overlapping parts meeting the minimum amount is enough) (default: 0.9)")
p.add_argument("--gtf-intron-border-len",
dest="gtf_intron_border_len",
type=int,
metavar='int',
default=250,
help="Set intron border region length (up- + downstream ends) for exon intron overlap statistics (default: 250)")
# GO enrichment analysis for batch mode.
p.add_argument("--goa",
dest="run_goa",
default = False,
action = "store_true",
help = "Run gene ontology (GO) enrichment analysis on genes occupied by sites in input datasets. Requires --gtf (default: False)")
p.add_argument("--goa-obo-mode",
dest="goa_obo_mode",
type=int,
default=1,
choices=[1, 2, 3],
help = "Define how to obtain GO DAG (directed acyclic graph) obo file. 1: download most recent file from internet, 2: use local file, 3: provide file via --goa-obo-file (default: 1)")
p.add_argument("--goa-obo-file",
dest="goa_obo_file",
type=str,
metavar='str',
default = False,
help = "Provide GO DAG obo file (default: False)")
p.add_argument("--goa-gene2go-file",
dest="goa_gene2go_file",
type=str,
metavar='str',
default = False,
help = "Provide gene ID to GO IDs mapping table (row format: gene_id<tab>go_id1,go_id2). By default, a local file with ENSEMBL gene IDs is used. NOTE that gene IDs need to be compatible with --gtf (default: False)")
p.add_argument("--goa-pval",
dest="goa_pval",
type=float,
metavar='float',
default=0.05,
help="GO enrichment analysis p-value threshold (applied on corrected p-value) (default: 0.05)")
p.add_argument("--goa-only-cooc",
dest="goa_only_cooc",
default = False,
action = "store_true",
help = "Only look at genes in GO enrichment analysis which contain motif hits for all input datasets. By default, GO enrichment analysis is performed on the genes covered by sites from all input datasets (default: False)")
p.add_argument("--goa-bg-gene-list",
dest="goa_bg_gene_list",
type=str,
metavar='str',
default = False,
help = "Supply file with gene IDs (one ID per row) to use as background gene list for GOA. NOTE that gene IDs need to be compatible with --gtf (default: False)")
p.add_argument("--goa-max-child",
dest="goa_max_child",
type=int,
metavar='int',
default=None,
help="Specify maximum number of children for a significant GO term to be reported in HTML table, e.g. --goa-max-child 100. This allows filtering out very broad terms (default: None)")
p.add_argument("--goa-min-depth",
dest="goa_min_depth",
type=int,
metavar='int',
default=None,
help="Specify minimum depth number for a significant GO term to be reported in HTML table, e.g. --goa-min-depth 5 (default: None)")
p.add_argument("--goa-filter-purified",
dest="goa_filter_purified",
default = False,
action = "store_true",
help = "Filter out GOA results labeled as purified (i.e., GO terms with significantly lower concentration) in HTML table (default: False)")

return p


Expand Down Expand Up @@ -342,10 +447,19 @@ def remove_special_chars_from_str(check_str,
batch_call += " --genome %s" % (args.in_genome)
batch_call += " --ext %s" % (args.ext_up_down)
batch_call += " --motif-db %i" % (args.motif_db)
if args.fimo_nt_freqs:
batch_call += " --fimo-nt-freqs %s" % (args.fimo_nt_freqs)
if args.fimo_user_ntf_file:
batch_call += " --fimo-ntf-file %s" % (args.fimo_user_ntf_file)
batch_call += " --fimo-ntf-mode %i" % (args.fimo_ntf_mode)
batch_call += " --fimo-pval %s" % (str(args.fimo_pval))
batch_call += " --cmsearch-bs %s" % (str(args.cmsearch_bs))
batch_call += " --cmsearch-mode %i" % (args.cmsearch_mode)
if args.greatest_hits:
batch_call += " --greatest-hits"
batch_call += " --bed-score-col %i" % (args.bed_score_col)
if args.bed_sc_thr is not None:
batch_call += " --bed-sc-thr %s" % (str(args.bed_sc_thr))
if args.bed_sc_thr_rev_filter:
batch_call += " --bed-sc-thr-rev"
if args.unstranded:
batch_call += " --unstranded"
if args.unstranded_ct:
Expand All @@ -354,24 +468,26 @@ def remove_special_chars_from_str(check_str,
batch_call += " --meme-no-check"
if args.meme_no_pgc:
batch_call += " --meme-no-pgc"
if args.report:
batch_call += " --report"

batch_call += " --kmer-size %i" % (args.kmer_size)
if args.in_gtf:
batch_call += " --gtf %s" % (args.in_gtf)
if not args.report:
batch_call += " --report"
if args.tr_list:
batch_call += " --tr-list %s" % (args.tr_list)
if args.tr_types_list:
tr_types = (" ").join(args.tr_types_list)
batch_call += " --tr-types %s" % (tr_types)

batch_call += " --gtf-feat-min-overlap %s" % (str(args.gtf_feat_min_overlap))
batch_call += " --gtf-eib-min-overlap %s" % (str(args.gtf_eib_min_overlap))
batch_call += " --gtf-intron-border-len %i" % (args.gtf_intron_border_len)

if args.report_header:
batch_call += " --report-header"
if args.no_occ_heatmap:
batch_call += " --no-occ-heatmap"
if args.disable_heatmap_cluster_olo:
batch_call += " --disable-heatmap-cluster-olo"

batch_call += " --fisher-mode %i" % (args.fisher_mode)
batch_call += " --wrs-mode %i" % (args.wrs_mode)
Expand All @@ -381,9 +497,9 @@ def remove_special_chars_from_str(check_str,
assert is_valid_regex(args.regex), "given --regex \"%s\" is not a valid regular expression. Please provide valid expression" % (args.regex)
# Remove , ; from given regex, to avoid motif_id format conflicts.
regex = remove_special_chars_from_str(args.regex,
reg_ex="[ ,;]")
reg_ex="[ ;]")

assert regex, "empty string after removing special chars ( [ ,;] ) from --regex. Please provide a valid regex with DNA letters"
assert regex, "empty string after removing special chars ( [ ;] ) from --regex. Please provide a valid regex with DNA letters"

batch_call += " --regex %s" % (regex)
batch_call += " --regex-search-mode %i" % (args.regex_search_mode)
Expand All @@ -399,6 +515,26 @@ def remove_special_chars_from_str(check_str,
batch_call += " --data-list %s" % (data_ids)
batch_call += " --bed %s" % (paths)

# GO enrichment analysis.
if args.run_goa:
batch_call += " --goa"
batch_call += " --goa-obo-mode %i" % (args.goa_obo_mode)
if args.goa_obo_file:
batch_call += " --goa-obo-file %s" % (args.goa_obo_file)
if args.goa_gene2go_file:
batch_call += " --goa-gene2go-file %s" % (args.goa_gene2go_file)
batch_call += " --goa-pval %s" % (str(args.goa_pval))
if args.goa_only_cooc:
batch_call += " --goa-only-cooc"
if args.goa_bg_gene_list:
batch_call += " --goa-bg-gene-list %s" % (args.goa_bg_gene_list)
if args.goa_max_child is not None:
batch_call += " --goa-max-child %i" % (args.goa_max_child)
if args.goa_min_depth is not None:
batch_call += " --goa-min-depth %i" % (args.goa_min_depth)
if args.goa_filter_purified:
batch_call += " --goa-filter-purified"

"""
Execute RBPBench batch call.
"""
Expand Down
76 changes: 76 additions & 0 deletions bin/bed_print_first_n_pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python3

import argparse
import os
from rbpbench import benchlib


###############################################################################

def setup_argument_parser():
    """Build the command line parser of bed_print_first_n_pos.py.

    Returns:
        argparse.ArgumentParser offering -h/--help and the two mandatory
        options --in (stored as ``in_bed``) and --ext (stored as ``ext``).
    """
    help_description = """
Print first n (set via --ext) positions of each region from --in BED file.
For minus strand regions, the last n positions are printed.

"""
    # add_help=False because -h/--help is registered by hand right below.
    parser = argparse.ArgumentParser(
        prog="bed_print_first_n_pos.py",
        description=help_description,
        formatter_class=argparse.MetavarTypeHelpFormatter,
        add_help=False,
    )
    parser.add_argument(
        "-h", "--help",
        action="help",
        help="Print help message",
    )

    # Mandatory options, registered in the order they show up in the help text.
    mandatory_options = (
        ("--in", {
            "dest": "in_bed",
            "type": str,
            "metavar": "str",
            "help": "Input BED file with regions to extract first n positions from",
        }),
        ("--ext", {
            "dest": "ext",
            "type": int,
            "metavar": "int",
            "help": "Print first --ext positions of --in BED regions",
        }),
    )
    for flag, settings in mandatory_options:
        parser.add_argument(flag, required=True, **settings)

    return parser


################################################################################

if __name__ == '__main__':
    # Script entry point: read the --in BED file and print, for every region,
    # a 6-column BED row covering its first --ext positions (strand-aware).

    parser = setup_argument_parser()
    args = parser.parse_args()

    assert os.path.exists(args.in_bed), "--in BED file \"%s\" not found" % (args.in_bed)
    assert benchlib.boundary_check(args.ext, 1, 1000000), "set --ext expected to be >= 1 and <= 1000000"

    # The with-statement closes the file on exit, so no explicit close needed.
    with open(args.in_bed) as f:
        for line_nr, line in enumerate(f, start=1):
            row = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no region.
            if not row:
                continue

            cols = row.split("\t")
            # Fail with a readable message instead of a bare IndexError.
            assert len(cols) >= 6, "--in BED file \"%s\" line %i has %i columns (expected >= 6)" % (args.in_bed, line_nr, len(cols))

            chr_id = cols[0]
            reg_s = int(cols[1])
            reg_e = int(cols[2])
            reg_id = cols[3]
            sc = cols[4]
            strand = cols[5]

            # Default (non-minus strand): window starts at the region start.
            # NOTE(review): the window is not clipped to the region end, so
            # it reaches past reg_e whenever --ext exceeds the region length.
            new_reg_s = reg_s
            new_reg_e = reg_s + args.ext

            if strand == "-":
                # Minus strand: the window is taken from the region end
                # instead; keep the start coordinate from going negative.
                new_reg_s = max(reg_e - args.ext, 0)
                new_reg_e = reg_e

            print("%s\t%i\t%i\t%s\t%s\t%s" % (chr_id, new_reg_s, new_reg_e, reg_id, sc, strand))


################################################################################
Loading