Skip to content

Commit

Permalink
Merge pull request #3 from michauhl/v07
Browse files Browse the repository at this point in the history
Merge v07
  • Loading branch information
michauhl authored Dec 2, 2023
2 parents e095c99 + df37e97 commit 992c41c
Show file tree
Hide file tree
Showing 4 changed files with 247 additions and 5 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ co-occurrence analysis, to benchmarking CLIP-seq peak caller methods as well as
As no ground truth (i.e., set of true transcriptome-wide binding sites of an RBP) exists, one obvious way to quantify the performance of a peak caller is to look for the enrichment of known RBP binding motifs in the binding site (peak region) data.
Since there exists no automated solution for this task yet, we implemented RBPBench:
RBPBench is multi-function tool to evaluate CLIP-seq and other genomic region data
using a comprehensive collection of known high-confidence RBP binding motifs (as of RBPBench v0.6: 259 RBPs comprising 605 motifs).
using a comprehensive collection of known high-confidence RBP binding motifs (as of RBPBench v0.7: 259 RBPs comprising 605 motifs).
RBPBench can be used for benchmarking CLIP-seq peak callers, but it works just as well for other RBP-related research questions:
one can e.g. look for RBP binding motifs in any set of genomic regions (selecting any number of RBPs of interest, including user-supplied motifs),
and check for RBP motif co-occurrences (to see which RBPs bind similar regions).
Expand Down Expand Up @@ -687,7 +687,7 @@ at position 0.

## Documentation

This documentation provides further details on RBPBench (version 0.6).
This documentation provides further details on RBPBench (version 0.7).

### Program modes

Expand Down
242 changes: 242 additions & 0 deletions bin/batch_table_wrapper_rbpbench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
#!/usr/bin/env python3

import argparse
import os
import re
import subprocess


###############################################################################

def setup_argument_parser():
"""Setup argparse parser."""
help_description = """
Python wrapper for RBPBench Galaxy wrapper to work with collections of
input BED files (i.e. to process them with rbpbench batch).
"""
# Define argument parser.
p = argparse.ArgumentParser(add_help=False,
prog="batch_table_wrapper_rbpbench.py",
description=help_description,
formatter_class=argparse.MetavarTypeHelpFormatter)

# Required arguments.
p.add_argument("-h", "--help",
action="help",
help="Print help message")
p.add_argument("--table",
dest="in_table",
type=str,
metavar='str',
required=True,
help="Input table file with data ID, method ID, RBP ID and file name (Galaxy element identifier in dataset collection) for each to be processed dataset by rbpbench batch")
p.add_argument("--paths",
dest="in_paths",
type=str,
metavar='str',
nargs='+',
required=True,
help="List of Galaxy BED file paths (--files path1 path2 .. )")
p.add_argument("--ids",
dest="in_ids",
type=str,
metavar='str',
nargs='+',
required=True,
help="List of Galaxy element identifiers, equal to the BED dataset names in the dataset collection (--ids id1 id2 .. )")
p.add_argument("--genome",
dest="in_genome",
type=str,
metavar='str',
required=True,
help="Genomic sequences file (currently supported formats: FASTA)")
p.add_argument("--out",
dest="out_folder",
type=str,
metavar='str',
required=True,
help="Batch results output folder")
# Optional batch arguments.
p.add_argument("--ext",
dest="ext_up_down",
type=str,
metavar='str',
default="0",
help="Up- and downstream extension of --in sites in nucleotides (nt). Set e.g. --ext 30 for 30 nt on both sides, or --ext 20,10 for different up- and downstream extension (default: 0)")
p.add_argument("--motif-db",
dest="motif_db",
type=int,
default=1,
choices=[1, 2, 3],
help="Motif database to use. 1: human RBP motifs full (259 RBPs, 605 motifs, human_v0.1), 2: human RBP motifs full (low frequencies not rounded, human_v0.1_no_round), 3: human RBP motifs eCLIP (107 RBPs, 316 motifs, human_eclip_v0.1) (default: 1)")
p.add_argument("--fimo-nt-freqs",
dest="fimo_nt_freqs",
type=str,
metavar='str',
default=False,
help="Provide FIMO nucleotide frequencies (FIMO option: --bifile) file (default: use internal frequencies file optimized for human transcripts)")
p.add_argument("--fimo-pval",
dest="fimo_pval",
type=float,
metavar='float',
default=0.001,
help="FIMO p-value threshold (FIMO option: --thresh) (default: 0.001)")
p.add_argument("--bed-score-col",
dest="bed_score_col",
type=int,
metavar='int',
default=5,
help="--in BED score column used for p-value calculations. BED score can be e.g. log2 fold change or -log10 p-value of the region (default: 5)")
p.add_argument("--unstranded",
dest="unstranded",
default=False,
action="store_true",
help="Set if --in BED regions are NOT strand-specific, i.e., to look for motifs on both strands of the provided regions. Note that the two strands of a region will still be counted as one region (change with --unstranded-ct) (default: False)")
p.add_argument("--unstranded-ct",
dest="unstranded_ct",
default=False,
action="store_true",
help="Count each --in region twice for RBP hit statistics when --unstranded is enabled. By default, two strands of one region are counted as one region for RBP hit statistics")
return p


###############################################################################

if __name__ == '__main__':

parser = setup_argument_parser()
args = parser.parse_args()

assert os.path.exists(args.in_table), "--table file \"%s\" not found" % (args.in_file)
assert os.path.exists(args.in_genome), "--genome file \"%s\" not found" % (args.in_genome)

c_paths = len(args.in_paths)
c_ids = len(args.in_ids)
assert c_paths == c_ids, "given # paths (--paths) != # ids (--ids) (%i != %i). Please provide one ID for each path" % (c_paths, c_ids)

"""
Check given paths and IDs.
"""

# Paths.
paths_dic = {}
paths_list = []
for path in args.in_paths:
assert os.path.exists(path), "--paths %s file not found" % (path)
if path not in paths_dic:
paths_dic[path] = 1
else:
assert False, "--paths %s given > 1. Please provide unique paths" % (path)
paths_list.append(path)

# IDs
ids_dic = {}
ids_list = []
for id in args.in_ids:
if id not in ids_dic:
ids_dic[id] = 1
else:
assert False, "--ids \"%s\" given > 1. Please provide unique element identifiers (dataset names) inside the dataset collection, in order to unambiguously assign element ID to file path" % (id)
ids_list.append(id)

id2path_dic = {}
for idx, id in enumerate(ids_list):
path = paths_list[idx]
id2path_dic[id] = path

"""
Read in table.
Column format:
rbp_id method_id data_id dataset_name
"""

comb_ids_dic = {}
id_collect_dic = {}
id_collect_dic["rbp_id"] = []
id_collect_dic["method_id"] = []
id_collect_dic["data_id"] = []
id_collect_dic["set_name"] = []
id_collect_dic["path"] = [] # Galaxy file path.

print("Read in --table ... ")

with open(args.in_table) as f:
for line in f:

if re.search("^#", line):
continue

cols = line.strip().split("\t")

assert len(cols) == 4, "line in --table with # cols != 4 (%i) encountered:%s" % (len(cols), line)

rbp_id = cols[0]
method_id = cols[1]
data_id = cols[2]
set_name = cols[3]

if rbp_id == "rbp_id":
continue

comb_id = "%s,%s,%s,%s" % (rbp_id, method_id, data_id, set_name)

if comb_id not in comb_ids_dic:
comb_ids_dic[comb_id] = 1
else:
assert False, "data combination (\"%s\") appears > 1 in --table file. Please provide unique combinations for rbpbench batch calculation" % (comb_id)

assert set_name in ids_dic, "given dataset name \"%s\" from --table not part of given --ids. Please provide dataset names present in dataset collection" % (set_name)

id_collect_dic["rbp_id"].append(rbp_id)
id_collect_dic["method_id"].append(method_id)
id_collect_dic["data_id"].append(data_id)
id_collect_dic["set_name"].append(set_name)
id_collect_dic["path"].append(id2path_dic[set_name])

f.closed

assert id_collect_dic["rbp_id"], "nothing read in from --table. Please provide non-empty table in correct format (columns: rbp_id method_id data_id dataset_name)"

"""
Construct RBPBench batch call.
"""

batch_call = "rbpbench batch"
batch_call += " --out %s" % (args.out_folder)
batch_call += " --genome %s" % (args.in_genome)
batch_call += " --ext %s" % (args.ext_up_down)
batch_call += " --motif-db %i" % (args.motif_db)
if args.fimo_nt_freqs:
batch_call += " --fimo-nt-freqs %s" % (args.fimo_nt_freqs)
batch_call += " --fimo-pval %s" % (str(args.fimo_pval))
batch_call += " --bed-score-col %i" % (args.bed_score_col)
if args.unstranded:
batch_call += " --unstranded"
if args.unstranded_ct:
batch_call += " --unstranded-ct"

rbp_ids = (" ").join(id_collect_dic["rbp_id"])
method_ids = (" ").join(id_collect_dic["method_id"])
data_ids = (" ").join(id_collect_dic["data_id"])
paths = (" ").join(id_collect_dic["path"])

batch_call += " --rbp-list %s" % (rbp_ids)
batch_call += " --method-list %s" % (method_ids)
batch_call += " --data-list %s" % (data_ids)
batch_call += " --bed %s" % (paths)

"""
Execute RBPBench batch call.
"""

print("")
print("EXECUTING CALL:\n%s" % (batch_call))
output = subprocess.getoutput(batch_call)
print("")
print("RUN OUTPUT:\n%s" % (output))
print("")
print("DONE.")
2 changes: 1 addition & 1 deletion bin/rbpbench
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ from scipy.stats import mannwhitneyu
# import uuid


__version__ = "0.6"
__version__ = "0.7"


################################################################################
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@

setup(
name='rbpbench',
version='0.6',
version='0.7',
description='Evaluate CLIP-seq and other genomic region data using a comprehensive collection of known RBP binding motifs',
long_description=open('README.md').read(),
author='Michael Uhl',
author_email='uhlm@informatik.uni-freiburg.de',
url='https://github.com/michauhl/RBPBench',
license='MIT',
scripts=['bin/rbpbench'],
scripts=['bin/rbpbench', 'bin/batch_table_wrapper_rbpbench.py'],
packages=['rbpbench'],
package_data={'rbpbench': ['content/*', 'content/catrapid.omics.v2.1.human.6plus_motif_plots/*']},
zip_safe=False,
Expand Down

0 comments on commit 992c41c

Please sign in to comment.