Merge pull request #2 from michauhl/v05

V05
michauhl · Nov 30, 2023 · cd673df · cd673df
2 parents 1d6f758 + 4633294
commit cd673df
Show file tree

Hide file tree

Showing 6 changed files with 252 additions and 553 deletions.
diff --git a/README.md b/README.md
@@ -55,7 +55,7 @@ co-occurrence analysis, to benchmarking CLIP-seq peak caller methods as well as
 As no ground truth (i.e., set of true transcriptome-wide binding sites of an RBP) exists, one obvious way to quantify the performance of a peak caller is to look for the enrichment of known RBP binding motifs in the binding site (peak region) data. 
 Since there exists no automated solution for this task yet, we implemented RBPBench:
 RBPBench is a multi-function tool to evaluate CLIP-seq and other genomic region data 
-using a comprehensive collection of known high-confidence RBP binding motifs (as of RBPBench v0.4: 259 RBPs comprising 605 motifs).
+using a comprehensive collection of known high-confidence RBP binding motifs (as of RBPBench v0.5: 259 RBPs comprising 605 motifs).
 RBPBench can be used for benchmarking CLIP-seq peak callers, but it works just as well for other RBP-related research questions:
 one can e.g. look for RBP binding motifs in any set of genomic regions (selecting any number of RBPs of interest, including user-supplied motifs),
 and check for RBP motif co-occurrences (to see which RBPs bind similar regions).
@@ -687,7 +687,7 @@ at position 0.
 
 ## Documentation
 
-This documentation provides further details on RBPBench (version 0.4).
+This documentation provides further details on RBPBench (version 0.5).
 
 ### Program modes
 
@@ -904,4 +904,4 @@ are described in the example section [above](#search-with-multiple-rbps).
 
 #### No FIMO hits
 
-This can e.g. happen if you have an old MEME version installed (v4). RBPBench was implemented using v5, and as of v0.4 throws an error if anything below MEME v5 is installed. 
+This can e.g. happen if you have an old MEME version installed (v4). RBPBench was implemented using v5, and from v0.4 onwards it throws an error if any MEME version below v5 is installed. 
diff --git a/bin/rbpbench b/bin/rbpbench
@@ -21,7 +21,7 @@ from scipy.stats import mannwhitneyu
 # import uuid
 
 
-__version__ = "0.4"
+__version__ = "0.5"
 
 
 ################################################################################
@@ -247,7 +247,7 @@ def setup_argument_parser():
                    type=str,
                    metavar='str',
                    default = None,
-                   help = "Set RBP ID to plot motif distances relative to motifs from this RBP (needs to be one of the selected RBP IDs!). Motif plot will be centered on best scoring motif of the RBP for each region")
+                   help = "Set reference RBP ID to plot motif distances relative to motifs from this RBP (needs to be one of the selected RBP IDs!). Motif plot will be centered on best scoring motif of the RBP for each region")
     p_s.add_argument("--motif-distance-plot-range",
                    dest="motif_distance_plot_range",
                    type=int,
@@ -903,7 +903,6 @@ def main_search(args):
                 # assert rbp_id in name2ids_dic, "provided --rbps ID %s not in internal motif database. Please provide RBP name present in database" %(rbp_id)
                 loaded_rbps_dic[rbp_id] = motif_db_str
 
-
     # Motif IDs for search.
     loaded_motif_ids_dic = {}
     for rbp_id in loaded_rbps_dic:
@@ -2149,7 +2148,7 @@ def main_search(args):
 
         """
 
-        if args.in_gtf:
+        if args.in_gtf and c_regions_with_hits:
 
             reg_annot_table_file = args.out_folder + "/" + "region_annotations.tsv"
 
@@ -2182,6 +2181,7 @@ def main_search(args):
                 tr_ids_dic = benchlib.select_mpts_from_gene_infos(gid2gio_dic,
                                         basic_tag=False,  # do not be strict (only_tsl=False too).
                                         ensembl_canonical_tag=False,
+                                        prior_basic_tag=True,  # Prioritize basic tag transcript.
                                         only_tsl=False)
                 assert tr_ids_dic, "most prominent transcript selection from gene infos failed. Please contact developers"
                 print("# of transcript IDs (most prominent transcripts): ", len(tr_ids_dic))
@@ -2267,6 +2267,9 @@ def main_search(args):
                 OUTRAN.write("%s\t%s\t%s\t%s\t%s\t%s\n" %(reg_id, gene_id, gene_name, tr_id, annot, tr_biotype))
             OUTRAN.close()
 
+        elif args.in_gtf and not c_regions_with_hits:
+            print("No need to read in --gtf since no motif hits found .. ")
+
         plots_subfolder = "html_report_plots"
         benchlib_path = os.path.dirname(benchlib.__file__)
 
@@ -2365,7 +2368,7 @@ def main_search(args):
     print("RBP hit stats .tsv:\n%s" %(rbp_stats_out))
     print("Motif hit stats .tsv:\n%s" %(motif_stats_out))
     if reg_annot_table_file is not None:
-        print("Region nnotations .tsv:\n%s" %(reg_annot_table_file))
+        print("Region annotations .tsv:\n%s" %(reg_annot_table_file))
     if args.plot_motifs:
         print("Motif plots and hit statistics .html:\n%s" %(html_motif_plots_out))
     if args.report:
@@ -2822,7 +2825,7 @@ def main_batch(args):
             len_list.append(seq_len)
 
         # Length statistics.
-        reg_len_median = int(statistics.median(len_list))
+        reg_len_median = statistics.median(len_list)
         reg_len_mean = statistics.mean(len_list)
         reg_len_mean = round(reg_len_mean, 2)
         reg_len_min = min(len_list)