Commit

fix: finalize validation fixes
rroutsong committed Sep 9, 2024
1 parent 74d241b commit 5e847c5
Showing 5 changed files with 55 additions and 27 deletions.
3 changes: 1 addition & 2 deletions bin/DiffBind_v2_Deseq2.Rmd
@@ -29,13 +29,12 @@ params:
# inputs
dateandtime <- format(Sys.time(), "%a %b %d %Y - %X")
csvfile <- params$csvfile
outbase <- dirname(csvfile)
contrasts <- params$contrasts
peakcaller <- params$peakcaller
peak_counts <- params$counts
up_file <- params$up_file
down_file <- params$down_file
list_file <- params$counts
list_file <- params$list_file
# knitr options
knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE)
10 changes: 5 additions & 5 deletions bin/DiffBind_v2_EdgeR.Rmd
@@ -145,30 +145,30 @@ DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER)
Variance of differential peaks only

```{r PCA3, fig.height=5,fig.width=5}
dba.plotPCA(DBAnalysisEdgeR, contrast=1, method=DBA_EdgeR)
dba.plotPCA(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER)
```

## MA plot
"Log concentration" means average concentration across all samples.
Each dot is a consensus peak.

```{r MA_D}
dba.plotMA(DBAnalysisEdgeR, method=DBA_EdgeR)
dba.plotMA(DBAnalysisEdgeR, method=DBA_EDGER)
```

## Volcano plot
Each dot is a consensus peak.

```{r Volcano1}
dba.plotVolcano(DBAnalysisEdgeR, method=DBA_EdgeR)
dba.plotVolcano(DBAnalysisEdgeR, method=DBA_EDGER)
```


## Heatmap: Differential
1000 most significant differential peaks (EdgeR or EdgeR normalized)

```{r heatmap4D}
dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method=DBA_EdgeR, correlations=FALSE, margin=20, cexRow=1, cexCol=1)
dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER, correlations=FALSE, margin=20, cexRow=1, cexCol=1)
```

## Top 500 or less differentially bound peaks
@@ -186,7 +186,7 @@ DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F)
report2 <- dba.report(
DBAnalysisEdgeR,
method = DBA_EdgeR,
method = DBA_EDGER,
th=100,
bNormalized=T,
bFlip=FALSE,
10 changes: 7 additions & 3 deletions bin/DiffBind_v2_EdgeR_block.Rmd
@@ -179,14 +179,18 @@ dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER_BLOCK, correlation

```{r EdgeRReport}
UpPeaks <- DBReportEdgeR[which(DBReportEdgeR$Fold > 0)]
try(rtracklayer::export(UpPeaks, up_file),silent=TRUE)
print("up ")
print(dim(UpPeaks))
rtracklayer::export(UpPeaks, up_file)
DownPeaks <- DBReportEdgeR[which(DBReportEdgeR$Fold < 0)]
try(rtracklayer::export(DownPeaks, down_file),silent=TRUE)
print("down ")
print(dim(DownPeaks))
rtracklayer::export(DownPeaks, down_file)
D2i <- length(DBReportEdgeR)
i <- as.integer(min(c(500, as.integer(max(c(D2i, 1))))))
try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE)
DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F)
report2 <- dba.report(
DBAnalysisEdgeR,
5 changes: 0 additions & 5 deletions bin/DiffBind_v2_load.R
@@ -42,11 +42,6 @@ DBdataCounts <- dba.count(samples, summits=summits_arg, bParallel=T)

# save counts
saveRDS(DBdataCounts, counts)
cp_cmd <- paste("cp ", counts, " /data/OpenOmics/dev/datasets/outputs/w_blocking/")
print(cp_cmd)
system(cp_cmd, wait=TRUE)
head <- system(paste("head ", counts), wait=TRUE)
print(head)

# save peaklist
consensus <- dba.peakset(DBdataCounts, bRetrieve=T)
54 changes: 42 additions & 12 deletions workflow/rules/dba.smk
@@ -1,7 +1,6 @@
# Differential binding analysis rules
# ~~~~
import os
import json
from os.path import join
from textwrap import dedent
from scripts.common import allocated, mk_dir_if_not_exist
@@ -390,7 +389,7 @@ rule diffbind_edger_blocking:
#!/bin/bash
Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file="{output.diffbind_block_report}",
params=list(csvfile="{input.csvfile}", peakcaller="{params.this_peaktool}", list_file="{output.full_list}",
contrasts="{params.this_contrast}", counts="{input.peak_counts}"))'
up_file="{output.up_file}", down_file="{output.down_file}", contrasts="{params.this_contrast}", counts="{input.peak_counts}"))'
EOF
chmod +x ${{tmp}}/rscript.sh
@@ -407,12 +406,25 @@ rule UROPA_prep_in:
lambda w: join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2])
params:
fldr = join(uropa_dir, "{PeakTool1}"),
log:
join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}.log")
output:
json = [
join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}."+pktype+".json") \
for pktype in peak_types
join(
uropa_dir,
"{PeakTool1}",
"{name}.{PeakTool2}."+pktype+".json"
) for pktype in peak_types
],
reformat_bed = [
join(
uropa_dir,
"{PeakTool1}",
"{name}_{PeakTool2}_uropa_"+pktype+"_input.bed"
) for pktype in peak_types
],
run:
import json, csv
# Dynamically creates UROPA config file
if not os.path.exists(params.fldr):
os.mkdir(params.fldr, mode=0o775)
@@ -426,13 +438,31 @@

assert len(input) == len(peak_types)

for peak_type in peak_types:
for i, peak_type in enumerate(peak_types):
json_construct = dict()
json_construct['queries'] = []
json_construct['show_attributes'] = ["gene_id", "gene_name", "gene_type"]
json_construct["priority"] = "Yes"
json_construct['gtf'] = gtf
json_construct['bed'] = input[0]

# reformat to standard bed
rdr = csv.DictReader(input[i], delimiter='\t')
csv_map = {
'seqnames': 'chr',
'start': 'start',
'end': 'stop',
'strand': 'strand'
}
newbed = []
for row in rdr:
row = {csv_map[k.lower()]: v for k, v in row.items() if k.lower() in csv_map}
newbed.append(row)
with open(output.reformat_bed[i], 'w') as fo:
wrt = csv.DictWriter(fo, fieldnames=csv_map.values())
# wrt.writeheader() # bed file wo header row
wrt.writerows(newbed)
json_construct['bed'] = output.reformat_bed[i]

if assay == 'cfchip':
if peak_type == 'protTSS':
for _d in (3000, 10000, 100000):
@@ -477,18 +507,18 @@
this_q['distance'] = _d
json_construct['queries'].append(this_q)

with open(output.json[peak_types.index(peak_type)], 'w') as jo:
with open(output.json[i], 'w') as jo:
json.dump(json_construct, jo, indent=4)
jo.close()

if not os.path.exists(output.json[peak_types.index(peak_type)]):
raise FileNotFoundError(output.json[peak_types.index(peak_type)] + " does not exist!")
if not os.path.exists(output.json[i]):
raise FileNotFoundError(output.json[i] + " does not exist!")



rule UROPA:
input:
json = join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}.{_type}.json"),
this_json = join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}.{_type}.json"),
output:
txt = join(
uropa_dir,
Expand All @@ -509,10 +539,10 @@ rule UROPA:
params:
rname = "uropa",
outroot = join(uropa_dir, "{PeakTool1}", "{name}_{PeakTool2}_uropa_{_type}"),
threads: int(allocated("threads", "UROPA", cluster))
threads: int(allocated("threads", "UROPA", cluster))
container:
config["images"]["uropa"]
shell:
"""
uropa -i {input.json} -p {params.outroot} -l {output.log} -t {threads} -s
uropa -i {input.this_json} -p {params.outroot} -l {output.log} -t {threads} -s
"""

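For context, each JSON that `UROPA_prep_in` writes is a small UROPA configuration: a `gtf`, a `bed` (now the reformatted file), a `priority` flag, `show_attributes`, and one query per annotation distance. The following is a rough sketch of that structure for the cfchip `protTSS` branch, with placeholder paths; the real rule also fills query fields (feature, filters, anchor) that sit outside this diff hunk, so only the distances appear here.

```python
import json

# Placeholder paths; the rule substitutes the workflow's GTF and the reformatted BED.
gtf = "annotation.gtf"
bed = "uropa_input.bed"

# Top-level layout mirrors json_construct in UROPA_prep_in.
config = {
    "queries": [],
    "show_attributes": ["gene_id", "gene_name", "gene_type"],
    "priority": "Yes",
    "gtf": gtf,
    "bed": bed,
}

# One query per distance, as in the protTSS branch; the remaining query fields
# are set elsewhere in the rule and are not shown in this excerpt.
for distance in (3000, 10000, 100000):
    config["queries"].append({"distance": distance})

# Write the per-peak-type config (the output name is a placeholder).
with open("protTSS.json", "w") as jo:
    json.dump(config, jo, indent=4)
```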