Merge pull request #37 from tovahmarkowitz/main

Adding new DiffBindQC for cfChIP
OpenOmics · Mar 22, 2024 · 7c35675 · 7c35675
2 parents 0103a62 + 45215f0
commit 7c35675
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 8 deletions.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -206,8 +206,11 @@ if assay == "cfchip":
             join(workpath,"QC","H3K4me3_cfChIP_signature.txt"),
             expand(join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),name=samples, ext=["sorted", "Q5DD"]),
             expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput),
-            expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'), 
+            expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),
                 PeakTool=PeakTools,name=chips,type=["protTSS"]),
+            expand(join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools),
+	    expand(join(workpath, uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{type}_allhits.txt"),
+                PeakTool="DiffBindQC", type="protTSS"),
             expand(join(workpath,uropa_dir,"promoterTable1",'{PeakTool}_promoter_overlap_summaryTable.txt'),PeakTool=PeakTools),
             provided(expand(join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
                 zip,group1=zipGroup1,group2=zipGroup2,PeakTool=zipToolC), reps == "yes"),

diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk
@@ -16,7 +16,7 @@ rule cfChIPtool:
         rver="R/4.1.0",
         toolkit = config['references'][genome]['cfChIP_TOOLS_SRC'],
         tmpfile = lambda w: join(workpath,cfTool_subdir2, w.name + ".Q5DD.tagAlign"),
-        tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD.tagAlign"))
+        tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD_tagAlign"))
     container:
         config['images']['cfchip']
     shell: """
@@ -83,3 +83,30 @@ rule promoterTable2:
     Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')";
     """
 
+rule diffbindQC:
+    input:
+       lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
+    output:
+       html = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"),
+       bed = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"),
+    params:
+       rname="diffbindQC",
+       rscript=join(workpath,"workflow","scripts","DiffBind_v2_cfChIP_QC.Rmd"),
+       outdir = join(workpath, "QC", "AllSamples-{PeakTool}"),
+       contrast = "AllSamples",
+       csvfile = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"),
+       pythonscript = join(workpath,"workflow","scripts","prep_diffbindQC.py"),
+       PeakExtension= lambda w: PeakExtensions[w.PeakTool],
+       PeakTool="{PeakTool}",
+       peakcaller= lambda w: FileTypesDiffBind[w.PeakTool],
+    container:
+       config['images']['cfchip']
+    shell: """
+       python {params.pythonscript} --wp {workpath} \
+         --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \
+         --pc {params.peakcaller} --csv {params.csvfile}
+       cp {params.rscript} {params.outdir}
+       cd {params.outdir}
+       Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", 
+           params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))'
+    """
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
@@ -77,7 +77,8 @@ PeakExtensions = {
     'DiffbindDeseq2': '_Diffbind_Deseq2.bed', 
     'DiffbindEdgeRBlock': '_Diffbind_EdgeR_block.bed',
     'DiffbindDeseq2Block': '_Diffbind_Deseq2_block.bed',
-    'Genrich': '.narrowPeak'
+    'Genrich': '.narrowPeak',
+    'DiffBindQC': '_DiffBindQC_TMMcounts.bed'
 }
 
 FileTypesDiffBind = { 
@@ -211,7 +212,9 @@ if assay == "cfchip":
         input:
             lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
         output:
-            join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt')
+            txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
+            bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
+            bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
         params:
             rname="uropa",
             uropaver = config['tools']['UROPAVER'],
@@ -241,7 +244,9 @@ else:
         input:
             lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
         output:
-            join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt')
+            txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
+            bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
+            bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
         params:
             rname="uropa",
             uropaver = config['tools']['UROPAVER'],

diff --git a/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd b/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd
@@ -114,6 +114,8 @@ if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
   minOv <- floor(ncol(samples$class)/3)
 }
 
+print(paste0("The minimum number of overlaps is: ", minOv))
+
 if (summits == TRUE) {
   DBdataCounts <- dba.count(samples, summits=250, minOverlap = minOv)
 } else {
@@ -147,9 +149,12 @@ consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-norma
   as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% 
   dplyr::select(1:4, Peaks, samples$samples$SampleID)
 
-outfile <- paste0("Count", "_", contrasts, "_", "TMM_min",  minOv, ".csv")
-write.csv(consensus2, outfile, row.names = F)
+outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv")
+write.csv(consensus2, outfile1, row.names = F)
 
+outfile2 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.bed")
+write.table(consensus2[,c("seqnames","start","end","Peaks")],
+    outfile2, quote=F, sep="\t", row.names=F, col.names=F)
 
 counts_TMM_ALL <- consensus2
 rownames(counts_TMM_ALL) <- counts_TMM_ALL$Peaks
@@ -166,7 +171,7 @@ if (nrow(samples$samples) < 16) {
 }
 umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2"))
 
-outfile <- paste0("UMAP", "_", contrasts, "_", "TMM_min",  minOv, ".csv")
+outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv")
 write.csv(umap_coord, outfile, row.names = F)
 ```
 

diff --git a/workflow/scripts/prep_diffbindQC.py b/workflow/scripts/prep_diffbindQC.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+import json
+import argparse
+
+parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
+parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory')
+parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located')
+parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output')
+parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv')
+parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located')
+parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file')
+
+args = parser.parse_args()
+
+with open("config.json","r") as read_file:
+   config=json.load(read_file)
+
+chip2input = config['project']['peaks']['inputs']
+groupdata = config['project']['groups']
+
+tmpIDs = [x for xs in groupdata.values() for x in xs]
+Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)]
+
+samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
+         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
+
+count = 1
+for chip in chip2input.keys():
+   if set(Ncounts) == {1}: # if all samples only in one group
+      for key in groupdata.keys():
+          if chip in groupdata[key]:
+             condition = key
+      replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
+   else:
+      condition = ""
+      replicate = str(count)
+      count = count +1
+   bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
+   controlID = chip2input[chip]
+   if controlID != "":
+      bamControl = args.workpath + "/" + args.bamdir + "/" +  controlID + ".Q5DD.bam"
+   else:
+      bamControl = ""
+   peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
+   samplesheet.append(",".join([chip, condition, replicate, bamReads, 
+                   controlID, bamControl, peaks, args.peakcaller]))            
+
+f = open(args.csvfile, 'w')
+f.write ("\n".join(samplesheet))
+f.close()