From 45215f05b0f35ccda7370061fc70388a90569f9a Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Fri, 22 Mar 2024 14:43:50 -0400 Subject: [PATCH] Adding new DiffBindQC for cfChIP --- workflow/Snakefile | 5 ++- workflow/rules/cfChIP.smk | 29 +++++++++++- workflow/rules/dba.smk | 11 +++-- workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd | 11 +++-- workflow/scripts/prep_diffbindQC.py | 51 ++++++++++++++++++++++ 5 files changed, 99 insertions(+), 8 deletions(-) create mode 100644 workflow/scripts/prep_diffbindQC.py diff --git a/workflow/Snakefile b/workflow/Snakefile index 17cf7c1..6972337 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -206,8 +206,11 @@ if assay == "cfchip": join(workpath,"QC","H3K4me3_cfChIP_signature.txt"), expand(join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),name=samples, ext=["sorted", "Q5DD"]), expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput), - expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'), + expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'), PeakTool=PeakTools,name=chips,type=["protTSS"]), + expand(join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools), + expand(join(workpath, uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{type}_allhits.txt"), + PeakTool="DiffBindQC", type="protTSS"), expand(join(workpath,uropa_dir,"promoterTable1",'{PeakTool}_promoter_overlap_summaryTable.txt'),PeakTool=PeakTools), provided(expand(join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), zip,group1=zipGroup1,group2=zipGroup2,PeakTool=zipToolC), reps == "yes"), diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index 3231601..fa521d6 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -16,7 +16,7 @@ rule cfChIPtool: rver="R/4.1.0", toolkit = config['references'][genome]['cfChIP_TOOLS_SRC'], tmpfile = lambda w: join(workpath,cfTool_subdir2, w.name + ".Q5DD.tagAlign"), - tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD.tagAlign")) + tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD_tagAlign")) container: config['images']['cfchip'] shell: """ @@ -83,3 +83,30 @@ rule promoterTable2: Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')"; """ +rule diffbindQC: + input: + lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] + output: + html = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"), + bed = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), + params: + rname="diffbindQC", + rscript=join(workpath,"workflow","scripts","DiffBind_v2_cfChIP_QC.Rmd"), + outdir = join(workpath, "QC", "AllSamples-{PeakTool}"), + contrast = "AllSamples", + csvfile = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"), + pythonscript = join(workpath,"workflow","scripts","prep_diffbindQC.py"), + PeakExtension= lambda w: PeakExtensions[w.PeakTool], + PeakTool="{PeakTool}", + peakcaller= lambda w: FileTypesDiffBind[w.PeakTool], + container: + config['images']['cfchip'] + shell: """ + python {params.pythonscript} --wp {workpath} \ + --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \ + --pc {params.peakcaller} --csv {params.csvfile} + cp {params.rscript} {params.outdir} + cd {params.outdir} + Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", + params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))' + """ \ No newline at end of file diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index f23d19b..036bab1 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -77,7 +77,8 @@ PeakExtensions = { 'DiffbindDeseq2': '_Diffbind_Deseq2.bed', 'DiffbindEdgeRBlock': '_Diffbind_EdgeR_block.bed', 'DiffbindDeseq2Block': '_Diffbind_Deseq2_block.bed', - 'Genrich': '.narrowPeak' + 'Genrich': '.narrowPeak', + 'DiffBindQC': '_DiffBindQC_TMMcounts.bed' } FileTypesDiffBind = { @@ -211,7 +212,9 @@ if assay == "cfchip": input: lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ] output: - join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt') + txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), + bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), + bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), params: rname="uropa", uropaver = config['tools']['UROPAVER'], @@ -241,7 +244,9 @@ else: input: lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ] output: - join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt') + txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), + bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), + bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), params: rname="uropa", uropaver = config['tools']['UROPAVER'], diff --git a/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd b/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd index 144459b..d058cec 100644 --- a/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd +++ b/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd @@ -114,6 +114,8 @@ if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { minOv <- floor(ncol(samples$class)/3) } +print(paste0("The minimum number of overlaps is: ", minOv)) + if (summits == TRUE) { DBdataCounts <- dba.count(samples, summits=250, minOverlap = minOv) } else { @@ -147,9 +149,12 @@ consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-norma as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% dplyr::select(1:4, Peaks, samples$samples$SampleID) -outfile <- paste0("Count", "_", contrasts, "_", "TMM_min", minOv, ".csv") -write.csv(consensus2, outfile, row.names = F) +outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv") +write.csv(consensus2, outfile1, row.names = F) +outfile2 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.bed") +write.table(consensus2[,c("seqnames","start","end","Peaks")], + outfile2, quote=F, sep="\t", row.names=F, col.names=F) counts_TMM_ALL <- consensus2 rownames(counts_TMM_ALL) <- counts_TMM_ALL$Peaks @@ -166,7 +171,7 @@ if (nrow(samples$samples) < 16) { } umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2")) -outfile <- paste0("UMAP", "_", contrasts, "_", "TMM_min", minOv, ".csv") +outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv") write.csv(umap_coord, outfile, row.names = F) ``` diff --git a/workflow/scripts/prep_diffbindQC.py b/workflow/scripts/prep_diffbindQC.py new file mode 100644 index 0000000..550b5f9 --- /dev/null +++ b/workflow/scripts/prep_diffbindQC.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +import json +import argparse + +parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') +parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory') +parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located') +parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output') +parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv') +parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located') +parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file') + +args = parser.parse_args() + +with open("config.json","r") as read_file: + config=json.load(read_file) + +chip2input = config['project']['peaks']['inputs'] +groupdata = config['project']['groups'] + +tmpIDs = [x for xs in groupdata.values() for x in xs] +Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)] + +samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", + "ControlID", "bamControl", "Peaks", "PeakCaller"])] + +count = 1 +for chip in chip2input.keys(): + if set(Ncounts) == {1}: # if all samples only in one group + for key in groupdata.keys(): + if chip in groupdata[key]: + condition = key + replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) + else: + condition = "" + replicate = str(count) + count = count +1 + bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam" + controlID = chip2input[chip] + if controlID != "": + bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam" + else: + bamControl = "" + peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension + samplesheet.append(",".join([chip, condition, replicate, bamReads, + controlID, bamControl, peaks, args.peakcaller])) + +f = open(args.csvfile, 'w') +f.write ("\n".join(samplesheet)) +f.close()