Skip to content

Commit

Permalink
Merge pull request #37 from tovahmarkowitz/main
Browse files Browse the repository at this point in the history
Adding new DiffBindQC for cfChIP
  • Loading branch information
tovahmarkowitz authored Mar 22, 2024
2 parents 0103a62 + 45215f0 commit 7c35675
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 8 deletions.
5 changes: 4 additions & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,11 @@ if assay == "cfchip":
join(workpath,"QC","H3K4me3_cfChIP_signature.txt"),
expand(join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),name=samples, ext=["sorted", "Q5DD"]),
expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput),
expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),
expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),
PeakTool=PeakTools,name=chips,type=["protTSS"]),
expand(join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools),
expand(join(workpath, uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{type}_allhits.txt"),
PeakTool="DiffBindQC", type="protTSS"),
expand(join(workpath,uropa_dir,"promoterTable1",'{PeakTool}_promoter_overlap_summaryTable.txt'),PeakTool=PeakTools),
provided(expand(join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
zip,group1=zipGroup1,group2=zipGroup2,PeakTool=zipToolC), reps == "yes"),
Expand Down
29 changes: 28 additions & 1 deletion workflow/rules/cfChIP.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ rule cfChIPtool:
rver="R/4.1.0",
toolkit = config['references'][genome]['cfChIP_TOOLS_SRC'],
tmpfile = lambda w: join(workpath,cfTool_subdir2, w.name + ".Q5DD.tagAlign"),
tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD.tagAlign"))
tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD_tagAlign"))
container:
config['images']['cfchip']
shell: """
Expand Down Expand Up @@ -83,3 +83,30 @@ rule promoterTable2:
Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')";
"""

rule diffbindQC:
input:
lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
output:
html = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"),
bed = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"),
params:
rname="diffbindQC",
rscript=join(workpath,"workflow","scripts","DiffBind_v2_cfChIP_QC.Rmd"),
outdir = join(workpath, "QC", "AllSamples-{PeakTool}"),
contrast = "AllSamples",
csvfile = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"),
pythonscript = join(workpath,"workflow","scripts","prep_diffbindQC.py"),
PeakExtension= lambda w: PeakExtensions[w.PeakTool],
PeakTool="{PeakTool}",
peakcaller= lambda w: FileTypesDiffBind[w.PeakTool],
container:
config['images']['cfchip']
shell: """
python {params.pythonscript} --wp {workpath} \
--pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \
--pc {params.peakcaller} --csv {params.csvfile}
cp {params.rscript} {params.outdir}
cd {params.outdir}
Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}",
params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))'
"""
11 changes: 8 additions & 3 deletions workflow/rules/dba.smk
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ PeakExtensions = {
'DiffbindDeseq2': '_Diffbind_Deseq2.bed',
'DiffbindEdgeRBlock': '_Diffbind_EdgeR_block.bed',
'DiffbindDeseq2Block': '_Diffbind_Deseq2_block.bed',
'Genrich': '.narrowPeak'
'Genrich': '.narrowPeak',
'DiffBindQC': '_DiffBindQC_TMMcounts.bed'
}

FileTypesDiffBind = {
Expand Down Expand Up @@ -211,7 +212,9 @@ if assay == "cfchip":
input:
lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
output:
join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt')
txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
params:
rname="uropa",
uropaver = config['tools']['UROPAVER'],
Expand Down Expand Up @@ -241,7 +244,9 @@ else:
input:
lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
output:
join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt')
txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
params:
rname="uropa",
uropaver = config['tools']['UROPAVER'],
Expand Down
11 changes: 8 additions & 3 deletions workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
minOv <- floor(ncol(samples$class)/3)
}
print(paste0("The minimum number of overlaps is: ", minOv))
if (summits == TRUE) {
DBdataCounts <- dba.count(samples, summits=250, minOverlap = minOv)
} else {
Expand Down Expand Up @@ -147,9 +149,12 @@ consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-norma
as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>%
dplyr::select(1:4, Peaks, samples$samples$SampleID)
outfile <- paste0("Count", "_", contrasts, "_", "TMM_min", minOv, ".csv")
write.csv(consensus2, outfile, row.names = F)
outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv")
write.csv(consensus2, outfile1, row.names = F)
outfile2 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.bed")
write.table(consensus2[,c("seqnames","start","end","Peaks")],
outfile2, quote=F, sep="\t", row.names=F, col.names=F)
counts_TMM_ALL <- consensus2
rownames(counts_TMM_ALL) <- counts_TMM_ALL$Peaks
Expand All @@ -166,7 +171,7 @@ if (nrow(samples$samples) < 16) {
}
umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2"))
outfile <- paste0("UMAP", "_", contrasts, "_", "TMM_min", minOv, ".csv")
outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv")
write.csv(umap_coord, outfile, row.names = F)
```

Expand Down
51 changes: 51 additions & 0 deletions workflow/scripts/prep_diffbindQC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3

import json
import argparse

parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory')
parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located')
parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output')
parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv')
parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located')
parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file')

args = parser.parse_args()

with open("config.json","r") as read_file:
config=json.load(read_file)

chip2input = config['project']['peaks']['inputs']
groupdata = config['project']['groups']

tmpIDs = [x for xs in groupdata.values() for x in xs]
Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)]

samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads",
"ControlID", "bamControl", "Peaks", "PeakCaller"])]

count = 1
for chip in chip2input.keys():
if set(Ncounts) == {1}: # if all samples only in one group
for key in groupdata.keys():
if chip in groupdata[key]:
condition = key
replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
else:
condition = ""
replicate = str(count)
count = count +1
bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
controlID = chip2input[chip]
if controlID != "":
bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam"
else:
bamControl = ""
peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
samplesheet.append(",".join([chip, condition, replicate, bamReads,
controlID, bamControl, peaks, args.peakcaller]))

f = open(args.csvfile, 'w')
f.write ("\n".join(samplesheet))
f.close()

0 comments on commit 7c35675

Please sign in to comment.