diff --git a/bin/DiffBind_v2_ChIPseq.Rmd b/bin/DiffBind_v2_ChIPseq.Rmd deleted file mode 100755 index 400aa95..0000000 --- a/bin/DiffBind_v2_ChIPseq.Rmd +++ /dev/null @@ -1,297 +0,0 @@ ---- -title: "DiffBind: ChIP-seq pipeline" -output: - html_document: - toc: true - toc_float: - collapsed: false - number_sections: true - toc_depth: 3 - fig_width: 7 - fig_height: 6 -params: - csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" ---- - - - -```{r, include=FALSE, warning=FALSE, message=FALSE} -# inputs -dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") -csvfile <- params$csvfile -outbase <- dirname(csvfile) -contrasts <- params$contrasts -peakcaller <- params$peakcaller - -# file output suffixes -cp_bed <- "_Diffbind_consensusPeaks.bed" -edger_txt <- "_Diffbind_EdgeR.txt" -deseq2_txt <- "_Diffbind_Deseq2.txt" -edger_bed <- "_Diffbind_EdgeR.bed" -deseq2_bed <- "_Diffbind_Deseq2.bed" -deseq2_bed_fullist <- "_Diffbind_Deseq2_fullList.txt" -edger_bed_fullist <- "_Diffbind_EdgeR_fullList.txt" - -# knitr options -knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) - -# libraries -suppressMessages(library(DT)) -suppressMessages(library(DiffBind)) -suppressMessages(library(parallel)) -``` - -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* - -# Peak Data -Read in sample sheet information and peak information -```{r samples} -samples <- dba(sampleSheet=csvfile) -consensus <- dba.peakset(samples,consensus=DBA_CONDITION) -print(samples) -``` - -## Correlation heatmap: Only peaks -Pearson correlation of peak positions: all samples versus all samples -```{r heatmap1} -try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Only peaks -Variance of peak positions -```{r PCA1, fig.height=5,fig.width=5} -try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) -``` - -## Overlapping peak counts -Number of overlapping peaks. -If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where -the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different -from the consensus peak set used for differential analyses. -```{r Venn, fig_height=4} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples,1:nrow(samples$samples)) -} else { - dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") - try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) - try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) -} -``` - -# Consensus peaks and counts -Consensus peaks are peaks found in at least two samples, independent of condition. -FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. -```{r peaksORsummits} -if ( grepl("narrow",samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad",samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} -``` - -```{r DBcount} -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits=250) -} else { - DBdataCounts <- dba.count(samples) -} -print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller, cp_bed) -consensus2 <- dba.peakset(DBdataCounts, bRetrieve=T) -consensus2$name <- paste0("Peak", 1:length(consensus2)) -#rtracklayer::export(consensus2,outfile2) -``` - -## Correlation heatmap: Peaks and reads -Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples -```{r heatmap2} -try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## Heatmap: Average signal across each peak -1000 most variable consensus peaks (library-size normalized counts) -```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Peaks and reads -Variation of library-size normalized counts of consensus peaks -```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) -``` - -# Set Up Contrast -Contrast is Group1 - Group2. -```{r contrast} -DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION) -print(DBdatacontrast) -``` - -# Differential Analysis -This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most -projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are -not changing between the two conditions. EdgeR also assumes that there are equal numbers -of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR -is especially useful when this assumption is true or when there are large differences in -library size across samples. All concentrations are on log2 scale. - -```{r analyze} -DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) -``` - -```{r report} -DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) -``` - -## PCA {.tabset .tabset-fade} -Variance of differential peaks only - -### DeSeq2 {-} -```{r PCA3, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r PCA4, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE) -``` - -## MA plot {.tabset .tabset-fade} -"Log concentration" means average concentration across all samples. -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r MA_D} -try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r MA_E} -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) -``` - -## Volcano plot {.tabset .tabset-fade} -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r Volcano1} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r Volcano2} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) -``` - -## Heatmap: Differential {.tabset .tabset-fade} -1000 most significant differential peaks (Deseq2 or EdgeR normalized) - -### DeSeq2 {-} -```{r heatmap4D} -try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -### EdgeR {-} -```{r heatmap4E} -try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -## Top 500 differentially bound peaks {.tabset .tabset-fade} -### DeSeq2 {-} -```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, deseq2_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, deseq2_bed) -DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) - -tryDeseqExport <- function(DBReportDeseq2, outfile2) { - tryCatch( - { - rtracklayer::export(DBReportDeseq2, outfile2) - }, - error = function(cond) { - print("ERROR: Failed to export DeSeq bed file `rtracklayer::export(DBReportDeseq2, outfile2)`, output blank file") - write.table(outfile2, file='empty', col.names=FALSE) - } - ) -} - -tryDeseqExport(DBReportDeseq2, file.path(outbase, outfile2)) -write.table(DBReportDeseq2, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) -D2i <- length(DBReportDeseq2) -if (D2i == 0) { - i=1 -} else if (D2i > 500) { - i=500 -} else { - i=D2i -} -try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, deseq2_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -### EdgeR {-} -```{r EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller, edger_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, edger_bed) -DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) - -tryEdgeRExport <- function(edger_report, fout) { - tryCatch( - { - rtracklayer::export(edger_report, fout) - }, - error = function(cond) { - print("ERROR: Failed to export EdgeR bed file `rtracklayer::export(edger_report, fout))`, output blank file") - write.table(fout, file='empty', col.names=FALSE) - } - ) -} - -tryEdgeRExport(DBReportEdgeR, file.path(outbase, outfile2)) -write.table(DBReportEdgeR, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) - -Ei <- length(DBReportEdgeR) -if (Ei == 0) { - i=1 -} else if (Ei > 500) { - i=500 -} else { - i=Ei -} -try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, edger_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -## R tool version information -```{r Info} -sessionInfo() -``` - -
diff --git a/bin/DiffBind_v2_ChIPseq_block.Rmd b/bin/DiffBind_v2_ChIPseq_block.Rmd deleted file mode 100755 index 68b9f6a..0000000 --- a/bin/DiffBind_v2_ChIPseq_block.Rmd +++ /dev/null @@ -1,304 +0,0 @@ ---- -title: "DiffBind: ChIP-seq pipeline, paired/blocked analysis" -output: - html_document: - toc: true - toc_float: - collapsed: false - number_sections: true - toc_depth: 3 - fig_width: 7 - fig_height: 6 -params: - csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" ---- - - - -```{r, include=FALSE, warning=FALSE, message=FALSE} -# global variables -dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") -csvfile <- params$csvfile -outbase <- dirname(csvfile) -contrasts <- params$contrasts -peakcaller <- params$peakcaller - -# file output suffixes -cp_bed <- "_Diffbind_consensusPeaks_block.bed" -edger_txt <- "_Diffbind_EdgeR_block.txt" -deseq2_txt <- "_Diffbind_Deseq2_block.txt" -edger_bed <- "_Diffbind_EdgeR_block.bed" -deseq2_bed <- "_Diffbind_Deseq2_block.bed" -deseq2_bed_fullist <- "_Diffbind_Deseq2_fullList_block.txt" -edger_bed_fullist <- "_Diffbind_EdgeR_fullList_block.txt" - -# knittr configuration -knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) - -# load libs -suppressMessages(library(DT)) -suppressMessages(library(DiffBind)) -suppressMessages(library(parallel)) -``` - -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* - -# Peak Data -Read in sample sheet information and peak information -```{r samples} -samples <- dba(sampleSheet=csvfile) -consensus <- dba.peakset(samples, consensus=DBA_CONDITION) -print(samples) -``` - -## Correlation heatmap: Only peaks -Pearson correlation of peak positions: all samples versus all samples -```{r heatmap1} -try(dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1), silent=TRUE) -``` - -## PCA: Only peaks -Variance of peak positions -```{r PCA1, fig.height=5,fig.width=5} -try(dba.plotPCA(samples,DBA_CONDITION), silent=TRUE) -``` - -## Overlapping peak counts -Number of overlapping peaks. -If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where -the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different -from the consensus peak set used for differential analyses. -```{r Venn, fig_height=4} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples, 1:nrow(samples$samples)) -} else { - dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") - try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"), silent=TRUE) - try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"), silent=TRUE) -} -``` - -# Consensus peaks and counts -Consensus peaks are peaks found in at least two samples, independent of condition. -FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. -```{r peaksORsummits} -if ( grepl("narrow", samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad", samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} -``` - -```{r DBcount} -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits=250) -} else { - DBdataCounts <- dba.count(samples) -} -print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller, cp_bed) -consensus2 <- dba.peakset(DBdataCounts, bRetrieve=T) -consensus2$name <- paste0("Peak", 1:length(consensus2)) -#rtracklayer::export(consensus2, outfile2) -``` - -## Correlation heatmap: Peaks and reads -Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples -```{r heatmap2} -try(dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1),silent=TRUE) -``` - -## Heatmap: Average signal across each peak -1000 most variable consensus peaks (library-size normalized counts) -```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1),silent=TRUE) -``` - -## PCA: Peaks and reads -Variation of library-size normalized counts of consensus peaks -```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts, DBA_CONDITION), silent=TRUE) -``` - -# Set Up Contrast -Contrast is Group1 - Group2. -```{r contrast} -DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION, - block=DBA_TREATMENT) -print(DBdatacontrast) -``` - -# Differential Analysis -This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most -projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are -not changing between the two conditions. EdgeR also assumes that there are equal numbers -of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR -is especially useful when this assumption is true or when there are large differences in -library size across samples. All concentrations are on log2 scale. - -```{r analyze} -DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) -``` - -```{r report} -DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK) -``` - -## PCA {.tabset .tabset-fade} -Variance of differential peaks only - -### DeSeq2 {-} -```{r PCA3, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK),silent=TRUE) -``` - -### EdgeR {-} -```{r PCA4, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK),silent=TRUE) -``` - -## MA plot {.tabset .tabset-fade} -"Log concentration" means average concentration across all samples. -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r MA_D} -try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE) -``` - -### EdgeR {-} -```{r MA_E} -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) -``` - -## Volcano plot {.tabset .tabset-fade} -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r Volcano1} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK), silent=TRUE) -``` - -### EdgeR {-} -```{r Volcano2} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK), silent=TRUE) -``` - -## Heatmap: Differential {.tabset .tabset-fade} -1000 most significant differential peaks (Deseq2 or EdgeR normalized) - -### DeSeq2 {-} -```{r heatmap4D} -try(dba.plotHeatmap(DBAnalysisDeseq2, contrast=1, method = DBA_DESEQ2_BLOCK, - correlations=FALSE, margin=20, cexRow=1, cexCol=1), silent=TRUE) -``` - -### EdgeR {-} -```{r heatmap4E} -try(dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK, - correlations=FALSE, margin=20, cexRow=1, cexCol=1), silent=TRUE) -``` - -## Top 500 differentially bound peaks {.tabset .tabset-fade} -### DeSeq2 {-} -```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, deseq2_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, deseq2_bed) -DBReportDeseq2$name <- paste0("Peak", 1:length(DBReportDeseq2)) - -tryDeseqExport <- function(DBReportDeseq2, outfile2) { - tryCatch( - { - rtracklayer::export(DBReportDeseq2, outfile2) - }, - error = function(cond) { - print("ERROR: Failed to export DeSeq bed file `rtracklayer::export(DBReportDeseq2, outfile2)`, output blank file") - write.table(outfile2, file='empty', col.names=FALSE) - } - ) -} - -tryDeseqExport(DBReportDeseq2, file.path(outbase, outfile2)) - -write.table(DBReportDeseq2, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) -D2i <- length(DBReportDeseq2) -if (D2i == 0) { - i=1 -} else if (D2i > 500) { - i=500 -} else { - i=D2i -} -try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2_BLOCK, - th=100,bNormalized=T,bFlip=FALSE,precision=0) - -outfile3 <- paste0(contrasts, "-", peakcaller, deseq2_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -### EdgeR {-} -```{r EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller, edger_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, edger_bed) -DBReportEdgeR$name <- paste0("Peak", 1:length(DBReportEdgeR)) - -tryEdgeRExport <- function(edger_report, fout) { - tryCatch( - { - rtracklayer::export(edger_report, fout) - }, - error = function(cond) { - print("ERROR: Failed to export EdgeR bed file `rtracklayer::export(edger_report, fout))`, output blank file") - write.table(fout, file='empty', col.names=FALSE) - } - ) -} - -tryEdgeRExport(DBReportEdgeR, file.path(outbase, outfile2)) - -write.table(DBReportEdgeR, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) -Ei <- length(DBReportEdgeR) -if (Ei == 0) { - i=1 -} else if (Ei > 500) { - i=500 -} else { - i=Ei -} -try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F), silent=TRUE) - -report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER_BLOCK, - th=100,bNormalized=T,bFlip=FALSE,precision=0) -outfile3 <- paste0(contrasts, "-", peakcaller, edger_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -## R tool version information -```{r Info} -sessionInfo() -``` - -
\ No newline at end of file diff --git a/bin/DiffBind_v2_Deseq2.Rmd b/bin/DiffBind_v2_Deseq2.Rmd new file mode 100755 index 0000000..19e60fa --- /dev/null +++ b/bin/DiffBind_v2_Deseq2.Rmd @@ -0,0 +1,209 @@ +--- +title: "DiffBind: chrom-seek pipeline" +subtitle: "Deseq2" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macsNarrow" + counts: "" + up_file: "" + down_file: "" + list_file: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +# inputs +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +peak_counts <- params$counts +up_file <- params$up_file +down_file <- params$down_file +list_file <- params$list_file + +# knitr options +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) + +# libraries +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples,consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig_height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples,1:nrow(samples$samples)) +} else { + dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") + try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) + try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r} +if ( peakcaller == "macsNarrow" ) { + summits <- TRUE + print ("Narrow peak calling tool.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if (grepl("broad", samples$samples$Peaks[1])) { + summits <- FALSE + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} else { + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +DBdataCounts <- readRDS(file = peak_counts) +print(DBdataCounts) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +dba.plotPCA(DBdataCounts, DBA_CONDITION) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION) +print(DBdatacontrast) +``` + +# Differential Analysis +All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) +``` + +## PCA +Variance of differential peaks only + +```{r PCA3, fig.height=5,fig.width=5} +dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2) +``` + +## MA plot +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. + +```{r MA_D} +dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2) +``` + +## Volcano plot +Each dot is a consensus peak. + +```{r Volcano1} +dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2) +``` + + +## Heatmap: Differential +1000 most significant differential peaks (Deseq2 or EdgeR normalized) + +```{r heatmap4D} +dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1) +``` + +## Top 500 or less differentially bound peaks +```{r Deseq2Report} +UpPeaks <- DBReportDeseq2[which(DBReportDeseq2$Fold > 0)] +rtracklayer::export(UpPeaks, up_file) + +DownPeaks <- DBReportDeseq2[which(DBReportDeseq2$Fold < 0)] +rtracklayer::export(DownPeaks, down_file) + +D2i <- length(DBReportDeseq2) +i <- as.integer(min(c(500, as.integer(max(c(D2i, 1)))))) +DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F) + +report2 <- dba.report( + DBAnalysisDeseq2, + method = DBA_DESEQ2, + th=100, + bNormalized=T, + bFlip=FALSE, + precision=0, + bCalled=T + ) +write.table(report2, list_file, quote=F, sep="\t", row.names=F) +``` + + + +## R tool version information +```{r Info} +sessionInfo() +``` + +
diff --git a/bin/DiffBind_v2_Deseq2_block.Rmd b/bin/DiffBind_v2_Deseq2_block.Rmd new file mode 100644 index 0000000..e4fba91 --- /dev/null +++ b/bin/DiffBind_v2_Deseq2_block.Rmd @@ -0,0 +1,197 @@ +--- +title: "DiffBind: chrom-seek pipeline" +subtitle: "Deseq2 with blocking" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macsNarrow" + counts: "" + list_file: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +# global variables +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +peak_counts <- params$counts +list_file <- params$list_file + +# knittr configuration +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples, consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +dba.plotPCA(samples, DBA_CONDITION) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig_height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples, 1:nrow(samples$samples)) +} else { + dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") + dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1") + dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2") +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r} +summits <- FALSE +if (peakcaller == "macsNarrow") { + summits <- TRUE + print ("Ran macsNarrow.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if (grepl("broad", samples$samples$Peaks[1]) || peakcaller == "macsBroad") { + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} else { + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +DBdataCounts <- readRDS(peak_counts) +print(DBdataCounts) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +dba.plotPCA(DBdataCounts, DBA_CONDITION) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION, + block=DBA_TREATMENT) +print(DBdatacontrast) +``` + +# Differential Analysis +All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) +``` + +## PCA +Variance of differential peaks only + +```{r PCA3, fig.height=5,fig.width=5} +dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK) +``` + +## MA plot +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. + +```{r MA_D} +dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) +``` + +## Volcano plot +Each dot is a consensus peak. + +```{r Volcano1} +dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) +``` + + +## Heatmap: Differential +1000 most significant differential peaks + +```{r heatmap4D} +dba.plotHeatmap(DBAnalysisDeseq2, contrast=1, method = DBA_DESEQ2_BLOCK, correlations=FALSE, margin=20, cexRow=1, cexCol=1) +``` + +## Top 500 or less differentially bound peaks + +```{r Deseq2Report} +D2i <- length(DBAnalysisDeseq2) +i <- as.integer(min(c(500, as.integer(max(c(D2i, 1)))))) +DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F) + +report2 <- dba.report( + DBAnalysisDeseq2, + method = DBA_DESEQ2_BLOCK, + th=100, + bNormalized=T, + bFlip=FALSE, + precision=0, + bCalled=T + ) +write.table(report2, list_file, quote=F, sep="\t", row.names=F) +``` + + +## R tool version information +```{r Info} +sessionInfo() +``` + +
\ No newline at end of file diff --git a/bin/DiffBind_v2_EdgeR.Rmd b/bin/DiffBind_v2_EdgeR.Rmd new file mode 100644 index 0000000..5f8e0cf --- /dev/null +++ b/bin/DiffBind_v2_EdgeR.Rmd @@ -0,0 +1,205 @@ +--- +title: "DiffBind: chrom-seek pipeline" +subtitle: "EdgeR" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macsNarrow" + counts: "" + up_file: "" + down_file: "" + list_file: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +## grab args +dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") + +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +peak_counts <- params$counts +up_file <- params$up_file +down_file <- params$down_file +list_file <- params$list_file +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples, consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +dba.plotPCA(samples, DBA_CONDITION) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig_height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples, 1:nrow(samples$samples)) +} else { + dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") + dba.plotVenn(samples, samples$masks[[3]], main="Binding Site Overlaps: samples in Group1") + dba.plotVenn(samples, samples$masks[[4]], main="Binding Site Overlaps: samples in Group2") +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r} +if (peakcaller == "macsNarrow") { + summits <- TRUE + print ("Ran macsNarrow.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else { + summits <- FALSE + print ("Assuming broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +DBdataCounts <- readRDS(file=peak_counts) +print(DBdataCounts) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +dba.plotPCA(DBdataCounts, DBA_CONDITION) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories=DBA_CONDITION) +print(DBdatacontrast) +``` + +# Differential Analysis +All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) +DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) +``` + +## PCA +Variance of differential peaks only + +```{r PCA3, fig.height=5,fig.width=5} +dba.plotPCA(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER) +``` + +## MA plot +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. + +```{r MA_D} +dba.plotMA(DBAnalysisEdgeR, method=DBA_EDGER) +``` + +## Volcano plot +Each dot is a consensus peak. + +```{r Volcano1} +dba.plotVolcano(DBAnalysisEdgeR, method=DBA_EDGER) +``` + + +## Heatmap: Differential +1000 most significant differential peaks (EdgeR or EdgeR normalized) + +```{r heatmap4D} +dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER, correlations=FALSE, margin=20, cexRow=1, cexCol=1) +``` + +## Top 500 or less differentially bound peaks + +```{r EdgeRReport} +UpPeaks <- DBReportEdgeR[which(DBReportEdgeR$Fold > 0)] +rtracklayer::export(UpPeaks, up_file) + +DownPeaks <- DBReportEdgeR[which(DBReportEdgeR$Fold < 0)] +rtracklayer::export(DownPeaks, down_file) + +D2i <- length(DBReportEdgeR) +i <- as.integer(min(c(500, as.integer(max(c(D2i, 1)))))) +DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F) + +report2 <- dba.report( + DBAnalysisEdgeR, + method = DBA_EDGER, + th=100, + bNormalized=T, + bFlip=FALSE, + precision=0, + bCalled=T + ) +write.table(report2, list_file, quote=F, sep="\t", row.names=F) +``` + + +## R tool version information +```{r Info} +sessionInfo() +``` + +
\ No newline at end of file diff --git a/bin/DiffBind_v2_EdgeR_block.Rmd b/bin/DiffBind_v2_EdgeR_block.Rmd new file mode 100644 index 0000000..7b8054a --- /dev/null +++ b/bin/DiffBind_v2_EdgeR_block.Rmd @@ -0,0 +1,200 @@ +--- +title: "DiffBind: chrom-seek pipeline" +subtitle: "EdgeR with blocking" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macsNarrow" + counts: "" + down_file: "" + up_file: "" + list_file: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +## grab args +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") + +# file output suffixes +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +peak_counts <- params$counts +list_file <- params$list_file +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples, consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +dba.plotPCA(samples, DBA_CONDITION) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig_height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples, 1:nrow(samples$samples)) +} else { + dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") + dba.plotVenn(samples,samples$masks[[3]], main="Binding Site Overlaps: samples in Group1") + dba.plotVenn(samples,samples$masks[[4]], main="Binding Site Overlaps: samples in Group2") +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r} +if ( peakcaller == "macsNarrow" ) { + summits <- TRUE + print ("Ran macsNarrow.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else { + summits <- FALSE + print ("Assuming broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +DBdataCounts <- readRDS(peak_counts) +print(DBdataCounts) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +dba.plotPCA(DBdataCounts, DBA_CONDITION) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, + minMembers=2, + categories=DBA_CONDITION, + block=DBA_TREATMENT) +print(DBdatacontrast) +``` + +# Differential Analysis +All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method=DBA_EDGER) +DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method=DBA_EDGER_BLOCK) +``` + +## PCA +Variance of differential peaks only + +```{r PCA3, fig.height=5,fig.width=5} +dba.plotPCA(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER_BLOCK) +``` + +## MA plot +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. + +```{r MA_D} +dba.plotMA(DBAnalysisEdgeR, method=DBA_EDGER_BLOCK) +``` + +## Volcano plot +Each dot is a consensus peak. + +```{r Volcano1} +dba.plotVolcano(DBAnalysisEdgeR, method=DBA_EDGER_BLOCK) +``` + + +## Heatmap: Differential +1000 most significant differential peaks + +```{r heatmap4D} +dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER_BLOCK, correlations=FALSE, margin=20, cexRow=1, cexCol=1) +``` + +## Top 500 or less differentially bound peaks + +```{r EdgeRReport} +D2i <- length(DBReportEdgeR) +i <- as.integer(min(c(500, as.integer(max(c(D2i, 1)))))) +DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F) + +report2 <- dba.report( + DBAnalysisEdgeR, + method = DBA_EDGER_BLOCK, + th=100, + bNormalized=T, + bFlip=F, + precision=0, + bCalled=T + ) +write.table(report2, list_file, quote=F, sep="\t", row.names=F) +``` + +## R tool version information +```{r Info} +sessionInfo() +``` + +
diff --git a/bin/DiffBind_v2_load.R b/bin/DiffBind_v2_load.R new file mode 100755 index 0000000..099df3e --- /dev/null +++ b/bin/DiffBind_v2_load.R @@ -0,0 +1,47 @@ +#!/usr/bin/env Rscript +library(argparse) +library(stringi) +library(DiffBind) +library(parallel) + +cleanup_arg <- function(arg) { + arg <- stri_replace_all_fixed(arg, '"', '') # remote double quotes + arg <- stri_replace_all_fixed(arg, "'", "") # remove single quotes + arg <- stri_replace_all_charclass(arg, "\\p{WHITE_SPACE}", "") # remove all whitespace + return(arg) +} + +parser <- ArgumentParser(description= 'Load diffbind csv process with R::dba return RDS and BED') +parser$add_argument('--csvfile', '-c', help='CSV input file from `diffbind_csv`') +parser$add_argument('--counts', '-n', help='Peak count RDS output file', default=file.path(getwd(), "peak_counts.rds")) +parser$add_argument('--list', '-l', help='Peak list TXT output file', default=file.path(getwd(), "peak_list.bed")) +parser$add_argument('--peakcaller', '-p', help='String with peakcaller name') +xargs <- parser$parse_args() + +csvfile <- cleanup_arg(xargs$csvfile) +counts <- cleanup_arg(xargs$counts) +list <- cleanup_arg(xargs$list) +threads <- as.numeric(cleanup_arg(xargs$threads)) +peakcaller <- cleanup_arg(xargs$peakcaller) +samples <- dba(sampleSheet=csvfile) + +if ( peakcaller == "macsNarrow" ) { + summits_arg <- 250 + print ("Ran macsNarrow.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else { + summits_arg <- FALSE + print ("Assuming broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} + +# count +DBdataCounts <- dba.count(samples, summits=summits_arg, bParallel=T) + +# save counts +saveRDS(DBdataCounts, counts) + +# save peaklist +consensus <- dba.peakset(DBdataCounts, bRetrieve=T) +consensus$name <- paste0("Peak", 1:length(consensus)) +rtracklayer::export(consensus, list) diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py index 235c3e4..ddd5292 100755 --- a/bin/prep_diffbind.py +++ b/bin/prep_diffbind.py @@ -5,7 +5,7 @@ from os.path import join -def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): +def main(contrast, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): config = json.load(open(join(wp, "config.json"))) chip2input = config['project']['peaks']['inputs'] groupdata = config['project']['groups'] @@ -20,9 +20,11 @@ def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): "bamControl", "Peaks", "PeakCaller"] samplesheet = [] - for condition in group1, group2: - for chip in groupdata[condition]: - replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) + # {group1}_vs_{group2} == condition + g1, g2 = contrast.split('_')[0], contrast.split('_')[2] + for group in [g1, g2]: + for chip in groupdata[group]: + replicate = str([ i + 1 for i in range(len(groupdata[group])) if groupdata[group][i]== chip ][0]) bamReads = join(bam_dir, chip + ".Q5DD.bam") controlID = chip2input[chip] if controlID != "": @@ -32,10 +34,10 @@ def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): peaks = join(wp, peaktool, chip, chip + peakext) if blocking: block = blocks[chip] - this_row = dict(zip(cols, [chip, condition, block, replicate, bamReads, + this_row = dict(zip(cols, [chip, group, block, replicate, bamReads, controlID, bamControl, peaks, peakcaller])) else: - this_row = dict(zip(cols, [chip, condition, replicate, bamReads, + this_row = dict(zip(cols, [chip, group, replicate, bamReads, controlID, bamControl, peaks, peakcaller])) samplesheet.append(this_row) @@ -49,8 +51,7 @@ def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): if __name__ == "__main__": parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') - parser.add_argument('--g1', dest='group1', required=True, help='Name of the first group') - parser.add_argument('--g2', dest='group2', required=True, help='Name of the second group') + parser.add_argument('--con', dest='contrast', required=True, help='Contrast string in [GROUP1]_vs_[GROUP2] format') parser.add_argument('--wp', dest='wp', required=True, help='Full path of the working directory') parser.add_argument('--pt', dest='peaktool', required=True, help='Name of the the peak calling tool, also the directory where the peak file will be located') @@ -61,4 +62,4 @@ def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): help='Name of the directory where the bam files are located') parser.add_argument('--csv', dest='csvfile', required=True, help='Name of the output csv file') args = parser.parse_args() - main(args.group1, args.group2, args.peaktool, args.peakext, args.peakcaller, args.csvfile, args.wp, args.bam_dir) + main(args.contrast, args.peaktool, args.peakext, args.peakcaller, args.csvfile, args.wp, args.bam_dir) diff --git a/bin/promoterAnnotation_by_Gene.R b/bin/promoterAnnotation_by_Gene.R index 846cfc0..e5f3421 100755 --- a/bin/promoterAnnotation_by_Gene.R +++ b/bin/promoterAnnotation_by_Gene.R @@ -8,31 +8,31 @@ # Created: August 8, 2022 # Updated: October 26, 2022 to work with uropa 4.0.2 # Updated: November 3, 2022 to fit with pipeline -# +# #################### # # Purpose: To take UROPA allhits output files using "TSSprot" conditions and -# create a table of which genes have annotations overlapping their +# create a table of which genes have annotations overlapping their # promoters and how many times. Output format: dataframe # -# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the -# TSS. Allhits files were chosen to capture information from "peaks" -# overlappingmultiple promoters. Finalhits files can also be processed -# with this pipeline. This script can handle multiple allhits files as -# long as there are equal numbers of sampleNames to go with them. Also, -# giving a matching DiffBind txt file will allow the allhits file to be -# filtered to only include the significant differential peaks or to +# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the +# TSS. Allhits files were chosen to capture information from "peaks" +# overlappingmultiple promoters. Finalhits files can also be processed +# with this pipeline. This script can handle multiple allhits files as +# long as there are equal numbers of sampleNames to go with them. Also, +# giving a matching DiffBind txt file will allow the allhits file to be +# filtered to only include the significant differential peaks or to # split the data by the direction of log fold-change. # # Requires: GenomicRanges, tidyr # # Function: promoterAnnotationByGene(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) -# +# # Variables: # allhitsFiles: [required] a vector of allhits files to process -# sampleNames: [required] a vector of short names for each allhits file +# sampleNames: [required] a vector of short names for each allhits file # to use as column headers -# diffbindFiles: [optional] a vector of diffbind files to use to filter each +# diffbindFiles: [optional] a vector of diffbind files to use to filter each # allhits file # direction: [optional] when filtering using diffbindFiles, define how to # filter using log fold change. "Both" is default @@ -43,137 +43,151 @@ # source("promoterAnnotation_by_Gene.R") # out1 <- promoterAnnotationByGene(allhitsA.txt, "A") # out2 <- promoterAnnotationByGene(allhitsA.txt, "A", diffbindA.txt, "both") -# out3 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsB.txt), -# sampleNames=c("A","B"), -# diffbindFiles=c(diffbindA.txt,diffbindB.txt), +# out3 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsB.txt), +# sampleNames=c("A","B"), +# diffbindFiles=c(diffbindA.txt,diffbindB.txt), # direction="pos") -# out4 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsA.txt), -# sampleNames=c("Deseq2","EdgeR"), -# diffbindFiles=c(Deseq2.txt,EdgeR.txt), +# out4 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsA.txt), +# sampleNames=c("Deseq2","EdgeR"), +# diffbindFiles=c(Deseq2.txt,EdgeR.txt), # direction="separate") -# +# #################### allhits2promoter <- function(allhitsFile) { - # cleaning up the allhits file to only keep information about peaks - # overlapping promoters - inData <- read.delim(allhitsFile) - tmp <- which(inData$name == "query_1") - if (length(tmp) == 0) { - print (paste0("Supplied file ", allhitsFile, " has no peaks overlapping promoters.")) - } else { - promoterData <- inData[tmp,] - promoterData <- promoterData[,c("peak_chr", "peak_start", "peak_end", "gene_id", "gene_name")] - return(promoterData) - } + # cleaning up the allhits file to only keep information about peaks + # overlapping promoters + inData <- read.delim(allhitsFile) + tmp <- which(inData$name == "query_1") + if (length(tmp) == 0) { + print(paste0("Supplied file ", allhitsFile, " has no peaks overlapping promoters.")) + } else { + promoterData <- inData[tmp, ] + promoterData <- promoterData[, c("peak_chr", "peak_start", "peak_end", "gene_id", "gene_name")] + return(promoterData) + } } filterPromoter <- function(Diffbind, promoterData, sampleName) { - # used by DiffbindFilterPromoter - promoterData2 <- GenomicRanges::makeGRangesFromDataFrame(promoterData, seqnames.field="peak_chr", - start.field="peak_start", end.field="peak_end", - starts.in.df.are.0based=F) - Diffbind2 <- GenomicRanges::makeGRangesFromDataFrame(Diffbind) - ov <- GenomicRanges::countOverlaps(promoterData2,Diffbind2,type = "equal",maxgap=1) - promoterData3 <- promoterData[which(ov != 0),] - promoterData3$sample_id <- sampleName - return(promoterData3) + # used by DiffbindFilterPromoter + promoterData2 <- GenomicRanges::makeGRangesFromDataFrame(promoterData, + seqnames.field = "peak_chr", + start.field = "peak_start", end.field = "peak_end", + starts.in.df.are.0based = F + ) + Diffbind2 <- GenomicRanges::makeGRangesFromDataFrame(Diffbind) + ov <- GenomicRanges::countOverlaps(promoterData2, Diffbind2, type = "equal", maxgap = 1) + promoterData3 <- promoterData[which(ov != 0), ] + promoterData3$sample_id <- sampleName + return(promoterData3) } DiffbindFilterPromoter <- function(DiffbindFile, promoterData, sampleName, direction) { - # filters the promoter data based upon whether it matches a different peak and what direction the fold-change is - # direction can be: "both", "pos", "neg", "separate". If direction is NA, use "both". + # filters the promoter data based upon whether it matches a different peak and what direction the fold-change is + # direction can be: "both", "pos", "neg", "separate". If direction is NA, use "both". Diffbind <- read.delim(DiffbindFile) - Diffbind <- Diffbind[which(Diffbind$FDR < 0.05),] + Diffbind <- Diffbind[which(Diffbind$FDR < 0.05), ] if ((direction == "both") | is.na(direction)) { - promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) + promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) } else if (direction == "pos") { - sampleName <- paste0(sampleName, "_pos") - Diffbind <- Diffbind[which(Diffbind$Fold > 0),] - promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) + sampleName <- paste0(sampleName, "_pos") + Diffbind <- Diffbind[which(Diffbind$Fold > 0), ] + promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) } else if (direction == "neg") { - sampleName <- paste0(sampleName, "_neg") - Diffbind <- Diffbind[which(Diffbind$Fold < 0),] - promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) + sampleName <- paste0(sampleName, "_neg") + Diffbind <- Diffbind[which(Diffbind$Fold < 0), ] + promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) } else { - sampleNameP <- paste0(sampleName, "_pos") - DiffbindP <- Diffbind[which(Diffbind$Fold > 0),] - promoterDataP <- filterPromoter(DiffbindP, promoterData, sampleNameP) - sampleNameN <- paste0(sampleName, "_neg") - DiffbindN <- Diffbind[which(Diffbind$Fold < 0),] - promoterDataN <- filterPromoter(DiffbindN, promoterData, sampleNameN) - promoterData2 <- rbind(promoterDataP, promoterDataN) + sampleNameP <- paste0(sampleName, "_pos") + DiffbindP <- Diffbind[which(Diffbind$Fold > 0), ] + promoterDataP <- filterPromoter(DiffbindP, promoterData, sampleNameP) + sampleNameN <- paste0(sampleName, "_neg") + DiffbindN <- Diffbind[which(Diffbind$Fold < 0), ] + promoterDataN <- filterPromoter(DiffbindN, promoterData, sampleNameN) + promoterData2 <- rbind(promoterDataP, promoterDataN) } -return(promoterData2) + return(promoterData2) } createPromoterTable <- function(promoterData) { - # making final output table - PromoterTable <- data.frame( table(promoterData[,c("gene_id", "sample_id")] ) ) - PromoterTable2 <- merge( unique(promoterData[,c("gene_id", "gene_name")] ), PromoterTable) - PromoterTable3 <- tidyr::pivot_wider(PromoterTable2, names_from="sample_id", values_from="Freq") - return(PromoterTable3) + # making final output table + PromoterTable <- data.frame(table(promoterData[, c("gene_id", "sample_id")])) + PromoterTable2 <- merge(unique(promoterData[, c("gene_id", "gene_name")]), PromoterTable) + PromoterTable3 <- tidyr::pivot_wider(PromoterTable2, names_from = "sample_id", values_from = "Freq") + return(PromoterTable3) } -promoterAnnotationByGene <- function(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) { - # the main function - if ( length(allhitsFiles) != length(sampleNames) ) { - print("Number of allhits files and sample names don't match.") - } else { - if ( (length(allhitsFiles) != length(diffbindFiles)) & (sum(is.na(diffbindFiles)) != 1) ) { - print("Number of allhits files and diffbind files don't match.") +promoterAnnotationByGene <- function(allhitsFiles, sampleNames, diffbindFiles = NA, direction = NA) { + # the main function + if (length(allhitsFiles) != length(sampleNames)) { + print("Number of allhits files and sample names don't match.") } else { - if ( length(allhitsFiles) == 1 ) { - promoterData <- allhits2promoter(allhitsFiles) - if (is.na(diffbindFiles)) { - promoterData$sample_id <- sampleNames + if ((length(allhitsFiles) != length(diffbindFiles)) & (sum(is.na(diffbindFiles)) != 1)) { + print("Number of allhits files and diffbind files don't match.") } else { - promoterData <- DiffbindFilterPromoter(diffbindFiles, promoterData, sampleNames, direction) - } - } else { - for ( a in 1:length(allhitsFiles) ) { - print(a) - tmpA <- allhits2promoter(allhitsFiles[a]) - if (sum(is.na(diffbindFiles)) ==1) { - tmpA$sample_id <- sampleNames[a] - } else { - tmpA <- DiffbindFilterPromoter(diffbindFiles[a], tmpA, sampleNames[a], direction) - } - if (a == 1) { - promoterData <- tmpA - } else { - promoterData <- rbind(promoterData, tmpA) - } + if (length(allhitsFiles) == 1) { + promoterData <- allhits2promoter(allhitsFiles) + if (is.na(diffbindFiles)) { + promoterData$sample_id <- sampleNames + } else { + promoterData <- DiffbindFilterPromoter(diffbindFiles, promoterData, sampleNames, direction) + } + } else { + for (a in 1:length(allhitsFiles)) { + print(a) + tmpA <- allhits2promoter(allhitsFiles[a]) + if (sum(is.na(diffbindFiles)) == 1) { + tmpA$sample_id <- sampleNames[a] + } else { + tmpA <- DiffbindFilterPromoter(diffbindFiles[a], tmpA, sampleNames[a], direction) + } + if (a == 1) { + promoterData <- tmpA + } else { + promoterData <- rbind(promoterData, tmpA) + } + } + } } - } + promoterTable <- createPromoterTable(promoterData) + return(promoterTable) } - promoterTable <- createPromoterTable(promoterData) - return(promoterTable) - } -} +} -peakcallVersion <- function(inFolder,outFile) { -# currently only works for macs outputs -# inFolder here is the folder where the uropa output files are located - filesA <- list.files(path=inFolder,pattern="allhits.txt") - samples <- matrix(unlist(strsplit(filesA,"_macs")),ncol=2,byrow=T)[,1] - filesA <- list.files(path=inFolder,pattern="allhits.txt",full.names = T) - promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesA, sampleNames=samples) - write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F) +peakcallVersion <- function(inFolder, outFile) { + # currently only works for macs outputs + # inFolder here is the folder where the uropa output files are located + filesA <- list.files(path = inFolder, pattern = "allhits.txt") + samples <- matrix(unlist(strsplit(filesA, "_macs")), ncol = 2, byrow = T)[, 1] + filesA <- list.files(path = inFolder, pattern = "allhits.txt", full.names = T) + promoterInfo <- promoterAnnotationByGene(allhitsFiles = filesA, sampleNames = samples) + write.table(promoterInfo, outFile, quote = F, sep = "\t", row.names = F) } -diffbindVersion <- function(inFolder,outFile) { -# currently designed for macs peaks, analyzed by deseq2 -# analyzing both positive and negative together for now -# inFolder here is the root working directory for the project - uropaFolder <- paste0(inFolder, "/UROPA_annotations/DiffBind") - diffbindFolder <- paste0(inFolder, "/DiffBind") - filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt") - samples <- matrix(unlist(strsplit(filesU,"-macs")),ncol=2,byrow=T)[,1] - filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt",full.names=T) - filesD <- list.files(path=diffbindFolder, pattern="Deseq2.txt",full.names=T,recursive=T) - promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesU, - sampleNames=samples, diffbindFiles=filesD, direction="both") - write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F) +diffbindVersion <- function(inFolder, outFile) { + # currently designed for macs peaks, analyzed by deseq2 + # analyzing both positive and negative together for now + # inFolder here is the root working directory for the project + uropaFolder <- paste0(inFolder, "/UROPA_annotations/DiffBind") + diffbindFolder <- paste0(inFolder, "/DiffBind") + + filesU <- list.files(path = uropaFolder, pattern = "DiffbindDeseq2_uropa_protTSS_allhits.txt") + + samples <- matrix(unlist(strsplit(filesU, "-macs")), ncol = 2, byrow = T)[, 1] + + filesU <- list.files(path = uropaFolder, pattern = "DiffbindDeseq2_uropa_protTSS_allhits.txt", full.names = T) + filesD <- list.files(path = diffbindFolder, pattern = "Deseq2.txt", full.names = T, recursive = T) + print(paste0("filesU: ", filesU)) + print(paste0("samples: ", samples)) + print(paste0("filesD: ", filesD)) + stop() + + promoterInfo <- promoterAnnotationByGene( + allhitsFiles = filesU, + sampleNames = samples, + diffbindFiles = filesD, + direction = "both" + ) + write.table(promoterInfo, outFile, quote = F, sep = "\t", row.names = F) } diff --git a/bin/uropa_input.py b/bin/uropa_input.py new file mode 100755 index 0000000..9be89ef --- /dev/null +++ b/bin/uropa_input.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +import os +import argparse +import json +import csv +import shutil + +def main(args): + base_query = { + "feature": ["gene"], + "filter_attribute": "gene_type", + "attribute_values": ["protein_coding"], + "feature_anchor": ["start"], + "relative_location": + ["PeakInsideFeature", "FeatureInsidePeak", "Upstream", + "Downstream", "OverlapStart", "OverlapEnd"], + "strand": "ignore" + } + + for i, peak_type in enumerate(args.peak_types): + if not os.path.exists(os.path.dirname(args.output_json[i])): + os.makedirs(os.path.abspath(os.path.dirname(args.output_json[i]))) + json_construct = dict() + json_construct['queries'] = [] + json_construct['show_attributes'] = ["gene_id", "gene_name", "gene_type"] + json_construct["priority"] = "Yes" + # don't put outdir in json, leave to specify on command line execution + # json_construct["outdir"] = os.path.dirname(args.output_json[i]) + json_construct['gtf'] = args.gtf + json_construct['bed'] = str(args.bed) + + if args.assay == 'cfchip': + if peak_type == 'protTSS': + for ii, _d in enumerate([[3000], [10000], [100000]], start=1): + this_q = base_query.copy() + this_q['distance'] = _d + this_q['name'] = f'query_{str(ii)}' + json_construct['queries'].append(this_q) + else: + if peak_type == 'prot': + for ii, _d in enumerate([[5000], [100000]], start=1): + this_q = base_query.copy() + del this_q["feature_anchor"] + this_q['distance'] = _d + this_q['name'] = f'query_{str(ii)}' + json_construct['queries'].append(this_q) + elif peak_type == 'genes': + this_query = {} + this_query['feature'] = 'gene' + for ii, _d in enumerate([[5000], [100000]], start=1): + this_q = base_query.copy() + del this_q["feature_anchor"] + del this_q["filter_attribute"] + del this_q["attribute_value"] + this_q['distance'] = _d + this_q['name'] = f'query_{str(ii)}' + json_construct['queries'].append(this_q) + elif peak_type == 'protSEC': + query_values = ( + ([3000, 1000], ["start"]), + ([3000], ["end"]), + ([100000], ["center"]), + ([100000], None) + ) + for ii, (_distance, feature_anchor) in enumerate(query_values, start=1): + this_q = base_query.copy() + del this_q["feature_anchor"] + if feature_anchor: + this_q["feature_anchor"] = feature_anchor + this_q['distance'] = _distance + this_q['name'] = f'query_{str(ii)}' + json_construct['queries'].append(this_q) + elif peak_type == 'protTSS': + for ii, _d in enumerate([[3000, 1000], [10000], [100000]], start=1): + this_q = base_query.copy() + this_q['distance'] = _d + this_q['name'] = f'query_{str(ii)}' + json_construct['queries'].append(this_q) + + with open(args.output_json[i], 'w') as jo: + json.dump(json_construct, jo, indent=4) + jo.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Script to prepare the uropa json input') + parser.add_argument('-g', dest='gtf', required=True, help='Gene GTF used in uropa annotation lookup') + parser.add_argument('-o', dest='output_json', nargs="*", required=True, help='Path to output UROPA input JSON') + parser.add_argument('-a', dest='assay', required=True, help='Type of assay being run') + parser.add_argument('-b', dest='bed', required=True, help='Bed used for UROPA annotation') + parser.add_argument('--types', '-t', dest='peak_types', nargs="+", required=True, help='Peak types: prot, protTSS, genes, protSEC') + args = parser.parse_args() + if isinstance(args.output_json, str): + args.output_json = [args.output_json] + if len(args.peak_types) != len(args.output_json): + raise ValueError('More peaks types than output file paths!') + main(args) \ No newline at end of file diff --git a/config/cluster.json b/config/cluster.json index 5cc360e..5f8d805 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -11,7 +11,8 @@ }, "trim": { "mem": "64g", - "threads": "32" + "threads": "32", + "time": "1-18:00:00" }, "kraken": { "cpus-per-task": "24", @@ -97,14 +98,47 @@ "mem": "50g" }, "diffbindQC": { + "threads": "16", "mem": "150g" }, - "diffbind": { + "diffbind_deseq": { + "threads": "16", "mem": "150g" }, - "MEME": { - "threads": "2", + "diffbind_edger": { + "threads": "16", + "mem": "150g" + }, + "diffbind_edger_blocking": { + "threads": "16", + "mem": "150g" + }, + "diffbind_deseq_blocking": { + "threads": "16", + "mem": "150g" + }, + "diffbind_count": { + "threads": "16", + "mem": "120g" + }, + "UROPA_macsNarrow": { + "threads": "16", + "mem": "32g", + "time": "1-00:00:00" + }, + "UROPA_macsBroad": { + "threads": "16", "mem": "32g", + "time": "1-00:00:00" + }, + "UROPA_diffbind": { + "threads": "16", + "mem": "32g", + "time": "1-00:00:00" + }, + "MEME": { + "threads": "28", + "mem": "32g", "time": "3-00:00:00", "ntasks": "--ntasks=28", "ntasks_per_core": "--ntasks-per-core=1", diff --git a/config/containers.json b/config/containers.json index 280e934..7c31275 100644 --- a/config/containers.json +++ b/config/containers.json @@ -1,7 +1,7 @@ { "images": { "cfchip": "docker://skchronicles/cfchip_toolkit:v0.5.0", - "uropa": "docker://quay.io/biocontainers/uropa:4.0.2--pyhdfd78af_0", + "uropa": "docker://rroutsong/uropa:4.0.3", "python": "docker://asyakhleborodova/chrom_seek_python:v0.1.0", "ppqt": "docker://asyakhleborodova/ppqt:v0.2.0" } diff --git a/docker/dedup/Dockerfile b/docker/dedup/Dockerfile new file mode 100644 index 0000000..4f97409 --- /dev/null +++ b/docker/dedup/Dockerfile @@ -0,0 +1,9 @@ +FROM ubuntu:latest +RUN apt-get update -q -y +RUN apt-get install samtools bedtools default-jre r-base python3 python3-pip curl build-essential -y +RUN python3 -m pip config set global.break-system-packages true +RUN ln -sf /usr/bin/python3 /usr/bin/python; ln -sf /usr/bin/pip3 /usr/bin/pip +RUN pip install MACS3 +RUN cd /usr/bin; curl -LJO https://github.com/broadinstitute/picard/releases/download/3.3.0/picard.jar +RUN echo "\npython3 -m pip config set global.break-system-packages true\n" >> /etc/bash.bashrc +RUN echo 'picard() {\n\tjava -Xmx$1 -jar /usr/bin/picard.jar "${@:2}"\n}\n' >> /etc/bash.bashrc \ No newline at end of file diff --git a/docker/uropa/Dockerfile b/docker/uropa/Dockerfile new file mode 100755 index 0000000..18c21aa --- /dev/null +++ b/docker/uropa/Dockerfile @@ -0,0 +1,5 @@ +FROM conda/miniconda3 +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge +RUN conda config --set channel_priority strict +RUN conda install bioconda::uropa diff --git a/src/run.py b/src/run.py index 9b66aab..bdb162c 100644 --- a/src/run.py +++ b/src/run.py @@ -13,7 +13,8 @@ fatal, which, exists, - err + err, + sanitize_slurm_env ) from . import version as __version__ @@ -235,9 +236,13 @@ def unpacked(nested_dict): """ # Iterate over all values of # given dictionary - for value in nested_dict.values(): + for key, value in nested_dict.items(): # Check if value is of dict type - if isinstance(value, dict): + # also exclude certain directories + dontcheck = ('userhome',) + # we exclude the /home directory so it does not interfere with + # container system + if isinstance(value, dict) and key not in dontcheck: # If value is dict then iterate # over all its values recursively for v in unpacked(value): @@ -677,22 +682,13 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None, if temp not in additional_bind_paths.split(','): addpaths.append(temp) bindpaths = ','.join(addpaths) - + # Set ENV variable 'SINGULARITY_CACHEDIR' # to output directory my_env = {}; my_env.update(os.environ) cache = os.path.join(outdir, ".singularity") my_env['SINGULARITY_CACHEDIR'] = cache my_env['APPTAINER_CACHEDIR'] = cache - # Removing R_SITE_LIB environment variable - # due to issue: https://github.com/OpenOmics/chrom-seek/issues/28 - # Using SINGULARITY_CONTAINALL or APPTAINER_CONTAINALL - # causes downstream using where $SLURM_JOBID is - # NOT exported within a container. - if 'R_LIBS_SITE' in my_env: - # functionally equivalent: - # unset R_LIBS_SITE - del my_env['R_LIBS_SITE'] if alt_cache: # Override the pipeline's default @@ -701,6 +697,8 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None, my_env['APPTAINER_CACHEDIR'] = alt_cache cache = alt_cache + my_env = sanitize_slurm_env(my_env) + if additional_bind_paths: # Add Bind PATHs for outdir and tmp dir if bindpaths: @@ -730,7 +728,7 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None, masterjob = subprocess.Popen([ 'snakemake', '-pr', '--rerun-incomplete', '--use-singularity', - '--singularity-args', "'-B {}'".format(bindpaths), + '--singularity-args', "\\-c \\-B '{}'".format(bindpaths), '--cores', str(threads), '--configfile=config.json' ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env) diff --git a/src/run.sh b/src/run.sh index 63463df..622a72e 100755 --- a/src/run.sh +++ b/src/run.sh @@ -212,23 +212,26 @@ function submit(){ fi cat << EOF > kickoff.sh #!/usr/bin/env bash -#SBATCH --cpus-per-task=16 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 #SBATCH --mem=64g +#SBATCH --partition=norm #SBATCH --time=5-00:00:00 #SBATCH --parsable #SBATCH -J "$2" #SBATCH --mail-type=BEGIN,END,FAIL #SBATCH --output "$3/logfiles/snakemake_${ts}.log" #SBATCH --error "$3/logfiles/snakemake_${ts}.log" + set -euo pipefail # Main process of pipeline snakemake --latency-wait 120 -s "$3/workflow/Snakefile" -d "$3" \\ - --use-singularity --singularity-args "'-B $4'" \\ + --use-singularity --singularity-args "\\-c \\-B '$4'" \\ --use-envmodules --configfile="$3/config.json" \\ --printshellcmds --cluster-config "$3/config/cluster.json" \\ --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \\ --rerun-incomplete --stats "$3/logfiles/runtime_statistics.json" \\ - --keep-remote --local-cores 14 2>&1 + --keep-remote --local-cores 30 2>&1 # Create summary report snakemake -d "$3" --report "Snakemake_Report.html" EOF diff --git a/src/utils.py b/src/utils.py index 9ef48d8..679c1d3 100644 --- a/src/utils.py +++ b/src/utils.py @@ -275,26 +275,6 @@ def check_cache(parser, cache, *args, **kwargs): return cache -def unpacked(nested_dict): - """Generator to recursively retrieves all values in a nested dictionary. - @param nested_dict dict[]: - Nested dictionary to unpack - @yields value in dictionary - """ - # Iterate over all values of given dictionary - for value in nested_dict.values(): - # Check if value is of dict type - if isinstance(value, dict): - # If value is dict then iterate over - # all its values recursively - for v in unpacked(value): - yield v - else: - # If value is not dict type then - # yield the value - yield value - - class Colors(): """Class encoding for ANSI escape sequeces for styling terminal text. Any string that is formatting with these styles must be terminated with @@ -358,10 +338,33 @@ def hashed(l): return h +def sanitize_slurm_env(my_env): + # Don't inherit TERM_PROGRAM cause its usually set by vscode + if 'TERM_PROGRAM' in my_env: + del my_env['TERM_PROGRAM'] + + # unset user locale, can't guarantee the user's locale on the + # container system + if 'LC_ALL' in my_env: + del my_env['LC_ALL'] + + # Removing R_SITE_LIB & R_LIBS_USER environment variable + # due to issue: https://github.com/OpenOmics/chrom-seek/issues/28 + # Using SINGULARITY_CONTAINALL or APPTAINER_CONTAINALL + # causes downstream using where $SLURM_JOBID is + # NOT exported within a container. + if 'R_LIBS_SITE' in my_env: + del my_env['R_LIBS_SITE'] + if 'R_LIBS_USER' in my_env: + del my_env['R_LIBS_USER'] + + return my_env + + if __name__ == '__main__': # Calculate MD5 checksum of entire file print('{} {}'.format(md5sum(sys.argv[0]), sys.argv[0])) - # Calcualte MD5 cehcksum of 512 byte chunck of file, + # Calcualte MD5 cehcksum of 512 byte chunk of file, # which is similar to following unix command: # dd if=utils.py bs=512 count=1 2>/dev/null | md5sum print('{} {}'.format(md5sum(sys.argv[0], first_block_only = True, blocksize = 512), sys.argv[0])) diff --git a/workflow/Snakefile b/workflow/Snakefile index 8faf4d1..7512a93 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,16 +1,19 @@ # Python standard library import datetime import json +from itertools import combinations from os.path import join from os import listdir # Local imports from scripts.common import provided, get_file_components from scripts.grouping import group_samples_by_reps, \ - group_output_files, zip_contrasts, get_peaktools + group_output_files, get_peaktools + configfile: "config.json" + # Global workflow variables today = str(datetime.datetime.today()).split()[0].replace('-', '') # YYYYMMDD samples = config['samples'] @@ -22,6 +25,13 @@ blocking = False if set(blocks.values()) in ({None}, {''} paired_end = False if config['project']['nends'] == 1 else True chips = config['project']['peaks']['chips'] contrast = config['project']['contrast'] + +# list of contrasts in {group1}_vs_{group2} +contrasts = [] +for group_list in contrast: + for combo in combinations(group_list, 2): + contrasts.append(f"{combo[0]}_vs_{combo[1]}") + chip2input = config['project']['peaks']['inputs'] has_inputs = False if set(chip2input.values()) in ({''}, {None}) else True groupdata = config['project']['groups'] @@ -29,8 +39,6 @@ peak_types = config['options']['peak_type_base'] rule_all_ins = [] groupdatawinput, groupswreps = group_samples_by_reps(groupdata, samples, chip2input) PeakTools = get_peaktools(assay) -zipGroup1, zipGroup2, zipToolC, contrasts \ - = zip_contrasts(contrast, PeakTools) file_stems, extRPGC, extaln = get_file_components(paired_end) groups = list(groupdatawinput.keys()) reps = True if len(groupswreps) > 0 else False @@ -59,6 +67,8 @@ cfTool_dir = join(workpath, "cfChIPtool") genrich_dir = join(workpath, "Genrich") MEME_dir = join(workpath, "MEME") manorm_dir = join(workpath, "MANorm") +dba_anno_path = join(workpath, 'dba_annotations') + # Read in resource information with open(join('config', 'cluster.json')) as fh: @@ -77,48 +87,28 @@ if assay == "cfchip": join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips )) - rule_all_ins.extend(expand( - join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=PeakTools, - name=chips, - _type=peak_types - )) - rule_all_ins.extend(expand( - join(uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool="DiffBindQC", - _type=peak_types - )) if has_inputs: - rule_all_ins.extend( - expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools) + rule_all_ins.append( + join(qc_dir, "AllSamples-macsNarrow", "AllSamples-macsNarrow_DiffBindQC_TMMcounts.bed") ) - rule_all_ins.extend(expand( - join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), - PeakTool=PeakTools - )) if reps: - rule_all_ins.extend(expand( - join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) - if blocking: - rule_all_ins.extend(expand( - join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) + # promotertable 1 + # rule_all_ins.extend(expand( + # join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), + # PeakTool=['macsNarrow'] + # )) if contrast: - rule_all_ins.extend(expand( - join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=['DiffbindEdgeR', 'DiffbindDeseq2'], - name=contrasts, - _type=["protTSS"] - )) - rule_all_ins.extend(expand( - join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), - PeakTool=PeakTools - )) + # promotertable 2 + # rule_all_ins.append(expand( + # join(uropa_dir, "promoterTable2", '{contrast}-{PeakTool}_DiffbindDeseq2_promoter_overlap_summaryTable.txt'), + # contrast=contrasts, PeakTool=PeakTools + # )) + # rule_all_ins.append(expand( + # join(uropa_diffbind_dir, "{contrast}-{PeakTool}-{diff_app}", '{contrast}-{PeakTool}_Diffbind_{diff_app}_promoter_overlap_summaryTable.txt'), + # contrast=contrasts, PeakTool=PeakTools, diff_app=['DeSeq2'] + # )) + pass else: - pass # remove manorm for now # rule_all_ins.extend(expand( # join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), @@ -132,8 +122,11 @@ if assay == "cfchip": # group2=zipGroup2, # tool=zipToolC # )) - + pass # remove this when manorm is added back elif assay in ["atac", "chip"]: + # turn off other peak types for now + # peak_types.extend(["prot", "protSEC", "genes"]) + # peak_types = list(set(peak_types)) # meme outputs turned off for now # if has_inputs: # rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) @@ -158,27 +151,6 @@ elif assay in ["atac", "chip"]: rule_all_ins.extend(expand( join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips )) - rule_all_ins.extend(expand( - join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=PeakTools, name=chips, _type=peak_types - )) - if reps: - rule_all_ins.extend(expand( - join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) - if blocking: - rule_all_ins.extend(expand( - join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) - if contrast: - rule_all_ins.extend(expand( - join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], - name=contrasts, - _type=["protTSS"], - )) else: pass # manorm turned off now @@ -197,15 +169,47 @@ elif assay in ["atac", "chip"]: rule_all_ins.append(join(workpath, "multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) rule_all_ins.extend( - expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples) + expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=chips) ) rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln)) -rule_all_ins.extend(expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips)) rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"])) if has_inputs: rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput)) - + +rule_all_ins.extend(expand( + join(uropa_dir, "{application}", "{name}_uropa_{_type}_allhits.txt"), + name=chips, + application=["macsNarrow", "macsBroad"], + _type=peak_types, +)) + +if reps and contrast: + rule_all_ins.extend(expand( + join(diffbind_dir, "{contrast}-{PeakTool}", "{contrast}-{PeakTool}_Diffbind_EdgeR.html"), + contrast=contrasts, PeakTool=PeakTools + )) + rule_all_ins.extend(expand( + join(diffbind_dir, "{contrast}-{PeakTool}", "{contrast}-{PeakTool}_Diffbind_DeSeq2.html"), + contrast=contrasts, PeakTool=PeakTools + )) + rule_all_ins.extend(expand( + join(uropa_dir, "DiffBind", "{contrast}-{PeakTool}-{differential_app}", "{contrast}_{PeakTool}_{differential_app}_{_type}_uropa_allhits.txt"), + contrast=contrasts, PeakTool=PeakTools, differential_app=['EdgeR'], _type=peak_types + )), + rule_all_ins.extend(expand( + join(dba_anno_path, "{contrast}_{PeakTool}_{_type}_uropa_allhits.txt"), + contrast=contrasts, PeakTool=PeakTools, _type=peak_types + )), + if blocking: + rule_all_ins.extend(expand( + join(diffbind_dir, "{contrast}-{PeakTool}", "{contrast}-{PeakTool}_Diffbind_blocking_EdgeR.html"), + contrast=contrasts, PeakTool=PeakTools + )) + rule_all_ins.extend(expand( + join(diffbind_dir, "{contrast}-{PeakTool}", "{contrast}-{PeakTool}_Diffbind_blocking_DeSeq2.html"), + contrast=contrasts, PeakTool=PeakTools + )) rule all: input: @@ -217,4 +221,5 @@ include: join("rules", "trim_align_dedup.smk") include: join("rules", "qc.smk") include: join("rules", "peakcall.smk") include: join("rules", "dba.smk") +include: join("rules", "uropa.smk") include: join("rules", "cfChIP.smk") \ No newline at end of file diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index 7020fba..470467f 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -9,7 +9,7 @@ bin_path = config['project']['binpath'] genome = config['options']['genome'] blocks = config['project']['blocks'] groupdata = config['project']['groups'] - +chips = config['project']['peaks']['chips'] # Directory end points bam_dir = join(workpath, "bam") @@ -67,15 +67,15 @@ rule cfChIPcompile: """ -rule promoterTable1: +rule promoterTable1_macsN: input: - expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_protTSS_allhits.txt'), PeakTool=PeakTools, name=chips), + expand(join(uropa_dir, "macsNarrow", "{name}_uropa_protTSS_allhits.txt"), name=chips) output: - txt = join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt") + txt = join(uropa_dir, "promoterTable1", "macsNarrow_promoter_overlap_summaryTable.txt") params: - rname = "promoter1", + rname = "promoterTable1", script = join(bin_path, "promoterAnnotation_by_Gene.R"), - infolder = join(uropa_dir, '{PeakTool}') + infolder = join(uropa_dir, "macsNarrow") container: config['images']['cfchip'] shell: @@ -86,11 +86,23 @@ rule promoterTable1: rule promoterTable2: input: - expand(join(uropa_diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts), + full_list = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_fullList.bed" + ), + peak_annos = [join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-DeSeq2", + "{group1}_vs_{group2}_{PeakTool}_DeSeq2_protTSS_uropa_allhits.txt") + for pk_type in peak_types], output: - txt = join(uropa_dir, "promoterTable2", 'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), + join( + uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-DeSeq2", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_DeSeq2_promoter_overlap_summaryTable.txt" + ), params: - rname = "promoter2", + rname = "promoterTable2", script1 = join(bin_path, "promoterAnnotation_by_Gene.R"), script2 = join(bin_path, "significantPathways.R"), infolder = workpath, @@ -99,28 +111,28 @@ rule promoterTable2: config['images']['cfchip'] shell: """ - Rscript -e "source('{params.script1}'); diffbindVersion('{params.infolder}','{output.txt}')"; - Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','KEGG')"; - Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')"; + Rscript -e "source('{params.script1}'); diffbindVersion('{params.infolder}', '{output}')"; + Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output}', '{params.gtf}', 'KEGG')"; + Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output}', '{params.gtf}', 'Reactome')"; """ -rule diffbindQC: +rule diffbindQC_macsN: input: - lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] + expand(join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), name=chips), output: - html = join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"), - bed = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), + html = join(qc_dir, "AllSamples-macsNarrow", "AllSamples-macsNarrow_DiffBindQC.html"), + bed = join(qc_dir, "AllSamples-macsNarrow", "AllSamples-macsNarrow_DiffBindQC_TMMcounts.bed"), params: - rname = "diffbindQC", + rname = "diffbindQC_macsN", contrast = "AllSamples", - PeakTool = "{PeakTool}", + PeakTool = "macsNarrow", rscript = join(bin_path, "DiffBind_v2_cfChIP_QC.Rmd"), - outdir = join(qc_dir, "AllSamples-{PeakTool}"), - csvfile = join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"), + outdir = join(qc_dir, "AllSamples-macsNarrow"), + csvfile = join(qc_dir, "AllSamples-macsNarrow", "AllSamples-macsNarrow_DiffBind_prep.csv"), pythonscript = join(bin_path, "prep_diffbindQC.py"), - PeakExtension = lambda w: PeakExtensions[w.PeakTool], - peakcaller = lambda w: FileTypesDiffBind[w.PeakTool], + PeakExtension = "_peaks.narrowPeak", + peakcaller = "narrowPeak", container: config['images']['cfchip'] shell: diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index eef1245..26f244d 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -1,15 +1,13 @@ -# Differential binding analysis rules -# ~~~~ -import os -import json from os.path import join +from textwrap import dedent +from itertools import combinations from scripts.common import allocated, mk_dir_if_not_exist from scripts.peakcall import outputIDR, zip_peak_files, \ calc_effective_genome_fraction, get_manorm_sizes from scripts.grouping import test_for_block -# ~~ workflow configuration +# ~~ workflow configuration & directories ~~ workpath = config["project"]["workpath"] bin_path = config["project"]["binpath"] genome = config["options"]["genome"] @@ -18,9 +16,9 @@ groupdata = config["project"]["groups"] contrast = config["project"]["contrast"] uropaver = config["tools"]["UROPAVER"] gtf = config["references"][genome]["GTFFILE"] - - -# ~~ directories +chips = config['project']['peaks']['chips'] +log_dir = join(workpath, "logfiles") +local_log_dir = join(log_dir, "local") diffbind_dir2 = join(workpath, "DiffBind_block") diffbind_dir = join(workpath, "DiffBind") uropa_dir = join(workpath, "UROPA_annotations") @@ -35,359 +33,388 @@ manorm_dir = join(workpath, "MANorm") downstream_dir = join(workpath, "Downstream") otherDirs = [qc_dir, homer_dir, uropa_dir] cfTool_dir = join(workpath, "cfChIPtool") -cfTool_subdir2 = join(cfTool_dir, "BED", "H3K4me3") +cfTool_subdir2 = join(cfTool_dir, "BED", "H3K4me3") +group_combos = [] - -# ~~ workflow switches +# ~~ workflow config ~~ +for (g1, g2) in list(combinations(config['project']['groups'].keys(), 2)): + group_combos.append(f"{g1}_vs_{g2}") blocking = False if set(blocks.values()) in ({None}, {""}) else True if reps == "yes": otherDirs.append(diffbind_dir) mk_dir_if_not_exist(PeakTools + otherDirs) -# ~~ peak calling configuration and outputs -PeakToolsNG = [tool for tool in PeakTools if tool != "gem"] -PeakExtensions = { - "macsNarrow": "_peaks.narrowPeak", - "macsBroad": "_peaks.broadPeak", - "sicer": "_broadpeaks.bed", - "gem": ".GEM_events.narrowPeak", - "MANorm": "_all_MA.bed", - "DiffbindEdgeR": "_Diffbind_EdgeR.bed", - "DiffbindDeseq2": "_Diffbind_Deseq2.bed", - "DiffbindEdgeRBlock": "_Diffbind_EdgeR_block.bed", - "DiffbindDeseq2Block": "_Diffbind_Deseq2_block.bed", - "Genrich": ".narrowPeak", - "DiffBindQC": "_DiffBindQC_TMMcounts.bed", -} - -FileTypesDiffBind = { - "macsNarrow": "narrowPeak", - "macsBroad": "narrowPeak", - "sicer": "bed", - "gem": "narrowPeak", - "Genrich": "narrowPeak", -} - -PeakExtensionsIDR = { - "macsNarrow": "_peaks.narrowPeak", - "macsBroad": "_peaks.broadPeak", - "sicer": "_sicer.broadPeak", -} - -FileTypesIDR = { - "macsNarrow": "narrowPeak", - "macsBroad": "broadPeak", - "sicer": "broadPeak", -} - -RankColIDR = {"macsNarrow": "q.value", "macsBroad": "q.value", "sicer": "q.value"} -IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = outputIDR( - groupswreps, groupdata, chip2input, PeakToolsNG -) -zipSample, zipTool, zipExt = zip_peak_files(chips, PeakTools, PeakExtensions) -contrastBlock = test_for_block(groupdata, contrast, blocks) -zipGroup1B, zipGroup2B, zipToolCB, contrastsB = zip_contrasts(contrastBlock, PeakTools) - - -# ~~ rules -rule init_diffbind: + +localrules: diffbind_csv_macsN, diffbind_csv_macsB, diffbind_csv_genrich + + +# ~~ differential binding analysis ~~ # +rule diffbind_count: input: - bed = lambda w: [ - join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) - for chip in chips - ], - output: csvfile = join( diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv", ), + output: + peak_counts = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_counts.rds" + ), + peak_list = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_fullList.bed" + ), params: - rname = "initialize_diffbind", - group1 = "{group1}", - group2 = "{group2}", + rname = "diffbind_count", + this_contrast = "{group1}_vs_{group2}", this_peaktool = "{PeakTool}", - peakcaller = lambda w: FileTypesDiffBind[w.PeakTool], - this_peakextension = lambda w: PeakExtensions[w.PeakTool], + this_script = join(bin_path, "DiffBind_v2_load.R"), + container: config["images"]["cfchip"] + shell: + dedent(""" + {params.this_script} \\ + --csvfile {input.csvfile} \\ + --counts {output.peak_counts} \\ + --list {output.peak_list} \\ + --peakcaller {params.this_peaktool} + """) + + + +rule diffbind_csv_macsN: + input: + bed = expand(join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), name=chips) + output: + csvfile = join( + diffbind_dir, + "{group1}_vs_{group2}-macsNarrow", + "{group1}_vs_{group2}-macsNarrow_Diffbind_prep.csv", + ), + params: + rname = "diffbind_csv_macsN", + this_peaktool = "macsNarrow", + peakcaller = "narrowPeak", + this_peakextension = "_peaks.narrowPeak", pythonscript = join(bin_path, "prep_diffbind.py"), bam_dir = bam_dir, workpath = workpath, - container: - config["images"]["cfchip"] - shell: - """ - python {params.pythonscript} \ - --g1 {params.group1} \ - --g2 {params.group2} \ - --wp {params.workpath} \ - --pt {params.this_peaktool} \ - --pe {params.this_peakextension} \ - --bd {params.bam_dir} \ - --pc {params.peakcaller} \ - --csv {output.csvfile} - """ - - -rule diffbind: + log: join(local_log_dir, "diffbind_csv_macsN", "{group1}_vs_{group2}_diffbind_csv.log") + run: + for i, contrast in enumerate(group_combos): + shell(dedent( + """ + python {params.pythonscript} \\ + --con \"""" + contrast + """\" \\ + --wp {params.workpath} \\ + --pt {params.this_peaktool} \\ + --pe {params.this_peakextension} \\ + --bd {params.bam_dir} \\ + --pc {params.peakcaller} \\ + --csv {output.csvfile} + """ + )) + + +rule diffbind_csv_genrich: input: + bed = expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips) + output: csvfile = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv", + "{group1}_vs_{group2}-Genrich", + "{group1}_vs_{group2}-Genrich_Diffbind_prep.csv", ), - bed = lambda w: [ - join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) - for chip in chips - ], + params: + rname = "diffbind_csv_genrich", + this_peaktool = "Genrich", + peakcaller = "narrowPeak", + this_peakextension = "_peaks.narrowPeak", + pythonscript = join(bin_path, "prep_diffbind.py"), + bam_dir = bam_dir, + workpath = workpath, + log: join(local_log_dir, "diffbind_csv_genrich", "{group1}_vs_{group2}_diffbind_csv.log") + run: + for i, contrast in enumerate(group_combos): + shell(dedent( + """ + python {params.pythonscript} \\ + --con \"""" + contrast + """\" \\ + --wp {params.workpath} \\ + --pt {params.this_peaktool} \\ + --pe {params.this_peakextension} \\ + --bd {params.bam_dir} \\ + --pc {params.peakcaller} \\ + --csv {output.csvfile} + """ + )) + + +rule diffbind_csv_macsB: + input: + beds = expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips), output: - diffbind_report = join( + csvfile = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind.html", + "{group1}_vs_{group2}-macsBroad", + "{group1}_vs_{group2}-macsBroad_Diffbind_prep.csv", ), - Deseq2 = join( + params: + rname = "diffbind_csv_macsB", + this_peaktool = "macsBroad", + peakcaller = "narrowPeak", + this_peakextension = "_peaks.broadPeak", + pythonscript = join(bin_path, "prep_diffbind.py"), + bam_dir = bam_dir, + workpath = workpath, + log: join(local_log_dir, "diffbind_csv_macsB", "{group1}_vs_{group2}_diffbind_csv.log") + run: + for i, contrast in enumerate(group_combos): + shell(dedent( + """ + python {params.pythonscript} \\ + --con "{wildcards.group1}_vs_{wildcards.group2}" \\ + --wp {params.workpath} \\ + --pt {params.this_peaktool} \\ + --pe {params.this_peakextension} \\ + --bd {params.bam_dir} \\ + --pc {params.peakcaller} \\ + --csv {output.csvfile} + """ + )) + + +# ~~ diffbind EdgeR DE analysis ~~ # +rule diffbind_edger: + input: + csvfile = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_prep.csv", ), - EdgeR = join( - diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed", + peak_counts = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_counts.rds" ), - EdgeR_txt = join( + output: + diffbind_report = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_EdgeR.html", ), - Deseq2_txt = join( + up_file = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_EdgeR_up.bed", ), - EdgeR_ftxt = join( + down_file = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_EdgeR_down.bed", ), - Deseq2_ftxt = join( - diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt", - ), - # consensus_pks = join( - # diffbind_dir, - # "{group1}_vs_{group2}-{PeakTool}", - # "{group1}_vs_{group2}-{PeakTool}_Diffbind_consensusPeaks.bed" - # ), + peak_list = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_EdgeR_peak_list.tab", + ) params: - rname = "diffbind", - this_peaktool = "{PeakTool}", - this_contrast = "{group1}_vs_{group2}", - this_peakextension = lambda w: PeakExtensions[w.PeakTool], - peakcaller = lambda w: FileTypesDiffBind[w.PeakTool], - rscript = join(bin_path, "DiffBind_v2_ChIPseq.Rmd"), - outdir = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"), + rname = "diffbind_edger", + rscript = join(bin_path, "DiffBind_v2_EdgeR.Rmd"), + outdir = join(diffbind_dir, "{contrast}-{PeakTool}"), container: config["images"]["cfchip"] shell: - """ + dedent(""" + if [ ! -d \"{tmpdir}\" ]; then mkdir -p \"{tmpdir}\"; fi + tmp=$(mktemp -d -p \"{tmpdir}\") + trap 'rm -rf "${{tmp}}"' EXIT + mkdir -p {params.outdir} cd {params.outdir} - Rscript -e 'rmarkdown::render("{params.rscript}", \ - output_file="{output.diffbind_report}", \ - params=list( \ - csvfile="{input.csvfile}", \ - contrasts="{params.this_contrast}", \ - peakcaller="{params.this_peaktool}" \ - ) \ - )' - """ - - -rule diffbind_blocking: + + cat <<'EOF' > ${{tmp}}/rscript.sh + #!/bin/bash + Rscript -e 'rmarkdown::render("{params.rscript}", output_file="{output.diffbind_report}", + params=list(csvfile="{input.csvfile}", peakcaller="{wildcards.PeakTool}", list_file="{output.peak_list}", + up_file="{output.up_file}", down_file="{output.down_file}", contrasts="{wildcards.contrast}", counts="{input.peak_counts}"))' + EOF + + chmod +x ${{tmp}}/rscript.sh + echo "--" + cat ${{tmp}}/rscript.sh + echo "--" + ls -al ${{tmp}} + sh ${{tmp}}/rscript.sh + """) + + +rule diffbind_edger_blocking: input: - csvfile = join( + csvfile = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_prep.csv", ), - bed = lambda w: [ - join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) - for chip in chips - ], + peak_counts = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_counts.rds"), output: diffbind_block_report = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_blocking_EdgeR.html", ), - Deseq2 = join( + peak_list = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.bed", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_block_EdgeR_peak_list.tab", ), - EdgeR = join( + params: + rname = "diffbind_edger_block", + blocking_rscript = join(bin_path, "DiffBind_v2_EdgeR_block.Rmd"), + outdir = join(diffbind_dir, "{contrast}-{PeakTool}"), + container: + config["images"]["cfchip"] + shell: + dedent(""" + if [ ! -d \"{tmpdir}\" ]; then mkdir -p \"{tmpdir}\"; fi + tmp=$(mktemp -d -p \"{tmpdir}\") + trap 'rm -rf "${{tmp}}"' EXIT + + mkdir -p {params.outdir} + cd {params.outdir} + + cat << EOF > ${{tmp}}/rscript.sh + #!/bin/bash + Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file="{output.diffbind_block_report}", + params=list(csvfile="{input.csvfile}", peakcaller="{wildcards.PeakTool}", list_file="{output.peak_list}", + contrasts="{wildcards.contrast}", counts="{input.peak_counts}"))' + EOF + + chmod +x ${{tmp}}/rscript.sh + echo "--" + cat ${{tmp}}/rscript.sh + echo "--" + ls -al ${{tmp}} + sh ${{tmp}}/rscript.sh + """) + + +# ~~ diffbind DeSeq2 DE analysis ~~ # +rule diffbind_deseq: + input: + csvfile = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.bed", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_prep.csv", ), - EdgeR_txt = join( + peak_counts = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.txt", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_counts.rds" ), - Deseq2_txt = join( + output: + diffbind_report = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.txt", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_DeSeq2.html", ), - EdgeR_ftxt = join( + up_file = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList_block.txt", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_Deseq2_up.bed", ), - Deseq2_ftxt = join( + down_file = join( diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList_block.txt", + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_Deseq2_down.bed", + ), + peak_list = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_Deseq2_peak_list.tab", ), - # consensus_pks = join( - # diffbind_dir, - # "{group1}_vs_{group2}-{PeakTool}", - # "{group1}_vs_{group2}-{PeakTool}_Diffbind_consensusPeaks_block.bed" - # ), params: - rname = "diffbind_block", - blocking_rscript = join(bin_path, "DiffBind_v2_ChIPseq_block.Rmd"), - outdir = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"), - this_peaktool = "{PeakTool}", - this_contrast = "{group1}_vs_{group2}", + rname = "diffbind_deseq2", + rscript = join(bin_path, "DiffBind_v2_Deseq2.Rmd"), + outdir = join(diffbind_dir, "{contrast}-{PeakTool}"), container: config["images"]["cfchip"] shell: - """ + dedent(""" + if [ ! -d \"{tmpdir}\\diffbind_deseq\" ]; then mkdir -p \"{tmpdir}\\diffbind_deseq\"; fi + tmp=$(mktemp -d -p \"{tmpdir}\\diffbind_deseq") + trap 'rm -rf "{tmpdir}\\diffbind_deseq"' EXIT + mkdir -p {params.outdir} cd {params.outdir} - Rscript -e 'rmarkdown::render("{params.blocking_rscript}", \ - output_file="{output.diffbind_block_report}", \ - params=list( \ - csvfile="{input.csvfile}", \ - contrasts="{params.this_contrast}", \ - peakcaller="{params.this_peaktool}" \ - ) \ - )' - """ + cat <<'EOF' > ${{tmp}}/rscript.sh + #!/bin/bash + Rscript -e 'rmarkdown::render("{params.rscript}", output_file="{output.diffbind_report}", + params=list(csvfile="{input.csvfile}", peakcaller="{wildcards.PeakTool}", list_file="{output.peak_list}", + up_file="{output.up_file}", down_file="{output.down_file}", contrasts="{wildcards.contrast}", counts="{input.peak_counts}"))' + EOF -localrules: UROPA_prep_in + chmod +x ${{tmp}}/rscript.sh + echo "--" + cat ${{tmp}}/rscript.sh + echo "--" + ls -al ${{tmp}} + sh ${{tmp}}/rscript.sh + """) -rule UROPA_prep_in: - input: - lambda w: join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]), - params: - fldr = join(uropa_dir, "{PeakTool1}"), - output: - json = join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}.{type}.json"), - run: - # Dynamically creates UROPA config file - if not os.path.exists(params.fldr): - os.mkdir(params.fldr, mode=0o775) - - json_construct = dict() - json_construct['queries'] = [] - json_construct['show_attributes'] = ["gene_id", "gene_name", "gene_type"] - json_construct["priority"] = "Yes" - json_construct['gtf'] = gtf - json_construct['bed'] = input[0] - - base_query = { - "feature": "gene", - "filter.attribute": "gene_type", - "attribute.value": "protein_coding", - "feature.anchor": "start" - } - - if assay == 'cfchip': - if wildcards.type == 'protTSS': - for _d in (3000, 10000, 100000): - this_q = base_query.copy() - this_q['distance'] = _d - json_construct['queries'].append(this_q) - else: - if wildcards.type == 'prot': - for _d in (5000, 100000): - this_q = base_query.copy() - del this_q["feature.anchor"] - this_q['distance'] = _d - json_construct['queries'].append(this_q) - elif wildcards.type == 'genes': - this_query = {} - this_query['feature'] = 'gene' - for _d in (5000, 100000): - this_q = base_query.copy() - del this_q["feature.anchor"] - del this_q["filter.attribute"] - del this_q["attribute.value"] - this_q['distance'] = _d - json_construct['queries'].append(this_q) - elif wildcards.type == 'protSEC': - # distance, feature.anchor - query_values = ( - ([3000, 1000], "start"), - (3000, "end"), - (100000, "center"), - (100000, None) - ) - for _distance, feature_anchor in query_values: - this_q = base_query.copy() - del this_q["feature.anchor"] - if feature_anchor: - this_q["feature.anchor"] = feature_anchor - this_q['distance'] = _distance - json_construct['queries'].append(this_q) - elif wildcards.type == 'protTSS': - for _d in ([3000, 1000], 10000, 100000): - this_q = base_query.copy() - this_q['distance'] = _d - json_construct['queries'].append(this_q) - - with open(output.json, 'w') as jo: - json.dump(json_construct, jo, indent=4) - jo.close() - - if not os.path.exists(output.json): - raise FileNotFoundError(output.json + " does not exist!") - - - -rule UROPA: +rule diffbind_deseq_blocking: input: - json = join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}.{type}.json"), + csvfile = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_prep.csv", + ), + peak_counts = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_counts.rds" + ), output: - txt = join( - uropa_dir, - "{PeakTool1}", - "{name}_{PeakTool2}_uropa_{type}_allhits.txt" + diffbind_block_report = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_blocking_DeSeq2.html", + ), + peak_list = join( + diffbind_dir, + "{contrast}-{PeakTool}", + "{contrast}-{PeakTool}_Diffbind_block_Deseq2_peak_list.tab", ), - bed1 = temp(join( - uropa_dir, - "{PeakTool1}", - "{name}_{PeakTool2}_uropa_{type}_allhits.bed" - )), - bed2 = temp(join( - uropa_dir, - "{PeakTool1}", - "{name}_{PeakTool2}_uropa_{type}_finalhits.bed", - )), - log = join(uropa_dir, "{PeakTool1}", "{name}_{PeakTool2}_uropa_{type}.log") params: - rname="uropa", - outroot=join(uropa_dir, "{PeakTool1}", "{name}_{PeakTool2}_uropa_{type}"), - threads: 4 + rname = "diffbind_deseq_block", + blocking_rscript = join(bin_path, "DiffBind_v2_Deseq2_block.Rmd"), + outdir = join(diffbind_dir, "{contrast}-{PeakTool}"), container: - config["images"]["uropa"] + config["images"]["cfchip"] shell: - """ - uropa -i {input.json} -p {params.outroot} -l {output.log} -t {threads} -s - """ \ No newline at end of file + dedent(""" + if [ ! -d \"{tmpdir}\" ]; then mkdir -p \"{tmpdir}\"; fi + tmp=$(mktemp -d -p \"{tmpdir}\") + trap 'rm -rf "${{tmp}}"' EXIT + + mkdir -p {params.outdir} + cd {params.outdir} + cat <<'EOF' > ${{tmp}}/rscript.sh + #!/bin/bash + Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file="{output.diffbind_block_report}", + params=list(csvfile="{input.csvfile}", peakcaller="{wildcards.PeakTool}", list_file="{output.peak_list}", + contrasts="{wildcards.contrast}", counts="{input.peak_counts}"))' + EOF + + chmod +x ${{tmp}}/rscript.sh + echo "--" + cat ${{tmp}}/rscript.sh + echo "--" + sh ${{tmp}}/rscript.sh + """) diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index fd8a520..5b56e0c 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -4,7 +4,7 @@ # Common quality-control rules: preseq, NRF, rawfastqc, # fastqc, fastq_screen, multiQC from os.path import join -from scripts.peakcall import get_control_input, getMacTXT, getMacChip, \ +from scripts.peakcall import get_control_input, \ getSicerChips, getSicerFragLen, get_control_input @@ -89,8 +89,10 @@ rule genrich: rule MACS2_narrow: input: - chip = lambda w: getMacChip(bam_dir, w.name, paired_end), - txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), + chip = join(bam_dir, "{name}.Q5DD.bam") if paired_end \ + else join(bam_dir, "{name}.Q5DD_tagAlign.gz"), + txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") if paired_end \ + else join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), @@ -98,7 +100,7 @@ rule MACS2_narrow: rname = 'MACS2_narrow', gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "" + flag = lambda w: "-c" if chip2input[w.name] else "", shell: """ module load {params.macsver}; @@ -128,8 +130,10 @@ rule MACS2_narrow: rule MACS2_broad: input: - chip = lambda w: getMacChip(bam_dir, w.name, paired_end), - txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), + chip = join(bam_dir, "{name}.Q5DD.bam") if paired_end \ + else join(bam_dir, "{name}.Q5DD_tagAlign.gz"), + txt = join(ppqt_dir, "{name}.Q5DD.ppqt.txt") if paired_end \ + else join(ppqt_dir, "{name}.Q5DD_tagAlign.ppqt.txt"), c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), @@ -137,7 +141,7 @@ rule MACS2_broad: rname = 'MACS2_broad', gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], macsver = config['tools']['MACSVER'], - flag = lambda w: "-c" if chip2input[w.name] else "" + flag = lambda w: "-c" if chip2input[w.name] else "", shell: """ module load {params.macsver}; diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 82b4376..317c8af 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -395,15 +395,77 @@ rule deeptools_QC: """ -rule FRiP: +rule FRiP_macsN: input: - bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], + bed = expand(join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), name=chips), bam = join(bam_dir, "{name}.Q5DD.bam"), output: - join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), + join(workpath, "PeakQC", "macsNarrow.{name}.Q5DD.FRiP_table.txt"), params: - rname = "frip", - outroot = lambda w: join(peakqc_dir, w.PeakTool), + rname = "FRiP_macsN", + outroot = join(peakqc_dir, "macsNarrow"), + script = join(bin_path, "frip.py"), + genome = config['references'][genome]['REFLEN'], + tmpdir = tmpdir, + container: + config['images']['python'] + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + python {params.script} \\ + -p {input.bed} \\ + -b {input.bam} \\ + -g {params.genome} \\ + -o {params.outroot} + """ + + +rule FRiP_Genrich: + input: + bed = expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips), + bam = join(bam_dir, "{name}.Q5DD.bam"), + output: + join(workpath, "PeakQC", "Genrich.{name}.Q5DD.FRiP_table.txt"), + params: + rname = "FRiP_macsN", + outroot = join(peakqc_dir, "macsNarrow"), + script = join(bin_path, "frip.py"), + genome = config['references'][genome]['REFLEN'], + tmpdir = tmpdir, + container: + config['images']['python'] + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + python {params.script} \\ + -p {input.bed} \\ + -b {input.bam} \\ + -g {params.genome} \\ + -o {params.outroot} + """ + + +rule FRiP_macsB: + input: + bed = expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips), + bam = join(bam_dir, "{name}.Q5DD.bam"), + output: + join(workpath, "PeakQC", "macsBroad.{name}.Q5DD.FRiP_table.txt"), + params: + rname = "FRiP_macsB", + outroot = join(peakqc_dir, "macsBroad"), script = join(bin_path, "frip.py"), genome = config['references'][genome]['REFLEN'], tmpdir = tmpdir, diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 0642ce0..1a809f7 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -63,7 +63,7 @@ rule trim: javaram = "64g", sample = "{name}", threads: - 16 + 32 shell: """ module load {params.cutadaptver}; @@ -270,7 +270,7 @@ rule dedup: tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT - if [ "{assay}" == "cfchip" ];then + if [ "{assay}" == "cfchip" ]; then java -Xmx{params.javaram} \\ -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ -I {input.bam2} \\ diff --git a/workflow/rules/uropa.smk b/workflow/rules/uropa.smk new file mode 100644 index 0000000..0cb80b4 --- /dev/null +++ b/workflow/rules/uropa.smk @@ -0,0 +1,239 @@ +# ~~ Differential binding analysis rules ~~ +from os.path import join, sep +from textwrap import dedent + +localrules: UROPA_prep_in_macsB, UROPA_prep_in_macsN, \ + UROPA_prep_diffbind_edgeR, UROPA_prep_diffbind_DeSeq2, \ + rename_edger_dba_uropa + + +workpath = config['project']['workpath'] +uropa_dir = join(workpath, "UROPA_annotations") +uropa_diffbind_dir = join(uropa_dir, "DiffBind") +diffbind_dir = join(workpath, "DiffBind") +dba_anno_path = join(workpath, 'dba_annotations') + + +rule UROPA_prep_diffbind_edgeR: + input: + join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_fullList.bed" + ), + params: + rname = "UROPA_prep_in_diffbind", + this_script = join(bin_path, "uropa_input.py"), + this_gtf = gtf, + this_assay = assay, + peak_types = ' '.join(peak_types), + output: + this_json = [join( + uropa_diffbind_dir, + "{group1}_vs_{group2}.{PeakTool}.DiffBind.EdgeR." + pktype + ".json" + ) for pktype in peak_types], + log: join(local_log_dir, "UROPA_prep_in_diffbind", "{group1}_vs_{group2}-{PeakTool}_diffbind_prep.log") + shell: + dedent(""" + {params.this_script} \\ + -g {params.this_gtf} \\ + -o {output.this_json} \\ + -a {params.this_assay} \\ + -b {input} \\ + -t {params.peak_types} + """) + + +rule UROPA_prep_diffbind_DeSeq2: + input: + join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_fullList.bed" + ), + params: + rname = "UROPA_prep_in_diffbind", + this_script = join(bin_path, "uropa_input.py"), + this_gtf = gtf, + this_assay = assay, + peak_types = ' '.join(peak_types), + output: + this_json = [join( + uropa_diffbind_dir, + "{group1}_vs_{group2}.{PeakTool}.DiffBind.DeSeq2." + pktype + ".json" + ) for pktype in peak_types], + + log: join(local_log_dir, "UROPA_prep_in_diffbind", "{group1}_vs_{group2}-{PeakTool}_diffbind_prep.log") + shell: + dedent(""" + {params.this_script} \\ + -g {params.this_gtf} \\ + -o {output.this_json} \\ + -a {params.this_assay} \\ + -b {input} \\ + -t {params.peak_types} + """) + + +rule UROPA_diffbind: + input: + json = join( + uropa_dir, + "DiffBind", + "{group1}_vs_{group2}.{PeakTool}.DiffBind.{differential_app}.{_type}.json" + ), + params: + rname = "UROPA_diffbind", + outroot = join(uropa_diffbind_dir, "{group1}_vs_{group2}-{PeakTool}-{differential_app}"), + uropa_prefix = "{group1}_vs_{group2}_{PeakTool}_{differential_app}_{_type}_uropa", + output: + allhits_txt = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-{differential_app}", + "{group1}_vs_{group2}_{PeakTool}_{differential_app}_{_type}_uropa_allhits.txt"), + allhits_bed = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-{differential_app}", + "{group1}_vs_{group2}_{PeakTool}_{differential_app}_{_type}_uropa_allhits.bed"), + finalhits_txt = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-{differential_app}", + "{group1}_vs_{group2}_{PeakTool}_{differential_app}_{_type}_uropa_finalhits.txt"), + finalhits_bed = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-{differential_app}", + "{group1}_vs_{group2}_{PeakTool}_{differential_app}_{_type}_uropa_finalhits.bed"), + summary = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-{differential_app}", + "{group1}_vs_{group2}_{PeakTool}_{differential_app}_{_type}_uropa_summary.pdf"), + json = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-{differential_app}", + "{group1}_vs_{group2}_{PeakTool}_{differential_app}_{_type}_uropa.json"), + threads: int(allocated("threads", "UROPA_diffbind", cluster)), + container: config["images"]["uropa"] + shell: + dedent(""" + uropa \\ + -i {input} \\ + -o {params.outroot} \\ + -p {params.uropa_prefix} \\ + -t {threads} -s + """) + +rule rename_edger_dba_uropa: + input: + allhits_txt = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-EdgeR", + "{group1}_vs_{group2}_{PeakTool}_EdgeR_{_type}_uropa_allhits.txt"), + allhits_bed = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-EdgeR", + "{group1}_vs_{group2}_{PeakTool}_EdgeR_{_type}_uropa_allhits.bed"), + finalhits_txt = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-EdgeR", + "{group1}_vs_{group2}_{PeakTool}_EdgeR_{_type}_uropa_finalhits.txt"), + finalhits_bed = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-EdgeR", + "{group1}_vs_{group2}_{PeakTool}_EdgeR_{_type}_uropa_finalhits.bed"), + summary = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-EdgeR", + "{group1}_vs_{group2}_{PeakTool}_EdgeR_{_type}_uropa_summary.pdf"), + json = join(uropa_diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}-EdgeR", + "{group1}_vs_{group2}_{PeakTool}_EdgeR_{_type}_uropa.json"), + output: + allhits_txt = join(dba_anno_path, "{group1}_vs_{group2}_{PeakTool}_{_type}_uropa_allhits.txt"), + allhits_bed = join(dba_anno_path, "{group1}_vs_{group2}_{PeakTool}_{_type}_allhits.bed"), + finalhits_txt = join(dba_anno_path, "{group1}_vs_{group2}_{PeakTool}_{_type}_finalhits.txt"), + finalhits_bed = join(dba_anno_path, "{group1}_vs_{group2}_{PeakTool}_{_type}_finalhits.bed"), + summary = join(dba_anno_path, "{group1}_vs_{group2}_{PeakTool}_{_type}_summary.pdf"), + json = join(dba_anno_path, "{group1}_vs_{group2}_{PeakTool}_{_type}.json") + shell: + """ + cp {input.allhits_txt} {output.allhits_txt} + cp {input.allhits_bed} {output.allhits_bed} + cp {input.finalhits_txt} {output.finalhits_txt} + cp {input.finalhits_bed} {output.finalhits_bed} + cp {input.summary} {output.summary} + cp {input.json} {output.json} + """ + + +# ~~ macs Narrow peak annotation ~~ # +rule UROPA_prep_in_macsN: + input: + join(macsN_dir, "{name}", "{name}_peaks.narrowPeak") + params: + rname = "UROPA_prep_in_macsN", + this_script = join(bin_path, "uropa_input.py"), + this_gtf = gtf, + this_assay = assay, + peak_types = ' '.join(peak_types), + output: + this_json = [join( + uropa_dir, + "macsNarrow", + "{name}.macsNarrow." + pktype + ".json" + ) for pktype in peak_types], + log: join(local_log_dir, "UROPA_prep_in_macsN", "{name}_uropa_prep.log") + shell: + dedent(""" + {params.this_script} \\ + -g {params.this_gtf} \\ + -o {output.this_json} \\ + -a {params.this_assay} \\ + -b {input} \\ + -t {params.peak_types} + """) + + +rule UROPA_macsNarrow: + input: join(uropa_dir, "macsNarrow", "{name}.macsNarrow.{_type}.json") + params: + rname = "UROPA_macsN", + outroot = join(uropa_dir, "macsNarrow"), + output: + allhits_txt = join(uropa_dir, "macsNarrow", "{name}_uropa_{_type}_allhits.txt"), + allhits_bed = join(uropa_dir, "macsNarrow", "{name}_uropa_{_type}_allhits.bed"), + finalhits_txt = join(uropa_dir, "macsNarrow", "{name}_uropa_{_type}_finalhits.txt"), + finalhits_bed = join(uropa_dir, "macsNarrow", "{name}_uropa_{_type}_finalhits.bed"), + threads: int(allocated("threads", "UROPA_macsN", cluster)), + container: config["images"]["uropa"] + shell: "uropa -i {input} -p {wildcards.name}_uropa_{wildcards._type} -o {params.outroot} -t {threads} -s" + + +# ~~ macs Broad peak annotation ~~ # +rule UROPA_prep_in_macsB: + input: + join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), + params: + rname = "UROPA_prep_in_macsB", + this_script = join(bin_path, "uropa_input.py"), + this_gtf = gtf, + this_assay = assay, + peak_types = ' '.join(peak_types), + output: + this_json = [join( + uropa_dir, + "macsBroad", + "{name}.macsBroad." + pktype + ".json" + ) for pktype in peak_types], + shell: + dedent(""" + {params.this_script} \\ + -g {params.this_gtf} \\ + -o {output.this_json} \\ + -a {params.this_assay} \\ + -b {input} \\ + -t {params.peak_types} + """) + + +rule UROPA_macsBroad: + input: join(uropa_dir, "macsBroad", "{name}.macsBroad.{_type}.json") + params: + rname = "UROPA_macsB", + outroot = join(uropa_dir, "macsBroad"), + output: + allhits_txt = join(uropa_dir, "macsBroad", "{name}_uropa_{_type}_allhits.txt"), + allhits_bed = join(uropa_dir, "macsBroad", "{name}_uropa_{_type}_allhits.bed"), + finalhits_txt = join(uropa_dir, "macsBroad", "{name}_uropa_{_type}_finalhits.txt"), + finalhits_bed = join(uropa_dir, "macsBroad", "{name}_uropa_{_type}_finalhits.bed"), + threads: int(allocated("threads", "UROPA_macsBroad", cluster)), + container: config["images"]["uropa"], + shell: "uropa -i {input} -p {wildcards.name}_uropa_{wildcards._type} -o {params.outroot} -t {threads} -s" diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py index ee02b3e..6739fe6 100644 --- a/workflow/scripts/grouping.py +++ b/workflow/scripts/grouping.py @@ -51,26 +51,12 @@ def group_output_files(extensions, groupslist, inputnorm): return dtoolgroups, dtoolext -def zip_contrasts(contrast, PeakTools): - """ - making output file names for differential binding analyses - """ - zipGroup1, zipGroup2, zipTool, contrasts = [], [], [], [] - for g1, g2 in contrast: - for PeakTool in PeakTools: - zipGroup1.append(g1) - zipGroup2.append(g2) - zipTool.append(PeakTool) - contrasts.append( g1 + "_vs_" + g2 + "-" + PeakTool ) - return(zipGroup1, zipGroup2, zipTool, contrasts) - - def get_peaktools(assay_type): tools = ["macsNarrow"] if assay_type == "atac": tools.append("Genrich") elif assay_type == "chip": - tools.extend(["macsBroad"]) + tools.append("macsBroad") # turn sicer off for now # tools.extend(["macsBroad", "sicer"]) return tools diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py index 0d86cff..1567a55 100644 --- a/workflow/scripts/peakcall.py +++ b/workflow/scripts/peakcall.py @@ -73,22 +73,6 @@ def calc_effective_genome_fraction(effectivesize, genomefile): return(str(float(effectivesize)/ genomelen)) -def getMacChip(bam_dir, name, paired_end): - if paired_end: - chip = join(bam_dir, name + ".Q5DD.bam") - else: - chip = join(bam_dir, name + ".Q5DD_tagAlign.gz") - return chip - - -def getMacTXT(ppqt_dir, name, paired_end): - if paired_end: - txt = join(ppqt_dir, name + ".Q5DD.ppqt.txt") - else: - txt = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt") - return txt - - def getSicerChips(bam_dir, name, paired_end): if paired_end: chip = join(bam_dir, name + ".Q5DD.bam")