diff --git a/bin/DiffBind_v2_ChIPseq.Rmd b/bin/DiffBind_v2_ChIPseq.Rmd deleted file mode 100755 index 400aa95..0000000 --- a/bin/DiffBind_v2_ChIPseq.Rmd +++ /dev/null @@ -1,297 +0,0 @@ ---- -title: "DiffBind: ChIP-seq pipeline" -output: - html_document: - toc: true - toc_float: - collapsed: false - number_sections: true - toc_depth: 3 - fig_width: 7 - fig_height: 6 -params: - csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" ---- - - - -```{r, include=FALSE, warning=FALSE, message=FALSE} -# inputs -dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") -csvfile <- params$csvfile -outbase <- dirname(csvfile) -contrasts <- params$contrasts -peakcaller <- params$peakcaller - -# file output suffixes -cp_bed <- "_Diffbind_consensusPeaks.bed" -edger_txt <- "_Diffbind_EdgeR.txt" -deseq2_txt <- "_Diffbind_Deseq2.txt" -edger_bed <- "_Diffbind_EdgeR.bed" -deseq2_bed <- "_Diffbind_Deseq2.bed" -deseq2_bed_fullist <- "_Diffbind_Deseq2_fullList.txt" -edger_bed_fullist <- "_Diffbind_EdgeR_fullList.txt" - -# knitr options -knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) - -# libraries -suppressMessages(library(DT)) -suppressMessages(library(DiffBind)) -suppressMessages(library(parallel)) -``` - -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* - -# Peak Data -Read in sample sheet information and peak information -```{r samples} -samples <- dba(sampleSheet=csvfile) -consensus <- dba.peakset(samples,consensus=DBA_CONDITION) -print(samples) -``` - -## Correlation heatmap: Only peaks -Pearson correlation of peak positions: all samples versus all samples -```{r heatmap1} -try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Only peaks -Variance of peak positions -```{r PCA1, fig.height=5,fig.width=5} -try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) -``` - -## Overlapping peak counts -Number of overlapping 
peaks. -If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where -the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different -from the consensus peak set used for differential analyses. -```{r Venn, fig_height=4} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples,1:nrow(samples$samples)) -} else { - dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") - try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) - try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) -} -``` - -# Consensus peaks and counts -Consensus peaks are peaks found in at least two samples, independent of condition. -FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. -```{r peaksORsummits} -if ( grepl("narrow",samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad",samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} -``` - -```{r DBcount} -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits=250) -} else { - DBdataCounts <- dba.count(samples) -} -print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller, cp_bed) -consensus2 <- dba.peakset(DBdataCounts, bRetrieve=T) -consensus2$name <- paste0("Peak", 1:length(consensus2)) -#rtracklayer::export(consensus2,outfile2) -``` - -## Correlation heatmap: Peaks and reads -Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples -```{r heatmap2} 
-try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## Heatmap: Average signal across each peak -1000 most variable consensus peaks (library-size normalized counts) -```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Peaks and reads -Variation of library-size normalized counts of consensus peaks -```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) -``` - -# Set Up Contrast -Contrast is Group1 - Group2. -```{r contrast} -DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION) -print(DBdatacontrast) -``` - -# Differential Analysis -This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most -projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are -not changing between the two conditions. EdgeR also assumes that there are equal numbers -of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR -is especially useful when this assumption is true or when there are large differences in -library size across samples. All concentrations are on log2 scale. - -```{r analyze} -DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) -``` - -```{r report} -DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) -``` - -## PCA {.tabset .tabset-fade} -Variance of differential peaks only - -### DeSeq2 {-} -```{r PCA3, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r PCA4, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE) -``` - -## MA plot {.tabset .tabset-fade} -"Log concentration" means average concentration across all samples. 
-Each dot is a consensus peak. - -### DeSeq2 {-} -```{r MA_D} -try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r MA_E} -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) -``` - -## Volcano plot {.tabset .tabset-fade} -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r Volcano1} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r Volcano2} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) -``` - -## Heatmap: Differential {.tabset .tabset-fade} -1000 most significant differential peaks (Deseq2 or EdgeR normalized) - -### DeSeq2 {-} -```{r heatmap4D} -try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -### EdgeR {-} -```{r heatmap4E} -try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -## Top 500 differentially bound peaks {.tabset .tabset-fade} -### DeSeq2 {-} -```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, deseq2_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, deseq2_bed) -DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) - -tryDeseqExport <- function(DBReportDeseq2, outfile2) { - tryCatch( - { - rtracklayer::export(DBReportDeseq2, outfile2) - }, - error = function(cond) { - print("ERROR: Failed to export DeSeq bed file `rtracklayer::export(DBReportDeseq2, outfile2)`, output blank file") - write.table(outfile2, file='empty', col.names=FALSE) - } - ) -} - -tryDeseqExport(DBReportDeseq2, file.path(outbase, outfile2)) -write.table(DBReportDeseq2, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) -D2i <- length(DBReportDeseq2) -if (D2i == 0) { - i=1 -} else if (D2i > 500) { - i=500 -} else { - i=D2i -} -try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) - -report2 <- 
dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, deseq2_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -### EdgeR {-} -```{r EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller, edger_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, edger_bed) -DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) - -tryEdgeRExport <- function(edger_report, fout) { - tryCatch( - { - rtracklayer::export(edger_report, fout) - }, - error = function(cond) { - print("ERROR: Failed to export EdgeR bed file `rtracklayer::export(edger_report, fout))`, output blank file") - write.table(fout, file='empty', col.names=FALSE) - } - ) -} - -tryEdgeRExport(DBReportEdgeR, file.path(outbase, outfile2)) -write.table(DBReportEdgeR, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) - -Ei <- length(DBReportEdgeR) -if (Ei == 0) { - i=1 -} else if (Ei > 500) { - i=500 -} else { - i=Ei -} -try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, edger_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -## R tool version information -```{r Info} -sessionInfo() -``` - -
diff --git a/bin/DiffBind_v2_ChIPseq_block.Rmd b/bin/DiffBind_v2_ChIPseq_block.Rmd deleted file mode 100755 index 68b9f6a..0000000 --- a/bin/DiffBind_v2_ChIPseq_block.Rmd +++ /dev/null @@ -1,304 +0,0 @@ ---- -title: "DiffBind: ChIP-seq pipeline, paired/blocked analysis" -output: - html_document: - toc: true - toc_float: - collapsed: false - number_sections: true - toc_depth: 3 - fig_width: 7 - fig_height: 6 -params: - csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" ---- - - - -```{r, include=FALSE, warning=FALSE, message=FALSE} -# global variables -dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") -csvfile <- params$csvfile -outbase <- dirname(csvfile) -contrasts <- params$contrasts -peakcaller <- params$peakcaller - -# file output suffixes -cp_bed <- "_Diffbind_consensusPeaks_block.bed" -edger_txt <- "_Diffbind_EdgeR_block.txt" -deseq2_txt <- "_Diffbind_Deseq2_block.txt" -edger_bed <- "_Diffbind_EdgeR_block.bed" -deseq2_bed <- "_Diffbind_Deseq2_block.bed" -deseq2_bed_fullist <- "_Diffbind_Deseq2_fullList_block.txt" -edger_bed_fullist <- "_Diffbind_EdgeR_fullList_block.txt" - -# knittr configuration -knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) - -# load libs -suppressMessages(library(DT)) -suppressMessages(library(DiffBind)) -suppressMessages(library(parallel)) -``` - -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* - -# Peak Data -Read in sample sheet information and peak information -```{r samples} -samples <- dba(sampleSheet=csvfile) -consensus <- dba.peakset(samples, consensus=DBA_CONDITION) -print(samples) -``` - -## Correlation heatmap: Only peaks -Pearson correlation of peak positions: all samples versus all samples -```{r heatmap1} -try(dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1), silent=TRUE) -``` - -## PCA: Only peaks -Variance of peak positions -```{r PCA1, fig.height=5,fig.width=5} 
-try(dba.plotPCA(samples,DBA_CONDITION), silent=TRUE) -``` - -## Overlapping peak counts -Number of overlapping peaks. -If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where -the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different -from the consensus peak set used for differential analyses. -```{r Venn, fig_height=4} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples, 1:nrow(samples$samples)) -} else { - dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") - try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"), silent=TRUE) - try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"), silent=TRUE) -} -``` - -# Consensus peaks and counts -Consensus peaks are peaks found in at least two samples, independent of condition. -FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. 
-```{r peaksORsummits} -if ( grepl("narrow", samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad", samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} -``` - -```{r DBcount} -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits=250) -} else { - DBdataCounts <- dba.count(samples) -} -print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller, cp_bed) -consensus2 <- dba.peakset(DBdataCounts, bRetrieve=T) -consensus2$name <- paste0("Peak", 1:length(consensus2)) -#rtracklayer::export(consensus2, outfile2) -``` - -## Correlation heatmap: Peaks and reads -Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples -```{r heatmap2} -try(dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1),silent=TRUE) -``` - -## Heatmap: Average signal across each peak -1000 most variable consensus peaks (library-size normalized counts) -```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1),silent=TRUE) -``` - -## PCA: Peaks and reads -Variation of library-size normalized counts of consensus peaks -```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts, DBA_CONDITION), silent=TRUE) -``` - -# Set Up Contrast -Contrast is Group1 - Group2. -```{r contrast} -DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION, - block=DBA_TREATMENT) -print(DBdatacontrast) -``` - -# Differential Analysis -This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most -projects, Deseq2 is the optimal tool. 
Both tools assume that the majority of peaks are -not changing between the two conditions. EdgeR also assumes that there are equal numbers -of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR -is especially useful when this assumption is true or when there are large differences in -library size across samples. All concentrations are on log2 scale. - -```{r analyze} -DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) -``` - -```{r report} -DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK) -``` - -## PCA {.tabset .tabset-fade} -Variance of differential peaks only - -### DeSeq2 {-} -```{r PCA3, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK),silent=TRUE) -``` - -### EdgeR {-} -```{r PCA4, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK),silent=TRUE) -``` - -## MA plot {.tabset .tabset-fade} -"Log concentration" means average concentration across all samples. -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r MA_D} -try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE) -``` - -### EdgeR {-} -```{r MA_E} -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) -``` - -## Volcano plot {.tabset .tabset-fade} -Each dot is a consensus peak. 
- -### DeSeq2 {-} -```{r Volcano1} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK), silent=TRUE) -``` - -### EdgeR {-} -```{r Volcano2} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK), silent=TRUE) -``` - -## Heatmap: Differential {.tabset .tabset-fade} -1000 most significant differential peaks (Deseq2 or EdgeR normalized) - -### DeSeq2 {-} -```{r heatmap4D} -try(dba.plotHeatmap(DBAnalysisDeseq2, contrast=1, method = DBA_DESEQ2_BLOCK, - correlations=FALSE, margin=20, cexRow=1, cexCol=1), silent=TRUE) -``` - -### EdgeR {-} -```{r heatmap4E} -try(dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK, - correlations=FALSE, margin=20, cexRow=1, cexCol=1), silent=TRUE) -``` - -## Top 500 differentially bound peaks {.tabset .tabset-fade} -### DeSeq2 {-} -```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, deseq2_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, deseq2_bed) -DBReportDeseq2$name <- paste0("Peak", 1:length(DBReportDeseq2)) - -tryDeseqExport <- function(DBReportDeseq2, outfile2) { - tryCatch( - { - rtracklayer::export(DBReportDeseq2, outfile2) - }, - error = function(cond) { - print("ERROR: Failed to export DeSeq bed file `rtracklayer::export(DBReportDeseq2, outfile2)`, output blank file") - write.table(outfile2, file='empty', col.names=FALSE) - } - ) -} - -tryDeseqExport(DBReportDeseq2, file.path(outbase, outfile2)) - -write.table(DBReportDeseq2, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) -D2i <- length(DBReportDeseq2) -if (D2i == 0) { - i=1 -} else if (D2i > 500) { - i=500 -} else { - i=D2i -} -try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2_BLOCK, - th=100,bNormalized=T,bFlip=FALSE,precision=0) - -outfile3 <- paste0(contrasts, "-", peakcaller, deseq2_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -### EdgeR {-} -```{r 
EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller, edger_txt) -outfile2 <- paste0(contrasts, "-", peakcaller, edger_bed) -DBReportEdgeR$name <- paste0("Peak", 1:length(DBReportEdgeR)) - -tryEdgeRExport <- function(edger_report, fout) { - tryCatch( - { - rtracklayer::export(edger_report, fout) - }, - error = function(cond) { - print("ERROR: Failed to export EdgeR bed file `rtracklayer::export(edger_report, fout))`, output blank file") - write.table(fout, file='empty', col.names=FALSE) - } - ) -} - -tryEdgeRExport(DBReportEdgeR, file.path(outbase, outfile2)) - -write.table(DBReportEdgeR, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) -Ei <- length(DBReportEdgeR) -if (Ei == 0) { - i=1 -} else if (Ei > 500) { - i=500 -} else { - i=Ei -} -try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F), silent=TRUE) - -report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER_BLOCK, - th=100,bNormalized=T,bFlip=FALSE,precision=0) -outfile3 <- paste0(contrasts, "-", peakcaller, edger_bed_fullist) -write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) -``` - -## R tool version information -```{r Info} -sessionInfo() -``` - - \ No newline at end of file diff --git a/bin/DiffBind_v2_Deseq2.Rmd b/bin/DiffBind_v2_Deseq2.Rmd new file mode 100755 index 0000000..19e60fa --- /dev/null +++ b/bin/DiffBind_v2_Deseq2.Rmd @@ -0,0 +1,209 @@ +--- +title: "DiffBind: chrom-seek pipeline" +subtitle: "Deseq2" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macsNarrow" + counts: "" + up_file: "" + down_file: "" + list_file: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +# inputs +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +peak_counts <- params$counts 
+up_file <- params$up_file +down_file <- params$down_file +list_file <- params$list_file + +# knitr options +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) + +# libraries +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples,consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig.height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples,seq_len(nrow(samples$samples))) # 4 or fewer samples: one Venn set per sample +} else { + dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") + try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) # NOTE(review): assumes masks 3 and 4 are the two condition groups - confirm + try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. 
+FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r} +if ( peakcaller == "macsNarrow" ) { + summits <- TRUE + print ("Narrow peak calling tool.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if (grepl("broad", samples$samples$Peaks[1])) { + summits <- FALSE + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} else { + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +DBdataCounts <- readRDS(file = peak_counts) +print(DBdataCounts) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +dba.plotPCA(DBdataCounts, DBA_CONDITION) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION) +print(DBdatacontrast) +``` + +# Differential Analysis +All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) +``` + +## PCA +Variance of differential peaks only + +```{r PCA3, fig.height=5,fig.width=5} +dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2) +``` + +## MA plot +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. 
+ +```{r MA_D} +dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2) +``` + +## Volcano plot +Each dot is a consensus peak. + +```{r Volcano1} +dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2) +``` + + +## Heatmap: Differential +1000 most significant differential peaks (Deseq2 normalized) + +```{r heatmap4D} +dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1) +``` + +## Top 500 or fewer differentially bound peaks +```{r Deseq2Report} +UpPeaks <- DBReportDeseq2[which(DBReportDeseq2$Fold > 0)] +rtracklayer::export(UpPeaks, up_file) + +DownPeaks <- DBReportDeseq2[which(DBReportDeseq2$Fold < 0)] +rtracklayer::export(DownPeaks, down_file) + +D2i <- length(DBReportDeseq2) +i <- min(500L, max(D2i, 1L)) # cap the table at 500 rows; floor of 1 keeps the index valid for an empty report +DT::datatable(data.frame(DBReportDeseq2)[seq_len(i),], rownames=FALSE) + +report2 <- dba.report( + DBAnalysisDeseq2, + method = DBA_DESEQ2, + th=100, + bNormalized=TRUE, + bFlip=FALSE, + precision=0, + bCalled=TRUE + ) +write.table(report2, list_file, quote=FALSE, sep="\t", row.names=FALSE) +``` + + + +## R tool version information +```{r Info} +sessionInfo() +``` + + diff --git a/bin/DiffBind_v2_Deseq2_block.Rmd b/bin/DiffBind_v2_Deseq2_block.Rmd new file mode 100644 index 0000000..e4fba91 --- /dev/null +++ b/bin/DiffBind_v2_Deseq2_block.Rmd @@ -0,0 +1,197 @@ +--- +title: "DiffBind: chrom-seek pipeline" +subtitle: "Deseq2 with blocking" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macsNarrow" + counts: "" + list_file: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +# global variables +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +peak_counts <- params$counts +list_file <- params$list_file + +# knitr 
configuration +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples, consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +dba.plotPCA(samples, DBA_CONDITION) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig.height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples, seq_len(nrow(samples$samples))) # 4 or fewer samples: one Venn set per sample +} else { + dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") + dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1") # NOTE(review): assumes masks 3 and 4 are the two condition groups - confirm + dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2") +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. 
+```{r} +summits <- FALSE +if (peakcaller == "macsNarrow") { + summits <- TRUE + print ("Ran macsNarrow.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if (grepl("broad", samples$samples$Peaks[1]) || peakcaller == "macsBroad") { + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} else { + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +DBdataCounts <- readRDS(peak_counts) +print(DBdataCounts) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +dba.plotPCA(DBdataCounts, DBA_CONDITION) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION, + block=DBA_TREATMENT) +print(DBdatacontrast) +``` + +# Differential Analysis +All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) +``` + +## PCA +Variance of differential peaks only + +```{r PCA3, fig.height=5,fig.width=5} +dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK) +``` + +## MA plot +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. 
+ +```{r MA_D} +dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) +``` + +## Volcano plot +Each dot is a consensus peak. + +```{r Volcano1} +dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) +``` + + +## Heatmap: Differential +1000 most significant differential peaks + +```{r heatmap4D} +dba.plotHeatmap(DBAnalysisDeseq2, contrast=1, method = DBA_DESEQ2_BLOCK, correlations=FALSE, margin=20, cexRow=1, cexCol=1) +``` + +## Top 500 or fewer differentially bound peaks + +```{r Deseq2Report} +D2i <- length(DBReportDeseq2) # size of the report shown below, not of the DBA analysis object +i <- min(500L, max(D2i, 1L)) # cap the table at 500 rows; floor of 1 keeps the index valid for an empty report +DT::datatable(data.frame(DBReportDeseq2)[seq_len(i),], rownames=FALSE) + +report2 <- dba.report( + DBAnalysisDeseq2, + method = DBA_DESEQ2_BLOCK, + th=100, + bNormalized=TRUE, + bFlip=FALSE, + precision=0, + bCalled=TRUE + ) +write.table(report2, list_file, quote=FALSE, sep="\t", row.names=FALSE) +``` + + +## R tool version information +```{r Info} +sessionInfo() +``` + + \ No newline at end of file diff --git a/bin/DiffBind_v2_EdgeR.Rmd b/bin/DiffBind_v2_EdgeR.Rmd new file mode 100644 index 0000000..5f8e0cf --- /dev/null +++ b/bin/DiffBind_v2_EdgeR.Rmd @@ -0,0 +1,205 @@ +--- +title: "DiffBind: chrom-seek pipeline" +subtitle: "EdgeR" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macsNarrow" + counts: "" + up_file: "" + down_file: "" + list_file: "" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +## grab args +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") + +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +peak_counts <- params$counts +up_file <- params$up_file +down_file <- params$down_file +list_file <- params$list_file +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +```{r 
setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples, consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +dba.plotPCA(samples, DBA_CONDITION) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig.height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples, seq_len(nrow(samples$samples))) # 4 or fewer samples: one Venn set per sample +} else { + dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") + dba.plotVenn(samples, samples$masks[[3]], main="Binding Site Overlaps: samples in Group1") # NOTE(review): assumes masks 3 and 4 are the two condition groups - confirm + dba.plotVenn(samples, samples$masks[[4]], main="Binding Site Overlaps: samples in Group2") +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. 
+```{r} +if (peakcaller == "macsNarrow") { + summits <- TRUE + print ("Ran macsNarrow.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else { + summits <- FALSE + print ("Assuming broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +DBdataCounts <- readRDS(file=peak_counts) +print(DBdataCounts) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +dba.plotPCA(DBdataCounts, DBA_CONDITION) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories=DBA_CONDITION) +print(DBdatacontrast) +``` + +# Differential Analysis +All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) +DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) +``` + +## PCA +Variance of differential peaks only + +```{r PCA3, fig.height=5,fig.width=5} +dba.plotPCA(DBAnalysisEdgeR, contrast=1, method=DBA_EDGER) +``` + +## MA plot +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. + +```{r MA_D} +dba.plotMA(DBAnalysisEdgeR, method=DBA_EDGER) +``` + +## Volcano plot +Each dot is a consensus peak. 
---
title: "DiffBind: chrom-seek pipeline"
subtitle: "EdgeR with blocking"
output:
    html_document:
        toc: true
        toc_float:
            collapsed: false
        number_sections: true
        toc_depth: 3
        fig_width: 7
        fig_height: 6
params:
    csvfile: samplesheet.csv
    contrasts: "group1_vs_group2"
    peakcaller: "macsNarrow"
    counts: ""
    down_file: ""
    up_file: ""
    list_file: ""
---

```{r, include=FALSE, warning=FALSE, message=FALSE}
## grab args
# Every input/output location is supplied through the YAML `params` block so
# the report can be rendered non-interactively.
dateandtime <- format(Sys.time(), "%a %b %d %Y - %X")

csvfile <- params$csvfile         # DiffBind sample sheet
contrasts <- params$contrasts     # label only, shown in the report header
peakcaller <- params$peakcaller
peak_counts <- params$counts      # RDS holding the pre-computed count object
list_file <- params$list_file     # tab-delimited full report
# NOTE(review): `up_file` and `down_file` are declared in params but this
# report never reads or writes them -- confirm whether the blocking workflow
# is expected to export them as the non-blocking EdgeR report does.
```

**Groups being compared:**
    *`r contrasts`*
**Peak sources:**
    *`r peakcaller`*
**Report generated:**
    *`r dateandtime`*

```{r setup, echo=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(echo = FALSE, include = TRUE, message = FALSE, warning = FALSE, error = FALSE)
suppressMessages(library(DT))
suppressMessages(library(DiffBind))
suppressMessages(library(parallel))
```

# Peak Data
Read in sample sheet information and peak information
```{r samples}
samples <- dba(sampleSheet = csvfile)
consensus <- dba.peakset(samples, consensus = DBA_CONDITION)
print(samples)
```

## Correlation heatmap: Only peaks
Pearson correlation of peak positions: all samples versus all samples
```{r heatmap1}
dba.plotHeatmap(samples, main = "", cexRow = 1, cexCol = 1)
```

## PCA: Only peaks
Variance of peak positions
```{r PCA1, fig.height=5, fig.width=5}
dba.plotPCA(samples, DBA_CONDITION)
```

## Overlapping peak counts
Number of overlapping peaks.
If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where
the consensus peak set is the set of peaks identified in at least 2 samples for that condition. This is different
from the consensus peak set used for differential analyses.
```{r Venn, fig.height=4}
# knitr spells chunk options with dots: the previous `fig_height` was not a
# recognised option, so the requested height was never applied.
n_samples <- nrow(samples$samples)
if (n_samples < 5) {
    dba.plotVenn(samples, seq_len(n_samples))
} else {
    dba.plotVenn(consensus, consensus$masks$Consensus, main = "Binding Site Overlaps: 'consensus', comparing between groups")
    # NOTE(review): masks 3 and 4 are selected by position and are presumed to
    # be the two condition masks -- confirm against the sample sheet layout.
    dba.plotVenn(samples, samples$masks[[3]], main = "Binding Site Overlaps: samples in Group1")
    dba.plotVenn(samples, samples$masks[[4]], main = "Binding Site Overlaps: samples in Group2")
}
```

# Consensus peaks and counts
Consensus peaks are peaks found in at least two samples, independent of condition.
FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool.

```{r}
# Informational only: the counts are read from `peak_counts` below, so nothing
# in this report consumes a summits setting.
if (peakcaller == "macsNarrow") {
    print("Ran macsNarrow.")
    print("Differential peaks are 250bp upstream and downstream of the summits.")
} else {
    print("Assuming broad peak calling tool.")
    print("Differential peaks are consensus peaks.")
}
```

```{r DBcount}
# Fail with a readable message when the counts parameter was not provided
# (its default is the empty string).
if (!file.exists(peak_counts)) {
    stop("Peak count RDS file not found: '", peak_counts, "'", call. = FALSE)
}
DBdataCounts <- readRDS(peak_counts)
print(DBdataCounts)
```

## Correlation heatmap: Peaks and reads
Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples
```{r heatmap2}
dba.plotHeatmap(DBdataCounts, main = "", cexRow = 1, cexCol = 1)
```

## Heatmap: Average signal across each peak
1000 most variable consensus peaks (library-size normalized counts)
```{r heatmap3}
dba.plotHeatmap(DBdataCounts, correlations = FALSE, cexRow = 1, cexCol = 1)
```

## PCA: Peaks and reads
Variation of library-size normalized counts of consensus peaks
```{r PCA2, fig.height=5, fig.width=5}
dba.plotPCA(DBdataCounts, DBA_CONDITION)
```

# Set Up Contrast
Contrast is Group1 - Group2.
```{r contrast}
# The contrast is built on DBA_CONDITION with DBA_TREATMENT as the blocking
# factor.
DBdatacontrast <- dba.contrast(
    DBdataCounts,
    minMembers = 2,
    categories = DBA_CONDITION,
    block = DBA_TREATMENT
)
print(DBdatacontrast)
```

# Differential Analysis
All concentrations are on log2 scale.

```{r analyze}
# The analysis is requested with DBA_EDGER while every report and plot below
# asks for DBA_EDGER_BLOCK.
DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK)
```

## PCA
Variance of differential peaks only

```{r PCA3, fig.height=5, fig.width=5}
dba.plotPCA(DBAnalysisEdgeR, contrast = 1, method = DBA_EDGER_BLOCK)
```

## MA plot
"Log concentration" means average concentration across all samples.
Each dot is a consensus peak.

```{r MA_D}
dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK)
```

## Volcano plot
Each dot is a consensus peak.

```{r Volcano1}
dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK)
```


## Heatmap: Differential
1000 most significant differential peaks

```{r heatmap4D}
dba.plotHeatmap(DBAnalysisEdgeR, contrast = 1, method = DBA_EDGER_BLOCK, correlations = FALSE, margin = 20, cexRow = 1, cexCol = 1)
```

## Top 500 or fewer differentially bound peaks

```{r EdgeRReport}
# head() caps the table at 500 rows and, unlike the previous `[1:i, ]` slice,
# shows an empty table rather than a single all-NA row when nothing is reported.
DT::datatable(head(data.frame(DBReportEdgeR), 500), rownames = FALSE)

# Full report written with th = 100.
report2 <- dba.report(
    DBAnalysisEdgeR,
    method = DBA_EDGER_BLOCK,
    th = 100,
    bNormalized = TRUE,
    bFlip = FALSE,
    precision = 0,
    bCalled = TRUE
)
write.table(report2, list_file, quote = FALSE, sep = "\t", row.names = FALSE)
```

## R tool version information
```{r Info}
sessionInfo()
```
#!/usr/bin/env Rscript
# Load a DiffBind sample sheet, count reads over the consensus peaks, and save
# (1) the counted DBA object as RDS and (2) the consensus peak list.
library(argparse)
library(stringi)
library(DiffBind)
library(parallel)

# Strip quoting artefacts from a command-line value: removes every double
# quote, single quote and whitespace character.
cleanup_arg <- function(arg) {
    arg <- stri_replace_all_fixed(arg, '"', '')  # remove double quotes
    arg <- stri_replace_all_fixed(arg, "'", "")  # remove single quotes
    arg <- stri_replace_all_charclass(arg, "\\p{WHITE_SPACE}", "")  # remove all whitespace
    arg
}

parser <- ArgumentParser(description = 'Load diffbind csv process with R::dba return RDS and BED')
parser$add_argument('--csvfile', '-c', required = TRUE, help = 'CSV input file from `diffbind_csv`')
parser$add_argument('--counts', '-n', help = 'Peak count RDS output file', default = file.path(getwd(), "peak_counts.rds"))
parser$add_argument('--list', '-l', help = 'Peak list TXT output file', default = file.path(getwd(), "peak_list.bed"))
parser$add_argument('--peakcaller', '-p', required = TRUE, help = 'String with peakcaller name')
# The script previously read `xargs$threads` although no such option was ever
# declared, so the value was always empty and never used. It is now a real,
# optional argument.
parser$add_argument('--threads', '-t', type = 'integer', default = NULL,
                    help = 'Optional number of cores DiffBind may use (default: DiffBind decides)')
xargs <- parser$parse_args()

csvfile <- cleanup_arg(xargs$csvfile)
counts <- cleanup_arg(xargs$counts)
# Named `peak_list_file` rather than `list` so base::list() is not shadowed.
peak_list_file <- cleanup_arg(xargs$list)
peakcaller <- cleanup_arg(xargs$peakcaller)
threads <- xargs$threads

samples <- dba(sampleSheet = csvfile)
if (!is.null(threads)) {
    if (is.na(threads) || threads < 1) {
        stop("--threads must be a positive integer", call. = FALSE)
    }
    # NOTE(review): DiffBind is documented to read its core count from
    # `config$cores` on the DBA object -- confirm for the installed version.
    samples$config$cores <- threads
}

if (peakcaller == "macsNarrow") {
    summits_arg <- 250
    print("Ran macsNarrow.")
    print("Differential peaks are 250bp upstream and downstream of the summits.")
} else {
    summits_arg <- FALSE
    print("Assuming broad peak calling tool.")
    print("Differential peaks are consensus peaks.")
}

# count
DBdataCounts <- dba.count(samples, summits = summits_arg, bParallel = TRUE)

# save counts
saveRDS(DBdataCounts, counts)

# save peaklist
consensus <- dba.peakset(DBdataCounts, bRetrieve = TRUE)
# seq_along() stays empty for an empty peak set, where 1:length() would yield c(1, 0).
consensus$name <- paste0("Peak", seq_along(consensus))
rtracklayer::export(consensus, peak_list_file)
chip, chip + peakext) if blocking: block = blocks[chip] - this_row = dict(zip(cols, [chip, condition, block, replicate, bamReads, + this_row = dict(zip(cols, [chip, group, block, replicate, bamReads, controlID, bamControl, peaks, peakcaller])) else: - this_row = dict(zip(cols, [chip, condition, replicate, bamReads, + this_row = dict(zip(cols, [chip, group, replicate, bamReads, controlID, bamControl, peaks, peakcaller])) samplesheet.append(this_row) @@ -49,8 +51,7 @@ def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): if __name__ == "__main__": parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') - parser.add_argument('--g1', dest='group1', required=True, help='Name of the first group') - parser.add_argument('--g2', dest='group2', required=True, help='Name of the second group') + parser.add_argument('--con', dest='contrast', required=True, help='Contrast string in [GROUP1]_vs_[GROUP2] format') parser.add_argument('--wp', dest='wp', required=True, help='Full path of the working directory') parser.add_argument('--pt', dest='peaktool', required=True, help='Name of the the peak calling tool, also the directory where the peak file will be located') @@ -61,4 +62,4 @@ def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): help='Name of the directory where the bam files are located') parser.add_argument('--csv', dest='csvfile', required=True, help='Name of the output csv file') args = parser.parse_args() - main(args.group1, args.group2, args.peaktool, args.peakext, args.peakcaller, args.csvfile, args.wp, args.bam_dir) + main(args.contrast, args.peaktool, args.peakext, args.peakcaller, args.csvfile, args.wp, args.bam_dir) diff --git a/bin/promoterAnnotation_by_Gene.R b/bin/promoterAnnotation_by_Gene.R index 846cfc0..e5f3421 100755 --- a/bin/promoterAnnotation_by_Gene.R +++ b/bin/promoterAnnotation_by_Gene.R @@ -8,31 +8,31 @@ # Created: August 8, 2022 # Updated: October 26, 2022 to work 
with uropa 4.0.2 # Updated: November 3, 2022 to fit with pipeline -# +# #################### # # Purpose: To take UROPA allhits output files using "TSSprot" conditions and -# create a table of which genes have annotations overlapping their +# create a table of which genes have annotations overlapping their # promoters and how many times. Output format: dataframe # -# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the -# TSS. Allhits files were chosen to capture information from "peaks" -# overlappingmultiple promoters. Finalhits files can also be processed -# with this pipeline. This script can handle multiple allhits files as -# long as there are equal numbers of sampleNames to go with them. Also, -# giving a matching DiffBind txt file will allow the allhits file to be -# filtered to only include the significant differential peaks or to +# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the +# TSS. Allhits files were chosen to capture information from "peaks" +# overlapping multiple promoters. Finalhits files can also be processed +# with this pipeline. This script can handle multiple allhits files as +# long as there are equal numbers of sampleNames to go with them. Also, +# giving a matching DiffBind txt file will allow the allhits file to be +# filtered to only include the significant differential peaks or to # split the data by the direction of log fold-change. 
#
# Requires: GenomicRanges, tidyr
#
# Function: promoterAnnotationByGene(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA)
#
# Variables:
#    allhitsFiles: [required] a vector of allhits files to process
#    sampleNames: [required] a vector of short names for each allhits file
#                 to use as column headers
#    diffbindFiles: [optional] a vector of diffbind files to use to filter each
#                   allhits file
#    direction: [optional] when filtering using diffbindFiles, define how to
#               filter using log fold change: "both" (default), "pos", "neg"
#               or "separate" (case-insensitive)
#
# Examples:
#    source("promoterAnnotation_by_Gene.R")
#    out1 <- promoterAnnotationByGene(allhitsA.txt, "A")
#    out2 <- promoterAnnotationByGene(allhitsA.txt, "A", diffbindA.txt, "both")
#    out3 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsB.txt),
#                                     sampleNames=c("A","B"),
#                                     diffbindFiles=c(diffbindA.txt,diffbindB.txt),
#                                     direction="pos")
#    out4 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsA.txt),
#                                     sampleNames=c("Deseq2","EdgeR"),
#                                     diffbindFiles=c(Deseq2.txt,EdgeR.txt),
#                                     direction="separate")
#
####################

# Read one UROPA allhits file and keep only the rows whose `name` column is
# "query_1", reduced to the peak coordinates and gene identifiers.
#
# Always returns a data frame. When no row matches, a message is emitted and a
# zero-row data frame with the same five columns is returned (the previous
# version returned the printed message string, which broke every caller).
allhits2promoter <- function(allhitsFile) {
    promoter_cols <- c("peak_chr", "peak_start", "peak_end", "gene_id", "gene_name")
    inData <- read.delim(allhitsFile)
    promoter_rows <- which(inData$name == "query_1")
    if (length(promoter_rows) == 0) {
        message("Supplied file ", allhitsFile, " has no peaks overlapping promoters.")
    }
    inData[promoter_rows, promoter_cols, drop = FALSE]
}

# Keep the rows of `promoterData` whose peak interval matches an interval in
# `Diffbind` (GenomicRanges::countOverlaps with type = "equal", maxgap = 1) and
# label them with `sampleName` in a new `sample_id` column.
# Used by DiffbindFilterPromoter.
filterPromoter <- function(Diffbind, promoterData, sampleName) {
    promoter_ranges <- GenomicRanges::makeGRangesFromDataFrame(
        promoterData,
        seqnames.field = "peak_chr",
        start.field = "peak_start",
        end.field = "peak_end",
        starts.in.df.are.0based = FALSE
    )
    diffbind_ranges <- GenomicRanges::makeGRangesFromDataFrame(Diffbind)
    n_hits <- GenomicRanges::countOverlaps(promoter_ranges, diffbind_ranges, type = "equal", maxgap = 1)
    matched <- promoterData[which(n_hits != 0), , drop = FALSE]
    # rep() keeps the assignment valid when nothing matched; assigning a scalar
    # to a zero-row data frame is an error.
    matched$sample_id <- rep(sampleName, nrow(matched))
    matched
}

# Filter promoter peaks down to those matching significant DiffBind peaks.
#
#   DiffbindFile  tab-delimited DiffBind report with `FDR` and `Fold` columns
#   promoterData  output of allhits2promoter()
#   sampleName    label written to `sample_id`
#   direction     "both", "pos", "neg" or "separate" (case-insensitive);
#                 NA means "both". "pos"/"neg" keep Fold > 0 / Fold < 0 and
#                 append "_pos"/"_neg" to the label; "separate" returns both
#                 subsets stacked. Any other value is an error.
#   fdr           significance cut-off applied to the `FDR` column
DiffbindFilterPromoter <- function(DiffbindFile, promoterData, sampleName, direction, fdr = 0.05) {
    # Validate before touching the file so a bad argument fails fast. The old
    # code treated every unrecognised value -- including the documented
    # "Both" -- as "separate".
    if (length(direction) == 1 && is.na(direction)) {
        direction <- "both"
    }
    direction <- match.arg(tolower(direction), c("both", "pos", "neg", "separate"))

    Diffbind <- read.delim(DiffbindFile)
    Diffbind <- Diffbind[which(Diffbind$FDR < fdr), , drop = FALSE]

    filter_subset <- function(rows, suffix) {
        filterPromoter(Diffbind[rows, , drop = FALSE], promoterData, paste0(sampleName, suffix))
    }
    positive <- which(Diffbind$Fold > 0)
    negative <- which(Diffbind$Fold < 0)

    switch(direction,
        both = filterPromoter(Diffbind, promoterData, sampleName),
        pos = filter_subset(positive, "_pos"),
        neg = filter_subset(negative, "_neg"),
        separate = rbind(filter_subset(positive, "_pos"), filter_subset(negative, "_neg"))
    )
}

# Build the final table: one row per gene (gene_id, gene_name) and one column
# per sample_id holding how many promoter-overlapping peaks were recorded.
createPromoterTable <- function(promoterData) {
    peak_counts <- data.frame(table(promoterData[, c("gene_id", "sample_id")]))
    gene_labels <- unique(promoterData[, c("gene_id", "gene_name")])
    labelled_counts <- merge(gene_labels, peak_counts)
    tidyr::pivot_wider(labelled_counts, names_from = "sample_id", values_from = "Freq")
}

# Main entry point: see the header comment for arguments and examples.
#
# Argument-length mismatches now raise an error instead of printing a message
# and returning that message as the "result". `diffbindFiles` may be a single
# NA (no filtering at all) or one entry per allhits file, where an NA entry
# leaves that file unfiltered.
promoterAnnotationByGene <- function(allhitsFiles, sampleNames, diffbindFiles = NA, direction = NA) {
    n_files <- length(allhitsFiles)
    if (n_files == 0) {
        stop("No allhits files supplied.", call. = FALSE)
    }
    if (n_files != length(sampleNames)) {
        stop("Number of allhits files and sample names don't match.", call. = FALSE)
    }
    if (length(diffbindFiles) == 1 && is.na(diffbindFiles)) {
        diffbindFiles <- rep(NA, n_files)
    }
    if (length(diffbindFiles) != n_files) {
        stop("Number of allhits files and diffbind files don't match.", call. = FALSE)
    }

    per_sample <- lapply(seq_len(n_files), function(idx) {
        promoters <- allhits2promoter(allhitsFiles[idx])
        if (is.na(diffbindFiles[idx])) {
            promoters$sample_id <- rep(sampleNames[idx], nrow(promoters))
            promoters
        } else {
            DiffbindFilterPromoter(diffbindFiles[idx], promoters, sampleNames[idx], direction)
        }
    })
    # Bind once at the end rather than growing a data frame inside a loop.
    createPromoterTable(do.call(rbind, per_sample))
}

# Summarise every "allhits.txt" file in `inFolder` (the UROPA output folder)
# and write the table to `outFile`. Currently only works for macs outputs:
# the sample name is the part of the file name before "_macs".
peakcallVersion <- function(inFolder, outFile) {
    filesA <- list.files(path = inFolder, pattern = "allhits.txt", full.names = TRUE)
    if (length(filesA) == 0) {
        stop("No allhits.txt files found in ", inFolder, call. = FALSE)
    }
    # sub() tolerates names that contain "_macs" zero or several times, which
    # the former strsplit()/matrix(ncol = 2) parse did not.
    samples <- sub("_macs.*$", "", basename(filesA))
    promoterInfo <- promoterAnnotationByGene(allhitsFiles = filesA, sampleNames = samples)
    write.table(promoterInfo, outFile, quote = FALSE, sep = "\t", row.names = FALSE)
}

# Summarise the DiffBind/Deseq2 UROPA annotations of a project.
# `inFolder` is the root working directory of the project. Currently designed
# for macs peaks analysed by Deseq2, with positive and negative fold changes
# analysed together.
diffbindVersion <- function(inFolder, outFile) {
    uropaFolder <- paste0(inFolder, "/UROPA_annotations/DiffBind")
    diffbindFolder <- paste0(inFolder, "/DiffBind")

    filesU <- list.files(path = uropaFolder, pattern = "DiffbindDeseq2_uropa_protTSS_allhits.txt", full.names = TRUE)
    if (length(filesU) == 0) {
        stop("No DiffbindDeseq2_uropa_protTSS_allhits.txt files found in ", uropaFolder, call. = FALSE)
    }
    samples <- sub("-macs.*$", "", basename(filesU))
    filesD <- list.files(path = diffbindFolder, pattern = "Deseq2.txt", full.names = TRUE, recursive = TRUE)

    # Diagnostics only. A leftover debugging stop() used to abort the function
    # right here, so the table was never produced.
    message("filesU: ", paste(filesU, collapse = ", "))
    message("samples: ", paste(samples, collapse = ", "))
    message("filesD: ", paste(filesD, collapse = ", "))

    # NOTE(review): filesU and filesD are paired purely by listing order --
    # confirm both directories sort the contrasts identically.
    promoterInfo <- promoterAnnotationByGene(
        allhitsFiles = filesU,
        sampleNames = samples,
        diffbindFiles = filesD,
        direction = "both"
    )
    write.table(promoterInfo, outFile, quote = FALSE, sep = "\t", row.names = FALSE)
}
#!/usr/bin/env python
import os
import argparse
import json
import csv
import shutil


def _make_query(base_query, index, distance, drop_keys=(), feature_anchor=None):
    """Derive one UROPA query from ``base_query``.

    Keys listed in ``drop_keys`` are left out, ``feature_anchor`` (when given)
    is set after the remaining base keys, and ``distance`` plus a
    ``query_<index>`` name are appended last. The nested lists of
    ``base_query`` are shared rather than copied; they are only serialized,
    never mutated.
    """
    query = {key: value for key, value in base_query.items() if key not in drop_keys}
    if feature_anchor:
        query["feature_anchor"] = feature_anchor
    query['distance'] = distance
    query['name'] = f'query_{index}'
    return query


def _query_specs(assay, peak_type):
    """Return ``(distance, drop_keys, feature_anchor)`` tuples for one output.

    An empty list is returned for combinations that define no queries: the
    ``cfchip`` assay only defines ``protTSS``, and unknown peak types define
    none, so the JSON is then written with an empty ``queries`` list.
    """
    if assay == 'cfchip':
        if peak_type == 'protTSS':
            return [([3000], (), None), ([10000], (), None), ([100000], (), None)]
        return []

    if peak_type == 'prot':
        drop = ("feature_anchor",)
        return [([5000], drop, None), ([100000], drop, None)]
    if peak_type == 'genes':
        # The attribute filter lives under "attribute_values"; the previous
        # code deleted the non-existent key "attribute_value" and raised
        # KeyError for every 'genes' request.
        drop = ("feature_anchor", "filter_attribute", "attribute_values")
        return [([5000], drop, None), ([100000], drop, None)]
    if peak_type == 'protSEC':
        drop = ("feature_anchor",)
        return [
            ([3000, 1000], drop, ["start"]),
            ([3000], drop, ["end"]),
            ([100000], drop, ["center"]),
            ([100000], drop, None),
        ]
    if peak_type == 'protTSS':
        return [([3000, 1000], (), None), ([10000], (), None), ([100000], (), None)]
    return []


def main(args):
    """Write one UROPA JSON configuration per requested peak type.

    ``args`` must provide ``peak_types`` and ``output_json`` (parallel lists),
    plus ``gtf``, ``bed`` and ``assay``. Missing output directories are
    created; a bare file name is written to the current directory.
    """
    base_query = {
        "feature": ["gene"],
        "filter_attribute": "gene_type",
        "attribute_values": ["protein_coding"],
        "feature_anchor": ["start"],
        "relative_location":
            ["PeakInsideFeature", "FeatureInsidePeak", "Upstream",
             "Downstream", "OverlapStart", "OverlapEnd"],
        "strand": "ignore"
    }

    for i, peak_type in enumerate(args.peak_types):
        out_path = args.output_json[i]
        # abspath first so a bare file name resolves to the current directory;
        # exist_ok avoids the FileExistsError the old exists('') check caused.
        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)

        json_construct = dict()
        json_construct['queries'] = []
        json_construct['show_attributes'] = ["gene_id", "gene_name", "gene_type"]
        json_construct["priority"] = "Yes"
        # don't put outdir in json, leave to specify on command line execution
        json_construct['gtf'] = args.gtf
        json_construct['bed'] = str(args.bed)

        specs = _query_specs(args.assay, peak_type)
        for ii, (distance, drop_keys, feature_anchor) in enumerate(specs, start=1):
            json_construct['queries'].append(
                _make_query(base_query, ii, distance, drop_keys, feature_anchor)
            )

        with open(out_path, 'w') as jo:
            json.dump(json_construct, jo, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Script to prepare the uropa json input')
    parser.add_argument('-g', dest='gtf', required=True, help='Gene GTF used in uropa annotation lookup')
    parser.add_argument('-o', dest='output_json', nargs="*", required=True, help='Path to output UROPA input JSON')
    parser.add_argument('-a', dest='assay', required=True, help='Type of assay being run')
    parser.add_argument('-b', dest='bed', required=True, help='Bed used for UROPA annotation')
    parser.add_argument('--types', '-t', dest='peak_types', nargs="+", required=True, help='Peak types: prot, protTSS, genes, protSEC')
    args = parser.parse_args()
    if isinstance(args.output_json, str):
        args.output_json = [args.output_json]
    if len(args.peak_types) != len(args.output_json):
        # The check fires for either direction of mismatch, so say so.
        raise ValueError(
            'Number of peak types ({}) does not match number of output file paths ({})!'.format(
                len(args.peak_types), len(args.output_json)
            )
        )
    main(args)
b/config/cluster.json index 5cc360e..5f8d805 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -11,7 +11,8 @@ }, "trim": { "mem": "64g", - "threads": "32" + "threads": "32", + "time": "1-18:00:00" }, "kraken": { "cpus-per-task": "24", @@ -97,14 +98,47 @@ "mem": "50g" }, "diffbindQC": { + "threads": "16", "mem": "150g" }, - "diffbind": { + "diffbind_deseq": { + "threads": "16", "mem": "150g" }, - "MEME": { - "threads": "2", + "diffbind_edger": { + "threads": "16", + "mem": "150g" + }, + "diffbind_edger_blocking": { + "threads": "16", + "mem": "150g" + }, + "diffbind_deseq_blocking": { + "threads": "16", + "mem": "150g" + }, + "diffbind_count": { + "threads": "16", + "mem": "120g" + }, + "UROPA_macsNarrow": { + "threads": "16", + "mem": "32g", + "time": "1-00:00:00" + }, + "UROPA_macsBroad": { + "threads": "16", "mem": "32g", + "time": "1-00:00:00" + }, + "UROPA_diffbind": { + "threads": "16", + "mem": "32g", + "time": "1-00:00:00" + }, + "MEME": { + "threads": "28", + "mem": "32g", "time": "3-00:00:00", "ntasks": "--ntasks=28", "ntasks_per_core": "--ntasks-per-core=1", diff --git a/config/containers.json b/config/containers.json index 280e934..7c31275 100644 --- a/config/containers.json +++ b/config/containers.json @@ -1,7 +1,7 @@ { "images": { "cfchip": "docker://skchronicles/cfchip_toolkit:v0.5.0", - "uropa": "docker://quay.io/biocontainers/uropa:4.0.2--pyhdfd78af_0", + "uropa": "docker://rroutsong/uropa:4.0.3", "python": "docker://asyakhleborodova/chrom_seek_python:v0.1.0", "ppqt": "docker://asyakhleborodova/ppqt:v0.2.0" } diff --git a/docker/dedup/Dockerfile b/docker/dedup/Dockerfile new file mode 100644 index 0000000..4f97409 --- /dev/null +++ b/docker/dedup/Dockerfile @@ -0,0 +1,9 @@ +FROM ubuntu:latest +RUN apt-get update -q -y +RUN apt-get install samtools bedtools default-jre r-base python3 python3-pip curl build-essential -y +RUN python3 -m pip config set global.break-system-packages true +RUN ln -sf /usr/bin/python3 /usr/bin/python; ln -sf 
/usr/bin/pip3 /usr/bin/pip +RUN pip install MACS3 +RUN cd /usr/bin; curl -LJO https://github.com/broadinstitute/picard/releases/download/3.3.0/picard.jar +RUN echo "\npython3 -m pip config set global.break-system-packages true\n" >> /etc/bash.bashrc +RUN echo 'picard() {\n\tjava -Xmx$1 -jar /usr/bin/picard.jar "${@:2}"\n}\n' >> /etc/bash.bashrc \ No newline at end of file diff --git a/docker/uropa/Dockerfile b/docker/uropa/Dockerfile new file mode 100755 index 0000000..18c21aa --- /dev/null +++ b/docker/uropa/Dockerfile @@ -0,0 +1,5 @@ +FROM conda/miniconda3 +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge +RUN conda config --set channel_priority strict +RUN conda install bioconda::uropa diff --git a/src/run.py b/src/run.py index 9b66aab..bdb162c 100644 --- a/src/run.py +++ b/src/run.py @@ -13,7 +13,8 @@ fatal, which, exists, - err + err, + sanitize_slurm_env ) from . import version as __version__ @@ -235,9 +236,13 @@ def unpacked(nested_dict): """ # Iterate over all values of # given dictionary - for value in nested_dict.values(): + for key, value in nested_dict.items(): # Check if value is of dict type - if isinstance(value, dict): + # also exclude certain directories + dontcheck = ('userhome',) + # we exclude the /home directory so it does not interfere with + # container system + if isinstance(value, dict) and key not in dontcheck: # If value is dict then iterate # over all its values recursively for v in unpacked(value): @@ -677,22 +682,13 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None, if temp not in additional_bind_paths.split(','): addpaths.append(temp) bindpaths = ','.join(addpaths) - + # Set ENV variable 'SINGULARITY_CACHEDIR' # to output directory my_env = {}; my_env.update(os.environ) cache = os.path.join(outdir, ".singularity") my_env['SINGULARITY_CACHEDIR'] = cache my_env['APPTAINER_CACHEDIR'] = cache - # Removing R_SITE_LIB environment variable - # due to issue: 
https://github.com/OpenOmics/chrom-seek/issues/28 - # Using SINGULARITY_CONTAINALL or APPTAINER_CONTAINALL - # causes downstream using where $SLURM_JOBID is - # NOT exported within a container. - if 'R_LIBS_SITE' in my_env: - # functionally equivalent: - # unset R_LIBS_SITE - del my_env['R_LIBS_SITE'] if alt_cache: # Override the pipeline's default @@ -701,6 +697,8 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None, my_env['APPTAINER_CACHEDIR'] = alt_cache cache = alt_cache + my_env = sanitize_slurm_env(my_env) + if additional_bind_paths: # Add Bind PATHs for outdir and tmp dir if bindpaths: @@ -730,7 +728,7 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None, masterjob = subprocess.Popen([ 'snakemake', '-pr', '--rerun-incomplete', '--use-singularity', - '--singularity-args', "'-B {}'".format(bindpaths), + '--singularity-args', "\\-c \\-B '{}'".format(bindpaths), '--cores', str(threads), '--configfile=config.json' ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env) diff --git a/src/run.sh b/src/run.sh index 63463df..622a72e 100755 --- a/src/run.sh +++ b/src/run.sh @@ -212,23 +212,26 @@ function submit(){ fi cat << EOF > kickoff.sh #!/usr/bin/env bash -#SBATCH --cpus-per-task=16 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 #SBATCH --mem=64g +#SBATCH --partition=norm #SBATCH --time=5-00:00:00 #SBATCH --parsable #SBATCH -J "$2" #SBATCH --mail-type=BEGIN,END,FAIL #SBATCH --output "$3/logfiles/snakemake_${ts}.log" #SBATCH --error "$3/logfiles/snakemake_${ts}.log" + set -euo pipefail # Main process of pipeline snakemake --latency-wait 120 -s "$3/workflow/Snakefile" -d "$3" \\ - --use-singularity --singularity-args "'-B $4'" \\ + --use-singularity --singularity-args "\\-c \\-B '$4'" \\ --use-envmodules --configfile="$3/config.json" \\ --printshellcmds --cluster-config "$3/config/cluster.json" \\ --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \\ --rerun-incomplete --stats 
"$3/logfiles/runtime_statistics.json" \\ - --keep-remote --local-cores 14 2>&1 + --keep-remote --local-cores 30 2>&1 # Create summary report snakemake -d "$3" --report "Snakemake_Report.html" EOF diff --git a/src/utils.py b/src/utils.py index 9ef48d8..679c1d3 100644 --- a/src/utils.py +++ b/src/utils.py @@ -275,26 +275,6 @@ def check_cache(parser, cache, *args, **kwargs): return cache -def unpacked(nested_dict): - """Generator to recursively retrieves all values in a nested dictionary. - @param nested_dict dict[