diff --git a/DESCRIPTION b/DESCRIPTION index 87d2e70..ef1653d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Maintainer: Stefan Dentro License: GPL-3 Type: Package Title: Battenberg subclonal copy number caller -Version: 2.2.0 +Version: 2.2.1 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"), person("Peter", "Van Loo", role=c("aut")), person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")), diff --git a/R/clonal_ascat.R b/R/clonal_ascat.R index a1190ad..f9c2df9 100755 --- a/R/clonal_ascat.R +++ b/R/clonal_ascat.R @@ -1242,24 +1242,32 @@ find_centroid_of_global_minima <- function( d, ref_seg_matrix, ref_major, ref_mi #' @param distancepng if NA: distance is plotted, if filename is given, the plot is written to a .png file (Default NA) #' @param copynumberprofilespng if NA: possible copy number profiles are plotted, if filename is given, the plot is written to a .png file (Default NA) #' @param nonroundedprofilepng if NA: copy number profile before rounding is plotted (total copy number as well as the copy number of the minor allele), if filename is given, the plot is written to a .png file (Default NA) +#' @param cnaStatusFile File where the copy number profile status is written to. This contains either the message "No suitable copy number solution found" or "X copy number solutions found" (Default copynumber_solution_status.txt) #' @param gamma technology parameter, compaction of Log R profiles (expected decrease in case of deletion in diploid sample, 100 "\%" aberrant cells; 1 in ideal case, 0.55 of Illumina 109K arrays) (Default 0.55) #' @param allow100percent A boolean whether to allow a 100"\%" cellularity solution #' @param reliabilityFile String to where fit reliabilty information should be written. This file contains backtransformed BAF and LogR values for segments using the fitted copy number profile (Default NA) #' @param min.ploidy The minimum ploidy to consider (Default 1.6) #' @param max.ploidy The maximum ploidy to consider (Default 4.8) #' @param min.rho The minimum cellularity to consider (Default 0.1) +#' @param max.rho The maximum cellularity to consider (Default 1.0) #' @param min.goodness The minimum goodness of fit for a solution to have to be considered (Default 63) #' @param uninformative_BAF_threshold The threshold beyond which BAF becomes uninformative (Default 0.51) #' @return A list with fields psi, rho and ploidy #' @export #the limit on rho is lenient and may lead to spurious solutions -runASCAT = function(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choice, distancepng = NA, copynumberprofilespng = NA, nonroundedprofilepng = NA, gamma = 0.55, allow100percent,reliabilityFile=NA,min.ploidy=1.6,max.ploidy=4.8,min.rho=0.1,max.rho=1.05,min.goodness=63, uninformative_BAF_threshold = 0.51) { +runASCAT = function(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choice, distancepng = NA, copynumberprofilespng = NA, nonroundedprofilepng = NA, cnaStatusFile = "copynumber_solution_status.txt", gamma = 0.55, allow100percent,reliabilityFile=NA,min.ploidy=1.6,max.ploidy=4.8,min.rho=0.1,max.rho=1.0,min.goodness=63, uninformative_BAF_threshold = 0.51) { ch = chromosomes b = bafsegmented r = lrrsegmented[names(bafsegmented)] + # Adapt the rho/psi boundaries for the local maximum searching below to work + dist_min_psi = max(min.ploidy-0.6, 0) + dist_max_psi = max.ploidy+0.6 + dist_min_rho = max(min.rho-0.03, 0.05) + dist_max_rho = max.rho+0.03 + s = make_segments(r,b) - dist_matrix_info <- create_distance_matrix( s, dist_choice, gamma, uninformative_BAF_threshold=uninformative_BAF_threshold, min_psi=min.ploidy, max_psi=max.ploidy, min_rho=min.rho, max_rho=max.rho) + dist_matrix_info <- create_distance_matrix( s, dist_choice, gamma, uninformative_BAF_threshold=uninformative_BAF_threshold, min_psi=dist_min_psi, max_psi=dist_max_psi, min_rho=dist_min_rho, max_rho=dist_max_rho) d = dist_matrix_info$distance_matrix minimise = dist_matrix_info$minimise @@ -1366,6 +1374,7 @@ runASCAT = function(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choi rho_opt1_plot = vector(mode="numeric") if (nropt>0) { + write.table(paste(nropt, " copy number solutions found", sep=""), file=cnaStatusFile, quote=F, col.names=F, row.names=F) optlim = sort(localmin)[1] for (i in 1:length(optima)) { if(optima[[i]][1] == optlim) { @@ -1382,7 +1391,11 @@ runASCAT = function(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choi } } } else { - print("No solution found") + write.table(paste("no copy number solutions found", sep=""), file=cnaStatusFile, quote=F, col.names=F, row.names=F) + print("No suitable copy number solution found") + psi = NA + ploidy = NA + rho = NA } # separated plotting from logic: create distanceplot here @@ -1653,7 +1666,7 @@ recalc_psi_t = function(psi, rho, gamma_param, lrrsegmented, segBAF.table, sigle segs = do.call(rbind, segs) # Calculate psi_t as the weighted average copy number across all segments - psi_t = sum(segs$psi_t * segs$length) / sum(segs$length) + psi_t = sum(segs$psi_t * segs$length, na.rm=T) / sum(segs$length, na.rm=T) return(psi_t) } diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R index b84bdee..dcb4eab 100644 --- a/R/fitcopynumber.R +++ b/R/fitcopynumber.R @@ -23,7 +23,7 @@ #' @param read_depth Legacy parameter that is no longer used (Default 30) #' @author dw9, sd11 #' @export -fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmented, inputfile.baf, inputfile.logr, dist_choice, ascat_dist_choice, min.ploidy=1.6, max.ploidy=4.8, min.rho=0.1, max.rho=1.05, min.goodness=63, uninformative_BAF_threshold=0.51, gamma_param=1, use_preset_rho_psi=F, preset_rho=NA, preset_psi=NA, read_depth=30) { +fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmented, inputfile.baf, inputfile.logr, dist_choice, ascat_dist_choice, min.ploidy=1.6, max.ploidy=4.8, min.rho=0.1, max.rho=1.0, min.goodness=63, uninformative_BAF_threshold=0.51, gamma_param=1, use_preset_rho_psi=F, preset_rho=NA, preset_psi=NA, read_depth=30) { assert.file.exists(inputfile.baf.segmented) assert.file.exists(inputfile.baf) @@ -31,12 +31,10 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente # Check for enough options supplied for rho and psi if ((max.ploidy - min.ploidy) < 0.05) { - warning(paste("Supplied ploidy range must be larger than 0.05: ", min.ploidy, "-", max.ploidy, sep="")) - quit(save="no", status=1) + stop(paste("Supplied ploidy range must be larger than 0.05: ", min.ploidy, "-", max.ploidy, sep="")) } if ((max.rho - min.rho) < 0.01) { - warning(paste("Supplied rho range must be larger than 0.01: ", min.rho, "-", max.rho, sep="")) - quit(save="no", status=1) + stop(paste("Supplied rho range must be larger than 0.01: ", min.rho, "-", max.rho, sep="")) } # Read in the required data @@ -117,19 +115,10 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente selection = c(selection, matched.segmented.BAF.data.chr[,2] %in% logR.data.chr[,2]) } - print(dim(matched.segmented.BAF.data)) - print(dim(segmented.logR.data)) - matched.segmented.BAF.data = matched.segmented.BAF.data[selection,] segmented.logR.data = segmented.logR.data[selection,] - - print(dim(matched.segmented.BAF.data)) - print(dim(segmented.logR.data)) - print(dim(logR.data)) - row.names(segmented.logR.data) = row.names(matched.segmented.BAF.data) row.names(logR.data) = row.names(matched.segmented.BAF.data) - write.table(segmented.logR.data,paste(samplename,".logRsegmented.txt",sep=""),sep="\t",quote=F,col.names=F,row.names=F) segBAF = 1-matched.segmented.BAF.data[,5] @@ -138,7 +127,7 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente names(segBAF) = rownames(matched.segmented.BAF.data) names(segLogR) = rownames(matched.segmented.BAF.data) names(logR) = rownames(matched.segmented.BAF.data) - print(unique(logR.data[,1])) + chr.segs = NULL for(ch in 1:length(chr.names)){ chr.segs[[ch]] = which(logR.data[,1]==chr.names[ch]) @@ -147,11 +136,12 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente if(use_preset_rho_psi){ ascat_optimum_pair = list(rho=preset_rho, psi = preset_psi, ploidy = preset_psi) }else{ - distance.outfile=paste(outputfile.prefix,"distance.png",sep="",collapse="") # kjd 20-2-2014 - copynumberprofile.outfile=paste(outputfile.prefix,"copynumberprofile.png",sep="",collapse="") # kjd 20-2-2014 - nonroundedprofile.outfile=paste(outputfile.prefix,"nonroundedprofile.png",sep="",collapse="") # kjd 20-2-2014 + distance.outfile=paste(outputfile.prefix, "distance.png", sep="", collapse="") # kjd 20-2-2014 + copynumberprofile.outfile=paste(outputfile.prefix, "copynumberprofile.png", sep="", collapse="") # kjd 20-2-2014 + nonroundedprofile.outfile=paste(outputfile.prefix, "nonroundedprofile.png", sep="", collapse="") # kjd 20-2-2014 + cnaStatusFile = paste(outputfile.prefix, "copynumber_solution_status.txt", sep="", collapse="") - ascat_optimum_pair=runASCAT(logR, 1-BAF.data[,3], segLogR, segBAF, chr.segs, ascat_dist_choice,distance.outfile, copynumberprofile.outfile, nonroundedprofile.outfile, gamma=gamma_param, allow100percent=T, reliabilityFile=NA, min.ploidy=min.ploidy, max.ploidy=max.ploidy, min.rho=min.rho, max.rho=max.rho, min.goodness) # kjd 4-2-2014 + ascat_optimum_pair = runASCAT(logR, 1-BAF.data[,3], segLogR, segBAF, chr.segs, ascat_dist_choice,distance.outfile, copynumberprofile.outfile, nonroundedprofile.outfile, cnaStatusFile=cnaStatusFile, gamma=gamma_param, allow100percent=T, reliabilityFile=NA, min.ploidy=min.ploidy, max.ploidy=max.ploidy, min.rho=min.rho, max.rho=max.rho, min.goodness) # kjd 4-2-2014 } distance.outfile=paste(outputfile.prefix,"second_distance.png",sep="",collapse="") # kjd 20-2-2014 @@ -185,6 +175,9 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente #' @param output.figures.prefix Prefix of the filenames for the chromosome specific copy number figures #' @param output.gw.figures.prefix Prefix of the filenames for the genome wide copy number figures #' @param chr_names Vector of allowed chromosome names +#' @param masking_output_file Filename of where the masking details need to be written. Masking is performed to remove very high copy number state segments +#' @param max_allowed_state The maximum CN state allowed (Default 100) +#' @param sv_breakpoints_file A two column file with breakpoints from structural variants. These are used when making the figures #' @param gamma Technology specific scaling parameter for LogR (Default 1) #' @param segmentation.gamma Legacy parameter that is no longer used (Default NA) #' @param siglevel Threshold under which a p-value becomes significant. When it is significant a second copy number state will be fitted (Default 0.05) @@ -192,7 +185,7 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente #' @param noperms The number of permutations to be run when bootstrapping the confidence intervals on the copy number state of each segment (Default 1000) #' @author dw9, sd11 #' @export -callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.file, output.file, output.figures.prefix, output.gw.figures.prefix, chr_names, sv_breakpoints_file=NULL, gamma=1, segmentation.gamma=NA, siglevel=0.05, maxdist=0.01, noperms=1000, seed=as.integer(Sys.time())) { +callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.file, output.file, output.figures.prefix, output.gw.figures.prefix, chr_names, masking_output_file, max_allowed_state=250, sv_breakpoints_file=NULL, gamma=1, segmentation.gamma=NA, siglevel=0.05, maxdist=0.01, noperms=1000, seed=as.integer(Sys.time())) { set.seed(seed) @@ -237,14 +230,14 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil # = as.vector(ctrans.logR[as.vector(LogRvals[,1])]*1000000000+LogRvals[,2]) BAFpos = as.vector(ctrans[as.vector(BAFvals[,1])]*1000000000+BAFvals[,2]) - save(file="subclones_temp.RData", BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms) + #save(file="subclones_temp.RData", BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms) ################################################################################################ # Determine copy number for each segment ################################################################################################ res = determine_copynumber(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms) subcloneres = res$subcloneres - #write.table(subcloneres, gsub(".txt", "_1.txt", output.file), quote=F, col.names=T, row.names=F, sep="\t") + write.table(subcloneres, gsub(".txt", "_1.txt", output.file), quote=F, col.names=T, row.names=F, sep="\t") # Scan the segments for cases that should be merged res = merge_segments(subcloneres, BAFvals, LogRvals, rho, psi, gamma) @@ -253,6 +246,16 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil res = determine_copynumber(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctrans.logR, maxdist, siglevel, noperms) subcloneres = res$subcloneres BAFpvals = res$BAFpvals + + # Scan for very high copy number segments and set those to NA - This is in part an artifact of small segments + res = mask_high_cn_segments(subcloneres, BAFvals, max_allowed_state) + subcloneres = res$subclones + BAFvals = res$bafsegmented + write.table(BAFvals, file=baf.segmented.file, sep="\t", row.names=F, col.names=T, quote=F) + # Write the masking details to file + masking_details = data.frame(samplename=sample.name, masked_count=res$masked_count, masked_size=res$masked_size, max_allowed_state=max_allowed_state) + write.table(masking_details, file=masking_output_file, quote=F, col.names=T, row.names=F, sep="\t") + # Write the final copy number profile write.table(subcloneres, output.file, quote=F, col.names=T, row.names=F, sep="\t") ################################################################################################ @@ -260,7 +263,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil ################################################################################################ # Collapse the BAFsegmented into breakpoints to be used in plotting segment_breakpoints = collapse_bafsegmented_to_segments(BAFvals) - if (!is.null(sv_breakpoints_file)) { + if (!is.null(sv_breakpoints_file) & !sv_breakpoints_file=="NA") { svs = read.table(sv_breakpoints_file, header=T, stringsAsFactors=F) } @@ -346,9 +349,6 @@ determine_copynumber = function(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctra BAFpvals = vector(length=length(BAFseg)) subcloneres = NULL - print(head(BAFvals)) - print(head(LogRvals)) - for (i in 1:length(BAFlevels)) { # subcloneres = rbind(subcloneres, fit_segment(BAFpos, LogRpos, BAFlevels, BAFphased, LogRvals, switchpoints, rho, psi, gamma, i)) l = BAFlevels[i] @@ -477,7 +477,6 @@ determine_copynumber = function(BAFvals, LogRvals, rho, psi, gamma, ctrans, ctra } } - print(head(subcloneres)) colnames(subcloneres) = c("chr","startpos","endpos","BAF","pval","LogR","ntot", "nMaj1_A","nMin1_A","frac1_A","nMaj2_A","nMin2_A","frac2_A","SDfrac_A","SDfrac_A_BS","frac1_A_0.025","frac1_A_0.975", "nMaj1_B","nMin1_B","frac1_B","nMaj2_B","nMin2_B","frac2_B","SDfrac_B","SDfrac_B_BS","frac1_B_0.025","frac1_B_0.975", @@ -515,6 +514,7 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm counter = 1 previous_merged = F + print("Merging segments") while(merged) { print(paste0("Iter: ",counter)) merged = F @@ -528,7 +528,6 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm # Don't do any double merging. This needs to happen at the next iteration, as we've just merged i with i-1 we don't add i again if (previous_merged) { - print("previous merged, skip next") previous_merged = F #subclones_cleaned = rbind(subclones_cleaned, subclones[i-1,]) next @@ -554,16 +553,12 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm # It would be nice to store the baf and logr values for this segment temporarily, but the memory allocation time is too great, therefore the selection happens a couple of times inline below if (not_subclonal & nmin_equals & nmaj_equals) { # MERGE - print("MERGING 3") - print(subclones[i-1,1:11]) - print(subclones[i,1:11]) - new_entry = data.frame(subclones[i-1,]) new_entry$endpos = subclones[i,]$endpos new_entry$BAF = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], - bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) + bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T) new_entry$LogR = mean(c(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3], - logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3])) + logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3]), na.rm=T) subclones_cleaned = rbind(subclones_cleaned, new_entry) # Set BAFseg to reflect the current segment @@ -603,16 +598,12 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm if (!baf_significant & nmin_curr==nmin_prev & nmaj_curr==nmaj_prev) { # MERGE - print("MERGING 4") - print(subclones[i-1,1:11]) - print(subclones[i,1:11]) - new_entry = data.frame(subclones[i-1,]) new_entry$endpos = subclones$endpos[i] new_entry$BAF = mean(c(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], - bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) + bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]]), na.rm=T) new_entry$LogR = mean(c(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3], - logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3])) + logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3]), na.rm=T) subclones_cleaned = rbind(subclones_cleaned, new_entry) # Set BAFseg to reflect the current segment @@ -638,6 +629,31 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm } return(list(bafsegmented=bafsegmented, subclones=subclones)) } + +#' Mask segments that have a too high CN state +#' @param subclones +#' @param bafsegmented +#' @param max_allowed_state +#' @return A list with the masked subclones, bafsegmented and the number of segments masked and their total genome size +#' @author sd11 +mask_high_cn_segments = function(subclones, bafsegmented, max_allowed_state) { + count = 0 + masked_size = 0 + for (i in 1:nrow(subclones)) { + if (subclones$nMaj1_A[i] > max_allowed_state | subclones$nMin1_A[i] > max_allowed_state) { + # Mask this segment + subclones[i, "nMaj1_A"] = NA + subclones[i, "nMin1_A"] = NA + subclones[i, "nMaj2_A"] = NA + subclones[i, "nMin2_A"] = NA + # Mask the BAFsegmented + bafsegmented[subclones$chr[i] == bafsegmented$Chromosome & subclones$startpos[i] < bafsegmented$Position & subclones$endpos[i] >= bafsegmented$Position,c("BAFseg")] = NA + count = count+1 + masked_size = masked_size + (subclones$endpos[i]-subclones$startpos[i]) + } + } + return(list(subclones=subclones, bafsegmented=bafsegmented, masked_count=count, masked_size=masked_size)) +} #' Plot the copy number genome wide in two different ways. This creates the Battenberg average diff --git a/R/haplotype.R b/R/haplotype.R index a03d438..2b9f677 100644 --- a/R/haplotype.R +++ b/R/haplotype.R @@ -47,7 +47,7 @@ GetChromosomeBAFs_SNP6 = function(chrom, alleleFreqFile, haplotypeFile, samplena #' @param minCounts An integer describing the minimum number of reads covering this position to be included in the output. #' @author dw9 #' @export -GetChromosomeBAFs = function(chrom, SNP_file, haplotypeFile, samplename, outfile, chr_names, minCounts=10) { +GetChromosomeBAFs = function(chrom, SNP_file, haplotypeFile, samplename, outfile, chr_names, minCounts=1) { # Read in the SNP and haplotype info snp_data = read.table(SNP_file, comment.char="", sep="\t", header=T, stringsAsFactors=F) variant_data = read.table(haplotypeFile, header=F) diff --git a/R/segmentation.R b/R/segmentation.R index 61fa31c..89c1a5c 100644 --- a/R/segmentation.R +++ b/R/segmentation.R @@ -102,9 +102,10 @@ segment.baf.phased = function(samplename, inputfile, outputfile, gamma=10, phase #' @param kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default 3) #' @param phasegamma Gamma parameter used when correcting phasing mistakes (Default 3) #' @param phasekmin Kmin parameter used when correcting phasing mistakes (Default 3) +#' @param no_segmentation Do not perform segmentation. This step will switch the haplotype blocks, but then just takes the mean BAFphased as BAFsegm #' @author sd11, dw9 #' @export -segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=10, phasegamma=3, kmin=3, phasekmin=3) { +segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=10, phasegamma=3, kmin=3, phasekmin=3, no_segmentation=F) { #' Function that takes SNPs that belong to a single segment and looks for big holes between #' each pair of SNPs. If there is a big hole it will add another breakpoint to the breakpoints data.frame @@ -195,8 +196,9 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 #' @param phasegamma #' @param kmin #' @param gamma + #' @param no_segmentation Do not perform segmentation. This step will switch the haplotype blocks, but then just takes the mean BAFphased as BAFsegm #' @return A data.frame with columns Chromosome,Position,BAF,BAFphased,BAFseg - run_pcf = function(BAFrawchr, presegment_chrom_start, presegment_chrom_end, phasekmin, phasegamma, kmin, gamma) { + run_pcf = function(BAFrawchr, presegment_chrom_start, presegment_chrom_end, phasekmin, phasegamma, kmin, gamma, no_segmentation=F) { row.indices = which(BAFrawchr$Position >= presegment_chrom_start & BAFrawchr$Position <= presegment_chrom_end) @@ -227,7 +229,7 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 BAFphased = ifelse(BAFsegm>0.5,BAF,1-BAF) - if(length(BAFphased)<50){ + if(length(BAFphased)<50 | no_segmentation){ BAFphseg = rep(mean(BAFphased),length(BAFphased)) }else{ res = Battenberg:::selectFastPcf(BAFphased,kmin,gamma*sdev,T) @@ -254,12 +256,10 @@ segment.baf.phased.sv = function(samplename, inputfile, outputfile, svs, gamma=1 svs_chrom = svs[svs$chromosome==chr,] breakpoints_chrom = svs_to_presegment_breakpoints(chr, svs_chrom, BAFrawchr, addin_bigholes=T) - print(breakpoints_chrom) BAFoutputchr = NULL for (r in 1:nrow(breakpoints_chrom)) { - print(breakpoints_chrom[r,]) - BAFoutput_preseg = run_pcf(BAFrawchr, breakpoints_chrom$start[r], breakpoints_chrom$end[r], phasekmin, phasegamma, kmin, gamma) + BAFoutput_preseg = run_pcf(BAFrawchr, breakpoints_chrom$start[r], breakpoints_chrom$end[r], phasekmin, phasegamma, kmin, gamma, no_segmentation) BAFoutputchr = rbind(BAFoutputchr, BAFoutput_preseg) } diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R index 8435fc4..3c4e41f 100644 --- a/inst/example/battenberg_snp6.R +++ b/inst/example/battenberg_snp6.R @@ -4,7 +4,7 @@ NORMALCEL = toString(args[2]) TUMOURCEL = toString(args[3]) RUN_DIR = toString(args[4]) -library(Battenberg, lib="~/R/x86_64-unknown-linux-gnu-library/3.2/") +library(Battenberg) library(doParallel) ############################################################################### @@ -42,9 +42,10 @@ PHASING_GAMMA = 1 SEGMENTATION_GAMMA = 10 CLONALITY_DIST_METRIC = 0 ASCAT_DIST_METRIC = 1 -MIN_PLOIDY = 1.6 -MAX_PLOIDY = 4.8 -MIN_RHO = 0.1 +MIN_PLOIDY = 1.6 #1.6 +MAX_PLOIDY = 4.8 #4.8 +MIN_RHO = 0.13 #0.1 +MAX_RHO = 1.02 #1 MIN_GOODNESS_OF_FIT = 0.63 BALANCED_THRESHOLD = 0.51 MIN_NORMAL_DEPTH = 10 @@ -128,7 +129,7 @@ foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.snp6","run chr_names=chrom_names) # Cleanup temp Impute output - unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep="")) + unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "_*K.txt*", sep="")) } # Kill the threads as from here its all single core @@ -141,6 +142,7 @@ combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), no.chrs=length(chrom_names)) # Segment the phased and haplotyped BAF data +# Call segment.baf.phased.sv when SVs are available segment.baf.phased(samplename=TUMOURNAME, inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), @@ -154,12 +156,13 @@ fit.copy.number(samplename=TUMOURNAME, outputfile.prefix=paste(TUMOURNAME, "_", sep=""), inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), - inputfile.logr=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), + inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), dist_choice=CLONALITY_DIST_METRIC, ascat_dist_choice=ASCAT_DIST_METRIC, min.ploidy=MIN_PLOIDY, max.ploidy=MAX_PLOIDY, min.rho=MIN_RHO, + max.rho=MAX_RHO, min.goodness=MIN_GOODNESS_OF_FIT, uninformative_BAF_threshold=BALANCED_THRESHOLD, gamma_param=PLATFORM_GAMMA, @@ -171,14 +174,17 @@ fit.copy.number(samplename=TUMOURNAME, # Go over all segments, determine which segements are a mixture of two states and fit a second CN state callSubclones(sample.name=TUMOURNAME, baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logr.file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), + logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), + masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""), chr_names=chrom_names, gamma=PLATFORM_GAMMA, segmentation.gamma=NA, siglevel=0.05, maxdist=0.01, - noperms=1000) + noperms=1000, + max_allowed_state=250, + sv_breakpoints_file=NULL) diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R index af0b84b..a3e28d5 100644 --- a/inst/example/battenberg_wgs.R +++ b/inst/example/battenberg_wgs.R @@ -10,8 +10,8 @@ library(Battenberg) library(doParallel) ############################################################################### -# 2015-04-12 -# A pure R Battenberg v2.0.0 SNP6 pipeline implementation. +# 2016-08-03 +# A pure R Battenberg v2.0.1 WGS pipeline implementation. # sd11@sanger.ac.uk ############################################################################### @@ -37,12 +37,14 @@ PHASING_GAMMA = 1 SEGMENTATION_GAMMA = 10 CLONALITY_DIST_METRIC = 0 ASCAT_DIST_METRIC = 1 -MIN_PLOIDY = 1.6 -MAX_PLOIDY = 4.8 -MIN_RHO = 0.1 +MIN_PLOIDY = 1.6 #1.6 +MAX_PLOIDY = 4.8 #4.8 +MIN_RHO = 0.13 #0.1 +MAX_RHO = 1.02 #1 MIN_GOODNESS_OF_FIT = 0.63 BALANCED_THRESHOLD = 0.51 MIN_NORMAL_DEPTH = 10 +MIN_TUMOUR_DEPTH = 1 MIN_BASE_QUAL = 20 MIN_MAP_QUAL = 35 @@ -132,7 +134,7 @@ foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run. samplename=TUMOURNAME, outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), chr_names=chrom_names, - minCounts=MIN_NORMAL_DEPTH) + minCounts=MIN_TUMOUR_DEPTH) # Plot what we have until this point plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), @@ -142,7 +144,7 @@ foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run. chr_names=chrom_names) # Cleanup temp Impute output - unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep="")) + unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "_*K.txt*", sep="")) } # Kill the threads as from here its all single core @@ -155,6 +157,7 @@ combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), no.chrs=length(chrom_names)) # Segment the phased and haplotyped BAF data +# Call segment.baf.phased.sv when SVs are available segment.baf.phased(samplename=TUMOURNAME, inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), @@ -174,6 +177,7 @@ fit.copy.number(samplename=TUMOURNAME, min.ploidy=MIN_PLOIDY, max.ploidy=MAX_PLOIDY, min.rho=MIN_RHO, + max.rho=MAX_RHO, min.goodness=MIN_GOODNESS_OF_FIT, uninformative_BAF_threshold=BALANCED_THRESHOLD, gamma_param=PLATFORM_GAMMA, @@ -190,9 +194,12 @@ callSubclones(sample.name=TUMOURNAME, output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), + masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""), chr_names=chrom_names, gamma=PLATFORM_GAMMA, segmentation.gamma=NA, siglevel=0.05, maxdist=0.01, - noperms=1000) + noperms=1000, + max_allowed_state=250, + sv_breakpoints_file=NULL) diff --git a/inst/example/parse_svs.R b/inst/example/parse_svs.R index 8899d79..5359245 100644 --- a/inst/example/parse_svs.R +++ b/inst/example/parse_svs.R @@ -4,12 +4,14 @@ library(VariantAnnotation) parse_brass_svs = function(vcffile, outfile, ref_genome="hg19") { svs = parse_svs_1(vcffile, ref_genome=ref_genome) write_svs(svs, outfile) + return(svs) } #' Parses ICGC consensus SV calls into a dataframe with a line for each SV and two columns: chromosome and position parse_icgc_consensus_svs = function(vcffile, outfile, ref_genome="hg19") { svs = parse_svs_1(vcffile, ref_genome=ref_genome) write_svs(svs, outfile) + return(svs) } #' Helper function that writes the given SVs to file @@ -43,5 +45,30 @@ parse_svs_1 = function(vcffile, ref_genome="hg19") { output = with(output, output[order(chromosome, position),]) output$chromosome = as.character(output$chromosome) output = unique(output) + output = output[with(output, order(chromosome, position)),] return(output) } + +#' Helper function that works on cases where SVs have been encoded as such: +#' +#' #CHROM POS ... INFO +#' 1 123 ... ...CHR2=1;END=143274758... +#' 1 234 ... ...CHR2=1;END=143274758... +#' 1 280 ... ...CHR2=1;END=143274758... +parse_svs_2 = function(vcffile, ref_genome="hg19") { + v = readVcf(vcffile, ref_genome) + output = data.frame(chromosome=seqnames(v), position=start(v)) + output = rbind(output, data.frame(chromosome=info(v)$CHR2, position=info(v)$END)) + return(output) +} + +parse_delly_svs = function(vcffile, outfile, ref_genome="hg19") { + svs = parse_svs_2(vcffile, ref_genome) + write_svs(svs, outfile) + return(svs) +} + + + + + diff --git a/man/GetChromosomeBAFs.Rd b/man/GetChromosomeBAFs.Rd index a530687..9dd3bc8 100644 --- a/man/GetChromosomeBAFs.Rd +++ b/man/GetChromosomeBAFs.Rd @@ -5,7 +5,7 @@ \title{Morphs phased SNPs from WGS input into haplotype blocks} \usage{ GetChromosomeBAFs(chrom, SNP_file, haplotypeFile, samplename, outfile, - chr_names, minCounts = 10) + chr_names, minCounts = 1) } \arguments{ \item{chrom}{The chromosome number for which this function is called.} diff --git a/man/callSubclones.Rd b/man/callSubclones.Rd index 6e5eed1..bb72cd2 100644 --- a/man/callSubclones.Rd +++ b/man/callSubclones.Rd @@ -6,9 +6,9 @@ \usage{ callSubclones(sample.name, baf.segmented.file, logr.file, rho.psi.file, output.file, output.figures.prefix, output.gw.figures.prefix, chr_names, - sv_breakpoints_file = NULL, gamma = 1, segmentation.gamma = NA, - siglevel = 0.05, maxdist = 0.01, noperms = 1000, - seed = as.integer(Sys.time())) + masking_output_file, max_allowed_state = 100, sv_breakpoints_file = NULL, + gamma = 1, segmentation.gamma = NA, siglevel = 0.05, maxdist = 0.01, + noperms = 1000, seed = as.integer(Sys.time())) } \arguments{ \item{sample.name}{Name of the sample, used in figures} @@ -27,6 +27,12 @@ callSubclones(sample.name, baf.segmented.file, logr.file, rho.psi.file, \item{chr_names}{Vector of allowed chromosome names} +\item{masking_output_file}{Filename of where the masking details need to be written. Masking is performed to remove very high copy number state segments} + +\item{max_allowed_state}{The maximum CN state allowed (Default 100)} + +\item{sv_breakpoints_file}{A two column file with breakpoints from structural variants. These are used when making the figures} + \item{gamma}{Technology specific scaling parameter for LogR (Default 1)} \item{segmentation.gamma}{Legacy parameter that is no longer used (Default NA)} diff --git a/man/fit.copy.number.Rd b/man/fit.copy.number.Rd index 3983bf2..199fbe5 100644 --- a/man/fit.copy.number.Rd +++ b/man/fit.copy.number.Rd @@ -6,7 +6,7 @@ \usage{ fit.copy.number(samplename, outputfile.prefix, inputfile.baf.segmented, inputfile.baf, inputfile.logr, dist_choice, ascat_dist_choice, - min.ploidy = 1.6, max.ploidy = 4.8, min.rho = 0.1, max.rho = 1.05, + min.ploidy = 1.6, max.ploidy = 4.8, min.rho = 0.1, max.rho = 1, min.goodness = 63, uninformative_BAF_threshold = 0.51, gamma_param = 1, use_preset_rho_psi = F, preset_rho = NA, preset_psi = NA, read_depth = 30) diff --git a/man/runASCAT.Rd b/man/runASCAT.Rd index ae03870..b1a82c5 100644 --- a/man/runASCAT.Rd +++ b/man/runASCAT.Rd @@ -6,8 +6,9 @@ \usage{ runASCAT(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choice, distancepng = NA, copynumberprofilespng = NA, nonroundedprofilepng = NA, - gamma = 0.55, allow100percent, reliabilityFile = NA, min.ploidy = 1.6, - max.ploidy = 4.8, min.rho = 0.1, max.rho = 1.05, min.goodness = 63, + cnaStatusFile = "copynumber_solution_status.txt", gamma = 0.55, + allow100percent, reliabilityFile = NA, min.ploidy = 1.6, + max.ploidy = 4.8, min.rho = 0.1, max.rho = 1, min.goodness = 63, uninformative_BAF_threshold = 0.51) } \arguments{ @@ -29,6 +30,8 @@ runASCAT(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choice, \item{nonroundedprofilepng}{if NA: copy number profile before rounding is plotted (total copy number as well as the copy number of the minor allele), if filename is given, the plot is written to a .png file (Default NA)} +\item{cnaStatusFile}{File where the copy number profile status is written to. This contains either the message "No suitable copy number solution found" or "X copy number solutions found" (Default copynumber_solution_status.txt)} + \item{gamma}{technology parameter, compaction of Log R profiles (expected decrease in case of deletion in diploid sample, 100 "\%" aberrant cells; 1 in ideal case, 0.55 of Illumina 109K arrays) (Default 0.55)} \item{allow100percent}{A boolean whether to allow a 100"\%" cellularity solution} @@ -41,6 +44,8 @@ runASCAT(lrr, baf, lrrsegmented, bafsegmented, chromosomes, dist_choice, \item{min.rho}{The minimum cellularity to consider (Default 0.1)} +\item{max.rho}{The maximum cellularity to consider (Default 1.0)} + \item{min.goodness}{The minimum goodness of fit for a solution to have to be considered (Default 63)} \item{uninformative_BAF_threshold}{The threshold beyond which BAF becomes uninformative (Default 0.51)} diff --git a/man/segment.baf.phased.sv.Rd b/man/segment.baf.phased.sv.Rd index 4a22fde..fa89b68 100644 --- a/man/segment.baf.phased.sv.Rd +++ b/man/segment.baf.phased.sv.Rd @@ -5,7 +5,7 @@ \title{Segment BAF with the inclusion of structural variant breakpoints} \usage{ segment.baf.phased.sv(samplename, inputfile, outputfile, svs, gamma = 10, - phasegamma = 3, kmin = 3, phasekmin = 3) + phasegamma = 3, kmin = 3, phasekmin = 3, no_segmentation = F) } \arguments{ \item{samplename}{Name of the sample, which is used to name output figures} @@ -23,6 +23,8 @@ segment.baf.phased.sv(samplename, inputfile, outputfile, svs, gamma = 10, \item{kmin}{Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default 3)} \item{phasekmin}{Kmin parameter used when correcting phasing mistakes (Default 3)} + +\item{no_segmentation}{Do not perform segmentation. This step will switch the haplotype blocks, but then just takes the mean BAFphased as BAFsegm} } \description{ This function takes the SV breakpoints as initial segments and runs PCF on each