From 2d59523132d16d19e1767956343197246b61508e Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Wed, 8 Jan 2020 09:41:55 -0500 Subject: [PATCH 01/10] Add the functions --- .../util/chr-break-calculate.R | 181 ++++++++++++++++++ .../util/chr-break-plot.R | 112 +++++++++++ 2 files changed, 293 insertions(+) create mode 100644 analyses/chromosomal-instability/util/chr-break-calculate.R create mode 100644 analyses/chromosomal-instability/util/chr-break-plot.R diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R new file mode 100644 index 0000000000..6aabc8a702 --- /dev/null +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -0,0 +1,181 @@ +# Functions for chromosomal instability plots +# +# C. Savonen for ALSF - CCDL +# +# 2020 + +make_granges <- function(break_df = NULL, + sample_id = NULL, + samples_col = "samples", + chrom_col = "chrom", + start_col = "start", + end_col = "end") { + # For a given breaks data.frame make a GenomicRanger object from it. Can + # filter to a single samples' data. + # + # Args: + # break_df: for a data.frame with chromosomal coordinates and sample IDs. + # sample_id: sample ID for which the data needs to be extracted and made into + # a GenomicRanges object. If "all" is designated, all samples will + # be kept and the data combined. The original data.frame is stored + # in elementMetadata@listData + # samples_col: character string that indicates the column name with the + # sample ID information. Default is "samples". + # chrom_col: character string that indicates the column name with the + # chromosome information. Default is "chrom". + # start_col: character string that indicates the column name with the + # start coordinate. Default is "start". + # end_col: character string that indicates the column name with the + # end coordinate. Default is "end". + # + # Returns: + # A Genomic Ranges formatted object for the particular sample that has the + # break points from the data.frame + + # If no data.frame is provided, stop + if (is.null(breaks_df)) { + stop("No breaks data.frame has been provided. ") + } + + # List the columns we need + needed_cols <- c(samples_col, chrom_col, start_col, end_col) + + # Get logical vector indicating which are in metadata + found_cols <- (needed_cols %in% colnames(break_df)) + + # If not all the columns are found, stop + if (!all(found_cols)) { + stop(cat( + "The following column names specified for making the GenomicRanges object: \n", + paste(needed_cols[which(!found_cols)], collapse = "\n"), + "\n ...were not found in the specified metadata data.frame.", + "Check your `_col` arguments." + )) + } + + if (sample_id != "all") { + # Extract samples' info from the data.frame + break_df <- break_df %>% + dplyr::filter(!!rlang::sym(samples_col) == sample_id) + + # Stop if the sample doesn't exist in the data.frame + if (nrow(break_df) == 0) { + stop("No sample data was found for the given `sample_id` argument.") + } + } + + # Make GRanges for CNV data + granges <- GenomicRanges::GRanges( + seqnames = dplyr::pull(break_df, !!rlang::sym(chrom_col)), + ranges = IRanges::IRanges( + start = dplyr::pull(break_df, !!rlang::sym(start_col)), + end = dplyr::pull(break_df, !!rlang::sym(end_col)) + ), + mcols = break_df + ) + + return(granges) +} + +break_density <- function(sv_breaks = NULL, + cnv_breaks = NULL, + sample_id = NULL, + window_size = 1e6, + chr_sizes_list = NULL, + samples_col_cnv = "samples", + chrom_col_cnv = "chrom", + start_col_cnv = "start", + end_col_cnv = "end", + samples_col_sv = "samples", + chrom_col_sv = "chrom", + start_col_sv = "start", + end_col_sv = "end") { + # For given breaks data.frame(s), calculate the breaks density for a tiled + # windows across the genome. Returns the data as a GenomicRanges object for + # easy mapping with ggbio. Where the density and counts are stored in + # @elementMetadata@listData. + # + # Args: + # sv_breaks: a data.frame with the breaks for the SV data. + # cnv_breaks: a data.frame with the breaks for the SV data. + # sample_id: sample ID for which the data needs to be extracted and made into + # a GenomicRanges object. If "all" is designated, all samples will + # be kept. + # chr_size_list: a named numeric vector of the sizes (bp) of the chromosomes. + # names of the chromosomes must match the format of the input + # break data. e.g. "chr1" or "1". + # window_size: What size windows to calculate break density. Default is 1kb. + # samples_col_sv/cnv: character string that indicates the column name with the + # sample ID information. Default is "samples". Will be passed to + # `make_granges`` function. + # chrom_col_sv/cnv: character string that indicates the column name with the + # chromosome information. Default is "chrom". Will be passed to + # `make_granges`` function. + # start_col_sv/cnv: character string that indicates the column name with the + # start coordinate. Default is "start". Will be passed to + # `make_granges`` function. + # end_col_sv/cnv: character string that indicates the column name with the + # end coordinate. Default is "end". Will be passed to + # `make_granges`` function. + # + # Check that a sample ID has been specified. + if (is.null(sample_id)) { + stop("No sample ID has been specified. Use the `sample_id` argument.") + } + + # Make CNV into GRanges + if (!is.null(cnv_breaks)) { + cnv_ranges <- make_granges(cnv_breaks, + sample_id = sample_id, + samples_col = samples_col_cnv, + chrom_col = chrom_col_cnv, + start_col = start_col_cnv, + end_col = end_col_cnv + ) + # This will get written over if there are both datasets + combo_ranges <- cnv_ranges + } + # Make SV into GRanges + if (!is.null(sv_breaks)) { + sv_ranges <- make_granges(sv_breaks, + sample_id = sample_id, + samples_col = samples_col_sv, + chrom_col = chrom_col_sv, + start_col = start_col_sv, + end_col = end_col_sv + ) + # This will get written over if there are both datasets + combo_ranges <- sv_ranges + } + + # If both datasets are given, reduce them into one. + if (!is.null(cnv_breaks) && !is.null(sv_breaks)) { + combo_ranges <- GenomicRanges::union( + cnv_ranges, + sv_ranges + ) + } + + # Create genome bins + bins <- GenomicRanges::tileGenome(chr_sizes_list, + tilewidth = window_size + ) + + # Uncompress GRangesList + bins <- unlist(bins) + + # Get counts for each genome bin + bin_counts <- GenomicRanges::countOverlaps( + bins, + combo_ranges + ) + + # Store count info + bins@elementMetadata@listData$counts <- bin_counts + + # Calculate and store density + bins@elementMetadata@listData$density <- bin_counts / window_size + + # Return the GRanges object for mapping purposes + return(bins) +} diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R new file mode 100644 index 0000000000..b516e6c467 --- /dev/null +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -0,0 +1,112 @@ +# Functions for chromosomal instability calculations +# +# C. Savonen for ALSF - CCDL +# +# 2020 + +map_density_plot <- function(granges, + y_val, + y_lab, + color, + main_title) { + # Given a GRanges object, plot it y value along its chromosomal mappings using + # ggbio. + # + # Args: + # granges: A Granges object to plot + # y_val: a character string of the columnname in listData spot of the + # GenomicRanges to plot on the y axis + # color: a color parameter + # y_lab: a character string to use for the ylabel. Will be passed to + # ggplot2::ylab argument. + # main_title: a character string to use for the main title. + # + # Returns: + # ggplot of chromosomal mapping of the y value given. + # + # For setting the scale later, need to get y's max + max_y <- max( + dplyr::pull( + data.frame(granges@elementMetadata@listData), + !!rlang::sym(y_val) + ) + ) + # Make the density plot + density_plot <- ggbio::autoplot(granges, ggplot2::aes(y = !!rlang::sym(y_val)), + geom = "line", scales = "free_x", space = "free_x", + colour = color + ) + + ggplot2::theme_classic() + + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 3, angle = 45, hjust = 1)) + + ggplot2::ylab(y_lab) + + ggplot2::ggtitle(main_title) + + ggplot2::scale_y_continuous(breaks = seq(0, max_y, by = 2)) + + # Print out plot + density_plot@ggplot +} + +chr_break_plot <- function(granges_list) { + # A wrapper function to make a 3 row chromosomal map plot for a set of GRanges + # objects that contain common_density, cnv_density, and sv_density. + # + # Args: + # granges_list: A list of Granges object to plot as a combination plot + # + # Returns: + # ggplot of chromosomal mapping of the y value given. + # + # Get the sample name + sample_name <- common_samples[parent.frame()$i[]] + + # Make combined SV and CNV plot + common_plot <- map_density_plot(sample_densities$common_density, + y_val = "counts", + y_lab = "Breaks per Mb", + color = "blue", + main_title = "Common Breaks" + ) + + # Make CNV plot + cnv_plot <- map_density_plot(sample_densities$cnv_density, + y_val = "counts", + y_lab = "Breaks per Mb", + color = "darkgreen", + main_title = "CNV Breaks" + ) + + # Make SV plot + sv_plot <- map_density_plot(sample_densities$sv_density, + y_val = "counts", + y_lab = "Breaks per Mb", + color = "orange", + main_title = "SV Breaks" + ) + + # Make a title + title <- cowplot::ggdraw() + + cowplot::draw_label(paste(sample_name, " - Chromosomal Break Density"), + fontface = "bold", x = .4, hjust = 0, size = 12 + ) + + # Put all plots and title together + full_plot <- cowplot::plot_grid(title, + common_plot, + cnv_plot, + sv_plot, + nrow = 4, + axis = "b", + rel_heights = c(2, 5, 5, 5) + ) + + # Save plot to PNG + cowplot::save_plot( + plot = full_plot, + filename = file.path(plots_dir, paste0(sample_name, "_breaks.png")), + base_height = 7, + base_width = 20 + ) + + # Print out the plot while we are here + full_plot +} From 6951c98714c1ce5937413625386a3082637cc0a7 Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Wed, 8 Jan 2020 09:58:49 -0500 Subject: [PATCH 02/10] Edited a messed up comment --- analyses/chromosomal-instability/util/chr-break-calculate.R | 2 +- analyses/chromosomal-instability/util/chr-break-plot.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index 6aabc8a702..ebf4e85ba8 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability plots +# Functions for chromosomal instability breakpoint calculations. # # C. Savonen for ALSF - CCDL # diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R index b516e6c467..f1b556b8d1 100644 --- a/analyses/chromosomal-instability/util/chr-break-plot.R +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability calculations +# Functions for chromosomal instability plots # # C. Savonen for ALSF - CCDL # From 9e0d321888c2ab5c99f78e86f24289a1ef6b6626 Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Wed, 8 Jan 2020 10:39:41 -0500 Subject: [PATCH 03/10] Fixed some issues with the functions --- .../util/chr-break-calculate.R | 24 +++++++++++------ .../util/chr-break-plot.R | 27 ++++++++++--------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index ebf4e85ba8..964a3f6675 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability breakpoint calculations. +# Functions for chromosomal instability plots # # C. Savonen for ALSF - CCDL # @@ -33,7 +33,7 @@ make_granges <- function(break_df = NULL, # break points from the data.frame # If no data.frame is provided, stop - if (is.null(breaks_df)) { + if (is.null(break_df)) { stop("No breaks data.frame has been provided. ") } @@ -56,7 +56,7 @@ make_granges <- function(break_df = NULL, if (sample_id != "all") { # Extract samples' info from the data.frame break_df <- break_df %>% - dplyr::filter(!!rlang::sym(samples_col) == sample_id) + dplyr::filter(c(sample_id) %in% !!rlang::sym(samples_col)) # Stop if the sample doesn't exist in the data.frame if (nrow(break_df) == 0) { @@ -100,7 +100,7 @@ break_density <- function(sv_breaks = NULL, # cnv_breaks: a data.frame with the breaks for the SV data. # sample_id: sample ID for which the data needs to be extracted and made into # a GenomicRanges object. If "all" is designated, all samples will - # be kept. + # be kept. Multiple samples can be designated as a character vector. # chr_size_list: a named numeric vector of the sizes (bp) of the chromosomes. # names of the chromosomes must match the format of the input # break data. e.g. "chr1" or "1". @@ -122,10 +122,14 @@ break_density <- function(sv_breaks = NULL, if (is.null(sample_id)) { stop("No sample ID has been specified. Use the `sample_id` argument.") } - + + # Determine how many samples are in the group + n_samples <- length(sample_id) + # Make CNV into GRanges if (!is.null(cnv_breaks)) { - cnv_ranges <- make_granges(cnv_breaks, + cnv_ranges <- make_granges( + break_df = cnv_breaks, sample_id = sample_id, samples_col = samples_col_cnv, chrom_col = chrom_col_cnv, @@ -137,7 +141,8 @@ break_density <- function(sv_breaks = NULL, } # Make SV into GRanges if (!is.null(sv_breaks)) { - sv_ranges <- make_granges(sv_breaks, + sv_ranges <- make_granges( + break_df = sv_breaks, sample_id = sample_id, samples_col = samples_col_sv, chrom_col = chrom_col_sv, @@ -171,8 +176,11 @@ break_density <- function(sv_breaks = NULL, ) # Store count info - bins@elementMetadata@listData$counts <- bin_counts + bins@elementMetadata@listData$total_counts <- bin_counts + # Store average count info + bins@elementMetadata@listData$avg_counts <- bin_counts / n_samples + # Calculate and store density bins@elementMetadata@listData$density <- bin_counts / window_size diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R index f1b556b8d1..2c1063d976 100644 --- a/analyses/chromosomal-instability/util/chr-break-plot.R +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability plots +# Functions for chromosomal instability calculations # # C. Savonen for ALSF - CCDL # @@ -46,38 +46,39 @@ map_density_plot <- function(granges, density_plot@ggplot } -chr_break_plot <- function(granges_list) { +chr_break_plot <- function(granges_list, + plot_name, + y_val) { # A wrapper function to make a 3 row chromosomal map plot for a set of GRanges # objects that contain common_density, cnv_density, and sv_density. # # Args: # granges_list: A list of Granges object to plot as a combination plot + # plot_name: a character string specifying the plot + # y_val: to be passed to map_density plot for mapping. # # Returns: # ggplot of chromosomal mapping of the y value given. # - # Get the sample name - sample_name <- common_samples[parent.frame()$i[]] - # Make combined SV and CNV plot - common_plot <- map_density_plot(sample_densities$common_density, - y_val = "counts", + common_plot <- map_density_plot(granges_list$common_density, + y_val = y_val, y_lab = "Breaks per Mb", color = "blue", main_title = "Common Breaks" ) # Make CNV plot - cnv_plot <- map_density_plot(sample_densities$cnv_density, - y_val = "counts", + cnv_plot <- map_density_plot(granges_list$cnv_density, + y_val = y_val, y_lab = "Breaks per Mb", color = "darkgreen", main_title = "CNV Breaks" ) # Make SV plot - sv_plot <- map_density_plot(sample_densities$sv_density, - y_val = "counts", + sv_plot <- map_density_plot(granges_list$sv_density, + y_val = y_val, y_lab = "Breaks per Mb", color = "orange", main_title = "SV Breaks" @@ -85,7 +86,7 @@ chr_break_plot <- function(granges_list) { # Make a title title <- cowplot::ggdraw() + - cowplot::draw_label(paste(sample_name, " - Chromosomal Break Density"), + cowplot::draw_label(paste(plot_name, " - Chromosomal Break Density"), fontface = "bold", x = .4, hjust = 0, size = 12 ) @@ -102,7 +103,7 @@ chr_break_plot <- function(granges_list) { # Save plot to PNG cowplot::save_plot( plot = full_plot, - filename = file.path(plots_dir, paste0(sample_name, "_breaks.png")), + filename = file.path(plots_dir, paste0(plot_name, "_breaks.png")), base_height = 7, base_width = 20 ) From 5a34518d471013d7fe22b7105ad9968fb59f3378 Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Wed, 8 Jan 2020 14:21:15 -0500 Subject: [PATCH 04/10] The functions handle grouped data --- .../util/chr-break-calculate.R | 64 ++++++++++++++++--- .../util/chr-break-plot.R | 28 +++++--- 2 files changed, 74 insertions(+), 18 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index 964a3f6675..114acd19e8 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -53,10 +53,10 @@ make_granges <- function(break_df = NULL, )) } - if (sample_id != "all") { + if (sample_id[1] != "all") { # Extract samples' info from the data.frame break_df <- break_df %>% - dplyr::filter(c(sample_id) %in% !!rlang::sym(samples_col)) + dplyr::filter(!!rlang::sym(samples_col) %in% c(sample_id)) # Stop if the sample doesn't exist in the data.frame if (nrow(break_df) == 0) { @@ -71,7 +71,7 @@ make_granges <- function(break_df = NULL, start = dplyr::pull(break_df, !!rlang::sym(start_col)), end = dplyr::pull(break_df, !!rlang::sym(end_col)) ), - mcols = break_df + mcols = dplyr::pull(break_df, !!rlang::sym(samples_col)) ) return(granges) @@ -122,10 +122,10 @@ break_density <- function(sv_breaks = NULL, if (is.null(sample_id)) { stop("No sample ID has been specified. Use the `sample_id` argument.") } - + # Determine how many samples are in the group n_samples <- length(sample_id) - + # Make CNV into GRanges if (!is.null(cnv_breaks)) { cnv_ranges <- make_granges( @@ -159,7 +159,24 @@ break_density <- function(sv_breaks = NULL, cnv_ranges, sv_ranges ) - } + + # Carry over list data from sv_ranges + sv_overlaps <- GenomicRanges::findOverlaps(sv_ranges, combo_ranges) + + # Carry over list data from cnv_ranges + cnv_overlaps <- GenomicRanges::findOverlaps(cnv_ranges, combo_ranges) + + # Set up an empty list where we can store what sample each sequence came from + combo_ranges@elementMetadata@listData$samples <- rep(NA, length(combo_ranges)) + + # Bring over CNV samples + combo_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- + cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to] + + # Bring over SV samples + combo_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- + cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to] + } # Create genome bins bins <- GenomicRanges::tileGenome(chr_sizes_list, @@ -174,13 +191,40 @@ break_density <- function(sv_breaks = NULL, bins, combo_ranges ) - + ########################### Calculate summary stats ########################## + # Get a per sample break down if there is more than one sample + if (n_samples > 1) { + + # Get counts for each genome bin + bin_indices <- GenomicRanges::findOverlaps( + bins, + combo_ranges + ) + + # Get list of samples + bin_samples <- combo_ranges@elementMetadata@listData$mcols[bin_indices@to] + + # Make a matrix that has the number of breaks per sample for each bin + freq_per_bin <- table(bin_indices@from, bin_samples) %>% + data.frame() %>% + tidyr::spread(Var1, Freq) %>% + tibble::column_to_rownames("bin_samples") %>% + t() + + # Calculate the median breaks per bin + median_counts <- apply(freq_per_bin, 1, median) + + # Store the median break counts, some bins data may be dropped off so we need + # to start with NAs and then fill in the data based on the indices. + bins@elementMetadata@listData$median_counts <- rep(NA, length(bins)) + bins@elementMetadata@listData$median_counts[as.numeric(names(median_counts))] <- median_counts + + # Store average count info + bins@elementMetadata@listData$avg_counts <- bin_counts / n_samples + } # Store count info bins@elementMetadata@listData$total_counts <- bin_counts - # Store average count info - bins@elementMetadata@listData$avg_counts <- bin_counts / n_samples - # Calculate and store density bins@elementMetadata@listData$density <- bin_counts / window_size diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R index 2c1063d976..8b3a7b93bf 100644 --- a/analyses/chromosomal-instability/util/chr-break-plot.R +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -26,10 +26,11 @@ map_density_plot <- function(granges, # # For setting the scale later, need to get y's max max_y <- max( - dplyr::pull( - data.frame(granges@elementMetadata@listData), + data.frame(granges@elementMetadata@listData) %>% + dplyr::pull( !!rlang::sym(y_val) - ) + ), + na.rm = TRUE ) # Make the density plot density_plot <- ggbio::autoplot(granges, ggplot2::aes(y = !!rlang::sym(y_val)), @@ -48,7 +49,9 @@ map_density_plot <- function(granges, chr_break_plot <- function(granges_list, plot_name, - y_val) { + y_val, + y_lab, + plot_dir) { # A wrapper function to make a 3 row chromosomal map plot for a set of GRanges # objects that contain common_density, cnv_density, and sv_density. # @@ -56,6 +59,8 @@ chr_break_plot <- function(granges_list, # granges_list: A list of Granges object to plot as a combination plot # plot_name: a character string specifying the plot # y_val: to be passed to map_density plot for mapping. + # y_lab: to be passed to map_density plot for y axis label + # plot_dir: a file path where you would like the plot PNG to be saved. # # Returns: # ggplot of chromosomal mapping of the y value given. @@ -63,7 +68,7 @@ chr_break_plot <- function(granges_list, # Make combined SV and CNV plot common_plot <- map_density_plot(granges_list$common_density, y_val = y_val, - y_lab = "Breaks per Mb", + y_lab = y_lab, color = "blue", main_title = "Common Breaks" ) @@ -71,7 +76,7 @@ chr_break_plot <- function(granges_list, # Make CNV plot cnv_plot <- map_density_plot(granges_list$cnv_density, y_val = y_val, - y_lab = "Breaks per Mb", + y_lab = y_lab, color = "darkgreen", main_title = "CNV Breaks" ) @@ -79,7 +84,7 @@ chr_break_plot <- function(granges_list, # Make SV plot sv_plot <- map_density_plot(granges_list$sv_density, y_val = y_val, - y_lab = "Breaks per Mb", + y_lab = y_lab, color = "orange", main_title = "SV Breaks" ) @@ -103,7 +108,7 @@ chr_break_plot <- function(granges_list, # Save plot to PNG cowplot::save_plot( plot = full_plot, - filename = file.path(plots_dir, paste0(plot_name, "_breaks.png")), + filename = file.path(plot_dir, paste0(plot_name, "_breaks.png")), base_height = 7, base_width = 20 ) @@ -111,3 +116,10 @@ chr_break_plot <- function(granges_list, # Print out the plot while we are here full_plot } + +# Make the combo plot +chr_break_plot(granges_list = granges_list, + plot_name = group_name, + y_val = "median_counts", + y_lab = "Avg Breaks per Mb", + plot_dir = hist_plots_dir) From 0e7cbbb7ec510e684ffe11c20483723c93d8a89e Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Wed, 8 Jan 2020 14:47:05 -0500 Subject: [PATCH 05/10] Got rid of a development remnant I found --- analyses/chromosomal-instability/util/chr-break-plot.R | 7 ------- 1 file changed, 7 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R index 8b3a7b93bf..f83cf7f790 100644 --- a/analyses/chromosomal-instability/util/chr-break-plot.R +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -116,10 +116,3 @@ chr_break_plot <- function(granges_list, # Print out the plot while we are here full_plot } - -# Make the combo plot -chr_break_plot(granges_list = granges_list, - plot_name = group_name, - y_val = "median_counts", - y_lab = "Avg Breaks per Mb", - plot_dir = hist_plots_dir) From 94560dad4f313fb69c2b4e07b602cf84680959de Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Wed, 8 Jan 2020 14:47:50 -0500 Subject: [PATCH 06/10] minor typo fixes --- .../util/chr-break-calculate.R | 30 +++++++++---------- .../util/chr-break-plot.R | 12 ++++---- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index 114acd19e8..1708dab5a8 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability plots +# Functions for chromosomal instability calculations # # C. Savonen for ALSF - CCDL # @@ -10,7 +10,7 @@ make_granges <- function(break_df = NULL, chrom_col = "chrom", start_col = "start", end_col = "end") { - # For a given breaks data.frame make a GenomicRanger object from it. Can + # For a given breaks data.frame make a GenomicRanges object from it. Can # filter to a single samples' data. # # Args: @@ -92,7 +92,7 @@ break_density <- function(sv_breaks = NULL, end_col_sv = "end") { # For given breaks data.frame(s), calculate the breaks density for a tiled # windows across the genome. Returns the data as a GenomicRanges object for - # easy mapping with ggbio. Where the density and counts are stored in + # easy mapping with `ggbio`. Where the density and counts are stored in # @elementMetadata@listData. # # Args: @@ -159,7 +159,7 @@ break_density <- function(sv_breaks = NULL, cnv_ranges, sv_ranges ) - + # Carry over list data from sv_ranges sv_overlaps <- GenomicRanges::findOverlaps(sv_ranges, combo_ranges) @@ -194,31 +194,31 @@ break_density <- function(sv_breaks = NULL, ########################### Calculate summary stats ########################## # Get a per sample break down if there is more than one sample if (n_samples > 1) { - + # Get counts for each genome bin bin_indices <- GenomicRanges::findOverlaps( bins, combo_ranges ) - - # Get list of samples - bin_samples <- combo_ranges@elementMetadata@listData$mcols[bin_indices@to] - + + # Get list of samples + bin_samples <- combo_ranges@elementMetadata@listData$mcols[bin_indices@to] + # Make a matrix that has the number of breaks per sample for each bin - freq_per_bin <- table(bin_indices@from, bin_samples) %>% + freq_per_bin <- table(bin_indices@from, bin_samples) %>% data.frame() %>% - tidyr::spread(Var1, Freq) %>% + tidyr::spread(Var1, Freq) %>% tibble::column_to_rownames("bin_samples") %>% t() - + # Calculate the median breaks per bin median_counts <- apply(freq_per_bin, 1, median) - - # Store the median break counts, some bins data may be dropped off so we need + + # Store the median break counts, some bins data may be dropped off so we need # to start with NAs and then fill in the data based on the indices. bins@elementMetadata@listData$median_counts <- rep(NA, length(bins)) bins@elementMetadata@listData$median_counts[as.numeric(names(median_counts))] <- median_counts - + # Store average count info bins@elementMetadata@listData$avg_counts <- bin_counts / n_samples } diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R index f83cf7f790..96145ed9f9 100644 --- a/analyses/chromosomal-instability/util/chr-break-plot.R +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability calculations +# Functions for chromosomal instability plots # # C. Savonen for ALSF - CCDL # @@ -26,10 +26,10 @@ map_density_plot <- function(granges, # # For setting the scale later, need to get y's max max_y <- max( - data.frame(granges@elementMetadata@listData) %>% + data.frame(granges@elementMetadata@listData) %>% dplyr::pull( !!rlang::sym(y_val) - ), + ), na.rm = TRUE ) # Make the density plot @@ -49,8 +49,8 @@ map_density_plot <- function(granges, chr_break_plot <- function(granges_list, plot_name, - y_val, - y_lab, + y_val, + y_lab, plot_dir) { # A wrapper function to make a 3 row chromosomal map plot for a set of GRanges # objects that contain common_density, cnv_density, and sv_density. @@ -58,7 +58,7 @@ chr_break_plot <- function(granges_list, # Args: # granges_list: A list of Granges object to plot as a combination plot # plot_name: a character string specifying the plot - # y_val: to be passed to map_density plot for mapping. + # y_val: to be passed to map_density plot for mapping. # y_lab: to be passed to map_density plot for y axis label # plot_dir: a file path where you would like the plot PNG to be saved. # From 252884f63fdf04165fa6a0b4955a245ddda714d1 Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Thu, 9 Jan 2020 08:34:03 -0500 Subject: [PATCH 07/10] Incorporate @sjspielman suggestions --- .../util/chr-break-calculate.R | 104 +++++++++++------- .../util/chr-break-plot.R | 25 +++-- 2 files changed, 76 insertions(+), 53 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index 1708dab5a8..b06c0ef008 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability calculations +# Functions for chromosomal instability plots # # C. Savonen for ALSF - CCDL # @@ -10,17 +10,20 @@ make_granges <- function(break_df = NULL, chrom_col = "chrom", start_col = "start", end_col = "end") { - # For a given breaks data.frame make a GenomicRanges object from it. Can + # For a given breaks data.frame make a GenomicRanger object from it. Can # filter to a single samples' data. # # Args: # break_df: for a data.frame with chromosomal coordinates and sample IDs. - # sample_id: sample ID for which the data needs to be extracted and made into - # a GenomicRanges object. If "all" is designated, all samples will - # be kept and the data combined. The original data.frame is stored - # in elementMetadata@listData + # sample_id: a character string that designates which data needs to be + # extracted and made intoa GenomicRanges object by matching the + # information in the designated sample_col. If "all" is designated, + # all samples will be kept. Multiple samples can be designated + # as a character vector. # samples_col: character string that indicates the column name with the - # sample ID information. Default is "samples". + # sample ID information. Default is "samples". This information + # will be store in the GenomicRanges object in + # `elementData@listData$samples` # chrom_col: character string that indicates the column name with the # chromosome information. Default is "chrom". # start_col: character string that indicates the column name with the @@ -92,22 +95,24 @@ break_density <- function(sv_breaks = NULL, end_col_sv = "end") { # For given breaks data.frame(s), calculate the breaks density for a tiled # windows across the genome. Returns the data as a GenomicRanges object for - # easy mapping with `ggbio`. Where the density and counts are stored in + # easy mapping with ggbio. Where the density and counts are stored in # @elementMetadata@listData. # # Args: # sv_breaks: a data.frame with the breaks for the SV data. - # cnv_breaks: a data.frame with the breaks for the SV data. - # sample_id: sample ID for which the data needs to be extracted and made into - # a GenomicRanges object. If "all" is designated, all samples will - # be kept. Multiple samples can be designated as a character vector. + # cnv_breaks: a data.frame with the breaks for the CNV data. + # sample_id: a character string that designates which data needs to be + # extracted and made intoa GenomicRanges object by matching the + # information in the designated sample_col. If "all" is designated, + # all samples will be kept. Multiple samples can be designated + # as a character vector. # chr_size_list: a named numeric vector of the sizes (bp) of the chromosomes. # names of the chromosomes must match the format of the input # break data. e.g. "chr1" or "1". - # window_size: What size windows to calculate break density. Default is 1kb. + # window_size: What size windows to calculate break density. Default is 1Mb. # samples_col_sv/cnv: character string that indicates the column name with the # sample ID information. Default is "samples". Will be passed to - # `make_granges`` function. + # `make_granges` function. # chrom_col_sv/cnv: character string that indicates the column name with the # chromosome information. Default is "chrom". Will be passed to # `make_granges`` function. @@ -116,18 +121,20 @@ break_density <- function(sv_breaks = NULL, # `make_granges`` function. # end_col_sv/cnv: character string that indicates the column name with the # end coordinate. Default is "end". Will be passed to - # `make_granges`` function. + # `make_granges` function. # # Check that a sample ID has been specified. if (is.null(sample_id)) { - stop("No sample ID has been specified. Use the `sample_id` argument.") + stop("No sample ID(s) have been specified. Use the `sample_id` argument.") } # Determine how many samples are in the group n_samples <- length(sample_id) - # Make CNV into GRanges - if (!is.null(cnv_breaks)) { + # If both datasets are given, make them into one and use this + if (!is.null(cnv_breaks) & !is.null(sv_breaks)) { + + # Read in CNV data cnv_ranges <- make_granges( break_df = cnv_breaks, sample_id = sample_id, @@ -135,12 +142,8 @@ break_density <- function(sv_breaks = NULL, chrom_col = chrom_col_cnv, start_col = start_col_cnv, end_col = end_col_cnv - ) - # This will get written over if there are both datasets - combo_ranges <- cnv_ranges - } - # Make SV into GRanges - if (!is.null(sv_breaks)) { + + # Read in SV data sv_ranges <- make_granges( break_df = sv_breaks, sample_id = sample_id, @@ -148,37 +151,54 @@ break_density <- function(sv_breaks = NULL, chrom_col = chrom_col_sv, start_col = start_col_sv, end_col = end_col_sv - ) - # This will get written over if there are both datasets - combo_ranges <- sv_ranges - } - - # If both datasets are given, reduce them into one. - if (!is.null(cnv_breaks) && !is.null(sv_breaks)) { - combo_ranges <- GenomicRanges::union( + ) + # Combine datasets + break_ranges <- GenomicRanges::union( cnv_ranges, sv_ranges ) # Carry over list data from sv_ranges - sv_overlaps <- GenomicRanges::findOverlaps(sv_ranges, combo_ranges) + sv_overlaps <- GenomicRanges::findOverlaps(sv_ranges, break_ranges) # Carry over list data from cnv_ranges - cnv_overlaps <- GenomicRanges::findOverlaps(cnv_ranges, combo_ranges) + cnv_overlaps <- GenomicRanges::findOverlaps(cnv_ranges, break_ranges) # Set up an empty list where we can store what sample each sequence came from - combo_ranges@elementMetadata@listData$samples <- rep(NA, length(combo_ranges)) + break_ranges@elementMetadata@listData$samples <- rep(NA, length(break_ranges)) # Bring over CNV samples - combo_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- + break_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to] # Bring over SV samples - combo_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- + break_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to] - } - # Create genome bins + } else if (!is.null(cnv_breaks) & is.null(sv_breaks)) { + # If only the CNV data is given, use this data only + break_ranges <- make_granges( + break_df = cnv_breaks, + sample_id = sample_id, + samples_col = samples_col_cnv, + chrom_col = chrom_col_cnv, + start_col = start_col_cnv, + end_col = end_col_cnv + ) + } else if (!is.null(sv_breaks) & is.null(cnv_breaks)) { + break_ranges <- make_granges( + break_df = sv_breaks, + sample_id = sample_id, + samples_col = samples_col_sv, + chrom_col = chrom_col_sv, + start_col = start_col_sv, + end_col = end_col_sv + ) + } else if (is.null(sv_breaks) & is.null(cnv_breaks)) { + stop("No data has been provided in either the `sv_break` or `cnv_break` arguments.") + } + + ######################### Tally breaks by genome bins ######################## bins <- GenomicRanges::tileGenome(chr_sizes_list, tilewidth = window_size ) @@ -189,7 +209,7 @@ break_density <- function(sv_breaks = NULL, # Get counts for each genome bin bin_counts <- GenomicRanges::countOverlaps( bins, - combo_ranges + break_ranges ) ########################### Calculate summary stats ########################## # Get a per sample break down if there is more than one sample @@ -198,11 +218,11 @@ break_density <- function(sv_breaks = NULL, # Get counts for each genome bin bin_indices <- GenomicRanges::findOverlaps( bins, - combo_ranges + break_ranges ) # Get list of samples - bin_samples <- combo_ranges@elementMetadata@listData$mcols[bin_indices@to] + bin_samples <- break_ranges@elementMetadata@listData$mcols[bin_indices@to] # Make a matrix that has the number of breaks per sample for each bin freq_per_bin <- table(bin_indices@from, bin_samples) %>% diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R index 96145ed9f9..67f658fdc5 100644 --- a/analyses/chromosomal-instability/util/chr-break-plot.R +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -1,4 +1,4 @@ -# Functions for chromosomal instability plots +# Functions for chromosomal instability calculations # # C. Savonen for ALSF - CCDL # @@ -23,13 +23,16 @@ map_density_plot <- function(granges, # # Returns: # ggplot of chromosomal mapping of the y value given. - # + + # For the y-axis ticks, default is to print every two + by_interval <- 2 + # For setting the scale later, need to get y's max max_y <- max( - data.frame(granges@elementMetadata@listData) %>% + data.frame(granges@elementMetadata@listData) %>% dplyr::pull( !!rlang::sym(y_val) - ), + ), na.rm = TRUE ) # Make the density plot @@ -41,24 +44,24 @@ map_density_plot <- function(granges, ggplot2::theme(axis.text.x = ggplot2::element_text(size = 3, angle = 45, hjust = 1)) + ggplot2::ylab(y_lab) + ggplot2::ggtitle(main_title) + - ggplot2::scale_y_continuous(breaks = seq(0, max_y, by = 2)) + ggplot2::scale_y_continuous(breaks = seq(0, max_y, by = by_interval)) # Print out plot density_plot@ggplot } -chr_break_plot <- function(granges_list, - plot_name, - y_val, - y_lab, - plot_dir) { +multipanel_break_plot <- function(granges_list, + plot_name, + y_val, + y_lab, + plot_dir) { # A wrapper function to make a 3 row chromosomal map plot for a set of GRanges # objects that contain common_density, cnv_density, and sv_density. # # Args: # granges_list: A list of Granges object to plot as a combination plot # plot_name: a character string specifying the plot - # y_val: to be passed to map_density plot for mapping. + # y_val: to be passed to map_density plot for mapping. # y_lab: to be passed to map_density plot for y axis label # plot_dir: a file path where you would like the plot PNG to be saved. # From 1a95792d0d1f734ec9ddf06de964ed51892ea2e2 Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Thu, 9 Jan 2020 08:34:59 -0500 Subject: [PATCH 08/10] Missed a paranthesis --- analyses/chromosomal-instability/util/chr-break-calculate.R | 1 + 1 file changed, 1 insertion(+) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index b06c0ef008..5435b988e4 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -142,6 +142,7 @@ break_density <- function(sv_breaks = NULL, chrom_col = chrom_col_cnv, start_col = start_col_cnv, end_col = end_col_cnv + ) # Read in SV data sv_ranges <- make_granges( From b0f9cb951dbfb5857f6a778d8ce84d2099012e71 Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Thu, 9 Jan 2020 08:36:49 -0500 Subject: [PATCH 09/10] re-linter --- .../util/chr-break-calculate.R | 3 +-- .../util/chr-break-plot.R | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index 5435b988e4..c87020c7d4 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -152,7 +152,7 @@ break_density <- function(sv_breaks = NULL, chrom_col = chrom_col_sv, start_col = start_col_sv, end_col = end_col_sv - ) + ) # Combine datasets break_ranges <- GenomicRanges::union( cnv_ranges, @@ -175,7 +175,6 @@ break_density <- function(sv_breaks = NULL, # Bring over SV samples break_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to] - } else if (!is.null(cnv_breaks) & is.null(sv_breaks)) { # If only the CNV data is given, use this data only break_ranges <- make_granges( diff --git a/analyses/chromosomal-instability/util/chr-break-plot.R b/analyses/chromosomal-instability/util/chr-break-plot.R index 67f658fdc5..be2ef1bf91 100644 --- a/analyses/chromosomal-instability/util/chr-break-plot.R +++ b/analyses/chromosomal-instability/util/chr-break-plot.R @@ -23,16 +23,16 @@ map_density_plot <- function(granges, # # Returns: # ggplot of chromosomal mapping of the y value given. - - # For the y-axis ticks, default is to print every two + + # For the y-axis ticks, default is to print every two by_interval <- 2 - + # For setting the scale later, need to get y's max max_y <- max( - data.frame(granges@elementMetadata@listData) %>% + data.frame(granges@elementMetadata@listData) %>% dplyr::pull( - !!rlang::sym(y_val) - ), + !!rlang::sym(y_val) + ), na.rm = TRUE ) # Make the density plot @@ -52,8 +52,8 @@ map_density_plot <- function(granges, multipanel_break_plot <- function(granges_list, plot_name, - y_val, - y_lab, + y_val, + y_lab, plot_dir) { # A wrapper function to make a 3 row chromosomal map plot for a set of GRanges # objects that contain common_density, cnv_density, and sv_density. @@ -61,7 +61,7 @@ multipanel_break_plot <- function(granges_list, # Args: # granges_list: A list of Granges object to plot as a combination plot # plot_name: a character string specifying the plot - # y_val: to be passed to map_density plot for mapping. + # y_val: to be passed to map_density plot for mapping. # y_lab: to be passed to map_density plot for y axis label # plot_dir: a file path where you would like the plot PNG to be saved. # From f595d31f051ab8f0cc06d7fbbc2cbfc13d655a3a Mon Sep 17 00:00:00 2001 From: Candace Savonen Date: Fri, 10 Jan 2020 12:29:06 -0500 Subject: [PATCH 10/10] Fix typos spotted by @jaclyn-taroni --- .../util/chr-break-calculate.R | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/analyses/chromosomal-instability/util/chr-break-calculate.R b/analyses/chromosomal-instability/util/chr-break-calculate.R index c87020c7d4..88a649a048 100644 --- a/analyses/chromosomal-instability/util/chr-break-calculate.R +++ b/analyses/chromosomal-instability/util/chr-break-calculate.R @@ -10,13 +10,13 @@ make_granges <- function(break_df = NULL, chrom_col = "chrom", start_col = "start", end_col = "end") { - # For a given breaks data.frame make a GenomicRanger object from it. Can + # For a given breaks data.frame make a GenomicRanges object from it. Can # filter to a single samples' data. # # Args: # break_df: for a data.frame with chromosomal coordinates and sample IDs. # sample_id: a character string that designates which data needs to be - # extracted and made intoa GenomicRanges object by matching the + # extracted and made into a GenomicRanges object by matching the # information in the designated sample_col. If "all" is designated, # all samples will be kept. Multiple samples can be designated # as a character vector. @@ -32,8 +32,8 @@ make_granges <- function(break_df = NULL, # end coordinate. Default is "end". # # Returns: - # A Genomic Ranges formatted object for the particular sample that has the - # break points from the data.frame + # A GenomicRanges object for the particular sample that has the breakpoints + # from the data.frame # If no data.frame is provided, stop if (is.null(break_df)) { @@ -160,21 +160,24 @@ break_density <- function(sv_breaks = NULL, ) # Carry over list data from sv_ranges - sv_overlaps <- GenomicRanges::findOverlaps(sv_ranges, break_ranges) + sv_overlaps <- suppressWarnings(GenomicRanges::findOverlaps(sv_ranges, + break_ranges)) # Carry over list data from cnv_ranges - cnv_overlaps <- GenomicRanges::findOverlaps(cnv_ranges, break_ranges) + cnv_overlaps <- suppressWarnings(GenomicRanges::findOverlaps(cnv_ranges, + break_ranges)) # Set up an empty list where we can store what sample each sequence came from - break_ranges@elementMetadata@listData$samples <- rep(NA, length(break_ranges)) + break_ranges@elementMetadata@listData$mcols <- rep(NA, length(break_ranges)) # Bring over CNV samples - break_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- + break_ranges@elementMetadata@listData$mcols[cnv_overlaps@from] <- cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to] # Bring over SV samples - break_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <- + break_ranges@elementMetadata@listData$mcols[cnv_overlaps@from] <- cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to] + } else if (!is.null(cnv_breaks) & is.null(sv_breaks)) { # If only the CNV data is given, use this data only break_ranges <- make_granges( @@ -207,19 +210,19 @@ break_density <- function(sv_breaks = NULL, bins <- unlist(bins) # Get counts for each genome bin - bin_counts <- GenomicRanges::countOverlaps( + bin_counts <- suppressWarnings(GenomicRanges::countOverlaps( bins, break_ranges - ) + )) ########################### Calculate summary stats ########################## # Get a per sample break down if there is more than one sample if (n_samples > 1) { # Get counts for each genome bin - bin_indices <- GenomicRanges::findOverlaps( + bin_indices <- suppressWarnings(GenomicRanges::findOverlaps( bins, break_ranges - ) + )) # Get list of samples bin_samples <- break_ranges@elementMetadata@listData$mcols[bin_indices@to]