Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Chromosomal Instability (PR 2 of 3): The functions #413

Merged
merged 14 commits into from
Jan 10, 2020
233 changes: 233 additions & 0 deletions analyses/chromosomal-instability/util/chr-break-calculate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# Functions for chromosomal instability calculations
#
# C. Savonen for ALSF - CCDL
#
# 2020

make_granges <- function(break_df = NULL,
sample_id = NULL,
samples_col = "samples",
chrom_col = "chrom",
start_col = "start",
end_col = "end") {
# For a given breaks data.frame make a GenomicRanges object from it. Can
# filter to a single samples' data.
#
# Args:
# break_df: for a data.frame with chromosomal coordinates and sample IDs.
# sample_id: sample ID for which the data needs to be extracted and made into
# a GenomicRanges object. If "all" is designated, all samples will
# be kept and the data combined. The original data.frame is stored
# in elementMetadata@listData
# samples_col: character string that indicates the column name with the
# sample ID information. Default is "samples".
# chrom_col: character string that indicates the column name with the
# chromosome information. Default is "chrom".
# start_col: character string that indicates the column name with the
# start coordinate. Default is "start".
# end_col: character string that indicates the column name with the
# end coordinate. Default is "end".
#
# Returns:
# A Genomic Ranges formatted object for the particular sample that has the
# break points from the data.frame

# If no data.frame is provided, stop
if (is.null(break_df)) {
stop("No breaks data.frame has been provided. ")
}

# List the columns we need
needed_cols <- c(samples_col, chrom_col, start_col, end_col)

# Get logical vector indicating which are in metadata
found_cols <- (needed_cols %in% colnames(break_df))

# If not all the columns are found, stop
if (!all(found_cols)) {
stop(cat(
"The following column names specified for making the GenomicRanges object: \n",
paste(needed_cols[which(!found_cols)], collapse = "\n"),
"\n ...were not found in the specified metadata data.frame.",
"Check your `_col` arguments."
))
}

if (sample_id[1] != "all") {
# Extract samples' info from the data.frame
break_df <- break_df %>%
dplyr::filter(!!rlang::sym(samples_col) %in% c(sample_id))

# Stop if the sample doesn't exist in the data.frame
if (nrow(break_df) == 0) {
stop("No sample data was found for the given `sample_id` argument.")
}
}

# Make GRanges for CNV data
granges <- GenomicRanges::GRanges(
seqnames = dplyr::pull(break_df, !!rlang::sym(chrom_col)),
ranges = IRanges::IRanges(
start = dplyr::pull(break_df, !!rlang::sym(start_col)),
end = dplyr::pull(break_df, !!rlang::sym(end_col))
),
mcols = dplyr::pull(break_df, !!rlang::sym(samples_col))
)

return(granges)
}

break_density <- function(sv_breaks = NULL,
cnv_breaks = NULL,
sample_id = NULL,
window_size = 1e6,
chr_sizes_list = NULL,
samples_col_cnv = "samples",
chrom_col_cnv = "chrom",
start_col_cnv = "start",
end_col_cnv = "end",
samples_col_sv = "samples",
chrom_col_sv = "chrom",
start_col_sv = "start",
end_col_sv = "end") {
# For given breaks data.frame(s), calculate the breaks density for a tiled
# windows across the genome. Returns the data as a GenomicRanges object for
# easy mapping with `ggbio`. Where the density and counts are stored in
# @elementMetadata@listData.
#
# Args:
# sv_breaks: a data.frame with the breaks for the SV data.
# cnv_breaks: a data.frame with the breaks for the SV data.
# sample_id: sample ID for which the data needs to be extracted and made into
# a GenomicRanges object. If "all" is designated, all samples will
# be kept. Multiple samples can be designated as a character vector.
# chr_size_list: a named numeric vector of the sizes (bp) of the chromosomes.
# names of the chromosomes must match the format of the input
# break data. e.g. "chr1" or "1".
# window_size: What size windows to calculate break density. Default is 1kb.
# samples_col_sv/cnv: character string that indicates the column name with the
# sample ID information. Default is "samples". Will be passed to
# `make_granges`` function.
# chrom_col_sv/cnv: character string that indicates the column name with the
# chromosome information. Default is "chrom". Will be passed to
# `make_granges`` function.
# start_col_sv/cnv: character string that indicates the column name with the
# start coordinate. Default is "start". Will be passed to
# `make_granges`` function.
# end_col_sv/cnv: character string that indicates the column name with the
# end coordinate. Default is "end". Will be passed to
# `make_granges`` function.
#
# Check that a sample ID has been specified.
if (is.null(sample_id)) {
stop("No sample ID has been specified. Use the `sample_id` argument.")
}

# Determine how many samples are in the group
n_samples <- length(sample_id)

# Make CNV into GRanges
if (!is.null(cnv_breaks)) {
cnv_ranges <- make_granges(
break_df = cnv_breaks,
sample_id = sample_id,
samples_col = samples_col_cnv,
chrom_col = chrom_col_cnv,
start_col = start_col_cnv,
end_col = end_col_cnv
)
# This will get written over if there are both datasets
combo_ranges <- cnv_ranges
}
# Make SV into GRanges
if (!is.null(sv_breaks)) {
sv_ranges <- make_granges(
break_df = sv_breaks,
sample_id = sample_id,
samples_col = samples_col_sv,
chrom_col = chrom_col_sv,
start_col = start_col_sv,
end_col = end_col_sv
)
# This will get written over if there are both datasets
combo_ranges <- sv_ranges
}

# If both datasets are given, reduce them into one.
if (!is.null(cnv_breaks) && !is.null(sv_breaks)) {
combo_ranges <- GenomicRanges::union(
cnv_ranges,
sv_ranges
)

# Carry over list data from sv_ranges
sv_overlaps <- GenomicRanges::findOverlaps(sv_ranges, combo_ranges)

# Carry over list data from cnv_ranges
cnv_overlaps <- GenomicRanges::findOverlaps(cnv_ranges, combo_ranges)

# Set up an empty list where we can store what sample each sequence came from
combo_ranges@elementMetadata@listData$samples <- rep(NA, length(combo_ranges))

# Bring over CNV samples
combo_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <-
cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to]

# Bring over SV samples
combo_ranges@elementMetadata@listData$samples[cnv_overlaps@from] <-
cnv_ranges@elementMetadata@listData$mcols[cnv_overlaps@to]
}

# Create genome bins
bins <- GenomicRanges::tileGenome(chr_sizes_list,
tilewidth = window_size
)

# Uncompress GRangesList
bins <- unlist(bins)

# Get counts for each genome bin
bin_counts <- GenomicRanges::countOverlaps(
bins,
combo_ranges
)
########################### Calculate summary stats ##########################
# Get a per sample break down if there is more than one sample
if (n_samples > 1) {

# Get counts for each genome bin
bin_indices <- GenomicRanges::findOverlaps(
bins,
combo_ranges
)

# Get list of samples
bin_samples <- combo_ranges@elementMetadata@listData$mcols[bin_indices@to]

# Make a matrix that has the number of breaks per sample for each bin
freq_per_bin <- table(bin_indices@from, bin_samples) %>%
data.frame() %>%
tidyr::spread(Var1, Freq) %>%
tibble::column_to_rownames("bin_samples") %>%
t()

# Calculate the median breaks per bin
median_counts <- apply(freq_per_bin, 1, median)

# Store the median break counts, some bins data may be dropped off so we need
# to start with NAs and then fill in the data based on the indices.
bins@elementMetadata@listData$median_counts <- rep(NA, length(bins))
bins@elementMetadata@listData$median_counts[as.numeric(names(median_counts))] <- median_counts

# Store average count info
bins@elementMetadata@listData$avg_counts <- bin_counts / n_samples
}
# Store count info
bins@elementMetadata@listData$total_counts <- bin_counts

# Calculate and store density
bins@elementMetadata@listData$density <- bin_counts / window_size

# Return the GRanges object for mapping purposes
return(bins)
}
118 changes: 118 additions & 0 deletions analyses/chromosomal-instability/util/chr-break-plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Functions for chromosomal instability plots
#
# C. Savonen for ALSF - CCDL
#
# 2020

map_density_plot <- function(granges,
y_val,
y_lab,
color,
main_title) {
# Given a GRanges object, plot it y value along its chromosomal mappings using
# ggbio.
#
# Args:
# granges: A Granges object to plot
# y_val: a character string of the columnname in listData spot of the
# GenomicRanges to plot on the y axis
# color: a color parameter
# y_lab: a character string to use for the ylabel. Will be passed to
# ggplot2::ylab argument.
# main_title: a character string to use for the main title.
#
# Returns:
# ggplot of chromosomal mapping of the y value given.
#
# For setting the scale later, need to get y's max
max_y <- max(
data.frame(granges@elementMetadata@listData) %>%
dplyr::pull(
!!rlang::sym(y_val)
),
na.rm = TRUE
)
# Make the density plot
density_plot <- ggbio::autoplot(granges, ggplot2::aes(y = !!rlang::sym(y_val)),
geom = "line", scales = "free_x", space = "free_x",
colour = color
) +
ggplot2::theme_classic() +
ggplot2::theme(axis.text.x = ggplot2::element_text(size = 3, angle = 45, hjust = 1)) +
ggplot2::ylab(y_lab) +
ggplot2::ggtitle(main_title) +
ggplot2::scale_y_continuous(breaks = seq(0, max_y, by = 2))

# Print out plot
density_plot@ggplot
}

chr_break_plot <- function(granges_list,
plot_name,
y_val,
y_lab,
plot_dir) {
# A wrapper function to make a 3 row chromosomal map plot for a set of GRanges
# objects that contain common_density, cnv_density, and sv_density.
#
# Args:
# granges_list: A list of Granges object to plot as a combination plot
# plot_name: a character string specifying the plot
# y_val: to be passed to map_density plot for mapping.
# y_lab: to be passed to map_density plot for y axis label
# plot_dir: a file path where you would like the plot PNG to be saved.
#
# Returns:
# ggplot of chromosomal mapping of the y value given.
#
# Make combined SV and CNV plot
common_plot <- map_density_plot(granges_list$common_density,
y_val = y_val,
y_lab = y_lab,
color = "blue",
main_title = "Common Breaks"
)

# Make CNV plot
cnv_plot <- map_density_plot(granges_list$cnv_density,
y_val = y_val,
y_lab = y_lab,
color = "darkgreen",
main_title = "CNV Breaks"
)

# Make SV plot
sv_plot <- map_density_plot(granges_list$sv_density,
y_val = y_val,
y_lab = y_lab,
color = "orange",
main_title = "SV Breaks"
)

# Make a title
title <- cowplot::ggdraw() +
cowplot::draw_label(paste(plot_name, " - Chromosomal Break Density"),
fontface = "bold", x = .4, hjust = 0, size = 12
)

# Put all plots and title together
full_plot <- cowplot::plot_grid(title,
common_plot,
cnv_plot,
sv_plot,
nrow = 4,
axis = "b",
rel_heights = c(2, 5, 5, 5)
)

# Save plot to PNG
cowplot::save_plot(
plot = full_plot,
filename = file.path(plot_dir, paste0(plot_name, "_breaks.png")),
base_height = 7,
base_width = 20
)

# Print out the plot while we are here
full_plot
}