Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Updates to chromosomal-instability #492

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 71 additions & 6 deletions analyses/chromosomal-instability/00-setup-breakpoint-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
# the effectively surveyed regions of the genome for the WGS samples.",
# --surveyed_wxs: File path that specifies the BED regions file that indicates the
# effectively surveyed regions of the genome for the WXS samples.
# --gap: An integer that indicates how many base pairs away a CNV and SV and
# still be considered the same. Will be passed to maxgap argument in
# GenomicRanges::findOverlaps. Default is 0.
# --drop_sex: If TRUE, will drop the sex chromosomes. Default is FALSE
#
# Command line example:
#
Expand All @@ -37,6 +41,13 @@
# Establish base dir
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# We need the `make_granges` function from here
source(file.path(root_dir,
"analyses",
"chromosomal-instability",
"util",
"chr-break-calculate.R"))

# Magrittr pipe
`%>%` <- dplyr::`%>%`

Expand Down Expand Up @@ -92,6 +103,18 @@ option_list <- list(
'OpenPBTA-analysis') that specifies the BED regions file that indicates the
effectively surveyed regions of the genome for the WXS samples.",
metavar = "character"
),
make_option(
opt_str = "--gap", default = 0,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this catch adjacent breaks, like a break at position 10 and another at 11? Because those should really be the same, but I can't tell if they are considered gap of 0 or 1.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From GenomicRanges docs: The gap between 2 ranges is the number of positions that separate them. The gap between 2 adjacent ranges is 0. I interpret this to mean that maxgap = 0, means they are right next to each other. Is this what we want?

help = "An integer that indicates how many base pairs away a CNV and SV
breakpoint can be and still be considered the same. Will be passed to maxgap
argument in GenomicRanges::findOverlaps.",
metavar = "number"
),
make_option(
opt_str = "--drop_sex", action = "store_true",
default = FALSE, help = "If TRUE, will drop the sex chromosomes. Default is FALSE",
metavar = "character"
)
)

Expand Down Expand Up @@ -173,7 +196,14 @@ sv_df <- data.table::fread(opt$sv, data.table = FALSE) %>%
`23` = "X", `24` = "Y"
)
)

####################### Drop Sex Chr if option is on ###########################
if (opt$drop_sex){
sv_df <- sv_df %>%
dplyr::filter(!(chrom %in% c("X", "Y", "M")))

cnv_df <- cnv_df %>%
dplyr::filter(!(chrom %in% c("X", "Y", "M")))
}
#################### Format the data as chromosomes breaks #####################
# Only keep samples for which there are both SV and CNV data
common_samples <- dplyr::intersect(
Expand Down Expand Up @@ -215,14 +245,48 @@ sv_breaks <- data.frame(
samples %in% common_samples
)

############################## Create union of breaks ##########################
# Make an union of breaks data.frame.
union_of_breaks <- dplyr::bind_rows(sv_breaks, cnv_breaks) %>%
dplyr::distinct(samples, chrom, coord, .keep_all = TRUE)
# Remake this in case some samples' data got filtered out in the process
common_samples <- dplyr::intersect(
unique(cnv_breaks$samples),
unique(sv_breaks$samples)
)

######################### Create intersection of breaks ########################
# Make an intersection of breaks data.frame.
intersection_of_breaks <- lapply(common_samples, function(sample_id) {

# Make into GenomicRanges objects
sv_ranges <- make_granges(sv_breaks,
sample_id = sample_id,
start_col = "coord",
end_col = "coord")
cnv_ranges <- make_granges(cnv_breaks,
sample_id = sample_id,
start_col = "coord",
end_col = "coord")
# Find overlaps
intersection_df <- IRanges::mergeByOverlaps(
sv_ranges,
cnv_ranges,
maxgap = opt$gap) %>%
# Coerce to data.frame
as.data.frame() %>%
# Remove these, they are redundant
dplyr::select(-dplyr::contains("mcols"))

return(intersection_df)
})

# Bring along sample names
names(intersection_of_breaks) <- common_samples

# Collaps into a data.frame
intersection_of_breaks <- dplyr::bind_rows(intersection_of_breaks,
.id = "samples")

# Put all the breaks into a list.
breaks_list <- list(
union_of_breaks = union_of_breaks,
intersection_of_breaks = intersection_of_breaks,
cnv_breaks = cnv_breaks,
sv_breaks = sv_breaks
)
Expand Down Expand Up @@ -281,3 +345,4 @@ purrr::imap(breaks_density_list, function(.x, name = .y) {
file.path(opt$output, paste0(name, "_densities.tsv"))
)
})

Loading