Skip to content

Commit

Permalink
Oncoprint output count tables (AlexsLemonade#1191)
Browse files Browse the repository at this point in the history
  • Loading branch information
runjin326 authored Oct 23, 2021
1 parent b00d344 commit 78220da
Show file tree
Hide file tree
Showing 16 changed files with 681 additions and 11 deletions.
2 changes: 1 addition & 1 deletion analyses/oncoprint-landscape/01-map-to-sample_id.R
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ gather_n_per_cancer_group <- function(metadata, broad_histology){
}
}

output_file <- file.path('data', paste0(opt$filename_lead, "_n_per_cancer_group.tsv"))
output_file <- file.path('tables', paste0(opt$filename_lead, "_n_per_cancer_group.tsv"))
lapply(as.list(c("Low-grade astrocytic tumor","Embryonal tumor",
"Diffuse astrocytic and oligodendroglial tumor",
"Other CNS")), function(x) gather_n_per_cancer_group(histologies_df,x)) %>%
Expand Down
26 changes: 22 additions & 4 deletions analyses/oncoprint-landscape/02-plot-oncoprint.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ if (!dir.exists(plots_dir)) {
dir.create(plots_dir)
}

# Path to output directory for tables produced
tables_dir <-
file.path(root_dir, "analyses", "oncoprint-landscape", "tables")

if (!dir.exists(tables_dir)) {
dir.create(tables_dir)
}

# Source the custom functions script
source(
file.path(
Expand Down Expand Up @@ -112,6 +120,12 @@ option_list <- list(
action = "store_true",
default = FALSE,
help = "logical statement on whether to include intronic variants in oncoprint plot"
),
optparse::make_option(
c("--output_table"),
type = "character",
default = NULL,
help = "file name for the output tables"
)
)

Expand Down Expand Up @@ -269,22 +283,26 @@ if (!is.null(opt$goi_list)){
# Get top mutated genes per this subset object
gene_sum <- mafSummary(filtered_maf_object)$gene.summary

# write out results
if (!is.null(opt$output_table)) {
readr::write_tsv(gene_sum, file.path("tables", opt$output_table))
}

# Sort to get top altered genes rather than mutated only genes
goi_list <- gene_sum %>%
dplyr::arrange(dplyr::desc(AlteredSamples)) %>%
# Filter to genes where multiple samples have an alteration
dplyr::filter(AlteredSamples > 1) %>%
dplyr::pull(Hugo_Symbol)

if (opt$top_n < length(goi_list)) {
# Now let's filter to the `top_n` genes
goi_list <- goi_list[1:opt$top_n]

}

}

#### Plot and Save Oncoprint --------------------------------------------------
### Plot and Save Oncoprint --------------------------------------------------

# Given a maf object, plot an oncoprint of the variants in the
# dataset and save as a png file.
Expand Down
236 changes: 236 additions & 0 deletions analyses/oncoprint-landscape/03-oncoprint-n-count-table.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# This script outputs the oncoprint N counts table

#### Set Up --------------------------------------------------------------------

# Load libraries
library(dplyr)
library(maftools)

# Get `magrittr` pipe
`%>%` <- dplyr::`%>%`

#### Directories and Files -----------------------------------------------------

# Detect the ".git" folder -- this will in the project root directory.
# Use this as the root directory to ensure proper execution, no matter where
# it is called from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Path to output directory for table produced
results_dir <-
file.path(root_dir, "analyses", "oncoprint-landscape", "tables")

if (!dir.exists(results_dir)) {
dir.create(results_dir)
}

#### Command line options ------------------------------------------------------

# Declare command line options
option_list <- list(
optparse::make_option(
c("-m", "--maf_file_po"),
type = "character",
default = NULL,
help = "file path to MAF file that contains SNV information of primary_only samples",
),
optparse::make_option(
c("-c", "--cnv_file_po"),
type = "character",
default = NULL,
help = "file path to file that contains CNV information of primary_only samples"
),
optparse::make_option(
c("-f", "--fusion_file_po"),
type = "character",
default = NULL,
help = "file path to file that contains fusion information of primary_only samples"
),
optparse::make_option(
c("-g", "--maf_file_pp"),
type = "character",
default = NULL,
help = "file path to MAF file that contains SNV information of primary-plus samples",
),
optparse::make_option(
c("-d", "--cnv_file_pp"),
type = "character",
default = NULL,
help = "file path to file that contains CNV information of primary-plus samples"
),
optparse::make_option(
c("-e", "--fusion_file_pp"),
type = "character",
default = NULL,
help = "file path to file that contains fusion information of primary-plus samples"
),
optparse::make_option(
c("-s", "--metadata_file"),
type = "character",
default = NULL,
help = "file path to the histologies file"
),
optparse::make_option(
c("-l", "--goi_list"),
type = "character",
default = NULL,
help = "comma-separated list of genes of interest files that contain the
genes to include on oncoprint"
),
optparse::make_option(
c("-b", "--broad_histology_list"),
type = "character",
default = NULL,
help = "comma-separated list of `broad_histology` value to output numbers entering oncoprint"
),
optparse::make_option(
c("-o", "--short_name_list"),
type = "character",
default = NULL,
help = "comma-separated list of file prefix for output tables"
),
optparse::make_option(
c("--include_introns"),
action = "store_true",
default = FALSE,
help = "logical statement on whether to include intronic variants in oncoprint plot"
)
)

# Read the arguments passed
opt_parser <- optparse::OptionParser(option_list = option_list)
opt <- optparse::parse_args(opt_parser)

# Define cnv_file, fusion_file, and genes object here as they still need to
# be defined for the `prepare_and_plot_oncoprint` custom function (the
# cnv_file specifically for the `read.maf` function within the custom function),
# even if they are NULL
broad_histology_list<-unlist(strsplit(opt$broad_histology_list,","))
short_name_list<-unlist(strsplit(opt$short_name_list,","))

# Source the custom functions script
source(
file.path(
root_dir,
"analyses",
"oncoprint-landscape",
"util",
"oncoplot-functions.R"
)
)

#### Read in data --------------------------------------------------------------

# Read in metadata
metadata <- readr::read_tsv(opt$metadata_file, guess_max = 10000) %>%
dplyr::rename(Tumor_Sample_Barcode = sample_id)

# Read in MAF file
maf_df_po <- data.table::fread(opt$maf_file_po,
stringsAsFactors = FALSE,
data.table = FALSE)
maf_df_pp <- data.table::fread(opt$maf_file_pp,
stringsAsFactors = FALSE,
data.table = FALSE)

if (!opt$include_introns) {
maf_df_po <- maf_df_po %>%
dplyr::filter(Variant_Classification != "Intron")

maf_df_pp <- maf_df_pp %>%
dplyr::filter(Variant_Classification != "Intron")
}

# Read in cnv file
cnv_df_po <- readr::read_tsv(opt$cnv_file_po)
cnv_df_pp <- readr::read_tsv(opt$cnv_file_pp)


# Read in fusion file and join
fusion_df_po <- readr::read_tsv(opt$fusion_file_po)
fusion_df_pp <- readr::read_tsv(opt$fusion_file_pp)


#### Set up oncoprint annotation objects --------------------------------------
oncoprint_n_table_po <- data.frame(matrix(ncol = 3, nrow = 0))
oncoprint_n_table_pp <- data.frame(matrix(ncol = 3, nrow = 0))

colnames(oncoprint_n_table_po) <- c("broad_histology", "n_sample", "tumor_type")
colnames(oncoprint_n_table_pp) <- c("broad_histology", "n_sample", "tumor_type")

for(i in 1:length(broad_histology_list)){
histology_of_interest <- broad_histology_list[i]

if (!histology_of_interest == "Other CNS") {
metadata_each <- metadata %>%
dplyr::filter(broad_histology == histology_of_interest)
} else {
metadata_each <- metadata %>%
dplyr::filter(
broad_histology %in% c(
"Ependymal tumor",
"Tumors of sellar region",
"Neuronal and mixed neuronal-glial tumor",
"Tumor of cranial and paraspinal nerves",
"Meningioma",
"Mesenchymal non-meningothelial tumor",
"Germ cell tumor",
"Choroid plexus tumor",
"Histiocytic tumor",
"Tumor of pineal region",
"Metastatic tumors",
"Other astrocytic tumor",
"Lymphoma",
"Melanocytic tumor",
"Other tumor"
)
)
}
# Now filter the remaining data files
maf_po_each <- maf_df_po %>%
dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)

cnv_po_each <- cnv_df_po %>%
dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)

fusion_po_each <- fusion_df_po %>%
dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)

# get bs ids in each dataframe
maf_po_bsid <- maf_po_each %>% pull(Tumor_Sample_Barcode) %>% unique()
cnv_po_bsid <- cnv_po_each %>% pull(Tumor_Sample_Barcode) %>% unique()
fusion_po_bsid <- fusion_po_each %>% pull(Tumor_Sample_Barcode) %>% unique()
# take union
all_po_bsid <- union(maf_po_bsid, cnv_po_bsid) %>% union(fusion_po_bsid)
# write out broad histology and number of samples to a table
oncoprint_n_table_po[i,1] <- histology_of_interest
oncoprint_n_table_po[i,2] <- length(all_po_bsid)
oncoprint_n_table_po[i,3] <- "primary_only"

# Now do the same thing for primary plus samples
maf_pp_each <- maf_df_pp %>%
dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)

cnv_pp_each <- cnv_df_pp %>%
dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)

fusion_pp_each <- fusion_df_pp %>%
dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)

# get bs ids in each dataframe
maf_pp_bsid <- maf_pp_each %>% pull(Tumor_Sample_Barcode) %>% unique()
cnv_pp_bsid <- cnv_pp_each %>% pull(Tumor_Sample_Barcode) %>% unique()
fusion_pp_bsid <- fusion_pp_each %>% pull(Tumor_Sample_Barcode) %>% unique()
# take union
all_pp_bsid <- union(maf_pp_bsid, cnv_pp_bsid) %>% union(fusion_pp_bsid)
# write out broad histology and number of samples to a table
oncoprint_n_table_pp[i,1] <- histology_of_interest
oncoprint_n_table_pp[i,2] <- length(all_pp_bsid)
oncoprint_n_table_pp[i,3] <- "primary-plus"

}

# write out n number entering oncoprint
bind_rows(oncoprint_n_table_pp, oncoprint_n_table_po) %>%
readr::write_tsv(file.path(results_dir, "sample_n_in_oncoprint.tsv"))

8 changes: 7 additions & 1 deletion analyses/oncoprint-landscape/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,16 @@ bash run-oncoprint.sh
* Filtering via an [independent specimen file](https://alexslemonade.github.io/OpenPBTA-manuscript/#selection-of-independent-samples) is optional, but highly recommended.
* `02-plot-oncoprint.R` takes the files from above and optionally a file or set of files (to be concatenated) that will restrict the set of genes that are being plotted in an OncoPrint.
* Running this via `run-oncoprint.sh` will restrict plotting to a list of top mutated genes (generated in `analyses/interaction-plots/scripts/01-disease-specimen-lists.R`) and top genes with recurrent CNVs (generated in `analyses/focal-cn-file-preparation/06-find-recurrent-calls.Rmd`)

* This script also returns tables summary of the total of number of samples that have alterations in gene of interest
* `03-oncoprint-n-count-table.R` counts the number of samples that enter oncoprint plotting (with or without alterations in genes of interest) for each broad histology group

### Folder Structure

```
├── 00-prepare-goi-lists.R
├── 01-map-to-sample_id.R
├── 02-plot-oncoprint.R
├── 03-oncoprint-n-count-table.R
├── README.md
├── data
│   ├── embryonal-tumor_goi_list.tsv
Expand All @@ -45,6 +47,10 @@ bash run-oncoprint.sh
│   ├── primary-plus_*histology*_oncoprint.png
│   ├── primary_only_*histology*_goi_oncoprint.png
│   └── primary_only_*histology*_oncoprint.png
├── tables
│   ├── summary_n_in_oncoprint.tsv
│   ├── primary-plus_*histology*_summary_n.tsv
│   └── primary_only_*histology*_summary_n.tsv
├── run-oncoprint.sh
└── util
└── oncoplot-functions.R
Expand Down
22 changes: 17 additions & 5 deletions analyses/oncoprint-landscape/run-oncoprint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ declare -A labels=(
)

declare -A goi_files=(
[lgat]="lgat_goi_list.tsv"
[lgat]="lgat_goi_list.tsv"
[embryonal]="embryonal-tumor_goi_list.tsv"
[hgat]="hgat_goi_list.tsv"
[other]="other_goi_list.tsv"
Expand All @@ -94,8 +94,8 @@ for histology in "${histologies[@]}"; do
--fusion_file "${intermediate_directory}/${filename}_fusions.tsv" \
--metadata_file "${histologies_file}" \
--png_name "${filename}_${histology}_oncoprint.png" \
--broad_histology "${labels[$histology]}"
--broad_histology "${labels[$histology]}"

# Genes of interest only version of oncoprint
Rscript --vanilla 02-plot-oncoprint.R \
--maf_file "${intermediate_directory}/${filename}_maf.tsv" \
Expand All @@ -105,7 +105,19 @@ for histology in "${histologies[@]}"; do
--goi_list "${oncoprint_data_directory}/${goi_files[$histology]}" \
--top_n 20 \
--png_name "${filename}_${histology}_goi_oncoprint.png" \
--broad_histology "${labels[$histology]}"
--broad_histology "${labels[$histology]}" \
--output_table "${filename}_${histology}_oncoprint_summary_n.tsv"

done
done
done

Rscript --vanilla 03-oncoprint-n-count-table.R \
--maf_file_po "${intermediate_directory}/primary_only_maf.tsv" \
--cnv_file_po "${intermediate_directory}/primary_only_cnv.tsv" \
--fusion_file_po "${intermediate_directory}/primary_only_fusions.tsv" \
--maf_file_pp "${intermediate_directory}/primary-plus_maf.tsv" \
--cnv_file_pp "${intermediate_directory}/primary-plus_cnv.tsv" \
--fusion_file_pp "${intermediate_directory}/primary-plus_fusions.tsv" \
--metadata_file "${histologies_file}" \
--broad_histology_list "Low-grade astrocytic tumor,Embryonal tumor,Diffuse astrocytic and oligodendroglial tumor,Other CNS" \
--short_name_list "lgat,embryonal,hgat,other"
Loading

0 comments on commit 78220da

Please sign in to comment.