Oncoprint output count tables (AlexsLemonade#1191)

LauraEgolf · Oct 23, 2021 · 78220da · 78220da
1 parent b00d344
commit 78220da
Show file tree

Hide file tree

Showing 16 changed files with 681 additions and 11 deletions.
diff --git a/analyses/oncoprint-landscape/01-map-to-sample_id.R b/analyses/oncoprint-landscape/01-map-to-sample_id.R
@@ -387,7 +387,7 @@ gather_n_per_cancer_group <- function(metadata, broad_histology){
   }
 }
 
-output_file <- file.path('data', paste0(opt$filename_lead, "_n_per_cancer_group.tsv"))
+output_file <- file.path('tables', paste0(opt$filename_lead, "_n_per_cancer_group.tsv"))
 lapply(as.list(c("Low-grade astrocytic tumor","Embryonal tumor",
          "Diffuse astrocytic and oligodendroglial tumor",
          "Other CNS")), function(x) gather_n_per_cancer_group(histologies_df,x)) %>%

diff --git a/analyses/oncoprint-landscape/02-plot-oncoprint.R b/analyses/oncoprint-landscape/02-plot-oncoprint.R
@@ -43,6 +43,14 @@ if (!dir.exists(plots_dir)) {
   dir.create(plots_dir)
 }
 
+# Path to output directory for tables produced
+tables_dir <-
+  file.path(root_dir, "analyses", "oncoprint-landscape", "tables")
+
+if (!dir.exists(tables_dir)) {
+  dir.create(tables_dir)
+}
+
 # Source the custom functions script
 source(
   file.path(
@@ -112,6 +120,12 @@ option_list <- list(
     action = "store_true",
     default = FALSE,
     help = "logical statement on whether to include intronic variants in oncoprint plot"
+  ),
+  optparse::make_option(
+    c("--output_table"),
+    type = "character",
+    default = NULL,
+    help = "file name for the output tables"
   )
 )
 
@@ -269,22 +283,26 @@ if (!is.null(opt$goi_list)){
   # Get top mutated genes per this subset object
   gene_sum <- mafSummary(filtered_maf_object)$gene.summary
 
+  # write out results 
+  if (!is.null(opt$output_table)) {
+    readr::write_tsv(gene_sum, file.path("tables", opt$output_table))
+  }
+
   # Sort to get top altered genes rather than mutated only genes
   goi_list <- gene_sum %>%
     dplyr::arrange(dplyr::desc(AlteredSamples)) %>%
     # Filter to genes where multiple samples have an alteration
     dplyr::filter(AlteredSamples > 1) %>%
     dplyr::pull(Hugo_Symbol)
-  
+
   if (opt$top_n < length(goi_list)) {
     # Now let's filter to the `top_n` genes
     goi_list <- goi_list[1:opt$top_n]
-    
+
   }
-
 }
 
-#### Plot and Save Oncoprint --------------------------------------------------
+### Plot and Save Oncoprint --------------------------------------------------
 
 # Given a maf object, plot an oncoprint of the variants in the
 # dataset and save as a png file.

diff --git a/analyses/oncoprint-landscape/03-oncoprint-n-count-table.R b/analyses/oncoprint-landscape/03-oncoprint-n-count-table.R
@@ -0,0 +1,236 @@
+# This script outputs the oncoprint N counts table 
+
+#### Set Up --------------------------------------------------------------------
+
+# Load libraries
+library(dplyr)
+library(maftools)
+
+# Get `magrittr` pipe
+`%>%` <- dplyr::`%>%`
+
+#### Directories and Files -----------------------------------------------------
+
+# Detect the ".git" folder -- this will in the project root directory.
+# Use this as the root directory to ensure proper execution, no matter where
+# it is called from.
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+# Path to output directory for table produced
+results_dir <-
+  file.path(root_dir, "analyses", "oncoprint-landscape", "tables")
+
+if (!dir.exists(results_dir)) {
+  dir.create(results_dir)
+}
+
+#### Command line options ------------------------------------------------------
+
+# Declare command line options
+option_list <- list(
+  optparse::make_option(
+    c("-m", "--maf_file_po"),
+    type = "character",
+    default = NULL,
+    help = "file path to MAF file that contains SNV information of primary_only samples",
+  ),
+  optparse::make_option(
+    c("-c", "--cnv_file_po"),
+    type = "character",
+    default = NULL,
+    help = "file path to file that contains CNV information of primary_only samples"
+  ),
+  optparse::make_option(
+    c("-f", "--fusion_file_po"),
+    type = "character",
+    default = NULL,
+    help = "file path to file that contains fusion information of primary_only samples"
+  ),
+  optparse::make_option(
+    c("-g", "--maf_file_pp"),
+    type = "character",
+    default = NULL,
+    help = "file path to MAF file that contains SNV information of primary-plus samples",
+  ),
+  optparse::make_option(
+    c("-d", "--cnv_file_pp"),
+    type = "character",
+    default = NULL,
+    help = "file path to file that contains CNV information of primary-plus samples"
+  ),
+  optparse::make_option(
+    c("-e", "--fusion_file_pp"),
+    type = "character",
+    default = NULL,
+    help = "file path to file that contains fusion information of primary-plus samples"
+  ),
+  optparse::make_option(
+    c("-s", "--metadata_file"),
+    type = "character",
+    default = NULL,
+    help = "file path to the histologies file"
+  ),
+  optparse::make_option(
+    c("-l", "--goi_list"),
+    type = "character",
+    default = NULL,
+    help = "comma-separated list of genes of interest files that contain the
+            genes to include on oncoprint"
+  ),
+  optparse::make_option(
+    c("-b", "--broad_histology_list"),
+    type = "character",
+    default = NULL,
+    help = "comma-separated list of `broad_histology` value to output numbers entering oncoprint"
+  ),
+  optparse::make_option(
+    c("-o", "--short_name_list"),
+    type = "character",
+    default = NULL,
+    help = "comma-separated list of file prefix for output tables"
+  ),
+  optparse::make_option(
+    c("--include_introns"),
+    action = "store_true",
+    default = FALSE,
+    help = "logical statement on whether to include intronic variants in oncoprint plot"
+  )
+)
+
+# Read the arguments passed
+opt_parser <- optparse::OptionParser(option_list = option_list)
+opt <- optparse::parse_args(opt_parser)
+
+# Define cnv_file, fusion_file, and genes object here as they still need to
+# be defined for the `prepare_and_plot_oncoprint` custom function (the
+# cnv_file specifically for the `read.maf` function within the custom function),
+# even if they are NULL
+broad_histology_list<-unlist(strsplit(opt$broad_histology_list,","))
+short_name_list<-unlist(strsplit(opt$short_name_list,","))
+
+# Source the custom functions script
+source(
+  file.path(
+    root_dir,
+    "analyses",
+    "oncoprint-landscape",
+    "util",
+    "oncoplot-functions.R"
+  )
+)
+
+#### Read in data --------------------------------------------------------------
+
+# Read in metadata
+metadata <- readr::read_tsv(opt$metadata_file, guess_max = 10000) %>%
+  dplyr::rename(Tumor_Sample_Barcode = sample_id)
+
+# Read in MAF file
+maf_df_po <- data.table::fread(opt$maf_file_po,
+                               stringsAsFactors = FALSE,
+                               data.table = FALSE)
+maf_df_pp <- data.table::fread(opt$maf_file_pp,
+                               stringsAsFactors = FALSE,
+                               data.table = FALSE)
+
+if (!opt$include_introns) {
+  maf_df_po <- maf_df_po %>%
+    dplyr::filter(Variant_Classification != "Intron")
+
+  maf_df_pp <- maf_df_pp %>%
+    dplyr::filter(Variant_Classification != "Intron")
+}
+
+# Read in cnv file
+cnv_df_po <- readr::read_tsv(opt$cnv_file_po) 
+cnv_df_pp <- readr::read_tsv(opt$cnv_file_pp) 
+
+
+# Read in fusion file and join
+fusion_df_po <- readr::read_tsv(opt$fusion_file_po)
+fusion_df_pp <- readr::read_tsv(opt$fusion_file_pp)
+
+
+#### Set up oncoprint annotation objects --------------------------------------
+oncoprint_n_table_po <- data.frame(matrix(ncol = 3, nrow = 0))
+oncoprint_n_table_pp <- data.frame(matrix(ncol = 3, nrow = 0))
+
+colnames(oncoprint_n_table_po) <- c("broad_histology", "n_sample", "tumor_type")
+colnames(oncoprint_n_table_pp) <- c("broad_histology", "n_sample", "tumor_type")
+
+for(i in 1:length(broad_histology_list)){
+  histology_of_interest <- broad_histology_list[i]
+
+  if (!histology_of_interest == "Other CNS") {
+    metadata_each <- metadata %>%
+      dplyr::filter(broad_histology == histology_of_interest)
+  } else {
+    metadata_each <- metadata %>%
+      dplyr::filter(
+        broad_histology %in% c(
+          "Ependymal tumor",
+          "Tumors of sellar region",
+          "Neuronal and mixed neuronal-glial tumor",
+          "Tumor of cranial and paraspinal nerves",
+          "Meningioma",
+          "Mesenchymal non-meningothelial tumor",
+          "Germ cell tumor",
+          "Choroid plexus tumor",
+          "Histiocytic tumor",
+          "Tumor of pineal region",
+          "Metastatic tumors",
+          "Other astrocytic tumor",
+          "Lymphoma",
+          "Melanocytic tumor",
+          "Other tumor"
+        )
+      )
+    }
+  # Now filter the remaining data files
+  maf_po_each <- maf_df_po %>%
+    dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)
+
+  cnv_po_each <- cnv_df_po %>%
+    dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)
+
+  fusion_po_each <- fusion_df_po %>%
+    dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)
+
+  # get bs ids in each dataframe
+  maf_po_bsid <- maf_po_each %>% pull(Tumor_Sample_Barcode) %>% unique()
+  cnv_po_bsid <- cnv_po_each %>% pull(Tumor_Sample_Barcode) %>% unique()
+  fusion_po_bsid <- fusion_po_each %>% pull(Tumor_Sample_Barcode) %>% unique()
+  # take union
+  all_po_bsid <- union(maf_po_bsid, cnv_po_bsid) %>% union(fusion_po_bsid) 
+  # write out broad histology and number of samples to a table 
+  oncoprint_n_table_po[i,1] <- histology_of_interest
+  oncoprint_n_table_po[i,2] <- length(all_po_bsid)
+  oncoprint_n_table_po[i,3] <- "primary_only"
+
+  # Now do the same thing for primary plus samples
+  maf_pp_each <- maf_df_pp %>%
+    dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)
+
+  cnv_pp_each <- cnv_df_pp %>%
+    dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)
+
+  fusion_pp_each <- fusion_df_pp %>%
+    dplyr::filter(Tumor_Sample_Barcode %in% metadata_each$Tumor_Sample_Barcode)
+
+  # get bs ids in each dataframe
+  maf_pp_bsid <- maf_pp_each %>% pull(Tumor_Sample_Barcode) %>% unique()
+  cnv_pp_bsid <- cnv_pp_each %>% pull(Tumor_Sample_Barcode) %>% unique()
+  fusion_pp_bsid <- fusion_pp_each %>% pull(Tumor_Sample_Barcode) %>% unique()
+  # take union
+  all_pp_bsid <- union(maf_pp_bsid, cnv_pp_bsid) %>% union(fusion_pp_bsid) 
+  # write out broad histology and number of samples to a table 
+  oncoprint_n_table_pp[i,1] <- histology_of_interest
+  oncoprint_n_table_pp[i,2] <- length(all_pp_bsid)
+  oncoprint_n_table_pp[i,3] <- "primary-plus"
+
+}
+
+# write out n number entering oncoprint
+bind_rows(oncoprint_n_table_pp, oncoprint_n_table_po) %>% 
+  readr::write_tsv(file.path(results_dir, "sample_n_in_oncoprint.tsv"))
+
diff --git a/analyses/oncoprint-landscape/README.md b/analyses/oncoprint-landscape/README.md
@@ -22,14 +22,16 @@ bash run-oncoprint.sh
   * Filtering via an [independent specimen file](https://alexslemonade.github.io/OpenPBTA-manuscript/#selection-of-independent-samples) is optional, but highly recommended.
 * `02-plot-oncoprint.R` takes the files from above and optionally a file or set of files (to be concatenated) that will restrict the set of genes that are being plotted in an OncoPrint.
 	* Running this via `run-oncoprint.sh` will restrict plotting to a list of top mutated genes (generated in `analyses/interaction-plots/scripts/01-disease-specimen-lists.R`) and top genes with recurrent CNVs (generated in `analyses/focal-cn-file-preparation/06-find-recurrent-calls.Rmd`)
-
+  * This script also returns tables summary of the total of number of samples that have alterations in gene of interest
+* `03-oncoprint-n-count-table.R` counts the number of samples that enter oncoprint plotting (with or without alterations in genes of interest) for each broad histology group
 
 ### Folder Structure
 
 ```
 ├── 00-prepare-goi-lists.R
 ├── 01-map-to-sample_id.R
 ├── 02-plot-oncoprint.R
+├── 03-oncoprint-n-count-table.R
 ├── README.md
 ├── data
 │   ├── embryonal-tumor_goi_list.tsv
@@ -45,6 +47,10 @@ bash run-oncoprint.sh
 │   ├── primary-plus_*histology*_oncoprint.png
 │   ├── primary_only_*histology*_goi_oncoprint.png
 │   └── primary_only_*histology*_oncoprint.png
+├── tables
+│   ├── summary_n_in_oncoprint.tsv
+│   ├── primary-plus_*histology*_summary_n.tsv
+│   └── primary_only_*histology*_summary_n.tsv
 ├── run-oncoprint.sh
 └── util
     └── oncoplot-functions.R

diff --git a/analyses/oncoprint-landscape/run-oncoprint.sh b/analyses/oncoprint-landscape/run-oncoprint.sh
@@ -74,7 +74,7 @@ declare -A labels=(
 )
 
 declare -A goi_files=(
-  [lgat]="lgat_goi_list.tsv"  
+  [lgat]="lgat_goi_list.tsv"
   [embryonal]="embryonal-tumor_goi_list.tsv"
   [hgat]="hgat_goi_list.tsv"
   [other]="other_goi_list.tsv"
@@ -94,8 +94,8 @@ for histology in "${histologies[@]}"; do
       --fusion_file "${intermediate_directory}/${filename}_fusions.tsv" \
       --metadata_file "${histologies_file}" \
       --png_name "${filename}_${histology}_oncoprint.png" \
-      --broad_histology "${labels[$histology]}"
-      
+      --broad_histology "${labels[$histology]}" 
+
     # Genes of interest only version of oncoprint
     Rscript --vanilla 02-plot-oncoprint.R \
       --maf_file "${intermediate_directory}/${filename}_maf.tsv" \
@@ -105,7 +105,19 @@ for histology in "${histologies[@]}"; do
       --goi_list "${oncoprint_data_directory}/${goi_files[$histology]}" \
       --top_n 20 \
       --png_name "${filename}_${histology}_goi_oncoprint.png" \
-      --broad_histology "${labels[$histology]}"
+      --broad_histology "${labels[$histology]}" \
+      --output_table "${filename}_${histology}_oncoprint_summary_n.tsv"
 
-  done  
+  done
 done
+
+Rscript --vanilla 03-oncoprint-n-count-table.R \
+  --maf_file_po "${intermediate_directory}/primary_only_maf.tsv" \
+  --cnv_file_po "${intermediate_directory}/primary_only_cnv.tsv" \
+  --fusion_file_po "${intermediate_directory}/primary_only_fusions.tsv" \
+  --maf_file_pp "${intermediate_directory}/primary-plus_maf.tsv" \
+  --cnv_file_pp "${intermediate_directory}/primary-plus_cnv.tsv" \
+  --fusion_file_pp "${intermediate_directory}/primary-plus_fusions.tsv" \
+  --metadata_file "${histologies_file}" \
+  --broad_histology_list "Low-grade astrocytic tumor,Embryonal tumor,Diffuse astrocytic and oligodendroglial tumor,Other CNS" \
+  --short_name_list "lgat,embryonal,hgat,other"