AlexsLemonade · jaclyn-taroni · May 13, 2021 · Apr 29, 2021 · Apr 29, 2021 · May 3, 2021
diff --git a/analyses/oncoprint-landscape/00-prepare-goi-lists.R b/analyses/oncoprint-landscape/00-prepare-goi-lists.R
@@ -0,0 +1,57 @@
+# Take in oncoprint-goi-lists-OpenPBTA.csv and create a goi file for each
+# column with associated genes of interest for each specified broad histology
+#
+#
+# Chante Bethell for CCDL 2021
+#
+# #### USAGE
+# This script is intended to be run from the command line as follows:
+#
+# Rscript --vanilla 00-prepare-goi-lists.R
+
+
+#### Set Up --------------------------------------------------------------------
+
+library(dplyr)
+library(stringr)
+
+#### Directories and Files -----------------------------------------------------
+
+# Detect the ".git" folder -- this will be in the project root directory.
+# Use this as the root directory to ensure proper execution, no matter where
+# it is called from.
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+# Path to directory that contains the manually curated genes of interest list
+data_dir <-
+  file.path(root_dir, "analyses", "oncoprint-landscape", "data")
+
+# Each histology has its own column, and there are source columns; read via
+# `data_dir` so the script works no matter where it is called from
+all_goi_df <-
+  readr::read_csv(file.path(data_dir, "oncoprint-goi-lists-OpenPBTA.csv"))
+
+# Drop the source columns
+all_goi_df <- all_goi_df %>%
+  select(-contains("Source"))
+
+# Now each column will be a broad histology (`seq_len()` is safe for 0 columns)
+for (col_iter in seq_len(ncol(all_goi_df))) {
+
+  # The broad histology is the column name, but let's make it all lowercase
+  # and replace spaces with hyphens for use as part of the output file name
+  broad_histology <- str_to_lower(str_replace_all(
+      colnames(all_goi_df)[col_iter],
+      pattern = " ",
+      replacement = "-"
+    ))
+
+  # Create the output file name
+  output_file <- file.path(data_dir, str_c(broad_histology, "_goi_list.tsv"))
+
+  # Write the current column to file, removing any NA values and duplicates
+  all_goi_df[, col_iter] %>%
+    tidyr::drop_na() %>%
+    distinct() %>%
+    readr::write_tsv(output_file)
+}
diff --git a/...oncoprint-landscape/00-map-to-sample_id.R → ...oncoprint-landscape/01-map-to-sample_id.R b/...oncoprint-landscape/00-map-to-sample_id.R → ...oncoprint-landscape/01-map-to-sample_id.R
diff --git a/...s/oncoprint-landscape/01-plot-oncoprint.R → ...s/oncoprint-landscape/02-plot-oncoprint.R b/...s/oncoprint-landscape/01-plot-oncoprint.R → ...s/oncoprint-landscape/02-plot-oncoprint.R
@@ -95,11 +95,23 @@ option_list <- list(
     default = NULL,
     help = "optional name of `broad_histology` value to plot associated oncoprint"
   ),
+  optparse::make_option(
+    c("-n", "--top_n"),
+    type = "integer",
+    default = 25,
+    help = "`n` to display top n genes based on count of mutations, default is 25"
+  ),
   optparse::make_option(
     c("-p", "--png_name"),
     type = "character",
     default = NULL,
     help = "oncoprint output png file name"
+  ),
+  optparse::make_option(
+    c("--include_introns"),
+    action = "store_true",
+    default = FALSE,
+    help = "logical statement on whether to include intronic variants in oncoprint plot"
   )
 )
 
@@ -115,22 +127,6 @@ cnv_df <- opt$cnv_file
 fusion_df <- opt$fusion_file
 goi_list <- opt$goi_list
 
-#### Functions ----------------------------------------------------------------
-
-read_genes <- function(gene_list) {
-  # This function takes in the file path to a gene list and pulls out
-  # the gene information from that list
-  #
-  # Args:
-  #   gene_list: file path to genes of interest file
-  #
-  # Return:
-  #   genes: a vector of genes from the genes of interest file
-
-  genes <- readr::read_tsv(gene_list) %>%
-    dplyr::pull("gene")
-}
-
 #### Read in data --------------------------------------------------------------
 
 # Read in metadata
@@ -142,6 +138,11 @@ maf_df <- data.table::fread(opt$maf_file,
                             stringsAsFactors = FALSE,
                             data.table = FALSE)
 
+# Unless intronic variants were requested, drop them from the MAF data
+if (!opt$include_introns) {
+  maf_df <- dplyr::filter(maf_df, Variant_Classification != "Intron")
+}
+
 # Read in cnv file
 if (!is.null(opt$cnv_file)) {
   cnv_df <- readr::read_tsv(opt$cnv_file) %>%
@@ -155,16 +156,6 @@ if (!is.null(opt$fusion_file)) {
   fusion_df <- readr::read_tsv(opt$fusion_file)
 }
 
-# Read in gene information from the list of genes of interest files
-if (!is.null(opt$goi_list)) {
-  goi_files <- unlist(stringr::str_split(goi_list, ",| "))
-  # Read in using the `read_genes` custom function and unlist the gene column
-  # data from the genes of interest file paths given
-  goi_list <- lapply(goi_files, read_genes)
-    # Include only the unique genes of interest
-  goi_list <- unique(unlist(goi_list))
-}
-
 #### Set up oncoprint annotation objects --------------------------------------
 # Read in histology standard color palette for project
 histology_label_mapping <- readr::read_tsv(
@@ -254,6 +245,44 @@ maf_object <- prepare_maf_object(
   fusion_df = fusion_df
 )
 
+#### Subset MAF Object (Optional)----------------------------------------------
+
+# Code here is specifically adapted from:
+# https://github.com/marislab/create-pptc-pdx-oncoprints/blob/master/R/create-complexheat-oncoprint-revision.R
+
+# When a genes of interest (GOI) file is given, rank its genes by the number of
+# altered samples and keep at most the top n for the histology-specific list
+if (!is.null(opt$goi_list)) {
+
+  # Read in genes of interest information using the `read_tsv()` function;
+  # `as.matrix()` converts the table so its values can be passed as `genes`
+  goi_list <- readr::read_tsv(opt$goi_list) %>%
+    as.matrix()
+
+  filtered_maf_object <- subsetMaf(
+    maf = maf_object,
+    tsb = metadata$Tumor_Sample_Barcode,
+    genes = goi_list,
+    mafObj = TRUE
+  )
+
+  # Get top mutated genes per this subset object
+  gene_sum <- mafSummary(filtered_maf_object)$gene.summary
+
+  # Sort to get top altered genes rather than mutated only genes
+  goi_list <- gene_sum %>%
+    dplyr::arrange(dplyr::desc(AlteredSamples)) %>%
+    # Filter to genes where multiple samples have an alteration
+    dplyr::filter(AlteredSamples > 1) %>%
+    dplyr::pull(Hugo_Symbol)
+
+  # Keep at most `top_n` genes. `head()` returns the whole vector when it is
+  # shorter than `top_n` and an empty vector when `top_n` is 0, whereas
+  # `goi_list[1:opt$top_n]` would return the first gene for `top_n = 0`
+  goi_list <- head(goi_list, opt$top_n)
+
+}
+
 #### Plot and Save Oncoprint --------------------------------------------------
 
 # Given a maf object, plot an oncoprint of the variants in the
@@ -265,6 +294,7 @@ png(
   units = "cm",
   res = 300
 )
+
 oncoplot(
   maf_object,
   clinicalFeatures = "display_group",
@@ -279,6 +309,8 @@ oncoplot(
   colors = oncoprint_col_palette,
   annotationColor = annotation_colors,
   bgCol = "#F5F5F5",
-  top = 25
+  top = opt$top_n
 )
+
 dev.off()
+
diff --git a/analyses/oncoprint-landscape/data/embryonal-tumor_goi_list.tsv b/analyses/oncoprint-landscape/data/embryonal-tumor_goi_list.tsv
@@ -0,0 +1,63 @@
+Embryonal tumor
+DICER1
+TP53
+MSH2
+TERT
+TFPT
+PAARP8
+CENPE
+DDX11
+MUTYH
+CHEK2
+CTNNB1
+DYNC2H1
+PTCH1
+MAP4K4
+SUFU
+ROS1
+KSR2
+RASSF5
+FOXO3
+IGFN1
+BCOR
+TTYH1
+MIR17HG
+LIN28A
+APC
+CSNK2B
+SMO
+KMT2D
+SMARCA4
+PRDM6
+KMT2C
+KDM6A
+CREBBP
+ZMYM3
+GSE1
+ARID1A
+MED12
+GFIB
+MYCN
+OTX2
+TCF4
+ZIC1
+GFI1
+TBR1
+BRCA2
+ATM
+PTEN
+PIK3CA
+PRKAR1A
+BAI3
+EPHA7
+KBTBD4
+CTDNEP1
+DDX3X
+SYNCRIP
+IDH1
+CDK6
+SNCAIP
+FOXR2
+BEND2
+MN1
+SMARCB1
diff --git a/analyses/oncoprint-landscape/data/ependymal-tumor_goi_list.tsv b/analyses/oncoprint-landscape/data/ependymal-tumor_goi_list.tsv
@@ -0,0 +1,14 @@
+Ependymal tumor
+H3F3A
+CDKN2A
+RELA
+YAP1
+C11orf95
+MAMLD1
+FAM118B
+MAML2
+NF2
+CLDN1
+PTEN
+ARL4D
+L1CAM
diff --git a/analyses/oncoprint-landscape/data/hgat_goi_list.tsv b/analyses/oncoprint-landscape/data/hgat_goi_list.tsv
@@ -0,0 +1,82 @@
+HGAT
+H3F3A
+TP53
+ATRX
+PPM1D
+PIK3CA
+NF1
+PIK3R1
+PDGFRA
+ACVR1
+PTEN
+EGFR
+ATM
+FGFR1
+CCND2
+HIST1H3B
+KIT
+KDR
+CDKN2A
+MET
+IGF1R
+BRAF
+IDH1
+MYCN
+CDK4
+ID2
+MYC
+TOP3A
+CDK6
+ASXL1
+KRAS
+MDM2
+TERT
+MAFK
+PDGFA
+PIK3C2B
+PLAGL2
+GAB2
+AURKB
+NFIB
+PIK3C2G
+AKT1
+CCND1
+ID3
+APOBEC3H
+AKT2
+GOLPH3
+FGFR2
+BCOR
+CDKN1B
+CDKN1C
+RB1
+SMARCE1
+CDKN2C
+CIC
+DIS3L2P1
+CDKN2B
+SETD2
+KDM6B
+NTRK1
+NTRK2
+NTRK3
+HIST1H3C
+HIST2H3C
+DDX11
+TSC2
+DDR2
+TOP2A
+FOSB
+VEGFA
+NRAS
+MTOR
+PTPN11
+FANCA
+SLCO1B3
+PDGFC
+YES1
+FYN
+POLE
+HIST1H2BE
+KMT2C
+TDRD9
diff --git a/analyses/oncoprint-landscape/data/lgat_goi_list.tsv b/analyses/oncoprint-landscape/data/lgat_goi_list.tsv
@@ -0,0 +1,28 @@
+LGAT
+KIAA1549
+BRAF
+FGFR1
+TACC1
+MYB
+MYBL
+NTRK1
+NTRK2
+NTRK3
+IDH
+H3F3A
+RAF1
+PTPN11
+TSC1
+TSC2
+PIK3CA
+PRKCA
+FGFR2
+FGFR3
+MAP2K1
+ALK
+ROS1
+QKI
+KRAS
+TP53
+ATRX
+CDKN2A