AlexsLemonade · kgaonkar6 · Feb 3, 2021 · Feb 3, 2021 · Feb 4, 2021 · Feb 5, 2021
diff --git a/analyses/hotspots-detection/01-reccurence-hotspot-overlap.Rmd b/analyses/hotspots-detection/01-reccurence-hotspot-overlap.Rmd
@@ -0,0 +1,383 @@
+---
+title: "Recurrent gene hotspots"
+output: html_notebook
+params:
+  db_file:
+    label: "all callers db"
+    value: scratch/snv_db.sqlite
+    input: file
+  cancer_hotspot_file:
+    label: "MSKCC cancer hotspot file"
+    value: input/hotspots_v2.xls
+    input: file  
+  gene_list :
+    label: gene table with gene type annotation 
+    value: ../fusion_filtering/references/genelistreference.txt 
+    input: file  
+---
+
+In this notebook we will use look at recurrence and hotspots in OpenPBTA snv
+
+Recurrence:
+
+- Filter Variant_Classification ='Missense_Mutation|Splice_Region|In_Frame_Del|Frame_Shift_Del|Splice_Site|Splice_Region|Nonsense_Mutation|Nonstop_Mutation|In_Frame_Ins|Frameshift_Ins',
+- Filter IMPACT ='HIGH|MODERATE'
+- Filter given gene list from fusion_filtering/reference/genelistreference.txt 
+- Count reoccurence per Hugo_Symbol+Protein_position+DOMAINS+type
+
+Hotspots:
+-Protein_position overlaps MSKCC cancer hotspot database
+
+
+
+## Setup
+```{r}
+library("tidyverse")
+library("maftools")
+
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+data_dir <- file.path(root_dir, "data")
+mskcc_cancer_hotspot_file <- params$cancer_hotspot_file
+internal_hotspot_file <- params$internal_hotspot_file
+
+results_dir <- file.path(root_dir,
+                         "analyses",
+                         "hotspots-detection",
+                         "results")
+
+if (!dir.exists(results_dir)) {
+  dir.create(results_dir)
+}
+
+# Select independent sample set
+# Include primary tumors when available, if not select other tumor_descriptor types
+independentsamples <- read_tsv(file.path(
+  root_dir,
+  "analyses",
+  "independent-samples",
+  "results",
+  "independent-specimens.wgswxs.primary-plus.tsv"))  %>%
+  pull(Kids_First_Biospecimen_ID)
+```
+
+## Cancer hotspot files
+```{r}
+
+# Cancer database
+hotspot_database_2017_snv <- readxl::read_xls(mskcc_cancer_hotspot_file,sheet = 1) 
+hotspot_database_2017_indel <- readxl::read_xls(mskcc_cancer_hotspot_file,sheet = 2)
+
+hotspot_database <- bind_rows( hotspot_database_2017_snv, hotspot_database_2017_indel) %>%
+  dplyr::select("Amino_Acid_Position","Hugo_Symbol") %>%
+  dplyr::mutate(hotspot_database="MSKCC") %>%
+  unique()
+
+```
+
+## Gene table for reccurence checks
+```{r}
+gene_table <- read_tsv("../fusion_filtering/references/genelistreference.txt")%>%
+  dplyr::rename(Hugo_Symbol=Gene_Symbol) %>%
+  dplyr::select(Hugo_Symbol,type)
+
+brain_goi <- read_tsv("input/brain-goi-list-new.txt",col_names = "Hugo_Symbol") %>%
+	dplyr::mutate(type="brain-goi")
+
+gene_table <- bind_rows(gene_table,
+			brain_goi)
+
+```
+
+## Read database 
+```{r}
+
+db_file <- file.path(root_dir, params$db_file)
+
+# Start up connection
+con <- DBI::dbConnect(RSQLite::SQLite(), db_file,path = ":memory:")
+
+```
+
+## Designate caller tables from SQL file
+```{r}
+strelka <- dplyr::tbl(con, "strelka")
+mutect <- dplyr::tbl(con, "mutect")
+vardict <- dplyr::tbl(con, "vardict")
+lancet <- dplyr::tbl(con, "lancet") 	
+
+```
+
+## Subset maf per caller
+```{r}
+source ("utils/prepMaf.R")
+strelka_subset<-prepMaf(strelka, gene_table = gene_table) %>%
+       mutate(caller="strelka")	
+mutect_subset<-prepMaf(mutect, gene_table = gene_table) %>%
+	mutate(caller="mutect")
+vardict_subset<-prepMaf(vardict, gene_table = gene_table) %>%
+	mutate(caller="vardict")
+lancet_subset<-prepMaf(lancet, gene_table = gene_table) %>%
+	mutate(caller="lancet")
+
+# combine
+combined_maf_subsets <-bind_rows(strelka_subset,
+mutect_subset,
+vardict_subset,
+lancet_subset) 
+
+
+# save for future runs
+saveRDS(combined_maf_subsets, file.path("results","combined_maf_subsets.RDS"))
+
+```
+
+
+## Get filtered numbers
+
+Upset plots to visualize caller contributions
+
+```{r}
+source(file.path("utils","getUpset.R"))
+
+```
+
+## Upset graphs 
+
+SNP: Single nucleotide polymorphism -- a substitution in one nucleotide
+DNP: Double nucleotide polymorphism -- a substitution in two consecutive nucleotides
+TNP: Triple nucleotide polymorphism -- a substitution in three consecutive nucleotides
+ONP: Oligo-nucleotide polymorphism -- a substitution in more than three consecutive nucleotides
+
+
+```{r}
+getUpset(combined_maf_subsets,c("SNP","DNP","TNP","ONP"))
+```
+Vardict uniquely support a lot of the filtered sites but interestingly majority of the subtitution mutations are supported by all callers
+
+INS: Insertion -- the addition of nucleotides
+DEL: Deletion -- the removal of nucleotides
+```{r}
+getUpset(combined_maf_subsets,c("INS","DEL"))
+```
+Vardict and Lancet uniquely support a lot of filtred indels
+
+
+VAF plot
+```{r}
+
+ggplot(combined_maf_subsets)+geom_violin(aes(x=caller,y=VAF,color=caller))+theme_bw()
+```
+
+## Summarise calls per site in Tumor_Sample_Barcode for brain GOI only 
+```{r}
+summarise_sites <- combined_maf_subsets %>%
+    left_join(gene_table,by="Hugo_Symbol") %>%
+  mutate(gnomad_AF_common = gnomAD_AF>0.001 & !is.na(gnomAD_AF)) %>%
+  group_by(Chromosome,Start_Position,End_Position,
+           Amino_Acid_Position,Hugo_Symbol,
+           Tumor_Sample_Barcode) %>%
+  summarise(type=toString(unique(type)),
+            gnomad_AF_common = any(gnomad_AF_common),
+            VAF = mean(VAF),
+            dbSNP_RS = toString(unique(dbSNP_RS)),
+            Variant_Classification=toString(unique(Variant_Classification)),
+            Reference_Allele=toString(unique(Reference_Allele)),
+            Tumor_Seq_Allele2=toString(unique(Tumor_Seq_Allele2)),
+            HGVSp_Short=toString(unique(HGVSp_Short)),
+            Variant_Type=toString(unique(Variant_Type))) %>%
+	left_join(hotspot_database,by=c("Amino_Acid_Position","Hugo_Symbol")) %>%
+  mutate(hotspot_database = case_when(
+    grepl("Cosmic",type) & !is.na(hotspot_database) ~ paste0(hotspot_database,",Cosmic"),
+    grepl("Cosmic",type) & is.na(hotspot_database) ~ "Cosmic",
+    TRUE ~ hotspot_database
+  )) %>%
+  dplyr::filter( grepl("brain-goi",type)) %>%
+  dplyr::filter(Tumor_Sample_Barcode %in% independentsamples)
+
+
+```
+
+## Recurrence in genes part of brain-goi from filtered snv calls
+```{r}
+calls_recurrence <- summarise_sites %>%
+    count(Chromosome,
+          Start_Position,
+          End_Position,
+          Amino_Acid_Position,
+          Hugo_Symbol,type,
+          gnomad_AF_common,
+          hotspot_database,
+          Variant_Classification,
+          dbSNP_RS,
+          Reference_Allele,
+          Tumor_Seq_Allele2,
+          HGVSp_Short,
+          Variant_Type,
+          sort = TRUE) %>%
+  dplyr::filter(n>2)
+
+```
+
+## Plot VAF separating by present or absent in MSKCC database
+```{r}
+calls_recurrence %>% 
+  left_join(summarise_sites) %>%
+  ggplot(aes(x=hotspot_database,y=VAF))+ geom_violin()+theme_bw()
+```
+
+## Sites not part of MSKCC cancer hotspots/Cosmic
+```{r}
+# maftools needs a maf file to be read via read.maf
+# so we are using the exact sites that don't overlap
+# MSKCC cancer database nor are Cosmic census gene lists
+uniqhits <- calls_recurrence %>% 
+  left_join(combined_maf_subsets) %>% 
+  dplyr::filter(is.na(hotspot_database) ) 
+
+uniqhitsGenes<- uniqhits %>%
+  pull(Hugo_Symbol) %>%
+  unique()
+
+# save as maf for maftools functions
+uniqhits %>%
+  dplyr::select(-caller) %>%
+  group_by(Chromosome,Start_Position,End_Position,
+           Amino_Acid_Position,Hugo_Symbol,
+           Tumor_Sample_Barcode,
+           Variant_Type,
+           Reference_Allele,
+           Tumor_Seq_Allele2,
+           HGVSp_Short,
+           Variant_Classification) %>%
+  # doing a mean so that caller specific VAF 
+  # calculations doesn't unique rows
+  summarise(VAF = mean(VAF)) %>%
+  write_tsv("results/uniqhits.maf")
+```
+
+
+### Upset plots for the uniq hits 
+
+SNP: Single nucleotide polymorphism -- a substitution in one nucleotide
+DNP: Double nucleotide polymorphism -- a substitution in two consecutive nucleotides
+TNP: Triple nucleotide polymorphism -- a substitution in three consecutive nucleotides
+ONP: Oligo-nucleotide polymorphism -- a substitution in more than three consecutive nucleotides
+
+
+```{r}
+getUpset(uniqhits,c("SNP","DNP","TNP","ONP"))
+```
+Interestingly a lot of the uniq hits are supported by strelka only, followed by vardict.
+
+
+INS: Insertion -- the addition of nucleotides
+DEL: Deletion -- the removal of nucleotides
+```{r}
+getUpset(uniqhits,c("INS","DEL"))
+```
+Vardict and Lancet again uniquely support a lot of novel indels.
+
+## Summary plots
+```{r}
+uniqhitsmaf <- maftools::read.maf("results/uniqhits.maf")
+uniq.titv = titv(maf = uniqhitsmaf, plot = FALSE, useSyn = TRUE)
+#plot titv summary
+plotTiTv(res = uniq.titv)
+
+# Here the VAF column is unique to each Tumor_Sample_Barcode 
+# which we didn't considere as a column to count recurrence on.
+# So the values here maybe difference compared to what we find
+# in calls_recurrence
+plotVaf(uniqhitsmaf,vafCol = "VAF")
+
+
+```
+Except KMT5B, all calls above which are in genes not in Cosmic Census and don't overlap sites of cancer hotspot database (MSKCC) have low VAF.
+
+
+## Genes with multiple hits in recurrence
+```{r}
+multihitsGenes <- calls_recurrence %>% 
+  group_by(Hugo_Symbol) %>%
+  tally() %>%
+  filter(n>5) %>%
+  pull(Hugo_Symbol)
+
+# maftools needs a maf file to be read via read.maf
+multihits <- combined_maf_subsets %>% 
+  # selecting sites in multiHits genes from filtered maf  
+  dplyr::filter(Hugo_Symbol %in% multihitsGenes) %>%
+  # doing an inner join with recurrent sites
+  # to visualize just the VAFs of these sites
+  inner_join(calls_recurrence)
+
+multihits %>%  
+  dplyr::select(-caller) %>%
+  group_by(Chromosome,Start_Position,End_Position,
+           Amino_Acid_Position,Hugo_Symbol,
+           Tumor_Sample_Barcode,
+           Variant_Type,
+           Reference_Allele,
+           Tumor_Seq_Allele2,
+           HGVSp_Short,
+           Variant_Classification) %>%
+  # doing a mean so that caller specific VAF 
+  # calculations doesn't unique rows
+  summarise(VAF = mean(VAF)) %>%
+  write_tsv("results/multihits.maf")
+
+```
+
+### Upset plots for the uniq hits 
+
+SNP: Single nucleotide polymorphism -- a substitution in one nucleotide
+DNP: Double nucleotide polymorphism -- a substitution in two consecutive nucleotides
+TNP: Triple nucleotide polymorphism -- a substitution in three consecutive nucleotides
+ONP: Oligo-nucleotide polymorphism -- a substitution in more than three consecutive nucleotides
+
+
+```{r}
+getUpset(multihits,c("SNP","DNP","TNP","ONP"))
+```
+Vardict uniquely support a lot of the multihit sites but interestingly majority of the subtitution mutations are supported by all callers
+
+INS: Insertion -- the addition of nucleotides
+DEL: Deletion -- the removal of nucleotides
+```{r}
+getUpset(multihits,c("INS","DEL"))
+```
+Vardict and Lancet uniquely support a lot of the multihit sites 
+
+
+### Plot lollipops
+```{r}
+multihitsmaf <- maftools::read.maf("results/multihits.maf")
+pdf("results/multihits.pdf",width = 15,height = 8)
+lapply(multihitsGenes,function(x){
+maftools::lollipopPlot(multihitsmaf,gene = x ,AACol = "HGVSp_Short")})
+dev.off()
+```
+```{r}
+multi.titv = titv(maf = multihitsmaf, plot = FALSE, useSyn = TRUE)
+#plot titv summary
+plotTiTv(res = multi.titv)
+
+# Here the VAF column is unique to each Tumor_Sample_Barcode 
+# which we didn't considere as a column to count recurrence on.
+# So the values here maybe difference compared to what we find
+# in calls_recurrence
+plotVaf(multihitsmaf,vafCol = "VAF")
+```
+
+
+## Save
+```{r}
+calls_recurrence %>% write_tsv(file.path(results_dir,"snv_recurrence.tsv"))
+```
+
+
+
+```{r}
+DBI::dbDisconnect(con)
+```
+