AlexsLemonade · jaclyn-taroni · Jan 6, 2020 · Dec 30, 2019 · Dec 30, 2019 · Dec 31, 2019
diff --git a/analyses/README.md b/analyses/README.md
@@ -36,5 +36,5 @@ Note that _nearly all_ modules use the harmonized clinical data file (`pbta-hist
 | [`survival-analysis`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/survival-analysis) | TBD | *In progress*; will eventually contain functions for various types of survival analysis ([#18](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/18)) | N/A
 | [`sv-analysis`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/sv-analysis) | `pbta-sv-manta.tsv.gz`, `independent-specimens.wgs.primary-plus.tsv` | *In progress*; chromothripsis analysis per [#27](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/27)| N/A
 | [`tmb-compare-tcga`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/tmb-compare-tcga) | `pbta-snv-consensus-mutation-tmb-coding.tsv` | Compares PBTA tumor mutation burden to adult TCGA data; may need to be updated per [#257](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/257) | N/A
-| [`tp53_nf1_score`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/tp53_nf1_score) | `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` | Applies _TP53_ inactivation, _NF1_ inactivation, and Ras activation classifiers to RNA-seq data | N/A
+| [`tp53_nf1_score`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/tp53_nf1_score) | `pbta-snv-consensus-mutation.maf.tsv.gz`, `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds`, `pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` | Applies _TP53_ inactivation, _NF1_ inactivation, and Ras activation classifiers to RNA-seq data | N/A
 | [`transcriptomic-dimension-reduction`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/transcriptomic-dimension-reduction)| `pbta-gene-expression-rsem-fpkm.polya.rds`, `pbta-gene-expression-rsem-fpkm.stranded.rds`, `pbta-gene-expression-kallisto.polya.rds`, `pbta-gene-expression-kallisto.stranded.rds` | Dimension reduction and visualization of RNA-seq data (part of [#9](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/9)) | N/A
diff --git a/analyses/tp53_nf1_score/00-tp53-nf1-alterations.R b/analyses/tp53_nf1_score/00-tp53-nf1-alterations.R
@@ -0,0 +1,103 @@
+# Author: Krutika Gaonkar
+#
+# Read in concensus snv calls to gather alterations in TP53 and NF1
+# to evaluate classifier
+# @params snvConcensus multi-caller concensus snv calls
+# @params clincalFile clinical file: pbta-histologies.tsv
+# @params outputFolder output folder for alteration file
+# @params gencode cds bed file from gencode
+
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library("tidyverse"))
+suppressPackageStartupMessages(library("readr"))
+suppressPackageStartupMessages(library("GenomicRanges"))
+
+#### Source functions ----------------------------------------------------------
+# We can use functions from the `snv-callers` module of the OpenPBTA project
+# TODO: if a common util folder is established, use that instead
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+source(file.path(root_dir, "analyses", "snv-callers", "util",
+                 "tmb_functions.R"))
+
+#### Parse command line options ------------------------------------------------
+
+option_list <- list(
+  make_option(c("-s", "--snvConsensus"),type="character",
+              help="Consensus snv calls (.tsv) "),
+  make_option(c("-c","--clinicalFile"),type="character",
+              help="clinical file for all samples (.tsv)"),
+  make_option(c("-o","--outputFolder"),type="character",
+              help="output folder for results "),
+  make_option(c("-g","--gencode"),type="character",
+              help="cds gencode bed file")
+)
+
+opt <- parse_args(OptionParser(option_list=option_list))
+snvConsensusFile <- opt$snvConsensus
+clinicalFile <- opt$clinicalFile
+outputFolder <- opt$outputFolder
+gencodeBed <- opt$gencode
+
+#### Generate files with TP53, NF1 mutations -----------------------------------
+
+# read in consensus SNV files
+consensus_snv <- read_tsv(snvConsensusFile)
-consensus_snv <- read_tsv(snvConsensusFile)
+consensus_snv <-data.table::fread(snvConsensusFile, select = c("Chromosome", "Start_Position", "End_Position", "Strand", "Variant_Classification", "Tumor_Sample_Barcode", "Hugo_Symbol"), 
+data.table = FALSE)
-consensus_snv <- read_tsv(snvConsensusFile)
+consensus_snv <-data.table::fread(snvConsensusFile, select = c("Chromosome", "Start_Position", "End_Position", "Strand", "Variant_Classification", "Tumor_Sample_Barcode", "Hugo_Symbol"), 
+data.table = FALSE)
+# gencode cds region BED file
+gencode_cds <- read_tsv(gencodeBed, col_names = FALSE)
+# clinical file
+clinical <- read_tsv(clinicalFile)
+
+# filter the MAF data.frame to only include entries that fall within the
+# CDS bed file regions
+coding_consensus_snv <- snv_ranges_filter(maf_df = consensus_snv,
+                                          keep_ranges = gencode_cds)
+
+# subset to TP53, removing silent mutations and mutations in introns
+tp53_coding <- coding_consensus_snv %>%
+  filter(Hugo_Symbol == "TP53") %>%
+  filter(!(Variant_Classification %in% c("Silent", "Intron")))
+
+# subset to NF1, removing silent mutations, mutations in introns, and missense
+# mutations -- we exclude missense mutations because they are not annotated
+# with OncoKB
+# https://github.com/AlexsLemonade/OpenPBTA-analysis/pull/381#issuecomment-570748578
+nf1_coding <- coding_consensus_snv %>%
+  filter(Hugo_Symbol == "NF1") %>%
+  filter(!(Variant_Classification %in% c("Silent",
+                                         "Intron",
+                                         "Missense_Mutation")))
+
+# include only the relevant columns from the MAF file
+tp53_nf1_coding <- tp53_coding %>%
-tp53_nf1_coding <- tp53_coding %>%
+tp53_nf1_coding <- coding_consensus_snv %>%
+  filter(Hugo_Symbol %in% c("NF1", "TP53")) %>%
+  filter(!(Variant_Classification %in% c("Silent",
+                                         "Intron",
+                                         "Missense_Mutation")))
-tp53_nf1_coding <- tp53_coding %>%
+tp53_nf1_coding <- coding_consensus_snv %>%
+  filter(Hugo_Symbol %in% c("NF1", "TP53")) %>%
+  filter(!(Variant_Classification %in% c("Silent",
+                                         "Intron",
+                                         "Missense_Mutation")))
+  bind_rows(nf1_coding) %>%
+  select(Chromosome, Start_Position, End_Position, Strand,
+         Variant_Classification, Tumor_Sample_Barcode, Hugo_Symbol)
+
+# biospecimen IDs for tumor or cell line DNA-seq
+bs_ids <- clinical %>%
+  filter(sample_type != "Normal",
+         experimental_strategy != "RNA-Seq") %>%
+  pull(Kids_First_Biospecimen_ID)
+
+# all BS ids that are not in the data frame that contain the TP53 and NF1
+# coding mutations should be labeled as not having either
+bs_ids_without_mut <- setdiff(bs_ids,
+                              unique(tp53_nf1_coding$Tumor_Sample_Barcode))
+
+# create a data.frame with wildtype BS IDs for joining
+without_mut_df <- data.frame(
+  Chromosome = NA,
+  Start_Position = NA,
+  End_Position = NA,
+  Strand = NA,
+  Variant_Classification = NA,
+  Tumor_Sample_Barcode = bs_ids_without_mut,
+  Hugo_Symbol = "No_TP53_NF1_alt"
+)
+
+tp53_nf1_coding <- bind_rows(tp53_nf1_coding,
+                             without_mut_df)
+
+# save TP53 and NF1 SNV alterations
+write_tsv(tp53_nf1_coding,
+          file.path(outputFolder,"TP53_NF1_snv_alteration.tsv"))
diff --git a/analyses/tp53_nf1_score/README.md b/analyses/tp53_nf1_score/README.md
@@ -19,6 +19,10 @@ bash analyses/tp53_nf1_score/run_classifier.sh
 
 #### Output
 
-It produces  `results/pbta-gene-expression-rsem-fpkm-collapsed.stranded_classifier_scores.tsv`  and `results/pbta-gene-expression-rsem-fpkm-collapsed.polya_classifier_scores.tsv`, which contains all 3 classifier scores for the stranded data and for shuffled stranded (e.g., random) data.
+`00-tp53-nf1-alterations.R` produces `TP53_NF1_snv_alteration.tsv`, which contains information about the presence or absence of coding SNVs in _TP53_ and _NF1_ for the purpose of evaluating the classifier results.
+For evaluation purposes, a coding SNV 1) is found in a CDS region and 2) is not a silent mutation or in an intron as indicated by the `Variant_Classification` column of the consensus mutation file.
+_NF1_ positive examples are additionally filtered to remove missense mutations, as these are not annotated with OncoKB ([#381 (comment)](https://github.com/AlexsLemonade/OpenPBTA-analysis/pull/381#issuecomment-570748578)).
+
+`01-apply-classifier.py` produces  `results/pbta-gene-expression-rsem-fpkm-collapsed.stranded_classifier_scores.tsv`  and `results/pbta-gene-expression-rsem-fpkm-collapsed.polya_classifier_scores.tsv`, which contains all 3 classifier scores for the stranded data and for shuffled stranded (e.g., random) data.
 
 Because some of the classifier genes are not present in the OpenPBTA dataset, the scores should be interpreted as continuous values representing relative gene alterations and not as probabilities.