-
Notifications
You must be signed in to change notification settings - Fork 83
Get tp53 nf1 alt #381
Get tp53 nf1 alt #381
Changes from 22 commits
60be0e1
a94a0ed
a58a58d
30e3844
4113537
6648b53
e798b0a
4c326f3
c2f4ae4
a389305
9aa1e05
d05a62d
40fdb39
d47a30d
266b8b3
e71785d
9d0ce45
4a22f20
9a6276c
168c063
1217239
4c2257e
a433110
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,103 @@ | ||||||||||||||
# Author: Krutika Gaonkar | ||||||||||||||
# | ||||||||||||||
# Read in consensus snv calls to gather alterations in TP53 and NF1 | ||||||||||||||
# to evaluate classifier | ||||||||||||||
# @params snvConsensus multi-caller consensus snv calls | ||||||||||||||
# @params clinicalFile clinical file: pbta-histologies.tsv | ||||||||||||||
# @params outputFolder output folder for alteration file | ||||||||||||||
# @params gencode cds bed file from gencode | ||||||||||||||
|
||||||||||||||
suppressPackageStartupMessages(library("optparse")) | ||||||||||||||
suppressPackageStartupMessages(library("tidyverse")) | ||||||||||||||
suppressPackageStartupMessages(library("readr")) | ||||||||||||||
suppressPackageStartupMessages(library("GenomicRanges")) | ||||||||||||||
|
||||||||||||||
#### Source functions ---------------------------------------------------------- | ||||||||||||||
# We can use functions from the `snv-callers` module of the OpenPBTA project | ||||||||||||||
# TODO: if a common util folder is established, use that instead | ||||||||||||||
# The repository root is identified as the directory that contains `.git`, so | ||||||||||||||
# the path to the helper file is built from that root, not from a relative path | ||||||||||||||
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) | ||||||||||||||
# source() evaluates tmb_functions.R so whatever it defines is available below | ||||||||||||||
source(file.path(root_dir, "analyses", "snv-callers", "util", | ||||||||||||||
"tmb_functions.R")) | ||||||||||||||
|
||||||||||||||
#### Parse command line options ------------------------------------------------ | ||||||||||||||
|
||||||||||||||
# Four character options, none of which declares a default value | ||||||||||||||
option_list <- list( | ||||||||||||||
make_option(c("-s", "--snvConsensus"),type="character", | ||||||||||||||
help="Consensus snv calls (.tsv) "), | ||||||||||||||
make_option(c("-c","--clinicalFile"),type="character", | ||||||||||||||
help="clinical file for all samples (.tsv)"), | ||||||||||||||
make_option(c("-o","--outputFolder"),type="character", | ||||||||||||||
help="output folder for results "), | ||||||||||||||
make_option(c("-g","--gencode"),type="character", | ||||||||||||||
help="cds gencode bed file") | ||||||||||||||
) | ||||||||||||||
|
||||||||||||||
# NOTE(review): nothing checks that the options were supplied; an omitted flag | ||||||||||||||
# presumably leaves the matching variable NULL -- confirm, consider validating | ||||||||||||||
opt <- parse_args(OptionParser(option_list=option_list)) | ||||||||||||||
snvConsensusFile <- opt$snvConsensus | ||||||||||||||
clinicalFile <- opt$clinicalFile | ||||||||||||||
outputFolder <- opt$outputFolder | ||||||||||||||
gencodeBed <- opt$gencode | ||||||||||||||
|
||||||||||||||
#### Generate files with TP53, NF1 mutations ----------------------------------- | ||||||||||||||
|
||||||||||||||
# read in consensus SNV files | ||||||||||||||
consensus_snv <- read_tsv(snvConsensusFile) | ||||||||||||||
# gencode cds region BED file | ||||||||||||||
# col_names = FALSE: the first line of the file is read as data, not a header | ||||||||||||||
gencode_cds <- read_tsv(gencodeBed, col_names = FALSE) | ||||||||||||||
# clinical file | ||||||||||||||
clinical <- read_tsv(clinicalFile) | ||||||||||||||
|
||||||||||||||
# filter the MAF data.frame to only include entries that fall within the | ||||||||||||||
# CDS bed file regions | ||||||||||||||
# NOTE(review): snv_ranges_filter() is not defined in this script; presumably | ||||||||||||||
# it is provided by the sourced tmb_functions.R -- confirm | ||||||||||||||
coding_consensus_snv <- snv_ranges_filter(maf_df = consensus_snv, | ||||||||||||||
keep_ranges = gencode_cds) | ||||||||||||||
|
||||||||||||||
# TP53: keep every coding call except silent and intronic variants | ||||||||||||||
tp53_coding <- filter( | ||||||||||||||
coding_consensus_snv, | ||||||||||||||
Hugo_Symbol == "TP53", | ||||||||||||||
!(Variant_Classification %in% c("Silent", "Intron")) | ||||||||||||||
) | ||||||||||||||
|
||||||||||||||
# NF1: drop silent and intronic variants as for TP53, and additionally drop | ||||||||||||||
# missense mutations -- they are excluded because they are not annotated with | ||||||||||||||
# OncoKB | ||||||||||||||
# https://github.com/AlexsLemonade/OpenPBTA-analysis/pull/381#issuecomment-570748578 | ||||||||||||||
nf1_coding <- filter( | ||||||||||||||
coding_consensus_snv, | ||||||||||||||
Hugo_Symbol == "NF1", | ||||||||||||||
!(Variant_Classification %in% c("Silent", "Intron", "Missense_Mutation")) | ||||||||||||||
) | ||||||||||||||
|
||||||||||||||
# include only the relevant columns from the MAF file | ||||||||||||||
# (the NF1 rows are appended beneath the TP53 rows before the column subset) | ||||||||||||||
tp53_nf1_coding <- tp53_coding %>% | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If these two data.frames are going to bound together anyway, is there any reason you can't just do one filtering step where you select both NF1 and TP53?
Suggested change
If the concern is getting Missense_Mutations that are TP53 then you could add an another filter step to remove that combo. e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added this as two, more explicit steps on purpose because it's important to document the logic around NF1 specifically and because multiple people are working on this module. |
||||||||||||||
bind_rows(nf1_coding) %>% | ||||||||||||||
select(Chromosome, Start_Position, End_Position, Strand, | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you are only going to use these columns, you could probably speed up the reading in process at line 44. I will comment my suggestion there. |
||||||||||||||
Variant_Classification, Tumor_Sample_Barcode, Hugo_Symbol) | ||||||||||||||
|
||||||||||||||
# biospecimen IDs for tumor or cell line DNA-seq | ||||||||||||||
# Keeps every row that is not a Normal sample and whose experimental_strategy | ||||||||||||||
# is anything other than RNA-Seq. Rows with NA in either column are dropped as | ||||||||||||||
# well, because filter() only keeps rows where the condition is TRUE. | ||||||||||||||
bs_ids <- clinical %>% | ||||||||||||||
filter(sample_type != "Normal", | ||||||||||||||
experimental_strategy != "RNA-Seq") %>% | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want those There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The panel samples (and the cell line samples) were all included in the original pull request. |
||||||||||||||
pull(Kids_First_Biospecimen_ID) | ||||||||||||||
|
||||||||||||||
# biospecimens with at least one retained TP53 or NF1 coding mutation | ||||||||||||||
mutated_bs_ids <- unique(tp53_nf1_coding$Tumor_Sample_Barcode) | ||||||||||||||
# every other tumor / cell line biospecimen is labeled as having neither | ||||||||||||||
bs_ids_without_mut <- setdiff(bs_ids, mutated_bs_ids) | ||||||||||||||
|
||||||||||||||
# create a tibble with wildtype BS IDs for joining | ||||||||||||||
# Only the two populated columns are declared; bind_rows() fills the columns | ||||||||||||||
# that are absent here (Chromosome, Start_Position, End_Position, Strand, | ||||||||||||||
# Variant_Classification) with NA of the matching type. | ||||||||||||||
# tibble() replaces data.frame() because it recycles the length-one label to | ||||||||||||||
# zero rows when bs_ids_without_mut is empty, where data.frame() stops with | ||||||||||||||
# "arguments imply differing number of rows". It also keeps the strings as | ||||||||||||||
# character instead of converting them to factors on R < 4.0. | ||||||||||||||
without_mut_df <- tibble( | ||||||||||||||
Tumor_Sample_Barcode = bs_ids_without_mut, | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe |
||||||||||||||
Hugo_Symbol = "No_TP53_NF1_alt" | ||||||||||||||
) | ||||||||||||||
|
||||||||||||||
# append the wildtype placeholder rows beneath the TP53/NF1 mutation rows | ||||||||||||||
tp53_nf1_coding <- tp53_nf1_coding %>% | ||||||||||||||
bind_rows(without_mut_df) | ||||||||||||||
|
||||||||||||||
# write the combined TP53 and NF1 SNV alteration table to the output folder | ||||||||||||||
output_file <- file.path(outputFolder,"TP53_NF1_snv_alteration.tsv") | ||||||||||||||
write_tsv(tp53_nf1_coding, output_file) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If these are the only columns you are using, this should speed up this step a bit, but mostly this will be sped up by using
data.table::fread
instead of readr::read_tsv
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
data.table::fread
is faster for big files.