diff --git a/.circleci/config.yml b/.circleci/config.yml index 58ca37cbf0..5827b300f4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -61,7 +61,11 @@ jobs: - run: name: Independent samples command: ./scripts/run_in_ci.sh bash analyses/independent-samples/run-independent-samples.sh - + + - run: + name: Oncoprint plotting + command: ./scripts/run_in_ci.sh bash "analyses/oncoprint-landscape/run-oncoprint.sh" + ################################ #### Add your analysis here #### ################################ @@ -69,7 +73,6 @@ jobs: - run: name: SNV Caller Analysis command: OPENPBTA_VAF_CUTOFF=0.5 ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_evals.sh - deploy: machine: diff --git a/analyses/oncoprint-landscape/01-plot-oncoprint.R b/analyses/oncoprint-landscape/01-plot-oncoprint.R new file mode 100644 index 0000000000..81004ab102 --- /dev/null +++ b/analyses/oncoprint-landscape/01-plot-oncoprint.R @@ -0,0 +1,249 @@ +# This script displays an oncoprint displaying the landscape across PBTA given +# the relevant metadata and output MAF files from the snv callers strelka2, +# mutect2, lancet, and vardict. It addresses issue #6 in the OpenPBTA-analysis +# github repository. +# +# Code adapted from the PPTC PDX Oncoprint Generation repository here: +# https://github.com/marislab/create-pptc-pdx-oncoprints/tree/master/R +# +# Chante Bethell for CCDL 2019 and Jo Lynne Rokita +# +# #### USAGE +# This script is intended to be run via the command line from the top directory +# of the repository as follows: +# +# Rscript 'analyses/oncoprint-landscape/01-plot-oncoprint.R' + +#### Set Up -------------------------------------------------------------------- + +# Install maftools +if (!("maftools" %in% installed.packages())) { + install.packages("BiocManager") + BiocManager::install("maftools") +} +library(maftools) + +# Get `magrittr` pipe +`%>%` <- dplyr::`%>%` + +#### Directories and Files ----------------------------------------------------- + +# Detect the ".git" folder -- this will in the project root directory. +# Use this as the root directory to ensure proper execution, no matter where +# it is called from. +root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) + +# Path to the data obtained via `bash download-data.sh`. +data_dir <- file.path(root_dir, "data") + +# Path to output directory for plots produced +plots_dir <- + file.path(root_dir, "analyses", "oncoprint-landscape", "plots") + +if (!dir.exists(plots_dir)) { + dir.create(plots_dir) +} + +# Source the color palette for plots +source( + file.path( + root_dir, + "analyses", + "oncoprint-landscape", + "util", + "oncoplot-palette.R" + ) +) + +#### Command line options ------------------------------------------------------ + +# Declare command line options +option_list <- list( + optparse::make_option( + c("-m", "--maf_file"), + type = "character", + default = NULL, + help = "file path to MAF file that contains snv information", + ), + optparse::make_option( + c("-c", "--cnv_file"), + type = "character", + default = NULL, + help = "file path to SEG file that contains cnv information" + ), + optparse::make_option( + c("-f", "--fusion_file"), + type = "character", + default = NULL, + help = "file path to file that contains fusion information" + ), + optparse::make_option( + c("-g", "--goi_list"), + type = "character", + default = NULL, + help = "file path to file that contains list of genes to include on + oncoprint" + ), + optparse::make_option( + c("-p", "--png_name"), + type = "character", + default = NULL, + help = "oncoprint output png file name" + ) +) + +# Read the arguments passed +opt_parser <- optparse::OptionParser(option_list = option_list) +opt <- optparse::parse_args(opt_parser) + +maf <- opt$maf_file + +# Define cnv_file object here as it still needs to be defined for the `read.maf` +# function, even if it is NULL +cnv_file <- opt$cnv_file + +#### Read in data -------------------------------------------------------------- + +# Read in metadata +metadata <- + readr::read_tsv(file.path(data_dir, "pbta-histologies.tsv")) + +# Rename for maftools function +metadata <- metadata %>% + dplyr::rename("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID") + +# Read in MAF file +maf_df <- data.table::fread(maf, stringsAsFactors = FALSE) + +# Read in cnv file +if (!is.null(opt$cnv_file)) { + cnv_file <- data.table::fread(opt$cnv_file, stringsAsFactors = FALSE) + # TODO: Filter and set up `cnv_file` to be in the column format - + # "Hugo_Symbol, Tumor_Sample_Barcode, Variant_Classification" as required by + # the `read.maf function` +} + +# Read in fusion file +if (!is.null(opt$fusion_file)) { + fusion_file <- + data.table::fread(opt$fusion_file, + data.table = FALSE, + stringsAsFactors = FALSE) + + #### Incorporate Fusion Data ------------------------------------------------- + # TODO: Once the consensus calls of the fusion data are obtained, this section + # will need to be adapted to the format of the fusion input file. For example, + # the way we separate the genes out of `FusionName` may need to be adapted. + + # Separate fusion gene partners and add variant classification and center + fus_sep <- fusion_file %>% + # Separate the 5' and 3' genes + tidyr::separate(FusionName, c("Gene1", "Gene2"), sep = "--") %>% + dplyr::select(Sample, Gene1, Gene2) + + reformat_fusion <- fus_sep %>% + # Here we want to tally how many times the 5' gene shows up as a fusion hit + # in a sample + dplyr::group_by(Sample, Gene1) %>% + dplyr::tally() %>% + # If the sample-5' gene pair shows up more than once, call it a multi hit + # fusion + dplyr::mutate(Variant_Classification = + dplyr::if_else(n == 1, "Fusion", "Multi_Hit_Fusion"), + # Required column for joining with MAF + Variant_Type = "OTHER") %>% + # Correct format for joining with MAF + dplyr::rename(Tumor_Sample_Barcode = Sample, Hugo_Symbol = Gene1) %>% + dplyr::select(-n) + + # Merge with MAF + maf_df <- dplyr::bind_rows(maf_df, reformat_fusion) +} + +#### Convert into MAF object --------------------------------------------------- + +maf_object <- + read.maf( + maf = maf_df, + clinicalData = metadata, + cnTable = cnv_file, + removeDuplicatedVariants = FALSE, + vc_nonSyn = c( + "Frame_Shift_Del", + "Frame_Shift_Ins", + "Splice_Site", + "Nonsense_Mutation", + "Nonstop_Mutation", + "In_Frame_Del", + "In_Frame_Ins", + "Missense_Mutation", + "Fusion", + "Multi_Hit", + "Multi_Hit_Fusion" + ) + ) + +#### Specify genes ------------------------------------------------------------- + +if (!is.null(opt$goi_list)) { + # Read in gene list + goi_list <- + read.delim( + file.path(opt$goi_list), + sep = "\t", + header = FALSE, + as.is = TRUE + ) + + # Get top mutated this data and goi list + gene_sum <- mafSummary(maf_object)$gene.summary + + # Subset for genes in the histology-specific list + subset_gene_sum <- subset(gene_sum, Hugo_Symbol %in% goi_list$V1) + + # Get top altered genes + goi_ordered <- + subset_gene_sum[order(subset_gene_sum$AlteredSamples, decreasing = TRUE), ] + + # Select n top genes + num_genes <- ifelse(nrow(goi_ordered) > 20, 20, nrow(goi_ordered)) + goi_ordered_num <- goi_ordered[1:num_genes, ] + genes <- goi_ordered_num$Hugo_Symbol + +} else { + # If a gene list is not supplied, we do not want the `oncoplot` function to + # filter the genes to be plotted, so we assign NULL to the `genes`` object. + genes <- NULL +} + +#### Plot and Save Oncoprint --------------------------------------------------- + +# Given a maf file, plot an oncoprint of the variants in the +# dataset and save as a png file. +png( + file.path(plots_dir, opt$png_name), + width = 65, + height = 30, + units = "cm", + res = 300 +) +oncoplot( + maf_object, + clinicalFeatures = c( + "broad_histology", + "short_histology", + "reported_gender", + "tumor_descriptor", + "molecular_subtype" + ), + genes = genes, + logColBar = TRUE, + sortByAnnotation = TRUE, + showTumorSampleBarcodes = TRUE, + removeNonMutated = FALSE, + annotationFontSize = 0.7, + SampleNamefontSize = 0.5, + fontSize = 0.7, + colors = color_palette +) +dev.off() diff --git a/analyses/oncoprint-landscape/README.md b/analyses/oncoprint-landscape/README.md new file mode 100644 index 0000000000..f85c509312 --- /dev/null +++ b/analyses/oncoprint-landscape/README.md @@ -0,0 +1,40 @@ +# OpenPBTA Oncoprint + +## Usage + +To run the Rscript in this module from the command line as intended, use: + +``` +bash run-oncoprint.sh +``` + +`run-oncoprint.sh` is designed to be run as if it was called from this module directory even when called from outside of this directory. + +## Folder content + +This folder contains a script tasked to produce an oncoprint displaying the lanscape of the genetic lesions in the PBTA dataset. + +`01-plot-oncoprint.R` is a script written produce an oncoprint displaying the lanscape of the genetic lesions in the PBTA dataset (as discussed in [issue #6](https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/6)). +This script produces an oncoprint plot containing SNV information, as well as the optional CNV and fusion information. + +_The oncoprint can be viewed below:_ +![Oncoprint](plots/maf_oncoprint.png) + +## Folder structure + +The structure of this folder is as follows: + +``` +oncoprint-landscape +├── 01-plot-oncoprint.R +├── README.md +├── driver-lists +│   ├── brain-goi-list-long.txt +│   └── brain-goi-list-short.txt +├── plots +│   └── maf_oncoprint.png +├── run-oncoprint.sh +└── util + └── oncoplot-palette.R + +``` diff --git a/analyses/oncoprint-landscape/driver-lists/brain-goi-list-long.txt b/analyses/oncoprint-landscape/driver-lists/brain-goi-list-long.txt new file mode 100644 index 0000000000..ead82f21b6 --- /dev/null +++ b/analyses/oncoprint-landscape/driver-lists/brain-goi-list-long.txt @@ -0,0 +1,110 @@ +UTX +H3F3A +DDX3X +KMT2D +CTNNB1 +SMARCA4 +TP53 +PTCH1 +KDM6A +FGFR1 +PIK3CA +KMT2C +BRAF +CREBBP +ACVR1 +ATRX +NF1 +PDGFRA +HIST1H3B +HIST1H3C +CHD6 +CHD7 +CHD8 +CHD2 +CHD4 +SETD2 +CDC27 +CDKN2A +CHRD +EGFR +RUNX1 +IGF2R +RELA +ALK +YAP1 +NFIA +EZH2 +HIST1H3A +SMARCB1 +MYCN +IDH2 +IDH1 +NF2 +AKT1 +SMO +SUFU +TRAF7 +POLR2A +TERT +NRAS +KRAS +RAF1 +PTPN11 +NTRK2 +TSC1 +TSC2 +GLI2 +APC +PTCH2 +PRDM6 +GFI1 +PRDM6 +CDK6 +MET +KIT +APOBEC3A +APOBEC3B +ETV6 +NTRK3 +KMT2A +CR2 +VWF +ASXL1 +BTNL3 +CDKN2B +MYC +C11ORF70 +C11ORF95 +TTYH1 +LAIR1 +DPRX +CIC +DUX4 +DNMT3B +CCND1 +CCND2 +LIN28A +DNMT3B +MIR519C +MIR515-1 +MIR515-2 +SNCAIP +LRB1B +OTX2 +DUX4L1 +CFAP300 +MLH1 +MSH2 +PMS2 +MSH6 +PMS1 +POLE +POLD1 +MLH3 +MSH4 +MSH5 +KMT5B +KDM6A +EZHIP +CXORF67 diff --git a/analyses/oncoprint-landscape/driver-lists/brain-goi-list-short.txt b/analyses/oncoprint-landscape/driver-lists/brain-goi-list-short.txt new file mode 100644 index 0000000000..a43c825f0e --- /dev/null +++ b/analyses/oncoprint-landscape/driver-lists/brain-goi-list-short.txt @@ -0,0 +1,26 @@ +CTNNB1 +PTCH1 +MYCN +TP53 +MYC +ATRX +H3F3A +HIST1H3B +FGFR1 +SETD1 +ID2 +CDKN2A +CDKN2B +SMARCB1 +BRAF +MSH2 +CDK4 +CFAP300 +MIR519C +SMARCA4 +CIC +BRAF +POLE +SETD2 +NF1 +MYC \ No newline at end of file diff --git a/analyses/oncoprint-landscape/plots/maf_oncoprint.png b/analyses/oncoprint-landscape/plots/maf_oncoprint.png new file mode 100644 index 0000000000..2fe23e4a84 Binary files /dev/null and b/analyses/oncoprint-landscape/plots/maf_oncoprint.png differ diff --git a/analyses/oncoprint-landscape/run-oncoprint.sh b/analyses/oncoprint-landscape/run-oncoprint.sh new file mode 100644 index 0000000000..ca62345e7c --- /dev/null +++ b/analyses/oncoprint-landscape/run-oncoprint.sh @@ -0,0 +1,21 @@ +# Chante Bethell for CCDL 2019 +# Run 01-plot-oncoprint.R +# +# Usage: bash run-oncoprint.sh + +# This script should always run as if it were being called from +# the directory it lives in. +script_directory="$(perl -e 'use File::Basename; + use Cwd "abs_path"; + print dirname(abs_path(@ARGV[0]));' -- "$0")" +cd "$script_directory" || exit + +mkdir driver-lists +wget --no-clobber --directory-prefix=driver-lists -O driver-lists/brain-goi-list-long.txt https://github.com/marislab/create-pptc-pdx-oncoprints/raw/2c9ed2a2331bcef3003d6aa25a130485d76a3535/data/brain-goi-list-old.txt +wget --no-clobber --directory-prefix=driver-lists -O driver-lists/brain-goi-list-short.txt https://github.com/marislab/create-pptc-pdx-oncoprints/raw/2c9ed2a2331bcef3003d6aa25a130485d76a3535/data/brain-goi-list.txt + +Rscript --vanilla 01-plot-oncoprint.R \ + --maf_file ../../data/pbta-snv-mutect2.vep.maf.gz \ + --fusion_file ../../scratch/arriba.tsv \ + --goi_list driver-lists/brain-goi-list-long.txt \ + --png_name maf_oncoprint.png diff --git a/analyses/oncoprint-landscape/util/oncoplot-palette.R b/analyses/oncoprint-landscape/util/oncoplot-palette.R new file mode 100644 index 0000000000..c644324f97 --- /dev/null +++ b/analyses/oncoprint-landscape/util/oncoplot-palette.R @@ -0,0 +1,32 @@ +# Chante Bethell for CCDL 2019 and Jo Lynne Rokita +# Code adapted from the PPTC PDX Oncoprint Generation repository here: +# https://github.com/marislab/create-pptc-pdx-oncoprints/tree/master/R +# +# # #### USAGE +# This script is intended to be sourced in the +# 'analyses/oncoprint-landscape/01-plot-oncoprint.R' script as follows: +# +# source(file.path("util", "oncoplot-palette.R")) + + +# Define a color vector for plots +color_palette <- c("Missense_Mutation" = "#35978f", + "Nonsense_Mutation" = "#000000", + "Frame_Shift_Del" = "#56B4E9", + "Frame_Shift_Ins" = "#FFBBFF", + "Splice_Site" = "#F0E442", + "Translation_Start_Site" = "#191970", + "Nonstop_Mutation" = "#545454", + "In_Frame_Del" = "#CAE1FF", + "In_Frame_Ins" = "#FFE4E1", + "Stop_Codon_Ins" = "#CC79A7", + "Start_Codon_Del" = "#56B4E9", + "Fusion" = "#7B68EE", + "Multi_Hit" = "#f46d43", + "Hom_Deletion" = "#313695", + "Hem_Deletion" = "#abd9e9", + "Amplification" = "#c51b7d", + "Loss" = "#0072B2", + "Gain" = "#D55E00", + "High_Level_Gain" = "#FF0000", + "Multi_Hit_Fusion" = "#CD96CD")