
Commit

Merge remote-tracking branch 'Alex/master' into collapse-rnaseq
rathik committed Nov 4, 2019
2 parents 8563cf7 + 3f1819d commit 7e8cee6
Showing 31 changed files with 2,865,821 additions and 214 deletions.
10 changes: 7 additions & 3 deletions .circleci/config.yml
@@ -67,15 +67,19 @@ jobs:
- run:
name: Independent samples
command: ./scripts/run_in_ci.sh bash analyses/independent-samples/run-independent-samples.sh


- run:
name: Focal CN Preparation
command: ./scripts/run_in_ci.sh bash analyses/focal-cn-file-preparation/run-prepare-cn.sh

- run:
name: Interaction plot
command: ./scripts/run_in_ci.sh bash analyses/interaction-plots/01-create-interaction-plots.sh

- run:
name: Oncoprint plotting
command: ./scripts/run_in_ci.sh bash "analyses/oncoprint-landscape/run-oncoprint.sh"

################################
#### Add your analysis here ####
################################
20 changes: 11 additions & 9 deletions Dockerfile
@@ -15,7 +15,7 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \

# Required for interactive sample distribution plots
# mapview is needed to create HTML outputs of the interactive plots
RUN apt-get update -qq && apt-get -y --no-install-recommends install \
&& install2.r --error \
--deps TRUE \
gdalUtils \
@@ -28,9 +28,9 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \
mapview

# Installs packages needed for still treemap, interactive plots, and hex plots
# Rtsne and umap are required for dimension reduction analyses
# optparse is needed for passing arguments from the command line to R script
RUN apt-get update -qq && apt-get -y --no-install-recommends install \
&& install2.r --error \
--deps TRUE \
R.utils \
@@ -45,7 +45,7 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \
pheatmap \
RColorBrewer \
viridis \
data.table

# maftools for proof of concept in create-subset-files
RUN R -e "BiocManager::install(c('maftools'), update = FALSE)"
@@ -72,7 +72,7 @@ RUN R -e "devtools::install_github('timelyportfolio/d3treeR', ref = '0eaba7f1c64
RUN R -e "devtools::install_github('clauswilke/colorblindr', ref = '1ac3d4d62dad047b68bb66c06cee927a4517d678', dependencies = TRUE)"

# Required for sex prediction from RNA-seq data
RUN apt-get update -qq && apt-get -y --no-install-recommends install \
&& install2.r --error \
--deps TRUE \
glmnet \
@@ -86,24 +86,26 @@ RUN apt-get -y update && apt-get install -y \
&& rm -rf /var/lib/apt/lists/

# Install for SNV comparison plots
RUN apt-get update -qq && apt-get -y --no-install-recommends install \
&& install2.r --error \
--deps TRUE \
UpSetR

RUN R -e "devtools::install_github('const-ae/ggupset', ref = '7a33263cc5fafdd72a5bfcbebe5185fafe050c73', dependencies = TRUE)"

# GGally and its required packages
RUN apt-get update -qq && apt-get -y --no-install-recommends install \
&& install2.r --error \
lattice \
rpart \
class \
MASS \
GGally

# Required for summarizing the RNA-seq matrices
RUN Rscript -e "install.packages('reshape2')"
# Help display tables in R Notebooks
RUN apt-get update -qq && apt-get -y --no-install-recommends install \
&& install2.r --error \
flextable

#### Please install your dependencies here
#### Add a comment to indicate what analysis it is required for
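
As a quick, informal check that the dependencies added in this commit are actually available inside the built image, something like the snippet below could be run with `Rscript` in the container. This is a sketch only, not part of the repository's CI; the package list simply mirrors the two additions above (`reshape2` and `flextable`).

```r
# Sketch (not part of the repo): verify that packages added in this commit load.
needed <- c("reshape2", "flextable")
missing <- needed[!vapply(needed, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing) > 0) {
  stop("Missing packages: ", paste(missing, collapse = ", "))
}
message("All newly added packages are installed.")
```
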
54 changes: 46 additions & 8 deletions README.md
@@ -68,7 +68,9 @@ Users performing analyses should always refer to the symlinks in the `data/` directory.
### Data Access via Download Script

We have created a shell script that will download the latest release from AWS S3.
macOS users must install `md5sum` before running the download script for the first time.
This can be installed with [homebrew](https://brew.sh/) via the command `brew install md5sha1sum` or with [conda/miniconda](https://docs.conda.io/projects/conda/en/latest/) via the command `conda install -c conda-forge coreutils`.

Once this has been done, run `bash download-data.sh` to acquire the latest release.
This will create symlinks in `data/` to the latest files.
It's safe to re-run `bash download-data.sh` to check that you have the most recent release of the data.
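
The download script performs its integrity check with `md5sum` on the command line; purely for illustration, an equivalent check can be done in R with `tools::md5sum()`. The manifest file name (`md5sum.txt`) and release directory used below are assumptions, not documented file names.

```r
# Minimal sketch (assumptions: a two-column checksum manifest named md5sum.txt
# shipped with the release, and a release directory under data/).
release_dir <- "data/release-v7-20191031"
manifest <- read.table(file.path(release_dir, "md5sum.txt"),
                       col.names = c("expected", "file"),
                       stringsAsFactors = FALSE)
observed <- tools::md5sum(file.path(release_dir, manifest$file))
bad <- manifest$file[is.na(observed) | observed != manifest$expected]
if (length(bad) == 0) {
  message("All checksums match.")
} else {
  warning("Checksum mismatch or missing file: ", paste(bad, collapse = ", "))
}
```
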
@@ -85,13 +87,49 @@ Users downloading via CAVATICA should place the data files within a `data/releas
## Data Formats

The release notes for each release are provided in the `release-notes.md` file that accompanies the data files.

* Somatic Single Nucleotide Variant (SNV) data are provided in [Annotated MAF format](doc/format/vep-maf.md) files for each of the [applied software packages](https://alexslemonade.github.io/OpenPBTA-manuscript/#somatic-single-nucleotide-variant-calling).
* Somatic Copy Number Variant (CNV) data are provided in a modified [SEG format](https://software.broadinstitute.org/software/igv/SEG) for each of the [applied software packages](https://alexslemonade.github.io/OpenPBTA-manuscript/#somatic-copy-number-variant-calling).
* The CNVkit SEG file has an additional column `copy.num` denoting the copy number of each segment, derived from the algorithm's CNS file output, as described [here](https://cnvkit.readthedocs.io/en/stable/fileformats.html).
* The ControlFreeC TSV file is a merge of the `*_CNVs` files produced by the algorithm; its columns are described [here](http://boevalab.inf.ethz.ch/FREEC/tutorial.html#OUTPUT).
* NOTE: The _copy number_ in the CNVkit SEG file is annotated with respect to ploidy 2; however, the _status_ in the ControlFreeC TSV file is annotated with respect to the ploidy inferred by the algorithm, which is recorded in the `pbta-histologies.tsv` file. See the table below for examples of possible interpretations, and the short R sketch that follows this list.

| Ploidy | Copy Number | Gain/Loss Interpretation |
|--------|-------------|------------------------------|
| 2 | 0 | Loss; homozygous deletion |
| 2 | 1 | Loss; hemizygous deletion |
| 2 | 2 | Copy neutral |
| 2 | 3 | Gain; one copy gain |
| 2 | 4 | Gain; two copy gain |
| 2 | 5+ | Gain; possible amplification |
| 3 | 0 | Loss; 3 copy loss |
| 3 | 1 | Loss; 2 copy loss |
| 3 | 2 | Loss; 1 copy loss |
| 3 | 3 | Copy neutral |
| 3 | 4 | Gain; one copy gain |
| 3 | 5 | Gain; two copy gain |
| 3 | 6+ | Gain; possible amplification |
| 4 | 0 | Loss; 4 copy loss |
| 4 | 1 | Loss; 3 copy loss |
| 4 | 2 | Loss; 2 copy loss |
| 4 | 3 | Loss; 1 copy loss |
| 4 | 4 | Copy neutral |
| 4 | 5 | Gain; one copy gain |
| 4 | 6 | Gain; two copy gain |
| 4 | 7+ | Gain; possible amplification |

* Somatic Structural Variant Data (Somatic SV) are provided in the [Annotated Manta TSV](doc/format/manta-tsv-header.md) format produced by the [applied software packages](https://alexslemonade.github.io/OpenPBTA-manuscript/#somatic-structural-variant-calling).
* Gene expression estimates from the [applied software packages](https://alexslemonade.github.io/OpenPBTA-manuscript/#gene-expression-abundance-estimation) are provided as a gene by sample matrix.
* Gene Fusions produced by the [applied software packages](https://alexslemonade.github.io/OpenPBTA-manuscript/#rna-fusion-calling-and-prioritization) are provided as [Arriba TSV](doc/format/arriba-tsv-header.md) and [STARFusion TSV](doc/format/starfusion-tsv-header.md) respectively.
* [Harmonized clinical data](https://alexslemonade.github.io/OpenPBTA-manuscript/#clinical-data-harmonization) are released as tab separated values.
* For participants with multiple tumor specimens, [Independent specimen lists](https://alexslemonade.github.io/OpenPBTA-manuscript/#selection-of-independent-samples) are provided as TSV files with columns for participant ID and specimen ID.
These files are used for analyses such as mutation co-occurrence, where repeated samples might cause bias (a brief usage sketch follows this list).
There are four of these files:
1. `independent-specimens.wgs.primary.tsv` with WGS samples and only primary tumors
2. `independent-specimens.wgs.primary-plus.tsv` as above, but including non-primary tumors where a primary tumor sample is not available
3. `independent-specimens.wgswxs.primary.tsv` with only primary tumors, using WXS samples where WGS is not available
4. `independent-specimens.wgswxs.primary-plus.tsv` as above, but including non-primary tumors where a primary tumor sample is not available
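
The interpretations in the table above can be expressed as a small helper. The sketch below is illustrative only: the function name is made up here, and "possible amplification" is taken to mean three or more copies above ploidy, mirroring the table.

```r
# Illustrative helper (not from the repository): interpret a segment's copy
# number relative to tumor ploidy, following the table above.
interpret_cn <- function(copy_number, ploidy) {
  if (copy_number == ploidy) {
    return("Copy neutral")
  }
  if (copy_number < ploidy) {
    if (ploidy == 2 && copy_number == 0) return("Loss; homozygous deletion")
    if (ploidy == 2 && copy_number == 1) return("Loss; hemizygous deletion")
    return(paste0("Loss; ", ploidy - copy_number, " copy loss"))
  }
  if (copy_number >= ploidy + 3) return("Gain; possible amplification")
  if (copy_number == ploidy + 1) return("Gain; one copy gain")
  return("Gain; two copy gain")
}

interpret_cn(1, 2)  # "Loss; hemizygous deletion"
interpret_cn(6, 3)  # "Gain; possible amplification"
interpret_cn(5, 4)  # "Gain; one copy gain"
```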

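As a concrete illustration of how an independent specimen list might be applied before a co-occurrence-style analysis, here is a minimal sketch. The MAF file name is a placeholder, and the `Kids_First_Biospecimen_ID` / `Tumor_Sample_Barcode` column names are assumptions based on the descriptions in this README.

```r
# Minimal sketch, assuming column names; the MAF path is a placeholder.
library(dplyr)
library(readr)

independent <- read_tsv("data/independent-specimens.wgs.primary.tsv")
maf <- read_tsv("data/pbta-snv-mutect2.vep.maf.gz", comment = "#")

independent_maf <- maf %>%
  filter(Tumor_Sample_Barcode %in% independent$Kids_First_Biospecimen_ID)

# e.g., gene-level mutation counts restricted to independent specimens
independent_maf %>%
  count(Hugo_Symbol, sort = TRUE) %>%
  head()
```
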

### Data Caveats

10 changes: 10 additions & 0 deletions analyses/cnv-comparison/README.md
@@ -0,0 +1,10 @@
## Compare CNV callers

This analysis is **DEPRECATED**.
It was designed to compare results from CNVkit and ControlFreeC when both methods produced SEG files and when the following [was noted in the README](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/0c2d0d25c01dcbbbd63f94b064a69afc9dc44ea8#data-caveats):

> We noticed ControlFreeC does not properly handle aneuploidy well for a subset of samples in that it calls the entire genome gained.

As of `release-v7-20191031`, the CNV files are in two different formats (see: [CNVkit format](https://cnvkit.readthedocs.io/en/stable/fileformats.html) and [ControlFreeC format](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/controlfreec-tsv.md)).

Consensus copy number calls are tracked here: https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/128
71 changes: 71 additions & 0 deletions analyses/collapse-rnaseq/00-create-rsem-files.R
@@ -0,0 +1,71 @@
# Author: Komal S. Rathi
# Date: 11/09/2019
# Function: Merge RSEM files and split into polya and stranded

# load libraries
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(reshape2))

# read params
option_list <- list(
make_option(c("-i", "--inputdir"), type = "character",
help = "Input directory for RSEM files"),
make_option(c("-c", "--clinical"), type = "character",
help = "Histology file (.TSV)"),
make_option(c("-m", "--manifest"), type = "character",
help = "Manifest file (.csv)"),
make_option(c("-o", "--outdir"), type = "character",
help = "Path to output directory")
)

# parse the parameters
opt <- parse_args(OptionParser(option_list = option_list))
topDir <- opt$inputdir
clin <- opt$clinical
manifest <- opt$manifest
outdir <- opt$outdir

# read manifest file
manifest <- read.csv(manifest, stringsAsFactors = F)
manifest <- manifest[,c("Kids.First.Biospecimen.ID","name")]
manifest$name <- gsub('[.].*', '', manifest$name)

# read histology file and split into polyA and stranded
clin <- read.delim(clin, stringsAsFactors = F)
polya <- clin %>%
filter(experimental_strategy == "RNA-Seq" & RNA_library == "poly-A") %>%
as.data.frame()

stranded <- clin %>%
filter(experimental_strategy == "RNA-Seq" & RNA_library == "stranded") %>%
as.data.frame()

# read and merge RSEM genes files
lfiles <- list.files(path = topDir, pattern = "\\.rsem\\.genes\\.results\\.gz$", recursive = TRUE, full.names = T)
read.rsem <- function(x){
print(x)
dat <- data.table::fread(x)
filename <- gsub('.*[/]|.rsem.genes.results.gz','',x)
sample.id <- manifest[which(manifest$name %in% filename),'Kids.First.Biospecimen.ID']
dat$Sample <- sample.id
return(dat)
}
expr <- lapply(lfiles, read.rsem)
expr <- data.table::rbindlist(expr)
expr.fpkm <- dcast(expr, gene_id~Sample, value.var = 'FPKM') # FPKM
expr.counts <- dcast(expr, gene_id~Sample, value.var = 'expected_count') # counts

# split into polya and stranded matrices
polya.fpkm <- expr.fpkm[,polya$Kids_First_Biospecimen_ID]
stranded.fpkm <- expr.fpkm[,stranded$Kids_First_Biospecimen_ID]
polya.counts <- expr.counts[,polya$Kids_First_Biospecimen_ID]
stranded.counts <- expr.counts[,stranded$Kids_First_Biospecimen_ID]

# save output
saveRDS(polya.fpkm, file = paste0(outdir,'/pbta-gene-expression-rsem-fpkm.polya.rds'))
saveRDS(polya.counts, file = paste0(outdir, '/pbta-gene-counts-rsem-expected_count.polya.rds'))
saveRDS(stranded.fpkm, file = paste0(outdir, '/pbta-gene-expression-rsem-fpkm.stranded.rds'))
saveRDS(stranded.counts, file = paste0(outdir, '/pbta-gene-counts-rsem-expected_count.stranded.rds'))
print("Done!")

41 changes: 41 additions & 0 deletions analyses/collapse-rnaseq/02-analyze-drops.Rmd
@@ -0,0 +1,41 @@
---
title: "Summary of collapsed symbols"
output: html_notebook
---

```{r}
# load libraries
library(tidyverse)
library(reshape2)
library(refGenome)
# NOTE: `input.dat` (path to an RDS of a gene_id x sample FPKM matrix) and
# `output.mat` (path for the collapsed output) must be defined before this chunk runs
# actual code
print("Generating input matrix...!")
expr <- readRDS(input.dat)
# reduce dataframe
expr <- expr[which(rowSums(expr[,2:ncol(expr)]) > 0),] # remove genes with no expression across all samples
expr <- expr[grep('_PAR_', expr$gene_id, invert = T),] # discard PAR_* gene IDs (pseudoautosomal region duplicates)
# collapse to matrix of HUGO symbols x Sample identifiers
# take mean per row and use the max value for duplicated gene symbols
expr.collapsed <- expr %>%
separate(gene_id, c("gene_id", "gene_symbol"), sep = "\\_", extra = "merge") %>%
pivot_longer(-c(gene_id, gene_symbol),
names_to = "sample_name", values_to = "fpkm") %>%
group_by(gene_id) %>%
mutate(means = mean(fpkm)) %>%
group_by(gene_symbol) %>%
filter(means == max(means)) %>%
select(-gene_id) %>% unique()
# matrix of HUGO symbols x Sample identifiers
expr.input <- acast(expr.collapsed, gene_symbol~sample_name, value.var = 'fpkm')
print(dim(expr.input))
# save matrix
saveRDS(object = expr.input, file = output.mat)
print("Matrix generated. Done!!")
```
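
A rough way to quantify what the max-mean rule above drops is sketched below; it reuses the `expr` object from the previous chunk and the `ENSG..._SYMBOL` gene_id convention assumed there.

```{r}
# Sketch: how many Ensembl gene IDs map to each symbol, and how many rows
# the max-mean rule discards when collapsing to unique symbols.
drop_summary <- expr %>%
  separate(gene_id, c("gene_id", "gene_symbol"), sep = "\\_", extra = "merge") %>%
  distinct(gene_id, gene_symbol) %>%
  count(gene_symbol) %>%
  arrange(desc(n))
head(drop_summary)
# total gene IDs dropped by collapsing: (IDs per symbol - 1), summed over symbols
sum(drop_summary$n - 1)
```
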

334 changes: 334 additions & 0 deletions analyses/collapse-rnaseq/02-analyze-drops.nb.html

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions analyses/create-subset-files/01-get_biospecimen_identifiers.R
@@ -54,9 +54,13 @@ get_biospecimen_ids <- function(filename, id_mapping_df) {
data.table = FALSE)
biospecimen_ids <- unique(snv_file$Tumor_Sample_Barcode)
} else if (grepl("pbta-cnv", filename)) {
# the two CNV files now have different structures
cnv_file <- read_tsv(filename)
biospecimen_ids <- unique(cnv_file$ID)
if (grepl("controlfreec", filename)) {
biospecimen_ids <- unique(cnv_file$tumor)
} else {
biospecimen_ids <- unique(cnv_file$ID)
}
} else if (grepl("pbta-fusion", filename)) {
# in a column 'tumor_id'
fusion_file <- read_tsv(filename)
@@ -163,6 +167,9 @@ participant_id_list <- purrr::map(files_to_subset,
~ get_biospecimen_ids(.x, id_mapping_df)) %>%
stats::setNames(files_to_subset)

# explicitly perform garbage collection here
gc(verbose = FALSE)

# split up information for poly-A and stranded expression data vs. all else
polya_participant_list <- purrr::keep(
participant_id_list,
3 changes: 2 additions & 1 deletion analyses/create-subset-files/02-subset_files.R
@@ -86,8 +86,9 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {

# biospecimen identifiers are in a column 'ID' (CNVkit) or 'tumor' (ControlFreeC)
cnv_file <- readr::read_tsv(filename)
biospecimen_column <- intersect(colnames(cnv_file), c("ID", "tumor"))
cnv_file %>%
dplyr::filter(ID %in% biospecimen_ids) %>%
dplyr::filter(!!rlang::sym(biospecimen_column) %in% biospecimen_ids) %>%
readr::write_tsv(output_file)

} else if (grepl("pbta-fusion", filename)) {
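
The `!!rlang::sym()` call above is the tidy-eval idiom for filtering on a column whose name is only known at runtime (here, `ID` for CNVkit vs. `tumor` for ControlFreeC). A standalone toy example with made-up data, not repository code:

```r
# Toy illustration (made-up data): filter by a runtime-selected column name.
library(dplyr)

controlfreec_like <- data.frame(
  chr = c("1", "2"),
  tumor = c("BS_A", "BS_B"),
  stringsAsFactors = FALSE
)

biospecimen_column <- intersect(colnames(controlfreec_like), c("ID", "tumor"))
controlfreec_like %>%
  filter(!!rlang::sym(biospecimen_column) %in% c("BS_A"))
#>   chr tumor
#> 1   1  BS_A
```
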
5 changes: 4 additions & 1 deletion analyses/create-subset-files/create_subset_files.sh
@@ -7,7 +7,7 @@ set -o pipefail

# Set defaults for release and biospecimen file name
BIOSPECIMEN_FILE=${BIOSPECIMEN_FILE:-biospecimen_ids_for_subset.RDS}
RELEASE=${RELEASE:-release-v6-20191030}
RELEASE=${RELEASE:-release-v7-20191031}
NUM_MATCHED=${NUM_MATCHED:-15}

# This script should always run as if it were being called from
@@ -40,6 +40,9 @@ Rscript --vanilla 02-subset_files.R \
# histologies file
cp $FULL_DIRECTORY/pbta-histologies.tsv $SUBSET_DIRECTORY

# independent specimen files
cp $FULL_DIRECTORY/independent-specimens*.tsv $SUBSET_DIRECTORY

# all bed files
cp $FULL_DIRECTORY/*.bed $SUBSET_DIRECTORY


