AlexsLemonade · jashapiro · Mar 2, 2020 · Mar 2, 2020 · Mar 2, 2020 · Mar 2, 2020
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -46,7 +46,7 @@ jobs:
 
       - run:
           name: Molecular subtyping Chordoma
-          command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd', clean = TRUE)"
+          command: OPENPBTA_SUBSET=0 ./scripts/run_in_ci.sh bash analyses/molecular-subtyping-chordoma/run-molecular-subtyping-chordoma.sh
 
       - run:
           name: Molecular subtyping - Ependymoma

diff --git a/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R b/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R
@@ -0,0 +1,86 @@
+# This script subsets the focal copy number, RNA expression, histologies`
+# files to include only Chordoma samples.
+
+# Written originally Chante Bethell 2019
+# (Adapted for this module by Candace Savonen 2020)
+#
+# #### USAGE
+# This script is intended to be run via the command line from the top directory
+# of the repository as follows:
+#
+# Rscript 'analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R'
+
+
+#### Set Up --------------------------------------------------------------------
+
+# Get `magrittr` pipe
+`%>%` <- dplyr::`%>%`
+
+#### Directories and Files -----------------------------------------------------
+
+# Detect the ".git" folder -- this will in the project root directory.
+# Use this as the root directory to ensure proper sourcing of functions no
+# matter where this is called from
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+# Set path to results and plots directory
+results_dir <-
+  file.path(root_dir, "analyses", "molecular-subtyping-chordoma", "chordoma-subset")
+
+if (!dir.exists(results_dir)) {
+  dir.create(results_dir)
+}
+
+# Read in metadata
+metadata <-
+  readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv"))
+
+#### Filter metadata -----------------------------------------------------------
+# Select wanted columns in metadata for merging and assign to a new object
+chordoma_metadata <- metadata %>%
+  dplyr::select(
+    sample_id,
+    Kids_First_Participant_ID,
+    Kids_First_Biospecimen_ID
+  ) %>%
+  dplyr::filter(short_histology == "Chordoma")
+
+#### Filter expression data ----------------------------------------------------
+# Read in the stranded expression data file
+expression_data <- read_rds(file.path(
+  "data",
+  "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"
+))
+
+# Filter to Chordoma samples only -- we can use chordoma_df because it is subset to
+# RNA-seq samples
+expression_data <- expression_data %>%
+  dplyr::select(intersect(
+    chordoma_metadata$Kids_First_Biospecimen_ID,
+    colnames(expression_data)
+  )) %>%
+  readr::write_rds(file.path(
+    results_dir,
+    "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds"
+  ))
+
+#### Filter focal CN data ------------------------------------------------------
+
+# Filter focal CN to Chordoma samples only
+cn_metadata <- data.table::fread(file.path(
+  "data",
+  "consensus_seg_annotated_cn_autosomes.tsv.gz"
+)) %>%
+  dplyr::left_join(chordoma_metadata,
+    by = c("biospecimen_id" = "Kids_First_Biospecimen_ID")
+  ) %>%
+  dplyr::select(
+    gene_symbol,
+    sample_id,
+    Kids_First_Participant_ID,
+    biospecimen_id,
+    status
+  ) %>%
+  dplyr::filter(sample_id %in% chordoma_metadata$sample_id) %>%
+  # Write to file
+  readr::write_tsv(file.path(results_dir, "chordoma-only_cn_autosomes.tsv.gz"))
diff --git a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd
@@ -18,32 +18,42 @@ library(ggplot2)
 ### Read in data
 
 ```{r}
-data_dir <- file.path("..", "..", "data")
-histologies_df <- read_tsv(file.path(data_dir,
-                                     "pbta-histologies.tsv"),
-                           col_types = cols(
-                             molecular_subtype = col_character()
-                           ))
+# File path to subsetted files directory
+input_dir <- "chordoma-subset"
 ```
 
+Set up metadata
+
 ```{r}
-# TODO: update to use consensus file with annotations included in the data 
-# download?
-focal_cn_df <- 
-  read_tsv(file.path("..", "focal-cn-file-preparation", "results",
-                     "consensus_seg_annotated_cn_autosomes.tsv.gz"))
+# Read in non-subsetted metadata
+histologies_df <- readr::read_tsv(file.path("..", "..", "data", "pbta-histologies.tsv"))
+
+# Subset metadata
+subset_metadata <- histologies_df %>%
+  dplyr::filter(short_histology == "Chordoma") %>%
+  select(
+    Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
+    experimental_strategy
+  )
 ```
 
 ```{r}
-# we need to include the sample_id field from pbta-histologies.tsv in the final 
-# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression 
-# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same 
-# event for a given individual). To get the SMARCB1 jitter plot in the photo 
-# here #250 (comment), you will first need to read in the collapsed expression 
+subset_focal_cn_df <-
+  data.table::fread(file.path(input_dir, "chordoma-only_cn_autosomes.tsv.gz"))
+```
+
+```{r}
+# we need to include the sample_id field from pbta-histologies.tsv in the final
+# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression
+# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same
+# event for a given individual). To get the SMARCB1 jitter plot in the photo
+# here #250 (comment), you will first need to read in the collapsed expression
 # data
-expression_data <- 
-  read_rds(file.path(data_dir, 
-                     "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
+subset_expression_data <-
+  read_rds(file.path(
+    input_dir,
+    "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds"
+  ))
 ```
 
 ### Output
@@ -66,59 +76,38 @@ output_file <- file.path(results_dir, "chordoma_smarcb1_status.tsv")
 
 ## Prepare the data
 
-Extracting the biospecimen IDs that correspond to chordoma samples
+Extracting the chordoma samples that have a loss of SMARCB1 from `subset_focal_cn_df`
 
 ```{r}
-chordoma_samples <- histologies_df %>%
-  filter(short_histology == "Chordoma") %>% 
-  pull(Kids_First_Biospecimen_ID)
-```
-
-Extracting the chordoma samples that have a loss of SMARCB1 from `focal_cn_df`
-
-```{r}
-chordoma_loss <- focal_cn_df %>% 
-  filter(biospecimen_id %in% chordoma_samples, 
-         gene_symbol == "SMARCB1",
-         status == "loss")
-chordoma_loss 
-```
-
-```{r}
-# remove large copy number data frame
-rm(focal_cn_df)
-```
-
-Modifying the output file to filter only chorodoma diagnoses and removing columns with information not relevant to the study
-```{r}
-chordoma_id_df <- histologies_df %>% 
-  # only rows with chordoma samples
-  filter(short_histology == "Chordoma") %>%
-  # select only these columns that we'll need later
-  select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
-         experimental_strategy)
-chordoma_id_df
+chordoma_loss <- subset_focal_cn_df %>%
+  filter(
+    gene_symbol == "SMARCB1",
+    status == "loss"
+  )
+chordoma_loss
 ```
 
 Distinguishing the chordoma samples with no copy number change chromosome 22
 ```{r}
-copy_neutral_df <- chordoma_id_df %>% 
+copy_neutral_df <- subset_metadata %>%
   # the copy events can only be taken from WGS data not RNA-seq data
   # we also only want biospecimens where a loss was not recorded to avoid duplicates
-  filter(experimental_strategy == "WGS",
-         !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
+  filter(
+    experimental_strategy == "WGS",
+    !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)
+  ) %>%
   # if there's no loss, let's assume status is copy neutral
   mutate(status = "neutral") %>%
   # let's get the columns to match chordoma_loss
-  rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
+  dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
   select(biospecimen_id, status)
 copy_neutral_df
 ```
 
 Joining chordoma samples (both with chr 22 loss and no copy number change)
 ```{r}
-chordoma_copy <- chordoma_loss %>% 
-  #join the losses with the neutrals to get a new data frame
+chordoma_copy <- chordoma_loss %>%
+  # join the losses with the neutrals to get a new data frame
   select(biospecimen_id, status) %>%
   bind_rows(copy_neutral_df)
 chordoma_copy
@@ -129,10 +118,13 @@ Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy
 chordoma_copy <- chordoma_copy %>%
   # get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame
   # then use biospecimen IDs to add the sample_id info
-  inner_join(select(chordoma_id_df,
-                    Kids_First_Biospecimen_ID,
-                    sample_id),
-             by = c("biospecimen_id" = "Kids_First_Biospecimen_ID"))
+  inner_join(select(
+    subset_metadata,
+    Kids_First_Biospecimen_ID,
+    sample_id
+  ),
+  by = c("biospecimen_id" = "Kids_First_Biospecimen_ID")
+  )
 chordoma_copy
 ```
 
@@ -141,16 +133,15 @@ Look at SMARCB1 expression values only in chordoma
 ```{r}
 # get the row that contains the SMARCB1 values
 # gene symbols are rownames
-smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ]
-
-# now only the columns correspond to chordoma samples
-smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ]
-smarcb1_expression
+smarcb1_expression <- subset_expression_data[which(rownames(subset_expression_data) == "SMARCB1"), ]
 ```
 
 ```{r}
-# remove large expression matrix that's no longer needed
-rm(expression_data)
+# now only the columns correspond to chordoma samples
+smarcb1_expression <- smarcb1_expression[, which(colnames(subset_expression_data) %in% subset_metadata$Kids_First_Biospecimen_ID) ]
+
+# Print out the expression for SMARCB1
+smarcb1_expression
 ```
 
 The `smarcb1_expression` is a not a friendly form ^^; Transposing needed: 
@@ -163,17 +154,20 @@ smarcb1_expression <- t(smarcb1_expression) %>%
   # we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID
   tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
   # give SMARCB1 column a slightly better column name
-  rename(SMARCB1_expression = SMARCB1)
+  dplyr::rename(SMARCB1_expression = SMARCB1)
 smarcb1_expression
 ```
 
 This also needs sample_id to add it in
 ```{r}
 smarcb1_expression <- smarcb1_expression %>%
-  inner_join(select(chordoma_id_df,
-                    Kids_First_Biospecimen_ID,
-                    sample_id),
-             by = "Kids_First_Biospecimen_ID")
+  inner_join(select(
+    subset_metadata,
+    Kids_First_Biospecimen_ID,
+    sample_id
+  ),
+  by = "Kids_First_Biospecimen_ID"
+  )
 smarcb1_expression
 ```
 
@@ -182,29 +176,34 @@ Joining the copy number data with the expression data in this step
 chordoma_smarcb1_df <- smarcb1_expression %>%
   # any missing samples will get filled with NA when using a full join
   full_join(chordoma_copy, by = "sample_id") %>%
-  rename(Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID,
-         Kids_First_Biospecimen_ID_RNA = biospecimen_id)
+  dplyr::rename(
+    Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID,
+    Kids_First_Biospecimen_ID_RNA = biospecimen_id
+  )
 
 # this step adds in the participant identifier (sample_id to match between the two data.frame)
-chordoma_smarcb1_df <- chordoma_id_df %>%
+chordoma_smarcb1_df <- subset_metadata %>%
   select(sample_id, Kids_First_Participant_ID) %>%
   distinct() %>%
   inner_join(chordoma_smarcb1_df,
-             by = "sample_id")
+    by = "sample_id"
+  )
 
 chordoma_smarcb1_df
 ```
 
 ```{r}
 chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
-  select(Kids_First_Participant_ID, 
-         Kids_First_Biospecimen_ID_DNA,
-         Kids_First_Biospecimen_ID_RNA,
-         sample_id,
-         status,
-         SMARCB1_expression) %>%
+  select(
+    Kids_First_Participant_ID,
+    Kids_First_Biospecimen_ID_DNA,
+    Kids_First_Biospecimen_ID_RNA,
+    sample_id,
+    status,
+    SMARCB1_expression
+  ) %>%
   # 'status' is replaced a more descriptive name
-  rename(focal_SMARCB1_status = status)
+  dplyr::rename(focal_SMARCB1_status = status)
 chordoma_smarcb1_df
 ```
 
@@ -218,20 +217,24 @@ chordoma_smarcb1_df %>%
   # drop the sample that doesn't have WGS data
   tidyr::drop_na() %>%
   # this step specifies what should go on the x- and y-axes
-  ggplot(aes(x = focal_SMARCB1_status,
-             y = SMARCB1_expression)) +
-  # we want a jitter plot where the points aren't too far 
+  ggplot(aes(
+    x = focal_SMARCB1_status,
+    y = SMARCB1_expression
+  )) +
+  # we want a jitter plot where the points aren't too far
   # apart that's what width does
   geom_jitter(width = 0.1) +
   # this is plotting the median as a blue diamond
-  stat_summary(fun.y = "median", 
-               geom = "point", 
-               size = 3, 
-               color = "blue", 
-               shape = 18) +
-               xlab("SMARCB1 status") +
-   xlab("SMARCB1 status") +
-   ylab("SMARCB1 expression")
+  stat_summary(
+    fun.y = "median",
+    geom = "point",
+    size = 3,
+    color = "blue",
+    shape = 18
+  ) +
+  xlab("SMARCB1 status") +
+  xlab("SMARCB1 status") +
+  ylab("SMARCB1 expression")
 ```
 
 ```{r}
@@ -244,3 +247,9 @@ Write the table to file.
 chordoma_smarcb1_df %>%
   write_tsv(output_file)
 ```
+
+### Session Info
+
+```{r}
+sessionInfo()
+```