diff --git a/.circleci/config.yml b/.circleci/config.yml index 14224c9c80..1d3096f9ea 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,7 +46,7 @@ jobs: - run: name: Molecular subtyping Chordoma - command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd', clean = TRUE)" + command: OPENPBTA_SUBSET=0 ./scripts/run_in_ci.sh bash analyses/molecular-subtyping-chordoma/run-molecular-subtyping-chordoma.sh - run: name: Molecular subtyping - Ependymoma diff --git a/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R b/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R new file mode 100644 index 0000000000..b23ced5f10 --- /dev/null +++ b/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R @@ -0,0 +1,86 @@ +# This script subsets the focal copy number, RNA expression, histologies` +# files to include only Chordoma samples. + +# Written originally Chante Bethell 2019 +# (Adapted for this module by Candace Savonen 2020) +# +# #### USAGE +# This script is intended to be run via the command line from the top directory +# of the repository as follows: +# +# Rscript 'analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R' + + +#### Set Up -------------------------------------------------------------------- + +# Get `magrittr` pipe +`%>%` <- dplyr::`%>%` + +#### Directories and Files ----------------------------------------------------- + +# Detect the ".git" folder -- this will in the project root directory. +# Use this as the root directory to ensure proper sourcing of functions no +# matter where this is called from +root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) + +# Set path to results and plots directory +results_dir <- + file.path(root_dir, "analyses", "molecular-subtyping-chordoma", "chordoma-subset") + +if (!dir.exists(results_dir)) { + dir.create(results_dir) +} + +# Read in metadata +metadata <- + readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv")) + +#### Filter metadata ----------------------------------------------------------- +# Select wanted columns in metadata for merging and assign to a new object +chordoma_metadata <- metadata %>% + dplyr::select( + sample_id, + Kids_First_Participant_ID, + Kids_First_Biospecimen_ID + ) %>% + dplyr::filter(short_histology == "Chordoma") + +#### Filter expression data ---------------------------------------------------- +# Read in the stranded expression data file +expression_data <- read_rds(file.path( + "data", + "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds" +)) + +# Filter to Chordoma samples only -- we can use chordoma_df because it is subset to +# RNA-seq samples +expression_data <- expression_data %>% + dplyr::select(intersect( + chordoma_metadata$Kids_First_Biospecimen_ID, + colnames(expression_data) + )) %>% + readr::write_rds(file.path( + results_dir, + "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds" + )) + +#### Filter focal CN data ------------------------------------------------------ + +# Filter focal CN to Chordoma samples only +cn_metadata <- data.table::fread(file.path( + "data", + "consensus_seg_annotated_cn_autosomes.tsv.gz" +)) %>% + dplyr::left_join(chordoma_metadata, + by = c("biospecimen_id" = "Kids_First_Biospecimen_ID") + ) %>% + dplyr::select( + gene_symbol, + sample_id, + Kids_First_Participant_ID, + biospecimen_id, + status + ) %>% + dplyr::filter(sample_id %in% chordoma_metadata$sample_id) %>% + # Write to file + readr::write_tsv(file.path(results_dir, "chordoma-only_cn_autosomes.tsv.gz")) diff --git a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd index cd9b46083a..f469a38fb2 100644 --- a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd +++ b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd @@ -18,32 +18,42 @@ library(ggplot2) ### Read in data ```{r} -data_dir <- file.path("..", "..", "data") -histologies_df <- read_tsv(file.path(data_dir, - "pbta-histologies.tsv"), - col_types = cols( - molecular_subtype = col_character() - )) +# File path to subsetted files directory +input_dir <- "chordoma-subset" ``` +Set up metadata + ```{r} -# TODO: update to use consensus file with annotations included in the data -# download? -focal_cn_df <- - read_tsv(file.path("..", "focal-cn-file-preparation", "results", - "consensus_seg_annotated_cn_autosomes.tsv.gz")) +# Read in non-subsetted metadata +histologies_df <- readr::read_tsv(file.path("..", "..", "data", "pbta-histologies.tsv")) + +# Subset metadata +subset_metadata <- histologies_df %>% + dplyr::filter(short_histology == "Chordoma") %>% + select( + Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID, + experimental_strategy + ) ``` ```{r} -# we need to include the sample_id field from pbta-histologies.tsv in the final -# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression -# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same -# event for a given individual). To get the SMARCB1 jitter plot in the photo -# here #250 (comment), you will first need to read in the collapsed expression +subset_focal_cn_df <- + data.table::fread(file.path(input_dir, "chordoma-only_cn_autosomes.tsv.gz")) +``` + +```{r} +# we need to include the sample_id field from pbta-histologies.tsv in the final +# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression +# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same +# event for a given individual). To get the SMARCB1 jitter plot in the photo +# here #250 (comment), you will first need to read in the collapsed expression # data -expression_data <- - read_rds(file.path(data_dir, - "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds")) +subset_expression_data <- + read_rds(file.path( + input_dir, + "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds" + )) ``` ### Output @@ -66,59 +76,38 @@ output_file <- file.path(results_dir, "chordoma_smarcb1_status.tsv") ## Prepare the data -Extracting the biospecimen IDs that correspond to chordoma samples +Extracting the chordoma samples that have a loss of SMARCB1 from `subset_focal_cn_df` ```{r} -chordoma_samples <- histologies_df %>% - filter(short_histology == "Chordoma") %>% - pull(Kids_First_Biospecimen_ID) -``` - -Extracting the chordoma samples that have a loss of SMARCB1 from `focal_cn_df` - -```{r} -chordoma_loss <- focal_cn_df %>% - filter(biospecimen_id %in% chordoma_samples, - gene_symbol == "SMARCB1", - status == "loss") -chordoma_loss -``` - -```{r} -# remove large copy number data frame -rm(focal_cn_df) -``` - -Modifying the output file to filter only chorodoma diagnoses and removing columns with information not relevant to the study -```{r} -chordoma_id_df <- histologies_df %>% - # only rows with chordoma samples - filter(short_histology == "Chordoma") %>% - # select only these columns that we'll need later - select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID, - experimental_strategy) -chordoma_id_df +chordoma_loss <- subset_focal_cn_df %>% + filter( + gene_symbol == "SMARCB1", + status == "loss" + ) +chordoma_loss ``` Distinguishing the chordoma samples with no copy number change chromosome 22 ```{r} -copy_neutral_df <- chordoma_id_df %>% +copy_neutral_df <- subset_metadata %>% # the copy events can only be taken from WGS data not RNA-seq data # we also only want biospecimens where a loss was not recorded to avoid duplicates - filter(experimental_strategy == "WGS", - !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>% + filter( + experimental_strategy == "WGS", + !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id) + ) %>% # if there's no loss, let's assume status is copy neutral mutate(status = "neutral") %>% # let's get the columns to match chordoma_loss - rename(biospecimen_id = Kids_First_Biospecimen_ID) %>% + dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>% select(biospecimen_id, status) copy_neutral_df ``` Joining chordoma samples (both with chr 22 loss and no copy number change) ```{r} -chordoma_copy <- chordoma_loss %>% - #join the losses with the neutrals to get a new data frame +chordoma_copy <- chordoma_loss %>% + # join the losses with the neutrals to get a new data frame select(biospecimen_id, status) %>% bind_rows(copy_neutral_df) chordoma_copy @@ -129,10 +118,13 @@ Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy chordoma_copy <- chordoma_copy %>% # get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame # then use biospecimen IDs to add the sample_id info - inner_join(select(chordoma_id_df, - Kids_First_Biospecimen_ID, - sample_id), - by = c("biospecimen_id" = "Kids_First_Biospecimen_ID")) + inner_join(select( + subset_metadata, + Kids_First_Biospecimen_ID, + sample_id + ), + by = c("biospecimen_id" = "Kids_First_Biospecimen_ID") + ) chordoma_copy ``` @@ -141,16 +133,15 @@ Look at SMARCB1 expression values only in chordoma ```{r} # get the row that contains the SMARCB1 values # gene symbols are rownames -smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ] - -# now only the columns correspond to chordoma samples -smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ] -smarcb1_expression +smarcb1_expression <- subset_expression_data[which(rownames(subset_expression_data) == "SMARCB1"), ] ``` ```{r} -# remove large expression matrix that's no longer needed -rm(expression_data) +# now only the columns correspond to chordoma samples +smarcb1_expression <- smarcb1_expression[, which(colnames(subset_expression_data) %in% subset_metadata$Kids_First_Biospecimen_ID) ] + +# Print out the expression for SMARCB1 +smarcb1_expression ``` The `smarcb1_expression` is a not a friendly form ^^; Transposing needed: @@ -163,17 +154,20 @@ smarcb1_expression <- t(smarcb1_expression) %>% # we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>% # give SMARCB1 column a slightly better column name - rename(SMARCB1_expression = SMARCB1) + dplyr::rename(SMARCB1_expression = SMARCB1) smarcb1_expression ``` This also needs sample_id to add it in ```{r} smarcb1_expression <- smarcb1_expression %>% - inner_join(select(chordoma_id_df, - Kids_First_Biospecimen_ID, - sample_id), - by = "Kids_First_Biospecimen_ID") + inner_join(select( + subset_metadata, + Kids_First_Biospecimen_ID, + sample_id + ), + by = "Kids_First_Biospecimen_ID" + ) smarcb1_expression ``` @@ -182,29 +176,34 @@ Joining the copy number data with the expression data in this step chordoma_smarcb1_df <- smarcb1_expression %>% # any missing samples will get filled with NA when using a full join full_join(chordoma_copy, by = "sample_id") %>% - rename(Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID, - Kids_First_Biospecimen_ID_RNA = biospecimen_id) + dplyr::rename( + Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID, + Kids_First_Biospecimen_ID_RNA = biospecimen_id + ) # this step adds in the participant identifier (sample_id to match between the two data.frame) -chordoma_smarcb1_df <- chordoma_id_df %>% +chordoma_smarcb1_df <- subset_metadata %>% select(sample_id, Kids_First_Participant_ID) %>% distinct() %>% inner_join(chordoma_smarcb1_df, - by = "sample_id") + by = "sample_id" + ) chordoma_smarcb1_df ``` ```{r} chordoma_smarcb1_df <- chordoma_smarcb1_df %>% - select(Kids_First_Participant_ID, - Kids_First_Biospecimen_ID_DNA, - Kids_First_Biospecimen_ID_RNA, - sample_id, - status, - SMARCB1_expression) %>% + select( + Kids_First_Participant_ID, + Kids_First_Biospecimen_ID_DNA, + Kids_First_Biospecimen_ID_RNA, + sample_id, + status, + SMARCB1_expression + ) %>% # 'status' is replaced a more descriptive name - rename(focal_SMARCB1_status = status) + dplyr::rename(focal_SMARCB1_status = status) chordoma_smarcb1_df ``` @@ -218,20 +217,24 @@ chordoma_smarcb1_df %>% # drop the sample that doesn't have WGS data tidyr::drop_na() %>% # this step specifies what should go on the x- and y-axes - ggplot(aes(x = focal_SMARCB1_status, - y = SMARCB1_expression)) + - # we want a jitter plot where the points aren't too far + ggplot(aes( + x = focal_SMARCB1_status, + y = SMARCB1_expression + )) + + # we want a jitter plot where the points aren't too far # apart that's what width does geom_jitter(width = 0.1) + # this is plotting the median as a blue diamond - stat_summary(fun.y = "median", - geom = "point", - size = 3, - color = "blue", - shape = 18) + - xlab("SMARCB1 status") + - xlab("SMARCB1 status") + - ylab("SMARCB1 expression") + stat_summary( + fun.y = "median", + geom = "point", + size = 3, + color = "blue", + shape = 18 + ) + + xlab("SMARCB1 status") + + xlab("SMARCB1 status") + + ylab("SMARCB1 expression") ``` ```{r} @@ -244,3 +247,9 @@ Write the table to file. chordoma_smarcb1_df %>% write_tsv(output_file) ``` + +### Session Info + +```{r} +sessionInfo() +``` diff --git a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html index e83ff5467b..f126a3eeb0 100644 --- a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html +++ b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html @@ -1776,26 +1776,9 @@

20191121

Set up

- -
library(dplyr)
- - -
Registered S3 method overwritten by 'dplyr':
-  method           from
-  print.rowwise_df     
-
-Attaching package: ‘dplyr’
-
-The following objects are masked from ‘package:stats’:
-
-    filter, lag
-
-The following objects are masked from ‘package:base’:
-
-    intersect, setdiff, setequal, union
- - -
library(readr)
+
+
library(dplyr)
+library(readr)
 library(ggplot2)
@@ -1804,51 +1787,74 @@

Set up

Read in data

- -
data_dir <- file.path("..", "..", "data")
-histologies_df <- read_tsv(file.path(data_dir,
-                                     "pbta-histologies.tsv"),
-                           col_types = cols(
-                             molecular_subtype = col_character()
-                           ))
+ +
# File path to subsetted files directory
+input_dir <- "chordoma-subset"
+

Set up metadata

- -
# TODO: update to use consensus file with annotations included in the data 
-# download?
-focal_cn_df <- 
-  read_tsv(file.path("..", "focal-cn-file-preparation", "results",
-                     "consensus_seg_annotated_cn_autosomes.tsv.gz"))
+ +
# Read in non-subsetted metadata
+histologies_df <- readr::read_tsv(file.path("..", "..", "data", "pbta-histologies.tsv"))
- +
Parsed with column specification:
 cols(
-  biospecimen_id = col_character(),
-  status = col_character(),
-  copy_number = col_double(),
-  ploidy = col_double(),
-  ensembl = col_character(),
-  gene_symbol = col_character(),
-  cytoband = col_character()
-)
+ .default = col_character(), + OS_days = col_double(), + age_last_update_days = col_double(), + normal_fraction = col_double(), + tumor_fraction = col_double(), + tumor_ploidy = col_double(), + molecular_subtype = col_logical() +) +See spec(...) for full column specifications. +493 parsing failures. + row col expected actual file +2334 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv' +2335 molecular_subtype 1/0/T/F/TRUE/FALSE Group4 '../../data/pbta-histologies.tsv' +2336 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv' +2337 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv' +2338 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv' +.... ................. .................. ...... ................................. +See problems(...) for more details.
+ +
# Subset metadata
+subset_metadata <- histologies_df %>%
+  dplyr::filter(short_histology == "Chordoma") %>%
+  select(
+    Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
+    experimental_strategy
+  )
+ + + + + + +
subset_focal_cn_df <-
+  data.table::fread(file.path(input_dir, "chordoma-only_cn_autosomes.tsv.gz"))
+ - -
# we need to include the sample_id field from pbta-histologies.tsv in the final 
-# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression 
-# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same 
-# event for a given individual). To get the SMARCB1 jitter plot in the photo 
-# here #250 (comment), you will first need to read in the collapsed expression 
+
+
# we need to include the sample_id field from pbta-histologies.tsv in the final
+# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression
+# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same
+# event for a given individual). To get the SMARCB1 jitter plot in the photo
+# here #250 (comment), you will first need to read in the collapsed expression
 # data
-expression_data <- 
-  read_rds(file.path(data_dir, 
-                     "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
+subset_expression_data <- + read_rds(file.path( + input_dir, + "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds" + ))
@@ -1878,59 +1884,21 @@

Output

Prepare the data

-

Extracting the biospecimen IDs that correspond to chordoma samples

- - - -
chordoma_samples <- histologies_df %>%
-  filter(short_histology == "Chordoma") %>% 
-  pull(Kids_First_Biospecimen_ID)
- - - -

Extracting the chordoma samples that have a loss of SMARCB1 from focal_cn_df

- - - -
chordoma_loss <- focal_cn_df %>% 
-  filter(biospecimen_id %in% chordoma_samples, 
-         gene_symbol == "SMARCB1",
-         status == "loss")
-chordoma_loss 
- - -
- -
- - - +

Extracting the chordoma samples that have a loss of SMARCB1 from subset_focal_cn_df

- -
# remove large copy number data frame
-rm(focal_cn_df)
+ +
chordoma_loss <- subset_focal_cn_df %>%
+  filter(
+    gene_symbol == "SMARCB1",
+    status == "loss"
+  )
+chordoma_loss
- - -

Modifying the output file to filter only chorodoma diagnoses and removing columns with information not relevant to the study

- - - -
chordoma_id_df <- histologies_df %>% 
-  # only rows with chordoma samples
-  filter(short_histology == "Chordoma") %>%
-  # select only these columns that we'll need later
-  select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
-         experimental_strategy)
-chordoma_id_df
- - +
@@ -1939,20 +1907,22 @@

Prepare the data

Distinguishing the chordoma samples with no copy number change chromosome 22

- -
copy_neutral_df <- chordoma_id_df %>% 
+
+
copy_neutral_df <- subset_metadata %>%
   # the copy events can only be taken from WGS data not RNA-seq data
   # we also only want biospecimens where a loss was not recorded to avoid duplicates
-  filter(experimental_strategy == "WGS",
-         !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
+  filter(
+    experimental_strategy == "WGS",
+    !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)
+  ) %>%
   # if there's no loss, let's assume status is copy neutral
   mutate(status = "neutral") %>%
   # let's get the columns to match chordoma_loss
-  rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
+  dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
   select(biospecimen_id, status)
 copy_neutral_df
- +