diff --git a/.circleci/config.yml b/.circleci/config.yml index 14224c9c80..1d3096f9ea 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,7 +46,7 @@ jobs: - run: name: Molecular subtyping Chordoma - command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd', clean = TRUE)" + command: OPENPBTA_SUBSET=0 ./scripts/run_in_ci.sh bash analyses/molecular-subtyping-chordoma/run-molecular-subtyping-chordoma.sh - run: name: Molecular subtyping - Ependymoma diff --git a/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R b/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R new file mode 100644 index 0000000000..b23ced5f10 --- /dev/null +++ b/analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R @@ -0,0 +1,86 @@ +# This script subsets the focal copy number, RNA expression, histologies` +# files to include only Chordoma samples. + +# Written originally Chante Bethell 2019 +# (Adapted for this module by Candace Savonen 2020) +# +# #### USAGE +# This script is intended to be run via the command line from the top directory +# of the repository as follows: +# +# Rscript 'analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R' + + +#### Set Up -------------------------------------------------------------------- + +# Get `magrittr` pipe +`%>%` <- dplyr::`%>%` + +#### Directories and Files ----------------------------------------------------- + +# Detect the ".git" folder -- this will in the project root directory. +# Use this as the root directory to ensure proper sourcing of functions no +# matter where this is called from +root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) + +# Set path to results and plots directory +results_dir <- + file.path(root_dir, "analyses", "molecular-subtyping-chordoma", "chordoma-subset") + +if (!dir.exists(results_dir)) { + dir.create(results_dir) +} + +# Read in metadata +metadata <- + readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv")) + +#### Filter metadata ----------------------------------------------------------- +# Select wanted columns in metadata for merging and assign to a new object +chordoma_metadata <- metadata %>% + dplyr::select( + sample_id, + Kids_First_Participant_ID, + Kids_First_Biospecimen_ID + ) %>% + dplyr::filter(short_histology == "Chordoma") + +#### Filter expression data ---------------------------------------------------- +# Read in the stranded expression data file +expression_data <- read_rds(file.path( + "data", + "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds" +)) + +# Filter to Chordoma samples only -- we can use chordoma_df because it is subset to +# RNA-seq samples +expression_data <- expression_data %>% + dplyr::select(intersect( + chordoma_metadata$Kids_First_Biospecimen_ID, + colnames(expression_data) + )) %>% + readr::write_rds(file.path( + results_dir, + "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds" + )) + +#### Filter focal CN data ------------------------------------------------------ + +# Filter focal CN to Chordoma samples only +cn_metadata <- data.table::fread(file.path( + "data", + "consensus_seg_annotated_cn_autosomes.tsv.gz" +)) %>% + dplyr::left_join(chordoma_metadata, + by = c("biospecimen_id" = "Kids_First_Biospecimen_ID") + ) %>% + dplyr::select( + gene_symbol, + sample_id, + Kids_First_Participant_ID, + biospecimen_id, + status + ) %>% + dplyr::filter(sample_id %in% chordoma_metadata$sample_id) %>% + # Write to file + readr::write_tsv(file.path(results_dir, "chordoma-only_cn_autosomes.tsv.gz")) diff --git a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd index cd9b46083a..f469a38fb2 100644 --- a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd +++ b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd @@ -18,32 +18,42 @@ library(ggplot2) ### Read in data ```{r} -data_dir <- file.path("..", "..", "data") -histologies_df <- read_tsv(file.path(data_dir, - "pbta-histologies.tsv"), - col_types = cols( - molecular_subtype = col_character() - )) +# File path to subsetted files directory +input_dir <- "chordoma-subset" ``` +Set up metadata + ```{r} -# TODO: update to use consensus file with annotations included in the data -# download? -focal_cn_df <- - read_tsv(file.path("..", "focal-cn-file-preparation", "results", - "consensus_seg_annotated_cn_autosomes.tsv.gz")) +# Read in non-subsetted metadata +histologies_df <- readr::read_tsv(file.path("..", "..", "data", "pbta-histologies.tsv")) + +# Subset metadata +subset_metadata <- histologies_df %>% + dplyr::filter(short_histology == "Chordoma") %>% + select( + Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID, + experimental_strategy + ) ``` ```{r} -# we need to include the sample_id field from pbta-histologies.tsv in the final -# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression -# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same -# event for a given individual). To get the SMARCB1 jitter plot in the photo -# here #250 (comment), you will first need to read in the collapsed expression +subset_focal_cn_df <- + data.table::fread(file.path(input_dir, "chordoma-only_cn_autosomes.tsv.gz")) +``` + +```{r} +# we need to include the sample_id field from pbta-histologies.tsv in the final +# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression +# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same +# event for a given individual). To get the SMARCB1 jitter plot in the photo +# here #250 (comment), you will first need to read in the collapsed expression # data -expression_data <- - read_rds(file.path(data_dir, - "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds")) +subset_expression_data <- + read_rds(file.path( + input_dir, + "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds" + )) ``` ### Output @@ -66,59 +76,38 @@ output_file <- file.path(results_dir, "chordoma_smarcb1_status.tsv") ## Prepare the data -Extracting the biospecimen IDs that correspond to chordoma samples +Extracting the chordoma samples that have a loss of SMARCB1 from `subset_focal_cn_df` ```{r} -chordoma_samples <- histologies_df %>% - filter(short_histology == "Chordoma") %>% - pull(Kids_First_Biospecimen_ID) -``` - -Extracting the chordoma samples that have a loss of SMARCB1 from `focal_cn_df` - -```{r} -chordoma_loss <- focal_cn_df %>% - filter(biospecimen_id %in% chordoma_samples, - gene_symbol == "SMARCB1", - status == "loss") -chordoma_loss -``` - -```{r} -# remove large copy number data frame -rm(focal_cn_df) -``` - -Modifying the output file to filter only chorodoma diagnoses and removing columns with information not relevant to the study -```{r} -chordoma_id_df <- histologies_df %>% - # only rows with chordoma samples - filter(short_histology == "Chordoma") %>% - # select only these columns that we'll need later - select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID, - experimental_strategy) -chordoma_id_df +chordoma_loss <- subset_focal_cn_df %>% + filter( + gene_symbol == "SMARCB1", + status == "loss" + ) +chordoma_loss ``` Distinguishing the chordoma samples with no copy number change chromosome 22 ```{r} -copy_neutral_df <- chordoma_id_df %>% +copy_neutral_df <- subset_metadata %>% # the copy events can only be taken from WGS data not RNA-seq data # we also only want biospecimens where a loss was not recorded to avoid duplicates - filter(experimental_strategy == "WGS", - !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>% + filter( + experimental_strategy == "WGS", + !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id) + ) %>% # if there's no loss, let's assume status is copy neutral mutate(status = "neutral") %>% # let's get the columns to match chordoma_loss - rename(biospecimen_id = Kids_First_Biospecimen_ID) %>% + dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>% select(biospecimen_id, status) copy_neutral_df ``` Joining chordoma samples (both with chr 22 loss and no copy number change) ```{r} -chordoma_copy <- chordoma_loss %>% - #join the losses with the neutrals to get a new data frame +chordoma_copy <- chordoma_loss %>% + # join the losses with the neutrals to get a new data frame select(biospecimen_id, status) %>% bind_rows(copy_neutral_df) chordoma_copy @@ -129,10 +118,13 @@ Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy chordoma_copy <- chordoma_copy %>% # get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame # then use biospecimen IDs to add the sample_id info - inner_join(select(chordoma_id_df, - Kids_First_Biospecimen_ID, - sample_id), - by = c("biospecimen_id" = "Kids_First_Biospecimen_ID")) + inner_join(select( + subset_metadata, + Kids_First_Biospecimen_ID, + sample_id + ), + by = c("biospecimen_id" = "Kids_First_Biospecimen_ID") + ) chordoma_copy ``` @@ -141,16 +133,15 @@ Look at SMARCB1 expression values only in chordoma ```{r} # get the row that contains the SMARCB1 values # gene symbols are rownames -smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ] - -# now only the columns correspond to chordoma samples -smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ] -smarcb1_expression +smarcb1_expression <- subset_expression_data[which(rownames(subset_expression_data) == "SMARCB1"), ] ``` ```{r} -# remove large expression matrix that's no longer needed -rm(expression_data) +# now only the columns correspond to chordoma samples +smarcb1_expression <- smarcb1_expression[, which(colnames(subset_expression_data) %in% subset_metadata$Kids_First_Biospecimen_ID) ] + +# Print out the expression for SMARCB1 +smarcb1_expression ``` The `smarcb1_expression` is a not a friendly form ^^; Transposing needed: @@ -163,17 +154,20 @@ smarcb1_expression <- t(smarcb1_expression) %>% # we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>% # give SMARCB1 column a slightly better column name - rename(SMARCB1_expression = SMARCB1) + dplyr::rename(SMARCB1_expression = SMARCB1) smarcb1_expression ``` This also needs sample_id to add it in ```{r} smarcb1_expression <- smarcb1_expression %>% - inner_join(select(chordoma_id_df, - Kids_First_Biospecimen_ID, - sample_id), - by = "Kids_First_Biospecimen_ID") + inner_join(select( + subset_metadata, + Kids_First_Biospecimen_ID, + sample_id + ), + by = "Kids_First_Biospecimen_ID" + ) smarcb1_expression ``` @@ -182,29 +176,34 @@ Joining the copy number data with the expression data in this step chordoma_smarcb1_df <- smarcb1_expression %>% # any missing samples will get filled with NA when using a full join full_join(chordoma_copy, by = "sample_id") %>% - rename(Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID, - Kids_First_Biospecimen_ID_RNA = biospecimen_id) + dplyr::rename( + Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID, + Kids_First_Biospecimen_ID_RNA = biospecimen_id + ) # this step adds in the participant identifier (sample_id to match between the two data.frame) -chordoma_smarcb1_df <- chordoma_id_df %>% +chordoma_smarcb1_df <- subset_metadata %>% select(sample_id, Kids_First_Participant_ID) %>% distinct() %>% inner_join(chordoma_smarcb1_df, - by = "sample_id") + by = "sample_id" + ) chordoma_smarcb1_df ``` ```{r} chordoma_smarcb1_df <- chordoma_smarcb1_df %>% - select(Kids_First_Participant_ID, - Kids_First_Biospecimen_ID_DNA, - Kids_First_Biospecimen_ID_RNA, - sample_id, - status, - SMARCB1_expression) %>% + select( + Kids_First_Participant_ID, + Kids_First_Biospecimen_ID_DNA, + Kids_First_Biospecimen_ID_RNA, + sample_id, + status, + SMARCB1_expression + ) %>% # 'status' is replaced a more descriptive name - rename(focal_SMARCB1_status = status) + dplyr::rename(focal_SMARCB1_status = status) chordoma_smarcb1_df ``` @@ -218,20 +217,24 @@ chordoma_smarcb1_df %>% # drop the sample that doesn't have WGS data tidyr::drop_na() %>% # this step specifies what should go on the x- and y-axes - ggplot(aes(x = focal_SMARCB1_status, - y = SMARCB1_expression)) + - # we want a jitter plot where the points aren't too far + ggplot(aes( + x = focal_SMARCB1_status, + y = SMARCB1_expression + )) + + # we want a jitter plot where the points aren't too far # apart that's what width does geom_jitter(width = 0.1) + # this is plotting the median as a blue diamond - stat_summary(fun.y = "median", - geom = "point", - size = 3, - color = "blue", - shape = 18) + - xlab("SMARCB1 status") + - xlab("SMARCB1 status") + - ylab("SMARCB1 expression") + stat_summary( + fun.y = "median", + geom = "point", + size = 3, + color = "blue", + shape = 18 + ) + + xlab("SMARCB1 status") + + xlab("SMARCB1 status") + + ylab("SMARCB1 expression") ``` ```{r} @@ -244,3 +247,9 @@ Write the table to file. chordoma_smarcb1_df %>% write_tsv(output_file) ``` + +### Session Info + +```{r} +sessionInfo() +``` diff --git a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html index e83ff5467b..f126a3eeb0 100644 --- a/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html +++ b/analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.nb.html @@ -1776,26 +1776,9 @@
library(dplyr)
-
-
-Registered S3 method overwritten by 'dplyr':
- method from
- print.rowwise_df
-
-Attaching package: ‘dplyr’
-
-The following objects are masked from ‘package:stats’:
-
- filter, lag
-
-The following objects are masked from ‘package:base’:
-
- intersect, setdiff, setequal, union
-
-
-library(readr)
+
+library(dplyr)
+library(readr)
library(ggplot2)
@@ -1804,51 +1787,74 @@ Set up
Read in data
-
-data_dir <- file.path("..", "..", "data")
-histologies_df <- read_tsv(file.path(data_dir,
- "pbta-histologies.tsv"),
- col_types = cols(
- molecular_subtype = col_character()
- ))
+
+# File path to subsetted files directory
+input_dir <- "chordoma-subset"
+Set up metadata
-
-# TODO: update to use consensus file with annotations included in the data
-# download?
-focal_cn_df <-
- read_tsv(file.path("..", "focal-cn-file-preparation", "results",
- "consensus_seg_annotated_cn_autosomes.tsv.gz"))
+
+# Read in non-subsetted metadata
+histologies_df <- readr::read_tsv(file.path("..", "..", "data", "pbta-histologies.tsv"))
-
+
Parsed with column specification:
cols(
- biospecimen_id = [31mcol_character()[39m,
- status = [31mcol_character()[39m,
- copy_number = [32mcol_double()[39m,
- ploidy = [32mcol_double()[39m,
- ensembl = [31mcol_character()[39m,
- gene_symbol = [31mcol_character()[39m,
- cytoband = [31mcol_character()[39m
-)
+ .default = col_character(),
+ OS_days = [32mcol_double()[39m,
+ age_last_update_days = [32mcol_double()[39m,
+ normal_fraction = [32mcol_double()[39m,
+ tumor_fraction = [32mcol_double()[39m,
+ tumor_ploidy = [32mcol_double()[39m,
+ molecular_subtype = [33mcol_logical()[39m
+)
+See spec(...) for full column specifications.
+493 parsing failures.
+ row col expected actual file
+2334 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
+2335 molecular_subtype 1/0/T/F/TRUE/FALSE Group4 '../../data/pbta-histologies.tsv'
+2336 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
+2337 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
+2338 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
+.... ................. .................. ...... .................................
+See problems(...) for more details.
+
+# Subset metadata
+subset_metadata <- histologies_df %>%
+ dplyr::filter(short_histology == "Chordoma") %>%
+ select(
+ Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
+ experimental_strategy
+ )
+
+
+
+
+
+
+subset_focal_cn_df <-
+ data.table::fread(file.path(input_dir, "chordoma-only_cn_autosomes.tsv.gz"))
+
-
-# we need to include the sample_id field from pbta-histologies.tsv in the final
-# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression
-# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same
-# event for a given individual). To get the SMARCB1 jitter plot in the photo
-# here #250 (comment), you will first need to read in the collapsed expression
+
+# we need to include the sample_id field from pbta-histologies.tsv in the final
+# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression
+# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same
+# event for a given individual). To get the SMARCB1 jitter plot in the photo
+# here #250 (comment), you will first need to read in the collapsed expression
# data
-expression_data <-
- read_rds(file.path(data_dir,
- "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
+subset_expression_data <-
+ read_rds(file.path(
+ input_dir,
+ "chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds"
+ ))
@@ -1878,59 +1884,21 @@ Extracting the biospecimen IDs that correspond to chordoma samples
- - - -chordoma_samples <- histologies_df %>%
- filter(short_histology == "Chordoma") %>%
- pull(Kids_First_Biospecimen_ID)
-
-
-
-Extracting the chordoma samples that have a loss of SMARCB1 from focal_cn_df
chordoma_loss <- focal_cn_df %>%
- filter(biospecimen_id %in% chordoma_samples,
- gene_symbol == "SMARCB1",
- status == "loss")
-chordoma_loss
-
-
-Extracting the chordoma samples that have a loss of SMARCB1 from subset_focal_cn_df
# remove large copy number data frame
-rm(focal_cn_df)
+
+chordoma_loss <- subset_focal_cn_df %>%
+ filter(
+ gene_symbol == "SMARCB1",
+ status == "loss"
+ )
+chordoma_loss
-
-
-Modifying the output file to filter only chorodoma diagnoses and removing columns with information not relevant to the study
- - - -chordoma_id_df <- histologies_df %>%
- # only rows with chordoma samples
- filter(short_histology == "Chordoma") %>%
- # select only these columns that we'll need later
- select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
- experimental_strategy)
-chordoma_id_df
-
-
+
Distinguishing the chordoma samples with no copy number change chromosome 22
- -copy_neutral_df <- chordoma_id_df %>%
+
+copy_neutral_df <- subset_metadata %>%
# the copy events can only be taken from WGS data not RNA-seq data
# we also only want biospecimens where a loss was not recorded to avoid duplicates
- filter(experimental_strategy == "WGS",
- !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
+ filter(
+ experimental_strategy == "WGS",
+ !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)
+ ) %>%
# if there's no loss, let's assume status is copy neutral
mutate(status = "neutral") %>%
# let's get the columns to match chordoma_loss
- rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
+ dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
select(biospecimen_id, status)
copy_neutral_df
-
+